A simple, flexible, and powerful Java crawler framework.
Features:
1. Simple, readable code that is easy to customize
2. A simple, easy-to-use API (a minimal sketch follows this list)
3. Support for file download and parted (chunked) crawling
4. Rich request/response options, with each request individually customizable
5. Custom actions can run before and after each network request
6. Selenium + PhantomJS support
7. Redis support
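The sketch below shows what the API in the feature list looks like in its simplest form. It is distilled from the full demo further down; the class and method names (Processor, Response, Site, Spider.builder, resp.html(), parser.single) are taken from that demo and are assumed, not verified against the current codebase.

```java
// Minimal usage sketch (assumption: class/method names as used in the demo below).
public class QuickStart extends Processor {
    @Override
    public void process(Response resp) {
        // Parse the downloaded page with the built-in Jsoup-backed parser.
        JsoupParser parser = resp.html();
        String title = parser.single("title", "text");
        System.out.println("title: " + title);
    }

    public static void main(String[] args) {
        // Build and run a spider with default settings against a single seed URL.
        Spider.builder(new QuickStart())
              .threadNum(2)
              .site(new Site())
              .urls("https://github.com/xbynet")
              .build()
              .run();
    }
}
```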
Future:
1. Complete the code comments and tests
demo:

```java
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;

import com.github.xbynet.crawler.http.DefaultDownloader;
import com.github.xbynet.crawler.http.FileDownloader;
import com.github.xbynet.crawler.http.HttpClientFactory;
import com.github.xbynet.crawler.parser.JsoupParser;
import com.github.xbynet.crawler.scheduler.DefaultScheduler;
// Core classes (Processor, Response, Request, Site, Spider, Const, RequestAction)
// come from the framework's base package.

public class GithubCrawler extends Processor {
    @Override
    public void process(Response resp) {
        String currentUrl = resp.getRequest().getUrl();
        System.out.println("CurrentUrl:" + currentUrl);
        int respCode = resp.getCode();
        System.out.println("ResponseCode:" + respCode);
        System.out.println("type:" + resp.getRespType().name());
        String contentType = resp.getContentType();
        System.out.println("ContentType:" + contentType);
        Map<String, List<String>> headers = resp.getHeaders();
        System.out.println("ResponseHeaders:");
        for (String key : headers.keySet()) {
            List<String> values = headers.get(key);
            for (String str : values) {
                System.out.println(key + ":" + str);
            }
        }
        JsoupParser parser = resp.html();

        // Parted (chunked) crawling is supported: a parent response links all part responses.
        // System.out.println("isParted:" + resp.isPartResponse());
        // Response parent = resp.getParentResponse();
        // resp.addPartRequest(null);
        // Map<String, Object> extras = resp.getRequest().getExtras();

        if (currentUrl.equals("https://github.com/xbynet")) {
            String avatar = parser.single("img.avatar", "src");
            String dir = System.getProperty("java.io.tmpdir");
            String savePath = Paths.get(dir, UUID.randomUUID().toString()).toString();
            boolean avatarDownloaded = download(avatar, savePath);
            System.out.println("avatar:" + avatar + ", saved:" + savePath);
            // System.out.println("avatar downloaded status:" + avatarDownloaded);
            String name = parser.single(".vcard-names > .vcard-fullname", "text");
            System.out.println("name:" + name);
            List<String> repoNames = parser.list(".pinned-repos-list .repo.js-repo", "text");
            List<String> repoUrls = parser.list(".pinned-repo-item .d-block > a", "href");
            System.out.println("repoName:url");
            if (repoNames != null) {
                for (int i = 0; i < repoNames.size(); i++) {
                    String tmpUrl = "https://github.com" + repoUrls.get(i);
                    System.out.println(repoNames.get(i) + ":" + tmpUrl);
                    Request req = new Request(tmpUrl).putExtra("name", repoNames.get(i));
                    resp.addRequest(req);
                }
            }
        } else {
            Map<String, Object> extras = resp.getRequest().getExtras();
            String name = extras.get("name").toString();
            System.out.println("repoName:" + name);
            String shortDesc = parser.single(".repository-meta-content", "allText");
            System.out.println("shortDesc:" + shortDesc);
        }
    }

    public void start() {
        Site site = new Site();
        Spider spider = Spider.builder(this).threadNum(5).site(site)
                .urls("https://github.com/xbynet").build();
        spider.run();
    }

    public static void main(String[] args) {
        new GithubCrawler().start();
    }

    public void startCompleteConfig() {
        String pcUA = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
        // Alternative mobile user agent, shown here for reference.
        String androidUA = "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36";

        Site site = new Site();
        site.setEncoding("UTF-8").setHeader("Referer", "https://github.com/")
            .setRetry(3).setRetrySleep(3000).setSleep(50).setTimeout(30000)
            .setUa(pcUA);

        Request request = new Request("https://github.com/xbynet");
        HttpClientContext ctx = new HttpClientContext();
        BasicCookieStore cookieStore = new BasicCookieStore();
        ctx.setCookieStore(cookieStore);
        request.setAction(new RequestAction() {
            @Override
            public void before(CloseableHttpClient client, HttpUriRequest req) {
                System.out.println("before-haha");
            }

            @Override
            public void after(CloseableHttpClient client, CloseableHttpResponse resp) {
                System.out.println("after-haha");
            }
        }).setCtx(ctx).setEncoding("UTF-8")
            .putExtra("somekey", "I can use in the response by your own")
            .setHeader("User-Agent", pcUA).setMethod(Const.HttpMethod.GET)
            .setPartRequest(null).setEntity(null)
            .setParams("appkeyqqqqqq", "1213131232141")
            .setRetryCount(5).setRetrySleepTime(10000);

        Spider spider = Spider.builder(this).threadNum(5)
            .name("Spider-github-xbynet")
            .defaultDownloader(new DefaultDownloader())
            .fileDownloader(new FileDownloader())
            .httpClientFactory(new HttpClientFactory())
            .ipProvider(null).listener(null).pool(null)
            .scheduler(new DefaultScheduler())
            .shutdownOnComplete(true)
            .site(site).build();
        spider.run();
    }
}
```

Examples:
- Github (GitHub user profile and repository info)
- OSChinaTweets (OSChina tweets)
- Qiushibaike (Qiushibaike jokes)
- Neihanshequ (Neihan Duanzi)
- ZihuRecommend (Zhihu recommendations)

More examples: please see here.
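Feature 3's parted (chunked) crawling is only hinted at by the commented-out calls in the demo above (resp.isPartResponse(), resp.getParentResponse(), resp.addPartRequest(...)). The sketch below is an assumption about how those calls fit together, not verified behaviour: it treats each paginated page as a part request whose response is tied back to a parent response. The ".pagination > a" selector is hypothetical.

```java
// Hypothetical sketch of parted crawling, assuming the semantics of the
// commented-out calls in the demo: addPartRequest() enqueues a chunk that is
// linked back to this response, and chunk responses report isPartResponse() == true.
@Override
public void process(Response resp) {
    JsoupParser parser = resp.html();
    if (!resp.isPartResponse()) {
        // First page: enqueue the remaining pages as parts of this response.
        List<String> pageUrls = parser.list(".pagination > a", "href"); // hypothetical selector
        for (String url : pageUrls) {
            resp.addPartRequest(new Request(url));
        }
    } else {
        // A chunk: its data can be associated with the parent response that spawned it.
        Response parent = resp.getParentResponse();
        System.out.println("part of: " + parent.getRequest().getUrl());
    }
}
```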
Thanks:
- webmagic: this project borrows code from webmagic in many places and its design draws heavily on it. Many thanks.
- xsoup: used as the underlying XPath processor.
- JsonPath: used as the underlying JSONPath processor.
- Jsoup: used as the underlying HTML/XML processor.
- HttpClient: used as the underlying HTTP request library.