woody是一款基于Java的HTML解析/提取器,用法非常类似 webmagic,是对其抽取模块的完全重写。
功能:
- 多种结果数据类型(String、char、byte、short、int、long、double、float、String[]、Set、List、Date)
- 支持用户自定义脚本处理函数(目前支持 JavaScript 函数配置处理)
- 支持 css、xpath 内核替换
- 支持 filter 功能
- 对 css、xpath 内核对象的缓存

一个完整的例子:
publicclassOsChinaBlog{publicstaticvoidmain(String[]args)throwsException{Documentdoc=Jsoup.connect("https://www.oschina.net/news/43879/webmagic-0-3-0").timeout(60000).userAgent("Mozilla/5.0(Macintosh;IntelMacOSX10.8;rv:23.0)Gecko/20100101Firefox/23.0").get();Stringhtml=doc.html();OsChinaBlogModelmodel=AnnotationExtractor.me().process(html,OsChinaBlogModel.class);System.out.println(model.toJson());}publicstaticclassOsChinaBlogModelextendsModel{publicOsChinaBlogModel(){//usetoreflect}@Inject@ComboExtract(value={@ExtractBy(value="h1.OSCTitle",type=ExprType.CSS),@ExtractBy(value="//title/text()",type=ExprType.XPATH)},op=OP.OR)publicStringtitle;@Inject@ExtractBy(value="div.PubDatea[href~=https://my\\.oschina\\.net/]",type=ExprType.CSS)publicStringauthor;@Inject@ExtractBy(value="发布于.\\s*(\\d+年\\d+月\\d+日)",type=ExprType.REGEX)publicDatepublishDate;@Inject@ComboExtract(value={@ExtractBy(value="div.PubDate",type=ExprType.CSS,setting=@Setting(outerHtml=true)),@ExtractBy(value="(\\d+)评",type=ExprType.REGEX)},op=OP.AND)publicintcommentNum;@Inject@ExtractBy(value="span#p_favor_count",type=ExprType.CSS,setting=@Setting(function=@Function(value="replace",args={"+",""})))publicintcollectNum;@Inject@ComboExtract(value={@ExtractBy(value="div[id=userComments]",type=ExprType.CSS,setting=@Setting(outerHtml=true)),@ExtractBy(value="div.TextContent",type=ExprType.CSS)},op=OP.AND,multi=true)publicListcommentContents;@Inject@ExtractBy(value="div[id=toolbar_wrapper]",setting=@Setting(fliters={"b","span"}),type=ExprType.CSS,impl=Document.class)publicStringweibo;}}
评论