问题描述
packagecalf.study.htmlparser;importorg.htmlparser.Node;importorg.htmlparser.NodeFilter;importorg.htmlparser.Parser;importorg.htmlparser.filters.HasAttributeFilter;importorg.htmlparser.filters.TagNameFilter;importorg.htmlparser.util.NodeIterator;importorg.htmlparser.util.NodeList;publicclassTestHtmlParser{publicstaticvoidmain(String[]args)throwsException{Parserp=newParser();p.setURL("http://127.0.0.1:8080/love/20130801/44.html");p.setEncoding("UTF-8");NodeFiltertitleFilter=newTagNameFilter("title");NodeFilterfilterID=newHasAttributeFilter("id");p.parse(titleFilter);NodeListnodeList=p.extractAllNodesThatMatch(titleFilter);if(nodeList!=null&&nodeList.size()>0){Nodenode=nodeList.elementAt(0);Stringtitle=node.toPlainTextString();System.out.println("网页标题:"+title);}else{System.out.println("无法匹配网页标题!");}//p.reset();System.out.println("*****************************");System.out.println(p.getURL());System.out.println(p.getEncoding());}}有上面一段代码,使用htmlparser解析网页。想获取网页title但是得不到想要的结果。运行结果如下:无法匹配网页标题!*****************************http://127.0.0.1:8080/love/20130801/44.htmlUTF-8谁知道是什么原因?
解决方案
解决方案二:
importorg.htmlparser.Parser;importorg.htmlparser.visitors.HtmlPage;publicclassTestHtmlParser{publicstaticvoidmain(String[]args)throwsException{Parserp=newParser();p.setURL("http://www.sina.com.cn/");p.setEncoding("UTF-8");HtmlPagepage=newHtmlPage(p);p.visitAllNodesWith(page);Stringtitle=page.getTitle();System.out.println(title);}}
打印:新浪首页