问题描述
package webutil.htmlutil;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;

/**
 * Example: extracting the plain text of a web page with htmlparser.
 */
public class GetT {

    /** Encoding the sample pages are assumed to use (the value the original code hard-coded). */
    private static final String PAGE_ENCODING = "GBK";

    /** Separator appended after every downloaded line and every extracted node. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Page downloaded by {@link #testHtml()}. */
    private static final String SAMPLE_URL =
            "http://www.51voa.com/VOA_Special_English/researchers-brain-stimulation-cocaine-addiction-49619.html";

    /** Zero-based index of the table that {@link #test5(String)} prints. */
    private static final int SAMPLE_TABLE_INDEX = 11;

    /**
     * Downloads {@link #SAMPLE_URL}, extracts its plain text and prints it to standard output.
     * Any failure is reported on standard error via the stack trace; nothing is rethrown.
     */
    public static void testHtml() {
        try {
            String html = download(SAMPLE_URL, PAGE_ENCODING);
            System.out.println("====================");
            String testText = extractText(html);
            System.out.println(testText);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole response body of {@code address} as text.
     *
     * @param address  URL to fetch
     * @param encoding charset name used to decode the response bytes
     * @return the body, with every line terminated by {@link #LINE_SEPARATOR}
     * @throws IOException if the connection fails or the encoding is unsupported
     */
    private static String download(String address, String encoding) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
        BufferedReader reader = null;
        try {
            connection.connect();
            // Decode explicitly: relying on the platform charset garbles non-ASCII pages.
            reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), encoding));
            StringBuilder page = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append(LINE_SEPARATOR);
            }
            return page.toString();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Best-effort close; a failure here must not mask the primary exception.
                }
            }
            connection.disconnect();
        }
    }

    /**
     * Extracts the plain text of every table, paragraph and div node in the given HTML.
     * The number of matched nodes and the resulting text are also printed to standard output.
     *
     * <p>NOTE(review): each matching node contributes its own text, so content of a matching
     * node nested inside another matching node (e.g. a paragraph inside a div) presumably
     * appears more than once in the result - confirm against the htmlparser documentation.
     *
     * @param inputHtml already-decoded HTML source; must not be {@code null}
     * @return the plain text of each matched node, each followed by {@link #LINE_SEPARATOR}
     * @throws Exception if htmlparser fails to parse the input
     */
    public static String extractText(String inputHtml) throws Exception {
        if (inputHtml == null) {
            throw new NullPointerException("inputHtml must not be null");
        }

        // The input is already a decoded String, so it is handed to the parser unchanged;
        // re-encoding it through the platform charset would corrupt non-ASCII text.
        Parser parser = Parser.createParser(inputHtml, PAGE_ENCODING);

        // Match every node that is a table, a paragraph or a div.
        NodeFilter[] filters = new NodeFilter[] {
            new NodeClassFilter(TableTag.class),
            new NodeClassFilter(ParagraphTag.class),
            new NodeClassFilter(Div.class)
        };
        NodeFilter filter = new OrFilter(filters);
        NodeList list = parser.extractAllNodesThatMatch(filter);
        System.out.println(list.size());

        StringBuilder text = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
            Node node = list.elementAt(i);
            text.append(node.toPlainTextString()).append(LINE_SEPARATOR);
        }

        String result = text.toString();
        System.out.println(result);
        return result;
    }

    /**
     * Parses a resource and prints the HTML of its table at index {@link #SAMPLE_TABLE_INDEX}.
     *
     * @param resource file path or URL to parse
     * @throws IllegalStateException if the resource contains too few table tags
     * @throws Exception if htmlparser cannot open or parse the resource
     */
    public static void test5(String resource) throws Exception {
        Parser myParser = new Parser(resource);
        // Set the encoding used to decode the resource.
        myParser.setEncoding(PAGE_ENCODING);

        NodeFilter filter = new TagNameFilter("table");
        NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
        if (nodeList.size() <= SAMPLE_TABLE_INDEX) {
            throw new IllegalStateException("Expected at least " + (SAMPLE_TABLE_INDEX + 1)
                    + " table tags in " + resource + " but found " + nodeList.size());
        }

        TableTag tabletag = (TableTag) nodeList.elementAt(SAMPLE_TABLE_INDEX);
        System.out.println(tabletag.toHtml());
        System.out.println("==============");
    }

    /**
     * Runs the download-and-extract example.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; {@link #testHtml()} handles its own failures
     */
    public static void main(String[] args) throws Exception {
        testHtml();
    }
}
解决方案
解决方案二:
用 HasChildFilter 可以解决吗?
解决方案三:
求"用htmlparser提取html中的纯文本,获得<p></p>之间的内容"完整代码