问题描述
本程序想实现的是把网页保存到本地没有乱码,并能成功的提取网页的标题和关键字。若把“注释一”下面的几行代码注释掉,“注释二”下的while语句内容就能正常运行,从中解析其他网页内容;若“注释一”内容不注释,“注释二”下的while语句内容不能正常运行,不能从本网页中解析url地址。publicvoidgetWebByUrl(StringstrUrl,Stringcharset,StringfileIndex){try{//if(charset==null||"".equals(charset))charset="utf-8";System.out.println("Gettingwebbyurl:"+strUrl);addReport("Gettingwebbyurl:"+strUrl+"n");URLurl=newURL(strUrl);bytebytes[]=newbyte[1024*1000];intindex=0;URLConnectionconn=url.openConnection();conn.setDoOutput(true);InputStreamis=null;is=url.openStream();//注释一//intcount=is.read(bytes,index,1024*100);////while(count!=-1){////index+=count;//count=is.read(bytes,index,1);//}StringfilePath=fPath+"/web"+fileIndex+".htm";//PrintWriterpw=null;FileOutputStreamfos=newFileOutputStream(filePath);//OutputStreamWriterwriter=newOutputStreamWriter(fos);//pw=newPrintWriter(writer);BufferedReaderbReader=newBufferedReader(newInputStreamReader(is));StringBuffersb=newStringBuffer();StringrLine=null;Stringtmp_rLine=null;//注释二while((rLine=bReader.readLine())!=null){tmp_rLine=rLine;intstr_len=tmp_rLine.length();if(str_len>0){sb.append("n"+tmp_rLine);//pw.println(tmp_rLine);//pw.flush();if(deepUrls.get(strUrl)<webDepth)getUrlByString(tmp_rLine,strUrl);}}tmp_rLine=null;fos.write(bytes,0,index);//is.close();//pw.close();fos.close();Stringcontext=sb.toString();Stringtt=getTitle(context);Stringt=getKeywords(context);System.out.println("Getwebsuccessfully!"+strUrl);System.out.println("title:"+tt);System.out.println("keywords:"+t);addReport("Getwebsuccessfully!"+strUrl+"n");addWebSuccessed();}catch(Exceptione){System.out.println("Getwebfailed!"+strUrl);addReport("Getwebfailed!"+strUrl+"n");addWebFailed();}}publicvoidgetUrlByString(StringinputArgs,StringstrUrl){StringtmpStr=inputArgs;StringregUrl="(?<=(href=)["]?[']?)[http://][^\s"'\?]*("+myDomain+")[^\s"'>]*";Patternp=Pattern.compile(regUrl,Pattern.CASE_INSENSITIVE);Matcherm=p.matcher(tmpStr);booleanblnp=m.find();//inti=0;while(blnp==true){if(!allUrls.containsKey(m.group(0))){System.out.pr
intln("Findanewurl,depth:"+(deepUrls.get(strUrl)+1)+""+m.group(0));addReport("Findanewurl,depth:"+(deepUrls.get(strUrl)+1)+""+m.group(0)+"n");arrUrls.add(m.group(0));arrUrl.add(m.group(0));allUrls.put(m.group(0),getIntWebIndex());deepUrls.put(m.group(0),(deepUrls.get(strUrl)+1));}tmpStr=tmpStr.substring(m.end(),tmpStr.length());m=p.matcher(tmpStr);blnp=m.find();}}
解决方案一:
解决方案二:
有人帮忙给看看吗
解决方案三:
该回复于2011-04-26 09:08:31被版主删除