问题描述
package tools.crawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Locale;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;

/**
 * Minimal page downloader built on Apache Commons HttpClient 3.x.
 *
 * <p>Issues a GET request with browser-like headers, asks for a gzip-compressed
 * response, and returns the decoded body as a string.
 */
public class DownLoad {

    /** Line separator appended after every line read from the response body. */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Downloads a sample page and prints its body to standard output.
     *
     * @param args ignored
     * @throws IOException if the request or reading the response fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println(DownLoad.downfromweb(
                "http://tech.sina.com.cn/mobile/n/2013-05-14/17328338983.shtml"));
    }

    /**
     * Fetches {@code url} with a GET request and returns the response body.
     *
     * <p>The HTTP status code is printed to standard output. The connection is
     * released on every path (success, non-200 status, or exception).
     *
     * @param url the absolute URL to download
     * @return the response body decoded with the response charset when the status
     *         is 200; an empty string for any other status
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String downfromweb(String url) throws IOException {
        HttpClient httpclient = new HttpClient();
        GetMethod getMethod = new GetMethod(url);

        // The Host header is intentionally not set by hand: a fixed value such as
        // "localhost" does not match arbitrary target URLs.
        getMethod.setRequestHeader("Connection", "Keep-Alive");
        getMethod.setRequestHeader("Accept", "*/*");
        getMethod.setRequestHeader("From", "goolebot@googlebot.com");
        getMethod.setRequestHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0");
        getMethod.setRequestHeader("Accept-Encoding", "gzip");

        try {
            int statusCode = httpclient.executeMethod(getMethod);
            System.out.println(statusCode);
            if (statusCode == 200) {
                return getBodyAsString(getMethod, getMethod.getResponseCharSet());
            }
            return "";
        } finally {
            // Release here so that non-200 responses and failures do not leak the connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Reads the response body of an executed method line by line, transparently
     * decompressing it when the {@code Content-Encoding} response header mentions gzip.
     *
     * <p>Every line is terminated with {@code "\r\n"} in the result, regardless of
     * the line endings used by the server. The caller remains responsible for
     * releasing the connection.
     *
     * @param getHC   the already executed GET method
     * @param charset the charset name used to decode the body bytes
     * @return the decoded body, or an empty string if the response has no body
     * @throws IOException if reading or decompressing the body fails
     */
    private static String getBodyAsString(GetMethod getHC, String charset) throws IOException {
        String contentEncoding = "";
        if (getHC.getResponseHeader("Content-Encoding") != null) {
            contentEncoding = getHC.getResponseHeader("Content-Encoding").getValue();
        }

        InputStream body = getHC.getResponseBodyAsStream();
        if (body == null) {
            return "";
        }

        // Locale.ROOT keeps the match independent of the JVM's default locale.
        boolean gzipped = contentEncoding != null
                && contentEncoding.toLowerCase(Locale.ROOT).contains("gzip");

        InputStream in = body;
        BufferedReader reader = null;
        try {
            if (gzipped) {
                // Wrap the raw stream in a gzip decompression stream.
                in = new GZIPInputStream(body);
            }
            // Decode using the charset supplied by the caller.
            reader = new BufferedReader(new InputStreamReader(in, charset));

            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
                sb.append(LINE_SEPARATOR);
            }
            return sb.toString();
        } finally {
            // Closing the outermost wrapper closes the whole stream chain.
            if (reader != null) {
                reader.close();
            } else {
                in.close();
            }
        }
    }
}
解决方案
解决方案二:
楼主解决了吗,我也遇到这个问题了,返回403
解决方案三:
403应该是权限不够引起的吧,看看你访问的页面是否需要什么特殊权限?
解决方案四:
引用2楼yyy269954107的回复:
403应该是权限不够引起的吧,看看你访问的页面是否需要什么特殊权限?
有的时候可以进去,有的时候403