问题描述
- 关于 Python 采集的问题。
为什么采集之后变成了下面这样?采集 URL:https://www.google.com.hk/?gws_rd=ssl#safe=strict&q=1
<!doctype html>
Google(function(){window.google={kEI:'QBvHVMODHIbx8gWir4KACQ',kEXPI:'4011559,4017578,4020346,4020562,4020726,4021587,4021598,4021965,4025828,4025891,4026005,4026109,4026111,4026330,4026376,4028127,4028129,4028398,4028468,4028490,4028508,8300096,8300111,8500393,8500852,8501118,10200083,10200905',authuser:0,kSID:'QBvHVMODHIbx8gWir4KACQ'};google.kHL='zh-HK';})();(function(){google.lc=[];google.li=0;google.getEI=function(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||google.kEI};google.https=function(){return"https:"==window.location.protocol};google.ml=function(){};google.time=function(){return(new Date).getTime()};google.log=function(a,b,d,e,k){var c=new Image,h=google.lc,f=google.li,g="",l=google.ls||"";c.onerror=c.onload=c.onabort=function(){delete h[f]};h[f]=c;d||-1!=b.search("&ei=")||(e=google.getEI(e),g="&ei="+e,e!=google.kEI&&(g+="&lei="+google.kEI));a=d||"/"+(k||"gen_204")+"?atyp=i&ct="+a+"&cad="+b+g+l+"&zx="+google.time();/^http:/i.test(a)&&google.https()?(google.ml(Error("a"),!1,{src:a,glmm:1}),delete h[f]):(c.src=a,google.li=f+1)};google.y={};google.x=function(a,b){google.y[a.id]=[a,b];return!1};google.load=function(a,b,d){google.x({id:a+m++},function(){google.load(a,b,d)})};var m=0;})();google.kCSI={};var _gjwl=location;function _gjuc(){var a=_gjwl.href.indexOf("#");if(0<=a&&(a=_gjwl.href.substring(a),0<a.indexOf("&q=")||0<=a.indexOf("#q="))&&(a=a.substring(1),-1==a.indexOf("#"))){for(var d=0;d<a.length;){var b=d;"&"==a.charAt(b)&&++b;var c=a.indexOf("&",b);-1==c&&(c=a.length);b=a.substring(b,c);if(0==b.indexOf("fp="))a=a.substring(0,d)+a.substring(c,a.length),c=d;else if("cad=h"==b)return 0;d=c}_gjwl.href="/search?"+a+"&cad=h";return 1}return 0}<br>
汉字完全没有采集下来……采集 gfsoso(199897.com)也是这样。
解决方案
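Google 的很多内容都是通过 Ajax 异步请求加载的,所以你拿到的只是服务器最初返回的页面内容(初始 HTML 和内联脚本),其中并不包含搜索结果。
另外,你的 URL 中查询参数位于 # 之后(#safe=strict&q=1),这部分属于 URL 片段,不会随 HTTP 请求发送给服务器;搜索结果是浏览器中的 JavaScript 读取该片段后,再异步请求并插入页面的。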
针对这种情况,可以考虑用 Selenium 的 WebDriver 驱动真实浏览器加载页面,等页面中的 JavaScript 执行完毕、搜索结果渲染出来之后,再抓取页面内容。