问题描述
- 求助jsoup取页面之后输出的问题
-
取出的表格数据为空,请问这是string body的问题么...
如果要解决问题应该怎样修改...import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.net.HttpURLConnection; import java.net.URL; import java.util.ArrayList; import java.util.List; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class Main { public static void main(String[] args) throws Throwable { for (int i = 1; i <= 3; i++) { System.out.println(getPrice(i)); } } static List<String> getPrice(int pageNo) throws Throwable { Document doc = Jsoup.parse(getText(pageNo)); Elements trs = doc.select("#ctl00_cphMainFrame_Table1 tr"); List<String> result = new ArrayList<String>(trs.size()); for (int i = 1, l = trs.size(); i < l; i++) { Element tr = trs.get(i); result.add(tr.child(5).text()); } return result; } static String getText(int pageNo) throws Throwable { URL url = new URL("http://www.lnprice.gov.cn/wjjc/jgjc/ReportByDateOfPivot.aspx?PriceBureauMainType_Id=101&YM=201502&DP=28"); HttpURLConnection conn = (HttpURLConnection) url.openConnection(); conn.setRequestMethod("POST"); conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko"); conn.setDoOutput(true); conn.connect(); String body = "ctl00%24cphMainFrame%24ScriptManager1=ctl00%24cphMainFrame%24UpdatePanel1%7Cctl00%24cphMainFrame%24aspnetpager1&ctl00%24cphMainFrame%24ddlYear=2015&ctl00%24cphMainFrame%24ddlMonth=02&ctl00%24cphMainFrame%24ddlTimePoint=28&__EVENTTARGET=ctl00%24cphMainFrame%24aspnetpager1&__LASTFOCUS=&__VIEWSTATE=%2FwEPDwULLTEwNTcyNDc4NjkPZBYCZg9kFgICAQ9kFgQCAQ8WAh4LXyFJdGVtQ291bnQCCxYWZg9kFgJmDxUCAzEwMQzlhpzlia%2Fkuqflk4FkAgEPZBYCZg8VAgMxMDcP5bel5Lia5raI6LS55ZOBZAICD2QWAmYPFQIDMTA4EuW3peS4mueUn%2BS6p%2Bi1hOaWmWQCAw9kFgJmDxUCAzEwORjln47luILlsYXmsJHmnI3liqHku7fmoLxkAgQPZBYCZg8VAgMxMTAY5Yac5p2R5bGF5rCR5pyN5Yqh5Lu35qC8ZAIFD2QWAmYPFQIDMTExDOa2ieWGnOS6p%2BWTgWQCBg9kFgJmDxUCAzEwMhwyMDEz5bm05Lul5YmN5bel5Lia5raI6LS55ZOBZAIHD2QWAmYPFQIDMTAzHzIwMTPlubTku6XliY3lt6XkuJrnlJ%2FkuqfotYTmlplkAggPZBYCZg8VAgMxMDQZMjAxM%2BW5tOS7peWJjeacjeWKoeS7t%2BagvGQCCQ9kFgJmDxUCAzEwNR8yMDEz5bm05Lul5YmN5Yac5Lia55Sf5Lqn6LWE5paZZAIKD2QWAmYPFQIDMTA2GTIwMTPlubTku6XliY3mtonlhpzkuqflk4FkAgMPZBYQAgEPDxYCHgRUZXh0BQzlhpzlia%2Fkuqflk4FkZAIDDw8WBB8BBVw8c3BhbiBzdHlsZT0ibWFyZ2luLWxlZnQ6MjBweDsiICBjbGFzcz0ibXNqZ19jaGF4dW5feHhrMV9iZzAwIiA%2B5oyJ5YiG57G75YWo55yB5p%2Bl6K%2BiPC9zcGFuPh4LTmF2aWdhdGVVcmwFM1JlcG9ydEJ5RGF0ZU9mUGl2b3QuYXNweD9QcmljZUJ1cmVhdU1haW5UeXBlX0lkPTEwMWRkAgQPDxYEHwEFQTxzcGFuIGNsYXNzPSJtc2pnX2NoYXh1bl94eGsxX2JnMTEiID7mjInllYblk4HliIbluILmn6Xor6I8L3NwYW4%2BHwIFNVJlcG9ydEdvb2RzSW5mb0J5Q2l0eS5hc3B4P1ByaWNlQnVyZWF1TWFpblR5cGVfSWQ9MTAxZGQCBQ8PFgQfAQVCPHNwYW4gY2xhc3M9Im1zamdfY2hheHVuX3h4azFfYmcxMSIgPuaMieaXtumXtOWIhuW4guafpeivoiA8L3NwYW4%2BHwIFMFJlcG9ydEluZm9ieVRpbWUuYXNweD9QcmljZUJ1cmVhdU1haW5UeXBlX0lkPTEwMWRkAgcPEA8WBh4NRGF0YVRleHRGaWVsZAUKQ3JlYXRlWWVhch4ORGF0YVZhbHVlRmllbGQFCkNyZWF0ZVllYXIeC18hRGF0YUJvdW5kZ2QQFRAEMjAwMAQyMDAxBDIwMDIEMjAwMwQyMDA0BDIwMDUEMjAwNgQyMDA3BDIwMDgEMjAwOQQyMDEwBDIwMTEEMjAxMgQyMDEzBDIwMTQEMjAxNRUQBDIwMDAEMjAwMQQyMDAyBDIwMDMEMjAwNAQyMDA1BDIwMDYEMjAwNwQyMDA4BDIwMDkEMjAxMAQyMDExBDIwMTIEMjAxMwQyMDE0BDIwMTUUKwMQZ2dnZ2dnZ2dnZ2dnZ2dnZxYBAg9kAgkPEGRkFgECAWQCCw8QDxYCHwVnZBAVBAnor7fpgInmi6kCMDUCMTUCMjUVBAEwAjI4AjI5AjMwFCsDBGdnZ2dkZAIPDxYCHgdWaXNpYmxlZxYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCAgMPFgIfBmcWAmYPZBYCZg9kFgICAQ8PFgYeCFBhZ2VTaXplAhQeEEN1cnJlbnRQYWdlSW5kZXgCAh4LUmVjb3JkY291bnQCL2RkZM%2FO1WQW50DLN7G3eiSyS6q2rewQ&__EVENTVALIDATION=%2FwEWJAKb97l9ArjilMkFApDM2c4FApDMreUCApDMsZgLApDMhT8CkMzp0wgCkMz99gECkMzBrQ4CkMzVwAYCkMz5KQKQzM3MCAL79f%2FVDwL79cOIBAL79devDQL79bvCBQL79Y%2F5AgL79ZOcCwLWm967DgLG9LjWAgLG9LzWAgLG9IDWAgLG9ITWAgLG9IjWAgLG9IzWAgLG9JDWAgLG9NTVAgLG9NjVAgLZ9LTWAgLZ9LjWAgLZ9LzWAgLSx8%2BzDgLMx%2B%2BzDgLMx%2BOzDgLPx4%2BwDgLy%2BZrvCEFw0vATX2wSsTwyj9sMOqdXBRc0&__ASYNCPOST=true&__EVENTARGUMENT=" + pageNo; conn.getOutputStream().write(body.getBytes()); byte[] buff = new byte[4096]; int count; ByteArrayOutputStream out = new ByteArrayOutputStream(4096); InputStream in = conn.getInputStream(); while((count = in.read(buff)) != -1) { out.write(buff, 0, count); } conn.disconnect(); return out.toString("UTF-8"); } }
解决方案
代码没有问题,可能网站数据是实时变动的,。
body内容试试这个参数,这个是刚刚抓到的(如果不行,找个最新的再试试):
String body = "ctl00%24cphMainFrame%24ScriptManager1=ctl00%24cphMainFrame%24UpdatePanel1%7Cctl00%24cphMainFrame%24aspnetpager1&ctl00%24cphMainFrame%24ddlYear=2015&ctl00%24cphMainFrame%24ddlMonth=02&ctl00%24cphMainFrame%24ddlTimePoint=28&__EVENTTARGET=ctl00%24cphMainFrame%24aspnetpager1&__LASTFOCUS=&__VIEWSTATE=%2FwEPDwULLTEwNTcyNDc4NjkPZBYCZg9kFgICAQ9kFgQCAQ8WAh4LXyFJdGVtQ291bnQCCxYWZg9kFgJmDxUCAzEwMQzlhpzlia%2Fkuqflk4FkAgEPZBYCZg8VAgMxMDcP5bel5Lia5raI6LS55ZOBZAICD2QWAmYPFQIDMTA4EuW3peS4mueUn%2BS6p%2Bi1hOaWmWQCAw9kFgJmDxUCAzEwORjln47luILlsYXmsJHmnI3liqHku7fmoLxkAgQPZBYCZg8VAgMxMTAY5Yac5p2R5bGF5rCR5pyN5Yqh5Lu35qC8ZAIFD2QWAmYPFQIDMTExDOa2ieWGnOS6p%2BWTgWQCBg9kFgJmDxUCAzEwMhwyMDEz5bm05Lul5YmN5bel5Lia5raI6LS55ZOBZAIHD2QWAmYPFQIDMTAzHzIwMTPlubTku6XliY3lt6XkuJrnlJ%2FkuqfotYTmlplkAggPZBYCZg8VAgMxMDQZMjAxM%2BW5tOS7peWJjeacjeWKoeS7t%2BagvGQCCQ9kFgJmDxUCAzEwNR8yMDEz5bm05Lul5YmN5Yac5Lia55Sf5Lqn6LWE5paZZAIKD2QWAmYPFQIDMTA2GTIwMTPlubTku6XliY3mtonlhpzkuqflk4FkAgMPZBYQAgEPDxYCHgRUZXh0BQzlhpzlia%2Fkuqflk4FkZAIDDw8WBB8BBVw8c3BhbiBzdHlsZT0ibWFyZ2luLWxlZnQ6MjBweDsiICBjbGFzcz0ibXNqZ19jaGF4dW5feHhrMV9iZzAwIiA%2B5oyJ5YiG57G75YWo55yB5p%2Bl6K%2BiPC9zcGFuPh4LTmF2aWdhdGVVcmwFM1JlcG9ydEJ5RGF0ZU9mUGl2b3QuYXNweD9QcmljZUJ1cmVhdU1haW5UeXBlX0lkPTEwMWRkAgQPDxYEHwEFQTxzcGFuIGNsYXNzPSJtc2pnX2NoYXh1bl94eGsxX2JnMTEiID7mjInllYblk4HliIbluILmn6Xor6I8L3NwYW4%2BHwIFNVJlcG9ydEdvb2RzSW5mb0J5Q2l0eS5hc3B4P1ByaWNlQnVyZWF1TWFpblR5cGVfSWQ9MTAxZGQCBQ8PFgQfAQVCPHNwYW4gY2xhc3M9Im1zamdfY2hheHVuX3h4azFfYmcxMSIgPuaMieaXtumXtOWIhuW4guafpeivoiA8L3NwYW4%2BHwIFMFJlcG9ydEluZm9ieVRpbWUuYXNweD9QcmljZUJ1cmVhdU1haW5UeXBlX0lkPTEwMWRkAgcPEA8WBh4NRGF0YVRleHRGaWVsZAUKQ3JlYXRlWWVhch4ORGF0YVZhbHVlRmllbGQFCkNyZWF0ZVllYXIeC18hRGF0YUJvdW5kZ2QQFRAEMjAwMAQyMDAxBDIwMDIEMjAwMwQyMDA0BDIwMDUEMjAwNgQyMDA3BDIwMDgEMjAwOQQyMDEwBDIwMTEEMjAxMgQyMDEzBDIwMTQEMjAxNRUQBDIwMDAEMjAwMQQyMDAyBDIwMDMEMjAwNAQyMDA1BDIwMDYEMjAwNwQyMDA4BDIwMDkEMjAxMAQyMDExBDIwMTIEMjAxMwQyMDE0BDIwMTUUKwMQZ2dnZ2dnZ2dnZ2dnZ2dnZxYBAg9kAgkPEGRkFgECAWQCCw8QDxYCHwVnZBAVBAnor7fpgInmi6kCMDUCMTUCMjUVBAEwAjI4AjI5AjMwFCsDBGdnZ2dkZAIPDxYCHgdWaXNpYmxlZxYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCAgMPFgIfBmcWAmYPZBYCZg9kFgICAQ8PFgYeCFBhZ2VTaXplAhQeEEN1cnJlbnRQYWdlSW5kZXgCAh4LUmVjb3JkY291bnQCL2RkZMVUAu8HLwRzj1xEKpBi8MSr0fYD&__EVENTVALIDATION=%2FwEWJAKS79jeCgK44pTJBQKQzNnOBQKQzK3lAgKQzLGYCwKQzIU%2FApDM6dMIApDM%2FfYBApDMwa0OApDM1cAGApDM%2BSkCkMzNzAgC%2B%2FX%2F1Q8C%2B%2FXDiAQC%2B%2FXXrw0C%2B%2FW7wgUC%2B%2FWP%2BQIC%2B%2FWTnAsC1pveuw4CxvS41gICxvS81gICxvSA1gICxvSE1gICxvSI1gICxvSM1gICxvSQ1gICxvTU1QICxvTY1QIC2fS01gIC2fS41gIC2fS81gIC0sfPsw4CzMfvsw4CzMfjsw4Cz8ePsA4C8vma7whOBwA2O0BJTn5kLqZv1C98W2UbZQ%3D%3D&__ASYNCPOST=true&&__EVENTARGUMENT=" + pageNo;
解决方案二:
草草写了,测试可用,参考一下:
package com.kukio.jsoup;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Product {
public List<Map<String,String>> getProductInfo(String url) throws IOException{
Document doc = Jsoup.connect(url).get();
Elements ele = doc.select("td[id]");
// System.out.println(ele.toString());
List<Map<String,String>> data = new ArrayList<Map<String,String>>();
for(int i=1;i<21;i++){
//list添加数据
data.add(getFoodInfo(i,ele));
}
return data;
}
private Map<String,String> getFoodInfo(int k,Elements ele){
Map<String,String> info = new HashMap<String,String>();
for(int i=0;i<4;i++){
//map添加数据
int j =0;
for(Element m : ele){
String value = m.attr("id");
if(value.equals("ctl00_cphMainFrame_td"+k+"SecondType")){
info.put("分类", m.text());
}
else if(value.equals("ctl00_cphMainFrame_td"+k+"TypeName")){
info.put("名称", m.text());
}
else if(value.equals("ctl00_cphMainFrame_td"+k+"GoodsTypeName")){
// System.out.println(m.text());
if(j == 0){
info.put("品牌", m.text());
}
else if(i == 01){
info.put("单位", m.text());
}
j++;
}
if(value.equals("ctl00_cphMainFrame_td"+k+"AvevageValue")){
info.put("平均价格", m.text());
}
}
}
return info;
}
}
测试:
package com.kukio.jsoup;
import java.io.IOException;
import java.util.List;
import java.util.Map;
public class Test {
public static void main(String[] args) throws IOException {
String url = "http://www.lnprice.gov.cn/wjjc/jgjc/ReportByDateOfPivot.aspx?PriceBureauMainType_Id=101&YM=201502&DP=28";
Product pro = new Product();
List<Map<String,String>> list = pro.getProductInfo(url);
for(int i=0;i<list.size();i++){
Map<String,String> map = list.get(i);
System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
System.out.println("<<< "+i);
System.out.println("分类: "+map.get("分类"));
System.out.println("名称: "+map.get("名称"));
System.out.println("品牌: "+map.get("品牌"));
System.out.println("单位: "+map.get("单位"));
System.out.println("平均价格: "+map.get("平均价格"));
System.out.println("************************************");
}
}
}
解决方案三:
谷歌浏览器下F12,开发者模式,点击下一页后,查看浏览器Network栏发送的请求头信息里面的表单数据就是你的body参数了。其它浏览器也有开发者模式。