XPath提取内容
// 从文档中任意位置选取匹配的节点（不限层级）
/ 从当前节点（最外层为根节点）逐层向下寻找
提取文本内容:/text()
提取属性内容 : /@XXXX
常规匹配
#-*-coding:utf8-*- from lxml import etree html = ''' <!DOCTYPE html> <html> <head lang="en"> <meta charset="UTF-8"> <title>测试-常规用法</title> </head> <body> <div id="content"> <ul id="useful"> <li>这是第一条信息</li> <li>这是第二条信息</li> <li>这是第三条信息</li> </ul> <ul id="useless"> <li>不需要的信息1</li> <li>不需要的信息2</li> <li>不需要的信息3</li> </ul> <div id="url"> <a href="http://jikexueyuan.com">极客学院</a> <a href="http://jikexueyuan.com/course/" title="极客学院课程库">点我打开课程库</a> </div> </div> </body> </html> ''' selector = etree.HTML(html) #提取文本 content = selector.xpath('//ul[@id="useful"]/li/text()') for each in content: print each #提取属性 link = selector.xpath('//a/@href') for each in link: print each title = selector.xpath('//a/@title') print title[0]
特殊匹配
#-*-coding:utf8-*- from lxml import etree html1 = ''' <!DOCTYPE html> <html> <head lang="en"> <meta charset="UTF-8"> <title></title> </head> <body> <div id="test-1">需要的内容1</div> <div id="test-2">需要的内容2</div> <div id="testfault">需要的内容3</div> </body> </html> ''' html2 = ''' <!DOCTYPE html> <html> <head lang="en"> <meta charset="UTF-8"> <title></title> </head> <body> <div id="test3"> 我左青龙, <span id="tiger"> 右白虎, <ul>上朱雀, <li>下玄武。</li> </ul> 老牛在当中, </span> 龙头在胸口。 </div> </body> </html> ''' selector = etree.HTML(html1) content = selector.xpath('//div[starts-with(@id,"test")]/text()') for each in content: print each # selector = etree.HTML(html2) # content_1 = selector.xpath('//div[@id="test3"]/text()') # for each in content_1: # print each # # # data = selector.xpath('//div[@id="test3"]')[0] # info = data.xpath('string(.)') # content_2 = info.replace('\n','').replace(' ','') # print content_2
单线程与多线程耗时比较
#-*-coding:utf8-*- from multiprocessing.dummy import Pool as ThreadPool import requests import time def getsource(url): html = requests.get(url) urls = [] for i in range(1,21): newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i) urls.append(newpage) time1 = time.time() for i in urls: print i getsource(i) time2 = time.time() print u'单线程耗时:' + str(time2-time1) pool = ThreadPool(4) time3 = time.time() results = pool.map(getsource, urls) pool.close() pool.join() time4 = time.time() print u'并行耗时:' + str(time4-time3)
多线程爬取百度贴吧
#-*-coding:utf8-*- from lxml import etree from multiprocessing.dummy import Pool as ThreadPool import requests import json import sys reload(sys) sys.setdefaultencoding('utf-8') '''重新运行之前请删除content.txt,因为文件操作使用追加方式,会导致内容太多。''' def towrite(contentdict): f.writelines(u'回帖时间:' + str(contentdict['topic_reply_time']) + '\n') f.writelines(u'回帖内容:' + unicode(contentdict['topic_reply_content']) + '\n') f.writelines(u'回帖人:' + contentdict['user_name'] + '\n\n') def spider(url): html = requests.get(url) selector = etree.HTML(html.text) content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]') print content_field item = {} for each in content_field: reply_info = json.loads(each.xpath('@data-field')[0].replace('"', '')) author = reply_info['author']['user_name'] content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content clearfix"]/text()')[0] reply_time = reply_info['content']['date'] print content print reply_time print author item['user_name'] = author item['topic_reply_content'] = content item['topic_reply_time'] = reply_time towrite(item) if __name__ == '__main__': pool = ThreadPool(4) f = open('content.txt', 'a') page = [] for i in range(1, 21): newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i) page.append(newpage) results = pool.map(spider, page) pool.close() pool.join() f.close()
1、&quot; 在html中表示英语中的双引号,如
<input id="txt" type="text" value="&quot;好感动&quot;">
你看下效果就知道了,主要为了区分语法上的引号
本文出自 “点滴积累” 博客,请务必保留此出处http://tianxingzhe.blog.51cto.com/3390077/1726602
时间: 2024-08-31 00:32:08