也就是从百度知道抓取一些对公司有用的问题,以便及时了解相关信息。
没有注释 ,太简单了,一看就懂哈。
只是写得比较烂的地方是:没有直接用抓取结果发邮件,而是先用多线程生成文件,再通过邮件发送文件里的内容。
UI没有,直接干!
主要涉及一个转码,可以直接搜索
#!/usr/bin/python # coding: UTF-8 import os,sys,time,commands import urllib import urllib2 import string from bs4 import BeautifulSoup import threading import smtplib from email.mime.text import MIMEText mail_host = 'smtp.x.x.com' mail_user = 'xx@xx.xx' mail_pwd = 'xxxx' keywordList = [] f1= open("ZhidaoKeyword",'r') for line in f1: line = line.strip('\n') keywordList.append(line) f1.close() def send_email( content, mailto, get_sub ): print 'Setting MIMEText' msg = MIMEText( content.encode('utf8'), _subtype = 'html', _charset = 'utf8') msg['From'] = mail_user msg['Subject'] = u'%s' % get_sub msg['To'] = ",".join( mailto ) try: print 'connecting ', mail_host s = smtplib.SMTP_SSL( mail_host, 465 ) #s.connect(mail_host) print 'login to mail_host' s.login(mail_user, mail_pwd ) print 'send email' s.sendmail(mail_user, mailto, msg.as_string()) print 'close the connection between the mail server' s.close() except Exception as e: print 'Exception: ', e class SearchZhidao(threading.Thread): def __init__(self,keywordList): threading.Thread.__init__(self) self.keywordList = keywordList def SearchZhidao(self): for keyword in self.keywordList: try: str = keyword.encode('gb2312') str_dic = {'word':str} encode_keyword = urllib.urlencode(str_dic) url = "http://zhidao.baidu.com/browse?" 
+ encode_keyword + "&pn=0&cid=0&lm=8960" htmlpage = urllib2.urlopen(url).read() htmlpage = unicode(htmlpage, "gb2312").encode("utf8") soup = BeautifulSoup(htmlpage) for result_li in soup.findAll("li", {"class": "question-item"}): question_time = result_li.find("div", {"class": "question-time"}) q_time = question_time.get_text().split() if len(q_time) > 1 and u'小时' in q_time[1]: html_output = "<tr><td>" + keyword + "</td><td>" a_click = result_li.find("a") html_output += "<a class='question-title' href='http://zhidao.baidu.com" + a_click.get("href") + "' target='_blank'>" html_output += a_click.renderContents() + "</a></td>" html_output += "<td>" + question_time.get_text() + "</td></tr>" file_object.write(html_output) else: pass except Exception as e: print e,keyword continue def run(self): self.SearchZhidao() if __name__ == "__main__": print "start the programe...." SearchZhidaoThreads = [] file_object = open('zhidao_html_mail.html', 'w') file_object.write("<!DOCTYPE html><html><head><meta charset='UTF-8'><title>Seo</title></head><body><table>") for i in range(20): t = SearchZhidao(keywordList[((len(keywordList)+19)/20) * i:((len(keywordList)+19)/20) * (i+1)]) SearchZhidaoThreads.append(t) for i in range(len(SearchZhidaoThreads)): SearchZhidaoThreads[i].start() for i in range(len(SearchZhidaoThreads)): SearchZhidaoThreads[i].join() file_object.write("</table></body></html>") file_object.close() print "finished this job!" html_mail = "" f2= open("zhidao_html_mail.html",'r') for line in f2: html_mail += line f2.close() time_title = curTime = time.strftime("%Y-%m-%d %X", time.localtime(time.time())) to_list = [ 'xxx@xx.xx', 'xx@x.xxx' ] send_email( html_mail, to_list, '百度知道最近问题'+ time_title )
邮件输出样子:
时间: 2024-09-18 23:36:06