问题描述
在做一个抓取工具,使用htmlagilitypack解析时不会出现该错误,换用mshtml后开了多线程内存占用很快就跳到了1G+,随后就程序中断,是不是我在用mshtml解析html后需要主动释放什么资源,求指点voidGrabPaperByVIPWebClient(objecto){VIP.Model.Phasephase=oasVIP.Model.Phase;if(phase==null)return;VIPWebClientwc=newVIPWebClient();if(IsUrl(phase.Url)){if(!grabbing)return;grabjournalcount++;this.BeginInvoke(newEventHandler(SetStatus),"访问页面-"+grabjournalcount+"/"+journalcount);stringstrResponse="";try{byte[]bResponse=wc.DownloadData(phase.Url);strResponse=Encoding.UTF8.GetString(bResponse);//HtmlAgilityPack.HtmlDocumentdocument=newHtmlAgilityPack.HtmlDocument();//document.LoadHtml(strResponse);IHTMLDocument2document=newHTMLDocumentClass();document.designMode="on";document.write(strResponse);document.close();GetPaperData(document,"抓取文章",phase.ID.ToString());document=null;}catch(WebExceptionwebEx){Add2Log("错误","从地址("+phase.Url+")抓取时发生异常,详细信息:"+webEx.Message);if(webEx.Message.Contains("超时")||webEx.Message.Contains("403")){overtimephaseList.Add(phase);Add2Log("信息","从地址("+phase.Url+")抓取时发生错误,将该任务加入重试队列!");}}catch(System.AccessViolationExceptionavEx){Add2Log("错误","将从地址("+phase.Url+")抓取的数据传入mshtml时发生异常,详细信息:"+avEx.Message);}}else{Add2Log("警告",phase.Journal.JournalName+phase.PhaseString+"地址("+phase.Url+")不正确,无法进行抓取操作!");}this.BeginInvoke(newEventHandler(PaperThreaExit),null);}
解决方案
解决方案二:
voidGetPaperData(IHTMLDocument2document,stringtype,stringphaseid){if(document==null)return;VIP.BLL.Paperpp=newVIP.BLL.Paper();List<VIP.Model.Paper>papers=pp.GetPaperList(phaseid);VIPWebClientwc=newVIPWebClient();VIP.Model.Configtag=GetConfig("期刊抓取","文章列表容器tag");VIP.Model.Configattr=GetConfig("期刊抓取","文章列表容器识别参数及值");VIP.Model.Configwtag=GetConfig("期刊抓取","文章容器tag");VIP.Model.Configttag=GetConfig("期刊抓取","文章标题容器tag");VIP.Model.Configtattr=GetConfig("期刊抓取","文章标题容器识别参数及值");VIP.Model.Configatag=GetConfig("期刊抓取","文章作者容器tag");VIP.Model.Configaattr=GetConfig("期刊抓取","文章作者容器识别参数及值");VIP.Model.Configxpath=GetConfig("期刊抓取","文章容器XPath");VIP.Model.Configtxpath=GetConfig("期刊抓取","文章标题容器XPath");VIP.Model.Configaxpath=GetConfig("期刊抓取","文章作者容器XPath");//VIP.Model.Configqattr=GetConfig("期刊抓取","期刊期次容器识别参数及值");if(axpath!=null&&!String.IsNullOrEmpty(axpath.ConfigValue)&&txpath!=null&&!String.IsNullOrEmpty(txpath.ConfigValue)&&xpath!=null&&!String.IsNullOrEmpty(xpath.ConfigValue)&&tag!=null&&!String.IsNullOrEmpty(tag.ConfigValue)&&attr!=null&&!String.IsNullOrEmpty(attr.ConfigValue)&&wtag!=null&&!String.IsNullOrEmpty(wtag.ConfigValue)&&ttag!=null&&!String.IsNullOrEmpty(ttag.ConfigValue)&&tattr!=null&&!String.IsNullOrEmpty(tattr.ConfigValue)&&atag!=null&&!String.IsNullOrEmpty(atag.ConfigValue)&&aattr!=null&&!String.IsNullOrEmpty(aattr.ConfigValue)){string[]attri=attr.ConfigValue.Replace("class","className").Split(newchar[]{';',';'},StringSplitOptions.RemoveEmptyEntries);string[]tattri=tattr.ConfigValue.Replace("class","className").Split(newchar[]{';',';'},StringSplitOptions.RemoveEmptyEntries);string[]aattri=aattr.ConfigValue.Replace("class","className").Split(newchar[]{';',';'},StringSplitOptions.RemoveEmptyEntries);IHTMLElementCollectionhec=((IHTMLElement2)document.body).getElementsByTagName(tag.ConfigValue);VIP.BLL.Phasep=newVIP.BLL.Phase();VIP.Model.Phasephase=p.GetModel(Convert.ToInt32(phaseid));if(phase==null)return;foreach(IHTMLElementheinhec){booltarget=true;foreach(stringatinattri){string[]a=at.Split(newchar[]{':',':'},StringSplitOptions.RemoveEmptyEntries);if(a.Length==1){target=target&&he.innerText.Contains(a[0]);}elseif(a.Length==2){objectvalue=he.getAttribute(a[0]);if(value==null)value="";target=target&&value.ToString()==a[1];}}if(target){IHTMLElementCollectionpapersList=((IHTMLElement2)he).getElementsByTagName(wtag.ConfigValue);foreach(IHTMLElementpaperinpapersList){stringauthorstr="";IHTMLElementCollectionauthors=((IHTMLElement2)paper).getElementsByTagName(atag.ConfigValue);foreach(IHTMLElementauthorinauthors){boolatarget=true;foreach(stringatinaattri){string[]a=at.Split(newchar[]{':',':'},StringSplitOptions.RemoveEmptyEntries);if(a.Length==1){atarget=atarget&&author.innerText.Contains(a[0]);}elseif(a.Length==2){objectvalue=author.getAttribute(a[0]);if(value==null)value="";atarget=atarget&&value.ToString()==a[1];}}if(atarget){authorstr=author.innerText;}}authors=null;IHTMLElementCollectiontitles=((IHTMLElement2)paper).getElementsByTagName(ttag.ConfigValue);foreach(IHTMLElementtitleintitles){boolttarget=true;foreach(stringatintattri){string[]a=at.Split(newchar[]{':',':'},StringSplitOptions.RemoveEmptyEntries);if(a.Length==1){ttarget=ttarget&&title.innerText.Contains(a[0]);}elseif(a.Length==2){objectvalue=title.getAttribute(a[0]);if(value==null)value="";ttarget=ttarget&&value.ToString()==a[1];}}if(ttarget){stringurl=title.getAttribute("href").ToString().Replace("about:","");url=checkUrl(url);VIP.Model.Papermp=null;foreach(VIP.Model.Paperpppinpapers){if(ppp.Url==url){mp=ppp;break;}}if(mp==null)mp=newVIP.Model.Paper();stringtitlestr=title.innerText.Replace("'","");if(mp.Title!=titlestr||mp.JounalID!=Convert.ToInt32(phaseid)||mp.Author!=authorstr||String.IsNullOrEmpty(mp.KeyWords)){if(IsUrl(url)&&String.IsNullOrEmpty(mp.KeyWords)){byte[]bResponse=wc.DownloadData(url);stringstrResponse=Encoding.UTF8.GetString(bResponse);HTMLDocumentClassdoc=newHTMLDocumentClass();doc.designMode="on";doc.IHTMLDocument2_write(strResponse);doc.close();mp=GetPaperDetailData(doc,mp);doc=null;}mp.Title=titlestr;mp.Url=url;mp.JounalID=Convert.ToInt32(phaseid);mp.Author=authorstr;if(mp.ID==0){papers.Add(mp);}//mp=pp.Save(mp);}grabcount++;this.BeginInvoke(newEventHandler(SetCount),grabcount.ToString());}}titles=null;}}}pp.Save(papers);hec=null;}}
解决方案三:
Marshal.ReleaseComObject
解决方案四:
一直的想法就是,直接用开源组件获取HTML源代码(如遇AJAX,就用时间以及模拟鼠标键盘的动作,获取完整的HTML代码),1、假设这个HTML代码是在某个周期内不会进行大幅变动2、然后对某种标签,统计他的个数,计算位置,然后获取标签内的值
解决方案五:
解决方案六:
mshtml是线程安全的吗?
解决方案七:
偷别人的数据,还不如直接解释字符串快