多线程用mshtml解析html时，内存暴涨，程序中断，如何处理？

问题描述

在做一个抓取工具，使用htmlagilitypack解析时不会出现该错误，换用mshtml后开了多线程内存占用很快就跳到了1G+，随后就程序中断，是不是我在用mshtml解析html后需要主动释放什么资源，求指点voidGrabPaperByVIPWebClient(objecto){VIP.Model.Phasephase=oasVIP.Model.Phase;if(phase==null)return;VIPWebClientwc=newVIPWebClient();if(IsUrl(phase.Url)){if(!grabbing)return;grabjournalcount++;this.BeginInvoke(newEventHandler(SetStatus),"访问页面-"+grabjournalcount+"/"+journalcount);stringstrResponse="";try{byte[]bResponse=wc.DownloadData(phase.Url);strResponse=Encoding.UTF8.GetString(bResponse);//HtmlAgilityPack.HtmlDocumentdocument=newHtmlAgilityPack.HtmlDocument();//document.LoadHtml(strResponse);IHTMLDocument2document=newHTMLDocumentClass();document.designMode="on";document.write(strResponse);document.close();GetPaperData(document,"抓取文章",phase.ID.ToString());document=null;}catch(WebExceptionwebEx){Add2Log("错误","从地址（"+phase.Url+"）抓取时发生异常，详细信息："+webEx.Message);if(webEx.Message.Contains("超时")||webEx.Message.Contains("403")){overtimephaseList.Add(phase);Add2Log("信息","从地址（"+phase.Url+"）抓取时发生错误，将该任务加入重试队列！");}}catch(System.AccessViolationExceptionavEx){Add2Log("错误","将从地址（"+phase.Url+"）抓取的数据传入mshtml时发生异常，详细信息："+avEx.Message);}}else{Add2Log("警告",phase.Journal.JournalName+phase.PhaseString+"地址（"+phase.Url+"）不正确，无法进行抓取操作！");}this.BeginInvoke(newEventHandler(PaperThreaExit),null);}

解决方案

解决方案二：
voidGetPaperData(IHTMLDocument2document,stringtype,stringphaseid){if(document==null)return;VIP.BLL.Paperpp=newVIP.BLL.Paper();List<VIP.Model.Paper>papers=pp.GetPaperList(phaseid);VIPWebClientwc=newVIPWebClient();VIP.Model.Configtag=GetConfig("期刊抓取","文章列表容器tag");VIP.Model.Configattr=GetConfig("期刊抓取","文章列表容器识别参数及值");VIP.Model.Configwtag=GetConfig("期刊抓取","文章容器tag");VIP.Model.Configttag=GetConfig("期刊抓取","文章标题容器tag");VIP.Model.Configtattr=GetConfig("期刊抓取","文章标题容器识别参数及值");VIP.Model.Configatag=GetConfig("期刊抓取","文章作者容器tag");VIP.Model.Configaattr=GetConfig("期刊抓取","文章作者容器识别参数及值");VIP.Model.Configxpath=GetConfig("期刊抓取","文章容器XPath");VIP.Model.Configtxpath=GetConfig("期刊抓取","文章标题容器XPath");VIP.Model.Configaxpath=GetConfig("期刊抓取","文章作者容器XPath");//VIP.Model.Configqattr=GetConfig("期刊抓取","期刊期次容器识别参数及值");if(axpath!=null&&!String.IsNullOrEmpty(axpath.ConfigValue)&&txpath!=null&&!String.IsNullOrEmpty(txpath.ConfigValue)&&xpath!=null&&!String.IsNullOrEmpty(xpath.ConfigValue)&&tag!=null&&!String.IsNullOrEmpty(tag.ConfigValue)&&attr!=null&&!String.IsNullOrEmpty(attr.ConfigValue)&&wtag!=null&&!String.IsNullOrEmpty(wtag.ConfigValue)&&ttag!=null&&!String.IsNullOrEmpty(ttag.ConfigValue)&&tattr!=null&&!String.IsNullOrEmpty(tattr.ConfigValue)&&atag!=null&&!String.IsNullOrEmpty(atag.ConfigValue)&&aattr!=null&&!String.IsNullOrEmpty(aattr.ConfigValue)){string[]attri=attr.ConfigValue.Replace("class","className").Split(newchar[]{';','；'},StringSplitOptions.RemoveEmptyEntries);string[]tattri=tattr.ConfigValue.Replace("class","className").Split(newchar[]{';','；'},StringSplitOptions.RemoveEmptyEntries);string[]aattri=aattr.ConfigValue.Replace("class","className").Split(newchar[]{';','；'},StringSplitOptions.RemoveEmptyEntries);IHTMLElementCollectionhec=((IHTMLElement2)document.body).getElementsByTagName(tag.ConfigValue);VIP.BLL.Phasep=newVIP.BLL.Phase();VIP.Model.Phasephase=p.GetModel(Convert.ToInt32(phaseid));if(phase==null)return;foreach(IHTMLElementheinhec){booltarget=true;foreach(stringatinattri){string[]a=at.Split(newchar[]{':','：'},StringSplitOptions.RemoveEmptyEntries);if(a.Length==1){target=target&&he.innerText.Contains(a[0]);}elseif(a.Length==2){objectvalue=he.getAttribute(a[0]);if(value==null)value="";target=target&&value.ToString()==a[1];}}if(target){IHTMLElementCollectionpapersList=((IHTMLElement2)he).getElementsByTagName(wtag.ConfigValue);foreach(IHTMLElementpaperinpapersList){stringauthorstr="";IHTMLElementCollectionauthors=((IHTMLElement2)paper).getElementsByTagName(atag.ConfigValue);foreach(IHTMLElementauthorinauthors){boolatarget=true;foreach(stringatinaattri){string[]a=at.Split(newchar[]{':','：'},StringSplitOptions.RemoveEmptyEntries);if(a.Length==1){atarget=atarget&&author.innerText.Contains(a[0]);}elseif(a.Length==2){objectvalue=author.getAttribute(a[0]);if(value==null)value="";atarget=atarget&&value.ToString()==a[1];}}if(atarget){authorstr=author.innerText;}}authors=null;IHTMLElementCollectiontitles=((IHTMLElement2)paper).getElementsByTagName(ttag.ConfigValue);foreach(IHTMLElementtitleintitles){boolttarget=true;foreach(stringatintattri){string[]a=at.Split(newchar[]{':','：'},StringSplitOptions.RemoveEmptyEntries);if(a.Length==1){ttarget=ttarget&&title.innerText.Contains(a[0]);}elseif(a.Length==2){objectvalue=title.getAttribute(a[0]);if(value==null)value="";ttarget=ttarget&&value.ToString()==a[1];}}if(ttarget){stringurl=title.getAttribute("href").ToString().Replace("about:","");url=checkUrl(url);VIP.Model.Papermp=null;foreach(VIP.Model.Paperpppinpapers){if(ppp.Url==url){mp=ppp;break;}}if(mp==null)mp=newVIP.Model.Paper();stringtitlestr=title.innerText.Replace("'","");if(mp.Title!=titlestr||mp.JounalID!=Convert.ToInt32(phaseid)||mp.Author!=authorstr||String.IsNullOrEmpty(mp.KeyWords)){if(IsUrl(url)&&String.IsNullOrEmpty(mp.KeyWords)){byte[]bResponse=wc.DownloadData(url);stringstrResponse=Encoding.UTF8.GetString(bResponse);HTMLDocumentClassdoc=newHTMLDocumentClass();doc.designMode="on";doc.IHTMLDocument2_write(strResponse);doc.close();mp=GetPaperDetailData(doc,mp);doc=null;}mp.Title=titlestr;mp.Url=url;mp.JounalID=Convert.ToInt32(phaseid);mp.Author=authorstr;if(mp.ID==0){papers.Add(mp);}//mp=pp.Save(mp);}grabcount++;this.BeginInvoke(newEventHandler(SetCount),grabcount.ToString());}}titles=null;}}}pp.Save(papers);hec=null;}}

解决方案三：
Marshal.ReleaseComObject
解决方案四：
一直的想法就是，直接用开源组件获取HTML源代码（如遇AJAX，就用时间以及模拟鼠标键盘的动作，获取完整的HTML代码），1、假设这个HTML代码是在某个周期内不会进行大幅变动2、然后对某种标签，统计他的个数，计算位置，然后获取标签内的值
解决方案五：

解决方案六：
mshtml是线程安全的吗？
解决方案七：
偷别人的数据，还不如直接解释字符串快

时间： 2024-10-31 06:22:35

多线程用mshtml解析html时，内存暴涨，程序中断，如何处理？

问题描述

解决方案

多线程用mshtml解析html时，内存暴涨，程序中断，如何处理？的相关文章

C#菜鸟，将大约六七兆的数据进行加密输出时内存暴涨，出现outofmemoryException,求大神解答!

java解析xml文件多个线程同时访问时内存溢出

记一次 Node.js 应用内存暴涨分析

redis-Redis2.8.17 内存暴涨怎么解决？

c++-用new申请了一个结构体数组，delete时内存错误

outofmemory-Java程序中Jsoup解析大量html内存溢出，资源没有回收，内存一直增长

jsoup解析网页时“www”变成“m”的问题

java-当解析 XML 时出现的错误

2 pdfbox pdf-PDFBox解析PDF时，这种编码格式的解析不了unijis-ucs2-hw-h