在项目中遇到这样一个需求:需要将一段 HTML 转换为纯文本后返回。这类问题可以用正则表达式来解决,
代码如下:
代码如下 | 复制代码 |
/// <summary>
/// Converts an HTML fragment to plain text: style and script elements are removed
/// together with their content, all remaining tags are stripped, every whitespace
/// character is deleted, and leftover non-breaking-space / quote entities are dropped.
/// </summary>
/// <param name="htmlStr">HTML source; may be null or empty.</param>
/// <returns>The text content with no markup and no whitespace; "" for null or empty input.</returns>
public static string Html2Text(string htmlStr)
{
    if (string.IsNullOrEmpty(htmlStr))
    {
        return "";
    }

    const string stylePattern = "<style[^>]*?>[\\s\\S]*?<\\/style>";    // <style> element including its CSS
    const string scriptPattern = "<script[^>]*?>[\\s\\S]*?<\\/script>"; // <script> element including its JS
    const string tagPattern = "<[^>]+>";                                // any remaining tag

    // Tag names are case-insensitive in HTML, so <STYLE>/<SCRIPT> must be matched as well;
    // otherwise only the tags are stripped below and the CSS/JS body leaks into the text.
    string text = Regex.Replace(htmlStr, stylePattern, "", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, scriptPattern, "", RegexOptions.IgnoreCase);
    text = Regex.Replace(text, tagPattern, "");

    // Remove tabs, spaces and line breaks (all whitespace, including spaces between words).
    text = Regex.Replace(text, "\\s+", "");

    // NOTE(review): the published listing had these literals mangled by HTML rendering
    // (they appeared as " " and """); restored here as the entities they most likely were,
    // plus the literal double quote - confirm against the original source if available.
    text = text.Replace("&nbsp;", "");
    text = text.Replace("&quot;", "");
    text = text.Replace("\"", "");

    return text.Trim();
}
|
代码二
以下是引用片段:
-----
代码如下 | 复制代码 |
/// <summary>
/// Strips HTML markup from a string and decodes a small set of character entities.
/// </summary>
/// <param name="Htmlstring">Source text that contains HTML; may be null or empty.</param>
/// <returns>The text with scripts, tags and comment remnants removed; "" for null or empty input.</returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return "";
    }

    // Remove script elements. Singleline lets '.' span line breaks, so multi-line
    // scripts are removed too instead of leaking their code into the result.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove every remaining tag.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove line breaks together with the whitespace that follows them.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove leftovers of HTML comments.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode entities in a single pass. Sequential replaces that handle "&amp;" first
    // turn "&amp;lt;" into "&lt;" and then into "<" (double decoding).
    Htmlstring = Regex.Replace(
        Htmlstring,
        @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        m => DecodeHtmlEntity(m.Groups[1].Value),
        RegexOptions.IgnoreCase);

    // The original listing ended with `Htmlstring.Replace("<", "");` whose result was
    // discarded (strings are immutable), so it had no effect. It is intentionally not
    // "fixed": applying it would delete the '<' characters just decoded from "&lt;".
    return Htmlstring;
}

/// <summary>
/// Maps an entity name or numeric reference (without the surrounding '&amp;' and ';')
/// to its replacement text. Numeric references that are not listed are dropped.
/// </summary>
private static string DecodeHtmlEntity(string entity)
{
    switch (entity.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00a1";
        case "cent":
        case "#162":
            return "\u00a2";
        case "pound":
        case "#163":
            return "\u00a3";
        case "copy":
        case "#169":
            return "\u00a9";
        default:
            return "";
    }
}

// NOTE(review): the rest of this listing is truncated in the source; the surviving fragment is kept verbatim on the next line.
// /**/ ///提取HTML代码中文字的C#函数 public static string StripHTML(string strHtml) @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[" string[]aryRep = string newReg = aryReg[0]; 写一个静态方法移除HTML标签 #endregion #endregion |
代码三
代码如下 | 复制代码 |
public static string StripHTML(string source) { try } |