问题描述
// Question (translated from the original post):
// Is my token design acceptable? Textbook lexers for programming languages
// also emit the brackets as tokens, but that seemed pointless to me, so my XML
// lexer only outputs node values and never the "<", "</" or ">" markers. It
// also returns a whole token list rather than one token at a time. Does this
// still count as lexical analysis?
//
// The listing below was originally split across the question and the first
// follow-up post of the thread; it is shown here as one compilable unit.
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

// Kinds of tokens the lexer can produce. Markup delimiters themselves are
// never emitted; only the name or content they enclose is.
enum TokenType
{
    VERSION,   // content of a <? ... ?> declaration (declared, but not emitted by gettoken)
    STARTTAG,  // element name following '<'
    ENDTAG,    // element name following '</'
    COMMENT,   // content between <!-- and -->
    NAME,      // attribute name
    VALUE,     // attribute value, quotes removed
    TEXT,      // character data between tags, entity references resolved
    CDATA      // content between <![CDATA[ and ]]>
}

// A single lexical token: its text and its kind.
struct token
{
    public string Str;
    public TokenType Type;
}

// States of the tokenizer's finite state machine.
enum State
{
    BEGIAN,            // between constructs; skips whitespace
    SATRTTAG,          // inside a start-tag name
    SATRTLB,           // just consumed '<'
    ENDTAG,            // inside an end-tag name (after '</')
    TEXT,              // inside character data
    NAME,              // inside (or waiting for) an attribute name
    VALUE,             // unused by gettoken
    CDATA,             // inside a CDATA section
    EQUALSIGN,         // consumed '=', waiting for the opening quote
    COMMENT,           // inside <!-- ... -->
    VERSION,           // inside <? ... ?>
    EQUAL,             // unused by gettoken
    DOUBLEQUOTEVALUE,  // inside a double-quoted attribute value
    SIGNLEQUOTEVALUE,  // inside a single-quoted attribute value
    ENDVALUE,          // just closed an attribute value
    ERROR              // unrecoverable lexical error
}

/// <summary>
/// Hand-written state-machine tokenizer for a subset of XML.
/// </summary>
class XmlAnalysis
{
    // Opening marker of a CDATA section as it appears right after "<!".
    private const string CDataOpen = "[CDATA[";

    // Longest entity tail recognized by GetReference ("apos;" / "quot;").
    private const int MaxReferenceLength = 5;

    private string mXmlText;
    private Dictionary<string, string> mVersion;
    private bool mSyntax;

    public XmlAnalysis()
    {
        mXmlText = string.Empty;
        mVersion = new Dictionary<string, string>();
        mSyntax = true;
    }

    // NOTE(review): despite its name this returns the mSyntax flag, which is
    // only ever set to true in the constructor.
    public bool IsEmpty()
    {
        return mSyntax;
    }

    // Stores the document text. gettoken does not read this field; it works on
    // its own argument.
    public void Readxml(string xml)
    {
        mXmlText = xml;
    }

    /// <summary>
    /// Splits <paramref name="XmlText"/> into tokens.
    /// </summary>
    /// <param name="XmlText">XML text to tokenize. Null or empty yields an empty list.</param>
    /// <returns>
    /// The tokens in document order, or null when a lexical error is detected.
    /// Diagnostic lines are written to the console while scanning.
    /// </returns>
    public List<token> gettoken(string XmlText)
    {
        List<token> ListToken = new List<token>();
        if (string.IsNullOrEmpty(XmlText))
        {
            return ListToken;
        }

        int index = 0;
        State state = State.BEGIAN;
        StringBuilder sb = new StringBuilder();
        bool nameStarted = false; // true once the current attribute name has its first letter

        while (index < XmlText.Length)
        {
            char c = XmlText[index];
            switch (state)
            {
                case State.BEGIAN:
                    if (c == '<')
                    {
                        state = State.SATRTLB;
                    }
                    else if (c != ' ' && c != '\n' && c != '\r' && c != '\t')
                    {
                        // Re-dispatch the same character in TEXT so the first
                        // character of the run (or a leading '&') is not lost.
                        state = State.TEXT;
                        continue;
                    }
                    break;

                case State.SATRTLB:
                    switch (c)
                    {
                        case '/':
                            state = State.ENDTAG;
                            break;
                        case '?':
                            state = State.VERSION;
                            break;
                        case '!':
                            if (index + 2 < XmlText.Length && XmlText[index + 1] == '-' && XmlText[index + 2] == '-')
                            {
                                state = State.COMMENT;
                                index += 2;
                            }
                            else if (string.CompareOrdinal(XmlText, index + 1, CDataOpen, 0, CDataOpen.Length) == 0)
                            {
                                state = State.CDATA;
                                // Skip the opener so it does not end up in the token text.
                                index += CDataOpen.Length;
                            }
                            else
                            {
                                state = State.ERROR;
                            }
                            break;
                        default:
                            if (char.IsLetter(c))
                            {
                                state = State.SATRTTAG;
                                sb.Append(c);
                            }
                            else
                            {
                                Console.WriteLine("<后有不能识别的字符");
                                state = State.ERROR;
                            }
                            break;
                    }
                    break;

                case State.SATRTTAG:
                    if (char.IsLetterOrDigit(c) || c == '_' || c == '-')
                    {
                        sb.Append(c);
                    }
                    else
                    {
                        switch (c)
                        {
                            case ' ':
                                Console.WriteLine(sb);
                                AddToken(ListToken, sb, TokenType.STARTTAG);
                                nameStarted = false;
                                state = State.NAME;
                                break;
                            case '>':
                                Console.WriteLine(sb);
                                AddToken(ListToken, sb, TokenType.STARTTAG);
                                state = State.TEXT;
                                break;
                            default:
                                Console.WriteLine("标志元素存在不能识别的字符");
                                state = State.ERROR;
                                break;
                        }
                        sb.Length = 0;
                    }
                    break;

                case State.TEXT:
                    if (c == '<')
                    {
                        if (sb.Length > 0)
                        {
                            Console.WriteLine(sb);
                            AddToken(ListToken, sb, TokenType.TEXT);
                        }
                        state = State.SATRTLB;
                    }
                    else if (c == '&')
                    {
                        // Only the next few characters can form an entity, so
                        // avoid copying the whole remainder of the document.
                        int remaining = XmlText.Length - index - 1;
                        string tail = XmlText.Substring(index + 1, Math.Min(MaxReferenceLength, remaining));
                        int num = 0;
                        char Reference = GetReference(tail, ref num);
                        if (Reference != '\0')
                        {
                            sb.Append(Reference);
                            index += num;
                        }
                        else
                        {
                            sb.Append(c);
                        }
                    }
                    else
                    {
                        sb.Append(c);
                    }
                    break;

                case State.ENDTAG:
                    if (c == '>')
                    {
                        Console.WriteLine(sb);
                        AddToken(ListToken, sb, TokenType.ENDTAG);
                        state = State.BEGIAN;
                    }
                    else if (char.IsLetterOrDigit(c) || c == '_' || c == '-')
                    {
                        sb.Append(c);
                    }
                    else
                    {
                        Console.WriteLine("结束文本出现");
                        state = State.ERROR;
                    }
                    break;

                case State.NAME:
                    if (!nameStarted && char.IsLetter(c))
                    {
                        nameStarted = true;
                        sb.Append(c);
                    }
                    else if (nameStarted && (char.IsLetterOrDigit(c) || c == '-' || c == '_'))
                    {
                        sb.Append(c);
                    }
                    else if (c == ' ')
                    {
                        // Spaces before the name or before '=' are skipped.
                    }
                    else if (c == '=' && nameStarted)
                    {
                        Console.WriteLine(sb);
                        AddToken(ListToken, sb, TokenType.NAME);
                        // Reset so the next attribute must again start with a letter.
                        nameStarted = false;
                        state = State.EQUALSIGN;
                    }
                    else if (c == '>' && !nameStarted)
                    {
                        // Trailing space before '>' (e.g. "<a >"): the tag simply ends.
                        state = State.TEXT;
                    }
                    else
                    {
                        Console.WriteLine("属性名出错");
                        state = State.ERROR;
                    }
                    break;

                case State.EQUALSIGN:
                    if (c == ' ')
                    {
                        break;
                    }
                    else if (c == '"')
                    {
                        state = State.DOUBLEQUOTEVALUE;
                    }
                    else if (c == '\'')
                    {
                        state = State.SIGNLEQUOTEVALUE;
                    }
                    else
                    {
                        Console.WriteLine("属性出错");
                        // An unquoted value is a lexical error; stop instead of scanning on.
                        state = State.ERROR;
                    }
                    break;

                case State.DOUBLEQUOTEVALUE:
                    if (c == '"')
                    {
                        Console.WriteLine(sb);
                        AddToken(ListToken, sb, TokenType.VALUE);
                        state = State.ENDVALUE;
                    }
                    else if (!char.IsControl(c))
                    {
                        sb.Append(c);
                    }
                    else
                    {
                        Console.WriteLine("属性值出错");
                        state = State.ERROR;
                    }
                    break;

                case State.ENDVALUE:
                    if (c == '>')
                    {
                        state = State.TEXT;
                    }
                    else if (c == ' ')
                    {
                        state = State.NAME;
                    }
                    else if (!char.IsControl(c))
                    {
                        // NOTE(review): kept from the original. Stray characters
                        // after a value (including the '/' of "/>") are buffered
                        // and end up in the next token; self-closing tags are
                        // not supported by this lexer.
                        sb.Append(c);
                    }
                    else
                    {
                        Console.WriteLine("属性后面出错");
                        state = State.ERROR;
                    }
                    break;

                case State.SIGNLEQUOTEVALUE:
                    if (c == '\'')
                    {
                        Console.WriteLine(sb);
                        AddToken(ListToken, sb, TokenType.VALUE);
                        state = State.ENDVALUE;
                    }
                    else if (!char.IsControl(c))
                    {
                        sb.Append(c);
                    }
                    else
                    {
                        Console.WriteLine("单引号后出错");
                        state = State.ERROR;
                    }
                    break;

                case State.VERSION:
                    // Bounds check first: the input may end right after '?'.
                    if (c == '?' && index + 1 < XmlText.Length && XmlText[index + 1] == '>')
                    {
                        // The declaration is only echoed; no VERSION token is produced.
                        Console.WriteLine(sb);
                        sb.Length = 0;
                        index += 1;
                        state = State.BEGIAN;
                    }
                    else
                    {
                        sb.Append(c);
                    }
                    break;

                case State.CDATA:
                    if (index + 2 < XmlText.Length && c == ']' && XmlText[index + 1] == ']' && XmlText[index + 2] == '>')
                    {
                        AddToken(ListToken, sb, TokenType.CDATA);
                        state = State.BEGIAN;
                        index += 2;
                    }
                    else
                    {
                        sb.Append(c);
                    }
                    break;

                case State.COMMENT:
                    if (c == '-' && index + 2 < XmlText.Length && XmlText[index + 1] == '-' && XmlText[index + 2] == '>')
                    {
                        Console.WriteLine(sb);
                        AddToken(ListToken, sb, TokenType.COMMENT);
                        index += 2;
                        state = State.BEGIAN;
                    }
                    else if (!char.IsControl(c) || char.IsWhiteSpace(c))
                    {
                        // Line breaks and tabs are allowed so multi-line comments work.
                        sb.Append(c);
                    }
                    else
                    {
                        Console.WriteLine("注释出错");
                        state = State.ERROR;
                    }
                    break;

                case State.ERROR:
                    Console.WriteLine("error");
                    return null;

                default:
                    Console.WriteLine("d");
                    state = State.ERROR;
                    break;
            }
            index++;
        }

        // An error raised by the very last character must also yield null.
        if (state == State.ERROR)
        {
            Console.WriteLine("error");
            return null;
        }

        // Text left over at end of input is echoed but, as in the original
        // design, not turned into a token.
        if (state == State.TEXT && sb.Length > 0)
        {
            Console.WriteLine(sb);
            sb.Length = 0;
        }
        return ListToken;
    }

    /// <summary>
    /// Resolves one of the five predefined XML entities. <paramref name="Str"/>
    /// is the text immediately after the ampersand.
    /// </summary>
    /// <param name="Str">Text following the ampersand, e.g. "lt;rest".</param>
    /// <param name="Index">
    /// Set to the number of characters the entity occupies after the ampersand
    /// (including the semicolon) when one is recognized; left untouched otherwise.
    /// </param>
    /// <returns>The referenced character, or '\0' when no entity is recognized.</returns>
    public char GetReference(string Str, ref int Index)
    {
        if (Str.StartsWith("lt;", StringComparison.Ordinal))
        {
            Index = 3;
            return '<';
        }
        else if (Str.StartsWith("gt;", StringComparison.Ordinal))
        {
            Index = 3;
            return '>';
        }
        else if (Str.StartsWith("amp;", StringComparison.Ordinal))
        {
            Index = 4;
            return '&';
        }
        else if (Str.StartsWith("apos;", StringComparison.Ordinal))
        {
            Index = 5;
            return '\'';
        }
        else if (Str.StartsWith("quot;", StringComparison.Ordinal))
        {
            Index = 5;
            return '"';
        }
        else
        {
            return '\0';
        }
    }

    // Appends a token built from the buffer's current content and clears the buffer.
    private static void AddToken(List<token> tokens, StringBuilder buffer, TokenType type)
    {
        token item = new token();
        item.Str = buffer.ToString();
        item.Type = type;
        tokens.Add(item);
        buffer.Length = 0;
    }
}
解决方案三:
如果是写作业或者是练习,自己写词法分析是个不错的锻炼。如果是工作上用,要考虑用一些开源的项目,像GPLex等等。
解决方案四:
用递归方式应该比switch case方式可读性更好一些,也更符合xml的结构。至于</>之类,我建议保留,因为有可能一个xml的文档通过一些运算后,再存为xml文档,如果按你的做法,你还需要根据这些节点再构造一次xml。
解决方案五:
用有限状态机去解析xml,网络上有很多例子啊,可以去参考下,看看你做的和别人做的有什么区别!googler
解决方案六:
引用2楼gomoku的回复:
如果是写作业或者是练习,自己写词法分析是个不错的锻炼。如果是工作上用,要考虑用一些开源的项目,像GPLex等等。
练习就在纠结token的设计思路对不对。
解决方案七:
引用3楼hdt的回复:
用递归方式应该比switch case方式可读性更好一些,也更符合xml的结构。至于</>之类,我建议保留,因为有可能一个xml的文档通过一些运算后,再存为xml文档,如果按你的做法,你还需要根据这些节点再构造一次xml。
递归就不能算词法分析了吧,必须用词法分析啊,而且如果保留</>这些,岂不是字符要回退.