一个简单的C语言词法分析与语法分析器

词法分析
可识别内容:
标识符：id
数字：num
关键字：int,char,if,else,while,do,for
标号：, ， . ， ;
算术运算符号：=，+，-，*，/，&，!，|，&&，||
关系运算符：<，<=，>=，>，==，!=
注释：//

内码定义：
单个符号，如{，+，*，> 等，均使用其ascii码做内码，占双或多个字节的符号（包括保留字，标号，数字，运算符等）为其取名如下：
Enum { END=0,INT,CHAR,IF,ELSE,WHILE=5,
DO,FOR,ARGAND,ARGOR,NUM=10,
ID,LESSEQUAL,EQUAL,GREATEQUAL,NOTEQUAL=15 };
其中NUM代表数字，ID代表标识符.

测试程序1-1的词法分析结果如下：

123 {
11 X
61 =
10 12
43 +
43 +
11 b
47 /
10 13
45 -
10 5
42 *
10 4
42 *
10 9
59 ;
11 Y
61 =

10 4
42 *
10 1024
59 ;
3 if
40 (
11 X
14 >=
11 Y
41 )
123 {
3 if 40 (
11 i
13 ==
11 i

41 )
123 {
11 X
61 =
11 Y
125 }
125 }
59 ;
59 ;
59 ;
59 ;
5 while
40 (
11 X
60 <

11 Y
41 )
123 {
11 X
61 =
11 X
43 +
10 1
59 ;
125 }
125 }

语法分析
C语言子集，可支持
语句块，语句，条件语句，While循环语句，赋值语句，基本算术表达式等。例如：
{
// Comment Supported : This is only a Test ^_^

X = 12 + b / 13 - 5 * 4 * 9; // A AssignmentStatement
Y = 4 * 1024;
if( X >= Y){
if( i == i){   // This is nested if Statement
X=Y;
}
}
;;;;      // This is Null Statement
while( X < Y){   // This is while Statement
X = X +1;
}
}
测试程序1-1

支持错误检测，如将上面例子中X = 12 + b / 13 - 5 * 4 * 9;
故意修改为：X = 12 ++ b / 13 - 5 * 4 * 9; 则会出现如下错误提示，指示了出错行数和行内偏移位置：

规则如下：
<StatementBlock> ::= '{'<StatementSequence>'}'
<StatementSequence> ::= {<NullStatement>|<CommonStatement>|<VariantStatement>}

<AssignmentStatement> ::= ID=<Expression>
<ConditionStatement> ::= if(<Condition> <StatementBlock>
<LoopWhileStatement> ::= while(<Condition> <StatementBlock>

<Condition> ::= <Expression><RelationOperator><Expression>
<Expression> ::= <Item>{+<Item>|-<Item>}
<Item> ::= <Factor>{*<Factor>|/<Factor>}
<Factor> ::= ID|NUM|(<Expression>)
<RelationOperator> ::= <|<=|>=|>|==|!=

//非终结符的英文定义
void StatementBlock(); //语句块
void StatementSequence(); //语句串

// XxxxxStatement() 为三类语句
void NullStatement(); //空语句--仅仅含有一个;号
void CommonStatement(); //语句
void VariantStatement(); //变种语句--包括 if(){},while{},他们都不以;结尾

// 下面的属于CommonStatement
void AssignmentStatement(); //赋值语句

// 下面两种属于VariantStatement
void ConditionStatement(); //条件语句
void LoopWhileStatement(); //while循环语句

void Condition();   //条件
void Expression();   //表达式
void Item();    //项
void Factor();    //因子
void RelationOperator();  //关系运算符

不能支持的主要方面：函数调用的识别，逗号表达式，for循环，switch语句。

词法分析：

// LexAly.cpp :C子集词法分析程序
/**//*

支持内容:
标识符:id
关键字: int,char,if,else,while,do,for
标号: ,, .,  ;
算术运算符号: =,+,-,&,!,|,&&,||

全局字符串:
instr 记录待解析的字符串
string 存贮当前被解析到的ID

接口:
gettoken();

Sample:
输入:
instr =
for(i=0;i<10;i++){
    j=i+10;
    printf("%d",j);
}
输出:
for
(
i
……
}

注意:
要记得处理程序中未显示出来的符号，如空白(' ')，回车(' ')，值表符(' ')
*/

#include "stdafx.h"
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "Constant.h"

extern void grammar_check();

// 注意：这里累计数目(最大值)不能大于32 (10 0000B)TOKEN
//enum {END=0,INT=1,CHAR,IF,ELSE,WHILE,DO,FOR,ARGAND,ARGOR,NUM,ID} ;

char index[][20] = ...{
    ...{"END OF File"},    /**//* 0 END*/
    ...{"int"},    /**//* 1 INT*/
    ...{"char"},    /**//* 2 CHAR*/
    ...{"if"},    /**//* 3 IF*/
    ...{"else"},    /**//* 4 ELSE*/
    ...{"while"},    /**//* 5 WHILE*/
    ...{"do"},    /**//* 6 DO*/
    ...{"for"},    /**//* 7 FOR*/
    ...{"&&"},    /**//* 8 ARGAND*/
    ...{"||"},    /**//* 9 ARGOR*/
    ...{""},    /**//* 10 NUM */
    ...{""},    /**//* 11 ID */
    ...{"<="},    /**//* 12 LESSEQUAL */
    ...{"=="},    /**//* 13 EQUAL */
    ...{">="},    /**//* 14 GREATEQUAL */
    ...{"!="},    /**//* 15 NOTEQUAL */
    ...{""}    /**//* 16 ID */
};

char input[10000] = ...{0};

char * instr = input;
char * const start_of_instr=input;

//string 包含gettoken最新得到的id等串
//gym包含 gettoken得到的内容的代号
//current_line 包含当前行号
char string[MAX_INDENT];
int  sym;
int current_line = 1;
int start_pos_of_current_line;

char *strstart;    //用于辅助识别num,id

int gettoken();
int _gettoken();

void error(char *cur);
char* getlinestring(int line,char *in_buf);
int nextline();
int getline();
int getcurrentpos();

int nextline() ...{    return ++current_line; }
int getline() ...{    return current_line; }
int getcurrentpos() ...{ return (int)instr; }

char* getlinestring(int line,char *in_buf)
...{
    char * t = input;
    int i = 1;

    while(*t != 0 &&  i < line )...{
        if( *t==' ' ) i++;
        t++;
    }

    int len = 0;
    while ( *t != ' ' )...{
        in_buf[len] = *t;
        len++;
        t++;
    }
    in_buf[len] = 0;
    return in_buf;

}

void error(char *cur)
...{
    printf("Spell Error found at line %d  ",getline());
    exit(0);

}

//语法分析
int main_grammar(char *filename)
...{
    int i;

    FILE *f;
    if(!(f=fopen(filename,"r")))...{
        printf("Fail to open source file %s! ",filename);
        exit(0);
    }
    int k=0;
    char c;
    while((c=fgetc(f))!=EOF)
    ...{
        input[k]=c;
        k++;
    }
    input[k] = 0;

    //打印出程序
    printf("%s ",start_of_instr);

    //开始语法检查
    grammar_check();

    printf("Success! ");
    return 0;
}

// 词法分析
int main_spell(char *filename)
...{
    int i;

    FILE *f;
    if(!(f=fopen(filename,"r")))...{
        printf("Fail to open source file %s! ",filename);
        exit(0);
    }



    int k=0;
    char c;
    while((c=fgetc(f))!=EOF)
    ...{
        input[k]=c;
        k++;
    }

    input[k] = 0;

    printf("%s ",start_of_instr);

    while((i=gettoken())!=END)
    ...{
        if(i == ID)...{
            printf("%d %s ",i,string);
            continue;
        }
        if(i == NUM)...{
            printf("%d %s ",i,string);
            continue;
        }

        if(i<20)...{
            printf("%d %s ",i,index[i]);
        }else...{
            printf("%d %c ",i,i);
        }
    }

    return 0;
}

int gettoken()
...{
    int i= (sym = _gettoken());

#if 0
        if(i == ID)...{
            printf("%s",string);
        }
        if(i == NUM)...{
            printf("%s",string);
        }

        if(i<20)...{
            printf("%s",index[i]);
        }else...{
            printf("%c",i);
        }
#endif
    return sym;
}

int _gettoken()
...{
    char *cp = instr;

    for(;;)...{
        if( *instr == 0)
            return END;

        /**//*
        if( 可能读入的字符 > 当前可用缓冲区大小 )
            扩展缓冲区
        */
//int,char,if,else,while,do,for
        switch ( *instr )
        ...{
        case 'i':
            if( instr[1] == 'f' && notda(instr[2]) )
            ...{
                instr+=2;return IF;
            }
            if( instr[1] == 'n' && instr[2] == 't' && notda(instr[3]) )
            ...{
                instr+=3; return INT;
            }
            // not a keyword. but an id.
            strstart = instr;
            instr++;
            goto id_label;
        case 'c':
            if( instr[1] == 'h' && instr[2] == 'a' && instr[3] == 'r' && notda(instr[4]) )
            ...{instr+=4;return CHAR; }
            strstart = instr;
            instr++;
            goto id_label;
            break;
        case 'e':
            if( instr[1] == 'l' && instr[2] == 's' && instr[3] == 'e' && notda(instr[4]) )
            ...{instr+=4;return ELSE; }
            strstart = instr;
            instr++;
            goto id_label;
            break;
        case 'w':
            if( instr[1] == 'h' && instr[2] == 'i' && instr[3] == 'l' && instr[4] == 'e' && notda(instr[5]) )
            ...{instr+=5;return WHILE; }
            strstart = instr;
            instr++;
            goto id_label;
        case 'd':
            if( instr[1] == 'o' && notda(instr[4]) )
            ...{instr+=2;return DO; }
            strstart = instr;
            instr++;
            goto id_label;
        case 'f':
            if( instr[1] == 'o' && instr[2] == 'r' && notda(instr[3]) )
            ...{instr+=3;return FOR; }
            strstart = instr;
            instr++;
            goto id_label;
// deal with IDs.
// EXCLUDE:i,c,d,e,w,f
case 'a': ;  case 'b': ;
case 'g': ;  case 'h': ;
case 'j': ;  case 'k': ;  case 'l': ;
case 'm': ;  case 'n': ;  case 'o': ;
case 'p': ;  case 'q': ;  case 'r': ;
case 's': ;  case 't': ;  case 'u': ;
case 'v': ;                  case 'x': ;
case 'y': ;  case 'z': ;
case 'A': ;  case 'B': ;
case 'C': ;  case 'D': ;  case 'E': ;
case 'F': ;  case 'G': ;  case 'H': ;
case 'I': ;  case 'J': ;  case 'K': ;
case 'L': ;  case 'M': ;  case 'N': ;
case 'O': ;  case 'P': ;  case 'Q': ;
case 'R': ;  case 'S': ;  case 'T': ;
case 'U': ;  case 'V': ;  case 'W': ;
case 'X': ;  case 'Y': ;  case 'Z': ;
            strstart = instr;
            instr++;
            goto id_label;

case '0': ;
case '1': ;  case '2': ;  case '3': ;
case '4': ;  case '5': ;  case '6': ;
case '7': ;  case '8': ;  case '9': ;
            strstart = instr;
            instr++;
            goto num_label;

        case '{':
            instr++;
            return '{';
        case '}':
            instr++;
            return '}';
        case '(':
            instr++;
            return '(';
        case ')':
            instr++;
            return ')';
        case '+':
            instr++;
            return '+';
        case '-':
            instr++;
            return '-';
        case '*':
            instr++;
            return '*';
        case '/':
            if( instr[1] == '/' )...{    // ‘//’形式的注释
                instr += 2;
                while( *(instr) != 10 && *(instr) != 0 )
                    instr ++;
//                instr ++;
            }else...{    // 除号'/'
                instr++;
                return '/';
            }
            break;
        case '=':
            if( instr[1] == '=') ...{instr+=2; return EQUAL; }
            else ...{instr++; return '=';}
            break;
        case '<':
            if( instr[1] == '=') ...{instr+=2; return LESSEQUAL; }
            else ...{instr++; return '<';}
            break;
        case '>':
            if( instr[1] == '=') ...{instr+=2; return GREATEQUAL; }
            else ...{instr++; return '>';}
            break;
        case '!':
            if( instr[1] == '=') ...{instr+=2; return NOTEQUAL; }
            else ...{instr++; return '!';}
            break;

        case '&':
            if( instr[1] == '&') ...{instr+=2; return ARGAND; }
            if( instr[1] == '&' && (isid(instr[2]) || isspace(instr[2])) )...{ instr++; return '&'; }
            error(instr);
            break;

        case '|':
            if( instr[1] == '|') ...{instr+=2; return ARGAND; }
            if( instr[1] == '|' && (isid(instr[2]) || isspace(instr[2])) )...{ instr++; return '|'; }
            error(instr);
            break;

        case ';':
            instr++;
            return ';';
        case ' ':
            //printf("new line (%d) ",getline());
            nextline();
            instr++;
            start_pos_of_current_line = (int)instr;
            break;
        default:
            instr++;
            break;
id_label:
        while( isid(*instr) )
            instr++;
        strncpy(string,strstart,instr-strstart);
        string[instr-strstart]=0;
        return ID;
num_label:
        while( isdigit(*instr) )
            instr++;

//        if(isalpha(*(instr+1))
//            error(instr);  让语法分析来做吧~

        strncpy(string,strstart,instr-strstart);
        string[instr-strstart]=0;
        return NUM;
        }

    }
}

int main(int argc, char* argv[])
...{
    if(argc <= 1 || argc >=4) ...{
        printf("Usage:>LexAly [g|s] [filename] ");
        exit(0);
    }

    if(argc == 3 )...{
        argv[1][0] == 'g'? main_grammar(argv[2]): main_spell(argv[2]);
    }else if(argc == 2)...{
        argv[1][0] == 'g'? main_grammar("source2.txt") : main_spell("source2.txt");
        ;
    }
    return 0;
}

// grammar.cpp :C子集语法分析程序
/**//*
C语言子集，可支持
语句块，语句，条件语句，While循环语句，赋值语句，基本算术表达式等。例如：
{
     // Comment Supported : This is only a Test ^_^

    X = 12 + b / 13 - 5 * 4 * 9;    // A AssignmentStatement
    Y = 4 * 1024;

    if( X >= Y){
        if( i == i){            // This is nested if Statement
            X=Y;
        }
    }

    ;;;;                        // This is Null Statement

    while( X < Y){            // This is while Statement
        X = X +1;
    }
}

规则如下：

  <StatementBlock> ::= '{'<StatementSequence>'}'
  <StatementSequence> ::= {<NullStatement>|<CommonStatement>|<VariantStatement>}

  <NullStatement> ::= ';'
  <CommonStatement> ::= <AssignmentStatement>
  <VariantStatement> ::= <ConditionStatement>| <LoopWhileStatement>

  <AssignmentStatement> ::= ID=<Expression>
  <ConditionStatement> ::= if(<Condition> <StatementBlock>
  <LoopWhileStatement> ::= while(<Condition> <StatementBlock>

  <Condition> ::= <Expression><RelationOperator><Expression>
  <Expression> ::= <Item>{+<Item>|-<Item>}
  <Item> ::= <Factor>{*<Factor>|/<Factor>}
  <Factor> ::= ID|NUM|(<Expression>)
  <RelationOperator> ::= <|<=|>=|>|==|!=

*/

#include "stdafx.h"
#include <ctype.h>
#include <conio.h>
#include <stdlib.h>
#include <string.h>

#include "Constant.h"

extern int gettoken();
extern int getcurrentpos();
extern char *string;
extern int  sym;
extern int  current_line;
extern int  start_pos_of_current_line;
extern char* getlinestring(int line,char *in_buf);
extern char input[];

//非终结符的英文定义

void StatementBlock();        //语句块
void StatementSequence();    //语句串

// XxxxxStatement() 为三类语句
void NullStatement();        //空语句--仅仅含有一个;号
void CommonStatement();        //语句
void VariantStatement();    //变种语句--包括 if(){},while{},他们都不以;结尾

// 下面的属于CommonStatement
void AssignmentStatement();    //赋值语句

// 下面两种属于VariantStatement
void ConditionStatement();    //条件语句
void LoopWhileStatement();    //while循环语句

void Condition();            //条件
void Expression();            //表达式
void Item();                //项
void Factor();                //因子
void RelationOperator();    //关系运算符

/**//*
注：以上未考虑函数调用表达式
*/

void match_error(char * c)
...{
    char error_buf[1024];
    int in_line_pos = getcurrentpos()-start_pos_of_current_line;

    printf("Grammar Error! ");
    printf("Line %d[%d] :  %s expected ",current_line,in_line_pos,c);

    //获取错误行并打印出来
    getlinestring(current_line,error_buf);
    printf("%s ",error_buf);
    //输出错误指示点(Not Exact!)
    for(int i=1; i< in_line_pos; i++)
        printf("%c",'^');
    printf(" ");

    exit(0);
}

// expecetedSym    期望符号
// msg    出错时给出的消息
void match(int expecetedSym, char * msg)
...{
    if(expecetedSym != sym)...{
#if 0
        if(sym < 0x20 )
            printf(" Fail: ExpecetedSym=%d sym=%d ",expecetedSym,sym);
        else
            printf(" Fail: ExpecetedSym=%d sym=%c ",expecetedSym,sym);
#endif
        match_error(msg);
    }
    gettoken();            //预读一个符号

}
void grammar_check()
...{

//    printf("%s",input);
    gettoken();            //开始检查，填充预读区
//    match('-',"DK");
    StatementBlock();
    if(sym != END) match(END,"End Of File");
}

void StatementBlock()        //语句块
...{
    match('{',"{");        //和预读符号比较
    StatementSequence();
    match('}',"}");
}

void StatementSequence()    //语句串
...{

    while(sym == ID ||
          sym == IF ||
          sym == WHILE||
          sym == ';')
    ...{
        while(sym == ID)    //也可以用if(),但从统计角度看,while效率会略高。
                            //因为一般普通CommonStatement()出现概率较大
        ...{
            CommonStatement();
            match(';',";");
        }
        while(sym == IF ||
              sym == WHILE)
        ...{
            VariantStatement();
        }
        while(sym == ';')
        ...{
            NullStatement();
        }
    }
}

void VariantStatement()    //变种语句--包括 if(){},while{},他们都不以;结尾
...{
    switch(sym)...{    //若sym与下面两种均不匹配，什么也不做，空语句是也~
    case IF:
        ConditionStatement();    //条件语句
        break;
    case WHILE:
        LoopWhileStatement();
        break;
    }
    return;
}

void NullStatement()    //空语句--仅仅含有一个;号
...{
    match(';',";");
}

void CommonStatement()            //语句,以;结尾,但不以;开头
...{
    switch(sym)...{    //若sym与下面任何一种均不匹配，什么也不做，空语句是也~
    case ID:
        AssignmentStatement();
        break;
    }
    return;
}

void AssignmentStatement()    //赋值语句
...{
        match(ID,"ID");
        match('=',"=");
        Expression();
}

void ConditionStatement()    //条件语句
...{
    match(IF,"if");
    match('(',"(");
    Condition();
    match(')',")");
    StatementBlock();
}

void LoopWhileStatement()        //循环语句
...{
    match(WHILE,"while");
    match('(',"(");
    Condition();
    match(')',")");
    StatementBlock();
}

void Condition()            //条件
...{
    Expression();
    RelationOperator();
    Expression();
}

// <Expression> ::= <Item>{+<Item>|-<Item>}
void Expression()            //表达式
...{
    Item();
    while(true)...{    // {+<Item>|-<Item>}  可以有多个
        switch(sym)...{
        case '+':
            match('+',"+");
            Item();
            break;
        case '-':
            match('-',"-");
            Item();
            break;
        default:
            return;
        }
    }
    return;

}
// <Item> ::= <Factor>{*<Factor>|/<Factor>}
void Item()                //项
...{
    Factor();
    while(true)...{    // {*<Factor>|/<Factor>} -- 可以有多个
        switch(sym)...{
        case '*':
            match('*',"*");
            Factor();
            break;
        case '/':
            match('/',"/");
            Factor();
            break;
        default:
            return;
        }
    }
    return;
}
// <Factor> ::= ID|NUM|(<Expression>)
void Factor()                //因子
...{
    switch(sym)...{
    case ID:
        match(ID,"ID");
        break;
    case NUM:
        match(NUM,"NUM");
        break;
    case '(':
        match('(',"(");
        Expression();
        match(')',")");
        break;
    default:
        match(ID,"A factor ");    // ID在这里肯定不match，利用它来报错（找不到因子）
        break;
    }
}
// <RelationOperator> ::= <|<=|>=|>|==|!=
void RelationOperator()    //关系运算符
...{
    switch(sym)...{
    case '<':
        match('<',"<");
        break;
    case '>':
        match('>',">");
        break;
    case LESSEQUAL:
        match(LESSEQUAL,"<=");
        break;
    case GREATEQUAL:
        match(GREATEQUAL,">=");
        break;
    case EQUAL:
        match(EQUAL,"==");
        break;
    case NOTEQUAL:
        match(NOTEQUAL,"!=");
        break;
    }
}

核心：规则----

<StatementBlock> ::= '{'<StatementSequence>'}'
<StatementSequence> ::= {<NullStatement>|<CommonStatement>|<VariantStatement>}

<AssignmentStatement> ::= ID=<Expression>
<ConditionStatement> ::= if(<Condition> <StatementBlock>
<LoopWhileStatement> ::= while(<Condition> <StatementBlock>

时间： 2024-08-31 21:08:34

一个简单的C语言词法分析与语法分析器

一个简单的C语言词法分析与语法分析器的相关文章

一个简单的c语言填空初学者求助

分析C语言一个简单程序_C 语言

c语言实现一个简单日历_C 语言

可配置语法分析器开发纪事（一）构造语法树

《ANTLR 4权威指南》一3.2　测试生成的语法分析器

《ANTLR 4权威指南》——3.2节测试生成的语法分析器

一个简单的语言的语法（二）：ANTLR的重写规则

《Python语言程序设计》——2.2　编写一个简单的程序

中文字符-如何用C语言编写一个简单的输入法程序，要求可以输入汉字。