编译原理之正则表达式转NFA

本文转载自http://chriszz.sinaapp.com/?p=257

输入一个正则表达式，输出一个NFA。

我的做法：输入一个字符串表示正则，输出则是把输出到一个.dot文件中并将dot文件编译成pdf，fedora需要sudo yum install dot，然后evince XXX.pdf就可以查看生成的NFA了。

具体算法是按照龙书上的Tompson算法来的。

废话不多说，放码过来：

/*
Author：ChrisZZ(zchrissirhcz@gmail.com)
Time:2013-12-25 14:13:09
输入：正则表达式
输出：自动机
算法步骤：
1.把正则表达式转化为后缀表达式
2.把后缀表达式转化为NFA
3.用dot语言把NFA输出到PDF
参考：
1.Regular Expression Matching Can Be Simple And Fast
http://swtch.com/~rsc/regexp/regexp1.html
2.龙书 chap3.7.4 从正则表达式构造NFA
3.YCC学长的project中dot语言的使用
其他说明：
1.需要安装dot，并添加到系统path中
2.在windows下运行时，控制台因为编码不支持可能导致中文提示无法显示
*/
#include <iostream>
#include <string>
#include <stdio.h>
#include <stack>
#include <string.h>
#include <stdexcept>
#include <stdlib.h>

using namespace std;

const int Match = 256;
const int Split = 257;//表示epsilon分支

struct Paren{//括号结构体
    int natom;
    int nalt;
};

string re2post(string re){
    Paren paren;//括号
    stack<struct Paren>parenStk;
    string postExpr="";
    int i, len=re.length();
    int nalt=0, natom=0;
    const string invalidRegExp = "非法的正则表达式";
    for(i=0; i<len; i++){
        if(isspace(re[i])) continue;
        if(isalpha(re[i])){
            if(natom>1){
                natom--;
                postExpr = postExpr + '.';
            }
            natom++;
            postExpr = postExpr + re[i];
        }
        else if(re[i]=='('){
            if(natom>1){
                postExpr = postExpr + '.';
            }
            paren.natom = natom;
            paren.nalt = nalt;
            parenStk.push(paren);
            nalt = 0;
            natom = 0;
        }
        else if(re[i]==')'){
            if(natom==0 || parenStk.empty())
                throw runtime_error(invalidRegExp+":括号不匹配");
            while(--natom>0){//比如((a|b)(c|d))模式，当上一次匹配完倒数第二个右括号后，natom为2，需要添加'.'
                postExpr = postExpr + '.';
            }
            while(nalt-->0){
                postExpr = postExpr + '|';
            }
            paren=parenStk.top();
            parenStk.pop();
            natom = paren.natom;
            nalt = paren.nalt;
            natom++;
        }
        else if(re[i]=='*'){
            if(natom==0)
                throw runtime_error(invalidRegExp+":提前出现'*'");
            postExpr = postExpr + re[i];
        }
        else if(re[i]=='|'){
            if(natom==0) throw runtime_error(invalidRegExp+":提前出现'|'");
            while(--natom>0){
                postExpr = postExpr + '.';
            }
            nalt++;
        }
        else
            throw runtime_error(invalidRegExp);
    }
    if(!parenStk.empty())
        throw runtime_error(invalidRegExp+":括号不匹配");
    while(--natom>0){
        postExpr = postExpr + '.';
    }
    while(nalt-->0){
        postExpr = postExpr + '|';
    }
    return postExpr;
}

class NFA;

/*
* c<256表示edge权重为c；
* c=256表示终结状态，匹配成功
* c=257表示分支（split）
*/
class State{
    friend class NFA;
    friend void nfa2graph(State* head, const string& re);
public:
    State(int c=256, State* out=NULL, State* out1=NULL){
        this->c = c;
        this->out = out;
        this->out1 = out1;
        this->id = 0;
    }
    void setId(int id){
        this->id = id;
    }

private:
    int c;
    int id;//状态的编号
    State* out;//从本状态出去的状态集合的头指针
    State* out1;//两个分支的情况
};

class NFA{
public:
    NFA(){
        head = NULL;
        tail = NULL;
    }
    NFA(const int& c){
        tail = new State(Match, NULL, NULL);
        head = new State(c, tail, NULL);
    }
    void doCat(NFA& nfa){
        tail->out = nfa.head;
        tail->c = Split;
        tail = nfa.tail;
        nfa.head = NULL;
        nfa.tail = NULL;
    }
    void doUnion(NFA& nfa){
        State* newHead = new State(Split, head, nfa.head);
        State* newTail = new State(Match, NULL, NULL);
        tail->c = Split;
        tail->out = newTail;
        nfa.tail->c = Split;
        nfa.tail->out = newTail;
        tail = newTail;
        head = newHead;
        nfa.head = NULL;
        nfa.tail = NULL;
    }
    void doStar(){
        State* newTail = new State(Match, NULL, NULL);
        State* newHead = new State(Split, head, newTail);
        tail->c = Split;
        tail->out = newTail;
        tail->out1 = head;
        tail = newTail;
        head = newHead;
    }

    void nfa2graph(const string& re){
        char myfile[100];
        printf("请输入一个文件名，用来保存生成的NFA-graph(不必提供后缀):\n");
        scanf("%s", myfile);
        printf("已将DOT文件存储在\"%s.dot\",\n", myfile);
        printf("PDF文件则存储在\"%s.dot.pdf\".\n", myfile);
        int i;
        while(myfile[i]!='\0')
            i++;
        myfile[i] = '.';
        myfile[i+1] = 'd';
        myfile[i+2] = 'o';
        myfile[i+3] = 't';
        myfile[i+4] = '\0';

        FILE *file = fopen(myfile, "w");

        fputs("digraph {\n", file);
        fputs("\t\"", file);
        int len=re.length();
        for(i=0; i<len; i++){
            fprintf(file, "%c", re[i]);
        }

        fputs("\" [shape = plaintext]\n", file);
        fputs("\trankdir = LR\n", file);
        fputs("\t\"\" [shape = point]\n", file);
        fputs("\t\"\" -> 1 [label = Start]\n\n", file);

        int id = 1;

        char circle[2000];
        memset(circle, 0, sizeof(circle));
        State* p;
        stack<State*> staStk;

        head->setId(id++);
        staStk.push(head);

        while(!staStk.empty()){
            p = staStk.top();
            staStk.pop();
            char flag = 1;
            cout << "p->c=" << p->c << endl;
            if(p->c < Match){
                cout << "p->out->id=" << p->out->id << endl;
                if(p->out->id==0){
                    p->out->id = id++;
                    cout << "id=" << id << endl;                }
                else
                    flag = 0;
                fprintf(file, "\t%d -> %d [label = \"%c\"]\n", p->id, (p->out)->id, p->c);
                State *what = p->out;
                if(flag) //push(*what);
                    staStk.push(what);
            } else if(p->c == Match){
                circle[p->id] = 1;
            } else{     //对应Split的情形
                if(p->out->id==0)
                    p->out->id = id++;
                else
                    flag = 0;
                fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out->id);
                State *what = p->out;
                if(flag) staStk.push(what);

                if(p->out1!=NULL){
                    flag = 1;

                    if(p->out1->id==0)
                        p->out1->id = id++;
                    else
                        flag = 0;
                    fprintf(file, "\t%d -> %d [label = <ε>]\n", p->id, p->out1->id);
                    what = p->out1;
                    if(flag) staStk.push(what);
                }
            }
        }

        for(i=1; i<id; i++){
            fprintf(file, "\t%d [shape = circle", i);
            if(circle[i])
                fputs(", peripheries = 2", file);
            fprintf(file, "]\n");
        }

        fputs("}", file);
        fclose(file);

        char cmd[108];
        sprintf(cmd, "dot %s -O -Tpdf", myfile);
        if(system(cmd)==0){
            printf("成功生成pdf图像！\n");
            //printf("Linux用户可以使用evince file.pdf &命令打开~\n");
        }
        else
            printf("悲剧！生成pdf图像时出现错误..\n");
    }
private:
    State* head;
    State* tail;
};

NFA post2nfa(const string& postExpr){
    stack<NFA> nfaStk;
    NFA e1, e2, e;
    int i, len=postExpr.length();
    for(i=0; i<len; i++){
        switch(postExpr[i]){
        case '.':
            e2 = nfaStk.top();
            nfaStk.pop();
            e1 = nfaStk.top();
            nfaStk.pop();
            e1.doCat(e2);
            nfaStk.push(e1);
            break;
        case '|':
            e2 = nfaStk.top();
            nfaStk.pop();
            e1 = nfaStk.top();
            nfaStk.pop();
            e1.doUnion(e2);
            nfaStk.push(e1);
            break;
        case '*':
            e = nfaStk.top();
            nfaStk.pop();
            e.doStar();
            nfaStk.push(e);
            break;
        default://
            NFA alpha(postExpr[i]);
            nfaStk.push(alpha);
        }
    }
    e = nfaStk.top();
    nfaStk.pop();
    if(!nfaStk.empty())
        throw runtime_error("未知错误");
    return e;
}

int main(){
    string re;
    while(true){
        cout << "请输入一个正则表达式:\n";
        cin >> re;
        string postExpr = re2post(re);
        cout << "postExpr is : " << postExpr << endl;
        NFA nfa = post2nfa(postExpr);
        nfa.nfa2graph(re);
        cout << "继续吗?(y/n)\n" << endl;
        char c;
        cin >> c;
        while(c!='y' && c!='n'){
            cout << "请输入'y'或'n'.\n";
            c=getchar();
        }
        if(c=='n')
            break;
    }
    cout << "Bye~\n";
    return 0;
}

时间： 2024-09-18 12:57:05

编译原理之正则表达式转NFA的相关文章

编译原理

编译原理语法是指这样的一组规则,用它可以形成和产生一个合适的程序. 词法规则是指单词符号的形成规则. 语法规则是语法单位的形成规则,规定了如何从单词符号形成更大的结构(即语法单位或语法范畴). 一般程序语言的语法单位有:表达式.语句.分程序.函数.过程和程序等. 程序语言的基本功能是描述数据和对数据的运算.所谓程序,从本质上来说是描述一定数据的处理过程. 强制式语言也称过程式语言.其特点是命令驱动,面向语句.一个强制式语言程序由一系列的语句组成,每个语句的执行引起若干存储单元中的值的改变.

《编译原理实践与指导教程》——1.2　实验指导

1.2 实验指导词法分析和语法分析这两块,可以说是在整个编译器当中被自动化得最好的部分.也就是说即使没有任何的理论基础,在掌握了工具的用法之后,也可以在短时间内做出功能很全很棒的词法分析程序和语法分析程序.当然这并不意味着,词法分析和语法分析部分的理论基础并不重要.恰恰相反,这一部分被认为是计算机理论在工程实践中最成功的应用之一,对它的介绍也是编译理论课中的重点.但本节指导内容的重点不在于理论而在于工具的使用. 本节指导内容将分别介绍词法分析工具GNU Flex和语法分析工具GNU Bison

基于LLVM的编译原理简明教程 (1) - 写编译器越来越容易了

基于LLVM的编译原理简明教程 (1) - 写编译器越来越容易了进入21世纪,新的编程语言如雨后春笋一样不停地冒出来.需求当然是重要的驱动力量,但是在其中起了重要作用的就是工具链的改善. 2000年,UIUC的Chris Lattner主持开发了一套称为LLVM(Low Level Virtual Machine)的编译器工具库套件.后来,LLVM的scope越来越大,Low Level Virtual Machine已经不足以表示LLVM的全部,于是,LLVM就变成了正式的名字.LLVM可以

编译原理课设求大神帮忙

问题描述编译原理课设求大神帮忙使用算符优先算法,处理正规式中必须有括号,生成NFA使用邻接链表存储解决方案 http://www.docin.com/p-403177271.html 解决方案二: 哇,你这个题高大上啊,首先得知道什么是算法优先法,然后知道NFA怎么转换,最后知道邻接链表怎么表示,正规式里里面必须有括号?这题目自己拟出来做? 解决方案三: 这类题目专业性太强.google一下吧,自己再根据你学的动手,问问就有了,至少有一个思考的过程.明白原理解决方案四: 设<表达式>为

浅谈Sizzle的“编译原理”_其它

Sizzle,是jQuery作者John Resig写的DOM选择器引擎,速度号称业界第一.作为一个独立全新的选择器引擎,出现在jQuery 1.3版本之后,并被John Resig作为一个开源的项目.Sizzle是独立的一部分,不依赖任何库,如果你不想用jQuery,可以只用Sizzle,也可以用于其他框架如:Mool, Dojo,YUI等. 前几天在准备一个关于jQuery的分享PPT,问同事关于jQuery除了使用方法之外还有没有其他特别想了解一下的,有人提到了想了解下它的选择器是怎么实现

操作系统与编译原理着2门课

问题描述如果必须选一门而且只能选一门你会选择那门课?以及原因????时间原因我可能先学一个相对重要的,有时间在补另一门解决方案解决方案二:编译原理解决方案三:该回复于2010-12-27 14:26:55被版主删除解决方案四:单就和编程的联系来看,编译原理是和编程的联系要比操作系统和编程的联系紧密些.建议选编译原理吧解决方案五:选编译原理,操作系统主要是一些硬件知识,线程进程知识等,对写程序没有帮助解决方案六:编译原理秒杀操作系统编译原理方面的基本理论还是相当重要的自动机与正则表达式有密不

编译原理scanner的java代码

问题描述编译原理scanner的java代码 package lexer; public class Token { public final int tag;public Token(int t) { tag = t;}public String toString() { return """" + (char) tag;} } package lexer; public class Tag { public final static int AND = 256

求各位大神帮忙做一下编译原理程序设计

问题描述求各位大神帮忙做一下编译原理程序设计 1．设计词法分析器设计各单词的状态转换图,并为不同的单词设计种别码.将词法分析器设计成供语法分析器调用的子程序.功能包括:具备预处理功能.将不翻译的注释等符号先滤掉,只保留要翻译的符号串,即要求设计一个供词法分析调用的预处理子程序:能够拼出语言中的各个单词:http://ask.csdn.net/#将拼出的标识符填入符号表:返回(种别码, 属性值).2．目标代码生成器c. 能完成指定寄存器个数的情况下将一中间代码程序段翻译成汇编语言目标代码(汇

编译原理之文法

关于编译原理这块之前根本没有涉及过,这次要用到这里的知识就需要来接触一下这里的内容.编译原理顾名思义就是处理高级语言,使之称为计算机能够识别的语言(低级语言)的原理.而文法呢?就是用来描述程序设计语言的方法.类似佛法,用来描述佛家的诵经禅道的规则的.不用去纠结这个名字,知道这个含义,足以. 文法概念终结符和非终结符如图:在p这个推导式的集合中,存在六个推导式.其中S.A.B为非终结符.a.b.c.d.q.p为终结符.终结符是原子不可分的. 分类文法的分类也就这几种了,先看各自的定义,在定