view CbC-examples/regexp/dfareg.py @ 62:70947ddafad7

added test code, CbC-example/regexp
author ryoma <e075725@ie.u-ryukyu.ac.jp>
date Sun, 25 Apr 2010 17:46:01 +0900
parents
children
line wrap: on
line source

#!/usr/bin/env python

import copy
import sys

class NondeterministicFiniteAutomaton(object):
    def __init__(self, transition, start, accepts, map_):
        self.transition = transition
        self.start = start
        self.accepts = accepts
        self.map = map_

    def epsilonExpand(self, set_):
        que = set(set_)
        done = set()
        while que:
            stat = que.pop()
            nexts = self.transition(stat, "")
            done.add(stat)
            for nextStat in nexts:
                if not nextStat in done: que.add(nextStat)
        return frozenset(done)

class DeterministicFiniteAutomaton(object):
    def __init__(self, transition, start, accepts, map_):
        self.transition = transition
        self.start = start
        self.accepts = accepts
        self.map = map_
    def getRuntime(self):
        return DFARuntime(self)

class DFARuntime(object):
    def __init__(self, DFA):
        self.DFA = DFA
        self.curState = self.DFA.start

    def doTransition(self, char):
        self.curState = self.DFA.transition(
            self.curState, char)

    def isAcceptState(self):
        return self.curState in self.DFA.accepts

    def doesAccept(self, input):
        for alphabet in input:
            self.doTransition(alphabet)
        return self.isAcceptState()

class Token(object):
    CHARACTER  = 0
    OPE_UNION  = 1
    OPE_STAR   = 2
    LPAREN     = 3
    RPAREN     = 4
    EOF        = 5

    def __init__(self, value, kind):
        self.value = value
        self.kind  = kind


class Lexer(object):
    def __init__(self, str):
        self.stringList = list(str)

    def scan(self):
        if not self.stringList:
            return Token(None, Token.EOF)

        ch = self.stringList.pop(0)
        if   ch == '\\':
            return Token(self.stringList.pop(0), Token.CHARACTER)
        elif ch == '|':
            return Token(ch, Token.OPE_UNION)
        elif ch == '(':
            return Token(ch, Token.LPAREN)
        elif ch == ')':
            return Token(ch, Token.RPAREN)
        elif ch == '*':
            return Token(ch, Token.OPE_STAR)
        else:
            return Token(ch, Token.CHARACTER)

"""
 Context-free Grammer:
 (A) expression -> subexpr EOF
 (B) subexpr -> seq '|' subexpr | seq
 (C) seq -> subseq | ''
 (D) subseq -> star subseq | star
 (E) star -> factor '*' | factor
 (F) factor -> '(' subexpr ')' | CHARACTER
"""

class Parser(object):
    def __init__(self, lexer):
        self.lexer = lexer
        self.look  = None
        self.move()

    def match(self, tag):
        if self.look.kind != tag:
            raise Exeption("syntax error")
        self.move()

    def move(self):
        self.look = self.lexer.scan()

    def factor(self):
        if self.look.kind == Token.LPAREN:
            # factor -> '(' subexpr ')'
            self.match(Token.LPAREN)
            node = self.subexpr()
            self.match(Token.RPAREN)
            return node
        else:
            # factor -> CHARACTER
            node = Character(self.look.value)
            self.match(Token.CHARACTER)
            return node

    def star(self):
        # star -> factor '*' | factor
        node = self.factor()
        if self.look.kind == Token.OPE_STAR:
            self.match(Token.OPE_STAR)
            node = Star(node)
        return node

    def seq(self):
        if self.look.kind == Token.LPAREN \
               or self.look.kind == Token.CHARACTER:
            # seq -> subseq
            return self.subseq()
        else:
            # seq -> ''
            return Character("")

    def subseq(self):
        node1 = self.star()
        if self.look.kind == Token.LPAREN \
               or self.look.kind == Token.CHARACTER:
            # subseq -> star subseq
            node2 = self.subseq()
            node  = Concat(node1, node2)
            return node
        else:
            # subseq -> star
            return node1

    def subexpr(self):
        node1 = self.seq()
        if self.look.kind == Token.OPE_UNION:
            self.match(Token.OPE_UNION)
            # subexpr    -> seq '|' subexpr
            node2 = self.subexpr()
            node  = Union(node1, node2)
            return node
        else:
            # subexpr    -> seq
            return node1

    def expression(self):
        # expression -> subexpr EOF
        node     = self.subexpr()
        self.match(Token.EOF)

        # Create TREE, NFA
        context  = Context()
        fragment = node.assemble(context)
        return fragment.build()

class Context(object):
    def __init__(self):
        self.stateCount = 0

    def newState(self):
        self.stateCount += 1
        return self.stateCount

class NFAFragment(object):
    def __init__(self):
        self.start   = None    # integer
        self.accepts = None    # frozenset
        self.map     = dict()  # transition

    def connect(self, from_, char, to):
        slot = self.map.setdefault((from_, char), set())
        slot.add(to)

    def newSkelton(self):
        # copy fragment (self), and it return
        newFrag = NFAFragment();
        newFrag.map = copy.deepcopy(self.map)
        return newFrag

    def __or__(self, frag):
        newFrag = self.newSkelton()
        for k, v in frag.map.iteritems():
            newFrag.map[k] = v.copy()
        return newFrag

    def build(self):
        map_ = self.map
        def transition(state, char):
            return frozenset(map_.get((state, char), []))

        return NondeterministicFiniteAutomaton(
            transition,
            self.start,
            self.accepts,
            self.map
            )

class Character(object):
    def __init__(self, char):
        self.char = char

    def assemble(self, context):
        frag = NFAFragment()
        s1 = context.newState()
        s2 = context.newState()
        frag.connect(s1, self.char, s2)
        frag.start = s1
        frag.accepts = frozenset([s2])
        return frag

class Union(object):
    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def assemble(self, context):
        frag1 = self.op1.assemble(context)
        frag2 = self.op2.assemble(context)
        frag = frag1 | frag2
        s = context.newState()
        frag.connect(s, "", frag1.start)
        frag.connect(s, "", frag2.start)
        frag.start = s
        frag.accepts = frag1.accepts | frag2.accepts
        return frag

class Concat(object):
    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def assemble(self, context):
        frag1 = self.op1.assemble(context)
        frag2 = self.op2.assemble(context)
        frag = frag1 | frag2

        for state in frag1.accepts:
            frag.connect(state, "", frag2.start)

        frag.start   = frag1.start
        frag.accepts = frag2.accepts
        return frag

class Star(object):
    def __init__(self, op):
        self.op = op

    def assemble(self, context):
        fragOrig = self.op.assemble(context)
        frag = fragOrig.newSkelton()

        for state in fragOrig.accepts:
            frag.connect(state, "", fragOrig.start)

        s = context.newState()
        frag.connect(s, "", fragOrig.start)
        frag.start = s
        frag.accepts = fragOrig.accepts | frozenset([s])
        return frag

class NonDisjoinSets(object):
    def __init__(self, sub):
        self.sub = sub
    def __contains__(self, a_set):
        return a_set & self.sub

def nfa2dfa(nfa):
    def transition(set_, alpha):
        ret = set()
        for elem in set_:
            ret |= nfa.transition(elem, alpha)
        return nfa.epsilonExpand(frozenset(ret))

    def stateSetTransition(stateSet, char):
        trans = set()
        for state in stateSet:
            t = nfa.map.setdefault((state, char), None)
            if not t == None: trans.update(nfa.epsilonExpand(t))
        return frozenset(trans)

    map_ = dict()
    que =  set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
    done = set()

    while que:
        stateSet = que.pop()

        for state in stateSet:
            for k, v in nfa.map.iteritems():
                if state == k[0] and k[1] != '':
                    slot = map_.setdefault((stateSet, k[1]), set())
                    slot.update(nfa.epsilonExpand(v))

        done.add(stateSet)

        for v in map_.itervalues():
            if not v in done:
                que.add(frozenset(v))

    return DeterministicFiniteAutomaton(
        transition,
        nfa.epsilonExpand(frozenset([nfa.start])),
        NonDisjoinSets(nfa.accepts),
        map_
        )

# emit CbC-souce, from dfa
def dfa2CbC(dfa, regexp):

    # return state name from set
    def set2name(set_):
        l = list(set_)
        l.sort()
        return '_'.join(str(x) for x in l)

    funcMap = dict()
    funcMap[dfa.start] = set()

    # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
    #             : frozenset(1, 15) x 'd' -> frozenset([16])

    for v in dfa.map.itervalues():
        funcMap[frozenset(v)] = set()

    for key in dfa.map.iterkeys():
        slot = funcMap.setdefault(key[0], set())
        slot.add(key[1])

    # emit cbc code
    print "#include <stdio.h>\n"
    print "#include <stdlib.h>\n"
    for k in funcMap.iterkeys():
        print '__code state_' + set2name(k) + "(char* s);"
    print '__code accept();'
    print '__code reject();'
    print """
int main(int argc, char* argv[]) {
\tif (argc == 1) {
\t\tprintf(\"usage: %%s text\\n\", argv[0]);
\t\texit(0);
}

\tputs(\"regexp: %s\");
\tputs(\"number of state: %d\");
\tprintf(\"string: %%s\\n\", argv[1]);
\tgoto state_%s(argv[1]);
\treturn 0;
}\n""" % (regexp, len(funcMap), set2name(dfa.start))

    for k, v in funcMap.iteritems():
        print '__code state_' + set2name(k) + "(char* s) {"
        print "\tswitch(*s++) {"
        for case in v:
            print "\t\tcase '%c': " % (case)
            print "\t\t\tgoto state_%s(s);\n\t\t\tbreak;" % set2name(dfa.map[(frozenset(k), case)])
        if k in dfa.accepts: print "\t\tcase '\\0': goto accept();\n\t\t\tbreak;"
        print "\t\tdefault: goto reject();\n\t}"
        print "}\n"

    print """
__code accept() {
\tprintf(\"\\nstring matches regexp. \\n\\n\");
}\n"""
    print """
__code reject() {
\tprintf(\"\\nstring does not match regexp. \\n\\n\");
}\n"""

class Regexp(object):
    def __init__(self, regexp):
        self.regexp = regexp
        self.nfa    = None
        self.dfa    = None
        self._compile()

    def _compile(self):
        lexer_  = Lexer(self.regexp)
        parser_ = Parser(lexer_)
        self.nfa = parser_.expression()
        self.dfa = nfa2dfa(self.nfa)

    def emitCbC(self):
        dfa2CbC(self.dfa, self.regexp)

    def matches(self, string):
        runtime = self.dfa.getRuntime()
        return runtime.doesAccept(string)

def compile(regexp):
    return Regexp(regexp)

def main(argv):
    if len(argv) == 1 : exit("usage: dfareg.py regexp [> file]")
    r = compile(argv[1])
    r.emitCbC()

if __name__ == '__main__' : main(sys.argv)