view src/dfareg.py @ 4:7e47839a54ad

add Regexp.emitDot(), Dot file can be converted tex->pdf
author ryoma <e075725@ie.u-ryukyu.ac.jp>
date Tue, 29 Jun 2010 12:46:29 +0900
parents a193b4ff3909
children 11fba907c0af
line wrap: on
line source

#!/usr/bin/env python

import copy
import sys, re
from optparse import OptionParser

class NondeterministicFiniteAutomaton(object):
    def __init__(self, transition, start, accepts, map_):
        self.transition = transition
        self.start = start
        self.accepts = accepts
        self.map = map_

    def epsilonExpand(self, set_):
        que = set(set_)
        done = set()
        while que:
            stat = que.pop()
            nexts = self.transition(stat, "")
            done.add(stat)
            for nextStat in nexts:
                if not nextStat in done: que.add(nextStat)
        return frozenset(done)

class DeterministicFiniteAutomaton(object):
    def __init__(self, transition, start, accepts, map_):
        self.transition = transition
        self.start = start
        self.accepts = accepts
        self.map = map_
    def getRuntime(self):
        return DFARuntime(self)

class DFARuntime(object):
    def __init__(self, DFA):
        self.DFA = DFA
        self.curState = self.DFA.start

    def doTransition(self, char):
        self.curState = self.DFA.transition(
            self.curState, char)

    def isAcceptState(self):
        return self.curState in self.DFA.accepts

    def doesAccept(self, input):
        for alphabet in input:
            self.doTransition(alphabet)
        return self.isAcceptState()

class Token(object):
    CHARACTER  = 0
    OPE_UNION  = 1
    OPE_STAR   = 2
    LPAREN     = 3
    RPAREN     = 4
    EOF        = 5

    def __init__(self, value, kind):
        self.value = value
        self.kind  = kind


class Lexer(object):
    def __init__(self, str):
        self.stringList = list(str)

    def scan(self):
        if not self.stringList:
            return Token(None, Token.EOF)

        ch = self.stringList.pop(0)
        if   ch == '\\':
            return Token(self.stringList.pop(0), Token.CHARACTER)
        elif ch == '|':
            return Token(ch, Token.OPE_UNION)
        elif ch == '(':
            return Token(ch, Token.LPAREN)
        elif ch == ')':
            return Token(ch, Token.RPAREN)
        elif ch == '*':
            return Token(ch, Token.OPE_STAR)
        else:
            return Token(ch, Token.CHARACTER)

"""
 Context-free Grammer:
 (A) expression -> subexpr EOF
 (B) subexpr -> seq '|' subexpr | seq
 (C) seq -> subseq | ''
 (D) subseq -> star subseq | star
 (E) star -> factor '*' | factor
 (F) factor -> '(' subexpr ')' | CHARACTER
"""

class Parser(object):
    def __init__(self, lexer):
        self.lexer = lexer
        self.look  = None
        self.move()

    def match(self, tag):
        if self.look.kind != tag:
            raise Exeption("syntax error")
        self.move()

    def move(self):
        self.look = self.lexer.scan()

    def factor(self):
        if self.look.kind == Token.LPAREN:
            # factor -> '(' subexpr ')'
            self.match(Token.LPAREN)
            node = self.subexpr()
            self.match(Token.RPAREN)
            return node
        else:
            # factor -> CHARACTER
            node = Character(self.look.value)
            self.match(Token.CHARACTER)
            return node

    def star(self):
        # star -> factor '*' | factor
        node = self.factor()
        if self.look.kind == Token.OPE_STAR:
            self.match(Token.OPE_STAR)
            node = Star(node)
        return node

    def seq(self):
        if self.look.kind == Token.LPAREN \
               or self.look.kind == Token.CHARACTER:
            # seq -> subseq
            return self.subseq()
        else:
            # seq -> ''
            return Character("")

    def subseq(self):
        node1 = self.star()
        if self.look.kind == Token.LPAREN \
               or self.look.kind == Token.CHARACTER:
            # subseq -> star subseq
            node2 = self.subseq()
            node  = Concat(node1, node2)
            return node
        else:
            # subseq -> star
            return node1

    def subexpr(self):
        node1 = self.seq()
        if self.look.kind == Token.OPE_UNION:
            self.match(Token.OPE_UNION)
            # subexpr    -> seq '|' subexpr
            node2 = self.subexpr()
            node  = Union(node1, node2)
            return node
        else:
            # subexpr    -> seq
            return node1

    def expression(self):
        # expression -> subexpr EOF
        node     = self.subexpr()
        self.match(Token.EOF)

        # Create TREE, NFA
        context  = Context()
        fragment = node.assemble(context)
        return fragment.build()

class Context(object):
    def __init__(self):
        self.stateCount = 0

    def newState(self):
        self.stateCount += 1
        return self.stateCount

class NFAFragment(object):
    def __init__(self):
        self.start   = None    # integer
        self.accepts = None    # frozenset
        self.map     = dict()  # transition

    def connect(self, from_, char, to):
        slot = self.map.setdefault((from_, char), set())
        slot.add(to)

    def newSkelton(self):
        # copy fragment (self), and it return
        newFrag = NFAFragment();
        newFrag.map = copy.deepcopy(self.map)
        return newFrag

    def __or__(self, frag):
        newFrag = self.newSkelton()
        for k, v in frag.map.iteritems():
            newFrag.map[k] = v.copy()
        return newFrag

    def build(self):
        map_ = self.map
        def transition(state, char):
            return frozenset(map_.get((state, char), []))

        return NondeterministicFiniteAutomaton(
            transition,
            self.start,
            self.accepts,
            self.map
            )

class Character(object):
    def __init__(self, char):
        self.char = char

    def assemble(self, context):
        frag = NFAFragment()
        s1 = context.newState()
        s2 = context.newState()
        frag.connect(s1, self.char, s2)
        frag.start = s1
        frag.accepts = frozenset([s2])
        return frag

class Union(object):
    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def assemble(self, context):
        frag1 = self.op1.assemble(context)
        frag2 = self.op2.assemble(context)
        frag = frag1 | frag2
        s = context.newState()
        frag.connect(s, "", frag1.start)
        frag.connect(s, "", frag2.start)
        frag.start = s
        frag.accepts = frag1.accepts | frag2.accepts
        return frag

class Concat(object):
    def __init__(self, op1, op2):
        self.op1 = op1
        self.op2 = op2

    def assemble(self, context):
        frag1 = self.op1.assemble(context)
        frag2 = self.op2.assemble(context)
        frag = frag1 | frag2

        for state in frag1.accepts:
            frag.connect(state, "", frag2.start)

        frag.start   = frag1.start
        frag.accepts = frag2.accepts
        return frag

class Star(object):
    def __init__(self, op):
        self.op = op

    def assemble(self, context):
        fragOrig = self.op.assemble(context)
        frag = fragOrig.newSkelton()

        for state in fragOrig.accepts:
            frag.connect(state, "", fragOrig.start)

        s = context.newState()
        frag.connect(s, "", fragOrig.start)
        frag.start = s
        frag.accepts = fragOrig.accepts | frozenset([s])
        return frag

class NonDisjoinSets(object):
    def __init__(self, sub):
        self.sub = sub
    def __contains__(self, a_set):
        return a_set & self.sub

def nfa2dfa(nfa):
    def transition(set_, alpha):
        ret = set()
        for elem in set_:
            ret |= nfa.transition(elem, alpha)
        return nfa.epsilonExpand(frozenset(ret))

    def stateSetTransition(stateSet, char):
        trans = set()
        for state in stateSet:
            t = nfa.map.setdefault((state, char), None)
            if not t == None: trans.update(nfa.epsilonExpand(t))
        return frozenset(trans)

    map_ = dict()
    que =  set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
    done = set()

    while que:
        stateSet = que.pop()

        for state in stateSet:
            for k, v in nfa.map.iteritems():
                if state == k[0] and k[1] != '':
                    slot = map_.setdefault((stateSet, k[1]), set())
                    slot.update(nfa.epsilonExpand(v))

        done.add(stateSet)

        for v in map_.itervalues():
            if not v in done:
                que.add(frozenset(v))

    return DeterministicFiniteAutomaton(
        transition,
        nfa.epsilonExpand(frozenset([nfa.start])),
        NonDisjoinSets(nfa.accepts),
        map_
        )

# create call graph (as dictionary)
class CallGraph(object):
    def __init__(self, dfa):
        self.dfa = dfa
        self.start = "state_"+self.set2name(dfa.start)
        (self.map, self.accepts) = self.getCallGraph(self.dfa)

    # return state name from set
    def set2name(self, set_):
        l = list(set_)
        l.sort()
        return '_'.join(str(x) for x in l)

    def getCallGraph(self, dfa):

        funcMap = dict()
        funcMap[dfa.start] = set()

        # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
        #             : frozenset(1, 15) x 'd' -> frozenset([16])

        for v in dfa.map.itervalues():
            funcMap[frozenset(v)] = set()

        for key in dfa.map.iterkeys():
            slot = funcMap.setdefault(key[0], set())
            slot.add(key[1])

        for v in dfa.map.itervalues():
            funcMap[frozenset(v)] = set()

        for key in dfa.map.iterkeys():
            slot = funcMap.setdefault(key[0], set())
            slot.add(key[1])

        callGraph = dict()
        accepts = list()

        # emit cbc code
        for k in funcMap.iterkeys():
            callGraph["state_"+self.set2name(k)] = set()

        for k, v in funcMap.iteritems():
            caseMap = dict()
            for case in v:
                caseMap[case] = "state_"+self.set2name(dfa.map[(frozenset(k), case)])
            if k in dfa.accepts:
                accepts.append("state_"+self.set2name(k))
                caseMap['\\0'] = "accept"
            callGraph["state_"+self.set2name(k)] = caseMap

        return (callGraph, accepts)

# emit CbC-souce, from CallGraph
def dfa2CbC(cg, regexp, emitCFlag):
    if emitCFlag:
        fun_type = 'void '
        call_t = ''
        break_s = '\n\t\t\tbreak;'
    else:
        fun_type = '__code '
        call_t = 'goto '
        break_s = ''

    # emit cbc(or c) code
    print "#include <stdio.h>\n"
    for k in cg.map.iterkeys():
        print fun_type + k + "(char* s);"
    print fun_type + 'accept(char* s);'
    print fun_type + 'reject(char* s);'
    print """
int main(int argc, char* argv[]) {
\tputs(\"regexp: %s\");
\tputs(\"number of state: %d\");
\tprintf(\"string: %%s\\n\", argv[1]);
\t%s%s(argv[1]);
\treturn 0;
}\n""" % (regexp, len(cg.map), call_t, cg.start)

    for k, v in cg.map.iteritems():
        print fun_type + k + "(char* s) {"
        print "\tswitch(*s++) {"
        for case, next_state in v.iteritems():
            print "\t\tcase '%s': " % (case)
            print "\t\t\t%s%s(s);%s" % (call_t, next_state, break_s)

        print "\t\tdefault: %sreject(s);\n\t}" % call_t
        print "}\n"

    print """
%saccept(char* s) {
\tprintf(\"\\nstring matches regexp. \\n\\n\");
}\n""" % fun_type
    print """
%sreject(char* s) {
\tprintf(\"\\nstring does not match regexp. \\n\\n\");
}\n""" % fun_type

def dfa2Dot(cg, regexp):
    print """
digraph G {
    d2ttikzedgelabels = true;
    d2tstyleonly = true;
    d2tdocpreamble = \"\usetikzlibrary{automata}\";
    d2tfigpreamble = \"\tikzstyle{every state}=\
    [draw=blue!50, shape=circle, very thick,fill=blue!20]\";
    edge [lblstyle=\"fill=blue!20\", style=\"arrows=->\", topath=\"bend left\"];
    node [shape=\"circle\", style=\"state\"];\n
"""
    def escape(string):
        return re.sub("_", "", string)

    print "    %s [style=\"state, initial\"]" % (escape(cg.start))
    for accept in cg.accepts:
        print "    %s [style=\"state, accepting\"]" % (escape(accept))

    for cur_state, trans in cg.map.iteritems():
        for case, next_state in trans.iteritems():
            if next_state != "accept":
                print "    %s -> %s [texlbl=\"%s\"];\n" % (escape(cur_state), escape(next_state), case)
    print "}"


class Regexp(object):
    def __init__(self, regexp):
        self.regexp = regexp
        self.nfa    = None
        self.dfa    = None
        self._compile()

    def _compile(self):
        lexer_  = Lexer(self.regexp)
        parser_ = Parser(lexer_)
        self.nfa = parser_.expression()
        self.dfa = nfa2dfa(self.nfa)

    def matches(self, string):
        runtime = self.dfa.getRuntime()
        return runtime.doesAccept(string)

    def emitCbC(self, emitCFlag=False):
        cg = CallGraph(self.dfa)
        dfa2CbC(cg, self.regexp, emitCFlag)

    def emitDot(self):
        cg = CallGraph(self.dfa)
        dfa2Dot(cg, self.regexp)

def compile(regexp):
    return Regexp(regexp)

def main(argv):
    myusage = "%prog [-C] regexp"
    psr = OptionParser(usage=myusage)
    psr.add_option("-C", action="store_true", dest="emitCFlag", default=False, help="emit C-source")
    psr.add_option("-D", action="store_true", dest="emitDot", default=False, help="emit Dot file")
    (opts, args) = psr.parse_args(sys.argv)
    if len(args) == 1:
        psr.print_help()
        exit()
    r = compile(args[1])
    if opts.emitDot:
        r.emitDot()
    else:
        r.emitCbC(opts.emitCFlag)

if __name__ == '__main__' : main(sys.argv)