Mercurial > hg > Members > shinya > pyrect
view src/dfareg.py @ 4:7e47839a54ad
add Regexp.emitDot(), Dot file can be converted tex->pdf
author | ryoma <e075725@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 29 Jun 2010 12:46:29 +0900 |
parents | a193b4ff3909 |
children | 11fba907c0af |
line wrap: on
line source
#!/usr/bin/env python import copy import sys, re from optparse import OptionParser class NondeterministicFiniteAutomaton(object): def __init__(self, transition, start, accepts, map_): self.transition = transition self.start = start self.accepts = accepts self.map = map_ def epsilonExpand(self, set_): que = set(set_) done = set() while que: stat = que.pop() nexts = self.transition(stat, "") done.add(stat) for nextStat in nexts: if not nextStat in done: que.add(nextStat) return frozenset(done) class DeterministicFiniteAutomaton(object): def __init__(self, transition, start, accepts, map_): self.transition = transition self.start = start self.accepts = accepts self.map = map_ def getRuntime(self): return DFARuntime(self) class DFARuntime(object): def __init__(self, DFA): self.DFA = DFA self.curState = self.DFA.start def doTransition(self, char): self.curState = self.DFA.transition( self.curState, char) def isAcceptState(self): return self.curState in self.DFA.accepts def doesAccept(self, input): for alphabet in input: self.doTransition(alphabet) return self.isAcceptState() class Token(object): CHARACTER = 0 OPE_UNION = 1 OPE_STAR = 2 LPAREN = 3 RPAREN = 4 EOF = 5 def __init__(self, value, kind): self.value = value self.kind = kind class Lexer(object): def __init__(self, str): self.stringList = list(str) def scan(self): if not self.stringList: return Token(None, Token.EOF) ch = self.stringList.pop(0) if ch == '\\': return Token(self.stringList.pop(0), Token.CHARACTER) elif ch == '|': return Token(ch, Token.OPE_UNION) elif ch == '(': return Token(ch, Token.LPAREN) elif ch == ')': return Token(ch, Token.RPAREN) elif ch == '*': return Token(ch, Token.OPE_STAR) else: return Token(ch, Token.CHARACTER) """ Context-free Grammer: (A) expression -> subexpr EOF (B) subexpr -> seq '|' subexpr | seq (C) seq -> subseq | '' (D) subseq -> star subseq | star (E) star -> factor '*' | factor (F) factor -> '(' subexpr ')' | CHARACTER """ class Parser(object): def __init__(self, lexer): self.lexer = lexer self.look = None self.move() def match(self, tag): if self.look.kind != tag: raise Exeption("syntax error") self.move() def move(self): self.look = self.lexer.scan() def factor(self): if self.look.kind == Token.LPAREN: # factor -> '(' subexpr ')' self.match(Token.LPAREN) node = self.subexpr() self.match(Token.RPAREN) return node else: # factor -> CHARACTER node = Character(self.look.value) self.match(Token.CHARACTER) return node def star(self): # star -> factor '*' | factor node = self.factor() if self.look.kind == Token.OPE_STAR: self.match(Token.OPE_STAR) node = Star(node) return node def seq(self): if self.look.kind == Token.LPAREN \ or self.look.kind == Token.CHARACTER: # seq -> subseq return self.subseq() else: # seq -> '' return Character("") def subseq(self): node1 = self.star() if self.look.kind == Token.LPAREN \ or self.look.kind == Token.CHARACTER: # subseq -> star subseq node2 = self.subseq() node = Concat(node1, node2) return node else: # subseq -> star return node1 def subexpr(self): node1 = self.seq() if self.look.kind == Token.OPE_UNION: self.match(Token.OPE_UNION) # subexpr -> seq '|' subexpr node2 = self.subexpr() node = Union(node1, node2) return node else: # subexpr -> seq return node1 def expression(self): # expression -> subexpr EOF node = self.subexpr() self.match(Token.EOF) # Create TREE, NFA context = Context() fragment = node.assemble(context) return fragment.build() class Context(object): def __init__(self): self.stateCount = 0 def newState(self): self.stateCount += 1 return self.stateCount class NFAFragment(object): def __init__(self): self.start = None # integer self.accepts = None # frozenset self.map = dict() # transition def connect(self, from_, char, to): slot = self.map.setdefault((from_, char), set()) slot.add(to) def newSkelton(self): # copy fragment (self), and it return newFrag = NFAFragment(); newFrag.map = copy.deepcopy(self.map) return newFrag def __or__(self, frag): newFrag = self.newSkelton() for k, v in frag.map.iteritems(): newFrag.map[k] = v.copy() return newFrag def build(self): map_ = self.map def transition(state, char): return frozenset(map_.get((state, char), [])) return NondeterministicFiniteAutomaton( transition, self.start, self.accepts, self.map ) class Character(object): def __init__(self, char): self.char = char def assemble(self, context): frag = NFAFragment() s1 = context.newState() s2 = context.newState() frag.connect(s1, self.char, s2) frag.start = s1 frag.accepts = frozenset([s2]) return frag class Union(object): def __init__(self, op1, op2): self.op1 = op1 self.op2 = op2 def assemble(self, context): frag1 = self.op1.assemble(context) frag2 = self.op2.assemble(context) frag = frag1 | frag2 s = context.newState() frag.connect(s, "", frag1.start) frag.connect(s, "", frag2.start) frag.start = s frag.accepts = frag1.accepts | frag2.accepts return frag class Concat(object): def __init__(self, op1, op2): self.op1 = op1 self.op2 = op2 def assemble(self, context): frag1 = self.op1.assemble(context) frag2 = self.op2.assemble(context) frag = frag1 | frag2 for state in frag1.accepts: frag.connect(state, "", frag2.start) frag.start = frag1.start frag.accepts = frag2.accepts return frag class Star(object): def __init__(self, op): self.op = op def assemble(self, context): fragOrig = self.op.assemble(context) frag = fragOrig.newSkelton() for state in fragOrig.accepts: frag.connect(state, "", fragOrig.start) s = context.newState() frag.connect(s, "", fragOrig.start) frag.start = s frag.accepts = fragOrig.accepts | frozenset([s]) return frag class NonDisjoinSets(object): def __init__(self, sub): self.sub = sub def __contains__(self, a_set): return a_set & self.sub def nfa2dfa(nfa): def transition(set_, alpha): ret = set() for elem in set_: ret |= nfa.transition(elem, alpha) return nfa.epsilonExpand(frozenset(ret)) def stateSetTransition(stateSet, char): trans = set() for state in stateSet: t = nfa.map.setdefault((state, char), None) if not t == None: trans.update(nfa.epsilonExpand(t)) return frozenset(trans) map_ = dict() que = set(frozenset([nfa.epsilonExpand(set([nfa.start]))])) done = set() while que: stateSet = que.pop() for state in stateSet: for k, v in nfa.map.iteritems(): if state == k[0] and k[1] != '': slot = map_.setdefault((stateSet, k[1]), set()) slot.update(nfa.epsilonExpand(v)) done.add(stateSet) for v in map_.itervalues(): if not v in done: que.add(frozenset(v)) return DeterministicFiniteAutomaton( transition, nfa.epsilonExpand(frozenset([nfa.start])), NonDisjoinSets(nfa.accepts), map_ ) # create call graph (as dictionary) class CallGraph(object): def __init__(self, dfa): self.dfa = dfa self.start = "state_"+self.set2name(dfa.start) (self.map, self.accepts) = self.getCallGraph(self.dfa) # return state name from set def set2name(self, set_): l = list(set_) l.sort() return '_'.join(str(x) for x in l) def getCallGraph(self, dfa): funcMap = dict() funcMap[dfa.start] = set() # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} # : frozenset(1, 15) x 'd' -> frozenset([16]) for v in dfa.map.itervalues(): funcMap[frozenset(v)] = set() for key in dfa.map.iterkeys(): slot = funcMap.setdefault(key[0], set()) slot.add(key[1]) for v in dfa.map.itervalues(): funcMap[frozenset(v)] = set() for key in dfa.map.iterkeys(): slot = funcMap.setdefault(key[0], set()) slot.add(key[1]) callGraph = dict() accepts = list() # emit cbc code for k in funcMap.iterkeys(): callGraph["state_"+self.set2name(k)] = set() for k, v in funcMap.iteritems(): caseMap = dict() for case in v: caseMap[case] = "state_"+self.set2name(dfa.map[(frozenset(k), case)]) if k in dfa.accepts: accepts.append("state_"+self.set2name(k)) caseMap['\\0'] = "accept" callGraph["state_"+self.set2name(k)] = caseMap return (callGraph, accepts) # emit CbC-souce, from CallGraph def dfa2CbC(cg, regexp, emitCFlag): if emitCFlag: fun_type = 'void ' call_t = '' break_s = '\n\t\t\tbreak;' else: fun_type = '__code ' call_t = 'goto ' break_s = '' # emit cbc(or c) code print "#include <stdio.h>\n" for k in cg.map.iterkeys(): print fun_type + k + "(char* s);" print fun_type + 'accept(char* s);' print fun_type + 'reject(char* s);' print """ int main(int argc, char* argv[]) { \tputs(\"regexp: %s\"); \tputs(\"number of state: %d\"); \tprintf(\"string: %%s\\n\", argv[1]); \t%s%s(argv[1]); \treturn 0; }\n""" % (regexp, len(cg.map), call_t, cg.start) for k, v in cg.map.iteritems(): print fun_type + k + "(char* s) {" print "\tswitch(*s++) {" for case, next_state in v.iteritems(): print "\t\tcase '%s': " % (case) print "\t\t\t%s%s(s);%s" % (call_t, next_state, break_s) print "\t\tdefault: %sreject(s);\n\t}" % call_t print "}\n" print """ %saccept(char* s) { \tprintf(\"\\nstring matches regexp. \\n\\n\"); }\n""" % fun_type print """ %sreject(char* s) { \tprintf(\"\\nstring does not match regexp. \\n\\n\"); }\n""" % fun_type def dfa2Dot(cg, regexp): print """ digraph G { d2ttikzedgelabels = true; d2tstyleonly = true; d2tdocpreamble = \"\usetikzlibrary{automata}\"; d2tfigpreamble = \"\tikzstyle{every state}=\ [draw=blue!50, shape=circle, very thick,fill=blue!20]\"; edge [lblstyle=\"fill=blue!20\", style=\"arrows=->\", topath=\"bend left\"]; node [shape=\"circle\", style=\"state\"];\n """ def escape(string): return re.sub("_", "", string) print " %s [style=\"state, initial\"]" % (escape(cg.start)) for accept in cg.accepts: print " %s [style=\"state, accepting\"]" % (escape(accept)) for cur_state, trans in cg.map.iteritems(): for case, next_state in trans.iteritems(): if next_state != "accept": print " %s -> %s [texlbl=\"%s\"];\n" % (escape(cur_state), escape(next_state), case) print "}" class Regexp(object): def __init__(self, regexp): self.regexp = regexp self.nfa = None self.dfa = None self._compile() def _compile(self): lexer_ = Lexer(self.regexp) parser_ = Parser(lexer_) self.nfa = parser_.expression() self.dfa = nfa2dfa(self.nfa) def matches(self, string): runtime = self.dfa.getRuntime() return runtime.doesAccept(string) def emitCbC(self, emitCFlag=False): cg = CallGraph(self.dfa) dfa2CbC(cg, self.regexp, emitCFlag) def emitDot(self): cg = CallGraph(self.dfa) dfa2Dot(cg, self.regexp) def compile(regexp): return Regexp(regexp) def main(argv): myusage = "%prog [-C] regexp" psr = OptionParser(usage=myusage) psr.add_option("-C", action="store_true", dest="emitCFlag", default=False, help="emit C-source") psr.add_option("-D", action="store_true", dest="emitDot", default=False, help="emit Dot file") (opts, args) = psr.parse_args(sys.argv) if len(args) == 1: psr.print_help() exit() r = compile(args[1]) if opts.emitDot: r.emitDot() else: r.emitCbC(opts.emitCFlag) if __name__ == '__main__' : main(sys.argv)