# HG changeset patch # User ryoma # Date 1272185161 -32400 # Node ID 70947ddafad7763ea0ce5a1cc494b2412d3d6834 # Parent 60c1b2f8487adcc1ccb6a66a6028b8f6541a8a1c added test code, CbC-example/regexp diff -r 60c1b2f8487a -r 70947ddafad7 CbC-examples/regexp/dfareg Binary file CbC-examples/regexp/dfareg has changed diff -r 60c1b2f8487a -r 70947ddafad7 CbC-examples/regexp/dfareg.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CbC-examples/regexp/dfareg.c Sun Apr 25 17:46:01 2010 +0900 @@ -0,0 +1,182 @@ +#include + +#include + +__code state_20_21(char* s); +__code state_1(char* s); +__code state_3_8_9_13_23_24(char* s); +__code state_6_7(char* s); +__code state_2_3_9_13_23_24_25(char* s); +__code state_10_11(char* s); +__code state_18_19(char* s); +__code state_4_5(char* s); +__code state_14_15(char* s); +__code state_3_9_13_22_23_24(char* s); +__code state_3_9_12_13_23_24(char* s); +__code state_16_17(char* s); +__code accept(); +__code reject(); + +int main(int argc, char* argv[]) { + if (argc == 1) { + printf("usage: %s text\n", argv[0]); + exit(0); +} + + puts("regexp: P(erl|HP|ython)*"); + puts("number of state: 12"); + printf("string: %s\n", argv[1]); + goto state_1(argv[1]); + return 0; +} + +__code state_20_21(char* s) { + switch(*s++) { + case 'n': + goto state_3_9_13_22_23_24(s); + break; + default: goto reject(); + } +} + +__code state_1(char* s) { + switch(*s++) { + case 'P': + goto state_2_3_9_13_23_24_25(s); + break; + default: goto reject(); + } +} + +__code state_3_8_9_13_23_24(char* s) { + switch(*s++) { + case 'y': + goto state_14_15(s); + break; + case 'H': + goto state_10_11(s); + break; + case 'e': + goto state_4_5(s); + break; + case '\0': goto accept(); + break; + default: goto reject(); + } +} + +__code state_6_7(char* s) { + switch(*s++) { + case 'l': + goto state_3_8_9_13_23_24(s); + break; + default: goto reject(); + } +} + +__code state_2_3_9_13_23_24_25(char* s) { + switch(*s++) { + case 'y': + goto state_14_15(s); + break; + case 'H': + goto state_10_11(s); + break; + case 'e': + goto state_4_5(s); + break; + case '\0': goto accept(); + break; + default: goto reject(); + } +} + +__code state_10_11(char* s) { + switch(*s++) { + case 'P': + goto state_3_9_12_13_23_24(s); + break; + default: goto reject(); + } +} + +__code state_18_19(char* s) { + switch(*s++) { + case 'o': + goto state_20_21(s); + break; + default: goto reject(); + } +} + +__code state_4_5(char* s) { + switch(*s++) { + case 'r': + goto state_6_7(s); + break; + default: goto reject(); + } +} + +__code state_14_15(char* s) { + switch(*s++) { + case 't': + goto state_16_17(s); + break; + default: goto reject(); + } +} + +__code state_3_9_13_22_23_24(char* s) { + switch(*s++) { + case 'y': + goto state_14_15(s); + break; + case 'H': + goto state_10_11(s); + break; + case 'e': + goto state_4_5(s); + break; + case '\0': goto accept(); + break; + default: goto reject(); + } +} + +__code state_3_9_12_13_23_24(char* s) { + switch(*s++) { + case 'y': + goto state_14_15(s); + break; + case 'H': + goto state_10_11(s); + break; + case 'e': + goto state_4_5(s); + break; + case '\0': goto accept(); + break; + default: goto reject(); + } +} + +__code state_16_17(char* s) { + switch(*s++) { + case 'h': + goto state_18_19(s); + break; + default: goto reject(); + } +} + + +__code accept() { + printf("\nstring matches regexp. \n\n"); +} + + +__code reject() { + printf("\nstring does not match regexp. \n\n"); +} + diff -r 60c1b2f8487a -r 70947ddafad7 CbC-examples/regexp/dfareg.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CbC-examples/regexp/dfareg.py Sun Apr 25 17:46:01 2010 +0900 @@ -0,0 +1,414 @@ +#!/usr/bin/env python + +import copy +import sys + +class NondeterministicFiniteAutomaton(object): + def __init__(self, transition, start, accepts, map_): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + + def epsilonExpand(self, set_): + que = set(set_) + done = set() + while que: + stat = que.pop() + nexts = self.transition(stat, "") + done.add(stat) + for nextStat in nexts: + if not nextStat in done: que.add(nextStat) + return frozenset(done) + +class DeterministicFiniteAutomaton(object): + def __init__(self, transition, start, accepts, map_): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + def getRuntime(self): + return DFARuntime(self) + +class DFARuntime(object): + def __init__(self, DFA): + self.DFA = DFA + self.curState = self.DFA.start + + def doTransition(self, char): + self.curState = self.DFA.transition( + self.curState, char) + + def isAcceptState(self): + return self.curState in self.DFA.accepts + + def doesAccept(self, input): + for alphabet in input: + self.doTransition(alphabet) + return self.isAcceptState() + +class Token(object): + CHARACTER = 0 + OPE_UNION = 1 + OPE_STAR = 2 + LPAREN = 3 + RPAREN = 4 + EOF = 5 + + def __init__(self, value, kind): + self.value = value + self.kind = kind + + +class Lexer(object): + def __init__(self, str): + self.stringList = list(str) + + def scan(self): + if not self.stringList: + return Token(None, Token.EOF) + + ch = self.stringList.pop(0) + if ch == '\\': + return Token(self.stringList.pop(0), Token.CHARACTER) + elif ch == '|': + return Token(ch, Token.OPE_UNION) + elif ch == '(': + return Token(ch, Token.LPAREN) + elif ch == ')': + return Token(ch, Token.RPAREN) + elif ch == '*': + return Token(ch, Token.OPE_STAR) + else: + return Token(ch, Token.CHARACTER) + +""" + Context-free Grammer: + (A) expression -> subexpr EOF + (B) subexpr -> seq '|' subexpr | seq + (C) seq -> subseq | '' + (D) subseq -> star subseq | star + (E) star -> factor '*' | factor + (F) factor -> '(' subexpr ')' | CHARACTER +""" + +class Parser(object): + def __init__(self, lexer): + self.lexer = lexer + self.look = None + self.move() + + def match(self, tag): + if self.look.kind != tag: + raise Exeption("syntax error") + self.move() + + def move(self): + self.look = self.lexer.scan() + + def factor(self): + if self.look.kind == Token.LPAREN: + # factor -> '(' subexpr ')' + self.match(Token.LPAREN) + node = self.subexpr() + self.match(Token.RPAREN) + return node + else: + # factor -> CHARACTER + node = Character(self.look.value) + self.match(Token.CHARACTER) + return node + + def star(self): + # star -> factor '*' | factor + node = self.factor() + if self.look.kind == Token.OPE_STAR: + self.match(Token.OPE_STAR) + node = Star(node) + return node + + def seq(self): + if self.look.kind == Token.LPAREN \ + or self.look.kind == Token.CHARACTER: + # seq -> subseq + return self.subseq() + else: + # seq -> '' + return Character("") + + def subseq(self): + node1 = self.star() + if self.look.kind == Token.LPAREN \ + or self.look.kind == Token.CHARACTER: + # subseq -> star subseq + node2 = self.subseq() + node = Concat(node1, node2) + return node + else: + # subseq -> star + return node1 + + def subexpr(self): + node1 = self.seq() + if self.look.kind == Token.OPE_UNION: + self.match(Token.OPE_UNION) + # subexpr -> seq '|' subexpr + node2 = self.subexpr() + node = Union(node1, node2) + return node + else: + # subexpr -> seq + return node1 + + def expression(self): + # expression -> subexpr EOF + node = self.subexpr() + self.match(Token.EOF) + + # Create TREE, NFA + context = Context() + fragment = node.assemble(context) + return fragment.build() + +class Context(object): + def __init__(self): + self.stateCount = 0 + + def newState(self): + self.stateCount += 1 + return self.stateCount + +class NFAFragment(object): + def __init__(self): + self.start = None # integer + self.accepts = None # frozenset + self.map = dict() # transition + + def connect(self, from_, char, to): + slot = self.map.setdefault((from_, char), set()) + slot.add(to) + + def newSkelton(self): + # copy fragment (self), and it return + newFrag = NFAFragment(); + newFrag.map = copy.deepcopy(self.map) + return newFrag + + def __or__(self, frag): + newFrag = self.newSkelton() + for k, v in frag.map.iteritems(): + newFrag.map[k] = v.copy() + return newFrag + + def build(self): + map_ = self.map + def transition(state, char): + return frozenset(map_.get((state, char), [])) + + return NondeterministicFiniteAutomaton( + transition, + self.start, + self.accepts, + self.map + ) + +class Character(object): + def __init__(self, char): + self.char = char + + def assemble(self, context): + frag = NFAFragment() + s1 = context.newState() + s2 = context.newState() + frag.connect(s1, self.char, s2) + frag.start = s1 + frag.accepts = frozenset([s2]) + return frag + +class Union(object): + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + def assemble(self, context): + frag1 = self.op1.assemble(context) + frag2 = self.op2.assemble(context) + frag = frag1 | frag2 + s = context.newState() + frag.connect(s, "", frag1.start) + frag.connect(s, "", frag2.start) + frag.start = s + frag.accepts = frag1.accepts | frag2.accepts + return frag + +class Concat(object): + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + def assemble(self, context): + frag1 = self.op1.assemble(context) + frag2 = self.op2.assemble(context) + frag = frag1 | frag2 + + for state in frag1.accepts: + frag.connect(state, "", frag2.start) + + frag.start = frag1.start + frag.accepts = frag2.accepts + return frag + +class Star(object): + def __init__(self, op): + self.op = op + + def assemble(self, context): + fragOrig = self.op.assemble(context) + frag = fragOrig.newSkelton() + + for state in fragOrig.accepts: + frag.connect(state, "", fragOrig.start) + + s = context.newState() + frag.connect(s, "", fragOrig.start) + frag.start = s + frag.accepts = fragOrig.accepts | frozenset([s]) + return frag + +class NonDisjoinSets(object): + def __init__(self, sub): + self.sub = sub + def __contains__(self, a_set): + return a_set & self.sub + +def nfa2dfa(nfa): + def transition(set_, alpha): + ret = set() + for elem in set_: + ret |= nfa.transition(elem, alpha) + return nfa.epsilonExpand(frozenset(ret)) + + def stateSetTransition(stateSet, char): + trans = set() + for state in stateSet: + t = nfa.map.setdefault((state, char), None) + if not t == None: trans.update(nfa.epsilonExpand(t)) + return frozenset(trans) + + map_ = dict() + que = set(frozenset([nfa.epsilonExpand(set([nfa.start]))])) + done = set() + + while que: + stateSet = que.pop() + + for state in stateSet: + for k, v in nfa.map.iteritems(): + if state == k[0] and k[1] != '': + slot = map_.setdefault((stateSet, k[1]), set()) + slot.update(nfa.epsilonExpand(v)) + + done.add(stateSet) + + for v in map_.itervalues(): + if not v in done: + que.add(frozenset(v)) + + return DeterministicFiniteAutomaton( + transition, + nfa.epsilonExpand(frozenset([nfa.start])), + NonDisjoinSets(nfa.accepts), + map_ + ) + +# emit CbC-souce, from dfa +def dfa2CbC(dfa, regexp): + + # return state name from set + def set2name(set_): + l = list(set_) + l.sort() + return '_'.join(str(x) for x in l) + + funcMap = dict() + funcMap[dfa.start] = set() + + # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} + # : frozenset(1, 15) x 'd' -> frozenset([16]) + + for v in dfa.map.itervalues(): + funcMap[frozenset(v)] = set() + + for key in dfa.map.iterkeys(): + slot = funcMap.setdefault(key[0], set()) + slot.add(key[1]) + + # emit cbc code + print "#include \n" + print "#include \n" + for k in funcMap.iterkeys(): + print '__code state_' + set2name(k) + "(char* s);" + print '__code accept();' + print '__code reject();' + print """ +int main(int argc, char* argv[]) { +\tif (argc == 1) { +\t\tprintf(\"usage: %%s text\\n\", argv[0]); +\t\texit(0); +} + +\tputs(\"regexp: %s\"); +\tputs(\"number of state: %d\"); +\tprintf(\"string: %%s\\n\", argv[1]); +\tgoto state_%s(argv[1]); +\treturn 0; +}\n""" % (regexp, len(funcMap), set2name(dfa.start)) + + for k, v in funcMap.iteritems(): + print '__code state_' + set2name(k) + "(char* s) {" + print "\tswitch(*s++) {" + for case in v: + print "\t\tcase '%c': " % (case) + print "\t\t\tgoto state_%s(s);\n\t\t\tbreak;" % set2name(dfa.map[(frozenset(k), case)]) + if k in dfa.accepts: print "\t\tcase '\\0': goto accept();\n\t\t\tbreak;" + print "\t\tdefault: goto reject();\n\t}" + print "}\n" + + print """ +__code accept() { +\tprintf(\"\\nstring matches regexp. \\n\\n\"); +}\n""" + print """ +__code reject() { +\tprintf(\"\\nstring does not match regexp. \\n\\n\"); +}\n""" + +class Regexp(object): + def __init__(self, regexp): + self.regexp = regexp + self.nfa = None + self.dfa = None + self._compile() + + def _compile(self): + lexer_ = Lexer(self.regexp) + parser_ = Parser(lexer_) + self.nfa = parser_.expression() + self.dfa = nfa2dfa(self.nfa) + + def emitCbC(self): + dfa2CbC(self.dfa, self.regexp) + + def matches(self, string): + runtime = self.dfa.getRuntime() + return runtime.doesAccept(string) + +def compile(regexp): + return Regexp(regexp) + +def main(argv): + if len(argv) == 1 : exit("usage: dfareg.py regexp [> file]") + r = compile(argv[1]) + r.emitCbC() + +if __name__ == '__main__' : main(sys.argv)