Mercurial > hg > Members > shinya > pyrect
changeset 30:ef2928cdbdb6
organize directory. (but not separate module-dir yet,,)
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 09 Jul 2010 22:51:30 +0900 |
parents | b833746d9d92 |
children | 66f167c2286c |
files | pyrect/__init__.py pyrect/c_translator.py pyrect/cbc_translator.py pyrect/converter.py pyrect/dfa_translator.py pyrect/dfareg.py pyrect/dot_translator.py pyrect/grep_bench.sh pyrect/grep_translator.py pyrect/jitgrep.py pyrect/llvm_bench.py pyrect/llvm_translator.py pyrect/template/grep.c pyrect/translator.py src/__init__.py src/c_translator.py src/cbc_translator.py src/converter.py src/dfa_translator.py src/dfareg.py src/dot_translator.py src/grep_bench.sh src/grep_translator.py src/jitgrep.py src/llvm_bench.py src/llvm_translator.py src/template/grep.c src/translator.py |
diffstat | 28 files changed, 1382 insertions(+), 1412 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/__init__.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,5 @@ +#!/usr/bin/env python + +from dfareg import Regexp +def compile(regexp): + return Regexp(regexp)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/c_translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,119 @@ +#!/Usr/bin/env python + +from dfareg import Regexp, CallGraph +from translator import Translator + +class CTranslator(Translator): + """CTranslator + This Class can translate from DFA or NFA into C source code. + DFA: A simple state transition as tail call (also can implement with CbC). + NFA: using stack, deepening depth-first search. + >>> string = '(A|B)*C' + >>> reg = Regexp(string) + >>> dfacg = CallGraph(reg.dfa) + >>> nfacg = CallGraph(reg.nfa) + >>> CTranslator(string, dfacg).translate() + >>> CTranslator(string, nfacg).translate() + """ + def __init__(self, regexp, cg): + Translator.__init__(self, regexp, cg) + self.funType = 'void ' + self.callType = '' + self.breakStatement = '\t\t\tbreak;' + self.debug = False + self.eols = ('\\0', '\\n') + if self.cg.type == "DFA": + self.name_hash = self.create_name_hash() + + def modify_state_name(self, state_name): + if state_name == "accept" or state_name == "reject": + return state_name + if self.cg.type == "DFA": + return "state_"+self.name_hash[state_name] + else: + return "state_"+state_name + + def emit_accept_state(self): + self.emit (""" +%saccept(char* s) { +\tprintf(\"\\nstring matches regexp. \\n\\n\"); +}\n""" % self.funType) + + def emit_reject_state(self): + self.emit (""" +%sreject(char* s) { +\tprintf(\"\\nstring does not match regexp. \\n\\n\"); +}\n""" % self.funType) + + def emit_driver(self): + self.emit(""" +int main(int argc, char* argv[]) { +\tputs(\"regexp: %s\"); +\tputs(\"number of state: %d\"); +\tprintf(\"string: %%s\\n\", argv[1]); +\t%s%s(argv[1]); +""" % (self.regexp, len(self.cg.states), self.callType, self.modify_state_name(self.cg.start))) + if self.cg.type == "NFA": + self.emit("\treject(argv[1]);\n") + self.emit(""" +\treturn 0; +}\n\n""") + + def emit_switch(self, case, default=None): + self.emit("\tswitch(*s++) {\n") + for input, next_states in case.iteritems(): + if input != '': + self.emit("\t\tcase '%s': \n" % (input)) + for next_state in next_states: + self.emit("\t\t\t%s%s(s);\n" % (self.callType, self.modify_state_name(next_state))) + if self.breakStatement != '': self.emit(self.breakStatement+'\n') + + if default: + self.emit( """\t\tdefault:\n\t\t\t%s%s(s);\n""" % (self.callType, default)) + self.emit("\t}\n") + + + def emit_state(self, cur_state, transition): + self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n") + if self.debug: + self.emit("\tprintf(\"state: %s, input: %%s\\n\", s);\n" % (cur_state)) + if self.cg.type == "NFA": + if '' in transition: + epsilon_transition = transition.pop('') + for n in epsilon_transition: + self.emit("\t%s%s(s);\n" % (self.callType, self.modify_state_name(n))) + + if cur_state in self.cg.accepts: + for eol in self.eols: + transition[eol] = ["accept"] + + if transition: + if self.cg.type == "DFA": + self.emit_switch(transition, default="reject") + else: + self.emit_switch(transition) + self.emit("}\n\n") + + def emit_initialization(self): + self.emit("#include <stdio.h>\n\n") + for state in self.cg.map.iterkeys(): + self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n") + self.emit(self.funType + 'accept(char* s);\n') + self.emit(self.funType + 'reject(char* s);\n') + + def emit_from_callgraph(self): + # self.emit C-source code + self.emit_initialization() + self.emit_driver() + + for cur_state, transition in self.cg.map.iteritems(): + self.emit_state(cur_state, transition) + + self.emit_accept_state() + self.emit_reject_state() + +def test(): + import doctest + doctest.testmod() + +if __name__ == '__main__': test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/cbc_translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,1 @@ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/converter.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import sys +from dfareg import Regexp, CallGraph +from c_translator import CTranslator +from cbc_translator import CbCTranslator +from dot_translator import DotTranslator +from llvm_translator import LLVMTranslator +from grep_translator import GREPTranslator +from optparse import OptionParser + +def main(argv): + myusage = "%prog [-C] regexp" + psr = OptionParser(usage=myusage) + psr.add_option("--CbC", action="store_true", dest="emit_cbc", default=False, help="emit CbC-source") + psr.add_option("--grep", action="store_true", dest="emit_grep", default=False, help="emit grep-source") + psr.add_option("--LLVM", action="store_true", dest="emit_llvm", default=False, help="emit LLVM-source") + psr.add_option("--Dot", action="store_true", dest="emit_dot", default=False, help="emit Dot-source") + psr.add_option("--from-dfa", action="store_true", dest="dfa", default=True, help="translate from DFA") + psr.add_option("--from-nfa", action="store_true", dest="nfa", default=False, help="translate from NFA") + psr.add_option("-o", action="store", type="string", dest="output", default=False, help="output file", metavar="FILE") + psr.add_option("-O", action="store", type="string", dest="optimize", default=False, help="do optimization (only in llvm).", metavar="FILE") + psr.add_option("-L", action="store_true", dest="label", default=False, help="implement with label&goto. (only in llvm).") + psr.add_option("-g", action="store_true", dest="debug", default=False, help="embed debug info") + psr.add_option("-D", action="store_true", dest="emit_dot", default=False, help="emit Dot file") + (opts, args) = psr.parse_args(sys.argv) + if len(args) == 1: + psr.print_help() + exit() + reg = Regexp(args[1]) + if not opts.output: + output = sys.stdout + else: + output = open(opts.output,"w") + + if opts.nfa: + fa = reg.nfa + else: + fa = reg.dfa + + if opts.emit_dot: + translator = DotTranslator(reg.regexp, CallGraph(fa)) + translator.debug = opts.debug + elif opts.emit_llvm: + translator = LLVMTranslator(reg.regexp, CallGraph(fa)) + translator.debug = opts.debug + translator.optimize = opts.optimize + translator.impl_label = opts.label + elif opts.emit_grep: + translator = GREPTranslator(reg.regexp, CallGraph(fa)) + elif opts.emit_cbc: + translator = CbCTranslator(reg.regexp, CallGraph(fa)) + translator.debug = opts.debug + else: + translator = CTranslator(reg.regexp, CallGraph(fa)) + translator.debug = opts.debug + + translator.translate(output) + +if __name__ == '__main__' : main(sys.argv)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/dfa_translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +from grep_translator import GREPTranslator +from dfareg import Regexp, CallGraph + +'''(build|fndecl|gcc)''' +class DFATranslator(GREPTranslator): + """DFATranslator + This class can translate from DFA into size_t DFA(char* s). + which is entirely equivalent to dfaexec(..) in GNU-grep (see src/dfa.c). + * but which is not work currently. (when search large-file, there is fewer + * accepted-lines than grep's dfaexec.) + * probably, there is some problem exists about buffering. + >>> string = '(build|fndecl|gcc)' + >>> reg = Regexp(string) + >>> dfacg = CallGraph(reg.dfa) + >>> tje = DFATranslator(string, dfacg) + >>> tje.translate() + """ + + def __init__(self, regexp, cg): + GREPTranslator.__init__(self, regexp, cg) + self.funType = 'size_t ' + self.callType = 'return ' + self.breakStatement = '' + + def emit_initialization(self): + for state in self.cg.map.iterkeys(): + self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n") + self.emit(self.funType + 'accept(char* s);\n') + self.emit(self.funType + 'reject(char* s);\n') + + def emit_accept_state(self): + self.emit (""" +%saccept(char* s) { +\treturn 1; +}\n""" % self.funType) + + def emit_reject_state(self): + self.emit (""" +%sreject(char* s) { +\treturn 0; +}\n""" % self.funType) + + def emit_driver(self): + self.emit(""" +/* This DFA accept only \'%s\'*/ +%sDFA(char *s) { + char *begin = s; + do { + if (%s(s)) { //(matchhere(regexp+1, text)) + return (char const *) s - begin; + } + } while (*s != '\\n' && *s++ != '\\0'); + return (size_t) -1; +}\n\n""" % (self.regexp, self.funType, self.modify_state_name(self.cg.start))) + + def emit_state(self, cur_state, transition): + self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n") + if cur_state in self.cg.accepts: + self.emit("\treturn accept(s);\n") + else: + if transition: + if self.cg.type == "DFA": + self.emit_switch(transition, default="reject") + else: + self.emit_switch(transition) + self.emit("}\n\n") + +def test(): + import doctest + doctest.testmod() + +if __name__ == '__main__': test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/dfareg.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,443 @@ +#!/Usr/bin/env python + +import copy +import re +from optparse import OptionParser + +class NondeterministicFiniteAutomaton(object): + def __init__(self, transition, start, accepts, map_): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + + def epsilon_expand(self, set_): + que = set(set_) + done = set() + while que: + stat = que.pop() + nexts = self.transition(stat, "") + done.add(stat) + for nextStat in nexts: + if not nextStat in done: que.add(nextStat) + return frozenset(done) + +class DeterministicFiniteAutomaton(object): + def __init__(self, transition, start, accepts, map_): + self.transition = transition + self.start = start + self.accepts = accepts + self.map = map_ + def get_runtime(self): + return DFARuntime(self) + +class DFARuntime(object): + def __init__(self, DFA): + self.DFA = DFA + self.curState = self.DFA.start + + def do_transition(self, char): + self.curState = self.DFA.transition( + self.curState, char) + + def is_accept_state(self): + return self.curState in self.DFA.accepts + + def does_accept(self, input): + for alphabet in input: + self.do_transition(alphabet) + return self.is_accept_state() + +class Token(object): + CHARACTER = 0 + OPE_UNION = 1 + OPE_STAR = 2 + LPAREN = 3 + RPAREN = 4 + EOF = 5 + + def __init__(self, value, kind): + self.value = value + self.kind = kind + + +class Lexer(object): + def __init__(self, str): + self.stringList = list(str) + + def scan(self): + if not self.stringList: + return Token(None, Token.EOF) + + ch = self.stringList.pop(0) + if ch == '\\': + return Token(self.stringList.pop(0), Token.CHARACTER) + elif ch == '|': + return Token(ch, Token.OPE_UNION) + elif ch == '(': + return Token(ch, Token.LPAREN) + elif ch == ')': + return Token(ch, Token.RPAREN) + elif ch == '*': + return Token(ch, Token.OPE_STAR) + else: + return Token(ch, Token.CHARACTER) + +"""Context-free Grammer: + (A) expression -> subexpr EOF + (B) subexpr -> seq '|' subexpr | seq + (C) seq -> subseq | '' + (D) subseq -> star subseq | star + (E) star -> factor '*' | factor + (F) factor -> '(' subexpr ')' | CHARACTER +""" + +class Parser(object): + def __init__(self, lexer): + self.lexer = lexer + self.look = None + self.move() + + def match(self, tag): + if self.look.kind != tag: + raise Exeption("syntax error") + self.move() + + def move(self): + self.look = self.lexer.scan() + + def factor(self): + if self.look.kind == Token.LPAREN: + # factor -> '(' subexpr ')' + self.match(Token.LPAREN) + node = self.subexpr() + self.match(Token.RPAREN) + return node + else: + # factor -> CHARACTER + node = Character(self.look.value) + self.match(Token.CHARACTER) + return node + + def star(self): + # star -> factor '*' | factor + node = self.factor() + if self.look.kind == Token.OPE_STAR: + self.match(Token.OPE_STAR) + node = Star(node) + return node + + def seq(self): + if self.look.kind == Token.LPAREN \ + or self.look.kind == Token.CHARACTER: + # seq -> subseq + return self.subseq() + else: + # seq -> '' + return Character("") + + def subseq(self): + node1 = self.star() + if self.look.kind == Token.LPAREN \ + or self.look.kind == Token.CHARACTER: + # subseq -> star subseq + node2 = self.subseq() + node = Concat(node1, node2) + return node + else: + # subseq -> star + return node1 + + def subexpr(self): + node1 = self.seq() + if self.look.kind == Token.OPE_UNION: + self.match(Token.OPE_UNION) + # subexpr -> seq '|' subexpr + node2 = self.subexpr() + node = Union(node1, node2) + return node + else: + # subexpr -> seq + return node1 + + def expression(self): + # expression -> subexpr EOF + node = self.subexpr() + self.match(Token.EOF) + + # Create TREE, NFA + context = Context() + fragment = node.assemble(context) + return fragment.build() + +class Context(object): + def __init__(self): + self.stateCount = 0 + + def new_state(self): + self.stateCount += 1 + return self.stateCount + +class NFAFragment(object): + def __init__(self): + self.start = None # integer + self.accepts = None # frozenset + self.map = dict() # transition + + def connect(self, from_, char, to): + slot = self.map.setdefault((from_, char), set()) + slot.add(to) + + def new_skelton(self): + # copy fragment (self), and it return + newFrag = NFAFragment(); + newFrag.map = copy.deepcopy(self.map) + return newFrag + + def __or__(self, frag): + newFrag = self.new_skelton() + for k, v in frag.map.iteritems(): + newFrag.map[k] = v.copy() + return newFrag + + def build(self): + map_ = self.map + def transition(state, char): + return frozenset(map_.get((state, char), [])) + + return NondeterministicFiniteAutomaton( + transition, + self.start, + self.accepts, + self.map + ) + +class Character(object): + def __init__(self, char): + self.char = char + + def assemble(self, context): + frag = NFAFragment() + s1 = context.new_state() + s2 = context.new_state() + frag.connect(s1, self.char, s2) + frag.start = s1 + frag.accepts = frozenset([s2]) + return frag + +class Union(object): + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + def assemble(self, context): + frag1 = self.op1.assemble(context) + frag2 = self.op2.assemble(context) + frag = frag1 | frag2 + s = context.new_state() + frag.connect(s, "", frag1.start) + frag.connect(s, "", frag2.start) + frag.start = s + frag.accepts = frag1.accepts | frag2.accepts + return frag + +class Concat(object): + def __init__(self, op1, op2): + self.op1 = op1 + self.op2 = op2 + + def assemble(self, context): + frag1 = self.op1.assemble(context) + frag2 = self.op2.assemble(context) + frag = frag1 | frag2 + + for state in frag1.accepts: + frag.connect(state, "", frag2.start) + + frag.start = frag1.start + frag.accepts = frag2.accepts + return frag + +class Star(object): + def __init__(self, op): + self.op = op + + def assemble(self, context): + fragOrig = self.op.assemble(context) + frag = fragOrig.new_skelton() + + for state in fragOrig.accepts: + frag.connect(state, "", fragOrig.start) + + s = context.new_state() + frag.connect(s, "", fragOrig.start) + frag.start = s + frag.accepts = fragOrig.accepts | frozenset([s]) + return frag + +class NonDisjoinSets(object): + def __init__(self, sub): + self.sub = sub + def __contains__(self, a_set): + return a_set & self.sub + +def nfa2dfa(nfa): + def transition(set_, alpha): + ret = set() + for elem in set_: + ret |= nfa.transition(elem, alpha) + return nfa.epsilon_expand(frozenset(ret)) + + def state_set_transition(stateSet, char): + trans = set() + for state in stateSet: + t = nfa.map.setdefault((state, char), None) + if not t == None: trans.update(nfa.epsilon_expand(t)) + return frozenset(trans) + + map_ = dict() + que = set(frozenset([nfa.epsilon_expand(set([nfa.start]))])) + done = set() + + while que: + stateSet = que.pop() + + for state in stateSet: + for k, v in nfa.map.iteritems(): + if state == k[0] and k[1] != '': + slot = map_.setdefault((stateSet, k[1]), set()) + slot.update(nfa.epsilon_expand(v)) + + done.add(stateSet) + + for v in map_.itervalues(): + if not v in done: + que.add(frozenset(v)) + + return DeterministicFiniteAutomaton( + transition, + nfa.epsilon_expand(frozenset([nfa.start])), + NonDisjoinSets(nfa.accepts), + map_ + ) + +# create call graph (as dictionary) +class CallGraph(object): + """CallGraph is State Transition Diagram. + it's can be create from DFA or DFA. + >>> reg = Regexp(\"AA*|B\") + >>> dfacg = CallGraph(reg.dfa) + >>> nfacg = CallGraph(reg.nfa) + >>> dfacg.map + {'1_6_8': {'A': ['2_3_5'], 'B': ['7']}, '2_3_5': {'A': ['3_4']}, '7': {}, '3_4': {'A': ['3_4']}} + >>> dfacg.states + set(['1_6_8', '2_3_5', '7', '3_4']) + >>> dfacg.accepts + set(['2_3_5', '7', '3_4']) + >>> nfacg.map + {'1': {'A': ['2']}, '3': {'A': ['4']}, '2': {'': ['5']}, '5': {'': ['3']}, '4': {'': ['3']}, '7': {}, '6': {'B': ['7']}, '8': {'': ['1', '6']}} + >>> nfacg.states + set(['1', '3', '2', '5', '4', '7', '6', '8']) + >>> nfacg.accepts + set(['5', '4', '7']) + """ + + def __init__(self, fa): + self.fa = fa + if isinstance(fa, DeterministicFiniteAutomaton): + self.type = "DFA" + self.start = self.set2name(fa.start) + else: + self.type = "NFA" + self.start = str(fa.start) + self.inputs = set() + self.states = set() + (self.map, self.accepts) = self.create_call_graph(self.fa) + + # return state name from set + def set2name(self, set_): + l = list(set_) + l.sort() + return ('_'.join(str(x) for x in l)) + + def create_call_graph(self, fa): + transitionCases = dict() + if self.type == "DFA": + transitionCases[fa.start] = set() + else: + transitionCases[frozenset([fa.start])] = set() + + # transitionCases is hash table that binds a state and inputs, + # it's useful for transition definition + + # fa is dfa or nfa + # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} + # : frozenset(1, 15) x 'd' -> frozenset([16]) + # nfa.map => {(6, ''): set([5, 7]), (1, 'A'): set([2]), ....} + # : 6 x '' -> set([5, 7]) + + for nextState in fa.map.itervalues(): + if self.type == "DFA": + transitionCases[frozenset(nextState)] = set() + else: + for nextState_ in nextState: + transitionCases[frozenset([nextState_])] = set() + + for (state, input) in fa.map.iterkeys(): + if self.type == "NFA": + state = [state] + slot = transitionCases.setdefault(frozenset(state), set()) + slot.add(input) + + callGraph = dict() + accepts = set() + + # create CallGraph + for state in transitionCases.iterkeys(): + callGraph[self.set2name(state)] = set() + + for (state, input) in transitionCases.iteritems(): + caseMap = dict() + self.states.add(self.set2name(state)) + if self.type == "DFA": + for case in input: + self.inputs.add(case) + caseMap[case] = [self.set2name(fa.map[frozenset(state), case])] + if state in fa.accepts: + accepts.add(self.set2name(state)) + else: + self.states.add(str(list(state)[0])) + for case in input: + self.inputs.add(case) + caseMap[case] = [str(next_) for next_ in fa.map[list(state)[0], case]] + if list(state)[0] in fa.accepts: + accepts.add(self.set2name(state)) + callGraph[self.set2name(state)] = caseMap + return (callGraph, accepts) + + +class Regexp(object): + def __init__(self, regexp): + self.regexp = regexp + self.nfa = None + self.dfa = None + self._compile() + + def _compile(self): + lexer_ = Lexer(self.regexp) + parser_ = Parser(lexer_) + self.nfa = parser_.expression() + self.dfa = nfa2dfa(self.nfa) + + def matches(self, string): + runtime = self.dfa.get_runtime() + return runtime.does_accept(string) + +def compile(regexp): + return Regexp(regexp) + +def test(): + import doctest + doctest.testmod() + +if __name__ == '__main__' : test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/dot_translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import re +from translator import Translator +from dfareg import CallGraph, Regexp + +class DotTranslator(Translator): + """ + DotToranslator + This Class can translate from DFA or NFA into Dot + Dot is Graph-generater using TeX. + --code/graph/makepdf.sh is generate graph script. + >>> string = \"(A|B)*C\" + >>> reg = Regexp(string) + >>> dfacg = CallGraph(reg.dfa) + >>> nfacg = CallGraph(reg.nfa) + >>> DotTranslator(string, dfacg).translate() + >>> DotTranslator(string, nfacg).translate() + """ + def __init__(self, regexp, cg): + Translator.__init__(self, regexp, cg) + if self.cg.type == "DFA": + self.name_hash = self.create_name_hash() + + def modify_state_name(self, state_name): + if self.cg.type == "DFA": + return "s"+self.name_hash[state_name] + else: + return "s"+state_name + + def emit_from_callgraph(self): + self.emit(''' +digraph G{ +\td2tdocpreamble = "\\usetikzlibrary{automata}"; +\td2tfigpreamble = "\\tikzstyle{every state}= \\ +\t[draw=blue!50,very thick,shape=circle, fill=blue!20]"; +\tnode [style="state"]; +\tedge [lblstyle="fill=blue!20", style="arrows=->", topath="bend left"]; +''') + + self.emit("\t%s [style=\"state, initial\"]\n" % (self.modify_state_name(self.cg.start))) + for accept in self.cg.accepts: + self.emit("\t%s [style=\"state, accepting\"]\n" % (self.modify_state_name(accept))) + + for cur_state, trans in self.cg.map.iteritems(): + for input, next_states in trans.iteritems(): + if input == "" : input = "$\\varepsilon$" + for next_state in next_states: + self.emit("\t%s -> %s [texlbl=\"%s\"]\n" + % (self.modify_state_name(cur_state), self.modify_state_name(next_state), input)) + + self.emit("}") + + +def test(): + import doctest + doctest.testmod() + ''' + reg = Regexp("(A|B)*C") + ct = CTranslator(reg.regexp, CallGraph(reg.dfa)) + ct.translate() + ''' + +if __name__ == '__main__' : test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/grep_bench.sh Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,30 @@ +#!/bin/sh + +egrepout="/tmp/egrep.out" +jitgrepout="/tmp/jitgrep.out" +agrepout="/tmp/agrep.out" +cgrepout="/tmp/cgrep.out" + +echo "[jitgrep]" +time ./jitgrep.py $@ > $jitgrepout +#time /tmp/jitgrep $@ > $jitgrepout + +echo "\n[cgrep]" +time cgrep -E $@ > $cgrepout + +echo "\n[egrep]" +time egrep $@ > $egrepout + +echo "\n[agrep]" +time agrep $@ > $agrepout + +echo "\n[diff egrep jitgrep]" +diff $egrepout $jitgrepout + +echo "[diff cgrep jitgrep]" +diff $cgrepout $jitgrepout + +echo "[diff agrep jitgrep]" +diff $agrepout $jitgrepout + +#rm -f $egrepout $jitgrepout $agrepout $cgrepout
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/grep_translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +from c_translator import CTranslator +from dfareg import Regexp, CallGraph + +class GREPTranslateExeption(Exception): + pass + +class GREPTranslator(CTranslator): + """GREPTranslator + This Class can translate form DFA into grep source-code. + which based on (beautiful) mini-grep introduced \"The Practice of Programming\" + written by Rob Pike & Brian W. Kernighan. (see template/grep.c) + >>> string = \"(build|fndecl|gcc)\" + >>> reg = Regexp(string) + >>> dfacg = CallGraph(reg.dfa) + >>> tje = GREPTranslator(string, dfacg) + >>> tje.translate() + """ + + def __init__(self, regexp, cg): + if cg.type == "NFA": raise GREPTranslateExeption("can't translate grep from NFA") + CTranslator.__init__(self, regexp, cg) + self.funType = 'int ' + self.callType = 'return ' + self.breakStatement = '' + + def emit_accept_state(self): + self.emit (""" +%saccept(char* s) { +\treturn 1; +}\n""" % self.funType) + + def emit_reject_state(self): + self.emit (""" +%sreject(char* s) { +\treturn 0; +}\n""" % self.funType) + + def emit_initialization(self): + self.emit("%sFILTER(char* s);\n" % (self.funType)) + self.emit("%sDFA(char* s);\n" % (self.funType)) + self.emit("extern int bmhfilter(char* text, char *key);\n") + self.emit("extern int grepmain(int argc, char* argv[]);\n") + for state in self.cg.map.iterkeys(): + self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n") + self.emit(self.funType + 'accept(char* s);\n') + self.emit(self.funType + 'reject(char* s);\n') + + def emit_filter(self): + self.emit(""" +int FILTER(char *text) { +""") + self.emit("\treturn 1;\n}\n") + + def emit_driver(self): + self.emit(""" +int main(int argc, char* argv[]) { +\treturn grepmain(argc, argv); +} +""") + self.emit_filter() + self.emit(""" +%sDFA(char *s) { + return %s(s); +}\n\n""" % (self.funType, self.modify_state_name(self.cg.start))) + + def emit_state(self, cur_state, transition): + self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n") + if cur_state in self.cg.accepts: + self.emit("\treturn accept(s);\n") + else: + if transition: + if self.cg.type == "DFA": + self.emit_switch(transition, default="reject") + else: + self.emit_switch(transition) + self.emit("}\n\n") + +def test(): + import doctest + doctest.testmod() + +if __name__ == '__main__': test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/jitgrep.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +import sys +import os +import re +import time +from optparse import OptionParser +from grep_translator import GREPTranslator +from dfareg import Regexp, CallGraph + +def main(argv): + myusage = "%prog [--time] [--debug] [--cc=compiler] [-Olevel] regexp [file ..]" + psr = OptionParser(usage=myusage) + + redirect = "" + optimize = "-O3" + srcpath = "/tmp/jitgrep_dfa.c" + binpath = "/tmp/jitgrep" + libgrep = "template/libgrep.so" + + argv_iter = argv + for args in argv_iter: + matchp = re.match("^-O[0-9]?$", args) + if matchp: + optimize = matchp.group(0) + argv.remove(optimize) + + psr.add_option("--cc", action="store", type="string", dest="compiler", default="gcc", metavar="FILE", + help="Choose compiler (default is gcc).") + psr.add_option("--CFLAGS", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.") + psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.") + psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).") + psr.add_option("--out", action="store", type="string", dest="out", default="", metavar="FILE", help="Output file.") + psr.add_option("-O", action="store_true", dest="_", help="Optimizing compilation takes somewhat more time, level be recognized a digits [0-9]. (default: -O3)") + + (opts, args) = psr.parse_args(argv) + libgrep = ".".join([libgrep, opts.compiler]) + + if len(args) < 2: + psr.print_usage() + return + + if opts.compiler == "cbc": + cbc = True + opts.compiler = "gcc" + else: + cbc = False + + if opts.debug: print("option", opts) + if opts.debug: print("args", args) + + if (opts.time) : start_time = time.time() + string = args[1] + reg = Regexp(string) + dfacg = CallGraph(reg.dfa) + grept = GREPTranslator(string, dfacg) + + tmpsrc = open(srcpath, 'w') + grept.translate(tmpsrc) + tmpsrc.close() + if (opts.time): + end_time = time.time() + print("Translation: " + str(end_time - start_time) + " Sec.") + + if not os.path.exists(libgrep): + cmd = " ".join([opts.compiler, optimize, opts.cflags, "-c -fPIC -shared template/grep.c -o", libgrep]) + if opts.debug: + print cmd + else: + os.system(cmd) + + cmd = " ".join([opts.compiler, optimize, srcpath, libgrep, "-o", binpath]) + if opts.debug: + print("compile command", cmd) + else: + if (opts.time): start_time = time.time() + os.system(cmd) + if (opts.time): + end_time = time.time() + print("Compiling : " + str(end_time - start_time) + " Sec.") + + if opts.debug: + print("argv=" + argv) + print("args=" + args) + print("opts=" + opts) + + if len(args) == 2 and not opts.debug: + while True: + try: + os.system(binpath + ' ' + raw_input()) + except (KeyboardInterrupt, EOFError): + break + else: + if (opts.time): redirect = "> /dev/null" + if (opts.out): redirect = ">" + opts.out + cmd = ' '.join([binpath, "dummy"] + args[2:] + [redirect]) + if opts.debug: + print("exec command", cmd) + else: + if (opts.time): start_time = time.time() + os.system(cmd) + if (opts.time): + end_time = time.time() + print("Matching : " + str(end_time - start_time) + " Sec.") + + if not opts.debug: + #os.remove(srcpath) + os.remove(binpath) + +if __name__ == '__main__': main(sys.argv)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/llvm_bench.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +import sys,re +from optparse import OptionParser +from timeit import Timer +from dfareg import Regexp +from dfareg import Regexp, CallGraph +from llvm_translator import LLVMTranslator + +myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]" +psr = OptionParser(usage=myusage) +psr.add_option("-O", action="store_true", dest="optimize", default=False, help="optimimzation") +psr.add_option("-L", action="store_true", dest="label", default=False, help="impliment with label") +psr.add_option("-E", action="store_true", dest="execute", default=False, help="execute Module") +psr.add_option("-p", action="store_true", dest="print_", default=False, help="print module") +psr.add_option("-g", action="store_true", dest="debug", default=False, help="add debug stmt to module") +psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT") +psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE") +psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE") +(opts, args) = psr.parse_args(sys.argv) + +print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number) + +reg = Regexp(opts.regexp) +dfacg = CallGraph(reg.dfa) +llvmt = LLVMTranslator(reg.regexp, dfacg) +llvmt.string = opts.string +llvmt.debug = opts.debug +llvmt.optimize = opts.optimize + +if opts.print_: llvmt.translate() + +if opts.execute: + print "LLVM :", Timer(setup='from __main__ import llvmt', stmt='llvmt.execute()').timeit(opts.number) + + reg = re.compile("^%s$" % opts.regexp) + print "re :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number) + + reg = Regexp(opts.regexp) + print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/llvm_translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,242 @@ +#!/usr/bin/env python + +from llvm.core import * +from llvm.passes import * +from llvm.ee import * +from translator import Translator +from dfareg import Regexp, CallGraph + +class LLVMTranslator(Translator): + """LLVMTranslator + This Class can translate from DFA or NFA into LLVM-IR. + and also can JIT-Compile/evaluate it's self using llvm-py. + >>> string = '(A|B)*C' + >>> reg = Regexp(string) + >>> dfacg = CallGraph(reg.dfa) + >>> lt = LLVMTranslator(string, dfacg) + >>> lt.debug = True + >>> lt.translate() + >>> isinstance(lt.execute(), llvm.ee.GenericValue) + True + """ + # define llvm core types, and const + int_t = Type.int() + char_t = Type.int(8) + charptr_t = Type.pointer(char_t) + charptrptr_t = Type.pointer(charptr_t) + const_zero = Constant.int(int_t, 0) + const_one = Constant.int(int_t, 1) + llvm.GuaranteedTailCallOpt = True + + def __init__(self, regexp, cg): #(self, modName, regexp, string, self.impl_label, optimized, debug): + Translator.__init__(self, regexp, cg) + self.optimize = False + self.debug = False + self.impl_label = False + self.string = "ABC" + if self.cg.type == "DFA": + self.name_hash = self.create_name_hash() + self.compiled = False + + def modify_state_name(self, state_name): + if self.cg.type == "DFA": + return self.name_hash[state_name] + else: + return state_name + + def _compile(self): + self.llvm_module = Module.new(self.cg.type) + self.matchp_str = self.new_str_const(self.string) + self.debug_str = self.new_str_const("state: %s, arg: %c(int %d)\n") + def optional_func_decl(fun): + fun.calling_convertion = CC_X86_FASTCALL + fun.args[0].name = "index" + + def func_decl(state): + optional_func_decl(state) + + state_ref = dict() + main = self.llvm_module.add_function( + Type.function(self.int_t, (self.int_t,)), "main") + optional_func_decl(main) + main_entry = main.append_basic_block("entry") + + if self.impl_label: + accept_state = main.append_basic_block("accpet") + reject_state = main.append_basic_block("reject") + index_ptr = Builder.new(main_entry).malloc(self.int_t) + Builder.new(accept_state).free(index_ptr) + Builder.new(reject_state).free(index_ptr) + else: + # Create function - accept and reject (final state). + accept_state = self.llvm_module.add_function( + Type.function(self.int_t, (self.int_t,)), "accept") + optional_func_decl(accept_state) + reject_state = self.llvm_module.add_function( + Type.function(self.int_t, (self.int_t,)), "reject") + optional_func_decl(reject_state) + + + state_ref["accept"] = accept_state + state_ref["reject"] = reject_state + + # add state to module, (as function or label). + if (self.impl_label): + for state in self.cg.map.iterkeys(): + label = main.append_basic_block(state) + state_ref[state] = label + else: + for state in self.cg.map.iterkeys(): + fun = self.llvm_module.add_function( + Type.function(self.int_t, (self.int_t,)), state) + optional_func_decl(fun) + state_ref[state] = fun + + # emit instructions + if (self.impl_label): emit = Builder.new(accept_state) + else: emit = Builder.new(accept_state.append_basic_block("entry")) + if self.debug: self.emit_call_printf(emit, "%s does match regexp\n", self.gep_first(emit, self.matchp_str)) + emit.ret(self.const_one) + + if (self.impl_label): emit = Builder.new(reject_state) + else: emit = Builder.new(reject_state.append_basic_block("entry")) + if self.debug: self.emit_call_printf(emit, "%s does not match regexp\n", self.gep_first(emit, self.matchp_str)) + emit.ret(self.const_zero) + + if (self.impl_label): + # emit transition instruction with jump instruction + emit = Builder.new(main_entry) + emit.store(main.args[0], index_ptr) + emit.branch(state_ref[self.cg.start]) + + for state, transition in self.cg.map.iteritems(): + emit = Builder.new(state_ref[state]) + index = emit.load(index_ptr) + ptr = emit.gep(self.matchp_str, (self.const_zero, index)) + emit.store(emit.add(self.const_one, index), index_ptr) + char = emit.load(ptr) + si = emit.switch(char, state_ref['reject'], len(transition)) + local_label = 0 + if state in self.cg.accepts: + transition['\\0'] = "accept" + for case, next_states in transition.iteritems(): + bb = main.append_basic_block("%s_case%d" % (state, local_label)) #create default bb + emit = Builder.new(bb) + emit.branch(state_ref[next_states[0]]) + si.add_case(self.char_const(case), bb) + local_label += 1 + else: + for state, transition in self.cg.map.iteritems(): + cases = dict() + if state in self.cg.accepts: + transition['\\0'] = ["accept"] + for case, next_states in transition.iteritems(): + cases[self.char_const(case)] = state_ref[next_states[0]] + state_fun = state_ref[state] + emit = Builder.new(state_fun.append_basic_block("entry")) + ptr = emit.gep(self.matchp_str, (self.const_zero, state_fun.args[0])) + next_index = emit.add(state_fun.args[0], self.const_one) + char = emit.load(ptr) + + if (self.debug): self.emit_call_printf(emit, self.debug_str, self.gep_first(emit, self.new_str_const(fun.name)), char, char) + + label = 0 + default_bb = state_fun.append_basic_block("default") #create default bb + builder = Builder.new(default_bb) # default is reject. + ret = builder.call(reject_state, (next_index,)) + builder.ret(ret) + + si = emit.switch(char, default_bb, len(cases)) # create switch instruction with deafult case. + for case, nextFun in cases.iteritems(): + bb = state_fun.append_basic_block("case%d" % label) #create default bb + builder = Builder.new(bb) + ret = builder.call(nextFun, (next_index,)) + builder.ret(ret) + si.add_case(case, bb) + label += 1 + emit = Builder.new(main_entry) + ret = emit.call(state_ref[self.cg.start], (main.args[0],)) + emit.ret(ret) + + self.mp = ModuleProvider.new(self.llvm_module) + if (self.optimize): self.do_optimize() + self.ee = ExecutionEngine.new(self.mp) + self.main = main + self.compiled = True + + def emit_from_callgraph(self): + if not self.compiled: + self._compile() + self.emit(str(self.llvm_module)) + + def get_execution_engine(self): + if not self.compiled: + self._compile() + return self.ee + + def do_optimize(self): + #optimization passes + pm = PassManager.new() + pm.add(TargetData.new('')) + pm.add(PASS_FUNCTION_INLINING) + pm.run(self.llvm_module) + fp = FunctionPassManager.new(self.mp) + fp.add(TargetData.new('')) + fp.add(PASS_BLOCK_PLACEMENT) + fp.add(PASS_INSTRUCTION_COMBINING) + fp.add(PASS_TAIL_CALL_ELIMINATION) + fp.add(PASS_AGGRESSIVE_DCE) + fp.add(PASS_DEAD_INST_ELIMINATION) + fp.add(PASS_DEAD_CODE_ELIMINATION) + for fun in self.llvm_module.functions: + fp.run(fun) + + def print_module(self): + if not self.compiled: + self._compile() + print self.llvm_module + + def execute(self): + if not self.compiled: + self._compile() + return self.ee.run_function(self.main, + (GenericValue.int(self.int_t, 0),)) + + def new_str_const(self, val): + '''create string(array of int) as a global value ''' + str = self.llvm_module.add_global_variable(Type.array(self.char_t, len(val) + 1), "") + str.initializer = Constant.stringz(val) + return str + + def gep_first(self, emit, val): + '''get pointer of array''' + return emit.gep(val, (self.const_zero, self.const_zero)) + + def char_const(self, val): + '''create constant int value''' + if isinstance(val, str): + if val == '\\0': + return Constant.int(self.char_t, 0) + else: + return Constant.int(self.char_t, ord(val)) + else: + exit('char_const: invalid argument.', val) + + def emit_call_printf(self, emit, string, *args): + '''emit libc printf function call instruction''' + try: + printf = self.llvm_module.get_function_named("printf") + except llvm.LLVMException: + printf = self.llvm_module.add_function( + Type.function(Type.void(), + (Type.pointer(self.char_t, 0),), 1), "printf") + if isinstance(string, str): + string = self.new_str_const(string) + emit.call(printf, + [self.gep_first(emit, string)]+list(args)) + +def test(): + import doctest + doctest.testmod() + +if __name__ == "__main__": test()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/template/grep.c Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,80 @@ +/* Excerpted from 'The Practice of Programming' */ +/* by Brian W. Kernighan and Rob Pike */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define BUFSIZE 1024 + +extern int DFA(char *text); +extern int FILTER(char *text); +int bmhfilter(char *text, char *key); + +int match(char *regexp, char *text) { + if (regexp[0] == '^') + return DFA(text); + do { + if (FILTER(text) && DFA(text)) + return 1; + } while (*text++ != '\0'); + return 0; +} + +int bmhfilter(char *text, char *key) { + int i, j; + i = j = 0; + while (text[i] != '\0' && key[j] != '\0') { + if (text[i] == key[j]) { ++i; ++j; } + else { i = i - j + 1; j = 0; } + } + if (key[j] == '\0') + return 1; /* position = k - j; */ + return 0; +} + +int grep(char * regexp, FILE *f, char *name) { + int n, nmatch; + char buf[BUFSIZE]; + nmatch = 0; + while (fgets(buf, sizeof buf, f) != NULL) { + n = strlen(buf); + if (n > 0 && buf[n-1] == '\n') + buf[n-1] = '\0'; + if (match(regexp, buf)) { + nmatch++; + if (name != NULL) + printf("%s:", name); + printf("%s\n", buf); + } + } + return nmatch; +} + +int grepmain(int argc, char* argv[]) { + int i, nmatch; + FILE *f; + + if (argc < 2) { + fprintf(stderr, "usage: grep regexp [file ...]"); + exit(0); + } + nmatch = 0; + if (argc == 2) { + if (grep(argv[1], stdin, NULL)) + nmatch; + } else { + for (i = 2; i < argc; i++) { + f = fopen(argv[i], "r"); + if (f == NULL) { + fprintf(stderr, "can't open %s:", argv[i]); + continue; + } + if (grep(argv[1], f, argc > 3 ? argv[i] : NULL) > 0) + nmatch++; + fclose(f); + } + } + + return nmatch; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pyrect/translator.py Fri Jul 09 22:51:30 2010 +0900 @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +import sys + +class Translator(object): + def __init__(self, regexp, cg): + self.regexp = regexp + self.cg = cg + self.stream = None + + def emit(self, string): + self.stream.write(string) + + def create_name_hash(self): + name_hash = dict() + states = list(self.cg.states) + for index in range(len(states)): + name_hash[states[index]] = str(index) + return name_hash + + def modify_state_name(self, state_name): + return state_name + + def emit_from_callgraph(self): + pass + + def translate(self, stream=sys.stdout): + self.stream = stream + self.emit_from_callgraph()
--- a/src/__init__.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -#!/usr/bin/env python - -from dfareg import Regexp -def compile(regexp): - return Regexp(regexp)
--- a/src/c_translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,119 +0,0 @@ -#!/Usr/bin/env python - -from dfareg import Regexp, CallGraph -from translator import Translator - -class CTranslator(Translator): - """CTranslator - This Class can translate from DFA or NFA into C source code. - DFA: A simple state transition as tail call (also can implement with CbC). - NFA: using stack, deepening depth-first search. - >>> string = '(A|B)*C' - >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> nfacg = CallGraph(reg.nfa) - >>> CTranslator(string, dfacg).translate() - >>> CTranslator(string, nfacg).translate() - """ - def __init__(self, regexp, cg): - Translator.__init__(self, regexp, cg) - self.funType = 'void ' - self.callType = '' - self.breakStatement = '\t\t\tbreak;' - self.debug = False - self.eols = ('\\0', '\\n') - if self.cg.type == "DFA": - self.name_hash = self.create_name_hash() - - def modify_state_name(self, state_name): - if state_name == "accept" or state_name == "reject": - return state_name - if self.cg.type == "DFA": - return "state_"+self.name_hash[state_name] - else: - return "state_"+state_name - - def emit_accept_state(self): - self.emit (""" -%saccept(char* s) { -\tprintf(\"\\nstring matches regexp. \\n\\n\"); -}\n""" % self.funType) - - def emit_reject_state(self): - self.emit (""" -%sreject(char* s) { -\tprintf(\"\\nstring does not match regexp. \\n\\n\"); -}\n""" % self.funType) - - def emit_driver(self): - self.emit(""" -int main(int argc, char* argv[]) { -\tputs(\"regexp: %s\"); -\tputs(\"number of state: %d\"); -\tprintf(\"string: %%s\\n\", argv[1]); -\t%s%s(argv[1]); -""" % (self.regexp, len(self.cg.states), self.callType, self.modify_state_name(self.cg.start))) - if self.cg.type == "NFA": - self.emit("\treject(argv[1]);\n") - self.emit(""" -\treturn 0; -}\n\n""") - - def emit_switch(self, case, default=None): - self.emit("\tswitch(*s++) {\n") - for input, next_states in case.iteritems(): - if input != '': - self.emit("\t\tcase '%s': \n" % (input)) - for next_state in next_states: - self.emit("\t\t\t%s%s(s);\n" % (self.callType, self.modify_state_name(next_state))) - if self.breakStatement != '': self.emit(self.breakStatement+'\n') - - if default: - self.emit( """\t\tdefault:\n\t\t\t%s%s(s);\n""" % (self.callType, default)) - self.emit("\t}\n") - - - def emit_state(self, cur_state, transition): - self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n") - if self.debug: - self.emit("\tprintf(\"state: %s, input: %%s\\n\", s);\n" % (cur_state)) - if self.cg.type == "NFA": - if '' in transition: - epsilon_transition = transition.pop('') - for n in epsilon_transition: - self.emit("\t%s%s(s);\n" % (self.callType, self.modify_state_name(n))) - - if cur_state in self.cg.accepts: - for eol in self.eols: - transition[eol] = ["accept"] - - if transition: - if self.cg.type == "DFA": - self.emit_switch(transition, default="reject") - else: - self.emit_switch(transition) - self.emit("}\n\n") - - def emit_initialization(self): - self.emit("#include <stdio.h>\n\n") - for state in self.cg.map.iterkeys(): - self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n") - self.emit(self.funType + 'accept(char* s);\n') - self.emit(self.funType + 'reject(char* s);\n') - - def emit_from_callgraph(self): - # self.emit C-source code - self.emit_initialization() - self.emit_driver() - - for cur_state, transition in self.cg.map.iteritems(): - self.emit_state(cur_state, transition) - - self.emit_accept_state() - self.emit_reject_state() - -def test(): - import doctest - doctest.testmod() - -if __name__ == '__main__': test()
--- a/src/cbc_translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -#!/usr/bin/env python - -from dfareg import Regexp, CallGraph -from c_translator import CTranslator - -class CbCTranslateExeption(Exception): - pass - -class CbCTranslator(CTranslator): - """ - CbCTranslator - >>> string = \"(A|B)*C\" - >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> ct = CbCTranslator(string, dfacg) - >>> ct.translate() - >>> ct.debug = True - >>> ct.translate() - """ - def __init__(self, regexp, cg): - if cg.type == "NFA": raise CbCTranslateExeption("can't translate CbC from NFA") - CTranslator.__init__(self, regexp, cg) - self.funType = '__code ' - self.callType = 'goto ' - self.breakStatement = '' - -def test(): - import doctest - doctest.testmod() - -if __name__ == '__main__' : test()
--- a/src/converter.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,60 +0,0 @@ -#!/usr/bin/env python - -import sys -from dfareg import Regexp, CallGraph -from c_translator import CTranslator -from cbc_translator import CbCTranslator -from dot_translator import DotTranslator -from llvm_translator import LLVMTranslator -from grep_translator import GREPTranslator -from optparse import OptionParser - -def main(argv): - myusage = "%prog [-C] regexp" - psr = OptionParser(usage=myusage) - psr.add_option("--CbC", action="store_true", dest="emit_cbc", default=False, help="emit CbC-source") - psr.add_option("--grep", action="store_true", dest="emit_grep", default=False, help="emit grep-source") - psr.add_option("--LLVM", action="store_true", dest="emit_llvm", default=False, help="emit LLVM-source") - psr.add_option("--Dot", action="store_true", dest="emit_dot", default=False, help="emit Dot-source") - psr.add_option("--from-dfa", action="store_true", dest="dfa", default=True, help="translate from DFA") - psr.add_option("--from-nfa", action="store_true", dest="nfa", default=False, help="translate from NFA") - psr.add_option("-o", action="store", type="string", dest="output", default=False, help="output file", metavar="FILE") - psr.add_option("-O", action="store", type="string", dest="optimize", default=False, help="do optimization (only in llvm).", metavar="FILE") - psr.add_option("-L", action="store_true", dest="label", default=False, help="implement with label&goto. (only in llvm).") - psr.add_option("-g", action="store_true", dest="debug", default=False, help="embed debug info") - psr.add_option("-D", action="store_true", dest="emit_dot", default=False, help="emit Dot file") - (opts, args) = psr.parse_args(sys.argv) - if len(args) == 1: - psr.print_help() - exit() - reg = Regexp(args[1]) - if not opts.output: - output = sys.stdout - else: - output = open(opts.output,"w") - - if opts.nfa: - fa = reg.nfa - else: - fa = reg.dfa - - if opts.emit_dot: - translator = DotTranslator(reg.regexp, CallGraph(fa)) - translator.debug = opts.debug - elif opts.emit_llvm: - translator = LLVMTranslator(reg.regexp, CallGraph(fa)) - translator.debug = opts.debug - translator.optimize = opts.optimize - translator.impl_label = opts.label - elif opts.emit_grep: - translator = GREPTranslator(reg.regexp, CallGraph(fa)) - elif opts.emit_cbc: - translator = CbCTranslator(reg.regexp, CallGraph(fa)) - translator.debug = opts.debug - else: - translator = CTranslator(reg.regexp, CallGraph(fa)) - translator.debug = opts.debug - - translator.translate(output) - -if __name__ == '__main__' : main(sys.argv)
--- a/src/dfa_translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -#!/usr/bin/env python - -from grep_translator import GREPTranslator -from dfareg import Regexp, CallGraph - -'''(build|fndecl|gcc)''' -class DFATranslator(GREPTranslator): - """DFATranslator - This class can translate from DFA into size_t DFA(char* s). - which is entirely equivalent to dfaexec(..) in GNU-grep (see src/dfa.c). - * but which is not work currently. (when search large-file, there is fewer - * accepted-lines than grep's dfaexec.) - * probably, there is some problem exists about buffering. - >>> string = '(build|fndecl|gcc)' - >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> tje = DFATranslator(string, dfacg) - >>> tje.translate() - """ - - def __init__(self, regexp, cg): - GREPTranslator.__init__(self, regexp, cg) - self.funType = 'size_t ' - self.callType = 'return ' - self.breakStatement = '' - - def emit_initialization(self): - for state in self.cg.map.iterkeys(): - self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n") - self.emit(self.funType + 'accept(char* s);\n') - self.emit(self.funType + 'reject(char* s);\n') - - def emit_accept_state(self): - self.emit (""" -%saccept(char* s) { -\treturn 1; -}\n""" % self.funType) - - def emit_reject_state(self): - self.emit (""" -%sreject(char* s) { -\treturn 0; -}\n""" % self.funType) - - def emit_driver(self): - self.emit(""" -/* This DFA accept only \'%s\'*/ -%sDFA(char *s) { - char *begin = s; - do { - if (%s(s)) { //(matchhere(regexp+1, text)) - return (char const *) s - begin; - } - } while (*s != '\\n' && *s++ != '\\0'); - return (size_t) -1; -}\n\n""" % (self.regexp, self.funType, self.modify_state_name(self.cg.start))) - - def emit_state(self, cur_state, transition): - self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n") - if cur_state in self.cg.accepts: - self.emit("\treturn accept(s);\n") - else: - if transition: - if self.cg.type == "DFA": - self.emit_switch(transition, default="reject") - else: - self.emit_switch(transition) - self.emit("}\n\n") - -def test(): - import doctest - doctest.testmod() - -if __name__ == '__main__': test()
--- a/src/dfareg.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,443 +0,0 @@ -#!/Usr/bin/env python - -import copy -import re -from optparse import OptionParser - -class NondeterministicFiniteAutomaton(object): - def __init__(self, transition, start, accepts, map_): - self.transition = transition - self.start = start - self.accepts = accepts - self.map = map_ - - def epsilon_expand(self, set_): - que = set(set_) - done = set() - while que: - stat = que.pop() - nexts = self.transition(stat, "") - done.add(stat) - for nextStat in nexts: - if not nextStat in done: que.add(nextStat) - return frozenset(done) - -class DeterministicFiniteAutomaton(object): - def __init__(self, transition, start, accepts, map_): - self.transition = transition - self.start = start - self.accepts = accepts - self.map = map_ - def get_runtime(self): - return DFARuntime(self) - -class DFARuntime(object): - def __init__(self, DFA): - self.DFA = DFA - self.curState = self.DFA.start - - def do_transition(self, char): - self.curState = self.DFA.transition( - self.curState, char) - - def is_accept_state(self): - return self.curState in self.DFA.accepts - - def does_accept(self, input): - for alphabet in input: - self.do_transition(alphabet) - return self.is_accept_state() - -class Token(object): - CHARACTER = 0 - OPE_UNION = 1 - OPE_STAR = 2 - LPAREN = 3 - RPAREN = 4 - EOF = 5 - - def __init__(self, value, kind): - self.value = value - self.kind = kind - - -class Lexer(object): - def __init__(self, str): - self.stringList = list(str) - - def scan(self): - if not self.stringList: - return Token(None, Token.EOF) - - ch = self.stringList.pop(0) - if ch == '\\': - return Token(self.stringList.pop(0), Token.CHARACTER) - elif ch == '|': - return Token(ch, Token.OPE_UNION) - elif ch == '(': - return Token(ch, Token.LPAREN) - elif ch == ')': - return Token(ch, Token.RPAREN) - elif ch == '*': - return Token(ch, Token.OPE_STAR) - else: - return Token(ch, Token.CHARACTER) - -"""Context-free Grammer: - (A) expression -> subexpr EOF - (B) subexpr -> seq '|' subexpr | seq - (C) seq -> subseq | '' - (D) subseq -> star subseq | star - (E) star -> factor '*' | factor - (F) factor -> '(' subexpr ')' | CHARACTER -""" - -class Parser(object): - def __init__(self, lexer): - self.lexer = lexer - self.look = None - self.move() - - def match(self, tag): - if self.look.kind != tag: - raise Exeption("syntax error") - self.move() - - def move(self): - self.look = self.lexer.scan() - - def factor(self): - if self.look.kind == Token.LPAREN: - # factor -> '(' subexpr ')' - self.match(Token.LPAREN) - node = self.subexpr() - self.match(Token.RPAREN) - return node - else: - # factor -> CHARACTER - node = Character(self.look.value) - self.match(Token.CHARACTER) - return node - - def star(self): - # star -> factor '*' | factor - node = self.factor() - if self.look.kind == Token.OPE_STAR: - self.match(Token.OPE_STAR) - node = Star(node) - return node - - def seq(self): - if self.look.kind == Token.LPAREN \ - or self.look.kind == Token.CHARACTER: - # seq -> subseq - return self.subseq() - else: - # seq -> '' - return Character("") - - def subseq(self): - node1 = self.star() - if self.look.kind == Token.LPAREN \ - or self.look.kind == Token.CHARACTER: - # subseq -> star subseq - node2 = self.subseq() - node = Concat(node1, node2) - return node - else: - # subseq -> star - return node1 - - def subexpr(self): - node1 = self.seq() - if self.look.kind == Token.OPE_UNION: - self.match(Token.OPE_UNION) - # subexpr -> seq '|' subexpr - node2 = self.subexpr() - node = Union(node1, node2) - return node - else: - # subexpr -> seq - return node1 - - def expression(self): - # expression -> subexpr EOF - node = self.subexpr() - self.match(Token.EOF) - - # Create TREE, NFA - context = Context() - fragment = node.assemble(context) - return fragment.build() - -class Context(object): - def __init__(self): - self.stateCount = 0 - - def new_state(self): - self.stateCount += 1 - return self.stateCount - -class NFAFragment(object): - def __init__(self): - self.start = None # integer - self.accepts = None # frozenset - self.map = dict() # transition - - def connect(self, from_, char, to): - slot = self.map.setdefault((from_, char), set()) - slot.add(to) - - def new_skelton(self): - # copy fragment (self), and it return - newFrag = NFAFragment(); - newFrag.map = copy.deepcopy(self.map) - return newFrag - - def __or__(self, frag): - newFrag = self.new_skelton() - for k, v in frag.map.iteritems(): - newFrag.map[k] = v.copy() - return newFrag - - def build(self): - map_ = self.map - def transition(state, char): - return frozenset(map_.get((state, char), [])) - - return NondeterministicFiniteAutomaton( - transition, - self.start, - self.accepts, - self.map - ) - -class Character(object): - def __init__(self, char): - self.char = char - - def assemble(self, context): - frag = NFAFragment() - s1 = context.new_state() - s2 = context.new_state() - frag.connect(s1, self.char, s2) - frag.start = s1 - frag.accepts = frozenset([s2]) - return frag - -class Union(object): - def __init__(self, op1, op2): - self.op1 = op1 - self.op2 = op2 - - def assemble(self, context): - frag1 = self.op1.assemble(context) - frag2 = self.op2.assemble(context) - frag = frag1 | frag2 - s = context.new_state() - frag.connect(s, "", frag1.start) - frag.connect(s, "", frag2.start) - frag.start = s - frag.accepts = frag1.accepts | frag2.accepts - return frag - -class Concat(object): - def __init__(self, op1, op2): - self.op1 = op1 - self.op2 = op2 - - def assemble(self, context): - frag1 = self.op1.assemble(context) - frag2 = self.op2.assemble(context) - frag = frag1 | frag2 - - for state in frag1.accepts: - frag.connect(state, "", frag2.start) - - frag.start = frag1.start - frag.accepts = frag2.accepts - return frag - -class Star(object): - def __init__(self, op): - self.op = op - - def assemble(self, context): - fragOrig = self.op.assemble(context) - frag = fragOrig.new_skelton() - - for state in fragOrig.accepts: - frag.connect(state, "", fragOrig.start) - - s = context.new_state() - frag.connect(s, "", fragOrig.start) - frag.start = s - frag.accepts = fragOrig.accepts | frozenset([s]) - return frag - -class NonDisjoinSets(object): - def __init__(self, sub): - self.sub = sub - def __contains__(self, a_set): - return a_set & self.sub - -def nfa2dfa(nfa): - def transition(set_, alpha): - ret = set() - for elem in set_: - ret |= nfa.transition(elem, alpha) - return nfa.epsilon_expand(frozenset(ret)) - - def state_set_transition(stateSet, char): - trans = set() - for state in stateSet: - t = nfa.map.setdefault((state, char), None) - if not t == None: trans.update(nfa.epsilon_expand(t)) - return frozenset(trans) - - map_ = dict() - que = set(frozenset([nfa.epsilon_expand(set([nfa.start]))])) - done = set() - - while que: - stateSet = que.pop() - - for state in stateSet: - for k, v in nfa.map.iteritems(): - if state == k[0] and k[1] != '': - slot = map_.setdefault((stateSet, k[1]), set()) - slot.update(nfa.epsilon_expand(v)) - - done.add(stateSet) - - for v in map_.itervalues(): - if not v in done: - que.add(frozenset(v)) - - return DeterministicFiniteAutomaton( - transition, - nfa.epsilon_expand(frozenset([nfa.start])), - NonDisjoinSets(nfa.accepts), - map_ - ) - -# create call graph (as dictionary) -class CallGraph(object): - """CallGraph is State Transition Diagram. - it's can be create from DFA or DFA. - >>> reg = Regexp(\"AA*|B\") - >>> dfacg = CallGraph(reg.dfa) - >>> nfacg = CallGraph(reg.nfa) - >>> dfacg.map - {'1_6_8': {'A': ['2_3_5'], 'B': ['7']}, '2_3_5': {'A': ['3_4']}, '7': {}, '3_4': {'A': ['3_4']}} - >>> dfacg.states - set(['1_6_8', '2_3_5', '7', '3_4']) - >>> dfacg.accepts - set(['2_3_5', '7', '3_4']) - >>> nfacg.map - {'1': {'A': ['2']}, '3': {'A': ['4']}, '2': {'': ['5']}, '5': {'': ['3']}, '4': {'': ['3']}, '7': {}, '6': {'B': ['7']}, '8': {'': ['1', '6']}} - >>> nfacg.states - set(['1', '3', '2', '5', '4', '7', '6', '8']) - >>> nfacg.accepts - set(['5', '4', '7']) - """ - - def __init__(self, fa): - self.fa = fa - if isinstance(fa, DeterministicFiniteAutomaton): - self.type = "DFA" - self.start = self.set2name(fa.start) - else: - self.type = "NFA" - self.start = str(fa.start) - self.inputs = set() - self.states = set() - (self.map, self.accepts) = self.create_call_graph(self.fa) - - # return state name from set - def set2name(self, set_): - l = list(set_) - l.sort() - return ('_'.join(str(x) for x in l)) - - def create_call_graph(self, fa): - transitionCases = dict() - if self.type == "DFA": - transitionCases[fa.start] = set() - else: - transitionCases[frozenset([fa.start])] = set() - - # transitionCases is hash table that binds a state and inputs, - # it's useful for transition definition - - # fa is dfa or nfa - # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....} - # : frozenset(1, 15) x 'd' -> frozenset([16]) - # nfa.map => {(6, ''): set([5, 7]), (1, 'A'): set([2]), ....} - # : 6 x '' -> set([5, 7]) - - for nextState in fa.map.itervalues(): - if self.type == "DFA": - transitionCases[frozenset(nextState)] = set() - else: - for nextState_ in nextState: - transitionCases[frozenset([nextState_])] = set() - - for (state, input) in fa.map.iterkeys(): - if self.type == "NFA": - state = [state] - slot = transitionCases.setdefault(frozenset(state), set()) - slot.add(input) - - callGraph = dict() - accepts = set() - - # create CallGraph - for state in transitionCases.iterkeys(): - callGraph[self.set2name(state)] = set() - - for (state, input) in transitionCases.iteritems(): - caseMap = dict() - self.states.add(self.set2name(state)) - if self.type == "DFA": - for case in input: - self.inputs.add(case) - caseMap[case] = [self.set2name(fa.map[frozenset(state), case])] - if state in fa.accepts: - accepts.add(self.set2name(state)) - else: - self.states.add(str(list(state)[0])) - for case in input: - self.inputs.add(case) - caseMap[case] = [str(next_) for next_ in fa.map[list(state)[0], case]] - if list(state)[0] in fa.accepts: - accepts.add(self.set2name(state)) - callGraph[self.set2name(state)] = caseMap - return (callGraph, accepts) - - -class Regexp(object): - def __init__(self, regexp): - self.regexp = regexp - self.nfa = None - self.dfa = None - self._compile() - - def _compile(self): - lexer_ = Lexer(self.regexp) - parser_ = Parser(lexer_) - self.nfa = parser_.expression() - self.dfa = nfa2dfa(self.nfa) - - def matches(self, string): - runtime = self.dfa.get_runtime() - return runtime.does_accept(string) - -def compile(regexp): - return Regexp(regexp) - -def test(): - import doctest - doctest.testmod() - -if __name__ == '__main__' : test()
--- a/src/dot_translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#!/usr/bin/env python - -import re -from translator import Translator -from dfareg import CallGraph, Regexp - -class DotTranslator(Translator): - """ - DotToranslator - This Class can translate from DFA or NFA into Dot - Dot is Graph-generater using TeX. - --code/graph/makepdf.sh is generate graph script. - >>> string = \"(A|B)*C\" - >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> nfacg = CallGraph(reg.nfa) - >>> DotTranslator(string, dfacg).translate() - >>> DotTranslator(string, nfacg).translate() - """ - def __init__(self, regexp, cg): - Translator.__init__(self, regexp, cg) - if self.cg.type == "DFA": - self.name_hash = self.create_name_hash() - - def modify_state_name(self, state_name): - if self.cg.type == "DFA": - return "s"+self.name_hash[state_name] - else: - return "s"+state_name - - def emit_from_callgraph(self): - self.emit(''' -digraph G{ -\td2tdocpreamble = "\\usetikzlibrary{automata}"; -\td2tfigpreamble = "\\tikzstyle{every state}= \\ -\t[draw=blue!50,very thick,shape=circle, fill=blue!20]"; -\tnode [style="state"]; -\tedge [lblstyle="fill=blue!20", style="arrows=->", topath="bend left"]; -''') - - self.emit("\t%s [style=\"state, initial\"]\n" % (self.modify_state_name(self.cg.start))) - for accept in self.cg.accepts: - self.emit("\t%s [style=\"state, accepting\"]\n" % (self.modify_state_name(accept))) - - for cur_state, trans in self.cg.map.iteritems(): - for input, next_states in trans.iteritems(): - if input == "" : input = "$\\varepsilon$" - for next_state in next_states: - self.emit("\t%s -> %s [texlbl=\"%s\"]\n" - % (self.modify_state_name(cur_state), self.modify_state_name(next_state), input)) - - self.emit("}") - - -def test(): - import doctest - doctest.testmod() - ''' - reg = Regexp("(A|B)*C") - ct = CTranslator(reg.regexp, CallGraph(reg.dfa)) - ct.translate() - ''' - -if __name__ == '__main__' : test()
--- a/src/grep_bench.sh Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -#!/bin/sh - -egrepout="/tmp/egrep.out" -jitgrepout="/tmp/jitgrep.out" -agrepout="/tmp/agrep.out" -cgrepout="/tmp/cgrep.out" - -echo "[jitgrep]" -time ./jitgrep.py $@ > $jitgrepout -#time /tmp/jitgrep $@ > $jitgrepout - -echo "\n[cgrep]" -time cgrep -E $@ > $cgrepout - -echo "\n[egrep]" -time egrep $@ > $egrepout - -echo "\n[agrep]" -time agrep $@ > $agrepout - -echo "\n[diff egrep jitgrep]" -diff $egrepout $jitgrepout - -echo "[diff cgrep jitgrep]" -diff $cgrepout $jitgrepout - -echo "[diff agrep jitgrep]" -diff $agrepout $jitgrepout - -#rm -f $egrepout $jitgrepout $agrepout $cgrepout
--- a/src/grep_translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -#!/usr/bin/env python - -from c_translator import CTranslator -from dfareg import Regexp, CallGraph - -class GREPTranslateExeption(Exception): - pass - -class GREPTranslator(CTranslator): - """GREPTranslator - This Class can translate form DFA into grep source-code. - which based on (beautiful) mini-grep introduced \"The Practice of Programming\" - written by Rob Pike & Brian W. Kernighan. (see template/grep.c) - >>> string = \"(build|fndecl|gcc)\" - >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> tje = GREPTranslator(string, dfacg) - >>> tje.translate() - """ - - def __init__(self, regexp, cg): - if cg.type == "NFA": raise GREPTranslateExeption("can't translate grep from NFA") - CTranslator.__init__(self, regexp, cg) - self.funType = 'int ' - self.callType = 'return ' - self.breakStatement = '' - - def emit_accept_state(self): - self.emit (""" -%saccept(char* s) { -\treturn 1; -}\n""" % self.funType) - - def emit_reject_state(self): - self.emit (""" -%sreject(char* s) { -\treturn 0; -}\n""" % self.funType) - - def emit_initialization(self): - self.emit("%sFILTER(char* s);\n" % (self.funType)) - self.emit("%sDFA(char* s);\n" % (self.funType)) - self.emit("extern int bmhfilter(char* text, char *key);\n") - self.emit("extern int grepmain(int argc, char* argv[]);\n") - for state in self.cg.map.iterkeys(): - self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n") - self.emit(self.funType + 'accept(char* s);\n') - self.emit(self.funType + 'reject(char* s);\n') - - def emit_filter(self): - self.emit(""" -int FILTER(char *text) { -""") - self.emit("\treturn 1;\n}\n") - - def emit_driver(self): - self.emit(""" -int main(int argc, char* argv[]) { -\treturn grepmain(argc, argv); -} -""") - self.emit_filter() - self.emit(""" -%sDFA(char *s) { - return %s(s); -}\n\n""" % (self.funType, self.modify_state_name(self.cg.start))) - - def emit_state(self, cur_state, transition): - self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n") - if cur_state in self.cg.accepts: - self.emit("\treturn accept(s);\n") - else: - if transition: - if self.cg.type == "DFA": - self.emit_switch(transition, default="reject") - else: - self.emit_switch(transition) - self.emit("}\n\n") - -def test(): - import doctest - doctest.testmod() - -if __name__ == '__main__': test()
--- a/src/jitgrep.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,110 +0,0 @@ -#!/usr/bin/env python - -import sys -import os -import re -import time -from optparse import OptionParser -from grep_translator import GREPTranslator -from dfareg import Regexp, CallGraph - -def main(argv): - myusage = "%prog [--time] [--debug] [--cc=compiler] [-Olevel] regexp [file ..]" - psr = OptionParser(usage=myusage) - - redirect = "" - optimize = "-O3" - srcpath = "/tmp/jitgrep_dfa.c" - binpath = "/tmp/jitgrep" - libgrep = "template/libgrep.so" - - argv_iter = argv - for args in argv_iter: - matchp = re.match("^-O[0-9]?$", args) - if matchp: - optimize = matchp.group(0) - argv.remove(optimize) - - psr.add_option("--cc", action="store", type="string", dest="compiler", default="gcc", metavar="FILE", - help="Choose compiler (default is gcc).") - psr.add_option("--CFLAGS", action="store", type="string", dest="cflags", default="-O3", help="Print compile/matching time.") - psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.") - psr.add_option("--debug", action="store_true", dest="debug", default=False, help="Dump commands, not evalute matching (except interactive mode).") - psr.add_option("--out", action="store", type="string", dest="out", default="", metavar="FILE", help="Output file.") - psr.add_option("-O", action="store_true", dest="_", help="Optimizing compilation takes somewhat more time, level be recognized a digits [0-9]. (default: -O3)") - - (opts, args) = psr.parse_args(argv) - libgrep = ".".join([libgrep, opts.compiler]) - - if len(args) < 2: - psr.print_usage() - return - - if opts.compiler == "cbc": - cbc = True - opts.compiler = "gcc" - else: - cbc = False - - if opts.debug: print("option", opts) - if opts.debug: print("args", args) - - if (opts.time) : start_time = time.time() - string = args[1] - reg = Regexp(string) - dfacg = CallGraph(reg.dfa) - grept = GREPTranslator(string, dfacg) - - tmpsrc = open(srcpath, 'w') - grept.translate(tmpsrc) - tmpsrc.close() - if (opts.time): - end_time = time.time() - print("Translation: " + str(end_time - start_time) + " Sec.") - - if not os.path.exists(libgrep): - cmd = " ".join([opts.compiler, optimize, opts.cflags, "-c -fPIC -shared template/grep.c -o", libgrep]) - if opts.debug: - print cmd - else: - os.system(cmd) - - cmd = " ".join([opts.compiler, optimize, srcpath, libgrep, "-o", binpath]) - if opts.debug: - print("compile command", cmd) - else: - if (opts.time): start_time = time.time() - os.system(cmd) - if (opts.time): - end_time = time.time() - print("Compiling : " + str(end_time - start_time) + " Sec.") - - if opts.debug: - print("argv=" + argv) - print("args=" + args) - print("opts=" + opts) - - if len(args) == 2 and not opts.debug: - while True: - try: - os.system(binpath + ' ' + raw_input()) - except (KeyboardInterrupt, EOFError): - break - else: - if (opts.time): redirect = "> /dev/null" - if (opts.out): redirect = ">" + opts.out - cmd = ' '.join([binpath, "dummy"] + args[2:] + [redirect]) - if opts.debug: - print("exec command", cmd) - else: - if (opts.time): start_time = time.time() - os.system(cmd) - if (opts.time): - end_time = time.time() - print("Matching : " + str(end_time - start_time) + " Sec.") - - if not opts.debug: - #os.remove(srcpath) - os.remove(binpath) - -if __name__ == '__main__': main(sys.argv)
--- a/src/llvm_bench.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -import sys,re -from optparse import OptionParser -from timeit import Timer -from dfareg import Regexp -from dfareg import Regexp, CallGraph -from llvm_translator import LLVMTranslator - -myusage = "%prog [-O] [-p] [-s string] [-r regexp] [-E]" -psr = OptionParser(usage=myusage) -psr.add_option("-O", action="store_true", dest="optimize", default=False, help="optimimzation") -psr.add_option("-L", action="store_true", dest="label", default=False, help="impliment with label") -psr.add_option("-E", action="store_true", dest="execute", default=False, help="execute Module") -psr.add_option("-p", action="store_true", dest="print_", default=False, help="print module") -psr.add_option("-g", action="store_true", dest="debug", default=False, help="add debug stmt to module") -psr.add_option("-n", action="store", type="int", dest="number", default=1000, help="number of eval", metavar="INT") -psr.add_option("-r", action="store", type="string", dest="regexp", default="(A|B)*C", help="regexp", metavar="FILE") -psr.add_option("-s", action="store", type="string", dest="string", default="A"+"B"*500+"C", help="string", metavar="FILE") -(opts, args) = psr.parse_args(sys.argv) - -print "regexp = \"%s\", string = \"%s\", loop = %d" % (opts.regexp, opts.string, opts.number) - -reg = Regexp(opts.regexp) -dfacg = CallGraph(reg.dfa) -llvmt = LLVMTranslator(reg.regexp, dfacg) -llvmt.string = opts.string -llvmt.debug = opts.debug -llvmt.optimize = opts.optimize - -if opts.print_: llvmt.translate() - -if opts.execute: - print "LLVM :", Timer(setup='from __main__ import llvmt', stmt='llvmt.execute()').timeit(opts.number) - - reg = re.compile("^%s$" % opts.regexp) - print "re :", Timer(setup='from __main__ import reg', stmt='reg.search("%s")' % opts.string).timeit(opts.number) - - reg = Regexp(opts.regexp) - print "dfareg:", Timer(setup='from __main__ import reg', stmt='reg.matches("%s")' % opts.string).timeit(opts.number) -
--- a/src/llvm_translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,242 +0,0 @@ -#!/usr/bin/env python - -from llvm.core import * -from llvm.passes import * -from llvm.ee import * -from translator import Translator -from dfareg import Regexp, CallGraph - -class LLVMTranslator(Translator): - """LLVMTranslator - This Class can translate from DFA or NFA into LLVM-IR. - and also can JIT-Compile/evaluate it's self using llvm-py. - >>> string = '(A|B)*C' - >>> reg = Regexp(string) - >>> dfacg = CallGraph(reg.dfa) - >>> lt = LLVMTranslator(string, dfacg) - >>> lt.debug = True - >>> lt.translate() - >>> isinstance(lt.execute(), llvm.ee.GenericValue) - True - """ - # define llvm core types, and const - int_t = Type.int() - char_t = Type.int(8) - charptr_t = Type.pointer(char_t) - charptrptr_t = Type.pointer(charptr_t) - const_zero = Constant.int(int_t, 0) - const_one = Constant.int(int_t, 1) - llvm.GuaranteedTailCallOpt = True - - def __init__(self, regexp, cg): #(self, modName, regexp, string, self.impl_label, optimized, debug): - Translator.__init__(self, regexp, cg) - self.optimize = False - self.debug = False - self.impl_label = False - self.string = "ABC" - if self.cg.type == "DFA": - self.name_hash = self.create_name_hash() - self.compiled = False - - def modify_state_name(self, state_name): - if self.cg.type == "DFA": - return self.name_hash[state_name] - else: - return state_name - - def _compile(self): - self.llvm_module = Module.new(self.cg.type) - self.matchp_str = self.new_str_const(self.string) - self.debug_str = self.new_str_const("state: %s, arg: %c(int %d)\n") - def optional_func_decl(fun): - fun.calling_convertion = CC_X86_FASTCALL - fun.args[0].name = "index" - - def func_decl(state): - optional_func_decl(state) - - state_ref = dict() - main = self.llvm_module.add_function( - Type.function(self.int_t, (self.int_t,)), "main") - optional_func_decl(main) - main_entry = main.append_basic_block("entry") - - if self.impl_label: - accept_state = main.append_basic_block("accpet") - reject_state = main.append_basic_block("reject") - index_ptr = Builder.new(main_entry).malloc(self.int_t) - Builder.new(accept_state).free(index_ptr) - Builder.new(reject_state).free(index_ptr) - else: - # Create function - accept and reject (final state). - accept_state = self.llvm_module.add_function( - Type.function(self.int_t, (self.int_t,)), "accept") - optional_func_decl(accept_state) - reject_state = self.llvm_module.add_function( - Type.function(self.int_t, (self.int_t,)), "reject") - optional_func_decl(reject_state) - - - state_ref["accept"] = accept_state - state_ref["reject"] = reject_state - - # add state to module, (as function or label). - if (self.impl_label): - for state in self.cg.map.iterkeys(): - label = main.append_basic_block(state) - state_ref[state] = label - else: - for state in self.cg.map.iterkeys(): - fun = self.llvm_module.add_function( - Type.function(self.int_t, (self.int_t,)), state) - optional_func_decl(fun) - state_ref[state] = fun - - # emit instructions - if (self.impl_label): emit = Builder.new(accept_state) - else: emit = Builder.new(accept_state.append_basic_block("entry")) - if self.debug: self.emit_call_printf(emit, "%s does match regexp\n", self.gep_first(emit, self.matchp_str)) - emit.ret(self.const_one) - - if (self.impl_label): emit = Builder.new(reject_state) - else: emit = Builder.new(reject_state.append_basic_block("entry")) - if self.debug: self.emit_call_printf(emit, "%s does not match regexp\n", self.gep_first(emit, self.matchp_str)) - emit.ret(self.const_zero) - - if (self.impl_label): - # emit transition instruction with jump instruction - emit = Builder.new(main_entry) - emit.store(main.args[0], index_ptr) - emit.branch(state_ref[self.cg.start]) - - for state, transition in self.cg.map.iteritems(): - emit = Builder.new(state_ref[state]) - index = emit.load(index_ptr) - ptr = emit.gep(self.matchp_str, (self.const_zero, index)) - emit.store(emit.add(self.const_one, index), index_ptr) - char = emit.load(ptr) - si = emit.switch(char, state_ref['reject'], len(transition)) - local_label = 0 - if state in self.cg.accepts: - transition['\\0'] = "accept" - for case, next_states in transition.iteritems(): - bb = main.append_basic_block("%s_case%d" % (state, local_label)) #create default bb - emit = Builder.new(bb) - emit.branch(state_ref[next_states[0]]) - si.add_case(self.char_const(case), bb) - local_label += 1 - else: - for state, transition in self.cg.map.iteritems(): - cases = dict() - if state in self.cg.accepts: - transition['\\0'] = ["accept"] - for case, next_states in transition.iteritems(): - cases[self.char_const(case)] = state_ref[next_states[0]] - state_fun = state_ref[state] - emit = Builder.new(state_fun.append_basic_block("entry")) - ptr = emit.gep(self.matchp_str, (self.const_zero, state_fun.args[0])) - next_index = emit.add(state_fun.args[0], self.const_one) - char = emit.load(ptr) - - if (self.debug): self.emit_call_printf(emit, self.debug_str, self.gep_first(emit, self.new_str_const(fun.name)), char, char) - - label = 0 - default_bb = state_fun.append_basic_block("default") #create default bb - builder = Builder.new(default_bb) # default is reject. - ret = builder.call(reject_state, (next_index,)) - builder.ret(ret) - - si = emit.switch(char, default_bb, len(cases)) # create switch instruction with deafult case. - for case, nextFun in cases.iteritems(): - bb = state_fun.append_basic_block("case%d" % label) #create default bb - builder = Builder.new(bb) - ret = builder.call(nextFun, (next_index,)) - builder.ret(ret) - si.add_case(case, bb) - label += 1 - emit = Builder.new(main_entry) - ret = emit.call(state_ref[self.cg.start], (main.args[0],)) - emit.ret(ret) - - self.mp = ModuleProvider.new(self.llvm_module) - if (self.optimize): self.do_optimize() - self.ee = ExecutionEngine.new(self.mp) - self.main = main - self.compiled = True - - def emit_from_callgraph(self): - if not self.compiled: - self._compile() - self.emit(str(self.llvm_module)) - - def get_execution_engine(self): - if not self.compiled: - self._compile() - return self.ee - - def do_optimize(self): - #optimization passes - pm = PassManager.new() - pm.add(TargetData.new('')) - pm.add(PASS_FUNCTION_INLINING) - pm.run(self.llvm_module) - fp = FunctionPassManager.new(self.mp) - fp.add(TargetData.new('')) - fp.add(PASS_BLOCK_PLACEMENT) - fp.add(PASS_INSTRUCTION_COMBINING) - fp.add(PASS_TAIL_CALL_ELIMINATION) - fp.add(PASS_AGGRESSIVE_DCE) - fp.add(PASS_DEAD_INST_ELIMINATION) - fp.add(PASS_DEAD_CODE_ELIMINATION) - for fun in self.llvm_module.functions: - fp.run(fun) - - def print_module(self): - if not self.compiled: - self._compile() - print self.llvm_module - - def execute(self): - if not self.compiled: - self._compile() - return self.ee.run_function(self.main, - (GenericValue.int(self.int_t, 0),)) - - def new_str_const(self, val): - '''create string(array of int) as a global value ''' - str = self.llvm_module.add_global_variable(Type.array(self.char_t, len(val) + 1), "") - str.initializer = Constant.stringz(val) - return str - - def gep_first(self, emit, val): - '''get pointer of array''' - return emit.gep(val, (self.const_zero, self.const_zero)) - - def char_const(self, val): - '''create constant int value''' - if isinstance(val, str): - if val == '\\0': - return Constant.int(self.char_t, 0) - else: - return Constant.int(self.char_t, ord(val)) - else: - exit('char_const: invalid argument.', val) - - def emit_call_printf(self, emit, string, *args): - '''emit libc printf function call instruction''' - try: - printf = self.llvm_module.get_function_named("printf") - except llvm.LLVMException: - printf = self.llvm_module.add_function( - Type.function(Type.void(), - (Type.pointer(self.char_t, 0),), 1), "printf") - if isinstance(string, str): - string = self.new_str_const(string) - emit.call(printf, - [self.gep_first(emit, string)]+list(args)) - -def test(): - import doctest - doctest.testmod() - -if __name__ == "__main__": test()
--- a/src/template/grep.c Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ -/* Excerpted from 'The Practice of Programming' */ -/* by Brian W. Kernighan and Rob Pike */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#define BUFSIZE 1024 - -extern int DFA(char *text); -extern int FILTER(char *text); -int bmhfilter(char *text, char *key); - -int match(char *regexp, char *text) { - if (regexp[0] == '^') - return DFA(text); - do { - if (FILTER(text) && DFA(text)) - return 1; - } while (*text++ != '\0'); - return 0; -} - -int bmhfilter(char *text, char *key) { - int i, j; - i = j = 0; - while (text[i] != '\0' && key[j] != '\0') { - if (text[i] == key[j]) { ++i; ++j; } - else { i = i - j + 1; j = 0; } - } - if (key[j] == '\0') - return 1; /* position = k - j; */ - return 0; -} - -int grep(char * regexp, FILE *f, char *name) { - int n, nmatch; - char buf[BUFSIZE]; - nmatch = 0; - while (fgets(buf, sizeof buf, f) != NULL) { - n = strlen(buf); - if (n > 0 && buf[n-1] == '\n') - buf[n-1] = '\0'; - if (match(regexp, buf)) { - nmatch++; - if (name != NULL) - printf("%s:", name); - printf("%s\n", buf); - } - } - return nmatch; -} - -int grepmain(int argc, char* argv[]) { - int i, nmatch; - FILE *f; - - if (argc < 2) { - fprintf(stderr, "usage: grep regexp [file ...]"); - exit(0); - } - nmatch = 0; - if (argc == 2) { - if (grep(argv[1], stdin, NULL)) - nmatch; - } else { - for (i = 2; i < argc; i++) { - f = fopen(argv[i], "r"); - if (f == NULL) { - fprintf(stderr, "can't open %s:", argv[i]); - continue; - } - if (grep(argv[1], f, argc > 3 ? argv[i] : NULL) > 0) - nmatch++; - fclose(f); - } - } - - return nmatch; -}
--- a/src/translator.py Fri Jul 09 14:08:20 2010 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -#!/usr/bin/env python - -import sys - -class Translator(object): - def __init__(self, regexp, cg): - self.regexp = regexp - self.cg = cg - self.stream = None - - def emit(self, string): - self.stream.write(string) - - def create_name_hash(self): - name_hash = dict() - states = list(self.cg.states) - for index in range(len(states)): - name_hash[states[index]] = str(index) - return name_hash - - def modify_state_name(self, state_name): - return state_name - - def emit_from_callgraph(self): - pass - - def translate(self, stream=sys.stdout): - self.stream = stream - self.emit_from_callgraph()