# HG changeset patch # User Ryoma SHINYA # Date 1289688995 -32400 # Node ID dafb393108f31fb1c62c30f0393a26ff0feb626d # Parent d23f12ce0369a46ae2885e21129e0efc42a2e376 impliment goto-based grep. (icc's tailcall-optimization is suck!) diff -r d23f12ce0369 -r dafb393108f3 pyrect/jitgrep.py --- a/pyrect/jitgrep.py Sun Nov 14 04:16:12 2010 +0900 +++ b/pyrect/jitgrep.py Sun Nov 14 07:56:35 2010 +0900 @@ -13,7 +13,7 @@ [--time] [--debug] [--cc=compiler] [-c] [-Olevel] regexp [file..] [--out=file] [--thread=thread_num] [--filter=algorithm] - [--disable-booster]""" + [--disable-booster] [--functional-transition]""" psr = OptionParser(usage=myusage) redirect = "" @@ -72,10 +72,11 @@ if cbc: grept = CbCGREPTranslator(reg) - elif opts.label: - grept = GoToGREPTranslator(reg) else: - grept = GREPTranslator(reg) + if opts.label: + grept = GOTOGREPTranslator(reg) + else: + grept = GREPTranslator(reg) if opts.filter: grept.filter = opts.filter grept.skip_boost = not opts.no_boost grept.table_lookup = opts.table_lookup diff -r d23f12ce0369 -r dafb393108f3 pyrect/translator/__init__.py --- a/pyrect/translator/__init__.py Sun Nov 14 04:16:12 2010 +0900 +++ b/pyrect/translator/__init__.py Sun Nov 14 07:56:35 2010 +0900 @@ -5,7 +5,7 @@ from cbc_translator import CbCTranslator as CbCTranslator from dot_translator import DotTranslator as DotTranslator from grep_translator import GREPTranslator as GREPTranslator -from goto_grep_translator import GoToGREPTranslator as GoToGREPTranslator +from goto_grep_translator import GOTOGREPTranslator as GOTOGREPTranslator from dfa_translator import GNUGREPTranslator as GNUGREPTranslator from cbc_grep_translator import CbCGREPTranslator as CbCGREPTranslator diff -r d23f12ce0369 -r dafb393108f3 pyrect/translator/goto_grep_translator.py --- a/pyrect/translator/goto_grep_translator.py Sun Nov 14 04:16:12 2010 +0900 +++ b/pyrect/translator/goto_grep_translator.py Sun Nov 14 07:56:35 2010 +0900 @@ -1,115 +1,351 @@ #!/usr/bin/env python +import os from c_translator import CTranslator from pyrect.regexp import Regexp +from pyrect.regexp.ast import ASTWalker, AnyChar, Character -class GoToGREPTranslator(CTranslator): - """GoToGREPTranslator +class GREPTranslateExeption(Exception): + pass + +class GOTOGREPTranslator(CTranslator): + """GREPTranslator This Class can translate form DFA into grep source-code. which based on (beautiful) mini-grep introduced \"The Practice of Programming\" - written by Rob Pike & Brian W. Kernighan. (see template/grep.label) - >>> string = \"(build|fndecl|gcc)\" + written by Rob Pike & Brian W. Kernighan. (see template/grep.c) + >>> string = \"(def)\" >>> reg = Regexp(string) - >>> tje = GoToGREPTranslator(reg) + >>> tje = GREPLABELTranslator(reg) >>> tje.translate() """ + BASE_DIR = os.path.dirname(os.path.abspath(__file__)) + def __init__(self, regexp): - CTranslator.__init__(self, regexp) - self.funType = 'void ' - self.callType = 'return ' - self.breakStatement = '' - self.begline = False - self.__bufsize = 1024 + CTranslator.__init__(self, regexp, fa="DFA") + self.__bufsize = 1024 * 1024 + self.thread_dfa = 1 + self.thread_line = 1 + self.filter = "quick" + self.filter_only = False + self.filter_prefix = False + self.filter_key = "" + self.skip_boost = True + self.table_lookup = False + self.start = self.cg.start + self.interface = "UCHARP beg, UCHARP buf, UCHARP end" + self.args = "beg, buf, end" def getbufsize(self,): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize) - bufsize = property(getbufsize, setbufsize) - - def emit_accept_state(self): - self.emit (""" -\taccept: -\t\tprintf(\"%s\\n\", buf); -\t\treturn; -""") - - def emit_reject_state(self): - self.emit("\treject:\n") - if not self.begline: - self.emit(""" -\t\tif (*cur++ == '\\0') -\t\t\treturn; -\t\ts = cur; -\t\tgoto %s; -""" % self.state_name(self.cg.start)) - self.emit("return;\n") - - - def emit_initialization(self): - self.emit("#include \n") - self.emit("#include \n") - self.emit("#include \n\n") - - self.emit("#define LINEBUFSIZE 1024\n") - self.emit("#define READBUFSIZE %d\n\n" % (self.bufsize)) - self.emit("char readbuf[%d];\n\n" % (self.bufsize)) + bufsize = property(getbufsize, setbufsize) - self.emit("%sDFA(char* s, char *buf, char* filename);\n" % self.funType) - grepsource = open("template/grep.label") - self.emit(grepsource.read()) - - def emit_filter(self): - pass - - def emit_driver(self): - self.emit(""" -int main(int argc, char* argv[]) { -\treturn grepmain(argc, argv); -}\n\n""") + def state_name(self, name): + if name in ["accept", "reject", "matcher", "entry"\ + , "bmh_filter", "quick_filter", "booster"]: + return name + else: + return "state_"+str(name) - def emit_switch(self, case, default=None): - self.emit("\t\tswitch(*s++) {\n") - for input, next_state in case.iteritems(): - if input != '': - self.emit("\t\t\tcase '%s': \n" % (input)) - self.emit("\t\t\t\tgoto %s;\n" % (self.state_name(next_state))) - if self.breakStatement != '': self.emit(self.breakStatement+'\n') - - if default: - self.emit( """\t\t\tdefault:\n\t\t\tgoto %s;\n""" % default) - self.emit("\t\t}\n") - - def emit_state(self, cur_state, transition): - self.emit("\t" + self.state_name(cur_state) + ":\n") - if cur_state in self.cg.accepts: - self.emit("\t\tgoto accept;\n") - else: - if transition: - if self.cg.type == "DFA": - self.emit_switch(transition, default="reject") - else: - self.emit_switch(transition) - self.emit("\n") + def emit_goto(self, state): + self.emit("goto %s;" % self.state_name(state)) def emit_from_callgraph(self): # self.emit C-source code self.emit_initialization() + + self.emiti("void matcher(%s) {" % self.interface) self.emit_driver() - - self.emit("void DFA(char *s, char *buf, char* filename) {\n") - self.emit("\tchar *cur = s;\n") - self.emit("\tgoto %s;\n" % self.state_name(self.cg.start)) - for cur_state, transition in self.cg.map.iteritems(): self.emit_state(cur_state, transition) - self.emit_accept_state() self.emit_reject_state() + self.emit("return;") + self.emitd("}", 2) - self.emit("}\n\n") + def emit_initialization(self): + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ", 2) + + self.emit("typedef unsigned char UCHAR;") + self.emit("typedef unsigned char *UCHARP;") + + self.emit("void matcher(%s);" % self.interface); + self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);") + self.emit("void print_line(UCHARP buf, UCHARP beg);", 2) + + if (self.filter == "bmh" or self.filter == "quick")\ + and self.regexp.must_words: + self.filter_key = max(self.regexp.must_words, key=len) + if len(self.regexp.must_words) == 1 and len(self.filter_key) == self.regexp.min_len: + self.filter_only = True + if self.filter == "bmh": + self.start = "bmh_filter" + else: + self.start = "quick_filter" + else: + self.filter = False + + if self.skip_boost and not self.filter_only and \ + not AnyChar() in self.regexp.chars and \ + self.regexp.min_len > 2: + pass + else: + self.skip_boost = False + + grepsource = open(self.BASE_DIR + "/template/grep.c") + self.emit(grepsource.read()) + + def emit_bmh_filter(self, key): + l = len(key) + + self.emiti("bmh_filter:") + + def emit_next(): + if self.filter_only: + self.emit_goto("accept") + elif self.filter_prefix: + self.emit("buf++;") + self.emit_goto(self.cg.start) + else: + self.emit("beg = get_line_beg(buf, beg);") + self.emit("buf = beg;") + self.emit_goto(self.cg.start) + + l = len(key) + if l == 1: + self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) + self.emit("if (buf == NULL) return;") + emit_next() + self.emitd("", 2) + return + + skip = dict() + for i in range(l-1): + skip[key[i]] = l-1-i + + self.emit("buf += %d;" % (l-1), 2) + + self.emiti("while (buf < end) {") + self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1]))) + self.emit( "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1)) + self.emiti( "while (*(--tmp1) == *(--tmp2)) {") + self.emiti( "if (tmp2 == key) {") + emit_next() + self.emitd( "}") + self.emitd( "}") + self.emitd( "}") + self.emiti( "switch(*buf) {") + for k, v in skip.iteritems(): + self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) + self.emit( "buf += %d; break;" % v), self.dedent() + self.emiti("default: buf += %d;" % l), self.dedent() + self.emitd( "}") + self.emitd("}") + self.emit( "return;") + self.emitd("", 2) + + def emit_quick_filter(self, key): + l = len(key) + + self.emiti("quick_filter:") + + def emit_next(): + if self.filter_only: + self.emit_goto("accept") + elif self.filter_prefix: + self.emit("buf+%d;" % l) + self.emit_goto(self.cg.start) + else: + self.emit("beg = get_line_beg(buf, beg);") + self.emit("buf = beg;") + self.emit_goto(self.cg.start) + + l = len(key) + if l == 1: + self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) + self.emit("if (buf == NULL) return;") + emit_next() + self.emitd("}", 2) + return + + skip = dict() + for i in range(l): + skip[key[i]] = l-i + + self.emit("end_ = end - %d;" % (l-1), 2) + self.emiti("while (buf < end_) {") + self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0]))) + self.emit( "tmp1 = buf, tmp2 = (UCHARP)key;") + self.emiti( "while (*(++tmp1) == *(++tmp2)){") + self.emiti( "if (tmp2 == key+%d) {" % (l-1)) + emit_next() + self.emitd( "}") + self.emitd( "}") + self.emitd( "}") + self.emiti( "switch(buf[%d]) {" % l) + for k, v in skip.iteritems(): + self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) + self.emit( "buf += %d; break;" % v), self.dedent() + self.emiti("default: buf += %d;" % (l+1)), self.dedent() + self.emitd( "}") + self.emitd("}") + self.emit( "return;") + self.emitd("", 2) + + def emit_booster(self, min_len, chars): + self.emiti("booster:") + self.emit( "end_ = end - %d;" % (min_len-1)) + self.emit( "if (buf > end_) return;") + self.emiti( "do {") + self.emiti( "switch (buf[%d]) {" % (min_len-1)) + for c in chars: + self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) + self.emit_goto(self.cg.start) + self.emitd( "}") + self.emitd( "} while((buf += %d) <= end_);" % min_len) + self.emit("return;") + self.emit("", 2) + + def emit_driver(self): + self.emit("UCHARP end_, ret;") + if self.skip_boost: + self.emit_booster(self.regexp.min_len, self.regexp.chars) + if self.filter: + self.emit("UCHARP tmp1, tmp2; static const UCHAR key[] = \"%s\";" % self.filter_key) + self.emit_goto(self.filter + "_filter") + if self.filter == "bmh": + self.emit_bmh_filter(self.filter_key) + else: + self.emit_quick_filter(self.filter_key) + else: + self.emit_goto(self.cg.start) + return + + def emit_accept_state(self): + self.emiti("accept:") + self.emit( "ret = (UCHARP)memchr(buf, '\\n', (buf - end));") + if self.skip_boost or self.filter: + self.emit( "beg = get_line_beg(buf, beg);") + self.emiti( "if (ret == NULL) {") + self.emit( "print_line(beg, end);") + self.emit( "return;") + self.emitd( "}") + self.emit( "print_line(beg, ret);") + self.emit( "beg = buf = ret + 1;") + self.emit_goto(self.start) + self.emitd("", 2) + + def emit_reject_state(self): + self.emiti("reject:") + self.emit( "if (buf >= end) return;") + self.emit( "beg = buf;") + self.emit_goto(self.start) + self.emitd("") + + def emit_switch(self, case, default=None): + if not case: + if default: + self.emit_goto(default) + return + self.emiti("switch(*buf++) {") + for case, next_ in case.iteritems(): + self.trans_stmt.emit(case, self.state_name(next_)) + if default == self.state_name(self.cg.start) and self.skip_boost: + self.emit("default: ") + self.emit_goto("booster") + else: + self.emit("default: ") + self.emit_goto(default) + self.emitd("}") + + def emit_state(self, cur_state, transition): + if self.filter_only: return + + self.emiti("%s:" % self.state_name(cur_state)) + + if cur_state in self.cg.accepts: + self.emit( "buf--;") + self.emit_goto("accept") + self.emitd("", 2) + return + + if transition.has_key(AnyChar()): + default = transition.pop(AnyChar()) + else: + default = self.cg.start + + if self.table_lookup and (cur_state == self.cg.start or \ + cur_state == default): + if self.skip_boost and default == self.cg.start: + default = "booster" + tbl = [default] * 256 + for eol in self.eols: + tbl[eol.char] = "reject" + for c, n in transition.iteritems(): + tbl[c.char] = self.state_name(n) + self.emit( "static const void *%s_table[256] = {%s};" + % (self.state_name(cur_state), ", ".join(["&&"+x for x in tbl]))) + self.emit( "goto *%s_table[*buf++];" % (self.state_name(cur_state), self.args)) + self.emitd("", 2) + return + + for eol in self.eols: + transition[eol] = "reject" + + for input_ in transition.keys(): + if type(input_) in self.special_rule: + self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) + + self.emit_switch(transition, default) + + self.emitd("", 2) + + class _trans_stmt(ASTWalker): + def __init__(self, emit): + self._emit = emit + self.args = "beg, buf, end" + + def emit(self, input_node, next_): + self.next = next_ + input_node.accept(self) + + def visit(self, input_node): + self._emit("/* UNKNOW RULE */") + self._emit("/* %s */" % input_node.__repr__()) + + def visit_Character(self, char): + self._emit("case %d: /* %s */" % (char.char, char)) + self._emit(" goto %s;" % self.next) + + # Special Rule + def visit_BegLine(self, begline): + self._emit("/* begin of line */") + self._emit("if (buf == beg)") + self._emit(" goto %s;" % self.next, 2) + + def visit_Range(self, range): + if isinstance(range.lower, MBCharacter) and not \ + isinstance(range.upper, MBCharacter) or \ + isinstance(range.upper, MBCharacter) and not \ + isinstance(range.lower, MBCharacter): + return + + if isinstance(range.lower, MBCharacter): + self.visit(range) + else: + self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) + self._emit(" buf++;") + self._emit(" goto %s;" % self.next, 2) def test(): import doctest