changeset 88:dafb393108f3

impliment goto-based grep. (icc's tailcall-optimization is suck!)
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 14 Nov 2010 07:56:35 +0900
parents d23f12ce0369
children 933d422f21f0
files pyrect/jitgrep.py pyrect/translator/__init__.py pyrect/translator/goto_grep_translator.py
diffstat 3 files changed, 324 insertions(+), 87 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Sun Nov 14 04:16:12 2010 +0900
+++ b/pyrect/jitgrep.py	Sun Nov 14 07:56:35 2010 +0900
@@ -13,7 +13,7 @@
                   [--time] [--debug] [--cc=compiler] [-c]
                   [-Olevel] regexp [file..] [--out=file]
                   [--thread=thread_num] [--filter=algorithm]
-                  [--disable-booster]"""
+                  [--disable-booster] [--functional-transition]"""
     psr = OptionParser(usage=myusage)
 
     redirect = ""
@@ -72,10 +72,11 @@
 
     if cbc:
         grept = CbCGREPTranslator(reg)
-    elif opts.label:
-        grept = GoToGREPTranslator(reg)
     else:
-        grept = GREPTranslator(reg)
+        if opts.label:
+            grept = GOTOGREPTranslator(reg)
+        else:
+            grept = GREPTranslator(reg)
         if opts.filter: grept.filter = opts.filter
         grept.skip_boost = not opts.no_boost
         grept.table_lookup = opts.table_lookup
--- a/pyrect/translator/__init__.py	Sun Nov 14 04:16:12 2010 +0900
+++ b/pyrect/translator/__init__.py	Sun Nov 14 07:56:35 2010 +0900
@@ -5,7 +5,7 @@
 from cbc_translator       import CbCTranslator as CbCTranslator
 from dot_translator       import DotTranslator as DotTranslator
 from grep_translator      import GREPTranslator as GREPTranslator
-from goto_grep_translator import GoToGREPTranslator as GoToGREPTranslator
+from goto_grep_translator import GOTOGREPTranslator as GOTOGREPTranslator
 from dfa_translator       import GNUGREPTranslator as GNUGREPTranslator
 from cbc_grep_translator  import CbCGREPTranslator as CbCGREPTranslator
 
--- a/pyrect/translator/goto_grep_translator.py	Sun Nov 14 04:16:12 2010 +0900
+++ b/pyrect/translator/goto_grep_translator.py	Sun Nov 14 07:56:35 2010 +0900
@@ -1,115 +1,351 @@
 #!/usr/bin/env python
 
+import os
 from c_translator import CTranslator
 from pyrect.regexp import Regexp
+from pyrect.regexp.ast import ASTWalker, AnyChar, Character
 
-class GoToGREPTranslator(CTranslator):
-    """GoToGREPTranslator
+class GREPTranslateExeption(Exception):
+    pass
+
+class GOTOGREPTranslator(CTranslator):
+    """GREPTranslator
     This Class can translate form DFA into grep source-code.
     which based on (beautiful) mini-grep introduced  \"The Practice of Programming\"
-    written by Rob Pike & Brian W. Kernighan. (see template/grep.label)
-    >>> string = \"(build|fndecl|gcc)\"
+    written by Rob Pike & Brian W. Kernighan. (see template/grep.c)
+    >>> string = \"(def)\"
     >>> reg = Regexp(string)
-    >>> tje = GoToGREPTranslator(reg)
+    >>> tje = GREPLABELTranslator(reg)
     >>> tje.translate()
     """
 
+    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
     def __init__(self, regexp):
-        CTranslator.__init__(self, regexp)
-        self.funType = 'void '
-        self.callType = 'return '
-        self.breakStatement = ''
-        self.begline = False
-        self.__bufsize = 1024
+        CTranslator.__init__(self, regexp, fa="DFA")
+        self.__bufsize = 1024 * 1024
+        self.thread_dfa = 1
+        self.thread_line = 1
+        self.filter = "quick"
+        self.filter_only = False
+        self.filter_prefix = False
+        self.filter_key = ""
+        self.skip_boost = True
+        self.table_lookup = False
+        self.start = self.cg.start
+        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
+        self.args = "beg, buf, end"
 
     def getbufsize(self,):
         return self.__bufsize
     def setbufsize(self, bufsize):
         self.__bufsize = abs(bufsize)
 
-    bufsize = property(getbufsize, setbufsize)
-
-    def emit_accept_state(self):
-        self.emit ("""
-\taccept:
-\t\tprintf(\"%s\\n\", buf);
-\t\treturn;
-""")
-
-    def emit_reject_state(self):
-        self.emit("\treject:\n")
-        if not self.begline:
-            self.emit("""
-\t\tif (*cur++ == '\\0')
-\t\t\treturn;
-\t\ts = cur;
-\t\tgoto %s;
-""" % self.state_name(self.cg.start))
-        self.emit("return;\n")
-
-
-    def emit_initialization(self):
-        self.emit("#include <stdio.h>\n")
-        self.emit("#include <stdlib.h>\n")
-        self.emit("#include <string.h>\n\n")
-
-        self.emit("#define LINEBUFSIZE 1024\n")
-        self.emit("#define READBUFSIZE %d\n\n" % (self.bufsize))
-        self.emit("char readbuf[%d];\n\n" % (self.bufsize))
+        bufsize = property(getbufsize, setbufsize)
 
-        self.emit("%sDFA(char* s, char *buf, char* filename);\n" % self.funType)
-        grepsource = open("template/grep.label")
-        self.emit(grepsource.read())
-
-    def emit_filter(self):
-        pass
-
-    def emit_driver(self):
-        self.emit("""
-int main(int argc, char* argv[]) {
-\treturn grepmain(argc, argv);
-}\n\n""")
+    def state_name(self, name):
+        if name in ["accept", "reject", "matcher", "entry"\
+                    , "bmh_filter", "quick_filter", "booster"]:
+            return name
+        else:
+            return "state_"+str(name)
 
-    def emit_switch(self, case, default=None):
-        self.emit("\t\tswitch(*s++) {\n")
-        for input, next_state in case.iteritems():
-            if input != '':
-                self.emit("\t\t\tcase '%s': \n" % (input))
-                self.emit("\t\t\t\tgoto %s;\n" % (self.state_name(next_state)))
-                if self.breakStatement != '': self.emit(self.breakStatement+'\n')
-
-        if default:
-            self.emit( """\t\t\tdefault:\n\t\t\tgoto %s;\n""" % default)
-        self.emit("\t\t}\n")
-
-    def emit_state(self, cur_state, transition):
-        self.emit("\t" + self.state_name(cur_state) + ":\n")
-        if cur_state in self.cg.accepts:
-            self.emit("\t\tgoto accept;\n")
-        else:
-            if transition:
-                if self.cg.type == "DFA":
-                    self.emit_switch(transition, default="reject")
-                else:
-                    self.emit_switch(transition)
-        self.emit("\n")
+    def emit_goto(self, state):
+        self.emit("goto %s;" % self.state_name(state))
 
     def emit_from_callgraph(self):
         # self.emit C-source code
         self.emit_initialization()
+
+        self.emiti("void matcher(%s) {" % self.interface)
         self.emit_driver()
-
-        self.emit("void DFA(char *s, char *buf, char* filename) {\n")
-        self.emit("\tchar *cur = s;\n")
-        self.emit("\tgoto %s;\n" % self.state_name(self.cg.start))
-
         for cur_state, transition in self.cg.map.iteritems():
             self.emit_state(cur_state, transition)
-
         self.emit_accept_state()
         self.emit_reject_state()
+        self.emit("return;")
+        self.emitd("}", 2)
 
-        self.emit("}\n\n")
+    def emit_initialization(self):
+        self.emit("#include <stdio.h>")
+        self.emit("#include <stdlib.h>")
+        self.emit("#include <sys/mman.h>")
+        self.emit("#include <sys/types.h>")
+        self.emit("#include <sys/stat.h>")
+        self.emit("#include <fcntl.h>")
+        self.emit("#include <unistd.h>")
+        self.emit("#include <string.h>", 2)
+
+        self.emit("typedef unsigned char   UCHAR;")
+        self.emit("typedef unsigned char *UCHARP;")
+
+        self.emit("void matcher(%s);" % self.interface);
+        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);")
+        self.emit("void print_line(UCHARP buf, UCHARP beg);", 2)
+
+        if (self.filter == "bmh" or self.filter == "quick")\
+               and self.regexp.must_words:
+            self.filter_key = max(self.regexp.must_words, key=len)
+            if len(self.regexp.must_words) == 1 and len(self.filter_key) == self.regexp.min_len:
+                self.filter_only = True
+            if self.filter == "bmh":
+                self.start = "bmh_filter"
+            else:
+                self.start = "quick_filter"
+        else:
+            self.filter = False
+
+        if self.skip_boost and not self.filter_only and \
+               not AnyChar() in self.regexp.chars and \
+               self.regexp.min_len > 2:
+            pass
+        else:
+            self.skip_boost = False
+
+        grepsource = open(self.BASE_DIR + "/template/grep.c")
+        self.emit(grepsource.read())
+
+    def emit_bmh_filter(self, key):
+        l = len(key)
+
+        self.emiti("bmh_filter:")
+
+        def emit_next():
+            if self.filter_only:
+                self.emit_goto("accept")
+            elif self.filter_prefix:
+                self.emit("buf++;")
+                self.emit_goto(self.cg.start)
+            else:
+                self.emit("beg = get_line_beg(buf, beg);")
+                self.emit("buf = beg;")
+                self.emit_goto(self.cg.start)
+
+        l = len(key)
+        if l == 1:
+            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
+            self.emit("if (buf == NULL) return;")
+            emit_next()
+            self.emitd("", 2)
+            return
+
+        skip = dict()
+        for i in range(l-1):
+            skip[key[i]] = l-1-i
+
+        self.emit("buf += %d;" % (l-1), 2)
+
+        self.emiti("while (buf < end) {")
+        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1])))
+        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1))
+        self.emiti(    "while (*(--tmp1) == *(--tmp2)) {")
+        self.emiti(       "if (tmp2 == key) {")
+        emit_next()
+        self.emitd(       "}")
+        self.emitd(    "}")
+        self.emitd(  "}")
+        self.emiti(  "switch(*buf) {")
+        for k, v in skip.iteritems():
+            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
+            self.emit(     "buf += %d; break;" % v), self.dedent()
+        self.emiti("default: buf += %d;" % l), self.dedent()
+        self.emitd(  "}")
+        self.emitd("}")
+        self.emit( "return;")
+        self.emitd("", 2)
+
+    def emit_quick_filter(self, key):
+        l = len(key)
+
+        self.emiti("quick_filter:")
+
+        def emit_next():
+            if self.filter_only:
+                self.emit_goto("accept")
+            elif self.filter_prefix:
+                self.emit("buf+%d;" % l)
+                self.emit_goto(self.cg.start)
+            else:
+                self.emit("beg = get_line_beg(buf, beg);")
+                self.emit("buf = beg;")
+                self.emit_goto(self.cg.start)
+
+        l = len(key)
+        if l == 1:
+            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
+            self.emit("if (buf == NULL) return;")
+            emit_next()
+            self.emitd("}", 2)
+            return
+
+        skip = dict()
+        for i in range(l):
+            skip[key[i]] = l-i
+
+        self.emit("end_ = end - %d;" % (l-1),  2)
+        self.emiti("while (buf < end_) {")
+        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0])))
+        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key;")
+        self.emiti(    "while (*(++tmp1) == *(++tmp2)){")
+        self.emiti(       "if (tmp2 == key+%d) {" % (l-1))
+        emit_next()
+        self.emitd(       "}")
+        self.emitd(    "}")
+        self.emitd(  "}")
+        self.emiti(  "switch(buf[%d]) {" % l)
+        for k, v in skip.iteritems():
+            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
+            self.emit(     "buf += %d; break;" % v), self.dedent()
+        self.emiti("default: buf += %d;" % (l+1)), self.dedent()
+        self.emitd(  "}")
+        self.emitd("}")
+        self.emit( "return;")
+        self.emitd("", 2)
+
+    def emit_booster(self, min_len, chars):
+        self.emiti("booster:")
+        self.emit(   "end_ = end - %d;" % (min_len-1))
+        self.emit(   "if (buf > end_) return;")
+        self.emiti(  "do {")
+        self.emiti(    "switch (buf[%d]) {" % (min_len-1))
+        for c in chars:
+            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
+        self.emit_goto(self.cg.start)
+        self.emitd(    "}")
+        self.emitd(  "} while((buf += %d) <= end_);" % min_len)
+        self.emit("return;")
+        self.emit("", 2)
+
+    def emit_driver(self):
+        self.emit("UCHARP end_, ret;")
+        if self.skip_boost:
+            self.emit_booster(self.regexp.min_len, self.regexp.chars)
+        if self.filter:
+            self.emit("UCHARP tmp1, tmp2; static const UCHAR key[] = \"%s\";" % self.filter_key)
+            self.emit_goto(self.filter + "_filter")
+            if self.filter == "bmh":
+                self.emit_bmh_filter(self.filter_key)
+            else:
+                self.emit_quick_filter(self.filter_key)
+        else:
+            self.emit_goto(self.cg.start)
+        return
+
+    def emit_accept_state(self):
+        self.emiti("accept:")
+        self.emit(   "ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
+        if self.skip_boost or self.filter:
+            self.emit(   "beg = get_line_beg(buf, beg);")
+        self.emiti(  "if (ret == NULL) {")
+        self.emit(     "print_line(beg, end);")
+        self.emit(     "return;")
+        self.emitd(  "}")
+        self.emit(   "print_line(beg, ret);")
+        self.emit(   "beg = buf = ret + 1;")
+        self.emit_goto(self.start)
+        self.emitd("", 2)
+
+    def emit_reject_state(self):
+        self.emiti("reject:")
+        self.emit(   "if (buf >= end) return;")
+        self.emit(   "beg = buf;")
+        self.emit_goto(self.start)
+        self.emitd("")
+
+    def emit_switch(self, case, default=None):
+        if not case:
+            if default:
+                self.emit_goto(default)
+            return
+        self.emiti("switch(*buf++) {")
+        for case, next_ in case.iteritems():
+            self.trans_stmt.emit(case, self.state_name(next_))
+        if default == self.state_name(self.cg.start) and self.skip_boost:
+            self.emit("default: ")
+            self.emit_goto("booster")
+        else:
+            self.emit("default: ")
+            self.emit_goto(default)
+        self.emitd("}")
+
+    def emit_state(self, cur_state, transition):
+        if self.filter_only: return
+
+        self.emiti("%s:" % self.state_name(cur_state))
+
+        if cur_state in self.cg.accepts:
+            self.emit(   "buf--;")
+            self.emit_goto("accept")
+            self.emitd("", 2)
+            return
+
+        if transition.has_key(AnyChar()):
+            default = transition.pop(AnyChar())
+        else:
+            default = self.cg.start
+
+        if self.table_lookup and (cur_state == self.cg.start or \
+                                  cur_state == default):
+            if self.skip_boost and default == self.cg.start:
+                default = "booster"
+            tbl = [default] * 256
+            for eol in self.eols:
+                tbl[eol.char] = "reject"
+            for c, n in transition.iteritems():
+                tbl[c.char] = self.state_name(n)
+                self.emit(   "static const void *%s_table[256] = {%s};"
+                             % (self.state_name(cur_state), ", ".join(["&&"+x for x in tbl])))
+                self.emit(   "goto *%s_table[*buf++];" % (self.state_name(cur_state), self.args))
+                self.emitd("", 2)
+            return
+
+        for eol in self.eols:
+            transition[eol] = "reject"
+
+        for input_ in transition.keys():
+            if type(input_) in self.special_rule:
+                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
+
+        self.emit_switch(transition, default)
+
+        self.emitd("", 2)
+
+    class _trans_stmt(ASTWalker):
+        def __init__(self, emit):
+            self._emit = emit
+            self.args = "beg, buf, end"
+
+        def emit(self, input_node, next_):
+            self.next = next_
+            input_node.accept(self)
+
+        def visit(self, input_node):
+            self._emit("/* UNKNOW RULE */")
+            self._emit("/* %s */" % input_node.__repr__())
+
+        def visit_Character(self, char):
+            self._emit("case %d: /* %s */" % (char.char, char))
+            self._emit("  goto %s;" % self.next)
+
+        # Special Rule
+        def visit_BegLine(self, begline):
+            self._emit("/* begin of line  */")
+            self._emit("if (buf == beg)")
+            self._emit("  goto %s;" % self.next, 2)
+
+        def visit_Range(self, range):
+            if isinstance(range.lower, MBCharacter) and not \
+               isinstance(range.upper, MBCharacter) or  \
+               isinstance(range.upper, MBCharacter) and not \
+               isinstance(range.lower, MBCharacter):
+                return
+
+            if isinstance(range.lower, MBCharacter):
+                self.visit(range)
+            else:
+                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
+                self._emit("  buf++;")
+                self._emit("  goto %s;" % self.next, 2)
 
 def test():
     import doctest