changeset 101:5e509a9c951c

modify CbC-grep.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Tue, 14 Dec 2010 04:08:09 +0900
parents 6aab6b1038f0
children a38b57592d45
files pyrect/jitgrep.py pyrect/regexp/callgraph.py pyrect/regexp/dfa_translator.py pyrect/regexp/fa.py pyrect/translator/cbc_grep_translator.py
diffstat 5 files changed, 328 insertions(+), 101 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Sun Dec 12 23:14:05 2010 +0900
+++ b/pyrect/jitgrep.py	Tue Dec 14 04:08:09 2010 +0900
@@ -9,7 +9,7 @@
 from pyrect.regexp import Regexp, CharCollector
 
 def main(argv):
-    myusage = """%prog [--buf-size=size] [--dump]
+    myusage = """%prog [--buf-size=size] [--dump] [--CbC]
                   [--time] [--debug] [--cc=compiler] [-c]
                   [-Olevel] regexp [file..] [--out=file]
                   [--thread=thread_num] [--filter=algorithm]
@@ -34,6 +34,7 @@
     psr.add_option("--label", action="store_true", dest="label", default=False, help="label implimentation in C.")
     psr.add_option("--dump", action="store_true", dest="dump", default=False, help="Dump generated grep-source.")
     psr.add_option("--out", action="store", type="string", dest="out", default="", metavar="FILE", help="Output file.")
+    psr.add_option("--CbC", action="store_true", dest="cbc", default=False, help="emit CbC-source.")
 
     (opts, args) = psr.parse_args(argv)
 
@@ -41,12 +42,8 @@
         psr.print_usage()
         return
 
-    if opts.cc == "cbc":
-        cbc = True
-        opts.cc = "$CBCROOT/INSTALL_DIR/bin/gcc"
-        opts.cflags += " -L$CBCROOT/gcc -w"
-    else:
-        cbc = False
+    if opts.cbc == True:
+        opts.cc = "gcc-cbc"
 
     if opts.debug: print("option", opts)
     if opts.debug: print("args", args)
@@ -70,7 +67,7 @@
     reg.chars = Regexp.get_chars(string)
     (reg.max_len, _, _) = Regexp.get_analyze(string)
 
-    if cbc:
+    if opts.cbc:
         grept = CbCGREPTranslator(reg)
     else:
         if opts.label:
--- a/pyrect/regexp/callgraph.py	Sun Dec 12 23:14:05 2010 +0900
+++ b/pyrect/regexp/callgraph.py	Tue Dec 14 04:08:09 2010 +0900
@@ -1,4 +1,4 @@
-#!/Usr/bin/env python
+#!/usr/bin/env python
 # -*- encoding: utf-8 -*-
 
 import copy
--- a/pyrect/regexp/dfa_translator.py	Sun Dec 12 23:14:05 2010 +0900
+++ b/pyrect/regexp/dfa_translator.py	Tue Dec 14 04:08:09 2010 +0900
@@ -2,7 +2,7 @@
 #-*- encoding: utf-8 -*-
 
 from pyrect.regexp.parser import Parser
-from pyrect.regexp.ast import ASTWalker, AnyChar
+from pyrect.regexp.ast import ASTWalker, AnyChar, Range
 from pyrect.regexp.fa import Transition
 from pyrect.regexp.nfa_translator import NFATranslator
 from pyrect.regexp.nfa import NFA, SuffixNFA, SuffixTrieNFA
@@ -42,13 +42,12 @@
 
         while que:
             states = que.pop()
+
             anys = set()
             for state in states:
                 trans = nfa.transition[state]
-                if trans == None: continue
-                for i, n in trans.iteritems():
-                    if i == AnyChar():
-                        anys.update(nfa.epsilon_expand(n))
+                if trans and trans.has_key(AnyChar()):
+                    anys.update(nfa.epsilon_expand(trans[AnyChar()]))
 
             for state in states:
                 trans = nfa.transition[state]
--- a/pyrect/regexp/fa.py	Sun Dec 12 23:14:05 2010 +0900
+++ b/pyrect/regexp/fa.py	Tue Dec 14 04:08:09 2010 +0900
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 
+from pyrect.regexp.ast import AnyChar, Range
+
 class FA(object):
     def __init__(self, transition, start, accepts, states=None):
         self.transition = transition
--- a/pyrect/translator/cbc_grep_translator.py	Sun Dec 12 23:14:05 2010 +0900
+++ b/pyrect/translator/cbc_grep_translator.py	Tue Dec 14 04:08:09 2010 +0900
@@ -1,128 +1,357 @@
 #!/usr/bin/env python
 
 import os
+from c_translator import CTranslator
 from pyrect.regexp import Regexp
-from pyrect.translator import CbCTranslator
+from pyrect.regexp.ast import ASTWalker, AnyChar, Character, SpecialInputNode
 
 class CbCGREPTranslateExeption(Exception):
     pass
 
-class CbCGREPTranslator(CbCTranslator):
+class CbCGREPTranslator(CTranslator):
     """CbCGREPTranslator
     This Class can translate form DFA into grep source-code.
     which based on (beautiful) mini-grep introduced  \"The Practice of Programming\"
     written by Rob Pike & Brian W. Kernighan. (see template/grep.c)
     >>> string = \"(build|fndecl|gcc)\"
     >>> reg = Regexp(string)
-    >>> tje = GREPTranslator(reg)
+    >>> tje = CbCGREPTranslator(reg)
     >>> tje.translate()
     """
 
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
     def __init__(self, regexp):
-        CbCTranslator.__init__(self, regexp)
-        self.interface = "unsigned char *s, unsigned char* cur, unsigned char* buf, FILE *f, char* filename"
-        self.args = "s, cur, buf, f, filename"
-        self.print_file = False
+        CTranslator.__init__(self, regexp)
         self.__bufsize = 1024 * 1024
-        self.trans_stmt = self._trans_stmt(self.emit, self.args)
+        self.thread_dfa = 1
+        self.thread_line = 1
+        self.filter = "quick"
+        self.filter_only = False
+        self.filter_prefix = False
+        self.skip_boost = True
+        self.table_lookup = False
+        self.start = "entry"
+        self.interface = "UCHARP beg, UCHARP buf, UCHARP end, ENVP envp"
+        self.args = "beg, buf, end, envp"
 
-    def getbufsize(self):
+    def getbufsize(self,):
         return self.__bufsize
     def setbufsize(self, bufsize):
         self.__bufsize = abs(bufsize)
 
-    bufsize = property(getbufsize, setbufsize)
-
-    def state_name(self, state):
-        if state in ("accept", "reject", "next_ptr", "next_line", "returner"):
-            return state
-        else:
-            return "state_" + state
-
-
-    def emit_accept_state(self):
-        self.emit("__code accept(%s) {\n" % self.interface)
-        if self.print_file:
-            self.emit("  printf(\"%s: %s\\n\", filename, buf);\n")
-        else:
-            self.emit("  printf(\"%s\\n\", buf);\n")
-        self.emit("    goto next_line(%s);\n}\n\n" % self.args)
-
-    def emit_reject_state(self):
-        self.emit("""
-__code reject(%s) {
-  goto next_ptr(%s);
-}
-""" % (self.interface, self.args))
-
-    def emit_next_state(self):
-        self.emit ("""
-__code next_ptr(%s) {
-  if(*cur++ == '\\0')
-    goto next_line(%s);
-  s = cur;
-  goto DFA(%s);
-}
-""" % (self.interface, self.args, self.args))
-
-        self.emit("""
-__code next_line(%s) {
-  if(fgets(buf, LINEBUFSIZE, f) == NULL) {
-    goto returner(%s);
-  }
-  int n = strlen(buf);
-  if (n > 0 && buf[n-1] == '\\n')
-    buf[n-1] = '\\0';
-  cur = buf;
-  s   = cur;
-  goto DFA(%s);
-}
-""" % (self.interface, self.args, self.args))
-        self.emit("""
-__code returner(%s) {
-  return;
-}""" % self.interface)
+        bufsize = property(getbufsize, setbufsize)
 
     def emit_initialization(self):
         self.emit("#include <stdio.h>")
         self.emit("#include <stdlib.h>")
+        self.emit("#include <sys/mman.h>")
+        self.emit("#include <sys/types.h>")
+        self.emit("#include <sys/stat.h>")
+        self.emit("#include <fcntl.h>")
+        self.emit("#include <unistd.h>")
         self.emit("#include <string.h>", 2)
-        self.emit("#define LINEBUFSIZE 1024")
-        self.emit("#define READBUFSIZE %d" % self.bufsize, 2)
+
+        self.emit("typedef unsigned char   UCHAR;")
+        self.emit("typedef unsigned char *UCHARP;")
+        self.emiti("typedef struct env_ {")
+        self.emit(   "void (*ret)();")
+        self.demit("} ENV;")
+        self.emit("typedef ENV *ENVP;", 2)
+
+        self.emit("void matcher(UCHARP beg, UCHARP buf, UCHARP end);")
+        self.emit('__code entry(%s);' % self.interface, 2)
+
+        self.emit('__code accept(%s);' % self.interface)
+        self.emit('__code reject(%s);' % self.interface, 2)
+
+        key = None
+
+        if (self.filter == "bmh" or self.filter == "quick")\
+               and self.regexp.must_words:
+            key = max(self.regexp.must_words, key=len)
+            if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len:
+                self.filter_only = True
+        else:
+            self.filter = False
+
+        if not self.filter_only:
+            for state in self.fa.transition.iterkeys():
+                self.emit("__code %s(%s);" % (self.state_name(state), self.interface))
+            self.emit()
+
+        if self.filter == "bmh":
+            self.emit_bmh_filter(key)
+        elif self.filter:
+            self.emit_quick_filter(key)
+
+        if self.skip_boost and not self.filter_only and \
+               not AnyChar() in self.regexp.chars and \
+               self.regexp.min_len > 2:
+            self.emit_booster(self.regexp.min_len, self.regexp.chars)
+        else:
+            self.skip_boost = False
+
+        grepsource = open(self.BASE_DIR + "/template/grep.c")
+        self.emit(grepsource.read())
+
+    def emit_bmh_filter(self, key):
+        l = len(key)
+        def emit_next():
+            if self.filter_only:
+                self.emit("goto accept(%s);" % self.args)
+            elif self.filter_prefix:
+                self.emit("buf++;")
+                self.emit("goto %s(%s);" % (self.state_name(self.fa.start), self.args))
+            else:
+                self.emit("beg = get_line_beg(buf, beg);")
+                self.emit("buf = beg;")
+                self.emit("goto %s(%s);" % (self.state_name(self.fa.start), self.args))
+
+        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)
+        self.emiti("__code bmh_filter(%s) {" % self.interface)
+        l = len(key)
+        if l == 1:
+            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
+            self.emit("if (buf == NULL) (*envp->ret)();")
+            emit_next()
+            self.demit("}", 2)
+            return
+
+        self.emit('static const UCHAR key[] = "%s";' % key)
+
+        skip = dict()
+        for i in range(l-1):
+            skip[key[i]] = l-1-i
+
+        self.emit("UCHARP tmp1, tmp2; buf += %d;" % (l-1), 2)
 
-        self.emit("__code DFA(%s);\n" % self.interface)
-        for state in self.cg.map.keys() + ["accept", "reject", "next_ptr", "next_line", "returner"]:
-            self.emit("__code %s(%s);" % (self.state_name(state), self.interface))
-        self.emit()
-        grepsource = open(self.BASE_DIR + "/template/grep.cbc")
-        self.emit(grepsource.read())
-        self.emit_next_state()
+        self.emiti("while (buf < end) {")
+        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1])))
+        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1))
+        self.emiti(    "while (*(--tmp1) == *(--tmp2)) {")
+        self.emit(       "if (tmp2 == key) goto next;")
+        self.demit(    "}")
+        self.demit(  "}")
+        self.emiti(  "switch(*buf) {")
+        for k, v in skip.iteritems():
+            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
+            self.emit(     "buf += %d; break;" % v), self.dedent()
+        self.emiti("default: buf += %d;" % l), self.dedent()
+        self.demit(  "}")
+        self.demit("}")
+        self.emit( "(*envp->ret)();")
+        self.emit( "next:")
+        emit_next()
+        self.demit("}", 2)
+
+    def emit_quick_filter(self, key):
+        l = len(key)
+        def emit_next():
+            if self.filter_only:
+                self.emit("goto accept(%s);" % self.args)
+            elif self.filter_prefix:
+                self.emit("buf+%d;" % l)
+                self.emit("goto %s(%s);" % (self.state_name(self.fa.start) ,self.args))
+            else:
+                self.emit("beg = get_line_beg(buf, beg);")
+                self.emit("buf = beg;")
+                self.emit("goto %s(%s);" % (self.state_name(self.fa.start), self.args))
+
+        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)
+
+        self.emiti("__code quick_filter(%s) {" % self.interface)
+        l = len(key)
+        if l == 1:
+            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
+            self.emit("if (buf == NULL) (*envp->ret)();")
+            emit_next()
+            self.demit("}", 2)
+            return
+
+        self.emit('static const UCHAR key[] = "%s";' % key)
 
-    def emit_filter(self):
-        pass
+        skip = dict()
+        for i in range(l):
+            skip[key[i]] = l-i
+
+        self.emit("UCHARP tmp1, tmp2, end_ = end - %d;" % (l-1),  2)
+        self.emiti("while (buf < end_) {")
+        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0])))
+        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key;")
+        self.emiti(    "while (*(++tmp1) == *(++tmp2)){")
+        self.emit(       "if (tmp2 == key+%d) goto next;" % (l-1))
+        self.demit(    "}")
+        self.demit(  "}")
+        if self.table_lookup:
+            self.emiti("static const void * tbl[256] = {[0 ... 255] &&any, %s};"
+                       % ", ".join("[%d] &&add%s" % (ord(c), s) for c, s in skip.iteritems()))
+            self.emit("goto *tbl[buf[%d]];" % l)
+            defun = []
+            for s in skip.itervalues():
+                if s in defun: continue
+                defun.append(s)
+                self.emit("add%s: buf += %s; goto ends;" % (s, s))
+            self.emit("any: buf += %d; ends:;" % (l+1))
+        else:
+            self.emiti(  "switch(buf[%d]) {" % l)
+            for k, v in skip.iteritems():
+                self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
+                self.emit(     "buf += %d; break;" % v), self.dedent()
+            self.emiti("default: buf += %d;" % (l+1)), self.dedent()
+            self.demit(  "}")
+        self.demit("}")
+        self.emit( "(*envp->ret)();")
+        self.emit( "next:")
+        emit_next()
+        self.demit("}", 2)
+
+    def emit_booster(self, min_len, chars):
+        self.emiti("__code booster(%s) {" % self.interface)
+        self.emit(   "UCHARP end_ = end - %d;" % (min_len-1))
+        self.emit(   "if (buf > end_) (*envp->ret)();")
+        self.emiti(  "do {")
+        self.emiti(    "switch (buf[%d]) {" % (min_len-1))
+        for c in chars:
+            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
+        self.emit(       "goto ret;")
+        self.demit(    "}")
+        self.demit(  "} while((buf += %d) <= end_);" % min_len)
+        self.emit(   "ret: goto %s(%s);" % (self.state_name(self.fa.start) , self.args))
+        self.demit("}", 2)
 
     def emit_driver(self):
-        self.emit("""
-int main(int argc, char* argv[]) {
-  grepmain(argc, argv);
-  return;
-}
-""")
-        self.emit("""
-__code DFA(%s) {
-  goto %s(%s);
-}
-""" % (self.interface, self.state_name(self.cg.start), self.args))
+        self.emiti("void matcher(UCHARP beg, UCHARP buf, UCHARP end) {")
+        self.emit(   "__label__ _return;")
+        self.emiti(  "void __return() {")
+        self.emit(     "goto _return;")
+        self.demit(  "}")
+        self.emit(  "ENVP envp = malloc(sizeof(ENV));")
+        self.emit(  "envp->ret = __return;")
+        self.emit(   "goto entry(%s);" % self.args)
+        self.demiti("_return:")
+        self.emit(   "return;")
+        self.demit("}", 2)
+
+        self.emiti("__code entry(%s) {" % self.interface)
+        if self.filter:
+            self.emit(   "goto %s(%s);" % (self.filter + "_filter", self.args))
+        else:
+            self.emit(   "goto %s(%s);" % (self.state_name(self.fa.start), self.args))
+        self.demit("}", 2)
+
+    def emit_accept_state(self):
+        self.emiti("__code accept(%s) {" % self.interface)
+        self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
+        if self.skip_boost or self.filter:
+            self.emit(   "beg = get_line_beg(buf, beg);")
+        self.emiti(  "if (ret == NULL) {")
+        self.emit(     "print_line(beg, end);")
+        self.emit(     "(*envp->ret)();")
+        self.demit(  "}")
+        self.emit(   "print_line(beg, ret);")
+        self.emit(   "beg = buf = ret + 1;")
+        self.emit(   "goto %s(%s);" % (self.start, self.args))
+        self.demit("}", 2)
+
+    def emit_reject_state(self):
+        self.emiti("__code reject(%s) {" % self.interface)
+        self.emit(   "if (buf >= end) (*envp->ret)();")
+        self.emit(   "beg = buf;")
+        self.emit(   "goto %s(%s);" % (self.start, self.args))
+        self.demit("}", 2)
+
+    def emit_switch(self, case, default=None):
+        if not case:
+            if default:
+                self.emit("goto %s(%s);" % (default, self.args))
+            return
+        self.emiti("switch(*buf++) {")
+        for case, next_ in case.iteritems():
+            self.trans_stmt.emit(case, self.state_name(next_))
+        if default == self.state_name(self.fa.start) and self.skip_boost:
+            self.emit("default: goto booster(%s);" % self.args)
+        else:
+            self.emit("default: goto %s(%s);" % (default, self.args))
+        self.demit("}")
 
     def emit_state(self, cur_state, transition):
-        if cur_state in self.cg.accepts:
-            self.emiti("__code %s(%s) {" % (self.state_name(cur_state), self.interface))
-            self.emit(   "goto accept(%s);" % self.args)
-            self.emitd("}")
+        if self.filter_only: return
+
+        self.emiti("__code %s(%s) {" % (self.state_name(cur_state), self.interface))
+
+        if cur_state in self.fa.accepts:
+            self.emit(   "goto accept(beg, buf-1, end, envp);")
+            self.demit("}", 2)
+            return
+
+        if transition.has_key(AnyChar()):
+            default = self.state_name(transition.pop(AnyChar()))
         else:
-            CbCTranslator.emit_state(self, cur_state, transition)
+            default = self.state_name(self.fa.start)
+
+        if self.table_lookup and (cur_state == self.fa.start or \
+           self.state_name(cur_state) == default):
+            if self.skip_boost and default == self.state_name(self.fa.start):
+                default = "booster"
+            tbl = dict()
+            for eol in self.eols:
+                tbl[eol.char] = "reject"
+            for c, n in transition.iteritems():
+                tbl[c.char] = self.state_name(n)
+            self.emit(   "static __code (*%s_table[256])(UCHARP, UCHARP, UCHARP) = {[0 ... 255] = (void*)%s, %s};"
+                         % (self.state_name(cur_state), default,
+                            ", ".join("[%d] = %s" % (i, s) for (i, s) in tbl.items())))
+            self.emit(   "goto %s_table[*buf++](%s);" % (self.state_name(cur_state), self.args))
+            self.demit("}", 2)
+            return
+
+        for eol in self.eols:
+            transition[eol] = "reject"
+
+        for input_ in transition.keys():
+            if isinstance(input_, SpecialInputNode):
+                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
+
+        self.emit_switch(transition, default)
+
+        self.demit("}", 2)
+
+    class _trans_stmt(ASTWalker):
+        def __init__(self, emit):
+            self._emit = emit
+            self.args = "beg, buf, end, envp"
+
+        def emit(self, input_node, next_):
+            self.next = next_
+            input_node.accept(self)
+
+        def visit(self, input_node):
+            self._emit("/* UNKNOW RULE */")
+            self._emit("/* %s */" % input_node.__repr__())
+
+        def visit_Character(self, char):
+            self._emit("case %d: /* %s */" % (char.char, char))
+            self._emit("  goto %s(%s);" % (self.next, self.args))
+
+        # Special Rule
+        def visit_BegLine(self, begline):
+            self._emit("/* begin of line  */")
+            self._emit("if (buf == beg)")
+            self._emit("  goto %s(%s);" % (self.next, self.args), 2)
+
+        def visit_Range(self, range):
+            if isinstance(range.lower, MBCharacter) and not \
+               isinstance(range.upper, MBCharacter) or  \
+               isinstance(range.upper, MBCharacter) and not \
+               isinstance(range.lower, MBCharacter):
+                return
+
+            if isinstance(range.lower, MBCharacter):
+                self.visit(range)
+            else:
+                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
+                self._emit("  goto %s(beg, buf+1, end, envp);" % self.next, 2)
 
 def test():
     import doctest