changeset 67:b02b321d0e06

implement bm_filter on mmap. but it's slower than dfa. ?;-(
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 07 Nov 2010 01:48:19 +0900
parents c657ca0f0b30
children 56a997f2c121
files pyrect/regexp/analyzer.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c
diffstat 3 files changed, 67 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/regexp/analyzer.py	Sat Nov 06 00:55:01 2010 +0900
+++ b/pyrect/regexp/analyzer.py	Sun Nov 07 01:48:19 2010 +0900
@@ -8,7 +8,7 @@
 """
 
 from pyrect.regexp.parser import Parser
-from pyrect.regexp.ast import ASTWalker
+from pyrect.regexp.ast import ASTWalker, Plus
 
 class Analyzer(ASTWalker):
     """ Extract with Visitor-Pattern.
@@ -25,7 +25,10 @@
     (inf, 1, [])
     >>> an.analyze(prs.parse('(plus)?(qmark)?'))
     (9, 0, [])
+    >>> an.analyze(prs.parse('\*+ \[\['))
+    (inf, 1, [])
     """
+
     def __init__(self, ast=None):
         if ast:
             self.analyze(ast)
@@ -65,7 +68,7 @@
 
     def visit_Plus(self, plus):
         (_, m, k) = plus.op.accept(self)
-        return float("inf"), m, ["", k, ""]
+        return float("inf"), m, k + [""] + k
 
     def visit_Qmark(self, qmark):
         (m, _, _) = qmark.op.accept(self)
--- a/pyrect/translator/grep_translator.py	Sat Nov 06 00:55:01 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Sun Nov 07 01:48:19 2010 +0900
@@ -27,6 +27,7 @@
         self.thread_dfa = 1
         self.thread_line = 1
         self.filter = True
+        self.start = "matcher"
         self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
         self.args = "beg, buf, end"
 
@@ -40,6 +41,7 @@
     def emit_initialization(self):
         self.emit("#include <stdio.h>")
         self.emit("#define GREP grep")
+        self.emit("#define UCHAR unsigned char")
         self.emit("#define UCHARP unsigned char *")
         self.emit("#include <stdlib.h>")
         self.emit("#include <sys/mman.h>")
@@ -54,10 +56,11 @@
             self.emit("void %s(%s);" % (self.state_name(state), self.interface))
         self.emit('void accept(%s);' % self.interface)
         self.emit('void reject(%s);' % self.interface)
-        self.emit("void dfa(%s);" % self.interface, 2)
+        self.emit("void matcher(%s);" % self.interface, 2)
 
-        #if self.filter and self.regexp.must_words:
-        #    self.emit_filter(self.regexp.must_words)
+        if self.filter and self.regexp.must_words:
+            self.emit_filter(self.regexp.must_words)
+        else: self.filter = False
 
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
@@ -71,17 +74,31 @@
 
         key = reduce(longest, words)
 
-        if len(words) == 1:
-            if len(key) == self.regexp.min_len:
-                self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1)
+        if len(words) == 1 and len(key) == self.regexp.min_len:
+            filter_only = True
         else:
-            self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1)
+            filter_only = False
+            filter_prefix = False
 
-        self.emit("#define FILTER bm_filter", 2)
-        self.emiti("int bm_filter(unsigned char* buf, int n) {")
+        def emit_next():
+            self.emit("beg = memrchr(buf, '\\n', beg);")
+            if filter_only:
+                self.emit("accept(%s);" % self.args)
+            elif filter_prefix:
+                self.emit("buf -= %d;" % len(key))
+                self.emit("dfa(%s);" % self.args)
+            else:
+                self.emit("buf = beg;")
+                self.emit("dfa(%s);" % self.args)
+
+        self.emit("UCHARP memrchr(UCHARP p, int c, UCHARP beg);", 2)
+
+        self.emiti("void bm_filter(%s) {" % self.interface)
         l = len(key)
         if l == 1:
-            self.emit("   return (strchr(buf, %d) != NULL)" % ord(key))
+            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
+            self.emit("if (buf == NULL) return;")
+            emit_next()
             self.emitd("}", 2)
             return
 
@@ -89,37 +106,46 @@
         for i in range(l - 1):
             skip[ord(key[i])] = str(l-1-i)
 
-        self.emit('static unsigned char key[] = "%s";' % key)
+        self.emit('static UCHAR key[] = "%s";' % key)
         self.emiti(   "static int skip[256] = {")
         for i in range(8):
             i = i * 32
             self.emit(",".join(skip[i:i+32]) + ",")
         self.emitd(   "};")
 
-        self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l))
-        self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2)
-        self.emiti("while (i < n) {")
-        self.emit(   "c = buf[i];")
-        self.emiti(  "if (c == tail) {")
-        self.emit(     "j = len - 1; k = i;")
-        self.emiti(    "while (key[--j] == buf[--k]) {")
-        self.emit(       "if (j == 0) return 1;")
+        self.emit("UCHARP tmp; register UCHAR c;",  2)
+        self.emit("int i; buf += %d;" % (l-1))
+        self.emiti("while (buf < end) {")
+        self.emiti(  "if ((c = *buf) == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1]))
+        self.emit(     "i = %d; tmp = buf;" % (l-1))
+        self.emiti(    "while (key[--i] == *(--tmp)) {")
+        self.emiti(       "if (i == 0) {")
+        self.emit(          "beg = memrchr(buf, '\\n', beg);")
+        self.emit(          "goto next;")
+        self.emitd(       "}")
         self.emitd(    "}")
         self.emitd(  "}")
-        self.emit(   "i += skip[c];")
+        self.emit(   "buf += skip[c];")
         self.emitd("}")
-        self.emit( "return 0;")
+        self.emit( "return;")
+        self.emit( "next:")
+        emit_next()
         self.emitd("}", 2)
 
     def emit_driver(self):
-        self.emiti("void dfa(%s) {" % self.interface)
-        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emiti("void matcher(%s) {" % self.interface)
+        if self.filter:
+            self.emit(   "%s(%s);" % ("bm_filter", self.args))
+        else:
+            self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
         self.emit(   "return;")
         self.emitd("}")
         return
 
     def emit_accept_state(self):
         self.emiti("void accept(%s) {" % self.interface)
+        #self.emit(   "printf(\"*beg = %c, *buf = %c, *end = %c; \\n\", *beg, *buf, *end);")
+        #self.emit(   "printf(\"beg = %x, buf = %x, end = %x; \\n\", beg, buf, end);")
         self.emit(   "buf--;")
         self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
         self.emit(   'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}')
@@ -130,14 +156,14 @@
         self.emitd(  "}")
         self.emit(   "print_line(beg, ret);")
         self.emit(   "beg = buf = ret + 1;")
-        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emit(   "%s(%s);" % (self.start, self.args))
         self.emitd("}", 2)
 
     def emit_reject_state(self):
         self.emiti("void reject(%s) {" % self.interface)
         self.emit(   "if (buf >= end) return;")
         self.emit(   "beg = buf;")
-        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emit(   "%s(%s);" % (self.start, self.args))
         self.emitd("}", 2)
 
     def emit_switch(self, case, default=None):
--- a/pyrect/translator/template/grep.c	Sat Nov 06 00:55:01 2010 +0900
+++ b/pyrect/translator/template/grep.c	Sun Nov 07 01:48:19 2010 +0900
@@ -1,10 +1,17 @@
-void print_line(unsigned char *beg, unsigned char *end) {
+UCHARP memrchr(UCHARP p, int c, UCHARP beg) {
+  while(p > beg) {
+    if ((*--p) == c) return p+1;
+  }
+  return beg;
+}
+
+void print_line(UCHARP beg, UCHARP end) {
   fwrite(beg, sizeof(char), (end - beg + 1), stdout);
 }
 
 void grep(char *regexp, int fd, char *name) {
   caddr_t file_mmap;
-  unsigned char *buf, *end, *beg;
+  UCHARP buf, *end, *beg;
   off_t size;
   struct stat sb;
 
@@ -21,10 +28,10 @@
     exit(0);
   }
 
-  beg = buf = (unsigned char *) file_mmap;
+  beg = buf = (UCHARP) file_mmap;
   end = beg + size - 1;
 
-  dfa(beg, beg, end);
+  matcher(beg, beg, end);
 
   munmap(file_mmap, size);
   pthread_exit(NULL);