diff pyrect/translator/grep_translator.py @ 57:81b44ae1cd73

add fixed-string filter(Boyer-Moore), and add option '--disable-filter'.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Mon, 01 Nov 2010 14:41:03 +0900
parents ee9945561f80
children 81337db23999
line wrap: on
line diff
--- a/pyrect/translator/grep_translator.py	Wed Oct 27 20:46:41 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Mon Nov 01 14:41:03 2010 +0900
@@ -2,7 +2,7 @@
 
 import os
 from c_translator import CTranslator
-from pyrect.regexp import Regexp
+from pyrect.regexp import Regexp, Analyzer
 
 class GREPTranslateExeption(Exception):
     pass
@@ -25,6 +25,7 @@
         self.__bufsize = 1024 * 1024
         self.parallel_match = False
         self.thread_num = 0
+        self.filter = True
 
     def getbufsize(self,):
         return self.__bufsize
@@ -35,24 +36,71 @@
 
     def emit_initialization(self):
         CTranslator.emit_initialization(self)
+
+        if self.thread_num > 1:
+            self.emit("#define GREP paragrep")
+        else:
+            self.emit("#define GREP grep")
+
         self.emit("#define LINEBUFSIZE %d" % self.bufsize)
         self.emit("#define READBUFSIZE %d" % self.bufsize)
         self.emit('#define THREAD_NUM %d' % self.thread_num)
         self.emit('#define THREAD_BUF %d' % 3)
         self.emit('#include <pthread.h>')
-        if self.thread_num > 1:
-            self.emit("#define GREP paragrep")
-        else:
-            self.emit("#define GREP grep")
         self.emit("#include <stdlib.h>")
         self.emit("#include <string.h>")
         self.emit("char readbuf[%d];" % (self.bufsize))
         self.emit("int DFA(unsigned char* s);", 2)
+
+        if self.filter and self.regexp.must_words:
+            self.emit_filter(self.regexp.must_words)
+
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
 
-    def emit_filter(self):
-        pass
+    def emit_filter(self, words):
+        def longest(s1, s2):
+            return s1 if len(s1) >= len(s2) else s2
+        key = reduce(longest, words)
+
+        if len(words) == 1:
+            if len(key) == self.regexp.min_len:
+                self.emit("#define FILTER_ONLY 1", 1)
+        else:
+            self.emit("#define WITH_FILTER 1", 1)
+
+        self.emiti("int FILTER(unsigned char* text, int n) {")
+        l = len(key)
+        if l == 1:
+            self.emit("   return (strchr(text, %d) != NULL)" % ord(key))
+            self.emitd("}", 2)
+            return
+
+        skip = [str(l)] * 256
+        for i in range(l - 1):
+            skip[ord(key[i])] = str(l-1-i)
+
+        self.emit('static unsigned char key[] = "%s";' % key)
+        self.emiti(   "static int skip[256] = {")
+        for i in range(8):
+            i = i * 32
+            self.emit(",".join(skip[i:i+32]) + ",")
+        self.emitd(   "};")
+
+        self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l))
+        self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2)
+        self.emiti("while (i < n) {")
+        self.emit(   "c = text[i];")
+        self.emiti(  "if (c == tail) {")
+        self.emit(     "j = len - 1; k = i;")
+        self.emiti(    "while (key[--j] == text[--k]) {")
+        self.emit(       "if (j == 0) return 1;")
+        self.emitd(    "}")
+        self.emitd(  "}")
+        self.emit(   "i += skip[c];")
+        self.emitd("}")
+        self.emit( "return 0;")
+        self.emitd("}", 2)
 
     def emit_driver(self):
         self.emiti("int DFA(unsigned char *text) {")