diff pyrect/translator/grep_translator.py @ 62:a05baa7dc7ba

Modify the I/O routine to use mmap; it is much faster than fgets ;-)
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Fri, 05 Nov 2010 01:37:35 +0900
parents fd3d0b8326fe
children c981dc66b258 bee3a64d6cbc b02b321d0e06
line wrap: on
line diff
--- a/pyrect/translator/grep_translator.py	Thu Nov 04 22:04:34 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Fri Nov 05 01:37:35 2010 +0900
@@ -3,6 +3,7 @@
 import os
 from c_translator import CTranslator
 from pyrect.regexp import Regexp, Analyzer
+from pyrect.regexp.ast import ASTWalker, AnyChar, Character, MBCharacter
 
 class GREPTranslateExeption(Exception):
     pass
@@ -26,6 +27,8 @@
         self.thread_dfa = 1
         self.thread_line = 1
         self.filter = True
+        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
+        self.args = "beg, buf, end"
 
     def getbufsize(self,):
         return self.__bufsize
@@ -35,28 +38,26 @@
     bufsize = property(getbufsize, setbufsize)
 
     def emit_initialization(self):
-        CTranslator.emit_initialization(self)
-        if self.thread_dfa > 1 and self.regexp.max_len != float("inf"):
-            self.emit("#define DFA paradfa")
-            self.emit("#define THREAD_NUM %d" % self.thread_dfa)
-            self.emit("#define REG_MAX_LEN %d" % self.regexp.max_len)
-        else:
-            self.emit("#define DFA dfa")
-            self.emit("#define THREAD_NUM 1 // no threading")
-            self.emit("#define REG_MAX_LEN -1")
-
+        self.emit("#include <stdio.h>")
         self.emit("#define GREP grep")
-        self.emit("#define LINEBUFSIZE %d" % self.bufsize)
-        self.emit("#define READBUFSIZE %d" % self.bufsize)
-        self.emit('#include <pthread.h>')
+        self.emit("#define UCHARP unsigned char *")
         self.emit("#include <stdlib.h>")
+        self.emit("#include <sys/mman.h>")
+        self.emit("#include <sys/types.h>")
+        self.emit("#include <sys/stat.h>")
+        self.emit("#include <fcntl.h>")
         self.emit("#include <string.h>")
-        self.emit("char readbuf[%d];" % (self.bufsize))
-        self.emit("int dfa(unsigned char* s, int len);", 2)
-        self.emit("int paradfa(unsigned char* s, int len);", 2)
+
+        self.emit_skip()
 
-        if self.filter and self.regexp.must_words:
-            self.emit_filter(self.regexp.must_words)
+        for state in self.cg.map.iterkeys():
+            self.emit("void %s(%s);" % (self.state_name(state), self.interface))
+        self.emit('void accept(%s);' % self.interface)
+        self.emit('void reject(%s);' % self.interface)
+        self.emit("void dfa(%s);" % self.interface, 2)
+
+        #if self.filter and self.regexp.must_words:
+        #    self.emit_filter(self.regexp.must_words)
 
         grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
@@ -72,15 +73,15 @@
 
         if len(words) == 1:
             if len(key) == self.regexp.min_len:
-                self.emit("#define FILTER_ONLY 1", 1)
+                self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1)
         else:
-            self.emit("#define WITH_FILTER 1", 1)
+            self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1)
 
         self.emit("#define FILTER bm_filter", 2)
-        self.emiti("int bm_filter(unsigned char* text, int n) {")
+        self.emiti("int bm_filter(unsigned char* buf, int n) {")
         l = len(key)
         if l == 1:
-            self.emit("   return (strchr(text, %d) != NULL)" % ord(key))
+            self.emit("   return (strchr(buf, %d) != NULL)" % ord(key))
             self.emitd("}", 2)
             return
 
@@ -98,10 +99,10 @@
         self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l))
         self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2)
         self.emiti("while (i < n) {")
-        self.emit(   "c = text[i];")
+        self.emit(   "c = buf[i];")
         self.emiti(  "if (c == tail) {")
         self.emit(     "j = len - 1; k = i;")
-        self.emiti(    "while (key[--j] == text[--k]) {")
+        self.emiti(    "while (key[--j] == buf[--k]) {")
         self.emit(       "if (j == 0) return 1;")
         self.emitd(    "}")
         self.emitd(  "}")
@@ -111,22 +112,104 @@
         self.emitd("}", 2)
 
     def emit_driver(self):
-        self.emiti("int dfa(unsigned char *text, int len) {")
-        self.emit(   "len++; //try match for n+1 times.")
-        self.emiti(  "while (len--) {")
-        self.emit(     "if (%s(text++)) return 1;" % self.state_name(self.cg.start))
-        self.emitd(  "}")
-        self.emit(   "return 0;")
+        self.emiti("void dfa(%s) {" % self.interface)  # C entry point dfa(): it only calls the start state
+        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))  # forwards beg, buf, end unchanged
+        self.emit(   "return;")
         self.emitd("}")
         return
 
+    def emit_accept_state(self):  # Emits the C accept(): find the end of the matched line, print it, resume at the start state.
+        self.emiti("void accept(%s) {" % self.interface)
+        self.emit(   "buf--;")
+        self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")  # NOTE(review): length (buf - end) looks inverted; presumably (end - buf) -- confirm
+        self.emit(   'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}')
+        self.emiti(  "if (ret > end) {")  # newline found past `end`: print once and stop scanning
+        self.emit(     "ret--;")
+        self.emit(     "print_line(beg, ret);")
+        self.emit(     "return;")
+        self.emitd(  "}")
+        self.emit(   "print_line(beg, ret);")
+        self.emit(   "beg = buf = ret + 1;")  # the next line starts right after the newline
+        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emitd("}", 2)
+
+    def emit_reject_state(self):  # Emits the C reject(): stop at `end`, otherwise restart matching at buf.
+        self.emiti("void reject(%s) {" % self.interface)
+        self.emit(   "if (buf >= end) return;")
+        self.emit(   "beg = buf;")  # buf becomes the new line start (beg) for the next attempt
+        self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
+        self.emitd("}", 2)
+
+    def emit_switch(self, case, default=None):
+        """Emit a C switch on *buf++ over `case` (input -> next state); other input jumps to `default`."""
+        if not case:
+            if default:
+                self.emit("return %s(%s);" % (default, self.args))
+            return
+        self.emiti("switch(*buf++) {")
+        for input_, next_ in case.iteritems():
+            self.trans_stmt.emit(input_, self.state_name(next_))
+        if default:  # any default gets a label, so an AnyChar target is no longer dropped
+            self.emit("default: return %s(%s);" % (default, self.args))
+        self.emitd("}")
+
     def emit_state(self, cur_state, transition):
+        self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface))
+        # One C function is emitted per state; an accepting state only hands over to accept().
         if cur_state in self.cg.accepts:
-            self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state))
-            self.emit(   "return accept(s);")
-            self.emitd("}")
-        else:
-            CTranslator.emit_state(self, cur_state, transition)
+            self.emit(   "return accept(beg, buf-1, end);")
+            self.emitd("}", 2)
+            return
+
+        default = self.state_name(self.cg.start)  # default target handed to emit_switch
+        for eol in self.eols:
+            transition[eol] = "reject"  # end-of-line inputs lead to reject(); this mutates the caller's dict
+
+        for input_ in transition.keys():  # Python 2 keys() is a list copy, so popping inside the loop is safe
+            if type(input_) in self.special_rule:
+                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))  # emitted before the switch
+            elif type(input_) is AnyChar:
+                default = self.state_name(transition.pop(input_))  # AnyChar's target replaces the default
+
+        self.emit_switch(transition, default)
+
+        self.emitd("}", 2)
+
+    class _trans_stmt(ASTWalker):  # Visitor that writes the C statement for one transition (input node -> next state).
+        def __init__(self, emit):
+            self._emit = emit  # callable used to write the generated C lines
+            self.args = "beg, buf, end"  # C argument list forwarded to the next state
+
+        def emit(self, input_node, next_):
+            self.next = next_  # C function name the visit_* methods jump to
+            input_node.accept(self)
+
+        def visit(self, input_node):  # presumably the ASTWalker fallback for unhandled node types -- confirm
+            self._emit("/* UNKNOW RULE */")
+            self._emit("/* %s */" % input_node.__repr__())
+
+        def visit_Character(self, char):  # one `case <code>:` label that jumps to the next state
+            self._emit("case %d: /* match %s */" % (char.char, char))
+            self._emit("  return %s(%s);" % (self.next, self.args))
+
+        # Special rules: written as plain `if` statements rather than `case` labels.
+        def visit_BegLine(self, begline):
+            self._emit("/* begin of line  */")
+            self._emit("if (buf == beg)")
+            self._emit("  return %s(%s);" % (self.next, self.args), 2)
+
+        def visit_Range(self, range):
+            """Emit an `if` range test on *buf; mixed multibyte/single-byte bounds emit nothing."""
+            lower_mb = isinstance(range.lower, MBCharacter)
+            upper_mb = isinstance(range.upper, MBCharacter)
+            if lower_mb != upper_mb:
+                return
+            if lower_mb:
+                return self.visit(range)  # multibyte ranges are not translated; leave the fallback comment
+            bounds = (range.lower.char, range.upper.char)
+            lo, hi = [c if isinstance(c, int) else ord(c) for c in bounds]
+            self._emit("if (%d <= *buf && *buf <= %d)" % (lo, hi))  # numeric codes, as in visit_Character
+            self._emit("  return %s(beg, buf+1, end);" % self.next, 2)
 
 def test():
     import doctest