changeset 59:fd3d0b8326fe fgest stable

implement regexp-syntax any-char ('.').
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Thu, 04 Nov 2010 17:06:44 +0900
parents 81337db23999
children 5ab54a732ddb
files pyrect/jitgrep.py pyrect/regexp/ast.py pyrect/regexp/dfa_translator.py pyrect/regexp/nfa_translator.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c
diffstat 6 files changed, 70 insertions(+), 29 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/jitgrep.py	Mon Nov 01 14:50:52 2010 +0900
+++ b/pyrect/jitgrep.py	Thu Nov 04 17:06:44 2010 +0900
@@ -70,7 +70,7 @@
     else:
         grept = GREPTranslator(reg)
         grept.filter = not opts.no_filter
-        grept.thread_num = int(opts.thread)
+        grept.thread_line = int(opts.thread)
     grept.bufsize = bufsize
 
     if opts.dump:
--- a/pyrect/regexp/ast.py	Mon Nov 01 14:50:52 2010 +0900
+++ b/pyrect/regexp/ast.py	Thu Nov 04 17:06:44 2010 +0900
@@ -102,7 +102,7 @@
         return FixedString(self, other)
 
     def __hash__(self):
-        return id(self)
+        return id(self.__str__())
 
     def __cmp__(self, other):
         if self.__hash__() == other.__hash__():
--- a/pyrect/regexp/dfa_translator.py	Mon Nov 01 14:50:52 2010 +0900
+++ b/pyrect/regexp/dfa_translator.py	Thu Nov 04 17:06:44 2010 +0900
@@ -2,7 +2,7 @@
 #-*- encoding: utf-8 -*-
 
 from pyrect.regexp.parser import Parser
-from pyrect.regexp.ast import ASTWalker
+from pyrect.regexp.ast import ASTWalker, AnyChar
 from pyrect.regexp.nfa import NFA
 from pyrect.regexp.nfa_translator import NFATranslator
 from pyrect.regexp.dfa import DFA
@@ -40,13 +40,20 @@
 
         while que:
             states = que.pop()
+            anys = set()
+            for state in states:
+                for k, v in nfa.map.iteritems():
+                    if state == k[0] and k[1] == AnyChar():
+                        anys.update(nfa.epsilon_expand(v))
 
             for state in states:
                 for k, v in nfa.map.iteritems():
                     if state == k[0] and k[1] != '':
-                        slot = map_.setdefault((states, k[1]), set())
+                        default = set(anys)
+                        slot = map_.setdefault((states, k[1]), default)
                         slot.update(nfa.epsilon_expand(v))
 
+
             done.add(states)
 
             for v in map_.itervalues():
--- a/pyrect/regexp/nfa_translator.py	Mon Nov 01 14:50:52 2010 +0900
+++ b/pyrect/regexp/nfa_translator.py	Thu Nov 04 17:06:44 2010 +0900
@@ -63,6 +63,7 @@
         frag2 = union.op2.accept(self)
 
         frag = frag1 | frag2
+
         s = self.state_id
         frag.connect(s, '', frag1.start)
         frag.connect(s, '', frag2.start)
--- a/pyrect/translator/grep_translator.py	Mon Nov 01 14:50:52 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Thu Nov 04 17:06:44 2010 +0900
@@ -23,8 +23,8 @@
     def __init__(self, regexp):
         CTranslator.__init__(self, regexp, fa="DFA")
         self.__bufsize = 1024 * 1024
-        self.parallel_match = False
-        self.thread_num = 0
+        self.thread_dfa = 1
+        self.thread_line = 1
         self.filter = True
 
     def getbufsize(self,):
@@ -36,21 +36,24 @@
 
     def emit_initialization(self):
         CTranslator.emit_initialization(self)
+        if self.thread_dfa > 1 and self.regexp.max_len != float("inf"):
+            self.emit("#define DFA paradfa")
+            self.emit("#define THREAD_NUM %d" % self.thread_dfa)
+            self.emit("#define REG_MAX_LEN %d" % self.regexp.max_len)
+        else:
+            self.emit("#define DFA dfa")
+            self.emit("#define THREAD_NUM 1 // no threading")
+            self.emit("#define REG_MAX_LEN -1")
 
-        if self.thread_num > 1:
-            self.emit("#define GREP paragrep")
-        else:
-            self.emit("#define GREP grep")
-
+        self.emit("#define GREP grep")
         self.emit("#define LINEBUFSIZE %d" % self.bufsize)
         self.emit("#define READBUFSIZE %d" % self.bufsize)
-        self.emit('#define THREAD_NUM %d' % self.thread_num)
-        self.emit('#define THREAD_BUF %d' % 3)
         self.emit('#include <pthread.h>')
         self.emit("#include <stdlib.h>")
         self.emit("#include <string.h>")
         self.emit("char readbuf[%d];" % (self.bufsize))
-        self.emit("int DFA(unsigned char* s);", 2)
+        self.emit("int dfa(unsigned char* s, int len);", 2)
+        self.emit("int paradfa(unsigned char* s, int len);", 2)
 
         if self.filter and self.regexp.must_words:
             self.emit_filter(self.regexp.must_words)
@@ -73,7 +76,8 @@
         else:
             self.emit("#define WITH_FILTER 1", 1)
 
-        self.emiti("int FILTER(unsigned char* text, int n) {")
+        self.emit("#define FILTER bm_filter", 2)
+        self.emiti("int bm_filter(unsigned char* text, int n) {")
         l = len(key)
         if l == 1:
             self.emit("   return (strchr(text, %d) != NULL)" % ord(key))
@@ -107,13 +111,14 @@
         self.emitd("}", 2)
 
     def emit_driver(self):
-        self.emiti("int DFA(unsigned char *text) {")
-        self.emiti(  "do {")
-        self.emiti(  "if(%s(text))" % self.state_name(self.cg.start))
-        self.emit(     "return 1;")
-        self.emitd( r"} while (*text++ != '\0');")
-        self.emitd("return 0;")
-        self.emitd("}", 2)
+        self.emiti("int dfa(unsigned char *text, int len) {")
+        self.emit(   "len++; //try match for n+1 times.")
+        self.emiti(  "while (len--) {")
+        self.emit(     "if (%s(text++)) return 1;" % self.state_name(self.cg.start))
+        self.emitd(  "}")
+        self.emit(   "return 0;")
+        self.emitd("}")
+        return
 
     def emit_state(self, cur_state, transition):
         if cur_state in self.cg.accepts:
--- a/pyrect/translator/template/grep.c	Mon Nov 01 14:50:52 2010 +0900
+++ b/pyrect/translator/template/grep.c	Thu Nov 04 17:06:44 2010 +0900
@@ -1,14 +1,43 @@
 typedef struct _thread_arg {
   unsigned char *buf;
+  int len;
   int match;
 } thread_arg_t;
 
 void* thread_dfa(void *arg) {
   thread_arg_t* targ = (thread_arg_t*)arg;
-  targ->match = DFA(targ->buf);
+  targ->match = DFA(targ->buf, targ->len);
   return NULL;
 }
 
+int paradfa(unsigned char *text, int len) {
+  pthread_t hundle[THREAD_NUM];
+  thread_arg_t targ[THREAD_NUM];
+
+  if (len + REG_MAX_LEN <= REG_MAX_LEN)
+    return dfa(text, len);
+
+  int i, t_len = (len + THREAD_NUM - 1) / THREAD_NUM;
+  for (i = 0; i < THREAD_NUM; i++) {
+    targ[i].buf = text + (unsigned char)(i * t_len);
+    targ[i].len = t_len + REG_MAX_LEN;
+  }
+  targ[THREAD_NUM - 1].len = len - (t_len * (THREAD_NUM - 1));
+
+  for (i = 0; i < THREAD_NUM; i++) {
+    pthread_create(&hundle[i], NULL, (void *)thread_dfa, (void *)&targ[i]);
+  }
+
+  for (i = 0; i < THREAD_NUM; i++) {
+    pthread_join(hundle[i], NULL);
+  }
+
+  for (i = 0; i < THREAD_NUM; i++) {
+    if (targ[i].match) return 1;
+  }
+  return 0;
+}
+
 int paragrep(char *regexp, FILE *f, char *name) {
   int nmatch, used_buf = 0,
     reading = 1;
@@ -25,13 +54,12 @@
         n = strlen(lbuf[i]);
         if (n > 0 && lbuf[i][n-1] == '\n')
           lbuf[i][n-1] = '\0';
+        targ[i].buf = (unsigned char *)lbuf[i];
+        targ[i].len = n - 1;
+        pthread_create(&hundle[i], NULL, (void *)thread_dfa, (void *)&targ[i]);
       }
     }
     for (j = 0; j < i; j++) {
-      targ[j].buf = (unsigned char *)lbuf[j];
-      pthread_create(&hundle[j], NULL, (void *)thread_dfa, (void *)&targ[j]);
-    }
-    for (j = 0; j < i; j++) {
       pthread_join(hundle[j], NULL);
       if (targ[j].match) {
         nmatch++;
@@ -57,9 +85,9 @@
 #ifdef FILTER_ONLY
     if (FILTER(buf, n-1)) {
 #elif  defined(WITH_FILTER)
-    if (FILTER(buf, n-1) && DFA(buf)) {
+    if (FILTER(buf, n-1) && DFA(buf, n-1)) {
 #else
-    if (DFA(buf)) {
+    if (DFA(buf, n-1)) {
 #endif
       nmatch++;
       if (name != NULL)