changeset 12:41391400fe68

Add GREPTranslator(Translator) and implement jit-compile-grep, which is faster than grep!! when searching large files with a regular expression.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 04 Jul 2010 08:40:59 +0900
parents 94984eaa03e2
children fb7922f6d9ef
files src/__init__.py src/benchgrep.sh src/benchgrep.sh~ src/cTranslator.py src/cTranslator.pyc src/cbcTranslator.py src/cbcTranslator.pyc src/converter.py src/dfareg.pyc src/dotTranslator.py src/grep_translator.py src/grep_translator.pyc src/jitgrep.py src/jitgrep.py~ src/reg2llvm.pyc src/template/grep.template src/translator.py src/translator.pyc
diffstat 13 files changed, 212 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/benchgrep.sh	Sun Jul 04 08:40:59 2010 +0900
@@ -0,0 +1,15 @@
+#!/bin/sh
+# Time jitgrep.py and egrep on the same arguments, then diff their outputs.
+egrepout="/tmp/egrep.out"
+jitgrepout="/tmp/jitgrep.out"
+
+echo "[jitgrep]"
+time ./jitgrep.py "$@" > "$jitgrepout"  # "$@" passes each argument unsplit and unglobbed
+
+printf '\n[egrep]\n'  # printf: echo's treatment of "\n" differs between shells
+time egrep    "$@" > "$egrepout"
+
+printf '\n[diff egrep jitgrep]\n'
+diff "$egrepout" "$jitgrepout"
+
+#rm -f $egrepout $jitgrepout
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/benchgrep.sh~	Sun Jul 04 08:40:59 2010 +0900
@@ -0,0 +1,15 @@
+#!/bin/sh
+# Time egrep and jitgrep.py on the same arguments, then diff their outputs.
+egrepout="/tmp/egrep.out"
+jitgrepout="/tmp/jitgrep.out"
+
+echo "[egrep]"
+time egrep    "$@" > "$egrepout"  # "$@" passes each argument unsplit and unglobbed
+
+printf '\n[jitgrep]\n'  # printf: echo's treatment of "\n" differs between shells
+time ./jitgrep.py "$@" > "$jitgrepout"
+
+printf '\n[diff egrep jitgrep]\n'
+diff "$egrepout" "$jitgrepout"
+
+#rm -f $egrepout $jitgrepout
--- a/src/cTranslator.py	Sun Jul 04 00:48:24 2010 +0900
+++ b/src/cTranslator.py	Sun Jul 04 08:40:59 2010 +0900
@@ -45,8 +45,6 @@
 }\n""" % self.funType)
 
     def emit_driver(self):
-        self.emit(self.funType + 'accept(char* s);\n')
-        self.emit(self.funType + 'reject(char* s);\n')
         self.emit("""
 int main(int argc, char* argv[]) {
 \tputs(\"regexp: %s\");
@@ -77,7 +75,7 @@
 
         if default:
             self.emit( """\t\tdefault:\n\t\t\t%s%s(NULL);\n""" % (self.callType, default))
-        self.emit("\t}")
+        self.emit("\t}\n")
 
 
     def emit_state(self, cur_state, transition):
@@ -102,14 +100,18 @@
                 self.emit_switch(transition, default="reject")
             else:
                 self.emit_switch(transition)
-        self.emit("\n}\n\n")
+        self.emit("}\n\n")
+
+    def emit_initialization(self):
+        """Emit the stdio include and a prototype for each state, accept and reject."""
+        self.emit("#include <stdio.h>\n\n")
+        names = [self.modify_state_name(key) for key in self.cg.map.iterkeys()]
+        for name in names + ['accept', 'reject']:
+            self.emit(self.funType + name + "(char* s);\n")
 
     def emit_from_callgraph(self):
         # self.emit C-source code
-        self.emit("#include <stdio.h>\n")
-        for k in self.cg.map.iterkeys():
-            self.emit(self.funType + self.modify_state_name(k) + "(char* s);\n")
-
+        self.emit_initialization()
         self.emit_driver()
 
         for cur_state, transition in self.cg.map.iteritems():
Binary file src/cTranslator.pyc has changed
Binary file src/cbcTranslator.pyc has changed
Binary file src/dfareg.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/grep_translator.py	Sun Jul 04 08:40:59 2010 +0900
@@ -0,0 +1,68 @@
+from cTranslator import CTranslator
+from dfareg import Regexp, CallGraph
+
+class GREPTranslator(CTranslator):
+    """Translate a regexp call graph into the C source of a grep-like program.
+    >>> string = \"(A|B)*C\"
+    >>> reg = Regexp(string)
+    >>> dfacg = CallGraph(reg.dfa)
+    >>> tje = GREPTranslator(string, dfacg)
+    >>> tje.translate()
+    """
+    def __init__(self, regexp, cg):
+        CTranslator.__init__(self, regexp, cg)
+        self.funType = 'int '  # state functions return 1 (accept) or 0 (reject)
+        self.callType = 'return '  # prefix put before emitted calls to other states
+        self.breakStatement = ''  # presumably unneeded since every case returns - confirm
+
+    def emit_accept_state(self):
+        """Emit the C accept() function, which returns 1."""
+        self.emit("\n%saccept(char* s) {\n\treturn 1;\n}\n" % self.funType)
+
+    def emit_reject_state(self):
+        """Emit the C reject() function, which returns 0."""
+        self.emit("\n%sreject(char* s) {\n\treturn 0;\n}\n" % self.funType)
+
+    def emit_initialization(self):
+        """Emit the includes, BUFSIZE and a prototype for every state function."""
+        self.emit("#include <stdio.h>\n")
+        self.emit("#include <stdlib.h>\n")
+        self.emit("#include <string.h>\n\n")
+        self.emit("#define BUFSIZE 1024\n\n")
+        for state in self.cg.map.iterkeys():
+            self.emit(self.funType + self.modify_state_name(state) + "(char* s);\n")
+        self.emit(self.funType + 'accept(char* s);\n')
+        self.emit(self.funType + 'reject(char* s);\n')
+
+    def emit_driver(self):
+        """Emit match() (start state tried at each text offset) and the grep template."""
+        self.emit("""
+int match(char *text) {
+  do {
+    if (%s(text))
+      return 1;
+  } while (*text++ != '\\0');
+  return 0;
+}\n\n""" % (self.modify_state_name(self.cg.start)))
+        with open("template/grep.template", "r") as template:  # path is relative to the cwd
+            self.emit(template.read())
+        self.emit("\n")
+
+    def emit_state(self, cur_state, transition):
+        """Emit the C function for one state; accepting states return accept(s) at once."""
+        self.emit(self.funType + self.modify_state_name(cur_state) + "(char* s) {\n")
+        if cur_state in self.cg.accepts:
+            self.emit("\treturn accept(s);\n")
+        elif transition:
+            # Compare by value: identity of equal strings depends on interning.
+            if self.cg.type == "DFA":
+                self.emit_switch(transition, default="reject")
+            else:
+                self.emit_switch(transition)
+        self.emit("}\n\n")
+
+def test():
+    import doctest  # imported here so doctest is only loaded when tests run
+    doctest.testmod()  # with no argument, testmod() checks the __main__ module
+
+if __name__ == '__main__': test()
Binary file src/grep_translator.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/jitgrep.py	Sun Jul 04 08:40:59 2010 +0900
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import re
+from grep_translator import GREPTranslator
+from dfareg import Regexp, CallGraph
+
+def main(argv):
+    """Compile the regexp argv[1] into a matcher with gcc, then run it.
+
+    Files in argv[2:] are grepped; with none, each stdin line is matched.
+    """
+    import subprocess  # local import; argument lists bypass the shell entirely
+    if len(argv) < 2:
+        print("usage: jitgrep regexp [file ..]")
+        return
+
+    string = argv[1]
+    reg = Regexp(string)
+    dfacg = CallGraph(reg.dfa)
+    tje = GREPTranslator(string, dfacg)
+
+    srcpath = "/tmp/jitgrep_emit.c"
+    binpath = "/tmp/jitgrep_emit"
+
+    try:
+        with open(srcpath, 'w') as tmpsrc:
+            tje.translate(tmpsrc)
+        if subprocess.call(['gcc', srcpath, '-o', binpath]) != 0:
+            return  # compilation failed: there is no binary to run
+        if len(argv) == 2:
+            while True:
+                try:
+                    subprocess.call([binpath, raw_input()])
+                except (KeyboardInterrupt, EOFError):  # Ctrl-C or end of stdin
+                    break
+        else:
+            # The generated main() reads files from argv[2] on, so pad argv[1].
+            subprocess.call([binpath, 'dummy_option'] + argv[2:])
+    finally:
+        # Always clean up, including after a failed compile or an exception.
+        for path in (srcpath, binpath):
+            if os.path.exists(path):
+                os.remove(path)
+
+if __name__ == '__main__': main(sys.argv)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/jitgrep.py~	Sun Jul 04 08:40:59 2010 +0900
@@ -0,0 +1,14 @@
+import sys
+from grep_translator import GREPTranslator
+from dfareg import Regexp, CallGraph
+
+def main(argv):
+    """Emit the C matcher for a fixed sample regexp to /tmp and echo argv."""
+    string = "(gcc|fndecl|build)"
+    dfacg = CallGraph(Regexp(string).dfa)
+    tje = GREPTranslator(string, dfacg)
+    with open("/tmp/jitgrep_emit.c", "w") as tmpsrc:  # closed even on error
+        tje.translate(tmpsrc)
+    print(argv)
+
+if __name__ == '__main__': main(sys.argv)
Binary file src/reg2llvm.pyc has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/template/grep.template	Sun Jul 04 08:40:59 2010 +0900
@@ -0,0 +1,43 @@
+/* Print every line of f accepted by match(), prefixed with "name:" if name is set; return the count. */
+int grep(FILE *f, char *name) {
+  char buf[BUFSIZE];
+  int len, hits = 0;
+  while (fgets(buf, sizeof buf, f) != NULL) {  /* longer lines arrive in BUFSIZE-1 pieces */
+    len = strlen(buf);
+    if (len > 0 && buf[len-1] == '\n')
+      buf[len-1] = '\0';  /* strip the newline; it is re-added when printing */
+    if (!match(buf))
+      continue;
+    hits++;
+    if (name != NULL)
+      printf("%s:", name);
+    puts(buf);
+  }
+  return hits;
+}
+
+/* Match argv[1] itself when it is the only argument, else grep the files argv[2..]. */
+int main(int argc, char* argv[]) {
+  FILE *f;
+  int i, nmatch = 0;
+  if (argc == 2) {
+    if (match(argv[1])) {
+      puts(argv[1]);
+      nmatch++;
+    }
+    return nmatch;
+  }
+  for (i = 2; i < argc; i++) {  /* argv[1] is not used in this mode */
+    f = fopen(argv[i], "r");
+    if (f == NULL) {
+      fprintf(stderr, "can't open: %s\n", argv[i]);
+      continue;
+    }
+    /* Prefix output with the file name only when several files are given. */
+    if (grep(f, argc > 3 ? argv[i] : NULL) > 0)
+      nmatch++;
+    fclose(f);
+  }
+
+  return nmatch;  /* exit status: argv[1] matched (0/1) or number of files with a match */
+}
Binary file src/translator.pyc has changed