changeset 49:7f4221018adf

accept UTF-8 encoding. but some foundational bug in converting algorithm NFA. maybe, which is not too difficult.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Mon, 09 Aug 2010 04:34:13 +0900
parents dd2c7e815346
children d1afae06e776
files pyrect/regexp/ast.py pyrect/regexp/lexer.py pyrect/regexp/parser.py pyrect/translator/c_translator.py pyrect/translator/grep_translator.py pyrect/translator/template/grep.c
diffstat 6 files changed, 121 insertions(+), 148 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/regexp/ast.py	Sun Aug 08 04:14:10 2010 +0900
+++ b/pyrect/regexp/ast.py	Mon Aug 09 04:34:13 2010 +0900
@@ -121,10 +121,8 @@
 
 class MBCharacter(Character):
     def __init__(self, mbchar):
-        Character.__init__(self, unicode(mbchar))
-
-    def __str__(self):
-        return "'" + self.char.encode('utf-8') + "'"
+        Character.__init__(self, mbchar)
+        self.bytes = map(ord, str(mbchar))
 
 class EscapeCharacter(Character):
     def __init__(self, char):
--- a/pyrect/regexp/lexer.py	Sun Aug 08 04:14:10 2010 +0900
+++ b/pyrect/regexp/lexer.py	Mon Aug 09 04:34:13 2010 +0900
@@ -80,7 +80,8 @@
     return t
 
 def t_MBCHAR(t):
-    ur'[^ -~]+' # match multi byte code.
+    # match multi byte code. -> see http://ja.wikipedia.org/wiki/UTF-8
+    u'(\xc2-\xdf].|[\xe0-\xef]..|[\xe0-\xef]..|[\xf0-\xf7]...|[\xf8-\xfb]....|[\xfc-\xfd].....)'
     return t
 
 def t_error(t):
--- a/pyrect/regexp/parser.py	Sun Aug 08 04:14:10 2010 +0900
+++ b/pyrect/regexp/parser.py	Mon Aug 09 04:34:13 2010 +0900
@@ -20,33 +20,19 @@
     ((((^.'^').(('A'|'B'))?).('C')+).$)
 
     multi byte も OK!!
-    >>> parser.parse('aあいc?')
-    Concat(Concat((Character:'a').Concat((MBCharacter:'あ').(MBCharacter:'い'))).(Qmark:('c')?))
-    >>> parser.parse('[123A-Zあ]')
-    CharClass[(Character:'1'),(Character:'2'),(Range:'Z'-'A'),(Character:'3'),(MBCharacter:'あ')]
+    >>> parser.parse('Aあ*い+う?B')
+    Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B'))
+    >>> parser.parse('あい*う')
+    Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う'))
     """
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
-    @staticmethod
-    def mbparse(uni_bytes):
-        mbchars = list()
-
-        for mbchar in uni_bytes:
-            mbchars.append(MBCharacter(mbchar))
-
-        ret = mbchars[0]
-
-        for mbchar in mbchars[1:]:
-            ret = Concat(ret, mbchar)
-
-        return ret
-
     def __init__(self):
         self.yacc  = yacc.yacc(outputdir=self.BASE_DIR, debug=False)
         self.lexer = lex
 
     def parse(self, expression):
-        self.lexer.input(unicode(expression, 'utf-8'))
+        self.lexer.input(expression)
         self.ast = self.yacc.parse(lexer=self.lexer)
         return self.ast
 
@@ -173,7 +159,7 @@
 
 def p_atom7(p):
     'atom : MBCHAR'
-    p[0] = Parser.mbparse(p[1])
+    p[0] = MBCharacter(p[1])
 
 def p_atom8(p):
     'atom : ESCAPECHAR'
@@ -188,13 +174,13 @@
     p[0] = p[1]
 
 def p_cclass1(p):
+    'cclass : cset'
+    p[0] = frozenset([p[1]])
+
+def p_cclass2(p):
     'cclass : cset DASH cset'
     p[0] = frozenset([Range(p[1], p[3])])
 
-def p_cclass2(p):
-    'cclass : cset'
-    p[0] = frozenset([p[1]])
-
 def p_cset1(p):
     '''cset : NORMALCHAR
             | LPAREN
--- a/pyrect/translator/c_translator.py	Sun Aug 08 04:14:10 2010 +0900
+++ b/pyrect/translator/c_translator.py	Mon Aug 09 04:34:13 2010 +0900
@@ -20,7 +20,7 @@
         if fa == "DFA":
             self.cg = regexp.dfacg
         self.debug = False
-        self.special_type = (Range, Anchor)
+        self.special_rule = (Range, BegLine, MBCharacter)
         self.trans_stmt = self._trans_stmt(self.emit)
 
     def state_name(self, name):
@@ -30,24 +30,39 @@
             return "state_"+str(name)
 
     def emit_accept_state(self):
-        self.emiti("void accept(char* s) {")
-        self.emit(  r'printf("string matches regexp. \n\n");')
+        self.emiti("int accept(unsigned char* s) {")
+        self.emit(   "return 1;")
         self.emitd("}", 2)
 
     def emit_reject_state(self):
-        self.emiti("void reject(char* s) {")
-        self.emit(  r'printf("string matches regexp. \n\n");')
+        self.emiti("int reject(unsigned char* s) {")
+        self.emit(   "return 0;")
         self.emitd("}", 2)
 
+    def emit_skip(self):
+        self.emiti("const char skip_tbl[256] = {")
+        self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,")
+        self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,")
+        self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,")
+        self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,")
+        self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,")
+        self.emit("1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,")
+        self.emit("2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,")
+        self.emit("3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,")
+        self.emit("};")
+        self.emitd("#define SKIP(s) ((s) + skip_tbl[*(unsigned char *)s])", 2)
+        #self.emitd("#define SKIP(s) s+1", 2)
+
     def emit_driver(self):
-        self.emiti("int main(int argc, char* argv[]) {")
+        self.emiti("int main(int argc, unsigned char* argv[]) {")
         self.emit(   'buf = argv[1];')
-        self.emit(   'puts("regexp: %s");"' % self.regexp.regexp)
+        self.emit(   'puts("regexp: %s");' % self.regexp.regexp)
         self.emit(   'puts("number of state: %d");' % len(self.cg.states))
-        self.emit(  r'printf(\"string: %s\n\", argv[1]);')
-        self.emit(   "%s(argv[1]);" % self.state_name(self.cg.start))
-        if self.cg.type == "NFA":
-            self.emit("reject(argv[1]);")
+        self.emit(  r'printf("string: %s\n", argv[1]);')
+        self.emit0(  "if (%s(argv[1]))" % self.state_name(self.cg.start))
+        self.emit(  r'printf("accept: regexp matches string. \n\n");')
+        self.emit0(  "else ")
+        self.emit( r'printf("reject: regexp not matches string. \n\n");')
         self.emit(   "return 0;")
         self.emitd("}", 2)
 
@@ -73,13 +88,13 @@
             string = string[2:]
         if len(string) == 1:
             ptr = "'%s'" % string[0]
-            cmp_stmt.append(("char *", offset, ptr))
+            cmp_stmt.append(("unsigned char *", offset, ptr))
             offset += 1
 
         self.emit()
         self.emit0("if (")
         for stmt in cmp_stmt:
-            self.emit0("*(%s)((char *)s+%d) == %s" % stmt)
+            self.emit0("*(%s)((unsigned char *)s+%d) == %s" % stmt)
             if stmt != cmp_stmt[-1]:
                 self.emit(" && ")
         self.emiti(")")
@@ -99,12 +114,16 @@
         self.emitd()
 
     def emit_strcmp3(self, string, next):
-        self.emit('static char* string = \"%s\";' % string)
+        self.emit('static unsigned char* string = \"%s\";' % string)
         self.emiti("if (memcmp(string, s, %d) == 0)" % len(string))
         self.emit("return %s(s+%d);" % (self.state_name(next), len(string)))
         self.emitd()
 
     def emit_switch(self, case, default=None):
+        if not case:
+            if default:
+                self.emit("return %s(s);" % default)
+            return
         self.emiti("switch(*s++) {")
         for case, next_ in case.iteritems():
             self.trans_stmt.emit(case, self.state_name(next_))
@@ -113,40 +132,47 @@
         self.emitd("}")
 
     def emit_state(self, cur_state, transition):
-        self.emiti("int %s(char* s) {" % self.state_name(cur_state))
+        self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state))
 
         if self.debug:
             self.emit(r'printf("state: %s, input: %%s\n", s);' % cur_state)
         if self.cg.type == "NFA":
+            default = None
             if '' in transition:
                 epsilon_transition = transition.pop('')
                 for n in epsilon_transition:
                     self.emit("\t%s%s(s);\n" % (self.callType, self.state_name(n)))
+        else:
+            default = "reject"
+
+        any_ = None
+
+        for input_ in transition.keys():
+            if type(input_) in self.special_rule:
+                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
+            elif type(input_) is AnyChar:
+                any_ = (input_, self.state_name(transition.pop(input_)))
+                default = None
 
         if cur_state in self.cg.accepts:
             eol = Character(r'\0')
             transition[eol] = "accept"
 
-        for input_ in transition.keys():
-            for special in self.special_type:
-                if isinstance(input_, special):
-                    self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
+        self.emit_switch(transition, default)
 
-        if transition:
-            if self.cg.type == "DFA":
-                self.emit_switch(transition, default="reject")
-            else:
-                self.emit_switch(transition)
+        if any_:
+            self.trans_stmt.emit(any_[0], any_[1])
 
         self.emitd("}", 2)
 
     def emit_initialization(self):
         self.emit("#include <stdio.h>")
         for state in self.cg.map.iterkeys():
-            self.emit("int %s(char* s);" % self.state_name(state))
-        self.emit('int accept(char* s);')
-        self.emit('int reject(char* s);')
-        self.emit('char* buf;', 2)
+            self.emit("int %s(unsigned char* s);" % self.state_name(state))
+        self.emit('int accept(unsigned char* s);')
+        self.emit('int reject(unsigned char* s);')
+        self.emit('unsigned char* buf;')
+        self.emit_skip()
 
     def emit_from_callgraph(self):
         # self.emit C-source code
@@ -171,22 +197,27 @@
             self._emit("/* UNKNOW RULE */")
             self._emit("/* %s */" % input_node.__repr__())
 
-        def visit_MBCharacter(self, mbchar):
-            self.visit(mbchar)
-
         def visit_Character(self, char):
             self._emit("case '%s':" % char.char)
             self._emit("  return %s(s);" % self.next)
 
-        def visit_BegLine(self, begline):
-            self._emit("if (s == buf)")
-            self._emit("  return %s(s);" % self.next)
-
         def visit_EndLine(self, endline):
-            self._emit(r"if (*s == '\0')")
+            self._emit(r"case '\0':")
             self._emit("  return %s(s);" % self.next)
 
         # Special Rule
+
+        def visit_MBCharacter(self, mbchar):
+            self._emit("/* match %s  */" % mbchar)
+            bytes = mbchar.bytes
+            self._emit("  if(%s)" % \
+                       " && ".join(["*(s+%d) == 0x%x" % (d, x) for d, x in enumerate(bytes)]))
+            self._emit("    return %s(s+%d);" % (self.next, len(bytes)), 2)
+
+        def visit_BegLine(self, begline):
+            self._emit("if (s == buf)")
+            self._emit("  return %s(s);" % self.next, 2)
+
         def visit_Range(self, range):
             if isinstance(range.lower, MBCharacter) and not \
                isinstance(range.upper, MBCharacter) or  \
@@ -197,8 +228,13 @@
             if isinstance(range.lower, MBCharacter):
                 self.visit(range)
             else:
-                self._emit("if ('%s' <= *s && *s =< '%s')" % (range.lower.char, range.upper.char))
-                self._emit("  return %s(s+1);" % self.next)
+                self._emit("if ('%s' <= *s && *s <= '%s')" % (range.lower.char, range.upper.char))
+                self._emit("  return %s(s+1);" % self.next, 2)
+
+        def visit_AnyChar(self, anychar):
+            self._emit(r"if (*s != '\0')")
+            self._emit(   "return %s(SKIP(s));" % self.next, 2)
+            self._emit("return reject(s);")
 
 def test():
     import doctest
--- a/pyrect/translator/grep_translator.py	Sun Aug 08 04:14:10 2010 +0900
+++ b/pyrect/translator/grep_translator.py	Mon Aug 09 04:34:13 2010 +0900
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-import copy
+import os
 from c_translator import CTranslator
 from pyrect.regexp import Regexp
 
@@ -18,12 +18,10 @@
     >>> tje.translate()
     """
 
+    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+
     def __init__(self, regexp):
-        CTranslator.__init__(self, regexp)
-        self.funType = 'int '
-        self.callType = 'return '
-        self.breakStatement = ''
-        self.begline = False
+        CTranslator.__init__(self, regexp, fa="DFA")
         self.__bufsize = 1024
 
     def getbufsize(self,):
@@ -33,83 +31,36 @@
 
     bufsize = property(getbufsize, setbufsize)
 
-    def emit_accept_state(self):
-        self.emit ("""
-%saccept(char* s) {
-\treturn 1;
-}\n""" % self.funType)
-
-    def emit_reject_state(self):
-        self.emit ("""
-%sreject(char* s) {
-\treturn 0;
-}\n""" % self.funType)
-
     def emit_initialization(self):
-        self.emit("#include <stdio.h>\n")
-        self.emit("#include <stdlib.h>\n")
-        self.emit("#include <string.h>\n\n")
-
-        self.emit("#define LINEBUFSIZE 1024\n")
-        self.emit("#define READBUFSIZE %d\n\n" % (self.bufsize))
-        self.emit("char readbuf[%d];\n\n" % (self.bufsize))
-
-        self.emit("%sDFA(char* s);\n" % (self.funType))
-        for state in self.cg.map.iterkeys():
-            self.emit(self.funType + self.state_name(state) + "(char* s);\n")
-        self.emit(self.funType + 'accept(char* s);\n')
-        self.emit(self.funType + 'reject(char* s);\n')
-        grepsource = open("template/grep.c")
+        CTranslator.emit_initialization(self)
+        self.emit("#define LINEBUFSIZE 1024")
+        self.emit("#define READBUFSIZE %d" % (self.bufsize))
+        self.emit("#include <stdlib.h>")
+        self.emit("#include <string.h>")
+        self.emit("char readbuf[%d];" % (self.bufsize))
+        self.emit("int DFA(unsigned char* s);", 2)
+        grepsource = open(self.BASE_DIR + "/template/grep.c")
         self.emit(grepsource.read())
 
     def emit_filter(self):
         pass
 
-    def emit_matcher(self):
-        self.emit("int match(char *regexp, char *text) {\n")
-        if self.begline:
-            self.emit("\t\treturn DFA(text);\n")
-        else:
-            self.emit("""
-\tdo {
-\t\tif  (DFA(text))
-\t\t\treturn 1;
-\t} while (*text++ != '\\0');
-\treturn 0;
-""")
-        self.emit("}\n\n")
-
     def emit_driver(self):
-        self.emit("""
-int main(int argc, char* argv[]) {
-\treturn grepmain(argc, argv);
-}\n\n""")
-        self.emit_matcher()
-        self.emit("""
-%sDFA(char *s) {
-  return %s(s);
-}\n\n""" % (self.funType, self.state_name(self.cg.start)))
+        self.emiti("int DFA(unsigned char *text) {")
+        self.emiti(  "do {")
+        self.emiti(  "if(%s(text))" % self.state_name(self.cg.start))
+        self.emit(     "return 1;")
+        self.emitd( r"} while (*text++ != '\0');")
+        self.emit("return 0;")
+        self.emitd("}", 2)
 
     def emit_state(self, cur_state, transition):
-        transition = copy.deepcopy(transition)
-        self.emit(self.funType + self.state_name(cur_state) + "(char* s) {\n")
-        strings = [x for x in transition.keys() if len(x) > 1]
-        if strings:
-            self.emit("\tchar *ls;\n")
-            for string in strings:
-                self.emit_strcmp2(string, transition.pop(string))
-
         if cur_state in self.cg.accepts:
-            self.emit("\treturn accept(s);\n")
+            self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state))
+            self.emit(   "return accept(s);")
+            self.emitd("}")
         else:
-            if transition:
-                if self.cg.type == "DFA":
-                    self.emit_switch(transition, default="reject")
-                else:
-                    self.emit_switch(transition)
-            else:
-                self.emit("\t return reject(s);\n")
-        self.emit("}\n\n")
+            CTranslator.emit_state(self, cur_state, transition)
 
 def test():
     import doctest
--- a/pyrect/translator/template/grep.c	Sun Aug 08 04:14:10 2010 +0900
+++ b/pyrect/translator/template/grep.c	Mon Aug 09 04:34:13 2010 +0900
@@ -1,22 +1,23 @@
-int grep(char * regexp, FILE *f, char *name) {
+int grep(char *regexp, FILE *f, char *name) {
   int n, nmatch;
-  char buf[LINEBUFSIZE];
+  char lbuf[LINEBUFSIZE];
+  buf = (unsigned char *)lbuf;
   nmatch = 0;
-  while (fgets(buf, sizeof buf, f) != NULL) {
-    n = strlen(buf);
+  while (fgets(lbuf, sizeof lbuf, f) != NULL) {
+    n = strlen(lbuf);
     if (n > 0 && buf[n-1] == '\n')
-      buf[n-1] = '\0';
-    if (match(regexp, buf)) {
+      lbuf[n-1] = '\0';
+    if (DFA(buf)) {
       nmatch++;
       if (name != NULL)
         printf("%s:", name);
-      printf("%s\n", buf);
+      printf("%s\n", lbuf);
     }
   }
   return nmatch;
 }
 
-int grepmain(int argc, char* argv[]) {
+int main(int argc, char* argv[]) {
   int i, nmatch;
   FILE *f;
 
@@ -27,7 +28,7 @@
   nmatch = 0;
   if (argc == 2) {
     if (grep(argv[1], stdin, NULL))
-      nmatch;
+      nmatch++;
   } else {
     for (i = 2; i < argc; i++) {
       f = fopen(argv[i], "r");