changeset 51:c48284580d5a

handle MultiByte Character as concatenated SingleByte Characters
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Tue, 10 Aug 2010 15:56:23 +0900
parents d1afae06e776
children abb0691e792a
files pyrect/grep_bench.sh pyrect/regexp/ast.py pyrect/regexp/dfa_translator.py pyrect/regexp/nfa_translator.py pyrect/regexp/parser.py pyrect/translator/c_translator.py
diffstat 6 files changed, 47 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/grep_bench.sh	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/grep_bench.sh	Tue Aug 10 15:56:23 2010 +0900
@@ -13,8 +13,8 @@
 echo "[jitgrep - with out compiling]"
 time /tmp/jitgrep $@ > /dev/null
 
-echo "\n[llgrep]"
-time ./llgrep.py -O $@ 2> /dev/null > $llgrepout
+#echo "\n[llgrep]"
+#time ./llgrep.py -O $@ 2> /dev/null > $llgrepout
 
 echo "\n[cgrep]"
 time cgrep -E $@ > $cgrepout
@@ -22,8 +22,8 @@
 echo "\n[egrep]"
 time egrep    $@ > $egrepout
 
-echo "\n[dgrep (non-filter grep)]"
-time dgrep -E $@ > $dgrepout
+#echo "\n[dgrep (non-filter grep)]"
+#time dgrep -E $@ > $dgrepout
 
 #echo "\n[agrep]"
 #time agrep $@ > $agrepout
@@ -31,14 +31,14 @@
 echo "\n[diff egrep jitgrep]"
 diff $egrepout $jitgrepout
 
-echo "[diff egrep llgrep]"
-diff $egrepout $llgrepout
+#echo "[diff egrep llgrep]"
+#diff $egrepout $llgrepout
 
 echo "[diff cgrep jitgrep]"
 diff $cgrepout $jitgrepout
 
-echo "[diff cgrep llgrep]"
-diff $cgrepout $llgrepout
+#echo "[diff cgrep llgrep]"
+#diff $cgrepout $llgrepout
 
 echo "\n[matches]"
 wc $egrepout
--- a/pyrect/regexp/ast.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/ast.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-
 
 """
 General-Node-set. Parser create AST (be composed of Nodes) from Regexp.
@@ -114,14 +114,21 @@
 
 class Character(InputNode):
     def __init__(self, char):
-        self.char = char
+        self.char = ord(char)
 
     def __str__(self):
-        return "'" + self.char + "'"
+        if not self.char in range(33, 127): # not Ascii
+            c = r"\\x%x" % self.char
+        else:
+            c = chr(self.char)
+        return "'" + c + "'"
+
+    def __hash__(self):
+        return self.char.__hash__()
 
 class MBCharacter(Character):
     def __init__(self, mbchar):
-        Character.__init__(self, mbchar)
+        ret = Character.__init__(self, mbchar)
         self.bytes = map(ord, str(mbchar))
 
 class EscapeCharacter(Character):
@@ -181,5 +188,9 @@
         self.lower = lower
         self.upper = upper
 
+    def __contains__(self, input_node):
+        if isinstance(input_node, Character):
+            self.lower
+
     def __str__(self):
-        return "%s-%s" % (self.upper, self.lower)
+        return "%s-%s" % (self.lower, self.upper)
--- a/pyrect/regexp/dfa_translator.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/dfa_translator.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-
 
 from pyrect.regexp.parser import Parser
 from pyrect.regexp.ast import ASTWalker
@@ -41,6 +41,7 @@
         while que:
             stateSet = que.pop()
 
+
             for state in stateSet:
                 for k, v in nfa.map.iteritems():
                     if state == k[0] and k[1] != '':
--- a/pyrect/regexp/nfa_translator.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/nfa_translator.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-
 
 from pyrect.regexp.parser import Parser
 from pyrect.regexp.ast import *
@@ -61,6 +61,7 @@
     def visit_Union(self, union):
         frag1 = union.op1.accept(self)
         frag2 = union.op2.accept(self)
+
         frag = frag1 | frag2
         s = self.state_id
         frag.connect(s, '', frag1.start)
--- a/pyrect/regexp/parser.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/parser.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-
 
 from ply import yacc
 import os
@@ -24,6 +24,8 @@
     Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B'))
     >>> parser.parse('あい*う')
     Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う'))
+    >>> parser.parse('[a-f123]')
+    CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')]
     """
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
@@ -159,27 +161,30 @@
 
 def p_atom7(p):
     'atom : MBCHAR'
-    p[0] = MBCharacter(p[1])
+    ret = Character(p[1][0])
+    for byte in p[1][1:]:
+        ret = Concat(ret, Character(byte))
+    p[0] = ret
 
 def p_atom8(p):
     'atom : ESCAPECHAR'
     p[0] = EscapeCharacter(p[1])
 
-def p_charclass2(p):
+def p_charclass1(p):
     'charclass : charclass cclass'
-    p[0] = p[1].union(p[2])
+    p[0] = p[1] + p[2]
 
-def p_charclass1(p):
+def p_charclass2(p):
     'charclass : cclass'
     p[0] = p[1]
 
 def p_cclass1(p):
     'cclass : cset'
-    p[0] = frozenset([p[1]])
+    p[0] = (p[1],)
 
 def p_cclass2(p):
     'cclass : cset DASH cset'
-    p[0] = frozenset([Range(p[1], p[3])])
+    p[0] = (Range(p[1], p[3]),)
 
 def p_cset1(p):
     '''cset : NORMALCHAR
@@ -196,7 +201,11 @@
 
 def p_cset2(p):
     'cset : MBCHAR'
-    p[0] = MBCharacter(p[1])
+    ret = Character(p[1][0])
+    for byte in p[1][1:]:
+        ret = Concat(ret, Character(byte))
+
+    p[0] = ret
 
 def p_error(p):
     raise Exception("syntax error")
--- a/pyrect/translator/c_translator.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/translator/c_translator.py	Tue Aug 10 15:56:23 2010 +0900
@@ -154,7 +154,7 @@
                 default = None
 
         if cur_state in self.cg.accepts:
-            eol = Character(r'\0')
+            eol = Character('\0')
             transition[eol] = "accept"
 
         self.emit_switch(transition, default)
@@ -197,7 +197,7 @@
             self._emit("/* %s */" % input_node.__repr__())
 
         def visit_Character(self, char):
-            self._emit("case '%s':" % char.char)
+            self._emit("case %d:" % char.char)
             self._emit("  return %s(s);" % self.next)
 
         def visit_EndLine(self, endline):