Mercurial > hg > Members > shinya > pyrect

--- a/pyrect/grep_bench.sh	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/grep_bench.sh	Tue Aug 10 15:56:23 2010 +0900
@@ -13,8 +13,8 @@
 echo "[jitgrep - with out compiling]"
 time /tmp/jitgrep $@ > /dev/null

-echo "\n[llgrep]"
-time ./llgrep.py -O $@ 2> /dev/null > $llgrepout
+#echo "\n[llgrep]"
+#time ./llgrep.py -O $@ 2> /dev/null > $llgrepout

 echo "\n[cgrep]"
 time cgrep -E $@ > $cgrepout
@@ -22,8 +22,8 @@
 echo "\n[egrep]"
 time egrep    $@ > $egrepout

-echo "\n[dgrep (non-filter grep)]"
-time dgrep -E $@ > $dgrepout
+#echo "\n[dgrep (non-filter grep)]"
+#time dgrep -E $@ > $dgrepout

 #echo "\n[agrep]"
 #time agrep $@ > $agrepout
@@ -31,14 +31,14 @@
 echo "\n[diff egrep jitgrep]"
 diff $egrepout $jitgrepout

-echo "[diff egrep llgrep]"
-diff $egrepout $llgrepout
+#echo "[diff egrep llgrep]"
+#diff $egrepout $llgrepout

 echo "[diff cgrep jitgrep]"
 diff $cgrepout $jitgrepout

-echo "[diff cgrep llgrep]"
-diff $cgrepout $llgrepout
+#echo "[diff cgrep llgrep]"
+#diff $cgrepout $llgrepout

 echo "\n[matches]"
 wc $egrepout
--- a/pyrect/regexp/ast.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/ast.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-

 """
 General-Node-set. Parser create AST (be composed of Nodes) from Regexp.
@@ -114,14 +114,21 @@

 class Character(InputNode):
     def __init__(self, char):
-        self.char = char
+        self.char = ord(char)

     def __str__(self):
-        return "'" + self.char + "'"
+        if not self.char in range(33, 127): # outside printable ASCII 33..126 (space, control chars and non-ASCII are hex-escaped)
+            c = r"\\x%x" % self.char
+        else:
+            c = chr(self.char)
+        return "'" + c + "'"
+
+    def __hash__(self):
+        return self.char.__hash__()

 class MBCharacter(Character):
     def __init__(self, mbchar):
-        Character.__init__(self, mbchar)
+        ret = Character.__init__(self, mbchar)
         self.bytes = map(ord, str(mbchar))

 class EscapeCharacter(Character):
@@ -181,5 +188,9 @@
         self.lower = lower
         self.upper = upper

+    def __contains__(self, input_node):
+        if isinstance(input_node, Character):
+            self.lower
+
     def __str__(self):
-        return "%s-%s" % (self.upper, self.lower)
+        return "%s-%s" % (self.lower, self.upper)
--- a/pyrect/regexp/dfa_translator.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/dfa_translator.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-

 from pyrect.regexp.parser import Parser
 from pyrect.regexp.ast import ASTWalker
@@ -41,6 +41,7 @@
         while que:
             stateSet = que.pop()

+
             for state in stateSet:
                 for k, v in nfa.map.iteritems():
                     if state == k[0] and k[1] != '':
--- a/pyrect/regexp/nfa_translator.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/nfa_translator.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-

 from pyrect.regexp.parser import Parser
 from pyrect.regexp.ast import *
@@ -61,6 +61,7 @@
     def visit_Union(self, union):
         frag1 = union.op1.accept(self)
         frag2 = union.op2.accept(self)
+
         frag = frag1 | frag2
         s = self.state_id
         frag.connect(s, '', frag1.start)
--- a/pyrect/regexp/parser.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/regexp/parser.py	Tue Aug 10 15:56:23 2010 +0900
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# -*- encoding: utf-8 -*-
+#-*- encoding: utf-8 -*-

 from ply import yacc
 import os
@@ -24,6 +24,8 @@
     Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B'))
     >>> parser.parse('あい*う')
     Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う'))
+    >>> parser.parse('[a-f123]')
+    CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')]
     """
     BASE_DIR = os.path.dirname(os.path.abspath(__file__))

@@ -159,27 +161,30 @@

 def p_atom7(p):
     'atom : MBCHAR'
-    p[0] = MBCharacter(p[1])
+    ret = Character(p[1][0])
+    for byte in p[1][1:]:
+        ret = Concat(ret, Character(byte))
+    p[0] = ret

 def p_atom8(p):
     'atom : ESCAPECHAR'
     p[0] = EscapeCharacter(p[1])

-def p_charclass2(p):
+def p_charclass1(p):
     'charclass : charclass cclass'
-    p[0] = p[1].union(p[2])
+    p[0] = p[1] + p[2]

-def p_charclass1(p):
+def p_charclass2(p):
     'charclass : cclass'
     p[0] = p[1]

 def p_cclass1(p):
     'cclass : cset'
-    p[0] = frozenset([p[1]])
+    p[0] = (p[1],)

 def p_cclass2(p):
     'cclass : cset DASH cset'
-    p[0] = frozenset([Range(p[1], p[3])])
+    p[0] = (Range(p[1], p[3]),)

 def p_cset1(p):
     '''cset : NORMALCHAR
@@ -196,7 +201,11 @@

 def p_cset2(p):
     'cset : MBCHAR'
-    p[0] = MBCharacter(p[1])
+    ret = Character(p[1][0])
+    for byte in p[1][1:]:
+        ret = Concat(ret, Character(byte))
+
+    p[0] = ret

 def p_error(p):
     raise Exception("syntax error")
--- a/pyrect/translator/c_translator.py	Mon Aug 09 15:06:24 2010 +0900
+++ b/pyrect/translator/c_translator.py	Tue Aug 10 15:56:23 2010 +0900
@@ -154,7 +154,7 @@
                 default = None

         if cur_state in self.cg.accepts:
-            eol = Character(r'\0')
+            eol = Character('\0')
             transition[eol] = "accept"

         self.emit_switch(transition, default)
@@ -197,7 +197,7 @@
             self._emit("/* %s */" % input_node.__repr__())

         def visit_Character(self, char):
-            self._emit("case '%s':" % char.char)
+            self._emit("case %d:" % char.char)
             self._emit("  return %s(s);" % self.next)

         def visit_EndLine(self, endline):