Mercurial > hg > Members > shinya > pyrect
changeset 51:c48284580d5a
dispose op MultiByte Character as concatenated SingleByte Characters
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 10 Aug 2010 15:56:23 +0900 |
parents | d1afae06e776 |
children | abb0691e792a |
files | pyrect/grep_bench.sh pyrect/regexp/ast.py pyrect/regexp/dfa_translator.py pyrect/regexp/nfa_translator.py pyrect/regexp/parser.py pyrect/translator/c_translator.py |
diffstat | 6 files changed, 47 insertions(+), 25 deletions(-) [+] |
line wrap: on
line diff
--- a/pyrect/grep_bench.sh Mon Aug 09 15:06:24 2010 +0900 +++ b/pyrect/grep_bench.sh Tue Aug 10 15:56:23 2010 +0900 @@ -13,8 +13,8 @@ echo "[jitgrep - with out compiling]" time /tmp/jitgrep $@ > /dev/null -echo "\n[llgrep]" -time ./llgrep.py -O $@ 2> /dev/null > $llgrepout +#echo "\n[llgrep]" +#time ./llgrep.py -O $@ 2> /dev/null > $llgrepout echo "\n[cgrep]" time cgrep -E $@ > $cgrepout @@ -22,8 +22,8 @@ echo "\n[egrep]" time egrep $@ > $egrepout -echo "\n[dgrep (non-filter grep)]" -time dgrep -E $@ > $dgrepout +#echo "\n[dgrep (non-filter grep)]" +#time dgrep -E $@ > $dgrepout #echo "\n[agrep]" #time agrep $@ > $agrepout @@ -31,14 +31,14 @@ echo "\n[diff egrep jitgrep]" diff $egrepout $jitgrepout -echo "[diff egrep llgrep]" -diff $egrepout $llgrepout +#echo "[diff egrep llgrep]" +#diff $egrepout $llgrepout echo "[diff cgrep jitgrep]" diff $cgrepout $jitgrepout -echo "[diff cgrep llgrep]" -diff $cgrepout $llgrepout +#echo "[diff cgrep llgrep]" +#diff $cgrepout $llgrepout echo "\n[matches]" wc $egrepout
--- a/pyrect/regexp/ast.py Mon Aug 09 15:06:24 2010 +0900 +++ b/pyrect/regexp/ast.py Tue Aug 10 15:56:23 2010 +0900 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- +#-*- encoding: utf-8 -*- """ General-Node-set. Parser create AST (be composed of Nodes) from Regexp. @@ -114,14 +114,21 @@ class Character(InputNode): def __init__(self, char): - self.char = char + self.char = ord(char) def __str__(self): - return "'" + self.char + "'" + if not self.char in range(33, 127): # not Ascii + c = r"\\x%x" % self.char + else: + c = chr(self.char) + return "'" + c + "'" + + def __hash__(self): + return self.char.__hash__() class MBCharacter(Character): def __init__(self, mbchar): - Character.__init__(self, mbchar) + ret = Character.__init__(self, mbchar) self.bytes = map(ord, str(mbchar)) class EscapeCharacter(Character): @@ -181,5 +188,9 @@ self.lower = lower self.upper = upper + def __contains__(self, input_node): + if isinstance(input_node, Character): + self.lower + def __str__(self): - return "%s-%s" % (self.upper, self.lower) + return "%s-%s" % (self.lower, self.upper)
--- a/pyrect/regexp/dfa_translator.py Mon Aug 09 15:06:24 2010 +0900 +++ b/pyrect/regexp/dfa_translator.py Tue Aug 10 15:56:23 2010 +0900 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- +#-*- encoding: utf-8 -*- from pyrect.regexp.parser import Parser from pyrect.regexp.ast import ASTWalker @@ -41,6 +41,7 @@ while que: stateSet = que.pop() + for state in stateSet: for k, v in nfa.map.iteritems(): if state == k[0] and k[1] != '':
--- a/pyrect/regexp/nfa_translator.py Mon Aug 09 15:06:24 2010 +0900 +++ b/pyrect/regexp/nfa_translator.py Tue Aug 10 15:56:23 2010 +0900 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- +#-*- encoding: utf-8 -*- from pyrect.regexp.parser import Parser from pyrect.regexp.ast import * @@ -61,6 +61,7 @@ def visit_Union(self, union): frag1 = union.op1.accept(self) frag2 = union.op2.accept(self) + frag = frag1 | frag2 s = self.state_id frag.connect(s, '', frag1.start)
--- a/pyrect/regexp/parser.py Mon Aug 09 15:06:24 2010 +0900 +++ b/pyrect/regexp/parser.py Tue Aug 10 15:56:23 2010 +0900 @@ -1,5 +1,5 @@ #!/usr/bin/env python -# -*- encoding: utf-8 -*- +#-*- encoding: utf-8 -*- from ply import yacc import os @@ -24,6 +24,8 @@ Concat(Concat(Concat(Concat((Character:'A').(Star:('あ')*)).(Plus:('い')+)).(Qmark:('う')?)).(Character:'B')) >>> parser.parse('あい*う') Concat(Concat((MBCharacter:'あ').(Star:('い')*)).(MBCharacter:'う')) + >>> parser.parse('[a-f123]') + CharClass[(Range:'a'-'f'),(Character:'1'),(Character:'2'),(Character:'3')] """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -159,27 +161,30 @@ def p_atom7(p): 'atom : MBCHAR' - p[0] = MBCharacter(p[1]) + ret = Character(p[1][0]) + for byte in p[1][1:]: + ret = Concat(ret, Character(byte)) + p[0] = ret def p_atom8(p): 'atom : ESCAPECHAR' p[0] = EscapeCharacter(p[1]) -def p_charclass2(p): +def p_charclass1(p): 'charclass : charclass cclass' - p[0] = p[1].union(p[2]) + p[0] = p[1] + p[2] -def p_charclass1(p): +def p_charclass2(p): 'charclass : cclass' p[0] = p[1] def p_cclass1(p): 'cclass : cset' - p[0] = frozenset([p[1]]) + p[0] = (p[1],) def p_cclass2(p): 'cclass : cset DASH cset' - p[0] = frozenset([Range(p[1], p[3])]) + p[0] = (Range(p[1], p[3]),) def p_cset1(p): '''cset : NORMALCHAR @@ -196,7 +201,11 @@ def p_cset2(p): 'cset : MBCHAR' - p[0] = MBCharacter(p[1]) + ret = Character(p[1][0]) + for byte in p[1][1:]: + ret = Concat(ret, Character(byte)) + + p[0] = ret def p_error(p): raise Exception("syntax error")
--- a/pyrect/translator/c_translator.py Mon Aug 09 15:06:24 2010 +0900 +++ b/pyrect/translator/c_translator.py Tue Aug 10 15:56:23 2010 +0900 @@ -154,7 +154,7 @@ default = None if cur_state in self.cg.accepts: - eol = Character(r'\0') + eol = Character('\0') transition[eol] = "accept" self.emit_switch(transition, default) @@ -197,7 +197,7 @@ self._emit("/* %s */" % input_node.__repr__()) def visit_Character(self, char): - self._emit("case '%s':" % char.char) + self._emit("case %d:" % char.char) self._emit(" return %s(s);" % self.next) def visit_EndLine(self, endline):