changeset 39:43b277a00905

add Lexer/Parser/AST class. it's can parse Regexp to AST. (Pyrect will shift to more flexible/robust system.)
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Tue, 13 Jul 2010 07:53:28 +0900
parents 06826250198b
children 962ae4154724
files pyrect/llgrep.py pyrect/regexp/__init__.py pyrect/regexp/ast.py pyrect/regexp/kwset.py pyrect/regexp/lexer.py pyrect/regexp/parser.py
diffstat 6 files changed, 306 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/llgrep.py	Mon Jul 12 06:24:57 2010 +0900
+++ b/pyrect/llgrep.py	Tue Jul 13 07:53:28 2010 +0900
@@ -14,8 +14,6 @@
     psr = OptionParser(usage=myusage)
 
     redirect = ""
-    srcpath = "/tmp/jitgrep_dfa.c"
-    binpath = "/tmp/jitgrep"
 
     psr.add_option("-O", action="store_true", dest="optimize", default=False, help="Print compile/matching time.")
     psr.add_option("--time", action="store_true", dest="time", default=False, help="Print compile/matching time.")
@@ -49,11 +47,12 @@
         end_time = time.time()
         print("Compiling  : " + str(end_time - start_time) + " Sec.")
 
-    if (opts.time): redirect = "> /dev/null"
-    if (opts.out):  redirect = ">" + opts.out
-
+    if (opts.time): sys.stdout = open("/dev/null", "r")
+    if (opts.out):  sys.stdout = open(opts.out, "r")
+    sys.stderr = open("/dev/null", "r")
     if (opts.time): start_time = time.time()
     grept.execute()
+    sys.stdout = sys.__stdout__
     if (opts.time):
         end_time = time.time()
         print("Matching   : " + str(end_time - start_time) + " Sec.")
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrect/regexp/__init__.py	Tue Jul 13 07:53:28 2010 +0900
@@ -0,0 +1,6 @@
+#!/usr/env/bin python
+
+from parser import Parser
+from node import *
+
+__all__ = [Parser, "node"]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrect/regexp/ast.py	Tue Jul 13 07:53:28 2010 +0900
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+"""
+General-Node-set. Parser create AST (be composed of Nodes) from Regexp.
+Node are Printable, and Keywords Countable(kwset_node).
+"""
+
+class ASTWalker():
+    def visit(self, ast):
+        pass
+
+# AST-Nodes
+class Node(object):
+    def __init__(self):
+        pass
+    def __str__(self):
+        return str(self.__class__)
+    def accept(self, visitor):
+        visit = "visit_%s" % self.__class__.__name__
+        return getattr(visitor, visit, visitor.visit)(self)
+
+class Character(Node):
+    def __init__(self, char):
+        self.char = char
+
+    def __str__(self):
+        return "'" + self.char + "'"
+
+class Concat(Node):
+    def __init__(self, op1, op2):
+        self.op1 = op1
+        self.op2 = op2
+
+    def concat(self, key1, key2):
+        if isinstance(key1, str) and isinstance(key2, str):
+            return key1 + key2
+        elif isinstance(key1, str) and isinstance(key2, list):
+            if key2[0]:
+                key2[0] = key1 + key2[0]
+            else:
+                key2 = [key1] + key2
+            return key2
+        elif isinstance(key1, list) and isinstance(key2, str):
+            if key1[-1]:
+                key1[-1] = key1[-1] + key2
+            else:
+                key1 = key1 + [key2]
+            return key1
+        else:
+            key1 + key2
+
+    def __str__(self):
+        return "(%s.%s)" % (self.op1, self.op2)
+
+class Union(Node):
+    def __init__(self, op1, op2):
+        self.op1 = op1
+        self.op2 = op2
+
+    def __str__(self):
+        return "(%s|%s)" % (self.op1, self.op2)
+
+class Star(Node):
+    def __init__(self, op):
+        self.op = op
+
+    def __str__(self):
+        return "(%s)*" % self.op
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrect/regexp/kwset.py	Tue Jul 13 07:53:28 2010 +0900
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+"""
+Extract Keywords from AST. Keywords,
+which are necessary words to be accepted with Regular-Expression.
+and which are used to Fixed-String-Filtering (ex: Boyer-Moore).
+kwset is also used in GNU-GREP.
+"""
+
+from parser import Parser
+from ast import ASTWalker
+
+class KeywordsExtractor(ASTWalker):
+    """ Extract with Visitor-Pattern.
+    AST (ast), is represented by Node-Tree.
+    """
+    def __init__(self):
+        self.keywords = []
+
+    def extract_keywords(self, ast=None):
+        if ast:
+            self.keywords = ast.accept(self)
+        return self.keywords
+
+    def visit(self, ast):
+        """Following Classes contain no-Keywords.
+        Union, Star
+        """
+        return ['']
+
+    def visit_Character(self, character):
+        return character.char
+
+    def visit_Concat(self, concat):
+        key1 = concat.op1.accept(self)
+        key2 = concat.op2.accept(self)
+
+        if isinstance(key1, str) and isinstance(key2, str):
+            return key1 + key2
+        elif isinstance(key1, str) and isinstance(key2, list):
+            if key2[0]:
+                key2[0] = key1 + key2[0]
+            else:
+                key2 = [key1] + key2
+            return key2
+        elif isinstance(key1, list) and isinstance(key2, str):
+            if key1[-1]:
+                key1[-1] = key1[-1] + key2
+            else:
+                key1 = key1 + [key2]
+            return key1
+        else:
+            return key1 + key2
+
+def extract_keywords(ast):
+    return KeywordsExtractor().extract_keywords(ast)
+
+def test():
+    """
+    >>> prs = Parser()
+    >>> kex = KeywordsExtractor()
+    >>> kex.extract_keywords(prs.parse('(AB|CD)*123'))
+    ['', '123']
+    >>> kex.extract_keywords(prs.parse('WOOO*PS!!'))
+    ['WOO', '', 'PS!!']
+    >>> kex.extract_keywords(prs.parse('(build|fndecl|gcc)'))
+    ['']
+    """
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__": test()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrect/regexp/lexer.py	Tue Jul 13 07:53:28 2010 +0900
@@ -0,0 +1,36 @@
+from ply import lex
+
+tokens = (
+    'UNION',
+    'STAR',
+    'LPAREN',
+    'RPAREN',
+    'ANYCHAR',
+    )
+
+def t_UNION(t):
+    ur'\|'
+    return t
+
+def t_STAR(t):
+    ur'\*'
+    return t
+
+def t_LPAREN(t):
+    ur'\('
+    return t
+
+def t_RPAREN(t):
+    ur'\)'
+    return t
+
+def t_ANYCHAR(t):
+    ur'\\?(.)'
+    t.value = t.value[-1:]
+    return t
+
+def t_error(t):
+    print "Illegal character '%s'" % t.value[0]
+    raise t
+
+lex.lex()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pyrect/regexp/parser.py	Tue Jul 13 07:53:28 2010 +0900
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+from ply        import yacc
+from lexer      import lex, tokens
+from ast import *
+
+class Parser(object):
+    """Parser
+    This class can parse from Regexp to AST.
+    if you want something to do from AST, then modify Nodes.
+    >>> parser = Parser()
+    >>> ast = parser.parse('(AB|CD)*123')
+    >>> print ast
+    (((((('A'.'B')|('C'.'D')))*.'1').'2').'3')
+    """
+
+    def __init__(self):
+        self.yacc  = yacc.yacc()
+        self.lexer = lex
+
+    def parse(self, expression):
+        self.lexer.input(expression)
+        self.ast = self.yacc.parse(lexer=self.lexer)
+        return self.ast
+
+"""Parse following language
+----------------------------------------
+regexp   -> regexp UNION branch | branch
+branch   -> branch closure | closure
+closure  -> closure STAR | atom
+atom     -> LPAREN regexp RPAREN | ANYCHAR
+
+old parse rule
+(A) expression -> subexpr EOF
+(B) subexpr -> seq '|' subexpr | seq
+(C) seq -> subseq | ''
+(D) subseq -> star subseq | star
+(E) star -> factor '*' | factor
+(F) factor -> '(' subexpr ')' | CHARACTER
+
+hairy...
+
+/*from gnu-grep, The grammar understod by the parser is as follow. (dfa.c:1221)
+   regexp:
+     regexp OR branch
+     branch
+
+   branch:
+     branch closure
+     closure
+
+   closure:
+     closure QMARK
+     closure STAR
+     closure PLUS
+     closure REPMN
+     atom
+
+   atom:
+     <normal character>
+     <multibyte character>
+     ANYCHAR
+     MBCSET
+     CSET
+     BACKREF
+     BEGLINE
+     ENDLINE
+     BEGWORD
+     ENDWORD
+     LIMWORD
+     NOTLIMWORD
+     CRANGE
+     LPAREN regexp RPAREN
+     <empty>
+     The parser builds a parse tree in postfix form in an array of tokens. */
+
+     *and more detail for gnu-grep's grammar, see grep/src/dfa.h .
+"""
+
+# Parsing-Rule
+def p_regexp1(p):
+    'regexp : regexp UNION branch'
+    p[0] = Union(p[1], p[3])
+
+def p_regexp2(p):
+    'regexp : branch'
+    p[0] = p[1]
+
+def p_branch1(p):
+    'branch : branch closure'
+    p[0] = Concat(p[1], p[2])
+
+def p_branch2(p):
+    'branch : closure'
+    p[0] = p[1]
+
+def p_closure1(p):
+    'closure : closure STAR'
+    p[0] = Star(p[1])
+
+def p_closure2(p):
+    'closure : atom'
+    p[0] = p[1]
+
+def p_atom1(p):
+    'atom : LPAREN regexp RPAREN'
+    p[0] = p[2]
+
+def p_atom2(p):
+    'atom : ANYCHAR'
+    p[0] = Character(p[1])
+
+def p_error(p):
+    raise Exception("syntax error")
+
+def test():
+    import doctest
+    doctest.testmod()
+
+if __name__ == "__main__": test()