view pyrect/pyrect/regexp/analyzer.py @ 9:493c96d030c0

add pyrect
author nobuyasu <dimolto@cr.ie.u-ryukyu.ac.jp>
date Tue, 14 Jun 2011 17:24:03 +0900
parents
children
line wrap: on
line source

#!/usr/bin/env python

"""
Extract keywords from a regular-expression AST.

Keywords are the words that any input must contain in order to be
accepted by the regular expression. They are used for fixed-string
filtering (e.g. Boyer-Moore). The same keyword-set approach (kwset)
is also used in GNU grep.
"""

from pyrect.regexp.parser import Parser
from pyrect.regexp.ast import ASTWalker, Plus

class Analyzer(ASTWalker):
    r"""Extract length bounds and mandatory keywords with the Visitor pattern.

    The AST (ast) is represented as a tree of nodes. Every visitor method
    returns a triple ``(max_len, min_len, words)``:

    * ``max_len`` / ``min_len`` -- upper / lower bound on the length of a
      match (``float("inf")`` when there is no upper bound).
    * ``words`` -- a non-empty list of literal fragments. The first and the
      last entry are still "open": ``concat`` glues the last fragment of the
      left operand to the first fragment of the right operand. An empty
      string marks a position where no literal text is known; ``analyze``
      drops those empty entries from the final result.

    >>> prs = Parser()
    >>> an  = Analyzer()
    >>> an.analyze(prs.parse('fixed-string'))
    (12, 12, ['fixed-string'])
    >>> an.analyze(prs.parse('(build|fndecl|gcc)'))
    (6, 3, [])
    >>> an.analyze(prs.parse('123(AB|CD)*456'))
    (inf, 6, ['123', '456'])
    >>> an.analyze(prs.parse('((12)+|3)|456'))
    (inf, 1, [])
    >>> an.analyze(prs.parse('^(plus)?(qmark)?'))
    (9, 0, [])
    >>> an.analyze(prs.parse('\*+ \[\['))
    (inf, 4, ['* [['])
    """

    def __init__(self, ast=None):
        """Create an analyzer and, if ``ast`` is given, analyze it at once."""
        # Always start from a complete, consistent state so that analyze()
        # without an AST returns (0, 0, []) instead of raising AttributeError
        # on a missing ``must_words`` attribute.
        self.max_len = 0
        self.min_len = 0
        self.must_words = []
        if ast is not None:
            self.analyze(ast)

    def analyze(self, ast=None):
        """Analyze ``ast`` and return ``(max_len, min_len, must_words)``.

        When ``ast`` is None, the result of the previous analysis (or the
        initial ``(0, 0, [])``) is returned unchanged.
        """
        if ast is not None:
            self.max_len, self.min_len, words = ast.accept(self)
            # Empty strings only mark breaks between fragments; they are not
            # keywords themselves.
            self.must_words = [w for w in words if w != ""]
        return self.max_len, self.min_len, self.must_words

    def visit(self, ast):
        """Treat ``ast`` as one literal unit of length 1 spelled ``str(ast)``.

        NOTE(review): presumably the fallback that ASTWalker dispatches to
        for node types without a dedicated ``visit_*`` method -- confirm
        against ASTWalker.
        """
        return 1, 1, [str(ast)]

    def visit_Character(self, ast):
        """A single character: length exactly 1, the character is the word."""
        return 1, 1, [chr(ast.char)]

    def concat(self, left, right):
        """Combine the results of two adjacent sub-expressions.

        ``left`` and ``right`` are ``(max_len, min_len, words)`` triples.
        Lengths add up; the trailing fragment of ``left`` and the leading
        fragment of ``right`` are joined into one word because they are
        adjacent in every match.
        """
        # Unpacked in the body rather than in the signature: tuple parameters
        # are a syntax error on Python 3, while this form works on 2 and 3.
        max1, min1, key1 = left
        max2, min2, key2 = right
        joined = key1[-1] + key2[0]
        return max1 + max2, min1 + min2, key1[:-1] + [joined] + key2[1:]

    def union(self, left, right):
        """Combine the results of two alternatives.

        ``left`` and ``right`` are ``(max_len, min_len, words)`` triples.
        Either branch may match, so no word is mandatory and the result
        carries only empty fragments.
        """
        max1, min1, _ = left
        max2, min2, _ = right
        return max(max1, max2), min(min1, min2), ["", ""]

    def visit_Star(self, star):
        """Zero or more repetitions: unbounded length, nothing mandatory."""
        return float("inf"), 0, ["", ""]

    def visit_Plus(self, plus):
        """One or more repetitions of the operand.

        The minimum length and the words of the operand are kept; its words
        appear at both ends with an empty fragment in between.
        """
        (_, min_len, words) = plus.op.accept(self)
        return float("inf"), min_len, words + [""] + words

    def visit_Qmark(self, qmark):
        """Optional operand: keeps its maximum length, nothing mandatory."""
        (max_len, _, _) = qmark.op.accept(self)
        return max_len, 0, ["", ""]

    def visit_CharClass(self, cclass):
        """Character class: length exactly 1, but no fixed literal text."""
        return 1, 1, ["", ""]

def test():
    """Run doctest.testmod() with its default arguments."""
    from doctest import testmod
    testmod()

# Script entry point: run the doctests when executed directly.
if __name__ == "__main__":
    test()