view pyrect/translator/grep_translator.py @ 79:623eccb93ca1

modify filter emit-option's bug.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Mon, 08 Nov 2010 10:56:55 +0900
parents 240475723cd8
children 53c3ce58fc8a
line wrap: on
line source

#!/usr/bin/env python

import os
from c_translator import CTranslator
from pyrect.regexp import Regexp
from pyrect.regexp.ast import ASTWalker, AnyChar, Character

class GREPTranslateExeption(Exception):
    pass

class GREPTranslator(CTranslator):
    """GREPTranslator
    This Class can translate form DFA into grep source-code.
    which based on (beautiful) mini-grep introduced  \"The Practice of Programming\"
    written by Rob Pike & Brian W. Kernighan. (see template/grep.c)
    >>> string = \"(build|fndecl|gcc)\"
    >>> reg = Regexp(string)
    >>> tje = GREPTranslator(reg)
    >>> tje.translate()
    """

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self, regexp):
        CTranslator.__init__(self, regexp, fa="DFA")
        self.__bufsize = 1024 * 1024
        self.thread_dfa = 1
        self.thread_line = 1
        self.filter = "quick"
        self.filter_only = False
        self.filter_prefix = False
        self.skip_boost = True
        self.start = "matcher"
        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
        self.args = "beg, buf, end"

    def getbufsize(self,):
        return self.__bufsize
    def setbufsize(self, bufsize):
        self.__bufsize = abs(bufsize)

        bufsize = property(getbufsize, setbufsize)

    def emit_initialization(self):
        self.emit("#include <stdio.h>")
        self.emit("#define GREP grep")
        self.emit("#define UCHAR unsigned char")
        self.emit("#define UCHARP unsigned char *")
        self.emit("#include <stdlib.h>")
        self.emit("#include <sys/mman.h>")
        self.emit("#include <sys/types.h>")
        self.emit("#include <sys/stat.h>")
        self.emit("#include <fcntl.h>")
        self.emit("#include <string.h>")

        self.emit('void reject(%s);' % self.interface)
        self.emit("void matcher(%s);" % self.interface, 2)
        self.emit('void accept(%s);' % self.interface)

        key = None

        if (self.filter == "bmh" or self.filter == "quick")\
               and self.regexp.must_words:
            key = max(self.regexp.must_words, key=len)
            if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len:
                self.filter_only = True
        else:
            self.filter = False

        if not self.filter_only:
            for state in self.cg.map.iterkeys():
                self.emit("void %s(%s);" % (self.state_name(state), self.interface))
            self.emit()

        if self.filter == "bmh":
            self.emit_bmh_filter(key)
        elif self.filter:
            self.emit_quick_filter(key)

        if self.skip_boost and not self.filter_only and \
               not AnyChar() in self.regexp.chars and \
               self.regexp.min_len > 2:
            self.emit_booster(self.regexp.min_len, self.regexp.chars)
        else:
            self.skip_boost = False

        grepsource = open(self.BASE_DIR + "/template/grep.c")
        self.emit(grepsource.read())

    def emit_bmh_filter(self, key):
        l = len(key)
        def emit_next():
            if self.filter_only:
                self.emit("accept(%s);" % self.args)
            elif self.filter_prefix:
                self.emit("buf++;")
                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))
            else:
                self.emit("beg = get_line_beg(buf, beg);")
                self.emit("buf = beg;")
                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))

        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)
        self.emiti("void bmh_filter(%s) {" % self.interface)
        l = len(key)
        if l == 1:
            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
            self.emit("if (buf == NULL) return;")
            emit_next()
            self.emitd("}", 2)
            return

        self.emit('static const UCHAR key[] = "%s";' % key)

        skip = dict()
        for i in range(l-1):
            skip[key[i]] = l-1-i

        self.emit("UCHARP tmp1, *tmp2; buf += %d;" % (l-1), 2)

        self.emiti("while (buf < end) {")
        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1])))
        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1))
        self.emiti(    "while (*(--tmp1) == *(--tmp2)) {")
        self.emit(       "if (tmp2 == key) goto next;")
        self.emitd(    "}")
        self.emitd(  "}")
        self.emiti(  "switch(*buf) {")
        for k, v in skip.iteritems():
            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
            self.emit(     "buf += %d; break;" % v), self.dedent()
        self.emiti("default: buf += %d;" % l), self.dedent()
        self.emitd(  "}")
        self.emitd("}")
        self.emit( "return;")
        self.emit( "next:")
        emit_next()
        self.emitd("}", 2)

    def emit_quick_filter(self, key):
        l = len(key)
        def emit_next():
            if self.filter_only:
                self.emit("accept(%s);" % self.args)
            elif self.filter_prefix:
                self.emit("buf+%d;" % l)
                self.emit("%s(%s);" % (self.state_name(self.cg.start) ,self.args))
            else:
                self.emit("beg = get_line_beg(buf, beg);")
                self.emit("buf = beg;")
                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))

        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)

        self.emiti("void quick_filter(%s) {" % self.interface)
        l = len(key)
        if l == 1:
            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
            self.emit("if (buf == NULL) return;")
            emit_next()
            self.emitd("}", 2)
            return

        self.emit('static const UCHAR key[] = "%s";' % key)

        skip = dict()
        for i in range(l):
            skip[key[i]] = l-i

        self.emit("UCHARP tmp1, *tmp2, *end_ = end - %d;" % (l-1),  2)
        self.emiti("while (buf < end_) {")
        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0])))
        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key;")
        self.emiti(    "while (*(++tmp1) == *(++tmp2)){")
        self.emit(       "if (tmp2 == key+%d) goto next;" % (l-1))
        self.emitd(    "}")
        self.emitd(  "}")
        self.emiti(  "switch(buf[%d]) {" % l)
        for k, v in skip.iteritems():
            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
            self.emit(     "buf += %d; break;" % v), self.dedent()
        self.emiti("default: buf += %d;" % (l+1)), self.dedent()
        self.emitd(  "}")
        self.emitd("}")
        self.emit( "return;")
        self.emit( "next:")
        emit_next()
        self.emitd("}", 2)

    def emit_booster(self, min_len, chars):
        self.emiti("void booster(%s) {" % self.interface)
        self.emiti(  "do {")
        self.emit(     "if (buf > end) return;")
        self.emiti(    "switch (*(buf+%d)) {" % (min_len-1))
        for c in chars:
            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
        self.emit(       "goto ret;")
        self.emitd(    "}")
        self.emitd(  "} while(buf += %d);" % (min_len-1))
        self.emit(   "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args))
        self.emitd("}", 2)

    def emit_driver(self):
        self.emiti("void matcher(%s) {" % self.interface)
        if self.filter:
            self.emit(   "%s(%s);" % (self.filter + "_filter", self.args))
        else:
            self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
        self.emit(   "return;")
        self.emitd("}")
        return

    def emit_accept_state(self):
        self.emiti("void accept(%s) {" % self.interface)
        self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
        if self.skip_boost or self.filter:
            self.emit(   "beg = get_line_beg(buf, beg);")
        self.emiti(  "if (ret == NULL) {")
        self.emit(     "print_line(beg, end);")
        self.emit(     "return;")
        self.emitd(  "}")
        self.emit(   "print_line(beg, ret);")
        self.emit(   "beg = buf = ret + 1;")
        self.emit(   "%s(%s);" % (self.start, self.args))
        self.emitd("}", 2)

    def emit_reject_state(self):
        self.emiti("void reject(%s) {" % self.interface)
        self.emit(   "if (buf >= end) return;")
        self.emit(   "beg = buf;")
        self.emit(   "return %s(%s);" % (self.start, self.args))
        self.emitd("}", 2)

    def emit_switch(self, case, default=None):
        if not case:
            if default:
                self.emit("return %s(%s);" % (default, self.args))
            return
        self.emiti("switch(*buf++) {")
        for case, next_ in case.iteritems():
            self.trans_stmt.emit(case, self.state_name(next_))
        if default == self.state_name(self.cg.start) and self.skip_boost:
            self.emit("default: return booster(%s);" % self.args)
        else:
            self.emit("default: return %s(%s);" % (default, self.args))
        self.emitd("}")

    def emit_state(self, cur_state, transition):
        if self.filter_only: return

        self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface))

        if cur_state in self.cg.accepts:
            self.emit(   "return accept(beg, buf-1, end);")
            self.emitd("}", 2)
            return

        default = self.state_name(self.cg.start)
        for eol in self.eols:
            transition[eol] = "reject"

        for input_ in transition.keys():
            if type(input_) in self.special_rule:
                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
            elif type(input_) is AnyChar:
                default = self.state_name(transition.pop(input_))

        self.emit_switch(transition, default)

        self.emitd("}", 2)

    class _trans_stmt(ASTWalker):
        def __init__(self, emit):
            self._emit = emit
            self.args = "beg, buf, end"

        def emit(self, input_node, next_):
            self.next = next_
            input_node.accept(self)

        def visit(self, input_node):
            self._emit("/* UNKNOW RULE */")
            self._emit("/* %s */" % input_node.__repr__())

        def visit_Character(self, char):
            self._emit("case %d: /* %s */" % (char.char, char))
            self._emit("  return %s(%s);" % (self.next, self.args))

        # Special Rule
        def visit_BegLine(self, begline):
            self._emit("/* begin of line  */")
            self._emit("if (buf == beg)")
            self._emit("  return %s(%s);" % (self.next, self.args), 2)

        def visit_Range(self, range):
            if isinstance(range.lower, MBCharacter) and not \
               isinstance(range.upper, MBCharacter) or  \
               isinstance(range.upper, MBCharacter) and not \
               isinstance(range.lower, MBCharacter):
                return

            if isinstance(range.lower, MBCharacter):
                self.visit(range)
            else:
                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
                self._emit("  return %s(beg, buf+1, end);" % self.next, 2)

def test():
    import doctest
    doctest.testmod()

if __name__ == '__main__': test()