view pyrect/translator/goto_grep_translator.py @ 88:dafb393108f3

impliment goto-based grep. (icc's tailcall-optimization is suck!)
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 14 Nov 2010 07:56:35 +0900
parents 701beabd7d97
children 8cfa81638130
line wrap: on
line source

#!/usr/bin/env python

import os
from c_translator import CTranslator
from pyrect.regexp import Regexp
from pyrect.regexp.ast import ASTWalker, AnyChar, Character

class GREPTranslateExeption(Exception):
    pass

class GOTOGREPTranslator(CTranslator):
    """GREPTranslator
    This Class can translate form DFA into grep source-code.
    which based on (beautiful) mini-grep introduced  \"The Practice of Programming\"
    written by Rob Pike & Brian W. Kernighan. (see template/grep.c)
    >>> string = \"(def)\"
    >>> reg = Regexp(string)
    >>> tje = GREPLABELTranslator(reg)
    >>> tje.translate()
    """

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self, regexp):
        CTranslator.__init__(self, regexp, fa="DFA")
        self.__bufsize = 1024 * 1024
        self.thread_dfa = 1
        self.thread_line = 1
        self.filter = "quick"
        self.filter_only = False
        self.filter_prefix = False
        self.filter_key = ""
        self.skip_boost = True
        self.table_lookup = False
        self.start = self.cg.start
        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
        self.args = "beg, buf, end"

    def getbufsize(self,):
        return self.__bufsize
    def setbufsize(self, bufsize):
        self.__bufsize = abs(bufsize)

        bufsize = property(getbufsize, setbufsize)

    def state_name(self, name):
        if name in ["accept", "reject", "matcher", "entry"\
                    , "bmh_filter", "quick_filter", "booster"]:
            return name
        else:
            return "state_"+str(name)

    def emit_goto(self, state):
        self.emit("goto %s;" % self.state_name(state))

    def emit_from_callgraph(self):
        # self.emit C-source code
        self.emit_initialization()

        self.emiti("void matcher(%s) {" % self.interface)
        self.emit_driver()
        for cur_state, transition in self.cg.map.iteritems():
            self.emit_state(cur_state, transition)
        self.emit_accept_state()
        self.emit_reject_state()
        self.emit("return;")
        self.emitd("}", 2)

    def emit_initialization(self):
        self.emit("#include <stdio.h>")
        self.emit("#include <stdlib.h>")
        self.emit("#include <sys/mman.h>")
        self.emit("#include <sys/types.h>")
        self.emit("#include <sys/stat.h>")
        self.emit("#include <fcntl.h>")
        self.emit("#include <unistd.h>")
        self.emit("#include <string.h>", 2)

        self.emit("typedef unsigned char   UCHAR;")
        self.emit("typedef unsigned char *UCHARP;")

        self.emit("void matcher(%s);" % self.interface);
        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);")
        self.emit("void print_line(UCHARP buf, UCHARP beg);", 2)

        if (self.filter == "bmh" or self.filter == "quick")\
               and self.regexp.must_words:
            self.filter_key = max(self.regexp.must_words, key=len)
            if len(self.regexp.must_words) == 1 and len(self.filter_key) == self.regexp.min_len:
                self.filter_only = True
            if self.filter == "bmh":
                self.start = "bmh_filter"
            else:
                self.start = "quick_filter"
        else:
            self.filter = False

        if self.skip_boost and not self.filter_only and \
               not AnyChar() in self.regexp.chars and \
               self.regexp.min_len > 2:
            pass
        else:
            self.skip_boost = False

        grepsource = open(self.BASE_DIR + "/template/grep.c")
        self.emit(grepsource.read())

    def emit_bmh_filter(self, key):
        l = len(key)

        self.emiti("bmh_filter:")

        def emit_next():
            if self.filter_only:
                self.emit_goto("accept")
            elif self.filter_prefix:
                self.emit("buf++;")
                self.emit_goto(self.cg.start)
            else:
                self.emit("beg = get_line_beg(buf, beg);")
                self.emit("buf = beg;")
                self.emit_goto(self.cg.start)

        l = len(key)
        if l == 1:
            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
            self.emit("if (buf == NULL) return;")
            emit_next()
            self.emitd("", 2)
            return

        skip = dict()
        for i in range(l-1):
            skip[key[i]] = l-1-i

        self.emit("buf += %d;" % (l-1), 2)

        self.emiti("while (buf < end) {")
        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1])))
        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1))
        self.emiti(    "while (*(--tmp1) == *(--tmp2)) {")
        self.emiti(       "if (tmp2 == key) {")
        emit_next()
        self.emitd(       "}")
        self.emitd(    "}")
        self.emitd(  "}")
        self.emiti(  "switch(*buf) {")
        for k, v in skip.iteritems():
            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
            self.emit(     "buf += %d; break;" % v), self.dedent()
        self.emiti("default: buf += %d;" % l), self.dedent()
        self.emitd(  "}")
        self.emitd("}")
        self.emit( "return;")
        self.emitd("", 2)

    def emit_quick_filter(self, key):
        l = len(key)

        self.emiti("quick_filter:")

        def emit_next():
            if self.filter_only:
                self.emit_goto("accept")
            elif self.filter_prefix:
                self.emit("buf+%d;" % l)
                self.emit_goto(self.cg.start)
            else:
                self.emit("beg = get_line_beg(buf, beg);")
                self.emit("buf = beg;")
                self.emit_goto(self.cg.start)

        l = len(key)
        if l == 1:
            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
            self.emit("if (buf == NULL) return;")
            emit_next()
            self.emitd("}", 2)
            return

        skip = dict()
        for i in range(l):
            skip[key[i]] = l-i

        self.emit("end_ = end - %d;" % (l-1),  2)
        self.emiti("while (buf < end_) {")
        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0])))
        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key;")
        self.emiti(    "while (*(++tmp1) == *(++tmp2)){")
        self.emiti(       "if (tmp2 == key+%d) {" % (l-1))
        emit_next()
        self.emitd(       "}")
        self.emitd(    "}")
        self.emitd(  "}")
        self.emiti(  "switch(buf[%d]) {" % l)
        for k, v in skip.iteritems():
            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
            self.emit(     "buf += %d; break;" % v), self.dedent()
        self.emiti("default: buf += %d;" % (l+1)), self.dedent()
        self.emitd(  "}")
        self.emitd("}")
        self.emit( "return;")
        self.emitd("", 2)

    def emit_booster(self, min_len, chars):
        self.emiti("booster:")
        self.emit(   "end_ = end - %d;" % (min_len-1))
        self.emit(   "if (buf > end_) return;")
        self.emiti(  "do {")
        self.emiti(    "switch (buf[%d]) {" % (min_len-1))
        for c in chars:
            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
        self.emit_goto(self.cg.start)
        self.emitd(    "}")
        self.emitd(  "} while((buf += %d) <= end_);" % min_len)
        self.emit("return;")
        self.emit("", 2)

    def emit_driver(self):
        self.emit("UCHARP end_, ret;")
        if self.skip_boost:
            self.emit_booster(self.regexp.min_len, self.regexp.chars)
        if self.filter:
            self.emit("UCHARP tmp1, tmp2; static const UCHAR key[] = \"%s\";" % self.filter_key)
            self.emit_goto(self.filter + "_filter")
            if self.filter == "bmh":
                self.emit_bmh_filter(self.filter_key)
            else:
                self.emit_quick_filter(self.filter_key)
        else:
            self.emit_goto(self.cg.start)
        return

    def emit_accept_state(self):
        self.emiti("accept:")
        self.emit(   "ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
        if self.skip_boost or self.filter:
            self.emit(   "beg = get_line_beg(buf, beg);")
        self.emiti(  "if (ret == NULL) {")
        self.emit(     "print_line(beg, end);")
        self.emit(     "return;")
        self.emitd(  "}")
        self.emit(   "print_line(beg, ret);")
        self.emit(   "beg = buf = ret + 1;")
        self.emit_goto(self.start)
        self.emitd("", 2)

    def emit_reject_state(self):
        self.emiti("reject:")
        self.emit(   "if (buf >= end) return;")
        self.emit(   "beg = buf;")
        self.emit_goto(self.start)
        self.emitd("")

    def emit_switch(self, case, default=None):
        if not case:
            if default:
                self.emit_goto(default)
            return
        self.emiti("switch(*buf++) {")
        for case, next_ in case.iteritems():
            self.trans_stmt.emit(case, self.state_name(next_))
        if default == self.state_name(self.cg.start) and self.skip_boost:
            self.emit("default: ")
            self.emit_goto("booster")
        else:
            self.emit("default: ")
            self.emit_goto(default)
        self.emitd("}")

    def emit_state(self, cur_state, transition):
        if self.filter_only: return

        self.emiti("%s:" % self.state_name(cur_state))

        if cur_state in self.cg.accepts:
            self.emit(   "buf--;")
            self.emit_goto("accept")
            self.emitd("", 2)
            return

        if transition.has_key(AnyChar()):
            default = transition.pop(AnyChar())
        else:
            default = self.cg.start

        if self.table_lookup and (cur_state == self.cg.start or \
                                  cur_state == default):
            if self.skip_boost and default == self.cg.start:
                default = "booster"
            tbl = [default] * 256
            for eol in self.eols:
                tbl[eol.char] = "reject"
            for c, n in transition.iteritems():
                tbl[c.char] = self.state_name(n)
                self.emit(   "static const void *%s_table[256] = {%s};"
                             % (self.state_name(cur_state), ", ".join(["&&"+x for x in tbl])))
                self.emit(   "goto *%s_table[*buf++];" % (self.state_name(cur_state), self.args))
                self.emitd("", 2)
            return

        for eol in self.eols:
            transition[eol] = "reject"

        for input_ in transition.keys():
            if type(input_) in self.special_rule:
                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))

        self.emit_switch(transition, default)

        self.emitd("", 2)

    class _trans_stmt(ASTWalker):
        def __init__(self, emit):
            self._emit = emit
            self.args = "beg, buf, end"

        def emit(self, input_node, next_):
            self.next = next_
            input_node.accept(self)

        def visit(self, input_node):
            self._emit("/* UNKNOW RULE */")
            self._emit("/* %s */" % input_node.__repr__())

        def visit_Character(self, char):
            self._emit("case %d: /* %s */" % (char.char, char))
            self._emit("  goto %s;" % self.next)

        # Special Rule
        def visit_BegLine(self, begline):
            self._emit("/* begin of line  */")
            self._emit("if (buf == beg)")
            self._emit("  goto %s;" % self.next, 2)

        def visit_Range(self, range):
            if isinstance(range.lower, MBCharacter) and not \
               isinstance(range.upper, MBCharacter) or  \
               isinstance(range.upper, MBCharacter) and not \
               isinstance(range.lower, MBCharacter):
                return

            if isinstance(range.lower, MBCharacter):
                self.visit(range)
            else:
                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
                self._emit("  buf++;")
                self._emit("  goto %s;" % self.next, 2)

def test():
    import doctest
    doctest.testmod()

if __name__ == '__main__': test()