view pyrect/translator/grep_translator.py @ 99:e327e93aeb3a

remove callgraph and use Transition.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 12 Dec 2010 23:09:19 +0900
parents 5db856953793
children 6aab6b1038f0
line wrap: on
line source

#!/usr/bin/env python

import os
from c_translator import CTranslator
from pyrect.regexp import Regexp
from pyrect.regexp.ast import ASTWalker, AnyChar, Character, SpecialInputNode

class GREPTranslateExeption(Exception):
    pass

class GREPTranslator(CTranslator):
    """GREPTranslator
    This Class can translate form DFA into grep source-code.
    which based on (beautiful) mini-grep introduced  \"The Practice of Programming\"
    written by Rob Pike & Brian W. Kernighan. (see template/grep.c)
    >>> string = \"(build|fndecl|gcc)\"
    >>> reg = Regexp(string)
    >>> tje = GREPTranslator(reg)
    >>> tje.translate()
    """

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self, regexp):
        CTranslator.__init__(self, regexp)
        self.__bufsize = 1024 * 1024
        self.thread_dfa = 1
        self.thread_line = 1
        self.filter = "quick"
        self.filter_only = False
        self.filter_prefix = False
        self.skip_boost = True
        self.table_lookup = False
        self.start = "matcher"
        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
        self.args = "beg, buf, end"

    def getbufsize(self,):
        return self.__bufsize
    def setbufsize(self, bufsize):
        self.__bufsize = abs(bufsize)

        bufsize = property(getbufsize, setbufsize)

    def emit_initialization(self):
        self.emit("#include <stdio.h>")
        self.emit("#include <stdlib.h>")
        self.emit("#include <sys/mman.h>")
        self.emit("#include <sys/types.h>")
        self.emit("#include <sys/stat.h>")
        self.emit("#include <fcntl.h>")
        self.emit("#include <unistd.h>")
        self.emit("#include <string.h>", 2)

        self.emit("typedef unsigned char   UCHAR;")
        self.emit("typedef unsigned char *UCHARP;", 2)

        self.emit('void reject(%s);' % self.interface)
        self.emit("void matcher(%s);" % self.interface)
        self.emit('void accept(%s);' % self.interface, 2)

        key = None

        if (self.filter == "bmh" or self.filter == "quick")\
               and self.regexp.must_words:
            key = max(self.regexp.must_words, key=len)
            if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len:
                self.filter_only = True
        else:
            self.filter = False

        if not self.filter_only:
            for state in self.cg.transition.iterkeys():
                self.emit("void %s(%s);" % (self.state_name(state), self.interface))
            self.emit()

        if self.filter == "bmh":
            self.emit_bmh_filter(key)
        elif self.filter:
            self.emit_quick_filter(key)

        if self.skip_boost and not self.filter_only and \
               not AnyChar() in self.regexp.chars and \
               self.regexp.min_len > 2:
            self.emit_booster(self.regexp.min_len, self.regexp.chars)
        else:
            self.skip_boost = False

        grepsource = open(self.BASE_DIR + "/template/grep.c")
        self.emit(grepsource.read())

    def emit_bmh_filter(self, key):
        l = len(key)
        def emit_next():
            if self.filter_only:
                self.emit("accept(%s);" % self.args)
            elif self.filter_prefix:
                self.emit("buf++;")
                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))
            else:
                self.emit("beg = get_line_beg(buf, beg);")
                self.emit("buf = beg;")
                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))

        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)
        self.emiti("void bmh_filter(%s) {" % self.interface)
        l = len(key)
        if l == 1:
            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
            self.emit("if (buf == NULL) return;")
            emit_next()
            self.demit("}", 2)
            return

        self.emit('static const UCHAR key[] = "%s";' % key)

        skip = dict()
        for i in range(l-1):
            skip[key[i]] = l-1-i

        self.emit("UCHARP tmp1, tmp2; buf += %d;" % (l-1), 2)

        self.emiti("while (buf < end) {")
        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1])))
        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1))
        self.emiti(    "while (*(--tmp1) == *(--tmp2)) {")
        self.emit(       "if (tmp2 == key) goto next;")
        self.demit(    "}")
        self.demit(  "}")
        self.emiti(  "switch(*buf) {")
        for k, v in skip.iteritems():
            self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
            self.emit(     "buf += %d; break;" % v), self.dedent()
        self.emiti("default: buf += %d;" % l), self.dedent()
        self.demit(  "}")
        self.demit("}")
        self.emit( "return;")
        self.emit( "next:")
        emit_next()
        self.demit("}", 2)

    def emit_quick_filter(self, key):
        l = len(key)
        def emit_next():
            if self.filter_only:
                self.emit("accept(%s);" % self.args)
            elif self.filter_prefix:
                self.emit("buf+%d;" % l)
                self.emit("%s(%s);" % (self.state_name(self.cg.start) ,self.args))
            else:
                self.emit("beg = get_line_beg(buf, beg);")
                self.emit("buf = beg;")
                self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args))

        self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2)

        self.emiti("void quick_filter(%s) {" % self.interface)
        l = len(key)
        if l == 1:
            self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key))
            self.emit("if (buf == NULL) return;")
            emit_next()
            self.demit("}", 2)
            return

        self.emit('static const UCHAR key[] = "%s";' % key)

        skip = dict()
        for i in range(l):
            skip[key[i]] = l-i

        self.emit("UCHARP tmp1, tmp2, end_ = end - %d;" % (l-1),  2)
        self.emiti("while (buf < end_) {")
        self.emiti(  "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0])))
        self.emit(     "tmp1 = buf, tmp2 = (UCHARP)key;")
        self.emiti(    "while (*(++tmp1) == *(++tmp2)){")
        self.emit(       "if (tmp2 == key+%d) goto next;" % (l-1))
        self.demit(    "}")
        self.demit(  "}")
        if self.table_lookup:
            self.emiti("static const void * tbl[256] = {[0 ... 255] &&any, %s};"
                       % ", ".join("[%d] &&add%s" % (ord(c), s) for c, s in skip.iteritems()))
            self.emit("goto *tbl[buf[%d]];" % l)
            defun = []
            for s in skip.itervalues():
                if s in defun: continue
                defun.append(s)
                self.emit("add%s: buf += %s; goto ends;" % (s, s))
            self.emit("any: buf += %d; ends:;" % (l+1))
        else:
            self.emiti(  "switch(buf[%d]) {" % l)
            for k, v in skip.iteritems():
                self.emiti(  "case %d: /* %s */" % (ord(k), Character.ascii(k)))
                self.emit(     "buf += %d; break;" % v), self.dedent()
            self.emiti("default: buf += %d;" % (l+1)), self.dedent()
            self.demit(  "}")
        self.demit("}")
        self.emit( "return;")
        self.emit( "next:")
        emit_next()
        self.demit("}", 2)

    def emit_booster(self, min_len, chars):
        self.emiti("void booster(%s) {" % self.interface)
        self.emit(   "UCHARP end_ = end - %d;" % (min_len-1))
        self.emit(   "if (buf > end_) return;")
        self.emiti(  "do {")
        self.emiti(    "switch (buf[%d]) {" % (min_len-1))
        for c in chars:
            self.emit(   "case %d: /* %s */" % (ord(c), Character.ascii(c)))
        self.emit(       "goto ret;")
        self.demit(    "}")
        self.demit(  "} while((buf += %d) <= end_);" % min_len)
        self.emit(   "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args))
        self.demit("}", 2)

    def emit_driver(self):
        self.emiti("void matcher(%s) {" % self.interface)
        if self.filter:
            self.emit(   "%s(%s);" % (self.filter + "_filter", self.args))
        else:
            self.emit(   "%s(%s);" % (self.state_name(self.cg.start), self.args))
        self.emit(   "return;")
        self.demit("}", 2)
        return

    def emit_accept_state(self):
        self.emiti("void accept(%s) {" % self.interface)
        self.emit(   "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
        if self.skip_boost or self.filter:
            self.emit(   "beg = get_line_beg(buf, beg);")
        self.emiti(  "if (ret == NULL) {")
        self.emit(     "print_line(beg, end);")
        self.emit(     "return;")
        self.demit(  "}")
        self.emit(   "print_line(beg, ret);")
        self.emit(   "beg = buf = ret + 1;")
        self.emit(   "%s(%s);" % (self.start, self.args))
        self.demit("}", 2)

    def emit_reject_state(self):
        self.emiti("void reject(%s) {" % self.interface)
        self.emit(   "if (buf >= end) return;")
        self.emit(   "beg = buf;")
        self.emit(   "return %s(%s);" % (self.start, self.args))
        self.demit("}", 2)

    def emit_switch(self, case, default=None):
        if not case:
            if default:
                self.emit("return %s(%s);" % (default, self.args))
            return
        self.emiti("switch(*buf++) {")
        for case, next_ in case.iteritems():
            self.trans_stmt.emit(case, self.state_name(next_))
        if default == self.state_name(self.cg.start) and self.skip_boost:
            self.emit("default: return booster(%s);" % self.args)
        else:
            self.emit("default: return %s(%s);" % (default, self.args))
        self.demit("}")

    def emit_state(self, cur_state, transition):
        if self.filter_only: return

        self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface))

        if cur_state in self.cg.accepts:
            self.emit(   "return accept(beg, buf-1, end);")
            self.demit("}", 2)
            return

        if transition.has_key(AnyChar()):
            default = self.state_name(transition.pop(AnyChar()))
        else:
            default = self.state_name(self.cg.start)

        if self.table_lookup and (cur_state == self.cg.start or \
           self.state_name(cur_state) == default):
            if self.skip_boost and default == self.state_name(self.cg.start):
                default = "booster"
            tbl = dict()
            for eol in self.eols:
                tbl[eol.char] = "reject"
            for c, n in transition.iteritems():
                tbl[c.char] = self.state_name(n)
            self.emit(   "static void (*%s_table[256])(UCHARP, UCHARP, UCHARP) = {[0 ... 255] = (void*)%s, %s};"
                         % (self.state_name(cur_state), default,
                            ", ".join("[%d] = %s" % (i, s) for (i, s) in tbl.items())))
            self.emit(   "return %s_table[*buf++](%s);" % (self.state_name(cur_state), self.args))
            self.demit("}", 2)
            return

        for eol in self.eols:
            transition[eol] = "reject"

        for input_ in transition.keys():
            if isinstance(input_, SpecialInputNode):
                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))

        self.emit_switch(transition, default)

        self.demit("}", 2)

    class _trans_stmt(ASTWalker):
        def __init__(self, emit):
            self._emit = emit
            self.args = "beg, buf, end"

        def emit(self, input_node, next_):
            self.next = next_
            input_node.accept(self)

        def visit(self, input_node):
            self._emit("/* UNKNOW RULE */")
            self._emit("/* %s */" % input_node.__repr__())

        def visit_Character(self, char):
            self._emit("case %d: /* %s */" % (char.char, char))
            self._emit("  return %s(%s);" % (self.next, self.args))

        # Special Rule
        def visit_BegLine(self, begline):
            self._emit("/* begin of line  */")
            self._emit("if (buf == beg)")
            self._emit("  return %s(%s);" % (self.next, self.args), 2)

        def visit_Range(self, range):
            if isinstance(range.lower, MBCharacter) and not \
               isinstance(range.upper, MBCharacter) or  \
               isinstance(range.upper, MBCharacter) and not \
               isinstance(range.lower, MBCharacter):
                return

            if isinstance(range.lower, MBCharacter):
                self.visit(range)
            else:
                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
                self._emit("  return %s(beg, buf+1, end);" % self.next, 2)

def test():
    import doctest
    doctest.testmod()

if __name__ == '__main__': test()