view pyrect/translator/grep_translator.py @ 64:c981dc66b258

Add (maybe :-p) non_blocking_print_line; it uses pthread.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sat, 06 Nov 2010 00:46:09 +0900
parents 020ba001c58a
children
line wrap: on
line source

#!/usr/bin/env python

import os
from c_translator import CTranslator
from pyrect.regexp import Regexp, Analyzer
from pyrect.regexp.ast import ASTWalker, AnyChar, Character

class GREPTranslateExeption(Exception):
    """Exception type reserved for errors of the grep translator."""

class GREPTranslator(CTranslator):
    """GREPTranslator
    This class translates a DFA into grep source code, which is based on the
    (beautiful) mini-grep introduced in \"The Practice of Programming\",
    written by Rob Pike & Brian W. Kernighan (see template/grep.c).
    >>> string = \"(build|fndecl|gcc)\"
    >>> reg = Regexp(string)
    >>> tje = GREPTranslator(reg)
    >>> tje.translate()
    """

    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    def __init__(self, regexp):
        """Set up a DFA-based translator with the default grep settings.

        regexp -- the regular expression object handed on to CTranslator.
        """
        # The base translator is always asked for a DFA.
        CTranslator.__init__(self, regexp, fa="DFA")

        # C parameter list shared by every generated function, and the
        # matching argument list used when one of them calls another.
        self.interface = "UCHARP beg, UCHARP buf, UCHARP end"
        self.args = "beg, buf, end"

        # Tunables; no active code in this class reads them.
        self.filter = True
        self.thread_line = 1
        self.thread_dfa = 1

        # Default buffer size of 1 MiB, exposed through the bufsize property.
        self.__bufsize = 1 << 20

    def getbufsize(self):
        """Return the configured buffer size."""
        return self.__bufsize

    def setbufsize(self, bufsize):
        """Store the absolute value of *bufsize* (a negative sign is dropped)."""
        self.__bufsize = abs(bufsize)

    # Attribute-style access to the two accessors above.
    bufsize = property(fget=getbufsize, fset=setbufsize)

    def emit_initialization(self):
        """Emit the C prologue: includes, macros, prototypes and the template.

        After the preprocessor lines, one prototype is emitted per DFA state
        plus prototypes for accept(), reject() and dfa(). Finally the content
        of ``template/grep.c`` (located under BASE_DIR) is emitted verbatim.
        """
        # Preprocessor lines needed by the generated code, in output order.
        for line in (
            "#include <stdio.h>",
            "#define GREP grep",
            "#define UCHARP unsigned char *",
            "#include <stdlib.h>",
            "#include <sys/mman.h>",
            "#include <sys/types.h>",
            "#include <sys/stat.h>",
            "#include <fcntl.h>",
            "#include <string.h>",
            "#include <pthread.h>",
        ):
            self.emit(line)

        self.emit_skip()

        # Forward declarations, so state functions can call each other
        # regardless of the order in which they are emitted later.
        for state in self.cg.map.iterkeys():
            self.emit("void %s(%s);" % (self.state_name(state), self.interface))
        self.emit('void accept(%s);' % self.interface)
        self.emit('void reject(%s);' % self.interface)
        self.emit("void dfa(%s);" % self.interface, 2)

        # NOTE: the pre-filter (emit_filter) is currently disabled.
        #if self.filter and self.regexp.must_words:
        #    self.emit_filter(self.regexp.must_words)

        grepsource = open(self.BASE_DIR + "/template/grep.c")
        try:
            template = grepsource.read()
        finally:
            # Release the file handle even when reading fails.
            grepsource.close()
        self.emit(template)

    def emit_filter(self, words):
        """Emit ``bm_filter``, a C pre-filter searching for the longest word.

        words -- non-empty collection of literal strings; the longest one
                 (the first such on ties) becomes the search key.

        A one-character key is searched with strchr(); longer keys use a
        256-entry skip table indexed by byte value.

        NOTE(review): the emitted macros call bm_filter(beg, buf, n-1) while
        the emitted definition takes (buf, n) -- confirm the intended
        signature against template/grep.c before enabling this filter.
        NOTE(review): with exactly one word whose length differs from
        regexp.min_len, no MATCH macro is emitted -- confirm intent.
        """
        # Longest word wins; max() keeps the first one among equal lengths.
        key = max(words, key=len)

        if len(words) == 1:
            if len(key) == self.regexp.min_len:
                self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1)
        else:
            # A macro definition needs a name: a bare "#define (" is not
            # valid C. MATCH mirrors the single-word branch above.
            self.emit("#define MATCH (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1)

        self.emit("#define FILTER bm_filter", 2)
        self.emiti("int bm_filter(unsigned char* buf, int n) {")
        key_len = len(key)
        if key_len == 1:
            # Single character: a plain strchr() is enough.
            self.emit("   return (strchr(buf, %d) != NULL);" % ord(key))
            self.emitd("}", 2)
            return

        # Default shift is the whole key length; every key character except
        # the last one gets its distance to the end of the key (a later
        # occurrence overrides an earlier one).
        skip = [str(key_len)] * 256
        for pos, char in enumerate(key[:-1]):
            skip[ord(char)] = str(key_len - 1 - pos)

        self.emit('static unsigned char key[] = "%s";' % key)
        self.emiti(   "static int skip[256] = {")
        # Eight rows of 32 entries each.
        for start in range(0, 256, 32):
            self.emit(",".join(skip[start:start + 32]) + ",")
        self.emitd(   "};")

        self.emit("int i = %d, j, k, len = %d;" % (key_len - 1, key_len))
        self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[-1]), key[-1]), 2)
        self.emiti("while (i < n) {")
        self.emit(   "c = buf[i];")
        self.emiti(  "if (c == tail) {")
        self.emit(     "j = len - 1; k = i;")
        self.emiti(    "while (key[--j] == buf[--k]) {")
        self.emit(       "if (j == 0) return 1;")
        self.emitd(    "}")
        self.emitd(  "}")
        self.emit(   "i += skip[c];")
        self.emitd("}")
        self.emit( "return 0;")
        self.emitd("}", 2)

    def emit_driver(self):
        """Emit ``dfa``, the C entry point that calls the start state."""
        signature = "void dfa(%s) {" % self.interface
        self.emiti(signature)

        call_start = "%s(%s);" % (self.state_name(self.cg.start), self.args)
        self.emit(call_start)
        self.emit("return;")
        self.emitd("}")

    def emit_accept_state(self):
        """Emit ``accept``, the C function called from accepting states.

        The generated code searches for the next newline with memchr(),
        prints the line from ``beg`` up to it, and re-enters the start state
        just behind that newline. When the newline lies beyond ``end`` the
        line is printed with non_block_print_line() and the function returns.

        NOTE(review): memchr() is given ``buf - end`` as its length, which
        looks inverted; the ``ret > end`` branch hints that scanning beyond
        ``end`` may be deliberate -- confirm before touching it.
        """
        line, open_block, close_block = self.emit, self.emiti, self.emitd

        open_block("void accept(%s) {" % self.interface)
        line("buf--;")
        line("UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));")
        line('if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}')

        # Newline found past the end of the region: print and stop.
        open_block("if (ret > end) {")
        line("ret--;")
        line("non_block_print_line(beg, ret);")
        line("return;")
        close_block("}")

        # Otherwise print the line and resume from the start state.
        line("print_line(beg, ret);")
        line("beg = buf = ret + 1;")
        line("%s(%s);" % (self.state_name(self.cg.start), self.args))
        close_block("}", 2)

    def emit_reject_state(self):
        """Emit ``reject``, the C function that restarts the automaton.

        The generated code returns once ``buf`` has reached ``end``;
        otherwise it moves ``beg`` up to ``buf`` and calls the start state.
        """
        header = "void reject(%s) {" % self.interface
        self.emiti(header)
        for statement in ("if (buf >= end) return;", "beg = buf;"):
            self.emit(statement)
        restart = "%s(%s);" % (self.state_name(self.cg.start), self.args)
        self.emit(restart)
        self.emitd("}", 2)

    def emit_switch(self, case, default=None):
        """Emit a C ``switch`` over the next input byte.

        case    -- mapping from input node to the state it leads to.
        default -- name of the C function to fall back on, or None.

        With no cases at all, only ``return <default>(...)`` is emitted (and
        nothing when *default* is falsy). Otherwise every case is rendered
        through self.trans_stmt, and a ``default:`` label is added only when
        *default* is the name of the start state.

        NOTE(review): any other *default* is left out of the switch
        entirely -- confirm this is intended.
        """
        if case:
            self.emiti("switch(*buf++) {")
            for input_node, target in case.iteritems():
                self.trans_stmt.emit(input_node, self.state_name(target))
            if default and default == self.state_name(self.cg.start):
                self.emit("default: return %s(%s);" % (default, self.args))
            self.emitd("}")
        elif default:
            self.emit("return %s(%s);" % (default, self.args))

    def emit_state(self, cur_state, transition):
        """Emit the C function implementing one DFA state.

        cur_state  -- the state to emit.
        transition -- dict mapping input nodes to successor states. It is
                      modified in place: end-of-line entries are added, and
                      special-rule and AnyChar entries are removed.

        An accepting state simply hands over to accept(). For any other
        state, special-rule inputs are emitted ahead of the switch, an
        AnyChar input selects the default target (the start state
        otherwise), and the remaining inputs become the switch cases.
        """
        self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface))

        if cur_state in self.cg.accepts:
            self.emit(   "return accept(beg, buf-1, end);")
            self.emitd("}", 2)
            return

        default = self.state_name(self.cg.start)

        # Every end-of-line input leads to reject().
        for eol in self.eols:
            transition[eol] = "reject"

        # Iterate over a snapshot of the keys: entries are popped inside the
        # loop, and changing a dict while iterating a live view of it raises
        # RuntimeError on Python 3.
        for input_ in list(transition.keys()):
            if type(input_) in self.special_rule:
                self.trans_stmt.emit(input_, self.state_name(transition.pop(input_)))
            elif type(input_) is AnyChar:
                default = self.state_name(transition.pop(input_))

        self.emit_switch(transition, default)

        self.emitd("}", 2)

    class _trans_stmt(ASTWalker):
        """Visitor that turns one transition (input node -> state) into C code.

        The constructor takes the callable used to write each generated line.
        """

        def __init__(self, emit):
            self._emit = emit
            self.args = "beg, buf, end"

        def emit(self, input_node, next_):
            """Generate the code for *input_node*, jumping to *next_*."""
            # The target is kept on the instance for the visit_* methods.
            self.next = next_
            input_node.accept(self)

        def visit(self, input_node):
            """Emit C comments only, naming the node that was not handled.

            Presumably the fallback ASTWalker uses when no specific visit_*
            method exists -- confirm against ASTWalker.
            """
            self._emit("/* UNKNOW RULE */")
            self._emit("/* %s */" % input_node.__repr__())

        def visit_Character(self, char):
            """Emit a ``case`` label for one character."""
            self._emit("case %d: /* match %s */" % (char.char, char))
            self._emit("  return %s(%s);" % (self.next, self.args))

        # Special Rule
        def visit_BegLine(self, begline):
            """Emit a test that only passes at the beginning of the line."""
            self._emit("/* begin of line  */")
            self._emit("if (buf == beg)")
            self._emit("  return %s(%s);" % (self.next, self.args), 2)

        def visit_Range(self, range):
            """Emit a bounds test for a character range.

            Nothing is emitted when exactly one bound is an MBCharacter; when
            both are, the range is passed to visit().

            NOTE(review): the bounds are rendered with '%s' inside C
            character quotes, whereas visit_Character renders ``.char`` with
            %d -- confirm ``.char`` yields a single character here.
            """
            # NOTE(review): MBCharacter is not imported at module level, so
            # it is imported locally on the assumption that it lives in
            # pyrect.regexp.ast alongside Character -- confirm.
            from pyrect.regexp.ast import MBCharacter

            lower_is_mb = isinstance(range.lower, MBCharacter)
            upper_is_mb = isinstance(range.upper, MBCharacter)

            # Exactly one multi-byte bound: emit nothing.
            if lower_is_mb != upper_is_mb:
                return

            if lower_is_mb:
                self.visit(range)
            else:
                self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char))
                self._emit("  return %s(beg, buf+1, end);" % self.next, 2)

def test():
    """Run doctest.testmod() with its default arguments."""
    from doctest import testmod
    testmod()


if __name__ == '__main__':
    test()