Mercurial > hg > Members > shinya > pyrect
view pyrect/translator/goto_grep_translator.py @ 95:f42e37e2fe93
implement table-lookup at booster.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 06 Dec 2010 05:02:15 +0900 |
parents | 492f543703d5 |
children |
line wrap: on
line source
#!/usr/bin/env python import os from c_translator import CTranslator from pyrect.regexp import Regexp from pyrect.regexp.ast import ASTWalker, AnyChar, Character class GREPTranslateExeption(Exception): pass class GOTOGREPTranslator(CTranslator): """GREPTranslator This Class can translate form DFA into grep source-code. which based on (beautiful) mini-grep introduced \"The Practice of Programming\" written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(hoge|fuga|piyo)\" >>> reg = Regexp(string) >>> tje = GOTOGREPTranslator(reg) >>> tje.translate() """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) def __init__(self, regexp): CTranslator.__init__(self, regexp, fa="DFA") self.__bufsize = 1024 * 1024 self.thread_dfa = 1 self.thread_line = 1 self.filter = "quick" self.filter_only = False self.filter_prefix = False self.filter_key = "" self.skip_boost = True self.table_lookup = False self.start = self.cg.start self.interface = "UCHARP beg, UCHARP buf, UCHARP end" self.args = "beg, buf, end" self.thread_interface = "UCHARP beg, UCHARP buf, UCHARP end, thread_arg_t *targ" self.thread_args = "beg, buf, end, targ" def getbufsize(self,): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize) bufsize = property(getbufsize, setbufsize) def state_name(self, name): if name in ["accept", "reject", "matcher", "entry"\ , "bmh_filter", "quick_filter", "booster"]: return name else: return "state_"+str(name) def emit_goto(self, state, n=1): self.emit("goto %s;" % self.state_name(state), n) def emit_from_callgraph(self): # self.emit C-source code self.emit_initialization() self.emiti("void matcher(%s) {" % self.interface) self.emit_driver() for cur_state, transition in self.cg.map.iteritems(): self.emit_state(cur_state, transition) self.emit_accept_state() self.emit_reject_state() self.emit("return;") self.demit("}", 1) def emit_initialization(self): self.emit("#include <stdio.h>") self.emit("#include <stdlib.h>") self.emit("#include <sys/mman.h>") self.emit("#include <sys/types.h>") self.emit("#include <sys/stat.h>") self.emit("#include <fcntl.h>") self.emit("#include <unistd.h>") self.emit("#include <string.h>", 1) self.emit("typedef unsigned char UCHAR;") self.emit("typedef unsigned char *UCHARP;") self.emit("void matcher(%s);" % self.interface); self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);") self.emit("void print_line(UCHARP buf, UCHARP beg);", 2) if (self.filter == "bmh" or self.filter == "quick")\ and self.regexp.must_words: self.filter_key = max(self.regexp.must_words, key=len) if len(self.regexp.must_words) == 1 and len(self.filter_key) == self.regexp.min_len: self.filter_only = True if self.filter == "bmh": self.start = "bmh_filter" else: self.start = "quick_filter" else: self.filter = False if self.skip_boost and not self.filter_only and \ not AnyChar() in self.regexp.chars and \ self.regexp.min_len > 2: pass else: self.skip_boost = False grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) def emit_bmh_filter(self, key): l = len(key) self.demiti("bmh_filter:") def emit_next(): if self.filter_only: self.emit_goto("accept") elif self.filter_prefix: self.emit("buf++;") self.emit_goto(self.cg.start) else: self.emit("beg = get_line_beg(buf, beg);") self.emit("buf = beg;") self.emit_goto(self.cg.start) l = len(key) if l == 1: self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) self.emit("if (buf == NULL) return;") emit_next() self.demit("", 1) return skip = dict() for i in range(l-1): skip[key[i]] = l-1-i self.emit("buf += %d;" % (l-1), 1) self.emiti("while (buf < end) {") self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[-1]), Character.ascii(key[-1]))) self.emit( "tmp1 = buf, tmp2 = (UCHARP)key+%d;" % (l-1)) self.emiti( "while (*(--tmp1) == *(--tmp2)) {") self.emiti( "if (tmp2 == key) {") emit_next() self.demit( "}") self.demit( "}") self.demit( "}") self.emiti( "switch(*buf) {") for k, v in skip.iteritems(): self.demiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) self.emit( "buf += %d; break;" % v) self.emiti("default: buf += %d;" % l) self.demit( "}") self.demit("}") self.emit( "return;") self.demit("", 1) def emit_quick_filter(self, key): l = len(key) self.demiti("quick_filter:") def emit_next(): if self.filter_only: self.emit_goto("accept") elif self.filter_prefix: self.emit("buf+%d;" % l) self.emit_goto(self.cg.start) else: self.emit("beg = get_line_beg(buf, beg);") self.emit("buf = beg;") self.emit_goto(self.cg.start) l = len(key) if l == 1: self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) self.emit("if (buf == NULL) return;") emit_next() self.demit("", 1) return skip = dict() for i in range(l): skip[key[i]] = l-i self.emit("end_ = end - %d;" % (l-1), 1) self.emiti("while (buf < end_) {") self.emiti( "if (*buf == %d /* %s */) {" % (ord(key[0]), Character.ascii(key[0]))) self.emit( "tmp1 = buf, tmp2 = (UCHARP)key;") self.emiti( "while (*(++tmp1) == *(++tmp2)){") self.emiti( "if (tmp2 == key+%d) {" % (l-1)) emit_next() self.demit( "}") self.demit( "}") self.demit( "}") self.emiti( "switch(buf[%d]) {" % l) for k, v in skip.iteritems(): self.demiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) self.emit( "buf += %d; break;" % v) self.emiti("default: buf += %d;" % (l+1)) self.demit( "}") self.demit("}") self.emit( "return;") self.demit("", 1) def emit_booster(self, min_len, chars): self.demiti("booster:") self.emit( "end_ = end - %d;" % (min_len-1)) self.emit( "if (buf > end_) return;") self.emiti( "do {") self.emiti( "switch (buf[%d]) {" % (min_len-1)) for c in chars: self.demiti( "case %d: /* %s */" % (ord(c), Character.ascii(c))) self.emit_goto(self.cg.start) self.demit( "}") self.demit( "} while((buf += %d) <= end_);" % min_len) self.emit("return;") self.emit("", 1) def emit_driver(self): self.emit("UCHARP end_, ret;", 2) if self.skip_boost: self.emit_booster(self.regexp.min_len, self.regexp.chars) if self.filter: self.emit("UCHARP tmp1, tmp2; static const UCHAR key[] = \"%s\";" % self.filter_key) self.emit_goto(self.filter + "_filter", 2) if self.filter == "bmh": self.emit_bmh_filter(self.filter_key) else: self.emit_quick_filter(self.filter_key) else: self.emit_goto(self.cg.start) return def emit_accept_state(self): self.demiti("accept:") self.emit( "ret = (UCHARP)memchr(buf, '\\n', (buf - end));") if self.skip_boost or self.filter: self.emit( "beg = get_line_beg(buf, beg);") self.emiti( "if (ret == NULL) {") self.emit( "print_line(beg, end);") self.emit( "return;") self.demit( "}") self.emit( "print_line(beg, ret);") self.emit( "beg = buf = ret + 1;") self.emit_goto(self.start) self.demit("", 1) def emit_reject_state(self): self.demiti("reject:") self.emit( "if (buf >= end) return;") self.emit( "beg = buf;") self.emit_goto(self.start) self.demit("") def emit_switch(self, case, default=None): if not case: if default: self.emit_goto(default) return self.emiti("switch(*buf++) {") for case, next_ in case.iteritems(): self.demiti("case %d: /* %s */" % (case.char, case)) self.emit_goto(next_) if default == self.cg.start and self.skip_boost: self.demiti("default: ") self.emit_goto("booster") else: self.demiti("default: ") self.emit_goto(default) self.demit("}") def emit_state(self, cur_state, transition): if self.filter_only: return self.demiti("%s:" % self.state_name(cur_state)) if cur_state in self.cg.accepts: self.emit( "buf--;") self.emit_goto("accept") self.demit("", 1) return if transition.has_key(AnyChar()): default = transition.pop(AnyChar()) else: default = self.cg.start if self.table_lookup and (cur_state == self.cg.start or \ cur_state == default): if self.skip_boost and default == self.cg.start: default = "booster" default = self.state_name(default) tbl = dict() for eol in self.eols: tbl[eol.char] = "reject" for c, n in transition.iteritems(): tbl[c.char] = self.state_name(n) self.emit( ";static const void *%s_table[256] = {[0 ... 255] &&%s, %s};" % (self.state_name(cur_state), default, ", ".join(["[%d] &&%s" % (i, s) for (i, s) in tbl.items()]))) self.emit( "goto *%s_table[*buf++];" % self.state_name(cur_state)) self.demit("", 1) return for eol in self.eols: transition[eol] = "reject" for input_ in transition.keys(): if type(input_) in self.special_rule: self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) self.emit_switch(transition, default) self.demit("", 1) def test(): import doctest doctest.testmod() if __name__ == '__main__': test()