Mercurial > hg > Members > shinya > pyrect
view pyrect/translator/grep_translator.py @ 73:a6a0504dea7b
modify bm-filter's implimentation. table-lookup -> switch. it's more simple and beautiful.
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 08 Nov 2010 02:19:18 +0900 |
parents | 8b9c3a924744 |
children | 2ad03c243524 e06786b3c2dc |
line wrap: on
line source
#!/usr/bin/env python import os from c_translator import CTranslator from pyrect.regexp import Regexp from pyrect.regexp.ast import ASTWalker, AnyChar, Character class GREPTranslateExeption(Exception): pass class GREPTranslator(CTranslator): """GREPTranslator This Class can translate form DFA into grep source-code. which based on (beautiful) mini-grep introduced \"The Practice of Programming\" written by Rob Pike & Brian W. Kernighan. (see template/grep.c) >>> string = \"(build|fndecl|gcc)\" >>> reg = Regexp(string) >>> tje = GREPTranslator(reg) >>> tje.translate() """ BASE_DIR = os.path.dirname(os.path.abspath(__file__)) def __init__(self, regexp): CTranslator.__init__(self, regexp, fa="DFA") self.__bufsize = 1024 * 1024 self.thread_dfa = 1 self.thread_line = 1 self.filter = True self.filter_only = False self.filter_prefix = False self.skip_boost = True self.start = "matcher" self.interface = "UCHARP beg, UCHARP buf, UCHARP end" self.args = "beg, buf, end" def getbufsize(self,): return self.__bufsize def setbufsize(self, bufsize): self.__bufsize = abs(bufsize) bufsize = property(getbufsize, setbufsize) def emit_initialization(self): self.emit("#include <stdio.h>") self.emit("#define GREP grep") self.emit("#define UCHAR unsigned char") self.emit("#define UCHARP unsigned char *") self.emit("#include <stdlib.h>") self.emit("#include <sys/mman.h>") self.emit("#include <sys/types.h>") self.emit("#include <sys/stat.h>") self.emit("#include <fcntl.h>") self.emit("#include <string.h>") self.emit('void reject(%s);' % self.interface) self.emit("void matcher(%s);" % self.interface, 2) self.emit('void accept(%s);' % self.interface) key = None if self.filter and self.regexp.must_words: key = max(self.regexp.must_words, key=len) if len(self.regexp.must_words) == 1 and len(key) == self.regexp.min_len: self.filter_only = True else: self.filter = False if not self.filter_only: for state in self.cg.map.iterkeys(): self.emit("void %s(%s);" % (self.state_name(state), self.interface)) self.emit() if self.filter: self.emit_filter(key) if self.skip_boost and not self.filter_only and \ not AnyChar() in self.regexp.chars and \ self.regexp.min_len > 2: self.emit_booster(self.regexp.min_len, self.regexp.chars) else: self.skip_boost = False grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) def emit_filter(self, key): l = len(key) def emit_next(): if self.filter_only: self.emit("accept(%s);" % self.args) elif self.filter_prefix: self.emit("buf -= %d;" % l) self.emit("%s(%s);" % (self.state_name(self.cg.start) ,self.args)) else: self.emit("beg = get_line_beg(buf, beg);") self.emit("buf = beg;") self.emit("%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emit("UCHARP get_line_beg(UCHARP p, UCHARP beg);", 2) self.emiti("void bm_filter(%s) {" % self.interface) l = len(key) if l == 1: self.emit("buf = memchr(buf, %d, (end - buf));" % ord(key)) self.emit("if (buf == NULL) return;") emit_next() self.emitd("}", 2) return self.emit('static UCHAR key[] = "%s";' % key) skip = dict() for i in range(l-1): skip[key[i]] = l-1-i self.emit("UCHARP tmp1, *tmp2;", 2) self.emit("int i; buf += %d;" % (l-1)) self.emiti("while (buf < end) {") self.emiti( "if (*buf == %d /*'%c'*/) {" % (ord(key[l-1]), key[l-1])) self.emit( "tmp1 = key+%d; tmp2 = buf;" % (l-1)) self.emit( "while (*(--tmp1) == *(--tmp2)) {") self.emit( "if (tmp1 == key) goto next;") self.emitd( "}") self.emitd( "}") self.emiti( "switch(*buf) {") for k, v in skip.iteritems(): self.emiti( "case %d: /* %s */" % (ord(k), Character.ascii(k))) self.emit( "buf += %d; break;" % v), self.dedent() self.emiti("default: buf += %d;" % l), self.dedent() self.emitd( "}") self.emitd("}") self.emit( "return;") self.emit( "next:") emit_next() self.emitd("}", 2) def emit_booster(self, min_len, chars): self.emiti("void booster(%s) {" % self.interface) self.emiti( "do {") self.emit( "if (buf > end) return;") self.emiti( "switch (*(buf+%d)) {" % (min_len-1)) for c in chars: self.emit( "case %d: /* %s */" % (ord(c), Character.ascii(c))) self.emit( "goto ret;") self.emitd( "}") self.emitd( "} while(buf += %d);" % (min_len-1)) self.emit( "ret: return %s(%s);" % (self.state_name(self.cg.start) , self.args)) self.emitd("}", 2) def emit_driver(self): self.emiti("void matcher(%s) {" % self.interface) if self.filter: self.emit( "%s(%s);" % ("bm_filter", self.args)) else: self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) self.emit( "return;") self.emitd("}") return def emit_accept_state(self): self.emiti("void accept(%s) {" % self.interface) self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") if self.skip_boost or self.filter: self.emit( "beg = get_line_beg(buf, beg);") self.emit( 'if (ret == NULL) ret = end;') self.emiti( "if (ret > end) {") self.emit( "print_line(beg, end);") self.emit( "return;") self.emitd( "}") self.emit( "print_line(beg, ret);") self.emit( "beg = buf = ret + 1;") self.emit( "%s(%s);" % (self.start, self.args)) self.emitd("}", 2) def emit_reject_state(self): self.emiti("void reject(%s) {" % self.interface) self.emit( "if (buf >= end) return;") self.emit( "beg = buf;") self.emit( "return %s(%s);" % (self.start, self.args)) self.emitd("}", 2) def emit_switch(self, case, default=None): if not case: if default: self.emit("return %s(%s);" % (default, self.args)) return self.emiti("switch(*buf++) {") for case, next_ in case.iteritems(): self.trans_stmt.emit(case, self.state_name(next_)) if default == self.state_name(self.cg.start) and self.skip_boost: self.emit("default: return booster(%s);" % self.args) else: self.emit("default: return %s(%s);" % (default, self.args)) self.emitd("}") def emit_state(self, cur_state, transition): if self.filter_only: return self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface)) if cur_state in self.cg.accepts: self.emit( "return accept(beg, buf-1, end);") self.emitd("}", 2) return default = self.state_name(self.cg.start) for eol in self.eols: transition[eol] = "reject" for input_ in transition.keys(): if type(input_) in self.special_rule: self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) elif type(input_) is AnyChar: default = self.state_name(transition.pop(input_)) self.emit_switch(transition, default) self.emitd("}", 2) class _trans_stmt(ASTWalker): def __init__(self, emit): self._emit = emit self.args = "beg, buf, end" def emit(self, input_node, next_): self.next = next_ input_node.accept(self) def visit(self, input_node): self._emit("/* UNKNOW RULE */") self._emit("/* %s */" % input_node.__repr__()) def visit_Character(self, char): self._emit("case %d: /* %s */" % (char.char, char)) self._emit(" return %s(%s);" % (self.next, self.args)) # Special Rule def visit_BegLine(self, begline): self._emit("/* begin of line */") self._emit("if (buf == beg)") self._emit(" return %s(%s);" % (self.next, self.args), 2) def visit_Range(self, range): if isinstance(range.lower, MBCharacter) and not \ isinstance(range.upper, MBCharacter) or \ isinstance(range.upper, MBCharacter) and not \ isinstance(range.lower, MBCharacter): return if isinstance(range.lower, MBCharacter): self.visit(range) else: self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) self._emit(" return %s(beg, buf+1, end);" % self.next, 2) def test(): import doctest doctest.testmod() if __name__ == '__main__': test()