Mercurial > hg > Members > shinya > pyrect
diff pyrect/translator/grep_translator.py @ 62:a05baa7dc7ba
Modify the I/O routine to use mmap; it's really faster than fgets ;-)
author | Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Fri, 05 Nov 2010 01:37:35 +0900 |
parents | fd3d0b8326fe |
children | c981dc66b258 bee3a64d6cbc b02b321d0e06 |
line wrap: on
line diff
--- a/pyrect/translator/grep_translator.py Thu Nov 04 22:04:34 2010 +0900 +++ b/pyrect/translator/grep_translator.py Fri Nov 05 01:37:35 2010 +0900 @@ -3,6 +3,7 @@ import os from c_translator import CTranslator from pyrect.regexp import Regexp, Analyzer +from pyrect.regexp.ast import ASTWalker, AnyChar, Character class GREPTranslateExeption(Exception): pass @@ -26,6 +27,8 @@ self.thread_dfa = 1 self.thread_line = 1 self.filter = True + self.interface = "UCHARP beg, UCHARP buf, UCHARP end" + self.args = "beg, buf, end" def getbufsize(self,): return self.__bufsize @@ -35,28 +38,26 @@ bufsize = property(getbufsize, setbufsize) def emit_initialization(self): - CTranslator.emit_initialization(self) - if self.thread_dfa > 1 and self.regexp.max_len != float("inf"): - self.emit("#define DFA paradfa") - self.emit("#define THREAD_NUM %d" % self.thread_dfa) - self.emit("#define REG_MAX_LEN %d" % self.regexp.max_len) - else: - self.emit("#define DFA dfa") - self.emit("#define THREAD_NUM 1 // no threading") - self.emit("#define REG_MAX_LEN -1") - + self.emit("#include <stdio.h>") self.emit("#define GREP grep") - self.emit("#define LINEBUFSIZE %d" % self.bufsize) - self.emit("#define READBUFSIZE %d" % self.bufsize) - self.emit('#include <pthread.h>') + self.emit("#define UCHARP unsigned char *") self.emit("#include <stdlib.h>") + self.emit("#include <sys/mman.h>") + self.emit("#include <sys/types.h>") + self.emit("#include <sys/stat.h>") + self.emit("#include <fcntl.h>") self.emit("#include <string.h>") - self.emit("char readbuf[%d];" % (self.bufsize)) - self.emit("int dfa(unsigned char* s, int len);", 2) - self.emit("int paradfa(unsigned char* s, int len);", 2) + + self.emit_skip() - if self.filter and self.regexp.must_words: - self.emit_filter(self.regexp.must_words) + for state in self.cg.map.iterkeys(): + self.emit("void %s(%s);" % (self.state_name(state), self.interface)) + self.emit('void accept(%s);' % self.interface) + self.emit('void reject(%s);' % self.interface) + 
self.emit("void dfa(%s);" % self.interface, 2) + + #if self.filter and self.regexp.must_words: + # self.emit_filter(self.regexp.must_words) grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) @@ -72,15 +73,15 @@ if len(words) == 1: if len(key) == self.regexp.min_len: - self.emit("#define FILTER_ONLY 1", 1) + self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1) else: - self.emit("#define WITH_FILTER 1", 1) + self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1) self.emit("#define FILTER bm_filter", 2) - self.emiti("int bm_filter(unsigned char* text, int n) {") + self.emiti("int bm_filter(unsigned char* buf, int n) {") l = len(key) if l == 1: - self.emit(" return (strchr(text, %d) != NULL)" % ord(key)) + self.emit(" return (strchr(buf, %d) != NULL)" % ord(key)) self.emitd("}", 2) return @@ -98,10 +99,10 @@ self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l)) self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2) self.emiti("while (i < n) {") - self.emit( "c = text[i];") + self.emit( "c = buf[i];") self.emiti( "if (c == tail) {") self.emit( "j = len - 1; k = i;") - self.emiti( "while (key[--j] == text[--k]) {") + self.emiti( "while (key[--j] == buf[--k]) {") self.emit( "if (j == 0) return 1;") self.emitd( "}") self.emitd( "}") @@ -111,22 +112,104 @@ self.emitd("}", 2) def emit_driver(self): - self.emiti("int dfa(unsigned char *text, int len) {") - self.emit( "len++; //try match for n+1 times.") - self.emiti( "while (len--) {") - self.emit( "if (%s(text++)) return 1;" % self.state_name(self.cg.start)) - self.emitd( "}") - self.emit( "return 0;") + self.emiti("void dfa(%s) {" % self.interface) + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emit( "return;") self.emitd("}") return + def emit_accept_state(self): + self.emiti("void accept(%s) {" % self.interface) + self.emit( "buf--;") + self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") + self.emit( 
'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}') + self.emiti( "if (ret > end) {") + self.emit( "ret--;") + self.emit( "print_line(beg, ret);") + self.emit( "return;") + self.emitd( "}") + self.emit( "print_line(beg, ret);") + self.emit( "beg = buf = ret + 1;") + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emitd("}", 2) + + def emit_reject_state(self): + self.emiti("void reject(%s) {" % self.interface) + self.emit( "if (buf >= end) return;") + self.emit( "beg = buf;") + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emitd("}", 2) + + def emit_switch(self, case, default=None): + if not case: + if default: + self.emit("return %s(%s);" % (default, self.args)) + return + self.emiti("switch(*buf++) {") + for case, next_ in case.iteritems(): + self.trans_stmt.emit(case, self.state_name(next_)) + if default: + if default == self.state_name(self.cg.start): + self.emit("default: return %s(%s);" % (default, self.args)) + self.emitd("}") + def emit_state(self, cur_state, transition): + self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface)) + if cur_state in self.cg.accepts: - self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state)) - self.emit( "return accept(s);") - self.emitd("}") - else: - CTranslator.emit_state(self, cur_state, transition) + self.emit( "return accept(beg, buf-1, end);") + self.emitd("}", 2) + return + + default = self.state_name(self.cg.start) + for eol in self.eols: + transition[eol] = "reject" + + for input_ in transition.keys(): + if type(input_) in self.special_rule: + self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) + elif type(input_) is AnyChar: + default = self.state_name(transition.pop(input_)) + + self.emit_switch(transition, default) + + self.emitd("}", 2) + + class _trans_stmt(ASTWalker): + def __init__(self, emit): + self._emit = emit + self.args = "beg, buf, end" + + def emit(self, input_node, next_): + 
self.next = next_ + input_node.accept(self) + + def visit(self, input_node): + self._emit("/* UNKNOW RULE */") + self._emit("/* %s */" % input_node.__repr__()) + + def visit_Character(self, char): + self._emit("case %d: /* match %s */" % (char.char, char)) + self._emit(" return %s(%s);" % (self.next, self.args)) + + # Special Rule + def visit_BegLine(self, begline): + self._emit("/* begin of line */") + self._emit("if (buf == beg)") + self._emit(" return %s(%s);" % (self.next, self.args), 2) + + def visit_Range(self, range): + if isinstance(range.lower, MBCharacter) and not \ + isinstance(range.upper, MBCharacter) or \ + isinstance(range.upper, MBCharacter) and not \ + isinstance(range.lower, MBCharacter): + return + + if isinstance(range.lower, MBCharacter): + self.visit(range) + else: + self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) + self._emit(" return %s(beg, buf+1, end);" % self.next, 2) def test(): import doctest