# HG changeset patch # User Ryoma SHINYA # Date 1288888655 -32400 # Node ID a05baa7dc7ba620cbbc5fbb4c6d34343ee7d2551 # Parent 5ab54a732ddbd86f30cc56ba05d676c9dde10064 modify I/O routine. use mmap. it's really faster than fgets ;-) diff -r 5ab54a732ddb -r a05baa7dc7ba pyrect/jitgrep.py --- a/pyrect/jitgrep.py Thu Nov 04 22:04:34 2010 +0900 +++ b/pyrect/jitgrep.py Fri Nov 05 01:37:35 2010 +0900 @@ -62,7 +62,7 @@ return if opts.time : start_time = time.time() - reg = Regexp(string) + reg = Regexp(".*"+string) if cbc: grept = CbCGREPTranslator(reg) elif opts.label: diff -r 5ab54a732ddb -r a05baa7dc7ba pyrect/regexp/ast.py --- a/pyrect/regexp/ast.py Thu Nov 04 22:04:34 2010 +0900 +++ b/pyrect/regexp/ast.py Fri Nov 05 01:37:35 2010 +0900 @@ -113,15 +113,16 @@ return -1 class Character(InputNode): + import curses.ascii as ascii + ASCII = ascii.controlnames + \ + ["'"+chr(c)+"'" for c in range(33, 127)]\ + + ['DEL'] + [r"\x%x" % c for c in range(128, 256)] + def __init__(self, char): self.char = ord(char) def __str__(self): - if not self.char in range(33, 127): # not Ascii - c = r"\\x%x" % self.char - else: - c = chr(self.char) - return "'" + c + "'" + return self.ASCII[self.char] def __hash__(self): return self.char.__hash__() diff -r 5ab54a732ddb -r a05baa7dc7ba pyrect/translator/c_translator.py --- a/pyrect/translator/c_translator.py Thu Nov 04 22:04:34 2010 +0900 +++ b/pyrect/translator/c_translator.py Fri Nov 05 01:37:35 2010 +0900 @@ -19,6 +19,7 @@ if fa == "DFA": self.cg = regexp.dfacg self.debug = False + self.eols = (Character('\0'), Character('\n'), Character('\r')) self.special_rule = (Range, BegLine, MBCharacter) self.trans_stmt = self._trans_stmt(self.emit) @@ -143,24 +144,21 @@ else: default = "reject" - any_ = None - for input_ in transition.keys(): if type(input_) in self.special_rule: self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) elif type(input_) is AnyChar: - any_ = (input_, self.state_name(transition.pop(input_))) - default = None + default = self.state_name(transition.pop(input_)) if cur_state in self.cg.accepts: - eol = Character('\0') - transition[eol] = "accept" + for eol in self.eols: + transition[eol] = "accept" + elif default != "reject": + for eol in self.eols: + transition[eol] = "reject" self.emit_switch(transition, default) - if any_: - self.trans_stmt.emit(any_[0], any_[1]) - self.emitd("}", 2) def emit_initialization(self): @@ -196,7 +194,7 @@ self._emit("/* %s */" % input_node.__repr__()) def visit_Character(self, char): - self._emit("case %d: /* match %s */" % (char.char, chr(char.char))) + self._emit("case %d: /* match %s */" % (char.char, char)) self._emit(" return %s(s);" % self.next) def visit_EndLine(self, endline): @@ -231,11 +229,6 @@ self._emit("if ('%s' <= *s && *s <= '%s')" % (range.lower.char, range.upper.char)) self._emit(" return %s(s+1);" % self.next, 2) - def visit_AnyChar(self, anychar): - self._emit(r"if (*s != '\0')") - self._emit( "return %s(SKIP(s));" % self.next, 2) - self._emit("return reject(s);") - def test(): import doctest doctest.testmod() diff -r 5ab54a732ddb -r a05baa7dc7ba pyrect/translator/grep_translator.py --- a/pyrect/translator/grep_translator.py Thu Nov 04 22:04:34 2010 +0900 +++ b/pyrect/translator/grep_translator.py Fri Nov 05 01:37:35 2010 +0900 @@ -3,6 +3,7 @@ import os from c_translator import CTranslator from pyrect.regexp import Regexp, Analyzer +from pyrect.regexp.ast import ASTWalker, AnyChar, Character class GREPTranslateExeption(Exception): pass @@ -26,6 +27,8 @@ self.thread_dfa = 1 self.thread_line = 1 self.filter = True + self.interface = "UCHARP beg, UCHARP buf, UCHARP end" + self.args = "beg, buf, end" def getbufsize(self,): return self.__bufsize @@ -35,28 +38,26 @@ bufsize = property(getbufsize, setbufsize) def emit_initialization(self): - CTranslator.emit_initialization(self) - if self.thread_dfa > 1 and self.regexp.max_len != float("inf"): - self.emit("#define DFA paradfa") - self.emit("#define THREAD_NUM %d" % self.thread_dfa) - self.emit("#define REG_MAX_LEN %d" % self.regexp.max_len) - else: - self.emit("#define DFA dfa") - self.emit("#define THREAD_NUM 1 // no threading") - self.emit("#define REG_MAX_LEN -1") - + self.emit("#include ") self.emit("#define GREP grep") - self.emit("#define LINEBUFSIZE %d" % self.bufsize) - self.emit("#define READBUFSIZE %d" % self.bufsize) - self.emit('#include ') + self.emit("#define UCHARP unsigned char *") self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") + self.emit("#include ") self.emit("#include ") - self.emit("char readbuf[%d];" % (self.bufsize)) - self.emit("int dfa(unsigned char* s, int len);", 2) - self.emit("int paradfa(unsigned char* s, int len);", 2) + + self.emit_skip() - if self.filter and self.regexp.must_words: - self.emit_filter(self.regexp.must_words) + for state in self.cg.map.iterkeys(): + self.emit("void %s(%s);" % (self.state_name(state), self.interface)) + self.emit('void accept(%s);' % self.interface) + self.emit('void reject(%s);' % self.interface) + self.emit("void dfa(%s);" % self.interface, 2) + + #if self.filter and self.regexp.must_words: + # self.emit_filter(self.regexp.must_words) grepsource = open(self.BASE_DIR + "/template/grep.c") self.emit(grepsource.read()) @@ -72,15 +73,15 @@ if len(words) == 1: if len(key) == self.regexp.min_len: - self.emit("#define FILTER_ONLY 1", 1) + self.emit("#define MATCH (bm_filter(beg, buf, n-1))", 1) else: - self.emit("#define WITH_FILTER 1", 1) + self.emit("#define (bm_filter(beg, buf, n-1) && DFA(beg, buf, n-1))", 1) self.emit("#define FILTER bm_filter", 2) - self.emiti("int bm_filter(unsigned char* text, int n) {") + self.emiti("int bm_filter(unsigned char* buf, int n) {") l = len(key) if l == 1: - self.emit(" return (strchr(text, %d) != NULL)" % ord(key)) + self.emit(" return (strchr(buf, %d) != NULL)" % ord(key)) self.emitd("}", 2) return @@ -98,10 +99,10 @@ self.emit("int i = %d, j, k, len = %d;" % (l-1 ,l)) self.emit("unsigned char c, tail = %d; //'%c'" % (ord(key[l-1]), key[l-1]), 2) self.emiti("while (i < n) {") - self.emit( "c = text[i];") + self.emit( "c = buf[i];") self.emiti( "if (c == tail) {") self.emit( "j = len - 1; k = i;") - self.emiti( "while (key[--j] == text[--k]) {") + self.emiti( "while (key[--j] == buf[--k]) {") self.emit( "if (j == 0) return 1;") self.emitd( "}") self.emitd( "}") @@ -111,22 +112,104 @@ self.emitd("}", 2) def emit_driver(self): - self.emiti("int dfa(unsigned char *text, int len) {") - self.emit( "len++; //try match for n+1 times.") - self.emiti( "while (len--) {") - self.emit( "if (%s(text++)) return 1;" % self.state_name(self.cg.start)) - self.emitd( "}") - self.emit( "return 0;") + self.emiti("void dfa(%s) {" % self.interface) + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emit( "return;") self.emitd("}") return + def emit_accept_state(self): + self.emiti("void accept(%s) {" % self.interface) + self.emit( "buf--;") + self.emit( "UCHARP ret = (UCHARP)memchr(buf, '\\n', (buf - end));") + self.emit( 'if (ret == NULL) {fprintf(stderr, "memchr NULL err!"); exit(0);}') + self.emiti( "if (ret > end) {") + self.emit( "ret--;") + self.emit( "print_line(beg, ret);") + self.emit( "return;") + self.emitd( "}") + self.emit( "print_line(beg, ret);") + self.emit( "beg = buf = ret + 1;") + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emitd("}", 2) + + def emit_reject_state(self): + self.emiti("void reject(%s) {" % self.interface) + self.emit( "if (buf >= end) return;") + self.emit( "beg = buf;") + self.emit( "%s(%s);" % (self.state_name(self.cg.start), self.args)) + self.emitd("}", 2) + + def emit_switch(self, case, default=None): + if not case: + if default: + self.emit("return %s(%s);" % (default, self.args)) + return + self.emiti("switch(*buf++) {") + for case, next_ in case.iteritems(): + self.trans_stmt.emit(case, self.state_name(next_)) + if default: + if default == self.state_name(self.cg.start): + self.emit("default: return %s(%s);" % (default, self.args)) + self.emitd("}") + def emit_state(self, cur_state, transition): + self.emiti("void %s(%s) {" % (self.state_name(cur_state), self.interface)) + if cur_state in self.cg.accepts: - self.emiti("int %s(unsigned char* s) {" % self.state_name(cur_state)) - self.emit( "return accept(s);") - self.emitd("}") - else: - CTranslator.emit_state(self, cur_state, transition) + self.emit( "return accept(beg, buf-1, end);") + self.emitd("}", 2) + return + + default = self.state_name(self.cg.start) + for eol in self.eols: + transition[eol] = "reject" + + for input_ in transition.keys(): + if type(input_) in self.special_rule: + self.trans_stmt.emit(input_, self.state_name(transition.pop(input_))) + elif type(input_) is AnyChar: + default = self.state_name(transition.pop(input_)) + + self.emit_switch(transition, default) + + self.emitd("}", 2) + + class _trans_stmt(ASTWalker): + def __init__(self, emit): + self._emit = emit + self.args = "beg, buf, end" + + def emit(self, input_node, next_): + self.next = next_ + input_node.accept(self) + + def visit(self, input_node): + self._emit("/* UNKNOW RULE */") + self._emit("/* %s */" % input_node.__repr__()) + + def visit_Character(self, char): + self._emit("case %d: /* match %s */" % (char.char, char)) + self._emit(" return %s(%s);" % (self.next, self.args)) + + # Special Rule + def visit_BegLine(self, begline): + self._emit("/* begin of line */") + self._emit("if (buf == beg)") + self._emit(" return %s(%s);" % (self.next, self.args), 2) + + def visit_Range(self, range): + if isinstance(range.lower, MBCharacter) and not \ + isinstance(range.upper, MBCharacter) or \ + isinstance(range.upper, MBCharacter) and not \ + isinstance(range.lower, MBCharacter): + return + + if isinstance(range.lower, MBCharacter): + self.visit(range) + else: + self._emit("if ('%s' <= *buf && *buf <= '%s')" % (range.lower.char, range.upper.char)) + self._emit(" return %s(beg, buf+1, end);" % self.next, 2) def test(): import doctest diff -r 5ab54a732ddb -r a05baa7dc7ba pyrect/translator/template/grep.c --- a/pyrect/translator/template/grep.c Thu Nov 04 22:04:34 2010 +0900 +++ b/pyrect/translator/template/grep.c Fri Nov 05 01:37:35 2010 +0900 @@ -1,3 +1,4 @@ +/* typedef struct _thread_arg { unsigned char *buf; int len; @@ -38,92 +39,60 @@ return 0; } -int paragrep(char *regexp, FILE *f, char *name) { - int nmatch, used_buf = 0, - reading = 1; - char lbuf[THREAD_NUM][LINEBUFSIZE]; - pthread_t hundle[THREAD_NUM]; - thread_arg_t targ[THREAD_NUM]; - int i, j, n, m; - do { - for (i = 0; i < THREAD_NUM; i++) { - if (fgets(lbuf[i], sizeof lbuf[i], f) == NULL) { - reading = 0; - break; - } else { - n = strlen(lbuf[i]); - if (n > 0 && lbuf[i][n-1] == '\n') - lbuf[i][n-1] = '\0'; - targ[i].buf = (unsigned char *)lbuf[i]; - targ[i].len = n - 1; - pthread_create(&hundle[i], NULL, (void *)thread_dfa, (void *)&targ[i]); - } - } - for (j = 0; j < i; j++) { - pthread_join(hundle[j], NULL); - if (targ[j].match) { - nmatch++; - if (name != NULL) - printf("%s:", name); - printf("%s\n", targ[j].buf); - } - } - } while (i != 0); - return nmatch; +*/ + +void print_line(unsigned char *beg, unsigned char *end) { + fwrite(beg, sizeof(char), (end - beg + 1), stdout); } -int grep(char *regexp, FILE *f, char *name) { - int n, nmatch; - char lbuf[LINEBUFSIZE]; - buf = (unsigned char *)lbuf; - nmatch = 0; - while (fgets(lbuf, sizeof lbuf, f) != NULL) { - n = strlen(lbuf); - if (n > 0 && buf[n-1] == '\n') - lbuf[n-1] = '\0'; +void grep(char *regexp, int fd, char *name) { + caddr_t file_mmap; + unsigned char *buf, *end, *beg; + off_t size; + struct stat sb; + + if (fstat(fd, &sb)) { + fprintf(stderr, "can't fstat %s\n", name); + exit(0); + } -#ifdef FILTER_ONLY - if (FILTER(buf, n-1)) { -#elif defined(WITH_FILTER) - if (FILTER(buf, n-1) && DFA(buf, n-1)) { -#else - if (DFA(buf, n-1)) { -#endif - nmatch++; - if (name != NULL) - printf("%s:", name); - printf("%s\n", lbuf); - } + size = sb.st_size; + file_mmap = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, (off_t)0); + + if (file_mmap == (caddr_t)-1) { + fprintf(stderr, "can't mmap %s\n", name); + exit(0); } - return nmatch; + + beg = buf = (unsigned char *) file_mmap; + end = beg + size - 1; + + dfa(beg, beg, end); + + munmap(file_mmap, size); + return; } int main(int argc, char* argv[]) { - int i, nmatch; - FILE *f; + int i, fd; if (argc < 2) { fprintf(stderr, "usage: grep regexp [file ...]"); exit(0); } - nmatch = 0; if (argc == 2) { - if (GREP(argv[1], stdin, NULL)) - nmatch++; + GREP(argv[1], stdin, NULL); } else { for (i = 2; i < argc; i++) { - f = fopen(argv[i], "r"); - if (f == NULL) { + fd = open(argv[i], O_RDONLY, 0666); + if (fd == 0) { fprintf(stderr, "can't open %s:", argv[i]); continue; } - if (READBUFSIZE > 0) - setvbuf(f, readbuf, _IOFBF, READBUFSIZE); - if (GREP(argv[1], f, argc > 3 ? argv[i] : NULL) > 0) - nmatch++; - fclose(f); + GREP(argv[1], fd, argc > 3 ? argv[i] : NULL); + close(fd); } } - return nmatch; + return 0; }