# HG changeset patch # User mir3636 # Date 1462715600 -32400 # Node ID 7b8234c090f77b288e2f7fdd078faf15e1f1815b # Parent c9458ffecb875614cabc6ddc73b27847c822e13a bmSearch diff -r c9458ffecb87 -r 7b8234c090f7 regexParser/CharClass.cc --- a/regexParser/CharClass.cc Sun May 08 11:56:49 2016 +0900 +++ b/regexParser/CharClass.cc Sun May 08 22:53:20 2016 +0900 @@ -17,6 +17,7 @@ cc->cond.w.word = NULL; cc->cond.w.length = 0; cc->cond.w.next = NULL; + cc->cond.w.bm = NULL; cc->left = left; cc->right = right; cc->nextState.bitContainer = state; @@ -30,6 +31,7 @@ cc->cond.w.word = ri->tokenValue; cc->cond.w.length = ri->ptr - ri->tokenValue; cc->cond.w.next = NULL; + cc->cond.w.bm = NULL; cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue; return cc; } diff -r c9458ffecb87 -r 7b8234c090f7 regexParser/bmSearch.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexParser/bmSearch.cc Sun May 08 22:53:20 2016 +0900 @@ -0,0 +1,50 @@ +#include "regexPaser.h" +#include "CharClass.h" + +/** + * if start node contains only words, Boyer-Moore Search can be used. + * if so skip table is created for each word. +*/ + +static void +create_BMskiptable(BMPtr bm,unsigned char *word,int len) +{ + bm->skip_table = (int*)malloc(sizeof(int)*256); + for (int i = 0; i < 256; ++i) { + bm->skip_table[i] = len; + } + + for (int j = 0; j < len - 1; ++j) { + bm->skip_table[word[j]] = len - j - 1; + } +} + +void checkBMSearch(CharaClassPtr cc) { + + // first check there is no Chareclass range + CharClassWalkerPtr cw = createCharClassWalker(st->cc); + while (hasNext(cw)) { + CharClassPtr cc = getNext(cw); + if (cc->cond.w.word == NULL) { + free(cw); + return; + } + } + free(cw); + + // make skip table for each word + cw = createCharClassWalker(st->cc); + while (hasNext(cw)) { + CharClassPtr cc = getNext(cw); + if (cc->cond.w.word) { + WordPtr w = &cc->cond.w; + while (w) { + BMPtr bm = NEW(BM); + cc->cond.w.bm = bm; + create_BMskiptable(bm,cc->cond.w.word,cc->cond.length); + w = w->next; + } + } + } + free(cw); +} diff -r c9458ffecb87 -r 7b8234c090f7 regexParser/bmSearch.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexParser/bmSearch.h Sun May 08 22:53:20 2016 +0900 @@ -0,0 +1,1 @@ +extern void checkBMSearch(CharaClassPtr cc); diff -r c9458ffecb87 -r 7b8234c090f7 regexParser/grepWalk.cc --- a/regexParser/grepWalk.cc Sun May 08 11:56:49 2016 +0900 +++ b/regexParser/grepWalk.cc Sun May 08 22:53:20 2016 +0900 @@ -6,6 +6,7 @@ #include "subsetConstruction.h" #include "CharClass.h" #include "threadedSearch.h" +#include "bmSearch.h" StatePtr nextState(BitVector bi,TransitionGeneratorPtr tg) { // create tSearch in next state. @@ -25,6 +26,7 @@ *tg->stateStart = *tg->stateList; tg->stateStart->accept = false; // Start state never accept StatePtr state = tg->stateStart; + checkBMSearch(state->cc); #if DEBUG TSValuePtr tsvp = &tsv; // make tsv visible in lldb diff -r c9458ffecb87 -r 7b8234c090f7 regexParser/regexParser.h --- a/regexParser/regexParser.h Sun May 08 11:56:49 2016 +0900 +++ b/regexParser/regexParser.h Sun May 08 22:53:20 2016 +0900 @@ -8,11 +8,18 @@ unsigned long bitContainer; } BitVector,*BitVectorPtr; +// skip table of Boyer-Moore Search +typedef struct bm { + int* skip_table; + unsigned char *search_word; + int search_word_len; + struct bm *next; +} BM, *BMPtr; + typedef struct word { unsigned char *word; int length; - // look up table for BM search. - // BitVector nextState; + BMPtr bm; struct word *next; } Word, *WordPtr; @@ -90,6 +97,7 @@ typedef struct transitionGenerator { long totalStateCount; long totalBasicState; + long maxWordLen; StateStackPtr stack; StatePtr stateEnd; StatePtr stateStart; // start state without accept flag @@ -154,6 +162,7 @@ unsigned char tokenType; unsigned char *tokenValue; int stateNumber; + long maxWordLen; bool wordMode; } RegexInfo, *RegexInfoPtr; diff -r c9458ffecb87 -r 7b8234c090f7 regexParser/threadedSearch.cc --- a/regexParser/threadedSearch.cc Sun May 08 11:56:49 2016 +0900 +++ b/regexParser/threadedSearch.cc Sun May 08 22:53:20 2016 +0900 @@ -20,6 +20,31 @@ tsv.matchEnd = NULL; } tsv.matchBegin = tsv.buff.buffptr; // next char may be matchBegin + // if possible use bmsearch + while (tsv.buff.buffptr < tsv.buff.buffend) { + long skip = tsv.tg->maxWordLen; + for (int k = 0; k < tsv.current->ccvSize; k++) { + CCVPtr ccv = &tsv.current->ccv[k]; + if (ccv.w.word) { + int i = ccv.w.length - 1; + while (tsv.buff.buffptr[i] == ccv.w.word[i]) { + if (i == 0) { + if (ccv->tState) { + tsv.current = ccv->tState; + } else { + tsv.current = nextTState(ccv->state,tsv.tg); + ccv->tState = tsv.current; + } + tsv.buff.buffptr += ccv.w.length - 1; + return tsv; + } + --i; + } + skip = min(skip,max(ccv.w.bm->skip[tsv.buff.buffptr[i]],ccv.w.length - i)); + } + } + tsv.buff.buffptr += skip; + } return tsv; }