changeset 319:7b8234c090f7

bmSearch
author mir3636
date Sun, 08 May 2016 22:53:20 +0900
parents c9458ffecb87
children da02a7258d54
files regexParser/CharClass.cc regexParser/bmSearch.cc regexParser/bmSearch.h regexParser/grepWalk.cc regexParser/regexParser.h regexParser/threadedSearch.cc
diffstat 6 files changed, 91 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/regexParser/CharClass.cc	Sun May 08 11:56:49 2016 +0900
+++ b/regexParser/CharClass.cc	Sun May 08 22:53:20 2016 +0900
@@ -17,6 +17,7 @@
     cc->cond.w.word = NULL;
     cc->cond.w.length = 0;
     cc->cond.w.next = NULL;
+    cc->cond.w.bm = NULL;
     cc->left = left;
     cc->right = right;
     cc->nextState.bitContainer = state;
@@ -30,6 +31,7 @@
     cc->cond.w.word = ri->tokenValue;
     cc->cond.w.length = ri->ptr - ri->tokenValue;
     cc->cond.w.next = NULL;
+    cc->cond.w.bm = NULL;
     cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
     return cc;
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/regexParser/bmSearch.cc	Sun May 08 22:53:20 2016 +0900
@@ -0,0 +1,50 @@
+#include "regexPaser.h"
+#include "CharClass.h"
+
+/**
+ * if start node contains only words, Boyer-Moore Search can be used.
+ * if so skip table is created for each word. 
+*/
+
+static void
+create_BMskiptable(BMPtr bm,unsigned char *word,int len)
+{
+    bm->skip_table = (int*)malloc(sizeof(int)*256);
+    for (int i = 0; i < 256; ++i) {
+        bm->skip_table[i] = len;
+    }
+
+    for (int j = 0; j < len - 1; ++j) {
+        bm->skip_table[word[j]] = len - j - 1;
+    }
+}
+
+void checkBMSearch(CharaClassPtr cc) {
+
+    // first check there is no Chareclass range
+    CharClassWalkerPtr cw = createCharClassWalker(st->cc);
+    while (hasNext(cw)) {
+        CharClassPtr cc = getNext(cw);
+        if (cc->cond.w.word == NULL) {
+            free(cw);
+            return;
+        }
+    }
+    free(cw);
+
+    // make skip table for each word
+    cw = createCharClassWalker(st->cc);
+    while (hasNext(cw)) {
+        CharClassPtr cc = getNext(cw);
+        if (cc->cond.w.word) {
+            WordPtr w = &cc->cond.w;
+            while (w) {
+                BMPtr bm = NEW(BM);
+                cc->cond.w.bm = bm;
+                create_BMskiptable(bm,cc->cond.w.word,cc->cond.length);
+                w = w->next;
+            }
+        } 
+    }
+    free(cw);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/regexParser/bmSearch.h	Sun May 08 22:53:20 2016 +0900
@@ -0,0 +1,1 @@
+extern void checkBMSearch(CharaClassPtr cc);
--- a/regexParser/grepWalk.cc	Sun May 08 11:56:49 2016 +0900
+++ b/regexParser/grepWalk.cc	Sun May 08 22:53:20 2016 +0900
@@ -6,6 +6,7 @@
 #include "subsetConstruction.h"
 #include "CharClass.h"
 #include "threadedSearch.h"
+#include "bmSearch.h"
 
 StatePtr nextState(BitVector bi,TransitionGeneratorPtr tg) {
     // create tSearch in next state.
@@ -25,6 +26,7 @@
     *tg->stateStart = *tg->stateList;
     tg->stateStart->accept = false; // Start state never accept
     StatePtr state = tg->stateStart;
+    checkBMSearch(state->cc); 
 
 #if DEBUG
     TSValuePtr tsvp = &tsv;   // make tsv visible in lldb
--- a/regexParser/regexParser.h	Sun May 08 11:56:49 2016 +0900
+++ b/regexParser/regexParser.h	Sun May 08 22:53:20 2016 +0900
@@ -8,11 +8,18 @@
     unsigned long bitContainer;
 } BitVector,*BitVectorPtr;
 
+// skip table of Boyer-Moore Search
+typedef struct bm {
+    int* skip_table;
+    unsigned char *search_word;
+    int search_word_len;
+    struct bm *next;
+} BM, *BMPtr;
+
 typedef struct word {
     unsigned char *word;
     int length;
-    // look up table for BM search.
-    // BitVector nextState;
+    BMPtr bm;
     struct word *next;
 } Word, *WordPtr;
 
@@ -90,6 +97,7 @@
 typedef struct transitionGenerator {
     long totalStateCount;
     long totalBasicState;
+    long maxWordLen;
     StateStackPtr stack;
     StatePtr stateEnd;
     StatePtr stateStart;   // start state without accept flag
@@ -154,6 +162,7 @@
     unsigned char tokenType;
     unsigned char *tokenValue;
     int stateNumber;
+    long maxWordLen;
     bool wordMode;
 } RegexInfo, *RegexInfoPtr;
 
--- a/regexParser/threadedSearch.cc	Sun May 08 11:56:49 2016 +0900
+++ b/regexParser/threadedSearch.cc	Sun May 08 22:53:20 2016 +0900
@@ -20,6 +20,31 @@
         tsv.matchEnd = NULL;
     }
     tsv.matchBegin = tsv.buff.buffptr;  // next char may be matchBegin
+    // if possible use bmsearch
+    while (tsv.buff.buffptr < tsv.buff.buffend) {
+        long skip = tsv.tg->maxWordLen;
+        for (int k = 0; k < tsv.current->ccvSize; k++) {
+            CCVPtr ccv = &tsv.current->ccv[k];
+            if (ccv.w.word) {
+                int i = ccv.w.length - 1;
+                while (tsv.buff.buffptr[i] == ccv.w.word[i]) {
+                    if (i == 0) {
+                        if (ccv->tState) {
+                            tsv.current = ccv->tState;
+                         } else {
+                            tsv.current = nextTState(ccv->state,tsv.tg);
+                            ccv->tState = tsv.current;
+                        }
+                        tsv.buff.buffptr += ccv.w.length - 1;
+                        return tsv; 
+                    }
+                    --i;
+                }
+                skip = min(skip,max(ccv.w.bm->skip[tsv.buff.buffptr[i]],ccv.w.length - i));
+            }
+        }
+        tsv.buff.buffptr += skip;
+    }
     return tsv;
 }