changeset 248:2b1fbfb92d54

implement tSearch
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Fri, 22 Jan 2016 20:09:42 +0900
parents 96c2507fd22d
children 9493800265a8
files regexParser/regexParser.h regexParser/subsetConstruction.cc regexParser/subsetConstruction.h regexParser/threadedSearch.cc
diffstat 4 files changed, 103 insertions(+), 84 deletions(-) [+]
line wrap: on
line diff
--- a/regexParser/regexParser.h	Fri Jan 22 18:37:04 2016 +0900
+++ b/regexParser/regexParser.h	Fri Jan 22 20:09:42 2016 +0900
@@ -11,6 +11,9 @@
 typedef struct word {
     unsigned char *word;
     int length;
+    // look up table for BM search.
+    // BitVector nextState;
+    // struct word *next;
 } Word, *WordPtr;
 
 typedef struct utf8Range {
@@ -40,10 +43,34 @@
     BitVector bitState;
     CharClassPtr cc;
     bool accept;
+    struct tState *tState;
     struct node *node;
     struct state *next;
 } State, *StatePtr;
 
+struct tsValue;
+
+typedef struct ccv {
+    unsigned long begin;
+    unsigned long end;
+    Word w;
+    BitVector state;
+    struct tState *tState;
+} CCV,*CCVPtr;
+
+typedef struct tState {
+    State *state;
+    void stateSkip(tsValue);
+    int ccvSize;
+    CCV ccv;
+} TState, *TStatePtr;
+
+typedef struct result {
+    unsigned char begin;
+    unsigned char end;
+    struct result *next;
+} Result, *ResultPtr;
+
 typedef struct node {
     unsigned char tokenType;
     CharClassPtr cc;
@@ -63,17 +90,20 @@
 typedef struct transitionGenerator {
     long totalStateCount;
     StateStackPtr stack;
+    StatePtr stateTop;
+    StatePtr stateEnd;
     StatePtr *stateArray;
     StatePtr stateList;
 } TransitionGenerator, *TransitionGeneratorPtr;
 
-// Value Container is passed directly in functions
-// Don't use it's pointer
-typedef struct scValue {
-    StatePtr stateTop;
-    StatePtr stateEnd;
+typedef struct tsValue {
+    Buffer buff;
+    ResultPtr result;
     TransitionGeneratorPtr tg;
-} SCValue, *SCValuePtr;
+    TState *current;
+    TState *blockBegin;
+    TState *blockEnd;
+} TSValue, *TSValuePtr;
 
 typedef struct tgValue {
     StatePtr asterisk;   // last * state of the expression
--- a/regexParser/subsetConstruction.cc	Fri Jan 22 18:37:04 2016 +0900
+++ b/regexParser/subsetConstruction.cc	Fri Jan 22 20:09:42 2016 +0900
@@ -164,6 +164,7 @@
 }
 
 void setState(CharClassPtr cc, BitVector bi) {
+    // if (word case) setNext(bitVector to the word list)
     cc->nextState = bi;
     if (cc->left) {
         setState(cc->left,bi);
@@ -198,19 +199,30 @@
     state に対応する NodePtr を
  */
 StatePtr createState(TGValue tgv,NodePtr n) {
-    StatePtr s = NEW(State);
-    s->stateNum = n->stateNum = tgv.tg->totalStateCount;
+    BitVector bi = createBitVector(tgv.tg->totalStateCount);
+    StatePtr s = createState(tgv.tg,bi);
+    n->stateNum = s->stateNum;
     s->next = tgv.tg->stateList;
     tgv.tg->stateList = s;
     s->node = n;
-    BitVector bi = createBitVector(n->stateNum);
     s->bitState = bi;
     s->accept = false;
-    tgv.tg->totalStateCount++;
     s->cc = n->cc;
     return s;
 }
 
+SCValue createState(TransitionGeneratorPtr tg,BitVector bi) {
+    StatePtr s = NEW(State);
+    s->stateNum = tg->totalStateCount++;
+    s->next = NULL;
+    tg->stateEnd->next = s;
+    tg->stateEnd = s;
+    s->bitState = bi;
+    s->cc = NULL;
+    s->node = NULL;
+    return scv;
+}
+
 /**
     pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る
         前が * でない + は新しく状態を作る
@@ -351,18 +363,6 @@
     return scv;
 }
 
-SCValue createState(SCValue scv,BitVector bi) {
-    StatePtr s = NEW(State);
-    s->stateNum = ++scv.tg->totalStateCount;
-    s->next = NULL;
-    scv.stateEnd->next = s;
-    scv.stateEnd = s;
-    s->bitState = bi;
-    s->cc = NULL;
-    s->node = NULL;
-    return scv;
-}
-
 /**
     現在のステートに含まれる組み合わせ状態をとってくる
     組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する
@@ -371,32 +371,35 @@
     
     charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる
  */
-SCValue subsetConstruction(SCValue scv) {
-    for (;scv.stateTop;scv.stateTop = scv.stateTop->next) {
-        CharClassWalkerPtr cw = createCharClassWalker(scv.stateTop->cc);
+void determinize(StatePtr s, TransitionGeneratorPtr tg) {
+    BitVector bi = s->bitState;
+    for (;;) {
+        int bitPosition = searchBit(bi);
+        if (!bitPosition) break;
+        unsigned long baseNum = 1 << (bitPosition-1);
+        // printf("bit %lx pos %d baseNum %lx\n",bi.bitContainer,bitPosition,baseNum);
+        bi.bitContainer ^= baseNum; 
+        if (baseNum==2) continue;   // EOF case
+        StatePtr base = tg->stateArray[baseNum];
+        if (base == NULL) {
+            errorMassege("No base state",__LINE__,__FILE__); break;
+        }
+        CharClassPtr merge = mergeTransition(s,base->cc);
+        s->cc = merge;
+    }
+}
+
+void subsetConstruction(TransitionGeneratorPtr tg) {
+    for (;tg->stateTop;tg->stateTop = tg->stateTop->next) {
+        CharClassWalkerPtr cw = createCharClassWalker(state->cc);
         while (hasNext(cw)) {
             CharClassPtr cc = getNext(cw);
             BitVector bi = cc->nextState;
-            if (scv.tg->stateArray[bi.bitContainer]) continue;  // already done
-            scv = createState(scv,bi);
-            StatePtr s = scv.stateEnd;
-            scv.tg->stateArray[bi.bitContainer] = s;
-            for (;;) {
-                int bitPosition = searchBit(bi);
-                if (!bitPosition) break;
-                unsigned long baseNum = 1 << (bitPosition-1);
-                // printf("bit %lx pos %d baseNum %lx\n",bi.bitContainer,bitPosition,baseNum);
-                bi.bitContainer ^= baseNum; 
-                if (baseNum==2) continue;   // EOF case
-                StatePtr base = scv.tg->stateArray[baseNum];
-                if (base == NULL) {
-                    errorMassege("No base state",__LINE__,__FILE__); break;
-                }
-                CharClassPtr merge = mergeTransition(s,base->cc);
-                s->cc = merge;
-            }
+            if (tg->stateArray[bi.bitContainer]) continue;  // already done
+            StatePtr s = createState(tg,bi);
+            tg->stateArray[bi.bitContainer] = s;
+            determinize(s,tg);
         }
         free(cw);
     }
-    return scv;
 }
--- a/regexParser/subsetConstruction.h	Fri Jan 22 18:37:04 2016 +0900
+++ b/regexParser/subsetConstruction.h	Fri Jan 22 20:09:42 2016 +0900
@@ -10,5 +10,5 @@
 extern CharClassWalkerPtr createCharClassWalker (CharClassPtr next);
 extern void printState(TransitionGeneratorPtr tg);
 extern SCValue createSCValue(TGValue tgv) ;
+extern SCValue determinize(StatePtr state, SCValue scv);
 extern SCValue subsetConstruction(SCValue scv) ;
-
--- a/regexParser/threadedSearch.cc	Fri Jan 22 18:37:04 2016 +0900
+++ b/regexParser/threadedSearch.cc	Fri Jan 22 20:09:42 2016 +0900
@@ -4,39 +4,6 @@
 #include "regexParser.h"
 #include "subsetConstruction.h"
 
-struct tsValue;
-
-typedef struct ccv {
-        unsigned long begin;
-        unsigned long end;
-        BitVector state;
-        struct tState *tState;
-} *CCV;
-
-void stateSkip(TSValue tsv);
-
-typedef struct tState {
-    State *state;
-    void stateSkip(tsValue);
-    int ccvSize;
-    CCV ccv;
-} TState, *TStatePtr;
-
-typedef struct result {
-    unsigned char begin;
-    unsigned char end;
-    struct result *next;
-} Result, *ResultPtr;
-
-typedef struct tsValue {
-    Buffer buff;
-    ResultPtr result;
-    State *stateArray;
-    TState *current;
-    TState *blockBegin;
-    TState *blockEnd;
-} TSValue, *TSValuePtr;
-
 void stateSkip(TSValue tsv) {
     tsv.buff.matchBegin = tsv.buff.buffptr;
     tsv.current->stateSkip(tsv);
@@ -63,6 +30,7 @@
         ccv->end = end;
         ccv->tState = NULL;
         ccv->state = cc->nextState;
+        ccv->w = cc->cond.w;
     }
     free(ccw);
     return tState;
@@ -72,14 +40,32 @@
     next: while (tsv.buff.buffptr < tsv.buff.buffend) {
         unsigned char c = *tsv.buff.buffptr++;
         for (int i = 0; i < tsv.current->ccvSize; i++) {
-            if (c<tsv.current->ccv[i].begin) tsv.current->stateSkip(tsv);
-            else if (c<=tsv.current->ccv[i].end) {
-                TStatePtr current = tsv.current->ccv[i].tState;
+            CCVPtr ccv = &tsv.current->ccv[i];
+            if (c<ccv->begin) tsv.current->stateSkip(tsv);
+            else if (c<=ccv->end) {
+                // range matched.
+                if (ccv->w) {
+                    // match the word.
+                    // if (not match) continue;
+                }
+                TStatePtr current = ccv->tState;
                 if (current == NULL) {
-                    current = generateTState(tsv.stateArray[tsv.current->ccv[i].state.bitContainer]);
-                    tsv.current->ccv[i].tState = current;
+                    // create tSearch in next state.
+                    StatePtr state = tsv.stateArray[ccv->state.bitContainer];
+                    if (state == NULL) {
+                        // on the fly subset construction.
+                        state = createState(tg,bi);
+                        tg->stateArray[bi.bitContainer] = state;
+                        determinize(state,tsv.tg);
+                    }
+                    if (state->tState == NULL) {
+                        current = generateTState(state);
+                        ccv->tState = current;
+                    } else {
+                        ccv->tState = state->tState;
+                    }
                 }
-                tsv.current = tsv.current->ccv[i].tState;
+                tsv.current = ccv->tState;
                 goto next;
             }
         }