# HG changeset patch # User Masataka Kohagura # Date 1453460982 -32400 # Node ID 2b1fbfb92d54de4ad03ce1f07381469f00052323 # Parent 96c2507fd22dfb5f30eaa0c8fe8446c384818b81 implement tSearch diff -r 96c2507fd22d -r 2b1fbfb92d54 regexParser/regexParser.h --- a/regexParser/regexParser.h Fri Jan 22 18:37:04 2016 +0900 +++ b/regexParser/regexParser.h Fri Jan 22 20:09:42 2016 +0900 @@ -11,6 +11,9 @@ typedef struct word { unsigned char *word; int length; + // look up table for BM search. + // BitVector nextState; + // struct word *next; } Word, *WordPtr; typedef struct utf8Range { @@ -40,10 +43,34 @@ BitVector bitState; CharClassPtr cc; bool accept; + struct tState *tState; struct node *node; struct state *next; } State, *StatePtr; +struct tsValue; + +typedef struct ccv { + unsigned long begin; + unsigned long end; + Word w; + BitVector state; + struct tState *tState; +} CCV,*CCVPtr; + +typedef struct tState { + State *state; + void stateSkip(tsValue); + int ccvSize; + CCV ccv; +} TState, *TStatePtr; + +typedef struct result { + unsigned char begin; + unsigned char end; + struct result *next; +} Result, *ResultPtr; + typedef struct node { unsigned char tokenType; CharClassPtr cc; @@ -63,17 +90,20 @@ typedef struct transitionGenerator { long totalStateCount; StateStackPtr stack; + StatePtr stateTop; + StatePtr stateEnd; StatePtr *stateArray; StatePtr stateList; } TransitionGenerator, *TransitionGeneratorPtr; -// Value Container is passed directly in functions -// Don't use it's pointer -typedef struct scValue { - StatePtr stateTop; - StatePtr stateEnd; +typedef struct tsValue { + Buffer buff; + ResultPtr result; TransitionGeneratorPtr tg; -} SCValue, *SCValuePtr; + TState *current; + TState *blockBegin; + TState *blockEnd; +} TSValue, *TSValuePtr; typedef struct tgValue { StatePtr asterisk; // last * state of the expression diff -r 96c2507fd22d -r 2b1fbfb92d54 regexParser/subsetConstruction.cc --- a/regexParser/subsetConstruction.cc Fri Jan 22 18:37:04 2016 +0900 +++ b/regexParser/subsetConstruction.cc Fri Jan 22 20:09:42 2016 +0900 @@ -164,6 +164,7 @@ } void setState(CharClassPtr cc, BitVector bi) { + // if (word case) setNext(bitVector to the word list) cc->nextState = bi; if (cc->left) { setState(cc->left,bi); @@ -198,19 +199,30 @@ state に対応する NodePtr を */ StatePtr createState(TGValue tgv,NodePtr n) { - StatePtr s = NEW(State); - s->stateNum = n->stateNum = tgv.tg->totalStateCount; + BitVector bi = createBitVector(tgv.tg->totalStateCount); + StatePtr s = createState(tgv.tg,bi); + n->stateNum = s->stateNum; s->next = tgv.tg->stateList; tgv.tg->stateList = s; s->node = n; - BitVector bi = createBitVector(n->stateNum); s->bitState = bi; s->accept = false; - tgv.tg->totalStateCount++; s->cc = n->cc; return s; } +SCValue createState(TransitionGeneratorPtr tg,BitVector bi) { + StatePtr s = NEW(State); + s->stateNum = tg->totalStateCount++; + s->next = NULL; + tg->stateEnd->next = s; + tg->stateEnd = s; + s->bitState = bi; + s->cc = NULL; + s->node = NULL; + return scv; +} + /** pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る 前が * でない + は新しく状態を作る @@ -351,18 +363,6 @@ return scv; } -SCValue createState(SCValue scv,BitVector bi) { - StatePtr s = NEW(State); - s->stateNum = ++scv.tg->totalStateCount; - s->next = NULL; - scv.stateEnd->next = s; - scv.stateEnd = s; - s->bitState = bi; - s->cc = NULL; - s->node = NULL; - return scv; -} - /** 現在のステートに含まれる組み合わせ状態をとってくる 組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する @@ -371,32 +371,35 @@ charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる */ -SCValue subsetConstruction(SCValue scv) { - for (;scv.stateTop;scv.stateTop = scv.stateTop->next) { - CharClassWalkerPtr cw = createCharClassWalker(scv.stateTop->cc); +void determinize(StatePtr s, TransitionGeneratorPtr tg) { + BitVector bi = s->bitState; + for (;;) { + int bitPosition = searchBit(bi); + if (!bitPosition) break; + unsigned long baseNum = 1 << (bitPosition-1); + // printf("bit %lx pos %d baseNum %lx\n",bi.bitContainer,bitPosition,baseNum); + bi.bitContainer ^= baseNum; + if (baseNum==2) continue; // EOF case + StatePtr base = tg->stateArray[baseNum]; + if (base == NULL) { + errorMassege("No base state",__LINE__,__FILE__); break; + } + CharClassPtr merge = mergeTransition(s,base->cc); + s->cc = merge; + } +} + +void subsetConstruction(TransitionGeneratorPtr tg) { + for (;tg->stateTop;tg->stateTop = tg->stateTop->next) { + CharClassWalkerPtr cw = createCharClassWalker(state->cc); while (hasNext(cw)) { CharClassPtr cc = getNext(cw); BitVector bi = cc->nextState; - if (scv.tg->stateArray[bi.bitContainer]) continue; // already done - scv = createState(scv,bi); - StatePtr s = scv.stateEnd; - scv.tg->stateArray[bi.bitContainer] = s; - for (;;) { - int bitPosition = searchBit(bi); - if (!bitPosition) break; - unsigned long baseNum = 1 << (bitPosition-1); - // printf("bit %lx pos %d baseNum %lx\n",bi.bitContainer,bitPosition,baseNum); - bi.bitContainer ^= baseNum; - if (baseNum==2) continue; // EOF case - StatePtr base = scv.tg->stateArray[baseNum]; - if (base == NULL) { - errorMassege("No base state",__LINE__,__FILE__); break; - } - CharClassPtr merge = mergeTransition(s,base->cc); - s->cc = merge; - } + if (tg->stateArray[bi.bitContainer]) continue; // already done + StatePtr s = createState(tg,bi); + tg->stateArray[bi.bitContainer] = s; + determinize(s,tg); } free(cw); } - return scv; } diff -r 96c2507fd22d -r 2b1fbfb92d54 regexParser/subsetConstruction.h --- a/regexParser/subsetConstruction.h Fri Jan 22 18:37:04 2016 +0900 +++ b/regexParser/subsetConstruction.h Fri Jan 22 20:09:42 2016 +0900 @@ -10,5 +10,5 @@ extern CharClassWalkerPtr createCharClassWalker (CharClassPtr next); extern void printState(TransitionGeneratorPtr tg); extern SCValue createSCValue(TGValue tgv) ; +extern SCValue determinize(StatePtr state, SCValue scv); extern SCValue subsetConstruction(SCValue scv) ; - diff -r 96c2507fd22d -r 2b1fbfb92d54 regexParser/threadedSearch.cc --- a/regexParser/threadedSearch.cc Fri Jan 22 18:37:04 2016 +0900 +++ b/regexParser/threadedSearch.cc Fri Jan 22 20:09:42 2016 +0900 @@ -4,39 +4,6 @@ #include "regexParser.h" #include "subsetConstruction.h" -struct tsValue; - -typedef struct ccv { - unsigned long begin; - unsigned long end; - BitVector state; - struct tState *tState; -} *CCV; - -void stateSkip(TSValue tsv); - -typedef struct tState { - State *state; - void stateSkip(tsValue); - int ccvSize; - CCV ccv; -} TState, *TStatePtr; - -typedef struct result { - unsigned char begin; - unsigned char end; - struct result *next; -} Result, *ResultPtr; - -typedef struct tsValue { - Buffer buff; - ResultPtr result; - State *stateArray; - TState *current; - TState *blockBegin; - TState *blockEnd; -} TSValue, *TSValuePtr; - void stateSkip(TSValue tsv) { tsv.buff.matchBegin = tsv.buff.buffptr; tsv.current->stateSkip(tsv); @@ -63,6 +30,7 @@ ccv->end = end; ccv->tState = NULL; ccv->state = cc->nextState; + ccv->w = cc->cond.w; } free(ccw); return tState; @@ -72,14 +40,32 @@ next: while (tsv.buff.buffptr < tsv.buff.buffend) { unsigned char c = *tsv.buff.buffptr++; for (int i = 0; i < tsv.current->ccvSize; i++) { - if (cccv[i].begin) tsv.current->stateSkip(tsv); - else if (c<=tsv.current->ccv[i].end) { - TStatePtr current = tsv.current->ccv[i].tState; + CCVPtr ccv = &tsv.current->ccv[i]; + if (cbegin) tsv.current->stateSkip(tsv); + else if (c<=ccv->end) { + // range matched. + if (ccv->w) { + // match the word. + // if (not match) continue; + } + TStatePtr current = ccv->tState; if (current == NULL) { - current = generateTState(tsv.stateArray[tsv.current->ccv[i].state.bitContainer]); - tsv.current->ccv[i].tState = current; + // create tSearch in next state. + StatePtr state = tsv.stateArray[ccv->state.bitContainer]; + if (state == NULL) { + // on the fly subset construction. + state = createState(tg,bi); + tg->stateArray[bi.bitContainer] = state; + determinize(state,tsv.tg); + } + if (state->tState == NULL) { + current = generateTState(state); + ccv->tState = current; + } else { + ccv->tState = state->tState; + } } - tsv.current = tsv.current->ccv[i].tState; + tsv.current = ccv->tState; goto next; } }