# HG changeset patch # User Shinji KONO # Date 1451455506 -32400 # Node ID 4852bfa85db471231875194b8d070b5cbce2e8c1 # Parent 63e9224c7b2be1889ecca48e176491605eb5b2ef spell fix diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/Makefile --- a/regexParser/Makefile Tue Dec 29 19:01:23 2015 +0900 +++ b/regexParser/Makefile Wed Dec 30 15:05:06 2015 +0900 @@ -19,7 +19,7 @@ $(CC) $(CFLAGS) -I. $< bitVector.cc -o $@ test/ccMerge: test/ccMerge.cc - $(CC) $(CFLAGS) -I. $< subsetConstraction.cc regexParser.cc node.cc error.cc bitVector.cc -o $@ + $(CC) $(CFLAGS) -I. $< subsetConstruction.cc regexParser.cc node.cc error.cc bitVector.cc -o $@ gcov: make CFLAGS="-Wall -O0 -g -coverage" @@ -75,6 +75,7 @@ ./regexParser -regex '[d-ga-de-d]' ./regexParser -regex '[d-ga-db-e]' ./regexParser -regex '[d-gh-ja-e]' + ./regexParser -regex 'abcd' merge_test: test/ccMerge ./test/ccMerge -regex '[f-i]' -regex 'e' diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/main.cc --- a/regexParser/main.cc Tue Dec 29 19:01:23 2015 +0900 +++ b/regexParser/main.cc Wed Dec 30 15:05:06 2015 +0900 @@ -2,23 +2,40 @@ #include #include #include "regexParser.h" -#include "subsetConstraction.h" +#include "subsetConstruction.h" #include "node.h" int main(int argc, char **argv) { + bool generate = true; + bool subset = false; + RegexInfo ri; ri.stateNumber = 1; for (int i = 1; i < argc; i++) { if (strcmp(argv[i],"-regex") == 0) { ri.ptr = (unsigned char*)argv[i+1]; i++; + } else if (strcmp(argv[i],"-noGeneration") == 0) { + generate = false; + } else if (strcmp(argv[i],"-subset") == 0) { + subset = true; } } + if (!ri.ptr) return 0; + printf("regex : %s\n",ri.ptr); - NodePtr n = regex(&ri); + NodePtr n = regex(&ri); // parse only printTree(n); - TransitionGeneratorPtr tg = generateTransitionList(n); - printTree(n); - printState(tg); + + if (generate) { // NFA generation + TGValue tgv = generateTransitionList(n); + printState(tgv.tg); + } else if (subset) { + TGValue tgv = generateTransitionList(n); + SCValue scv = createSCValue(tgv); + subsetConstruction(scv); // Determinization + printState(tgv.tg); + } + return 0; } diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/node.cc --- a/regexParser/node.cc Tue Dec 29 19:01:23 2015 +0900 +++ b/regexParser/node.cc Wed Dec 30 15:05:06 2015 +0900 @@ -34,7 +34,9 @@ putchar(n->cc->cond.w.word[i]); } if (n->state) { - printf("(%ld)\n",n->state->bitState.bitContainer); + printf("(%ld)->(%ld)\n",n->state->bitState.bitContainer,n->nextState->bitState.bitContainer); + } else { + printf("\n"); } } else if (n->tokenType == 'c') { printCharacterClass(n->cc,n->stateNum,d); diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/subsetConstraction.cc --- a/regexParser/subsetConstraction.cc Tue Dec 29 19:01:23 2015 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,383 +0,0 @@ -#include -#include -#include - -#include "regexParser.h" -#include "subsetConstraction.h" -#include "node.h" -#include "BitVector.h" -#include "error.h" - -CharClassPtr createCharClassRange(unsigned long begin, unsigned long end,unsigned long state, CharClassPtr left, CharClassPtr right) { - CharClassPtr cc = NEW(CharClass); - cc->type = 'r'; - cc->cond.range.begin = begin; - cc->cond.range.end = end; - cc->cond.range.next = NULL; - cc->left = left; - cc->right = right; - cc->nextState.bitContainer = state; - return cc; -} - -CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,BitVector nextState) { - CharClassPtr cc1; - if (cc) { - cc1 = charClassMerge(cc,mBegin,mEnd,nextState); - } else { - cc1 = createCharClassRange(mBegin,mEnd,nextState.bitContainer,NULL,NULL); - } - return cc1; -} - -CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState) { - // 重なっているccの領域を分割する - // 必要ならばnextStateを重ねあわせる - // 変更があった場合は新しくリストを作って返す - if (end < cc->cond.range.begin ) { // 1 - if (cc->left) { - return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,charClassMerge(cc->left,begin,end,nextState),cc->right); - } else { - return createCharClassRange(begin,end,nextState.bitContainer,NULL,cc); - } - } else if (end == cc->cond.range.begin && begin != end ) { // 2 - CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,nextState); - if (cc->cond.range.begin == cc->cond.range.end) { - return createCharClassRange(cc->cond.range.begin,cc->cond.range.end, - cc->nextState.bitContainer | nextState.bitContainer, cc1,cc->right); - } - CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right); - return createCharClassRange(cc->cond.range.begin,cc->cond.range.begin, - cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); - } else if (end < cc->cond.range.end) { // range.begin < end - if (begin < cc->cond.range.begin) { // 3 - CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); - CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right); - return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); - } - if (begin == cc->cond.range.begin) { // 6 - CharClassPtr cc2 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right); - return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc2); - } - // 9 - CharClassPtr cc2 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL); - CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right); - return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc2,cc3); - } else if (end == cc->cond.range.end) { - if (begin == cc->cond.range.begin) { // 7 - return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc->right); - } else if (begin < cc->cond.range.begin) { // 4 - CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); - return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc->right); - } - // 10 cond.range.begin < begin - CharClassPtr cc2 = createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,NULL,cc->right); - return createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,cc2); - } - if (begin > cc->cond.range.end ) { // 13 - if (cc->right) { - return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState)); - } else { - return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL); - } - } - if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { - if (end > cc->cond.range.end) { // cond.range.end < end - if (begin == cc->cond.range.begin) { // 8 - CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); - return createCharClassRange(cc->cond.range.begin,cc->cond.range.end, - cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc1); - } - if (begin > cc->cond.range.begin) { // 11,12 - CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); - CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL); - return createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc3,cc1); - } - } - } else if (begin < cc->cond.range.begin) { // 5 - CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); - CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); - return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); - } else { - printf("charClassMerge Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end); - } - return cc; -} - -void findLeftMost(CharClassPtr next,CharClassWalkerPtr walk) { - while (next->left) { - CharClassStackPtr ccs = NEW(CharClassStack); - ccs->next = walk->stack; - ccs->turn = walk->turn; - walk->turn = LEFT; - ccs->cc = next; - walk->stack = ccs; - next = next->left; - } - walk->turn = SELF; - walk->next = next; -} - -CharClassWalkerPtr createCharClassWalker (CharClassPtr next) { - CharClassWalkerPtr walk = NEW(CharClassWalker); - walk->next = NULL; - walk->stack = NULL; - if (!next) return walk; - findLeftMost(next,walk); - return walk; -} - -bool hasNext(CharClassWalkerPtr walk) { - return walk->next != NULL; -} - -CharClassStackPtr charClassStackPop(CharClassWalkerPtr walk) { - CharClassStackPtr prev = walk->stack->next; - walk->turn = walk->stack->turn; - free(walk->stack); - walk->stack = prev; - return prev; -} - -CharClassPtr getNext(CharClassWalkerPtr walk) { - CharClassPtr current = walk->next; - walk->next = NULL; - if (walk->turn == SELF) { - if (current->right) { - walk->turn = RIGHT; - findLeftMost(current->right,walk); - return current; - } - } - while (walk->stack) { - walk->next = walk->stack->cc; - charClassStackPop(walk); - if (walk->turn == LEFT) { - walk->turn = SELF; - return current; - } - } - walk->next = NULL; - return current; -} - -void setState(CharClassPtr cc, BitVector bi) { - cc->nextState = bi; - if (cc->left) { - setState(cc->left,bi); - } - cc->nextState = bi; - if (cc->right) { - setState(cc->right,bi); - } -} - -CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) { - if (x->cc == NULL) { - return y; - } - CharClassWalkerPtr walk = createCharClassWalker(x->cc); - CharClassPtr ccy = y; - BitVector bi; - while (hasNext(walk)) { - CharClassPtr cc = getNext(walk); - unsigned long begin = cc->cond.range.begin; - unsigned long end = cc->cond.range.end; - bi = cc->nextState; - ccy = charClassMerge(ccy,begin,end,bi); - } - free(walk); - return ccy; -} - -/** - 作成する state を linked list - bitvector を index とした配列に BitVectorPtr を格納 - state に対応する NodePtr を - */ -StatePtr createState(TGValue tgv,NodePtr n) { - StatePtr s = NEW(State); - s->stateNum = n->stateNum = tgv.tg->totalStateCount; - s->next = tgv.tg->stateList; - tgv.tg->stateList = s; - s->node = n; - BitVector bi = createBitVector(n->stateNum); - s->bitState = bi; - tgv.tg->totalStateCount++; - s->cc = n->cc; - return s; -} - -/** - pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る - 前が * でない + は新しく状態を作る - * があったら、次の状態はその時の先頭の状態になる - 常に先頭の状態を返す - 最後が*ならば、それを持ち歩く - pass 2: - 割り当てられた状態に沿って charclass の行き先を書き換える - 書き換えた charclass を merge する - 前の部分に * がない + は新しい状態をつくるので、state を切り替えて生成する - */ -TGValue generateTransition(NodePtr n,TGValue tgv, int pass) { - if (n->tokenType == '+') { - TGValue tgvLeft = tgv; - tgvLeft.endState = n->right->state; - tgvLeft.asterisk = NULL; - tgvLeft = generateTransition(n->left,tgv,pass); - TGValue tgvRight = tgvLeft; - if (tgvLeft.asterisk) { - n->right->state = tgv.endState; - tgvRight.startState = tgvRight.asterisk; - tgvRight = generateTransition(n->right,tgvRight,pass); - tgvLeft.asterisk = tgvRight.asterisk; - return tgvLeft; - } - tgvRight.asterisk = NULL; - if (pass==1) { - n->right->state = tgvRight.startState = createState(tgvRight,n->right); - } else { - tgvLeft.startState = n->right->state; - tgvLeft.tg->stateArray[tgvLeft.startState->bitState.bitContainer] = tgvLeft.startState ; - } - tgvRight = generateTransition(n->right,tgvRight,pass); - tgvLeft.asterisk = tgvRight.asterisk; - return tgvLeft; - } else if (n->tokenType == '|') { - TGValue tgv1 = generateTransition(n->left,tgv,pass); - TGValue tgv2 = generateTransition(n->right,tgv1,pass); - return tgv2; - } else if (n->tokenType == '*') { - TGValue tgvAstah = tgv; - tgvAstah.endState = tgvAstah.startState; - tgvAstah = generateTransition(n->left,tgvAstah,pass); - tgvAstah.asterisk = tgvAstah.startState; - return tgvAstah; - } else if (n->tokenType == 'c' || n->tokenType == 'a'){ - TGValue tgv1 = tgv; - if (pass==1) { - n->stateNum = tgv.startState->stateNum; - n->nextStateNum = tgv.endState->stateNum; - n->state = tgv.startState;; - n->nextState = tgv.endState; - } else { - BitVector bi = createBitVector(n->nextStateNum); - setState(n->cc,bi); - tgv1.startState->cc = mergeTransition(tgv1.startState,n->cc); - } - return tgv1; - } else { - return tgv; - } -} - -TransitionGeneratorPtr createTransitionGenerator() { - TransitionGeneratorPtr tg = NEW(TransitionGenerator); - tg->totalStateCount = 0; - tg->stack = NULL; - tg->stateArray = NULL; - tg->stateList = NULL; - return tg; -} - -TGValue createTGValue() { - TGValue tgv; - tgv.startState = NULL; - tgv.endState = NULL; - tgv.asterisk = NULL; - tgv.tg = createTransitionGenerator(); - return tgv; -} - -TransitionGeneratorPtr generateTransitionList(NodePtr n) { - TGValue tgv = createTGValue(); - StatePtr startState = tgv.startState = createState(tgv,n); - NodePtr eof = createNode(NULL,'e',NULL,NULL,NULL); - StatePtr endState = tgv.endState = createState(tgv,eof); - tgv = generateTransition(n,tgv,1); - if (tgv.tg->totalStateCount > BITBLOCK) { - errorMassege("StateMax > BITBLOCK",__LINE__,__FILE__); - } - BitVector bi = createBitVector(tgv.tg->totalStateCount); - tgv.tg->stateArray = (StatePtr*)calloc(bi.bitContainer*2,sizeof(StatePtr*)); - tgv.tg->stateArray[startState->bitState.bitContainer] = startState; - tgv.tg->stateArray[endState->bitState.bitContainer] = endState; - tgv = generateTransition(n,tgv,2); - return tgv.tg; -} - -void printState(StatePtr state) { - printf("state : %lx\n",state->bitState.bitContainer); - long nodeNumber = 0; - if (state->node) { - printf("node : %c %lx -> %d\n",state->node->tokenType,state->bitState.bitContainer,state->node->nextStateNum); - if (state->node->state) - nodeNumber = state->node->state->bitState.bitContainer; - } - if (state->cc) { - printCharacterClass(state->cc,nodeNumber,4); - } -} - -void printState(TransitionGeneratorPtr tg) { - StatePtr state = tg->stateList; - for (;state;state = state->next) { - printState(state); - putchar('\n'); - } -} - -SCValue createSCValue(TGValue tgv) { - SCValue scv; - scv.stateTop = tgv.tg->stateList; - scv.stateEnd = scv.stateTop; - while (scv.stateEnd->next) { - scv.stateEnd = scv.stateEnd->next; - } - return scv; -} - -SCValue createState(SCValue scv,BitVector bi) { - StatePtr s = NEW(State); - s->stateNum = ++scv.tg->totalStateCount; - s->next = NULL; - scv.stateEnd->next = s; - scv.stateEnd = s; - s->bitState = bi; - s->cc = NULL; - return scv; -} - -/** - 現在のステートに含まれる組み合わせ状態をとってくる - 組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する - 生成した状態は stateArray に格納するA - 新しい状態ができなくなったら終了 - - charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる - */ -SCValue subsetConstraction(SCValue scv) { - for (;scv.stateTop;scv.stateTop = scv.stateTop->next) { - CharClassWalkerPtr cw = createCharClassWalker(scv.stateTop->cc); - while (hasNext(cw)) { - CharClassPtr cc = getNext(cw); - BitVector bi = cc->nextState; - if (scv.stateArray[bi.bitContainer]) continue; - scv = createState(scv,bi); - StatePtr s = scv.stateEnd; - for (;bi.bitContainer;) { - int bitPosition = searchBit(bi); - unsigned long baseNum = 1 << bitPosition; - bi.bitContainer ^= baseNum; - StatePtr base = scv.stateArray[baseNum]; - if (base == NULL) {// error - continue; - } - CharClassPtr merge = mergeTransition(s,base->cc); - s->cc = merge; - } - scv.stateArray[bi.bitContainer] = s; - } - } - return scv; -} diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/subsetConstraction.h --- a/regexParser/subsetConstraction.h Tue Dec 29 19:01:23 2015 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -extern CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState); -extern TGValue createTGValue(); -extern CharClassPtr mergeTransition(StatePtr x,CharClassPtr y); -extern void setState(CharClassPtr cc, BitVector bi); -extern StatePtr createState(TGValue tgv,NodePtr n); -extern TransitionGeneratorPtr generateTransitionList(NodePtr n); -extern void printState(TransitionGeneratorPtr tg); diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/subsetConstruction.cc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexParser/subsetConstruction.cc Wed Dec 30 15:05:06 2015 +0900 @@ -0,0 +1,384 @@ +#include +#include +#include + +#include "regexParser.h" +#include "subsetConstruction.h" +#include "node.h" +#include "BitVector.h" +#include "error.h" + +CharClassPtr createCharClassRange(unsigned long begin, unsigned long end,unsigned long state, CharClassPtr left, CharClassPtr right) { + CharClassPtr cc = NEW(CharClass); + cc->type = 'r'; + cc->cond.range.begin = begin; + cc->cond.range.end = end; + cc->cond.range.next = NULL; + cc->left = left; + cc->right = right; + cc->nextState.bitContainer = state; + return cc; +} + +CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,BitVector nextState) { + CharClassPtr cc1; + if (cc) { + cc1 = charClassMerge(cc,mBegin,mEnd,nextState); + } else { + cc1 = createCharClassRange(mBegin,mEnd,nextState.bitContainer,NULL,NULL); + } + return cc1; +} + +CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState) { + // 重なっているccの領域を分割する + // 必要ならばnextStateを重ねあわせる + // 変更があった場合は新しくリストを作って返す + if (end < cc->cond.range.begin ) { // 1 + if (cc->left) { + return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,charClassMerge(cc->left,begin,end,nextState),cc->right); + } else { + return createCharClassRange(begin,end,nextState.bitContainer,NULL,cc); + } + } else if (end == cc->cond.range.begin && begin != end ) { // 2 + CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,nextState); + if (cc->cond.range.begin == cc->cond.range.end) { + return createCharClassRange(cc->cond.range.begin,cc->cond.range.end, + cc->nextState.bitContainer | nextState.bitContainer, cc1,cc->right); + } + CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right); + return createCharClassRange(cc->cond.range.begin,cc->cond.range.begin, + cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); + } else if (end < cc->cond.range.end) { // range.begin < end + if (begin < cc->cond.range.begin) { // 3 + CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); + CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right); + return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); + } + if (begin == cc->cond.range.begin) { // 6 + CharClassPtr cc2 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right); + return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc2); + } + // 9 + CharClassPtr cc2 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL); + CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right); + return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc2,cc3); + } else if (end == cc->cond.range.end) { + if (begin == cc->cond.range.begin) { // 7 + return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc->right); + } else if (begin < cc->cond.range.begin) { // 4 + CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); + return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc->right); + } + // 10 cond.range.begin < begin + CharClassPtr cc2 = createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,NULL,cc->right); + return createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,cc2); + } + if (begin > cc->cond.range.end ) { // 13 + if (cc->right) { + return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState)); + } else { + return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL); + } + } + if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { + if (end > cc->cond.range.end) { // cond.range.end < end + if (begin == cc->cond.range.begin) { // 8 + CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); + return createCharClassRange(cc->cond.range.begin,cc->cond.range.end, + cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc1); + } + if (begin > cc->cond.range.begin) { // 11,12 + CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); + CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL); + return createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc3,cc1); + } + } + } else if (begin < cc->cond.range.begin) { // 5 + CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); + CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); + return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); + } else { + printf("charClassMerge Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end); + } + return cc; +} + +void findLeftMost(CharClassPtr next,CharClassWalkerPtr walk) { + while (next->left) { + CharClassStackPtr ccs = NEW(CharClassStack); + ccs->next = walk->stack; + ccs->turn = walk->turn; + walk->turn = LEFT; + ccs->cc = next; + walk->stack = ccs; + next = next->left; + } + walk->turn = SELF; + walk->next = next; +} + +CharClassWalkerPtr createCharClassWalker (CharClassPtr next) { + CharClassWalkerPtr walk = NEW(CharClassWalker); + walk->next = NULL; + walk->stack = NULL; + if (!next) return walk; + findLeftMost(next,walk); + return walk; +} + +bool hasNext(CharClassWalkerPtr walk) { + return walk->next != NULL; +} + +CharClassStackPtr charClassStackPop(CharClassWalkerPtr walk) { + CharClassStackPtr prev = walk->stack->next; + walk->turn = walk->stack->turn; + free(walk->stack); + walk->stack = prev; + return prev; +} + +CharClassPtr getNext(CharClassWalkerPtr walk) { + CharClassPtr current = walk->next; + walk->next = NULL; + if (walk->turn == SELF) { + if (current->right) { + walk->turn = RIGHT; + findLeftMost(current->right,walk); + return current; + } + } + while (walk->stack) { + walk->next = walk->stack->cc; + charClassStackPop(walk); + if (walk->turn == LEFT) { + walk->turn = SELF; + return current; + } + } + walk->next = NULL; + return current; +} + +void setState(CharClassPtr cc, BitVector bi) { + cc->nextState = bi; + if (cc->left) { + setState(cc->left,bi); + } + cc->nextState = bi; + if (cc->right) { + setState(cc->right,bi); + } +} + +CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) { + if (x->cc == NULL) { + return y; + } + CharClassWalkerPtr walk = createCharClassWalker(x->cc); + CharClassPtr ccy = y; + BitVector bi; + while (hasNext(walk)) { + CharClassPtr cc = getNext(walk); + unsigned long begin = cc->cond.range.begin; + unsigned long end = cc->cond.range.end; + bi = cc->nextState; + ccy = charClassMerge(ccy,begin,end,bi); + } + free(walk); + return ccy; +} + +/** + 作成する state を linked list + bitvector を index とした配列に BitVectorPtr を格納 + state に対応する NodePtr を + */ +StatePtr createState(TGValue tgv,NodePtr n) { + StatePtr s = NEW(State); + s->stateNum = n->stateNum = tgv.tg->totalStateCount; + s->next = tgv.tg->stateList; + tgv.tg->stateList = s; + s->node = n; + BitVector bi = createBitVector(n->stateNum); + s->bitState = bi; + tgv.tg->totalStateCount++; + s->cc = n->cc; + return s; +} + +/** + pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る + 前が * でない + は新しく状態を作る + * があったら、次の状態はその時の先頭の状態になる + 常に先頭の状態を返す + 最後が*ならば、それを持ち歩く + pass 2: + 割り当てられた状態に沿って charclass の行き先を書き換える + 書き換えた charclass を merge する + 前の部分に * がない + は新しい状態をつくるので、state を切り替えて生成する + */ +TGValue generateTransition(NodePtr n,TGValue tgv, int pass) { + if (n->tokenType == '+') { + TGValue tgvLeft = tgv; + tgvLeft.endState = n->right->state; + tgvLeft.asterisk = NULL; + tgvLeft = generateTransition(n->left,tgv,pass); + TGValue tgvRight = tgvLeft; + if (tgvLeft.asterisk) { + n->right->state = tgv.endState; + tgvRight.startState = tgvRight.asterisk; + tgvRight = generateTransition(n->right,tgvRight,pass); + tgvLeft.asterisk = tgvRight.asterisk; + return tgvLeft; + } + tgvRight.asterisk = NULL; + if (pass==1) { + n->right->state = tgvRight.startState = createState(tgvRight,n->right); + } else { + tgvLeft.startState = n->right->state; + tgvLeft.tg->stateArray[tgvLeft.startState->bitState.bitContainer] = tgvLeft.startState ; + } + tgvRight = generateTransition(n->right,tgvRight,pass); + tgvLeft.asterisk = tgvRight.asterisk; + return tgvLeft; + } else if (n->tokenType == '|') { + TGValue tgv1 = generateTransition(n->left,tgv,pass); + TGValue tgv2 = generateTransition(n->right,tgv1,pass); + return tgv2; + } else if (n->tokenType == '*') { + TGValue tgvAstah = tgv; + tgvAstah.endState = tgvAstah.startState; + tgvAstah = generateTransition(n->left,tgvAstah,pass); + tgvAstah.asterisk = tgvAstah.startState; + return tgvAstah; + } else if (n->tokenType == 'c' || n->tokenType == 'a'){ + TGValue tgv1 = tgv; + if (pass==1) { + n->stateNum = tgv.startState->stateNum; + n->nextStateNum = tgv.endState->stateNum; + n->state = tgv.startState;; + n->nextState = tgv.endState; + } else { + BitVector bi = createBitVector(n->nextStateNum); + setState(n->cc,bi); + tgv1.startState->cc = mergeTransition(tgv1.startState,n->cc); + } + return tgv1; + } else { + return tgv; + } +} + +TransitionGeneratorPtr createTransitionGenerator() { + TransitionGeneratorPtr tg = NEW(TransitionGenerator); + tg->totalStateCount = 0; + tg->stack = NULL; + tg->stateArray = NULL; + tg->stateList = NULL; + return tg; +} + +TGValue createTGValue() { + TGValue tgv; + tgv.startState = NULL; + tgv.endState = NULL; + tgv.asterisk = NULL; + tgv.tg = createTransitionGenerator(); + return tgv; +} + +TGValue generateTransitionList(NodePtr n) { + TGValue tgv = createTGValue(); + StatePtr startState = tgv.startState = createState(tgv,n); + NodePtr eof = createNode(NULL,'e',NULL,NULL,NULL); + StatePtr endState = tgv.endState = createState(tgv,eof); + tgv = generateTransition(n,tgv,1); + printTree(n); + if (tgv.tg->totalStateCount > BITBLOCK) { + errorMassege("StateMax > BITBLOCK",__LINE__,__FILE__); + } + BitVector bi = createBitVector(tgv.tg->totalStateCount); + tgv.tg->stateArray = (StatePtr*)calloc(bi.bitContainer*2,sizeof(StatePtr*)); + tgv.tg->stateArray[startState->bitState.bitContainer] = startState; + tgv.tg->stateArray[endState->bitState.bitContainer] = endState; + tgv = generateTransition(n,tgv,2); + return tgv; +} + +void printState(StatePtr state) { + printf("state : %lx\n",state->bitState.bitContainer); + long nodeNumber = 0; + if (state->node) { + printf("node : %c %lx -> %d\n",state->node->tokenType,state->bitState.bitContainer,state->node->nextStateNum); + if (state->node->state) + nodeNumber = state->node->state->bitState.bitContainer; + } + if (state->cc) { + printCharacterClass(state->cc,nodeNumber,4); + } +} + +void printState(TransitionGeneratorPtr tg) { + StatePtr state = tg->stateList; + for (;state;state = state->next) { + printState(state); + putchar('\n'); + } +} + +SCValue createSCValue(TGValue tgv) { + SCValue scv; + scv.stateTop = tgv.tg->stateList; + scv.stateEnd = scv.stateTop; + while (scv.stateEnd->next) { + scv.stateEnd = scv.stateEnd->next; + } + return scv; +} + +SCValue createState(SCValue scv,BitVector bi) { + StatePtr s = NEW(State); + s->stateNum = ++scv.tg->totalStateCount; + s->next = NULL; + scv.stateEnd->next = s; + scv.stateEnd = s; + s->bitState = bi; + s->cc = NULL; + return scv; +} + +/** + 現在のステートに含まれる組み合わせ状態をとってくる + 組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する + 生成した状態は stateArray に格納するA + 新しい状態ができなくなったら終了 + + charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる + */ +SCValue subsetConstruction(SCValue scv) { + for (;scv.stateTop;scv.stateTop = scv.stateTop->next) { + CharClassWalkerPtr cw = createCharClassWalker(scv.stateTop->cc); + while (hasNext(cw)) { + CharClassPtr cc = getNext(cw); + BitVector bi = cc->nextState; + if (scv.stateArray[bi.bitContainer]) continue; + scv = createState(scv,bi); + StatePtr s = scv.stateEnd; + for (;bi.bitContainer;) { + int bitPosition = searchBit(bi); + unsigned long baseNum = 1 << bitPosition; + bi.bitContainer ^= baseNum; + StatePtr base = scv.stateArray[baseNum]; + if (base == NULL) {// error + continue; + } + CharClassPtr merge = mergeTransition(s,base->cc); + s->cc = merge; + } + scv.stateArray[bi.bitContainer] = s; + } + } + return scv; +} diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/subsetConstruction.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexParser/subsetConstruction.h Wed Dec 30 15:05:06 2015 +0900 @@ -0,0 +1,10 @@ +extern CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState); +extern TGValue createTGValue(); +extern CharClassPtr mergeTransition(StatePtr x,CharClassPtr y); +extern void setState(CharClassPtr cc, BitVector bi); +extern StatePtr createState(TGValue tgv,NodePtr n); +extern TGValue generateTransitionList(NodePtr n); +extern void printState(TransitionGeneratorPtr tg); +extern SCValue createSCValue(TGValue tgv) ; +extern SCValue subsetConstruction(SCValue scv) ; + diff -r 63e9224c7b2b -r 4852bfa85db4 regexParser/test/ccMerge.cc --- a/regexParser/test/ccMerge.cc Tue Dec 29 19:01:23 2015 +0900 +++ b/regexParser/test/ccMerge.cc Wed Dec 30 15:05:06 2015 +0900 @@ -3,7 +3,7 @@ #include #include "regexParser.h" #include "node.h" -#include "subsetConstraction.h" +#include "subsetConstruction.h" void printCCTree(CharClassPtr cc) { if (cc->left != NULL) {