# HG changeset patch # User Masataka Kohagura # Date 1450865856 -32400 # Node ID 5e8c6857934cc2bd522f86e1ab95febf998e585d # Parent 8de9a33f6ae514ccf65cecc2d8319ad8b058a883 implement charClassMerge diff -r 8de9a33f6ae5 -r 5e8c6857934c regexParser/main.cc --- a/regexParser/main.cc Wed Dec 23 17:28:59 2015 +0900 +++ b/regexParser/main.cc Wed Dec 23 19:17:36 2015 +0900 @@ -8,17 +8,23 @@ int main(int argc, char **argv) { - RegexInfoPtr ri = (RegexInfoPtr)malloc(sizeof(RegexInfo)); - ri->nodeNumber = 1; - + RegexInfo ri; + ri.stateNumber = 1; + ri.asterisk = false; + ri.current = NEW(State); + ri.current->bitState.bitContainer = 0 + bitSet(ri.current->bitState,ri.stateNumber); + ri.current->next = NULL; + ri.current->cc = NULL; + ri.current->node = NULL; + ri.states = ri.current; for (int i = 1; i < argc; i++) { if (strcmp(argv[i],"-regex") == 0) { - ri->ptr = (unsigned char*)argv[i+1]; i++; + ri.ptr = (unsigned char*)argv[i+1]; i++; } } - - printf("regex : %s\n",ri->ptr); - NodePtr n = regex(ri); + printf("regex : %s\n",ri.ptr); + NodePtr n = regex(&ri); printTree(n); return 0; } diff -r 8de9a33f6ae5 -r 5e8c6857934c regexParser/regexParser.cc --- a/regexParser/regexParser.cc Wed Dec 23 17:28:59 2015 +0900 +++ b/regexParser/regexParser.cc Wed Dec 23 19:17:36 2015 +0900 @@ -43,23 +43,13 @@ return n; } -CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) { - CharClassPtr cc = NEW(CharClass); - cc->type = 'r'; - cc->cond.range.begin = begin; - cc->cond.range.end = end; - cc->left = left; - cc->right = right; - cc->nextState.bitContainer = 0; - return cc; -} - CharClassPtr createCharClassWord(RegexInfoPtr ri) { CharClassPtr cc = NEW(CharClass); cc->type = 'a'; cc->cond.w.word = ri->tokenValue; cc->cond.w.length = ri->ptr - ri->tokenValue; - cc->nextState.bitContainer = 0; + cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue; + cc->nextState = ri->current->bitSet; return cc; } @@ -89,13 +79,15 @@ 13. b--e */ -CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) { - +CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) { + if (cc == NULL) { + createCharClassRange(begin,end,nextState.bitContainer,0,0); + } if (end < cc->cond.range.begin ) { // 1 if (cc->left) { - cc->left = insertCharClass(cc->left,begin,end); + cc->left = insertCharClass(cc->left,begin,end,nextState); } else { - cc->left = createCharClassRange(begin,end,0,0); + cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0); } return cc; } else if (end == cc->cond.range.begin ) { // 2 @@ -108,9 +100,9 @@ return cc; } else if (begin > cc->cond.range.end ) { // 13 if (cc->right) { - cc->right = insertCharClass(cc->right,begin,end); + cc->right = insertCharClass(cc->right,begin,end,nextState); } else { - cc->right = createCharClassRange(begin,end,0,0); + cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0); } return cc; } @@ -118,7 +110,7 @@ CharClassPtr right = cc->right; begin = cc->cond.range.begin; free(cc); - return insertCharClass(right,begin,end); + return insertCharClass(right,begin,end,nextState); } if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 @@ -162,10 +154,10 @@ RangeListPtr r = cc->cond.range.next; cc->cond.range.next = 0; for (; r; r = r->next) { - cc = insertCharClass(cc, r->begin, r->end); + cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState); } - n->cc = cc; + n->cc = ri->current->cc = mergeTransition(ri->current,cc); // TODO literal support // merge rangeList here if (*ri->ptr) ri->ptr++; @@ -255,11 +247,26 @@ if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); + ri->asterisk = true; + ri->node = n; } - return n; } +RegexInfoPtr createRegexInfo (RegexInfoPtr ri) { + ri->stateNumber++; + ri->asterisk = false; + ri->current = NEW(State); + ri->current->bitState.bitContainer = 0 + bitSet(ri.current->bitState,ri->stateNumber); + ri->current->next = ri->states; + ri->current->cc = NULL; + ri->current->node = NULL; + ri->states = ri->current; + return ri; +} + + // ::= | '*' | '|' | '*' | NodePtr regex(RegexInfoPtr ri) { token(ri); @@ -268,17 +275,28 @@ if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); + ri->asterisk = true; + ri->node = n; return n; } else if (ri->tokenType == '|') { n = createNode(ri,'|',0,n,0); NodePtr n1 = regex(ri); n->right = n1; + n->node = n; } else if (ri->tokenType == ')') { return n; } else { n = createNode(ri,'+',0,n,0); - NodePtr n1 = regexAtom(ri); - n->right = n1; + if (!ri->asterisk) { + StatePtr save = ri->current; + ri = createRegexInfo(ri); + NodePtr n1 = regexAtom(ri); + n->right = n1; + ri->current = save; + } else { + NodePtr n1 = regexAtom(ri); + n->right = n1; + } } } return n; diff -r 8de9a33f6ae5 -r 5e8c6857934c regexParser/regexParser.h --- a/regexParser/regexParser.h Wed Dec 23 17:28:59 2015 +0900 +++ b/regexParser/regexParser.h Wed Dec 23 19:17:36 2015 +0900 @@ -18,7 +18,7 @@ struct utf8Range *next; // only used in the parser. } RangeList , *RangeListPtr; -typedef union condition { +typedef struct condition { RangeList range; Word w; } Condition, *ConditionList; @@ -43,7 +43,10 @@ unsigned char *ptr; unsigned char tokenType; unsigned char *tokenValue; - int nodeNumber; + int stateNumber; + bool asterisk; + StatePtr current; + StatePtr states; } RegexInfo, *RegexInfoPtr; CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right); diff -r 8de9a33f6ae5 -r 5e8c6857934c regexParser/subsetConstraction.cc --- a/regexParser/subsetConstraction.cc Wed Dec 23 17:28:59 2015 +0900 +++ b/regexParser/subsetConstraction.cc Wed Dec 23 19:17:36 2015 +0900 @@ -159,9 +159,12 @@ return current; } -TransitionPtr mergeTransition(TransitionPtr x,TransitionPtr y) { - CharClassWalkerPtr walk = createCharClassWalker(x->condition); - CharClassPtr ccy = y->condition; +CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) { + if (x->cc == NULL) { + return y; + } + CharClassWalkerPtr walk = createCharClassWalker(x->cc); + CharClassPtr ccy = y; BitVector bi; for (CharClassPtr cc = getNext(walk); hasNext(walk); cc=getNext(walk)) { unsigned long begin = cc->cond.range.begin; @@ -169,41 +172,34 @@ bi = cc->nextState; ccy = charClassMerge(ccy,begin,end,bi); } - TransitionPtr z = createTransition(ccy,&bi); free(walk); - return z; + return ccy; } -TGValue generateTransition(NodePtr n,TransitionGenerator tg) { +TGValue generateTransition(NodePtr n,TGValue tg) { TGValue tgv2; if (n->tokenType == '+') { TGValue tgv = generateTransition(n->right,tg); - TGValue tgv1 = generateTransition(n->left,tg); if (tgv.asterisk) { - tgv1.ts = mergeTransition(tgv.ts,tgv1.ts); + TGValue tgv1 = generateTransition(n->left,tgv); return tgv1; } + TGValue tgLeft = createTransitionGenerator(tgv); + TGValue tgv1 = generateTransition(n->left,tgLeft); return tgv; } else if (n->tokenType == '|') { TGValue tgv = generateTransition(n->left,tg); - TGValue tgv1 = generateTransition(n->right,tg); + TGValue tgv1 = generateTransition(n->right,tgv); tgv.tg = tgv1.tg; - tgv.ts = mergeTransition(tgv.ts,tgv1.ts); tgv.asterisk |= tgv1.asterisk; return tgv; } else if (n->tokenType == '*') { TGValue tgv = generateTransition(n->left,tg); tgv.asterisk = true; return tgv; - } else if (n->tokenType == 'c'){ - tgv2.ts = createTransition(n->cc,0); - return tgv2; - } else if (n->tokenType == 'a'){ - TGValue tgv; - tgv.ts = NEW(Transition); - tgv.ts->condition = n->cc; - bitSet(&tgv.ts->condition->nextState,n->nodeNumber); - tg.ts = appendTransition(tg.ts,tgv.ts); + } else if (n->tokenType == 'c' || n->tokenType == 'a'){ + TGValue tgv = tg; + tgv.ts = mergeTransition(tgv.ts,n->cc); return tgv; } else { // error diff -r 8de9a33f6ae5 -r 5e8c6857934c regexParser/transition.h --- a/regexParser/transition.h Wed Dec 23 17:28:59 2015 +0900 +++ b/regexParser/transition.h Wed Dec 23 19:17:36 2015 +0900 @@ -1,14 +1,9 @@ #include "bitVector.h" -typedef struct transition { - CharClassPtr condition; - struct transition *next; -} Transition, *TransitionPtr; - typedef struct state { BitVector bitState; - TransitionPtr transition; - NodePtr nextNode; + CharClassPtr cc; + NodePtr node; struct state *next; } State, *StatePtr;