Mercurial > hg > Applications > Grep
diff regexParser/regexParser.cc @ 178:5e8c6857934c pairPro
implement charClassMerge
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 23 Dec 2015 19:17:36 +0900 |
parents | 3be0fbcd4b52 |
children | 6cf8252f3912 |
line wrap: on
line diff
--- a/regexParser/regexParser.cc Wed Dec 23 17:28:59 2015 +0900 +++ b/regexParser/regexParser.cc Wed Dec 23 19:17:36 2015 +0900 @@ -43,23 +43,13 @@ return n; } -CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) { - CharClassPtr cc = NEW(CharClass); - cc->type = 'r'; - cc->cond.range.begin = begin; - cc->cond.range.end = end; - cc->left = left; - cc->right = right; - cc->nextState.bitContainer = 0; - return cc; -} - CharClassPtr createCharClassWord(RegexInfoPtr ri) { CharClassPtr cc = NEW(CharClass); cc->type = 'a'; cc->cond.w.word = ri->tokenValue; cc->cond.w.length = ri->ptr - ri->tokenValue; - cc->nextState.bitContainer = 0; + cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue; + cc->nextState = ri->current->bitSet; return cc; } @@ -89,13 +79,15 @@ 13. b--e */ -CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) { - +CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) { + if (cc == NULL) { + createCharClassRange(begin,end,nextState.bitContainer,0,0); + } if (end < cc->cond.range.begin ) { // 1 if (cc->left) { - cc->left = insertCharClass(cc->left,begin,end); + cc->left = insertCharClass(cc->left,begin,end,nextState); } else { - cc->left = createCharClassRange(begin,end,0,0); + cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0); } return cc; } else if (end == cc->cond.range.begin ) { // 2 @@ -108,9 +100,9 @@ return cc; } else if (begin > cc->cond.range.end ) { // 13 if (cc->right) { - cc->right = insertCharClass(cc->right,begin,end); + cc->right = insertCharClass(cc->right,begin,end,nextState); } else { - cc->right = createCharClassRange(begin,end,0,0); + cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0); } return cc; } @@ -118,7 +110,7 @@ CharClassPtr right = cc->right; begin = cc->cond.range.begin; free(cc); - return insertCharClass(right,begin,end); + return insertCharClass(right,begin,end,nextState); } if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 @@ -162,10 +154,10 @@ RangeListPtr r = cc->cond.range.next; cc->cond.range.next = 0; for (; r; r = r->next) { - cc = insertCharClass(cc, r->begin, r->end); + cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState); } - n->cc = cc; + n->cc = ri->current->cc = mergeTransition(ri->current,cc); // TODO literal support // merge rangeList here if (*ri->ptr) ri->ptr++; @@ -255,11 +247,26 @@ if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); + ri->asterisk = true; + ri->node = n; } - return n; } +RegexInfoPtr createRegexInfo (RegexInfoPtr ri) { + ri->stateNumber++; + ri->asterisk = false; + ri->current = NEW(State); + ri->current->bitState.bitContainer = 0 + bitSet(ri.current->bitState,ri->stateNumber); + ri->current->next = ri->states; + ri->current->cc = NULL; + ri->current->node = NULL; + ri->states = ri->current; + return ri; +} + + // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> NodePtr regex(RegexInfoPtr ri) { token(ri); @@ -268,17 +275,28 @@ if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); + ri->asterisk = true; + ri->node = n; return n; } else if (ri->tokenType == '|') { n = createNode(ri,'|',0,n,0); NodePtr n1 = regex(ri); n->right = n1; + n->node = n; } else if (ri->tokenType == ')') { return n; } else { n = createNode(ri,'+',0,n,0); - NodePtr n1 = regexAtom(ri); - n->right = n1; + if (!ri->asterisk) { + StatePtr save = ri->current; + ri = createRegexInfo(ri); + NodePtr n1 = regexAtom(ri); + n->right = n1; + ri->current = save; + } else { + NodePtr n1 = regexAtom(ri); + n->right = n1; + } } } return n;