Mercurial > hg > Applications > Grep
view regexParser/regexParser.cc @ 178:5e8c6857934c pairPro
implement charClassMerge
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 23 Dec 2015 19:17:36 +0900 |
parents | 3be0fbcd4b52 |
children | 6cf8252f3912 |
line wrap: on
line source
#include <stdlib.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include "regexParser.h" static NodePtr charClass(RegexInfoPtr); static void token(RegexInfoPtr); static NodePtr regexAtom(RegexInfoPtr); NodePtr regex(RegexInfoPtr); /** * Create a node of regex parse tree. * tokenType * regexPosition(state) * stateTransitionTable */ static NodePtr allocateNode() { NodePtr n = NEW(Node); n->cc = NULL; n->left = NULL; n->right = NULL; return n; } static NodePtr createNode(RegexInfoPtr ri,unsigned char type,CharClassPtr cc, NodePtr left, NodePtr right) { NodePtr n = allocateNode(); n->tokenType = type; n->cc = cc; n->left = left; n->right = right; if (n->tokenType == 'a' || n->tokenType == 'c') { n->nodeNumber = ri->nodeNumber; ri->nodeNumber++; } else { n->nodeNumber = SYNTAX_NODENUMBER; } return n; } CharClassPtr createCharClassWord(RegexInfoPtr ri) { CharClassPtr cc = NEW(CharClass); cc->type = 'a'; cc->cond.w.word = ri->tokenValue; cc->cond.w.length = ri->ptr - ri->tokenValue; cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue; cc->nextState = ri->current->bitSet; return cc; } /* cond.range.begin cond.range.end |----------------| 1.b---e 2.b------e 3.b------------e 4.b-----------------------e 5.b----------------------------e |----------------| 6. b---------e 7. b----------------e 8. b---------------------e |----------------| 9. b-----e 10. b--------e 11. b-------------e |----------------| 12. b-----e |----------------| 13. b--e */ CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) { if (cc == NULL) { createCharClassRange(begin,end,nextState.bitContainer,0,0); } if (end < cc->cond.range.begin ) { // 1 if (cc->left) { cc->left = insertCharClass(cc->left,begin,end,nextState); } else { cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0); } return cc; } else if (end == cc->cond.range.begin ) { // 2 cc->cond.range.begin = begin; return cc; } else if (end <= cc->cond.range.end) { // 3,4,6,7,9,10 if (begin < cc->cond.range.begin) { // 3,4 cc->cond.range.begin = begin; } return cc; } else if (begin > cc->cond.range.end ) { // 13 if (cc->right) { cc->right = insertCharClass(cc->right,begin,end,nextState); } else { cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0); } return cc; } if (cc->right) { CharClassPtr right = cc->right; begin = cc->cond.range.begin; free(cc); return insertCharClass(right,begin,end,nextState); } if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 } else if (begin < cc->cond.range.begin) { // 5 cc->cond.range.begin = begin; cc->cond.range.end = end; } else { printf("insertCharClass Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end); } return cc; } // <charClass> ::= '['<literal>'-'<literal>']' static NodePtr charClass(RegexInfoPtr ri) { CharClassPtr cc = NEW(CharClass); NodePtr n = createNode(ri,'c',cc,0,0); cc->type = 'r'; cc->nextState.bitContainer = 0; cc->left = NULL; cc->right = NULL; RangeListPtr rangeList = &cc->cond.range; rangeList->begin = *ri->ptr; rangeList->end = *ri->ptr; rangeList->next = NULL; for (ri->ptr++; *ri->ptr && *ri->ptr != ']'; ri->ptr++) { if (*ri->ptr == '-') { rangeList->end = *(ri->ptr + 1); ri->ptr++; continue; } if (ri->ptr[0] == 0 || ri->ptr[0] == ']') break; rangeList->next = NEW(RangeList); rangeList = rangeList->next; rangeList->begin = *ri->ptr; rangeList->end = *ri->ptr; rangeList->next = NULL; } RangeListPtr r = cc->cond.range.next; cc->cond.range.next = 0; for (; r; r = r->next) { cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState); } n->cc = ri->current->cc = mergeTransition(ri->current,cc); // TODO literal support // merge rangeList here if (*ri->ptr) ri->ptr++; token(ri); return n; } // <literal> ::= [a-z][A-Z][0-9] static NodePtr literal(RegexInfoPtr ri) { CharClassPtr cc = createCharClassWord(ri); token(ri); NodePtr n = createNode(ri,'a',cc,0,0); return n; } static void token(RegexInfoPtr ri) { while (ri->ptr[0] != '\0') { if (ri->ptr[0] == '('){ ri->ptr++; ri->tokenType = '('; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == ')') { ri->ptr++; ri->tokenType = ')'; ri->tokenValue = ri->ptr; return; } else if (ri->ptr[0] == ']') { ri->ptr++; ri->tokenType = ']'; ri->tokenValue = ri->ptr; return; } else if (ri->ptr[0] == '|'){ ri->ptr++; ri->tokenType = '|'; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == '*'){ ri->ptr++; ri->tokenType = '*'; ri->tokenValue = NULL; return; } else if (ri->ptr[0] == '\\'){ // need more proccesing /* \277 \0xa5 \[ \\ \utf-8 etc... */ } else if (ri->ptr[0] == '[') { ri->ptr++; ri->tokenType = 'c'; ri->tokenValue = ri->ptr; return; } else { ri->tokenType = 'a'; ri->tokenValue = ri->ptr; if (isalnum(ri->ptr[0])) { ri->ptr++; } return; } } ri->tokenType = 0; ri->tokenValue = NULL; return; } // <regexAtom> ::= <literal>|<charClass>|<group> static NodePtr regexAtom(RegexInfoPtr ri) { NodePtr n = NULL; if (ri->tokenType == 'c') n = charClass(ri); else if (ri->tokenType == 'a') n = literal(ri); else if (ri->tokenType == '(') { n = regex(ri); if (ri->tokenType != ')') { // error } token(ri); } if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); ri->asterisk = true; ri->node = n; } return n; } RegexInfoPtr createRegexInfo (RegexInfoPtr ri) { ri->stateNumber++; ri->asterisk = false; ri->current = NEW(State); ri->current->bitState.bitContainer = 0 bitSet(ri.current->bitState,ri->stateNumber); ri->current->next = ri->states; ri->current->cc = NULL; ri->current->node = NULL; ri->states = ri->current; return ri; } // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> NodePtr regex(RegexInfoPtr ri) { token(ri); NodePtr n = regexAtom(ri); while (ri->tokenType) { if (ri->tokenType == '*') { n = createNode(ri,'*',0,n,0); token(ri); ri->asterisk = true; ri->node = n; return n; } else if (ri->tokenType == '|') { n = createNode(ri,'|',0,n,0); NodePtr n1 = regex(ri); n->right = n1; n->node = n; } else if (ri->tokenType == ')') { return n; } else { n = createNode(ri,'+',0,n,0); if (!ri->asterisk) { StatePtr save = ri->current; ri = createRegexInfo(ri); NodePtr n1 = regexAtom(ri); n->right = n1; ri->current = save; } else { NodePtr n1 = regexAtom(ri); n->right = n1; } } } return n; }