Mercurial > hg > Applications > Grep
diff regexParser/regexParser.cc @ 167:3bf2c6d6d53e pairPro
move regexparser dir
author | masa |
---|---|
date | Sat, 19 Dec 2015 15:38:45 +0900 |
parents | c/regexParser/regexParser.cc@1c9e8ba64f6a |
children | b9e913030a47 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/regexParser/regexParser.cc Sat Dec 19 15:38:45 2015 +0900 @@ -0,0 +1,281 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include "regexParser.h" + +static NodePtr charClass(RegexInfoPtr); +static void token(RegexInfoPtr); +static NodePtr regexAtom(RegexInfoPtr); +NodePtr regex(RegexInfoPtr); + +/** + * Create a node of regex parse tree. + * tokenType + * regexPosition(state) + * stateTransitionTable + */ + +static +NodePtr allocateNode() { + NodePtr n = NEW(Node); + n->cc = NULL; + n->left = NULL; + n->right = NULL; + return n; +} + +static +NodePtr createNode(RegexInfoPtr ri,unsigned char type,CharClassPtr cc, NodePtr left, NodePtr right) { + NodePtr n = allocateNode(); + + n->tokenType = type; + n->cc = cc; + n->left = left; + n->right = right; + n->nodeNumber = ri->nodeNumber; + ri->nodeNumber++; + + return n; +} + +CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) { + CharClassPtr cc = NEW(CharClass); + cc->type = 'r'; + cc->cond.range.begin = begin; + cc->cond.range.end = end; + cc->left = left; + cc->right = right; + cc->nextState.bitContainer = 0; + return cc; +} + +CharClassPtr createCharClassWord(RegexInfoPtr ri) { + CharClassPtr cc = NEW(CharClass); + cc->type = 'a'; + cc->cond.w.word = ri->tokenValue; + cc->cond.w.length = ri->ptr - ri->tokenValue; + cc->nextState.bitContainer = 0; + return cc; +} + +/* + cond.range.begin cond.range.end + |----------------| + 1.b---e + 2.b------e + 3.b------------e + 4.b-----------------------e + 5.b----------------------------e + + |----------------| + 6. b---------e + 7. b----------------e + 8. b---------------------e + + |----------------| + 9. b-----e + 10. b--------e + 11. b-------------e + + |----------------| + 12. b-----e + + |----------------| + 13. b--e + + */ +CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) { + + if (end < cc->cond.range.begin ) { // 1 + if (cc->left) { + cc->left = insertCharClass(cc->left,begin,end); + } else { + cc->left = createCharClassRange(begin,end,0,0); + } + return cc; + } else if (end == cc->cond.range.begin ) { // 2 + cc->cond.range.begin = begin; + return cc; + } else if (end <= cc->cond.range.end) { // 3,4,6,7,9,10 + if (begin < cc->cond.range.begin) { // 3,4 + cc->cond.range.begin = begin; + } + return cc; + } else if (begin > cc->cond.range.end ) { // 13 + if (cc->right) { + cc->right = insertCharClass(cc->right,begin,end); + } else { + cc->right = createCharClassRange(begin,end,0,0); + } + return cc; + } + if (cc->right) { + CharClassPtr right = cc->right; + begin = cc->cond.range.begin; + free(cc); + return insertCharClass(right,begin,end); + } + if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 + if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 + } else if (begin < cc->cond.range.begin) { // 5 + cc->cond.range.begin = begin; + cc->cond.range.end = end; + } else { + printf("insertCharClass Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end); + } + return cc; +} + +// <charClass> ::= '['<literal>'-'<literal>']' +static +NodePtr charClass(RegexInfoPtr ri) { + CharClassPtr cc = NEW(CharClass); + NodePtr n = createNode(ri,'c',cc,0,0); + cc->type = 'r'; + cc->nextState.bitContainer = 0; + cc->left = NULL; + cc->right = NULL; + RangeListPtr rangeList = &cc->cond.range; + rangeList->begin = *ri->ptr; + rangeList->end = *ri->ptr; + rangeList->next = NULL; + + for (ri->ptr++; *ri->ptr && *ri->ptr != ']'; ri->ptr++) { + if (*ri->ptr == '-') { + rangeList->end = *(ri->ptr + 1); + ri->ptr++; + continue; + } + if (ri->ptr[0] == 0 || ri->ptr[0] == ']') break; + rangeList->next = NEW(RangeList); + rangeList = rangeList->next; + rangeList->begin = *ri->ptr; + rangeList->end = *ri->ptr; + rangeList->next = NULL; + } + + RangeListPtr r = cc->cond.range.next; + cc->cond.range.next = 0; + for (; r; r = r->next) { + cc = insertCharClass(cc, r->begin, r->end); + } + + n->cc = cc; + // TODO literal support + // merge rangeList here + if (*ri->ptr) ri->ptr++; + token(ri); + return n; +} + +// <literal> ::= [a-z][A-Z][0-9] +static +NodePtr literal(RegexInfoPtr ri) { + CharClassPtr cc = createCharClassWord(ri); + token(ri); + NodePtr n = createNode(ri,'a',cc,0,0); + return n; +} + +static +void token(RegexInfoPtr ri) { + while (ri->ptr[0] != '\0') { + if (ri->ptr[0] == '('){ + ri->ptr++; + ri->tokenType = '('; + ri->tokenValue = NULL; + return; + } else if (ri->ptr[0] == ')') { + ri->ptr++; + ri->tokenType = ')'; + ri->tokenValue = ri->ptr; + return; + } else if (ri->ptr[0] == ']') { + ri->ptr++; + ri->tokenType = ']'; + ri->tokenValue = ri->ptr; + return; + } else if (ri->ptr[0] == '|'){ + ri->ptr++; + ri->tokenType = '|'; + ri->tokenValue = NULL; + return; + } else if (ri->ptr[0] == '*'){ + ri->ptr++; + ri->tokenType = '*'; + ri->tokenValue = NULL; + return; + } else if (ri->ptr[0] == '\\'){ + // need more proccesing + /* + \277 + \0xa5 + \[ + \\ + \utf-8 etc... + */ + } else if (ri->ptr[0] == '[') { + ri->ptr++; + ri->tokenType = 'c'; + ri->tokenValue = ri->ptr; + return; + } else { + ri->tokenType = 'a'; + ri->tokenValue = ri->ptr; + if (isalnum(ri->ptr[0])) { + ri->ptr++; + } + return; + } + } + ri->tokenType = 0; + ri->tokenValue = NULL; + return; +} + +// <regexAtom> ::= <literal>|<charClass>|<group> +static +NodePtr regexAtom(RegexInfoPtr ri) { + + NodePtr n = NULL; + if (ri->tokenType == 'c') n = charClass(ri); + else if (ri->tokenType == 'a') n = literal(ri); + else if (ri->tokenType == '(') { + n = regex(ri); + if (ri->tokenType != ')') { + // error + } + token(ri); + } + if (ri->tokenType == '*') { + n = createNode(ri,'*',0,n,0); + token(ri); + } + + return n; +} + +// <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> +NodePtr regex(RegexInfoPtr ri) { + token(ri); + NodePtr n = regexAtom(ri); + while (ri->tokenType) { + if (ri->tokenType == '*') { + n = createNode(ri,'*',0,n,0); + token(ri); + return n; + } else if (ri->tokenType == '|') { + n = createNode(ri,'|',0,n,0); + NodePtr n1 = regex(ri); + n->right = n1; + } else if (ri->tokenType == ')') { + return n; + } else { + n = createNode(ri,'+',0,n,0); + NodePtr n1 = regexAtom(ri); + n->right = n1; + } + } + return n; +}