Mercurial > hg > Applications > Grep
diff c/regexParser/regexParser.cc @ 115:ca30f8334741 pairPro
rename createRegexParser.cc to regexParser.cc
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 24 Nov 2015 14:38:26 +0900 |
parents | c/regexParser/createRegexParser.cc@ec485345daf9 |
children | 66c633575b53 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/c/regexParser/regexParser.cc Tue Nov 24 14:38:26 2015 +0900 @@ -0,0 +1,158 @@ +#include <stdlib.h> +#include <stdio.h> +#include "regexParser.h" +#include "error.h" + +typedef struct regexInfo { + unsigned char *ptr; + unsigned char tokenType; + int tokenValue; + int nodeNumber; +} RegexInfo, *RegexInfoPtr; + +static NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr); +static NodePtr charClass(RegexInfoPtr); +static NodePtr group(RegexInfoPtr); +static void token(RegexInfoPtr); +static NodePtr regexAtom(RegexInfoPtr); +NodePtr regex(RegexInfoPtr); + +/** + * Create a node of regex parse tree. + * tokenType + * regexPosition(state) + * stateTransitionTable + */ + +static +NodePtr createNode(RegexInfoPtr ri,unsigned char character, NodePtr left, NodePtr right) { + NodePtr n = (NodePtr)malloc(sizeof(Node)); + if (n == NULL) { + mallocFailedMessage(); + } + + n->tokenType = ri->tokenType; + n->cc->conditionList->character = character; + n->left = left; + n->right = right; + + if (ri->tokenType == 'a') { + n->nodeNumber = ri->nodeNumber; + ri->nodeNumber++; + ri->tokenType = 0; + } + return n; +} + +// <charClass> ::= '['<literal>'-'<literal>']' +static +NodePtr charClass(RegexInfoPtr ri) { + NodePtr n = (NodePtr)malloc(sizeof(Node)); + if (n == NULL) { + mallocFailedMessage(); + } + while (ri->ptr[0] == '-') { + ri->ptr++; + } + return n; +} + +// <literal> ::= [a-z][A-Z][0-9] +static +NodePtr literal(RegexInfoPtr ri) { + NodePtr n = createNode(ri,ri->ptr[0],0,0); + ri->ptr++; + return n; +} + +// <group> ::= '('<regex>')' +static +NodePtr group(RegexInfoPtr ri) { + return regex(ri); +} + +static +void token(RegexInfoPtr ri) { + while (ri->ptr[0] != '\0') { + if (ri->ptr[0] == '('){ + ri->ptr++; + ri->tokenType = '('; + ri->tokenValue = 0; + if (ri->ptr[1] == ')') { + ri->ptr++; + } + return; + } else if (ri->ptr[0] == ')') { + ri->ptr++; + ri->tokenType = ')'; + ri->tokenValue = ri->ptr[0]; + return; + } else if (ri->ptr[0] == '[') { + ri->ptr++; + ri->tokenType = '['; + ri->tokenValue = ri->ptr[0]; + if (ri->ptr[1] == ']') { + ri->ptr++; + } + return; + } else if (ri->ptr[0] == '|'){ + ri->ptr++; + ri->tokenType = '|'; + ri->tokenValue = 0; + return; + } else if (ri->ptr[0] == '*'){ + ri->ptr++; + ri->tokenType = '*'; + ri->tokenValue = 0; + return; + } else if (ri->ptr[0] == '\\'){ + // need more proccesing + /* + \277 + \0xa5 + \[ + \\ + \utf-8 etc... + */ + } else { + ri->tokenType = 'a'; + ri->tokenValue = ri->ptr[0]; + return; + } + } + ri->tokenType = 0; + ri->tokenValue = 0; + return; +} + +// <regexAtom> ::= <literal>|<charClass>|<group> +static +NodePtr regexAtom(RegexInfoPtr ri) { + + token(ri); + NodePtr n = NULL; + if (ri->tokenType == 'a') n = literal(ri); + else if (ri->tokenType == '[') n = charClass(ri); + else if (ri->tokenType == '(') n = group(ri); + + return n; +} + +// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex> +NodePtr regex(RegexInfoPtr ri) { + NodePtr n = regexAtom(ri); + while (ri->ptr[0]) { + token(ri); + if (ri->tokenType == '*') { + n = createNode(ri,'*',n,0); + } else if (ri->tokenType == '|') { + NodePtr n1 = regex(ri); + n = createNode(ri,'|',n,n1); + } else if (ri->tokenType == ')') { + return n; + } else { + NodePtr n1 = regex(ri); + n = createNode(ri,'+',n,n1); + } + } return n; +}