# HG changeset patch # User masa # Date 1437642062 -32400 # Node ID 8fd3d35e98614bc73def16a2960daf05a8ff6abd # Parent a49b4a8b8c148358e3f724b397650a304cb711de add token function diff -r a49b4a8b8c14 -r 8fd3d35e9861 c/regexParser/main.cc --- a/c/regexParser/main.cc Tue Jul 14 16:45:07 2015 +0900 +++ b/c/regexParser/main.cc Thu Jul 23 18:01:02 2015 +0900 @@ -12,9 +12,22 @@ #include #include +typedef struct charClass { + unsigned char table[256]; + struct utf8Range { + unsigned char *begin; + unsigned char *end; + struct utf8Range next; + } *rangeList; +} + typedef struct node { + unsigned char type; + union value { + charClass *cc; + unsigned char *string; + } struct node *self; - char character; struct node *left; struct node *right; } Node, *NodePtr; @@ -25,16 +38,18 @@ NodePtr charClass(); NodePtr string(); NodePtr group(); -NodePtr _or(); +NodePtr orexp(); NodePtr asterisk(); NodePtr regex(); NodePtr createNode(char,NodePtr,NodePtr); bool isLiteral(char c) { - if (('a'<=c && c<='z')||('A'<=c && c<='Z')||('0'<=c && c<='9')) { - return true; - } - return false; + if (*ptr > 0x7f) return true; + else if (*ptr == '(') return false; + else if (*ptr == '[') return false; + else if (*ptr == '|') return false; + else if (*ptr == '*') return false; + return true; } void printNodeDate(NodePtr n) { @@ -85,54 +100,86 @@ return n; } -// ::= '('')' | '(''|'')' +// ::= '('')' NodePtr group() { - NodePtr n; + token(); + NodePtr n = regex(); + token(); if (*ptr == ')') { - n = createNode(0,0,0); - ptr++; + n = createNode('(',n,0); } else { - ptr++; - n = regex(); + // ) required } return n; } - -// ::= '|' -NodePtr _or() { - ptr++; - NodePtr n = createNode('|',regexHeadNode,regex()); - return n; +// ::= |'*'|'|'| +NodePtr regex() { + NodePtr n = regexAtom(); + while (*ptr) { + token(); + if (tokenType == '*') { + n = createNode('*',n,0); + } else if (tokenType == '|') { + NodePtr n1 = regex(); + n = createNode('|',n,n1); + } else { + NodePtr n1 = regex(); + n = 
createNode('+',n,n1); + } + } } -// <*> ::= '*' -NodePtr asterisk() { - ptr++; - NodePtr n = createNode('*',regexHeadNode,regex()); +// ::= || +NodePtr regexAtom() { + + token(); + NodePter n; + if (tokenType == 'a') n = literal(); + else if (tokenType == '[') n = charClass(); + else if (tokenType == '(') n = group(); + return n; } -// ::= ||||<*> -NodePtr regex() { - - NodePtr n; - +void token() { while (*ptr != '\0') { if ((*ptr == '(') || (*ptr == ')')) { - n = group(); + tokenType = *ptr++; + tokenValue = 0; + return ; } else if (*ptr == '[') { - n = charClass(); + tokenType = '['; + tokenValue = ptr; + if (ptr[1] == ']') { + ptr++; + } + while (*ptr != ']') ptr++; + ptr++; + return; } else if (*ptr == '|'){ - n = _or(); + tokenType = '|'; + tokenValue = 0; + return; } else if (*ptr == '*'){ - n = asterisk(); - } else { - n = string(); - regexHeadNode = n; + tokenType = '*'; + tokenValue = 0; + return; } + + tokenType = 'a'; + tokenValue = ptr; + + if (*ptr == '\\') ptr++; // need more processing + /* + \277 + \0xa5 + \[ + \\ + \utf-8 etc... + */ + } - return n; } int main(int argc, char **argv)