changeset 63:8fd3d35e9861

add token function
author masa
date Thu, 23 Jul 2015 18:01:02 +0900
parents a49b4a8b8c14
children e0ad6c145f89
files c/regexParser/main.cc
diffstat 1 files changed, 82 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/c/regexParser/main.cc	Tue Jul 14 16:45:07 2015 +0900
+++ b/c/regexParser/main.cc	Thu Jul 23 18:01:02 2015 +0900
@@ -12,9 +12,22 @@
 #include <stdlib.h>
 #include <string.h>
 
+typedef struct charClass {
+    unsigned char table[256];
+    struct utf8Range {
+        unsigned char *begin;
+        unsigned char *end;
+        struct utf8Range next;
+    } *rangeList;
+}
+
 typedef struct node {
+    unsigned char type;
+    union value {
+        charClass *cc;
+        unsigned char *string;
+    }
     struct node *self;
-    char character;
     struct node *left;
     struct node *right;
 } Node, *NodePtr;
@@ -25,16 +38,18 @@
 NodePtr charClass();
 NodePtr string();
 NodePtr group();
-NodePtr _or();
+NodePtr orexp();
 NodePtr asterisk();
 NodePtr regex();
 NodePtr createNode(char,NodePtr,NodePtr);
 
 bool isLiteral(char c) {
-    if (('a'<=c && c<='z')||('A'<=c && c<='Z')||('0'<=c && c<='9')) {
-        return true;
-    }
-    return false;
+    if (*ptr > 0x7f) return true;
+    else if (*ptr == '(') return false;
+    else if (*ptr == '[') return false;
+    else if (*ptr == '|') return false;
+    else if (*ptr == '*') return false;
+    return true;
 }
 
 void printNodeDate(NodePtr n) {
@@ -85,54 +100,86 @@
     return n;
 }
 
-// <group> ::= '('<regex>')' | '('<regex>'|'<regex>')'
+// <group> ::= '('<regex>')'
 NodePtr group() {
-    NodePtr n;
+    token();
+    NodePtr n = regex();
+    token();
     if (*ptr == ')') {
-        n = createNode(0,0,0);
-        ptr++;
+        n = createNode('(',n,0);
     } else {
-        ptr++;
-        n = regex();
+        // ) required
     }
     return n;
 }
 
-
-// <or> ::= <regex>'|'<regex>
-NodePtr _or() {
-    ptr++;
-    NodePtr n = createNode('|',regexHeadNode,regex());
-    return n;
+// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
+NodePtr regex() {
+    NodePtr n = regexAtom();
+    while (*ptr) {
+        token();
+        if (tokenType == '*') {
+            n = createNode('*',n,0);
+        } else if (tokenType == '|') {
+            NodePtr n1 = regex();
+            n = createNode('|',n,n1);
+        } else {
+            NodePtr n1 = regex();
+            n = createNode('+',n,n1);
+        }
+    }
 }
 
-// <*> ::= <regex>'*'
-NodePtr asterisk() {
-    ptr++;
-    NodePtr n = createNode('*',regexHeadNode,regex());
+// <regexAtom> ::= <literal>|<charClass>|<group>
+NodePtr regexAtom() {
+
+    token();
+    NodePter n;
+    if (tokenType == 'a') n = literal();
+    else if (tokenType == '[') n = charClass();
+    else if (tokenType == '(') n = group();
+
     return n;
 }
 
-// <regex> ::= <string>|<or>|<charClass>|<group>|<*>
-NodePtr regex() {
-
-    NodePtr n;
-
+void token() {
     while (*ptr != '\0') {
         if ((*ptr == '(') || (*ptr == ')')) {
-            n = group();
+            tokenType = *ptr++;
+            tokenValue = 0;
+            return ;
         } else if (*ptr == '[') {
-            n = charClass();
+            tokenType = '[';
+            tokenValue = ptr;
+            if (ptr[1] == ']') {
+                ptr++;
+            }
+            while (*ptr != ']') ptr++;
+            ptr++;
+            return;
         } else if (*ptr == '|'){
-            n = _or();
+            tokenType = '|';
+            tokenValue = 0;
+            return;
         } else if (*ptr == '*'){
-            n = asterisk();
-        } else {
-            n = string();
-            regexHeadNode = n;
+            tokenType = '*';
+            tokenValue = 0;
+            return;
         }
+
+        tokenType = 'a';
+        tokenValue = ptr;
+
+        if (*ptr == '\\') ptr++; // need more processing
+        /*
+            \277
+            \0xa5
+            \[
+            \\
+            \utf-8 etc...
+        */
+
     }
-    return n;
 }
 
 int main(int argc, char **argv)