diff c/regexParser/createRegexParser.cc @ 95:1cdad0468484 impl-bitvector

rename createRegexTree to createRegexParser
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Tue, 10 Nov 2015 15:35:49 +0900
parents c/regexParser/createRegexTree.cc@912d7bd51f38
children 0b6940588e88
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/c/regexParser/createRegexParser.cc	Tue Nov 10 15:35:49 2015 +0900
@@ -0,0 +1,144 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "regexParser.h"
+
+NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr);
+NodePtr charClass(RegexInfoPtr);
+NodePtr group(RegexInfoPtr);
+void token(RegexInfoPtr);
+NodePtr regexAtom(RegexInfoPtr);
+NodePtr regex(RegexInfoPtr);
+
+/**
+ * Create a node of regex parse tree.
+ *     tokenType
+ *     regexPosition(state)
+ *     stateTransitionTable
+ */
+NodePtr createNode(RegexInfoPtr ri,unsigned char character, NodePtr left, NodePtr right) {
+    NodePtr n = (NodePtr)malloc(sizeof(Node));
+    n->tokenType = ri->tokenType;
+    n->self = n;
+    n->Value.character = character;
+    n->left = left;
+    n->right = right;
+
+    if (ri->tokenType == 'a') {
+        n->nodeNumber = ri->nodeNumber;
+        ri->nodeNumber++;
+        ri->tokenType = 0;
+    }
+    return n;
+}
+
+// <charClass> ::= '['<literal>'-'<literal>']'
+NodePtr charClass(RegexInfoPtr ri) {
+    NodePtr n = (NodePtr)malloc(sizeof(Node));
+    unsigned char startChar = ri->ptr[0];
+    while (ri->ptr[0] == '-') {
+        ri->ptr++;
+    }
+    unsigned char endChar = ri->ptr[0];
+    unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256);
+
+    return n;
+}
+
+// <literal> ::= [a-z][A-Z][0-9]
+NodePtr literal(RegexInfoPtr ri) {
+    unsigned char *top = ri->ptr;
+    NodePtr n = createNode(ri,ri->ptr[0],0,0);
+    ri->ptr++;
+    return n;
+}
+
+// <group> ::= '('<regex>')'
+NodePtr group(RegexInfoPtr ri) {
+    return regex(ri);
+}
+
+
+
+void token(RegexInfoPtr ri) {
+    while (ri->ptr[0] != '\0') {
+        if (ri->ptr[0] == '('){
+            ri->ptr++;
+            ri->tokenType = '(';
+            ri->tokenValue = 0;
+            if (ri->ptr[1] == ')') {
+                ri->ptr++;
+            }
+            return;
+        } else if (ri->ptr[0] == ')') {
+            ri->ptr++;
+            ri->tokenType = ')';
+            ri->tokenValue = ri->ptr[0];
+            return;
+        } else if (ri->ptr[0] == '[') {
+            ri->ptr++;
+            ri->tokenType = '[';
+            ri->tokenValue = ri->ptr[0];
+            if (ri->ptr[1] == ']') {
+                ri->ptr++;
+            }
+            return;
+        } else if (ri->ptr[0] == '|'){
+            ri->ptr++;
+            ri->tokenType = '|';
+            ri->tokenValue = 0;
+            return;
+        } else if (ri->ptr[0] == '*'){
+            ri->ptr++;
+            ri->tokenType = '*';
+            ri->tokenValue = 0;
+            return;
+        } else if (ri->ptr[0] == '\\'){
+            // need more proccesing 
+            /*
+                \277
+                \0xa5
+                \[
+                \\
+                \utf-8 etc...
+            */
+        } else {
+            ri->tokenType = 'a';
+            ri->tokenValue = ri->ptr[0];
+            return;
+        }
+    }
+    ri->tokenType = 0;
+    ri->tokenValue = 0;
+    return;
+}
+
+// <regexAtom> ::= <literal>|<charClass>|<group>
+NodePtr regexAtom(RegexInfoPtr ri) {
+
+    token(ri);
+    NodePtr n = NULL;
+    if (ri->tokenType == 'a') n = literal(ri);
+    else if (ri->tokenType == '[') n = charClass(ri);
+    else if (ri->tokenType == '(') n = group(ri);
+
+    return n;
+}
+
+// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
+NodePtr regex(RegexInfoPtr ri) {
+    NodePtr n = regexAtom(ri);
+    while (ri->ptr[0]) {
+        token(ri);
+        if (ri->tokenType == '*') {
+            n = createNode(ri,'*',n,0);
+        } else if (ri->tokenType == '|') {
+            NodePtr n1 = regex(ri);
+            n = createNode(ri,'|',n,n1);
+        } else if (ri->tokenType == ')') {
+            return n;
+        } else {
+            NodePtr n1 = regex(ri);
+            n = createNode(ri,'+',n,n1);
+        }
+    } return n;
+}