changeset 115:ca30f8334741 pairPro

rename createRegexParser.cc to regexParser.cc
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Tue, 24 Nov 2015 14:38:26 +0900
parents c82e7a7ef8d9
children 66c633575b53
files c/regexParser/bitVector.cc c/regexParser/bitVectorNode.cc c/regexParser/createRegexParser.cc c/regexParser/error.cc c/regexParser/error.h c/regexParser/regexParser.cc c/regexParser/regexParser.h c/regexParser/word.c
diffstat 8 files changed, 200 insertions(+), 171 deletions(-) [+]
line wrap: on
line diff
--- a/c/regexParser/bitVector.cc	Mon Nov 23 19:19:43 2015 +0900
+++ b/c/regexParser/bitVector.cc	Tue Nov 24 14:38:26 2015 +0900
@@ -2,19 +2,23 @@
 #include <stdlib.h>
 #include <string.h>
 #include "bitVector.h"
+#include "regexParser.h"
 
-void bitPrint(BitVectorPtr bi);
+extern BitVectorListPtr allocateBitVectorList();
+BitVectorListPtr createBitVector(NodePtr,BitVectorListPtr);
+const BitVectorPtr allocateBitVector();
+BitVectorPtr bitSet(int);
+void bitPrint(BitVectorPtr);
 
 int bitBlock = sizeof(unsigned long) * 8;
 
 BitVectorListPtr createBitVector(NodePtr n,BitVectorListPtr bvl) {
     BitVectorListPtr nextBvl = allocateBitVectorList();
     nextBvl->bi = bitSet(n->nodeNumber);
-    nextBvl->initBvl = initBvl;
     return nextBvl;
 }
 
-BitVectorPtr createBitVector(int bitSetPosition) {
+const BitVectorPtr allocateBitVector() {
 
     BitVectorPtr bi = (BitVectorPtr)malloc(sizeof(BitVector));
     if (bi == NULL) {
@@ -35,7 +39,9 @@
     return bi;
 }
 
-BitVectorPtr bitSet(BitVectorPtr bi, int bitSetPosition) {
+BitVectorPtr bitSet(int bitSetPosition) {
+
+    BitVectorPtr bi = allocateBitVector();
 
     bi->arrayNum = (bitSetPosition + bitBlock - 1) / bitBlock;
 
--- a/c/regexParser/bitVectorNode.cc	Mon Nov 23 19:19:43 2015 +0900
+++ b/c/regexParser/bitVectorNode.cc	Tue Nov 24 14:38:26 2015 +0900
@@ -4,6 +4,10 @@
 #include "bitVector.h"
 #include "regexParser.h"
 
+extern BitVectorPtr bitSet(int);
+BitVectorListPtr allocateBitVectorList();
+BitVectorListPtr initBitVector();
+
 BitVectorListPtr allocateBitVectorList() {
     BitVectorListPtr bvl = (BitVectorListPtr)malloc(sizeof(BitVectorList));
     if (bvl == NULL) {
@@ -26,7 +30,7 @@
 BitVectorListPtr initBitVector() {
 
     BitVectorListPtr bvl = allocateBitVectorList();
-    bvl->initBvl = initBvl = bvl;
+    bvl->initBvl = bvl;
     bvl->bi = bitSet(0);
 
     for (int i = 0; i < 256; i++) {
--- a/c/regexParser/createRegexParser.cc	Mon Nov 23 19:19:43 2015 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,158 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include "regexParser.h"
-
-static NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr);
-static NodePtr charClass(RegexInfoPtr);
-static NodePtr group(RegexInfoPtr);
-static void token(RegexInfoPtr);
-static NodePtr regexAtom(RegexInfoPtr);
-NodePtr regex(RegexInfoPtr);
-
-/**
- * Create a node of regex parse tree.
- *     tokenType
- *     regexPosition(state)
- *     stateTransitionTable
- */
-
-static
-NodePtr createNode(RegexInfoPtr ri,unsigned char character, NodePtr left, NodePtr right) {
-    NodePtr n = (NodePtr)malloc(sizeof(Node));
-    if (n == NULL) {
-        mallocFailedMessage();
-    }
-
-    n->tokenType = ri->tokenType;
-    n->self = n;
-    n->Value.character = character;
-    n->left = left;
-    n->right = right;
-
-    if (ri->tokenType == '*') {
-        n->left->parent = n->self;
-    } else if (ri->tokenType != 'a') {
-        n->right = right;
-        n->left->parent = n->right->parent = n->self;
-    }
-
-    if (ri->tokenType == 'a') {
-        n->nodeNumber = ri->nodeNumber;
-        ri->nodeNumber++;
-        ri->tokenType = 0;
-    }
-    return n;
-}
-
-// <charClass> ::= '['<literal>'-'<literal>']'
-static
-NodePtr charClass(RegexInfoPtr ri) {
-    NodePtr n = (NodePtr)malloc(sizeof(Node));
-    if (n == NULL) {
-        mallocFailedMessage();
-    }
-    while (ri->ptr[0] == '-') {
-        ri->ptr++;
-    }
-    return n;
-}
-
-// <literal> ::= [a-z][A-Z][0-9]
-static
-NodePtr literal(RegexInfoPtr ri) {
-    NodePtr n = createNode(ri,ri->ptr[0],0,0);
-    ri->ptr++;
-    return n;
-}
-
-// <group> ::= '('<regex>')'
-static
-NodePtr group(RegexInfoPtr ri) {
-    return regex(ri);
-}
-
-static
-void token(RegexInfoPtr ri) {
-    while (ri->ptr[0] != '\0') {
-        if (ri->ptr[0] == '('){
-            ri->ptr++;
-            ri->tokenType = '(';
-            ri->tokenValue = 0;
-            if (ri->ptr[1] == ')') {
-                ri->ptr++;
-            }
-            return;
-        } else if (ri->ptr[0] == ')') {
-            ri->ptr++;
-            ri->tokenType = ')';
-            ri->tokenValue = ri->ptr[0];
-            return;
-        } else if (ri->ptr[0] == '[') {
-            ri->ptr++;
-            ri->tokenType = '[';
-            ri->tokenValue = ri->ptr[0];
-            if (ri->ptr[1] == ']') {
-                ri->ptr++;
-            }
-            return;
-        } else if (ri->ptr[0] == '|'){
-            ri->ptr++;
-            ri->tokenType = '|';
-            ri->tokenValue = 0;
-            return;
-        } else if (ri->ptr[0] == '*'){
-            ri->ptr++;
-            ri->tokenType = '*';
-            ri->tokenValue = 0;
-            return;
-        } else if (ri->ptr[0] == '\\'){
-            // need more proccesing 
-            /*
-                \277
-                \0xa5
-                \[
-                \\
-                \utf-8 etc...
-            */
-        } else {
-            ri->tokenType = 'a';
-            ri->tokenValue = ri->ptr[0];
-            return;
-        }
-    }
-    ri->tokenType = 0;
-    ri->tokenValue = 0;
-    return;
-}
-
-// <regexAtom> ::= <literal>|<charClass>|<group>
-static
-NodePtr regexAtom(RegexInfoPtr ri) {
-
-    token(ri);
-    NodePtr n = NULL;
-    if (ri->tokenType == 'a') n = literal(ri);
-    else if (ri->tokenType == '[') n = charClass(ri);
-    else if (ri->tokenType == '(') n = group(ri);
-
-    return n;
-}
-
-// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
-NodePtr regex(RegexInfoPtr ri) {
-    NodePtr n = regexAtom(ri);
-    while (ri->ptr[0]) {
-        token(ri);
-        if (ri->tokenType == '*') {
-            n = createNode(ri,'*',n,0);
-        } else if (ri->tokenType == '|') {
-            NodePtr n1 = regex(ri);
-            n = createNode(ri,'|',n,n1);
-        } else if (ri->tokenType == ')') {
-            return n;
-        } else {
-            NodePtr n1 = regex(ri);
-            n = createNode(ri,'+',n,n1);
-        }
-    } return n;
-}
--- a/c/regexParser/error.cc	Mon Nov 23 19:19:43 2015 +0900
+++ b/c/regexParser/error.cc	Tue Nov 24 14:38:26 2015 +0900
@@ -1,7 +1,5 @@
 #include <stdio.h>
 
-void mallocFailedMessage();
-
 void mallocFailedMessage() {
     fprintf(stderr, "Failed to allocate memory.\n");
     exit(-1);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/c/regexParser/error.h	Tue Nov 24 14:38:26 2015 +0900
@@ -0,0 +1,1 @@
+void mallocFailedMessage();
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/c/regexParser/regexParser.cc	Tue Nov 24 14:38:26 2015 +0900
@@ -0,0 +1,158 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "regexParser.h"
+#include "error.h"
+
+typedef struct regexInfo {
+    unsigned char *ptr;
+    unsigned char tokenType;
+    int tokenValue;
+    int nodeNumber;
+} RegexInfo, *RegexInfoPtr;
+
+static NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr);
+static NodePtr charClass(RegexInfoPtr);
+static NodePtr group(RegexInfoPtr);
+static void token(RegexInfoPtr);
+static NodePtr regexAtom(RegexInfoPtr);
+NodePtr regex(RegexInfoPtr);
+
+/**
+ * Create a node of the regex parse tree.
+ *     tokenType
+ *     regexPosition(state)
+ *     stateTransitionTable
+ */
+
+static
+NodePtr createNode(RegexInfoPtr ri,unsigned char character, NodePtr left, NodePtr right) {
+    NodePtr n = (NodePtr)malloc(sizeof(Node));
+    if (n == NULL) {
+        mallocFailedMessage();
+    }
+
+    n->tokenType = ri->tokenType;
+    n->cc->conditionList->character = character;
+    n->left = left;
+    n->right = right;
+
+    if (ri->tokenType == 'a') {
+        n->nodeNumber = ri->nodeNumber;
+        ri->nodeNumber++;
+        ri->tokenType = 0;
+    }
+    return n;
+}
+
+// <charClass> ::= '['<literal>'-'<literal>']'
+static
+NodePtr charClass(RegexInfoPtr ri) {
+    NodePtr n = (NodePtr)malloc(sizeof(Node));
+    if (n == NULL) {
+        mallocFailedMessage();
+    }
+    while (ri->ptr[0] == '-') {
+        ri->ptr++;
+    }
+    return n;
+}
+
+// <literal> ::= [a-zA-Z0-9]
+static
+NodePtr literal(RegexInfoPtr ri) {
+    NodePtr n = createNode(ri,ri->ptr[0],0,0);
+    ri->ptr++;
+    return n;
+}
+
+// <group> ::= '('<regex>')'
+static
+NodePtr group(RegexInfoPtr ri) {
+    return regex(ri);
+}
+
+static
+void token(RegexInfoPtr ri) {
+    while (ri->ptr[0] != '\0') {
+        if (ri->ptr[0] == '('){
+            ri->ptr++;
+            ri->tokenType = '(';
+            ri->tokenValue = 0;
+            if (ri->ptr[1] == ')') {
+                ri->ptr++;
+            }
+            return;
+        } else if (ri->ptr[0] == ')') {
+            ri->ptr++;
+            ri->tokenType = ')';
+            ri->tokenValue = ri->ptr[0];
+            return;
+        } else if (ri->ptr[0] == '[') {
+            ri->ptr++;
+            ri->tokenType = '[';
+            ri->tokenValue = ri->ptr[0];
+            if (ri->ptr[1] == ']') {
+                ri->ptr++;
+            }
+            return;
+        } else if (ri->ptr[0] == '|'){
+            ri->ptr++;
+            ri->tokenType = '|';
+            ri->tokenValue = 0;
+            return;
+        } else if (ri->ptr[0] == '*'){
+            ri->ptr++;
+            ri->tokenType = '*';
+            ri->tokenValue = 0;
+            return;
+        } else if (ri->ptr[0] == '\\'){
+            // need more processing
+            /*
+                \277
+                \0xa5
+                \[
+                \\
+                \utf-8 etc...
+            */
+        } else {
+            ri->tokenType = 'a';
+            ri->tokenValue = ri->ptr[0];
+            return;
+        }
+    }
+    ri->tokenType = 0;
+    ri->tokenValue = 0;
+    return;
+}
+
+// <regexAtom> ::= <literal>|<charClass>|<group>
+static
+NodePtr regexAtom(RegexInfoPtr ri) {
+
+    token(ri);
+    NodePtr n = NULL;
+    if (ri->tokenType == 'a') n = literal(ri);
+    else if (ri->tokenType == '[') n = charClass(ri);
+    else if (ri->tokenType == '(') n = group(ri);
+
+    return n;
+}
+
+// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
+NodePtr regex(RegexInfoPtr ri) {
+    NodePtr n = regexAtom(ri);
+    while (ri->ptr[0]) {
+        token(ri);
+        if (ri->tokenType == '*') {
+            n = createNode(ri,'*',n,0);
+        } else if (ri->tokenType == '|') {
+            NodePtr n1 = regex(ri);
+            n = createNode(ri,'|',n,n1);
+        } else if (ri->tokenType == ')') {
+            return n;
+        } else {
+            NodePtr n1 = regex(ri);
+            n = createNode(ri,'+',n,n1);
+        }
+    } return n;
+}
--- a/c/regexParser/regexParser.h	Mon Nov 23 19:19:43 2015 +0900
+++ b/c/regexParser/regexParser.h	Tue Nov 24 14:38:26 2015 +0900
@@ -1,3 +1,8 @@
+typedef struct word {
+    unsigned char *word;
+    long length;
+} Word, *WordPtr;
+
 typedef struct charClass {
     unsigned char type;
     union condition  {
@@ -9,14 +14,12 @@
         unsigned char character;
         WordPtr w;
     } *conditionList;
-    struct charClass *next;
+    struct charClass *left;
+    struct charClass *right;
+    unsigned long *begin;
+    unsigned long *end;
 } CharClass, *CharClassPtr;
 
-typedef struct word {
-    unsigned char *word;
-    long length;
-} Word, *WordPtr;
-
 typedef struct node {
     unsigned char tokenType;
     unsigned long nodeNumber;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/c/regexParser/word.c	Tue Nov 24 14:38:26 2015 +0900
@@ -0,0 +1,17 @@
+#include <ctype.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+unsigned char* getWord(unsigned char *string) {
+    int wordSize = 0;
+    while (isalnum(string[wordSize])) {
+        wordSize++;
+    }
+
+    int allocateWordSize = wordSize + 1;
+    unsigned char *word = (unsigned char*)malloc(sizeof(unsigned char)*allocateWordSize);
+    strncpy((char*)word, (char*)string, allocateWordSize);
+    word[wordSize] = '\0';
+    return word;
+}