diff regexParser/regexParser.cc @ 167:3bf2c6d6d53e pairPro

move regexparser dir
author masa
date Sat, 19 Dec 2015 15:38:45 +0900
parents c/regexParser/regexParser.cc@1c9e8ba64f6a
children b9e913030a47
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/regexParser/regexParser.cc	Sat Dec 19 15:38:45 2015 +0900
@@ -0,0 +1,281 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include "regexParser.h"
+
+static NodePtr charClass(RegexInfoPtr);
+static void token(RegexInfoPtr);
+static NodePtr regexAtom(RegexInfoPtr);
+NodePtr regex(RegexInfoPtr);
+
+/**
+ * Create a node of regex parse tree.
+ *     tokenType
+ *     regexPosition(state)
+ *     stateTransitionTable
+ */
+
+static
+NodePtr allocateNode() {
+    NodePtr n = NEW(Node);
+    n->cc = NULL;
+    n->left = NULL;
+    n->right = NULL;
+    return n;
+}
+
+static
+NodePtr createNode(RegexInfoPtr ri,unsigned char type,CharClassPtr cc, NodePtr left, NodePtr right) {
+    NodePtr n = allocateNode();
+
+    n->tokenType = type;
+    n->cc = cc;
+    n->left = left;
+    n->right = right;
+    n->nodeNumber = ri->nodeNumber;
+    ri->nodeNumber++;
+
+    return n;
+}
+
+CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) {
+    CharClassPtr cc = NEW(CharClass);
+    cc->type = 'r';
+    cc->cond.range.begin = begin;
+    cc->cond.range.end = end;
+    cc->left = left;
+    cc->right = right;
+    cc->nextState.bitContainer = 0;
+    return cc;
+}
+
+CharClassPtr createCharClassWord(RegexInfoPtr ri) {
+    CharClassPtr cc = NEW(CharClass);
+    cc->type = 'a';
+    cc->cond.w.word = ri->tokenValue;
+    cc->cond.w.length = ri->ptr - ri->tokenValue;
+    cc->nextState.bitContainer = 0;
+    return cc;
+}
+
+/*
+    cond.range.begin  cond.range.end
+           |----------------|
+  1.b---e
+  2.b------e
+  3.b------------e
+  4.b-----------------------e
+  5.b----------------------------e
+
+           |----------------|
+  6.       b---------e
+  7.       b----------------e
+  8.       b---------------------e
+
+           |----------------|
+  9.               b-----e
+  10.              b--------e
+  11.              b-------------e
+
+           |----------------|
+  12.                       b-----e
+
+           |----------------|
+  13.                          b--e
+
+ */
+CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) {
+
+    if (end < cc->cond.range.begin ) { // 1
+        if (cc->left) {
+            cc->left = insertCharClass(cc->left,begin,end);
+        } else {
+            cc->left = createCharClassRange(begin,end,0,0);
+        }
+        return cc;
+    } else if (end == cc->cond.range.begin ) { // 2
+        cc->cond.range.begin = begin;
+        return cc;
+    } else if (end <= cc->cond.range.end) {  // 3,4,6,7,9,10
+        if (begin < cc->cond.range.begin) {  // 3,4
+            cc->cond.range.begin = begin;
+        }
+        return cc;
+    } else if (begin > cc->cond.range.end ) { // 13
+        if (cc->right) {
+            cc->right = insertCharClass(cc->right,begin,end);
+        } else {
+            cc->right = createCharClassRange(begin,end,0,0);
+        }
+        return cc;
+    }
+    if (cc->right) {
+        CharClassPtr right = cc->right;
+        begin = cc->cond.range.begin;
+        free(cc);
+        return insertCharClass(right,begin,end);
+    }
+    if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12
+        if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8
+    } else if (begin < cc->cond.range.begin) { // 5
+        cc->cond.range.begin = begin;
+        cc->cond.range.end = end; 
+    } else {
+        printf("insertCharClass Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end);
+    }
+    return cc;
+}
+
+// <charClass> ::= '['<literal>'-'<literal>']'
+static
+NodePtr charClass(RegexInfoPtr ri) {
+    CharClassPtr cc = NEW(CharClass);
+    NodePtr n = createNode(ri,'c',cc,0,0);
+    cc->type = 'r';
+    cc->nextState.bitContainer = 0;
+    cc->left = NULL;
+    cc->right = NULL;
+    RangeListPtr rangeList = &cc->cond.range;
+    rangeList->begin = *ri->ptr;
+    rangeList->end = *ri->ptr;
+    rangeList->next = NULL;
+
+    for (ri->ptr++; *ri->ptr && *ri->ptr != ']'; ri->ptr++) {
+        if (*ri->ptr == '-') {
+            rangeList->end = *(ri->ptr + 1);
+            ri->ptr++;
+            continue;
+        }
+        if (ri->ptr[0] == 0 || ri->ptr[0] == ']') break;
+        rangeList->next = NEW(RangeList);
+        rangeList = rangeList->next;
+        rangeList->begin = *ri->ptr;
+        rangeList->end = *ri->ptr;
+        rangeList->next = NULL;
+    }
+
+    RangeListPtr r = cc->cond.range.next;
+    cc->cond.range.next = 0;
+    for (; r; r = r->next) {
+        cc = insertCharClass(cc, r->begin, r->end);
+    }
+
+    n->cc = cc;
+    // TODO literal support
+    // merge rangeList here
+    if (*ri->ptr) ri->ptr++;
+    token(ri);
+    return n;
+}
+
+// <literal> ::= [a-z][A-Z][0-9]
+static
+NodePtr literal(RegexInfoPtr ri) {
+    CharClassPtr cc = createCharClassWord(ri);
+    token(ri);
+    NodePtr n = createNode(ri,'a',cc,0,0);
+    return n;
+}
+
+static
+void token(RegexInfoPtr ri) {
+    while (ri->ptr[0] != '\0') {
+        if (ri->ptr[0] == '('){
+            ri->ptr++;
+            ri->tokenType = '(';
+            ri->tokenValue = NULL;
+            return;
+        } else if (ri->ptr[0] == ')') {
+            ri->ptr++;
+            ri->tokenType = ')';
+            ri->tokenValue = ri->ptr;
+            return;
+        } else if (ri->ptr[0] == ']') {
+            ri->ptr++;
+            ri->tokenType = ']';
+            ri->tokenValue = ri->ptr;
+            return;
+        } else if (ri->ptr[0] == '|'){
+            ri->ptr++;
+            ri->tokenType = '|';
+            ri->tokenValue = NULL;
+            return;
+        } else if (ri->ptr[0] == '*'){
+            ri->ptr++;
+            ri->tokenType = '*';
+            ri->tokenValue = NULL;
+            return;
+        } else if (ri->ptr[0] == '\\'){
+            // need more proccesing 
+            /*
+                \277
+                \0xa5
+                \[
+                \\
+                \utf-8 etc...
+            */
+        } else if (ri->ptr[0] == '[') {
+            ri->ptr++;
+            ri->tokenType = 'c';
+            ri->tokenValue = ri->ptr;
+            return;
+        } else {
+            ri->tokenType = 'a';
+            ri->tokenValue = ri->ptr;
+            if (isalnum(ri->ptr[0])) {
+                ri->ptr++;
+            }
+            return;
+        }
+    }
+    ri->tokenType = 0;
+    ri->tokenValue = NULL;
+    return;
+}
+
+// <regexAtom> ::= <literal>|<charClass>|<group>
+static
+NodePtr regexAtom(RegexInfoPtr ri) {
+
+    NodePtr n = NULL;
+    if (ri->tokenType == 'c') n = charClass(ri);
+    else if (ri->tokenType == 'a') n = literal(ri);
+    else if (ri->tokenType == '(') {
+        n = regex(ri);
+        if (ri->tokenType != ')') {
+            // error
+        }
+        token(ri);
+    }
+    if (ri->tokenType == '*') {
+        n = createNode(ri,'*',0,n,0);
+        token(ri);
+    }
+
+    return n;
+}
+
+// <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex>
+NodePtr regex(RegexInfoPtr ri) {
+    token(ri);
+    NodePtr n = regexAtom(ri);
+    while (ri->tokenType) {
+        if (ri->tokenType == '*') {
+            n = createNode(ri,'*',0,n,0);
+            token(ri);
+            return n;
+        } else if (ri->tokenType == '|') {
+            n = createNode(ri,'|',0,n,0);
+            NodePtr n1 = regex(ri);
+            n->right = n1;
+        } else if (ri->tokenType == ')') {
+            return n;
+        } else {
+            n = createNode(ri,'+',0,n,0);
+            NodePtr n1 = regexAtom(ri);
+            n->right = n1;
+        }
+    }
+    return n;
+}