diff regexParser/regexParser.cc @ 178:5e8c6857934c pairPro

implement charClassMerge
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Wed, 23 Dec 2015 19:17:36 +0900
parents 3be0fbcd4b52
children 6cf8252f3912
line wrap: on
line diff
--- a/regexParser/regexParser.cc	Wed Dec 23 17:28:59 2015 +0900
+++ b/regexParser/regexParser.cc	Wed Dec 23 19:17:36 2015 +0900
@@ -43,23 +43,13 @@
     return n;
 }
 
-CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) {
-    CharClassPtr cc = NEW(CharClass);
-    cc->type = 'r';
-    cc->cond.range.begin = begin;
-    cc->cond.range.end = end;
-    cc->left = left;
-    cc->right = right;
-    cc->nextState.bitContainer = 0;
-    return cc;
-}
-
 CharClassPtr createCharClassWord(RegexInfoPtr ri) {
     CharClassPtr cc = NEW(CharClass);
     cc->type = 'a';
     cc->cond.w.word = ri->tokenValue;
     cc->cond.w.length = ri->ptr - ri->tokenValue;
-    cc->nextState.bitContainer = 0;
+    cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
+    cc->nextState = ri->current->bitSet;
     return cc;
 }
 
@@ -89,13 +79,15 @@
   13.                          b--e
 
  */
-CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) {
-
+CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) {
+    if (cc == NULL) {
+        createCharClassRange(begin,end,nextState.bitContainer,0,0);
+    }
     if (end < cc->cond.range.begin ) { // 1
         if (cc->left) {
-            cc->left = insertCharClass(cc->left,begin,end);
+            cc->left = insertCharClass(cc->left,begin,end,nextState);
         } else {
-            cc->left = createCharClassRange(begin,end,0,0);
+            cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0);
         }
         return cc;
     } else if (end == cc->cond.range.begin ) { // 2
@@ -108,9 +100,9 @@
         return cc;
     } else if (begin > cc->cond.range.end ) { // 13
         if (cc->right) {
-            cc->right = insertCharClass(cc->right,begin,end);
+            cc->right = insertCharClass(cc->right,begin,end,nextState);
         } else {
-            cc->right = createCharClassRange(begin,end,0,0);
+            cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0);
         }
         return cc;
     }
@@ -118,7 +110,7 @@
         CharClassPtr right = cc->right;
         begin = cc->cond.range.begin;
         free(cc);
-        return insertCharClass(right,begin,end);
+        return insertCharClass(right,begin,end,nextState);
     }
     if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12
         if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8
@@ -162,10 +154,10 @@
     RangeListPtr r = cc->cond.range.next;
     cc->cond.range.next = 0;
     for (; r; r = r->next) {
-        cc = insertCharClass(cc, r->begin, r->end);
+        cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState);
     }
 
-    n->cc = cc;
+    n->cc = ri->current->cc = mergeTransition(ri->current,cc);
     // TODO literal support
     // merge rangeList here
     if (*ri->ptr) ri->ptr++;
@@ -255,11 +247,26 @@
     if (ri->tokenType == '*') {
         n = createNode(ri,'*',0,n,0);
         token(ri);
+        ri->asterisk = true;
+        ri->node = n;
     }
-
     return n;
 }
 
+RegexInfoPtr createRegexInfo (RegexInfoPtr ri) {
+    ri->stateNumber++;
+    ri->asterisk = false;
+    ri->current = NEW(State);
+    ri->current->bitState.bitContainer = 0
+    bitSet(ri.current->bitState,ri->stateNumber);
+    ri->current->next = ri->states;
+    ri->current->cc = NULL;
+    ri->current->node = NULL;
+    ri->states = ri->current;
+    return ri;
+}
+
+
 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex>
 NodePtr regex(RegexInfoPtr ri) {
     token(ri);
@@ -268,17 +275,28 @@
         if (ri->tokenType == '*') {
             n = createNode(ri,'*',0,n,0);
             token(ri);
+            ri->asterisk = true;
+            ri->node = n;
             return n;
         } else if (ri->tokenType == '|') {
             n = createNode(ri,'|',0,n,0);
             NodePtr n1 = regex(ri);
             n->right = n1;
+            n->node = n;
         } else if (ri->tokenType == ')') {
             return n;
         } else {
             n = createNode(ri,'+',0,n,0);
-            NodePtr n1 = regexAtom(ri);
-            n->right = n1;
+            if (!ri->asterisk) {
+                StatePtr save = ri->current;
+                ri = createRegexInfo(ri);
+                NodePtr n1 = regexAtom(ri);
+                n->right = n1;
+                ri->current = save;
+            } else {
+                NodePtr n1 = regexAtom(ri);
+                n->right = n1;
+            }
         }
     }
     return n;