changeset 178:5e8c6857934c pairPro

implement charClassMerge
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Wed, 23 Dec 2015 19:17:36 +0900
parents 8de9a33f6ae5
children 6cf8252f3912
files regexParser/main.cc regexParser/regexParser.cc regexParser/regexParser.h regexParser/subsetConstraction.cc regexParser/transition.h
diffstat 5 files changed, 77 insertions(+), 59 deletions(-) [+]
line wrap: on
line diff
--- a/regexParser/main.cc	Wed Dec 23 17:28:59 2015 +0900
+++ b/regexParser/main.cc	Wed Dec 23 19:17:36 2015 +0900
@@ -8,17 +8,23 @@
 
 int main(int argc, char **argv)
 {
-    RegexInfoPtr ri = (RegexInfoPtr)malloc(sizeof(RegexInfo));
-    ri->nodeNumber = 1;
-
+    RegexInfo ri;
+    ri.stateNumber = 1;
+    ri.asterisk = false;
+    ri.current = NEW(State);
+    ri.current->bitState.bitContainer = 0
+    bitSet(ri.current->bitState,ri.stateNumber);
+    ri.current->next = NULL;
+    ri.current->cc = NULL;
+    ri.current->node = NULL;
+    ri.states = ri.current;
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i],"-regex") == 0) {
-            ri->ptr = (unsigned char*)argv[i+1]; i++;
+            ri.ptr = (unsigned char*)argv[i+1]; i++;
         }
     }
-
-    printf("regex : %s\n",ri->ptr);
-    NodePtr n = regex(ri);
+    printf("regex : %s\n",ri.ptr);
+    NodePtr n = regex(&ri);
     printTree(n);
     return 0;
 }
--- a/regexParser/regexParser.cc	Wed Dec 23 17:28:59 2015 +0900
+++ b/regexParser/regexParser.cc	Wed Dec 23 19:17:36 2015 +0900
@@ -43,23 +43,13 @@
     return n;
 }
 
-CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) {
-    CharClassPtr cc = NEW(CharClass);
-    cc->type = 'r';
-    cc->cond.range.begin = begin;
-    cc->cond.range.end = end;
-    cc->left = left;
-    cc->right = right;
-    cc->nextState.bitContainer = 0;
-    return cc;
-}
-
 CharClassPtr createCharClassWord(RegexInfoPtr ri) {
     CharClassPtr cc = NEW(CharClass);
     cc->type = 'a';
     cc->cond.w.word = ri->tokenValue;
     cc->cond.w.length = ri->ptr - ri->tokenValue;
-    cc->nextState.bitContainer = 0;
+    cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
+    cc->nextState = ri->current->bitSet;
     return cc;
 }
 
@@ -89,13 +79,15 @@
   13.                          b--e
 
  */
-CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) {
-
+CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) {
+    if (cc == NULL) {
+        createCharClassRange(begin,end,nextState.bitContainer,0,0);
+    }
     if (end < cc->cond.range.begin ) { // 1
         if (cc->left) {
-            cc->left = insertCharClass(cc->left,begin,end);
+            cc->left = insertCharClass(cc->left,begin,end,nextState);
         } else {
-            cc->left = createCharClassRange(begin,end,0,0);
+            cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0);
         }
         return cc;
     } else if (end == cc->cond.range.begin ) { // 2
@@ -108,9 +100,9 @@
         return cc;
     } else if (begin > cc->cond.range.end ) { // 13
         if (cc->right) {
-            cc->right = insertCharClass(cc->right,begin,end);
+            cc->right = insertCharClass(cc->right,begin,end,nextState);
         } else {
-            cc->right = createCharClassRange(begin,end,0,0);
+            cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0);
         }
         return cc;
     }
@@ -118,7 +110,7 @@
         CharClassPtr right = cc->right;
         begin = cc->cond.range.begin;
         free(cc);
-        return insertCharClass(right,begin,end);
+        return insertCharClass(right,begin,end,nextState);
     }
     if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12
         if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8
@@ -162,10 +154,10 @@
     RangeListPtr r = cc->cond.range.next;
     cc->cond.range.next = 0;
     for (; r; r = r->next) {
-        cc = insertCharClass(cc, r->begin, r->end);
+        cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState);
     }
 
-    n->cc = cc;
+    n->cc = ri->current->cc = mergeTransition(ri->current,cc);
     // TODO literal support
     // merge rangeList here
     if (*ri->ptr) ri->ptr++;
@@ -255,11 +247,26 @@
     if (ri->tokenType == '*') {
         n = createNode(ri,'*',0,n,0);
         token(ri);
+        ri->asterisk = true;
+        ri->node = n;
     }
-
     return n;
 }
 
+RegexInfoPtr createRegexInfo (RegexInfoPtr ri) {
+    ri->stateNumber++;
+    ri->asterisk = false;
+    ri->current = NEW(State);
+    ri->current->bitState.bitContainer = 0
+    bitSet(ri.current->bitState,ri->stateNumber);
+    ri->current->next = ri->states;
+    ri->current->cc = NULL;
+    ri->current->node = NULL;
+    ri->states = ri->current;
+    return ri;
+}
+
+
 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex>
 NodePtr regex(RegexInfoPtr ri) {
     token(ri);
@@ -268,17 +275,28 @@
         if (ri->tokenType == '*') {
             n = createNode(ri,'*',0,n,0);
             token(ri);
+            ri->asterisk = true;
+            ri->node = n;
             return n;
         } else if (ri->tokenType == '|') {
             n = createNode(ri,'|',0,n,0);
             NodePtr n1 = regex(ri);
             n->right = n1;
+            n->node = n;
         } else if (ri->tokenType == ')') {
             return n;
         } else {
             n = createNode(ri,'+',0,n,0);
-            NodePtr n1 = regexAtom(ri);
-            n->right = n1;
+            if (!ri->asterisk) {
+                StatePtr save = ri->current;
+                ri = createRegexInfo(ri);
+                NodePtr n1 = regexAtom(ri);
+                n->right = n1;
+                ri->current = save;
+            } else {
+                NodePtr n1 = regexAtom(ri);
+                n->right = n1;
+            }
         }
     }
     return n;
--- a/regexParser/regexParser.h	Wed Dec 23 17:28:59 2015 +0900
+++ b/regexParser/regexParser.h	Wed Dec 23 19:17:36 2015 +0900
@@ -18,7 +18,7 @@
     struct utf8Range *next; // only used in the parser.
 } RangeList , *RangeListPtr;
 
-typedef union condition {
+typedef struct condition {
     RangeList range;
     Word w;
 } Condition, *ConditionList;
@@ -43,7 +43,10 @@
     unsigned char *ptr;
     unsigned char tokenType;
     unsigned char *tokenValue;
-    int nodeNumber;
+    int stateNumber;
+    bool asterisk;
+    StatePtr current;
+    StatePtr states;
 } RegexInfo, *RegexInfoPtr;
 
 CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right);
--- a/regexParser/subsetConstraction.cc	Wed Dec 23 17:28:59 2015 +0900
+++ b/regexParser/subsetConstraction.cc	Wed Dec 23 19:17:36 2015 +0900
@@ -159,9 +159,12 @@
     return current;
 }
 
-TransitionPtr mergeTransition(TransitionPtr x,TransitionPtr y) {
-    CharClassWalkerPtr walk = createCharClassWalker(x->condition);
-    CharClassPtr ccy = y->condition;
+CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) {
+    if (x->cc == NULL) {
+        return y;
+    }
+    CharClassWalkerPtr walk = createCharClassWalker(x->cc);
+    CharClassPtr ccy = y;
     BitVector bi;
     for (CharClassPtr cc = getNext(walk); hasNext(walk); cc=getNext(walk)) {
         unsigned long begin = cc->cond.range.begin;
@@ -169,41 +172,34 @@
         bi = cc->nextState;
         ccy = charClassMerge(ccy,begin,end,bi);
     }
-    TransitionPtr z = createTransition(ccy,&bi);
     free(walk);
-    return z;
+    return ccy;
 }
 
-TGValue generateTransition(NodePtr n,TransitionGenerator tg) {
+TGValue generateTransition(NodePtr n,TGValue tg) {
     TGValue tgv2;
     if (n->tokenType == '+') {
         TGValue tgv = generateTransition(n->right,tg);
-        TGValue tgv1 = generateTransition(n->left,tg);
         if (tgv.asterisk) {
-            tgv1.ts = mergeTransition(tgv.ts,tgv1.ts);
+            TGValue tgv1 = generateTransition(n->left,tgv);
             return tgv1;
         }
+        TGValue tgLeft = createTransitionGenerator(tgv);
+        TGValue tgv1 = generateTransition(n->left,tgLeft);
         return tgv;
     } else if (n->tokenType == '|') {
         TGValue tgv  = generateTransition(n->left,tg);
-        TGValue tgv1 = generateTransition(n->right,tg);
+        TGValue tgv1 = generateTransition(n->right,tgv);
         tgv.tg = tgv1.tg;
-        tgv.ts = mergeTransition(tgv.ts,tgv1.ts);
         tgv.asterisk |= tgv1.asterisk;
         return tgv;
     } else if (n->tokenType == '*') {
         TGValue tgv = generateTransition(n->left,tg);
         tgv.asterisk = true;
         return tgv;
-    } else if (n->tokenType == 'c'){
-        tgv2.ts = createTransition(n->cc,0);
-        return tgv2;
-    } else if (n->tokenType == 'a'){
-        TGValue tgv;
-        tgv.ts = NEW(Transition);
-        tgv.ts->condition = n->cc;
-        bitSet(&tgv.ts->condition->nextState,n->nodeNumber);
-        tg.ts = appendTransition(tg.ts,tgv.ts);
+    } else if (n->tokenType == 'c' || n->tokenType == 'a'){
+        TGValue tgv = tg;
+        tgv.ts = mergeTransition(tgv.ts,n->cc);
         return tgv;
     } else {
         // error
--- a/regexParser/transition.h	Wed Dec 23 17:28:59 2015 +0900
+++ b/regexParser/transition.h	Wed Dec 23 19:17:36 2015 +0900
@@ -1,14 +1,9 @@
 #include "bitVector.h"
 
-typedef struct transition {
-    CharClassPtr condition;
-    struct transition *next;
-} Transition, *TransitionPtr;
-
 typedef struct state {
     BitVector bitState;
-    TransitionPtr transition;
-    NodePtr nextNode;
+    CharClassPtr cc;
+    NodePtr node;
     struct state *next;
 } State, *StatePtr;