view regexParser/subsetConstruction.cc @ 228:399380ad95b7

fix generateTransitionGenerator
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Fri, 15 Jan 2016 19:48:53 +0900
parents 8be58af605da
children 5d66672e5029
line wrap: on
line source

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#include "regexParser.h"
#include "subsetConstruction.h"
#include "node.h"
#include "BitVector.h"
#include "error.h"

#define SIZE 256

CharClassPtr createCharClassRange(unsigned long begin, unsigned long end,unsigned long state, CharClassPtr left, CharClassPtr right) {
    CharClassPtr cc = NEW(CharClass);
    cc->type = 'r';
    cc->cond.range.begin = begin;
    cc->cond.range.end = end;
    cc->cond.range.next = NULL;
    cc->left = left;
    cc->right = right;
    cc->nextState.bitContainer = state;
    return cc;
}

CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,BitVector nextState) {
    CharClassPtr cc1;
    if (cc) {
        cc1 = charClassMerge(cc,mBegin,mEnd,nextState);
    } else {
        cc1 = createCharClassRange(mBegin,mEnd,nextState.bitContainer,NULL,NULL);
    }
    return cc1;
}

CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState) {
    // 重なっているccの領域を分割する
    // 必要ならばnextStateを重ねあわせる
    // 変更があった場合は新しくリストを作って返す
    if (end < cc->cond.range.begin ) { // 1
        if (cc->left) {
            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,charClassMerge(cc->left,begin,end,nextState),cc->right);
        } else {
            return createCharClassRange(begin,end,nextState.bitContainer,NULL,cc);
        }
    } else if (end == cc->cond.range.begin && begin != end ) { // 2
        CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,nextState);
        if (cc->cond.range.begin == cc->cond.range.end) {
            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,
                cc->nextState.bitContainer | nextState.bitContainer, cc1,cc->right);
        }
        CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right);
        return createCharClassRange(cc->cond.range.begin,cc->cond.range.begin,
            cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3);
    } else if (end < cc->cond.range.end) { // range.begin < end
        if (begin < cc->cond.range.begin) {  // 3
            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState);
            CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right);
            return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3);
        }
        if (begin == cc->cond.range.begin) {  // 6
            CharClassPtr cc2 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right);
            return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc2);
        }
        // 9
        CharClassPtr cc2 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL);
        CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right);
        return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc2,cc3);
    } else if (end == cc->cond.range.end) {
        if (begin == cc->cond.range.begin) { // 7
            return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc->right);
        } else if (begin < cc->cond.range.begin) { // 4
            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState);
            return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc->right);
        }
        // 10 cond.range.begin < begin
        CharClassPtr cc2 = createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,NULL,cc->right);
        return createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,cc2);
    }
    if (begin > cc->cond.range.end ) { // 13
        if (cc->right) {
            return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState));
        } else {
            return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL);
        }
    }
    if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) {
        if (end > cc->cond.range.end) { // cond.range.end < end
            if (begin == cc->cond.range.begin) {    // 8
                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState);
                return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,
                    cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc1);
            }
            if (begin > cc->cond.range.begin) {  // 11,12
                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState);
                CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL);
                return createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc3,cc1);
            }
        }
    } else if (begin < cc->cond.range.begin) { // 5
        CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState);
        CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState);
        return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3);
    } else {
        printf("charClassMerge Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end);
    }
    return cc;
}

void findLeftMost(CharClassPtr next,CharClassWalkerPtr walk) {
    while (next->left) {
        CharClassStackPtr ccs = NEW(CharClassStack);
        ccs->next = walk->stack;
        ccs->turn = walk->turn;
        walk->turn = LEFT;
        ccs->cc = next;
        walk->stack = ccs;
        next = next->left;
    }
    walk->turn = SELF;
    walk->next = next;
}

CharClassWalkerPtr createCharClassWalker (CharClassPtr next) {
    CharClassWalkerPtr walk = NEW(CharClassWalker);
    walk->next = NULL;
    walk->stack = NULL;
    if (!next) return walk;
    findLeftMost(next,walk);
    return walk;
}

bool hasNext(CharClassWalkerPtr walk) {
    return walk->next != NULL;
}

CharClassStackPtr charClassStackPop(CharClassWalkerPtr walk) {
    CharClassStackPtr prev = walk->stack->next;
    walk->turn = walk->stack->turn;
    free(walk->stack);
    walk->stack = prev;
    return prev;
}

CharClassPtr getNext(CharClassWalkerPtr walk) {
    CharClassPtr current = walk->next;
    walk->next = NULL;
    if (walk->turn == SELF) {
        if (current->right) {
            walk->turn = RIGHT;
            findLeftMost(current->right,walk);
            return current;
        }
    }
    while (walk->stack) {
        walk->next = walk->stack->cc;
        charClassStackPop(walk);
        if (walk->turn == LEFT) {
            walk->turn = SELF;
            return current;
        }
    } 
    return current;
}

void setState(CharClassPtr cc, BitVector bi) {
    cc->nextState = bi;
    if (cc->left) {
        setState(cc->left,bi);
    }
    cc->nextState = bi;
    if (cc->right) {
        setState(cc->right,bi);
    }
}

CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) {
    if (x->cc == NULL) {
        return y;
    }
    CharClassWalkerPtr walk = createCharClassWalker(x->cc);
    CharClassPtr ccy = y;
    BitVector bi;
    while (hasNext(walk)) {
        CharClassPtr cc = getNext(walk);
        unsigned long begin = cc->cond.range.begin;
        unsigned long end = cc->cond.range.end;
        bi = cc->nextState;
        ccy = charClassMerge(ccy,begin,end,bi);
    }
    free(walk);
    return ccy;
}

/**
    作成する state を linked list
    bitvector を index とした配列に BitVectorPtr を格納
    state に対応する NodePtr を
 */
StatePtr createState(TGValue tgv,NodePtr n) {
    StatePtr s = NEW(State);
    s->stateNum = n->stateNum = tgv.tg->totalStateCount;
    s->next = tgv.tg->stateList;
    tgv.tg->stateList = s;
    s->node = n;
    BitVector bi = createBitVector(n->stateNum);
    s->bitState = bi;
    tgv.tg->totalStateCount++;
    s->cc = n->cc;
    return s;
}

/**
    pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る
        前が * でない + は新しく状態を作る
        * があったら、次の状態はその時の先頭の状態か次の状態が終了状態ならば終了状態との組み合わせになる
        常に先頭の状態を返す
        最後が*ならば、それを持ち歩く
    pass 2: 
        割り当てられた状態に沿って charclass の行き先を書き換える
        書き換えた charclass を merge する
        前の部分に * がない + は新しい状態をつくるので、state を切り替えて生成する
 */
TGValue generateTransition(NodePtr n,TGValue tgv, int pass) {
    if (n->tokenType == '+') {
        TGValue tgvLeft = tgv;
        tgvLeft.endState = n->right->state;
        tgvLeft.asterisk = NULL;
        tgvLeft = generateTransition(n->left,tgvLeft,pass);
        TGValue tgvRight = tgv;
        if (tgvLeft.asterisk) {
            n->right->state = tgv.endState;
            tgvRight.startState = tgvLeft.asterisk;
            tgvRight = generateTransition(n->right,tgvRight,pass);
            tgvLeft.asterisk = tgvRight.asterisk;
            return tgvLeft;
        }
        tgvRight.asterisk = NULL;
        if (pass==1) {
            n->right->state = tgvRight.startState = createState(tgvRight,n->right);
        } else {
            tgvRight.startState = n->right->state;
            tgvRight.tg->stateArray[tgvRight.startState->bitState.bitContainer] = tgvRight.startState ;
        }
        tgvRight = generateTransition(n->right,tgvRight,pass);
        tgvLeft.asterisk = tgvRight.asterisk;
        return tgvLeft;
    } else if (n->tokenType == '|') {
        TGValue tgv1  = generateTransition(n->left,tgv,pass);
        TGValue tgv2 = generateTransition(n->right,tgv1,pass);
        return tgv2;
    } else if (n->tokenType == '*') {
        TGValue tgvAstah = tgv;
        if (tgvAstah.endState->bitState.bitContainer & 2) {
            tgvAstah.endState = tgvAstah.startState;
            tgvAstah.endState->bitState = bitSet(tgvAstah.endState->bitState,1);
        } else {
            tgvAstah.endState = tgvAstah.startState;
        }
        tgvAstah = generateTransition(n->left,tgvAstah,pass);
        tgvAstah.asterisk = tgvAstah.endState;
        return tgvAstah;
    } else if (n->tokenType == 'c' || n->tokenType == 'a'){
        TGValue tgv1 = tgv;
        if (pass==1) {
            n->stateNum = tgv.startState->stateNum;
            n->state = tgv.startState;
        } else {
            int nextState = tgv.endState->stateNum;
            n->nextStateNum = nextState;
            n->nextState = tgv.endState;
            BitVector bi = createBitVector(nextState);
            setState(n->cc,bi);
            tgv1.startState->cc = mergeTransition(tgv1.startState,n->cc);
        }
        return tgv1;
    } else {
        return tgv;
    }
}

TransitionGeneratorPtr createTransitionGenerator() {
    TransitionGeneratorPtr tg = NEW(TransitionGenerator);
    tg->totalStateCount = 0;
    tg->stack = NULL;
    tg->stateArray = NULL;
    tg->stateList = NULL;
    return tg;
}

TGValue createTGValue() {
    TGValue tgv;
    tgv.startState = NULL;
    tgv.endState = NULL;
    tgv.asterisk = NULL;
    tgv.tg = createTransitionGenerator();
    return tgv;
}

TGValue  generateTransitionList(NodePtr n) {
    TGValue tgv = createTGValue();
    StatePtr startState = tgv.startState = createState(tgv,n);
    NodePtr eof = createNode(NULL,'e',NULL,NULL,NULL);
    StatePtr endState = tgv.endState = createState(tgv,eof);
    tgv.startState = startState;
    tgv.endState = endState;
    tgv = generateTransition(n,tgv,1);
    printTree(n);
    if (tgv.tg->totalStateCount > BITBLOCK) {
        errorMassege("StateMax > BITBLOCK",__LINE__,__FILE__);
    }
    BitVector bi = createBitVector(tgv.tg->totalStateCount);
    tgv.tg->stateArray = (StatePtr*)calloc(bi.bitContainer*2,sizeof(StatePtr*));
    tgv.tg->stateArray[startState->bitState.bitContainer] = startState;
    tgv.tg->stateArray[endState->bitState.bitContainer] = endState;
    tgv.startState = startState;
    tgv.endState = endState;
    tgv = generateTransition(n,tgv,2);
    return tgv;
}

void printState(StatePtr state) {
    printf("state : %lx\n",state->bitState.bitContainer);
    long nodeNumber = 0;
    if (state->node) {
        printf("node : %c %lx -> %d\n",state->node->tokenType,state->bitState.bitContainer,state->node->nextStateNum);
        if (state->node->state)
            nodeNumber = state->node->state->bitState.bitContainer;
    }
    if (state->cc) {
        printCharacterClass(state->cc,nodeNumber,4);
    }
}

void printState(TransitionGeneratorPtr tg) {
    StatePtr state = tg->stateList;
    for (;state;state = state->next) {
        printState(state);
        putchar('\n');
    }
}

SCValue createSCValue(TGValue tgv) {
    SCValue scv;
    scv.tg = tgv.tg;
    scv.stateTop = tgv.tg->stateList;
    scv.stateEnd = scv.stateTop;
    while (scv.stateEnd->next) {
        scv.stateEnd = scv.stateEnd->next;
    }
    return scv;
}

SCValue createState(SCValue scv,BitVector bi) {
    StatePtr s = NEW(State);
    s->stateNum = ++scv.tg->totalStateCount;
    s->next = NULL;
    scv.stateEnd->next = s;
    scv.stateEnd = s;
    s->bitState = bi;
    s->cc = NULL;
    return scv;
}

/**
    現在のステートに含まれる組み合わせ状態をとってくる
    組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する
    生成した状態は stateArray に格納するA
    新しい状態ができなくなったら終了
    
    charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる
 */
SCValue subsetConstruction(SCValue scv) {
    for (;scv.stateTop;scv.stateTop = scv.stateTop->next) {
        CharClassWalkerPtr cw = createCharClassWalker(scv.stateTop->cc);
        while (hasNext(cw)) {
            CharClassPtr cc = getNext(cw);
            BitVector bi = cc->nextState;
            if (scv.tg->stateArray[bi.bitContainer]) continue;  // already done
            scv = createState(scv,bi);
            StatePtr s = scv.stateEnd;
            scv.tg->stateArray[bi.bitContainer] = s;
            for (;;) {
                int bitPosition = searchBit(bi);
                if (!bitPosition) break;
                unsigned long baseNum = 1 << (bitPosition-1);
                // printf("bit %lx pos %d baseNum %lx\n",bi.bitContainer,bitPosition,baseNum);
                bi.bitContainer ^= baseNum; 
                if (baseNum==2) continue;   // EOF case
                StatePtr base = scv.tg->stateArray[baseNum];
                if (base == NULL) {
                    errorMassege("No base state",__LINE__,__FILE__); break;
                }
                CharClassPtr merge = mergeTransition(s,base->cc);
                s->cc = merge;
            }
        }
    }
    return scv;
}