view regexParser/subsetConstruction.cc @ 321:a1b65d39b947

bmSearch fix
author mir3636
date Mon, 16 May 2016 17:03:17 +0900
parents 1188debbef10
children
line wrap: on
line source

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#include "regexParser.h"
#include "subsetConstruction.h"
#include "node.h"
#include "bitVector.h"
#include "CharClass.h"
#include "error.h"

/**
    作成する state を linked list
    bitvector を index とした配列に BitVectorPtr を格納
    state に対応する NodePtr を
 */
StatePtr createState(TransitionGeneratorPtr tg,BitVector bi) {
    StatePtr s = NEW(State);
    s->stateNum = tg->totalStateCount++;
    s->next = NULL;
    tg->stateEnd->next = s;
    tg->stateEnd = s;
    s->bitState = bi;
    s->cc = NULL;
    s->node = NULL;
    s->tState = NULL;
    return s;
}

StatePtr createState(TGValue tgv,NodePtr n) {
    BitVector bi = createBitVector(tgv.tg->totalStateCount);
    StatePtr s = createState(tgv.tg,bi);
    n->stateNum = s->stateNum;
    s->node = n;
    s->bitState = bi;
    s->accept = false;
    s->cc = n->cc;
    return s;
}

/**
    pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る
        前が * でない + は新しく状態を作る
        * があったら、次の状態はその時の先頭の状態か次の状態が終了状態ならば終了状態との組み合わせになる
        常に先頭の状態を返す
        最後が*ならば、それを持ち歩く
    pass 2: 
        割り当てられた状態に沿って charclass の行き先を書き換える
        書き換えた charclass を merge する
        前の部分に * がない + は新しい状態をつくるので、state を切り替えて生成する
 */
TGValue generateTransition(NodePtr n,TGValue tgv, int pass) {
    if (n->tokenType == '+') {
        TGValue tgvLeft = tgv;
        tgvLeft.endState = n->right->state;
        tgvLeft.asterisk = NULL;
        tgvLeft = generateTransition(n->left,tgvLeft,pass);
        TGValue tgvRight = tgv;
        if (tgvLeft.asterisk) {
            n->right->state = tgv.endState;
            tgvRight.startState = tgvLeft.asterisk;
            tgvRight = generateTransition(n->right,tgvRight,pass);
            tgvLeft.asterisk = tgvRight.asterisk;
            return tgvLeft;
        }
        tgvRight.asterisk = NULL;
        if (pass==1) {
            n->right->state = tgvRight.startState = createState(tgvRight,n->right);
        } else {
            tgvRight.startState = n->right->state;
            tgvRight.tg->stateArray[tgvRight.startState->bitState.bitContainer] = tgvRight.startState ;
        }
        tgvRight = generateTransition(n->right,tgvRight,pass);
        if (tgv.endState && tgvRight.asterisk) tgvRight.startState->accept = tgv.endState->accept;
        tgvLeft.asterisk = tgvRight.asterisk;
        return tgvLeft;
    } else if (n->tokenType == '|') {
        TGValue tgv1  = generateTransition(n->left,tgv,pass);
        tgv1.endState = tgv.endState;
        TGValue tgv2 = generateTransition(n->right,tgv1,pass);
        return tgv2;
    } else if (n->tokenType == '*') {
        TGValue tgvAstah = tgv;
        tgvAstah.endState = tgvAstah.startState;
        if (pass==2) tgvAstah.endState->accept = tgv.endState->accept;
        tgvAstah = generateTransition(n->left,tgvAstah,pass);
        tgvAstah.asterisk = tgvAstah.startState;
        return tgvAstah;
    } else if (n->tokenType == 'c' || n->tokenType == 'a'){
        TGValue tgv1 = tgv;
        if (pass==1) {
            n->stateNum = tgv.startState->stateNum;
            n->state = tgv.startState;
        } else {
            int nextState = tgv.endState->stateNum;
            n->nextStateNum = nextState;
            n->nextState = tgv.endState;
            BitVector bi = createBitVector(nextState);
            if (n->nextState->accept) bi = bitSet(bi,1);
            setState(n->cc,bi);
            tgv1.startState->cc = mergeTransition(tgv1.startState,n->cc);
        }
        return tgv1;
    } else {
        return tgv;
    }
}

TransitionGeneratorPtr createTransitionGenerator() {
    TransitionGeneratorPtr tg = NEW(TransitionGenerator);
    tg->totalStateCount = 0;
    tg->stack = NULL;
    tg->stateArray = NULL;
    tg->stateList = NULL;
    return tg;
}

TGValue createTGValue() {
    TGValue tgv;
    tgv.startState = NULL;
    tgv.endState = NULL;
    tgv.asterisk = NULL;
    tgv.tg = createTransitionGenerator();
    return tgv;
}

TGValue  generateTransitionList(NodePtr n) {
    TGValue tgv = createTGValue();
    TransitionGeneratorPtr tg = tgv.tg;
    State dummy;
    tg->stateEnd = &dummy;
    StatePtr startState = tgv.startState = createState(tgv,n);
    tg->stateList = startState;
    NodePtr eof = createNode(NULL,'e',NULL,NULL,NULL);
    StatePtr endState = tgv.endState = createState(tgv,eof);
    endState->accept = true;
    tgv.startState = startState;
    tgv.endState = endState;
    tgv = generateTransition(n,tgv,1);
    printTree(n);
    if (tg->totalStateCount > BITBLOCK) {
        errorMassege("StateMax > BITBLOCK",__LINE__,__FILE__);
    }
    BitVector bi = createBitVector(tg->totalStateCount);
    tg->stateArray = (StatePtr*)calloc(bi.bitContainer*2,sizeof(StatePtr*));
    tg->stateArray[startState->bitState.bitContainer] = startState;
    tg->stateArray[endState->bitState.bitContainer] = endState;
    tg->totalBasicState = tg->totalStateCount;
    tgv.startState = startState;
    tgv.endState = endState;
    tgv = generateTransition(n,tgv,2);
    return tgv;
}

void createAnyState(TransitionGeneratorPtr tg) {
    BitVector anyBi = createBitVector(tg->totalBasicState);
    anyBi.bitContainer = anyBi.bitContainer - 1; // all bit 1 state
    anyBi.bitContainer ^= 2; // exclude Accept State
    tg->anyState = createState(tg,anyBi);
    determinize(tg->anyState,tg);
    tg->stateArray[tg->anyState->bitState.bitContainer] = tg->anyState;
}

void printState(StatePtr state) {
    printf("state : %lx%c\n",state->bitState.bitContainer,state->accept?'*':' ');
    long nodeNumber = 0;
    if (state->node) {
        BitVector bi = createBitVector(state->node->nextStateNum);
        printf("node : %c %lx -> %lx\n",state->node->tokenType,state->bitState.bitContainer,bi.bitContainer);
        if (state->node->state)
            nodeNumber = state->node->state->bitState.bitContainer;
    }
    if (state->cc) {
        printCharacterClass(state->cc,nodeNumber,4);
    }
}

void printState(TransitionGeneratorPtr tg) {
    StatePtr state = tg->stateList;
    for (;state;state = state->next) {
        printState(state);
        putchar('\n');
    }
}

/**
    現在のステートに含まれる組み合わせ状態をとってくる
    組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する
    生成した状態は stateArray に格納するA
    新しい状態ができなくなったら終了
    
    charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる
 */
void determinize(StatePtr s, TransitionGeneratorPtr tg) {
    BitVector bi = s->bitState;
    for (;;) {
        int bitPosition = searchBit(bi);
        if (!bitPosition) break;
        unsigned long baseNum = 1 << (bitPosition-1);
        // printf("bit %lx pos %d baseNum %lx\n",bi.bitContainer,bitPosition,baseNum);
        bi.bitContainer ^= baseNum; 
        if (baseNum==2) {
            s->accept = true;
            continue;   // EOF case
        }
        StatePtr base = tg->stateArray[baseNum];
        if (base == NULL) {
            fprintf(stderr,"baseNum=%lx ",baseNum);
            errorMassege("No base state",__LINE__,__FILE__); break;
        }
        CharClassPtr merge = mergeTransition(s,base->cc);
        s->cc = merge;
    }
}

void subsetConstruction(TransitionGeneratorPtr tg) {
    for (StatePtr st = tg->stateList;st;st = st->next) {
        CharClassWalkerPtr cw = createCharClassWalker(st->cc);
        while (hasNext(cw)) {
            CharClassPtr cc = getNext(cw);
            BitVector bi = cc->nextState;
            if (tg->stateArray[bi.bitContainer]) continue;  // already done
            StatePtr s = createState(tg,bi);  // s is added at the end of stateList.
            determinize(s,tg);
            tg->stateArray[bi.bitContainer] = s;
        }
        free(cw);
    }
}