view regexParser/CharClass.cc @ 310:df27e6cab846

CharClassMerge with Word ( no match implementation )
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 08 Feb 2016 19:23:37 +0900
parents 058c87665213
children 66012db6a717
line wrap: on
line source

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#include "regexParser.h"
#include "subsetConstruction.h"
#include "node.h"
#include "BitVector.h"
#include "error.h"

#include "CharClass.h"

CharClassPtr createCharClassRange(unsigned long begin, unsigned long end,unsigned long state, CharClassPtr left, CharClassPtr right) {
    CharClassPtr cc = NEW(CharClass);
    cc->cond.range.begin = begin;
    cc->cond.range.end = end;
    cc->cond.w.word = NULL;
    cc->cond.w.length = 0;
    cc->cond.w.next = NULL;
    cc->left = left;
    cc->right = right;
    cc->nextState.bitContainer = state;
    return cc;
}

CharClassPtr createCharClassWord(RegexInfoPtr ri) {
    CharClassPtr cc = NEW(CharClass);
    cc->left = NULL;
    cc->right = NULL;
    cc->cond.w.word = ri->tokenValue;
    cc->cond.w.length = ri->ptr - ri->tokenValue;
    cc->cond.w.next = NULL;
    cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
    return cc;
}

CharClassPtr createCharClassRangeWord(unsigned long begin, unsigned long end,CharClassPtr cc,CharClassPtr m, CharClassPtr left, CharClassPtr right) {
    // same range and seme nextState?
    CharClassPtr ncc = NEW(CharClass);
    ncc->cond.range.begin = begin;
    ncc->cond.range.end = end;
    ncc->cond.w = cc->cond.w;
    ncc->left = left;
    ncc->right = right;
    if (!m) {
        ncc->nextState.bitContainer = cc->nextState.bitContainer;
        return ncc;
    }
    if (m->cond.w.word) {
        WordPtr w = &m->cond.w;
        WordPtr *next = &ncc->cond.w.next;
        while(w->next) {   // insert sort?
            WordPtr n = NEW(Word);
            n->word = w->word;
            n->length = w->length;
            *next = n;
            next = &n->next;
            w = w->next;
        }
        *next = &m->cond.w;
    } 
    ncc->nextState.bitContainer = m->nextState.bitContainer | cc->nextState.bitContainer;
    return ncc;
}


/*
    cond.range.begin  cond.range.end
           |----------------|
  1.b---e
  2.b------e
  3.b------------e
  4.b-----------------------e
  5.b----------------------------e

           |----------------|
  6.       b---------e
  7.       b----------------e
  8.       b---------------------e

           |----------------|
  9.               b-----e
  10.              b--------e
  11.              b-------------e

           |----------------|
  12.                       b-----e

           |----------------|
  13.                          b--e

 */
CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) {
    if (begin>end) {
        unsigned long tmp = begin; begin = end; end = tmp;
    }
    if (cc == NULL) {
        return createCharClassRange(begin,end,0,0,0);
    }
    if (end < cc->cond.range.begin ) { // 1
        if (cc->left) {
            cc->left = insertCharClass(cc->left,begin,end);
        } else {
            cc->left = createCharClassRange(begin,end,0,0,0);
        }
        return cc;
    } else if (end == cc->cond.range.begin ) { // 2
        cc->cond.range.begin = begin;
        return cc;
    } else if (end <= cc->cond.range.end) {  // 3,4,6,7,9,10
        if (begin < cc->cond.range.begin) {  // 3,4
            cc->cond.range.begin = begin;
        }
        return cc;
    } else if (begin > cc->cond.range.end ) { // 13
        if (cc->right) {
            cc->right = insertCharClass(cc->right,begin,end);
        } else {
            cc->right = createCharClassRange(begin,end,0,0,0);
        }
        return cc;
    }
    if (cc->right) {
        CharClassPtr right = cc->right;
        begin = cc->cond.range.begin;
        free(cc);
        return insertCharClass(right,begin,end);
    }
    if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12
        if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8
    } else if (begin < cc->cond.range.begin) { // 5
        cc->cond.range.begin = begin;
        cc->cond.range.end = end; 
    } else {
        printf("insertCharClass Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end);
    }
    return cc;
}



CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, CharClassPtr m) ;

CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,CharClassPtr m) {
    CharClassPtr cc1;
    if (cc) {
        cc1 = charClassMerge(cc,mBegin,mEnd,m);
    } else {
        cc1 = createCharClassRangeWord(mBegin,mEnd,m,NULL,NULL,NULL);
    }
    return cc1;
}

/*
    cond.range.begin  cond.range.end
           |----------------|
  1.b---e
  2.b------e
  3.b------------e
  4.b-----------------------e
  5.b----------------------------e

           |----------------|
  6.       b---------e
  7.       b----------------e
  8.       b---------------------e

           |----------------|
  9.               b-----e
  10.              b--------e
  11.              b-------------e

           |----------------|
  12.                       b-----e

           |----------------|
  13.                          b--e

 */

CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, CharClassPtr m) {
    // 重なっているccの領域を分割する
    // 必要ならばnextStateを重ねあわせる
    // 変更があった場合は新しくリストを作って返す
    if (end < cc->cond.range.begin ) { // 1
        if (cc->left) {
            return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,m,NULL,charClassMerge(cc->left,begin,end,m),cc->right);
        } else {
            return createCharClassRangeWord(begin,end,m,NULL,NULL,cc);
        }
    } else if (end == cc->cond.range.begin && begin != end ) { // 2
        CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,m);
        if (cc->cond.range.begin == cc->cond.range.end) {
            return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,
                cc,m, cc1,cc->right);
        }
        CharClassPtr cc3 = createCharClassRangeWord(cc->cond.range.begin+1,cc->cond.range.end,cc,NULL,cc->left,cc->right);
        return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.begin,
            cc,m,cc1,cc3);
    } else if (end < cc->cond.range.end) { // range.begin < end
        if (begin < cc->cond.range.begin) {  // 3
            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,m);
            CharClassPtr cc3 = createCharClassRangeWord(end+1,cc->cond.range.end,cc,NULL,cc->left,cc->right);
            return createCharClassRangeWord(cc->cond.range.begin,end,m,cc,cc1,cc3);
        }
        if (begin == cc->cond.range.begin) {  // 6
            CharClassPtr cc2 = createCharClassRangeWord(end+1,cc->cond.range.end,cc,NULL,NULL,cc->right);
            return createCharClassRangeWord(begin,end,cc,m,cc->left,cc2);
        }
        // 9
        CharClassPtr cc2 = createCharClassRangeWord(cc->cond.range.begin,begin-1,cc,NULL,cc->left,NULL);
        CharClassPtr cc3 = createCharClassRangeWord(end+1,cc->cond.range.end,cc,NULL,NULL,cc->right);
        return createCharClassRangeWord(begin,end,cc,m,cc2,cc3);
    } else if (end == cc->cond.range.end) {
        if (begin == cc->cond.range.begin) { // 7
            return createCharClassRangeWord(begin,end,cc,m,cc->left,cc->right);
        } else if (begin < cc->cond.range.begin) { // 4
            CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,m);
            return createCharClassRangeWord(cc->cond.range.begin,end,cc,m,cc1,cc->right);
        }
        // 10 cond.range.begin < begin
        CharClassPtr cc2 = createCharClassRangeWord(begin,cc->cond.range.end,cc,m,NULL,cc->right);
        return createCharClassRangeWord(cc->cond.range.begin,begin-1,cc,NULL,cc->left,cc2);
    }
    if (begin > cc->cond.range.end ) { // 13
        if (cc->right) {
            return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,cc,NULL,cc->left,charClassMerge(cc->right,begin,end,m));
        } else {
            return createCharClassRangeWord(begin,end,m,NULL,cc,NULL);
        }
    }
    if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) {
        if (end > cc->cond.range.end) { // cond.range.end < end
            if (begin == cc->cond.range.begin) {    // 8
                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,m);
                return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,
                    cc,m,cc->left,cc1);
            }
            if (begin > cc->cond.range.begin) {  // 11,12
                CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,m);
                CharClassPtr cc3 = createCharClassRangeWord(cc->cond.range.begin,begin-1,cc,m,cc->left,NULL);
                return createCharClassRangeWord(begin,cc->cond.range.end,cc,m,cc3,cc1);
            }
        }
    } else if (begin < cc->cond.range.begin) { // 5
        CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,m);
        CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,m);
        return createCharClassRangeWord(cc->cond.range.begin,cc->cond.range.end,cc,m,cc1,cc3);
    } else {
        printf("charClassMerge Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end);
    }
    return cc;
}

void findLeftMost(CharClassPtr next,CharClassWalkerPtr walk) {
    while (next->left) {
        CharClassStackPtr ccs = NEW(CharClassStack);
        ccs->next = walk->stack;
        ccs->turn = walk->turn;
        walk->turn = LEFT;
        ccs->cc = next;
        walk->stack = ccs;
        next = next->left;
    }
    walk->turn = SELF;
    walk->next = next;
}

CharClassWalkerPtr createCharClassWalker (CharClassPtr next) {
    CharClassWalkerPtr walk = NEW(CharClassWalker);
    walk->next = NULL;
    walk->stack = NULL;
    walk->turn = LEFT;
    if (!next) return walk;
    findLeftMost(next,walk);
    return walk;
}

bool hasNext(CharClassWalkerPtr walk) {
    return walk->next != NULL;
}

CharClassStackPtr charClassStackPop(CharClassWalkerPtr walk) {
    CharClassStackPtr prev = walk->stack->next;
    walk->turn = walk->stack->turn;
    free(walk->stack);
    walk->stack = prev;
    return prev;
}

CharClassPtr getNext(CharClassWalkerPtr walk) {
    CharClassPtr current = walk->next;
    walk->next = NULL;
    if (walk->turn == SELF) {
        if (current->right) {
            walk->turn = RIGHT;
            findLeftMost(current->right,walk);
            return current;
        }
    }
    while (walk->stack) {
        walk->next = walk->stack->cc;
        charClassStackPop(walk);
        if (walk->turn == LEFT) {
            walk->turn = SELF;
            return current;
        }
    }
    return current;
}

void setState(CharClassPtr cc, BitVector bi) {
    // if (word case) setNext(bitVector to the word list)
    cc->nextState = bi;
    if (cc->left) {
        setState(cc->left,bi);
    }
    if (cc->right) {
        setState(cc->right,bi);
    }
}

CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) {
    if (x->cc == NULL) {
        return y;
    }
    CharClassWalkerPtr walk = createCharClassWalker(x->cc);
    CharClassPtr ccy = y;
    while (hasNext(walk)) {
        CharClassPtr cc = getNext(walk);
        unsigned long begin = cc->cond.range.begin;
        unsigned long end = cc->cond.range.end;
        ccy = charClassMerge(ccy,begin,end,cc);
    }
    free(walk);
    return ccy;
}

/* end */