view c/regexParser/regexParser.cc @ 124:c363a66dc1a7 pairPro

fix
author masa
date Tue, 01 Dec 2015 17:06:26 +0900
parents 188d866227a4
children b061cd8205cc
line wrap: on
line source

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "regexParser.h"
#include "error.h"

/*
 * Forward declarations for the parser routines defined below.
 * All of them are file-local (static) except regex(), which is declared
 * without static. literal() has no prototype here; it is defined before
 * its first use.
 */
static NodePtr allocateNode();
static NodePtr createNode(RegexInfoPtr,unsigned char*,NodePtr,NodePtr);
static NodePtr charClass(RegexInfoPtr);
static NodePtr group(RegexInfoPtr);
static void token(RegexInfoPtr);
static NodePtr regexAtom(RegexInfoPtr);
NodePtr regex(RegexInfoPtr);

/**
 * Create a node of regex parse tree.
 *     tokenType
 *     regexPosition(state)
 *     stateTransitionTable
 */

static
NodePtr allocateNode() {
    NodePtr n = (NodePtr)malloc(sizeof(node));
    n->cc = (CharClassPtr)malloc(sizeof(CharClass));
    n->cc->cond = (ConditionList)malloc(sizeof(Condition));
    return n;
}

/**
 * Build a parse-tree node for the current token and link its children.
 *
 * ri        parser state; supplies tokenType, tokenValue and nodeNumber.
 *           nodeNumber is incremented, and a tokenType of 'a' is reset to 0
 *           once the node has been filled in.
 * character one-character buffer stored as the node's word when the token
 *           is not a literal ('a'); the pointer is stored, not copied.
 * left      left child (may be NULL).
 * right     right child (may be NULL).
 *
 * For a literal token the word comes from getWord(ri->tokenValue) and
 * `character` is ignored.
 *
 * Returns the new node. On allocation failure mallocFailedMessage() is
 * called and, should that call return, NULL is returned with ri untouched.
 */
static
NodePtr createNode(RegexInfoPtr ri,unsigned char *character, NodePtr left, NodePtr right) {
    NodePtr n = allocateNode();
    if (n == NULL) {
        mallocFailedMessage();
        return NULL;
    }

    // Allocate the Word for non-literal tokens up front, so a failure is
    // reported before any parser state has been modified.
    WordPtr w = NULL;
    if (ri->tokenType != 'a') {
        w = (WordPtr)malloc(sizeof(Word));
        if (w == NULL) {
            mallocFailedMessage();
            free(n->cc->cond);
            free(n->cc);
            free(n);
            return NULL;
        }
    }

    n->tokenType = ri->tokenType;
    n->left = left;
    n->right = right;
    n->nodeNumber = ri->nodeNumber;
    ri->nodeNumber++;

    if (ri->tokenType == 'a') {
        // Clear the literal marker so the next node built from this state
        // is not treated as a literal again.
        ri->tokenType = 0;
        n->cc->cond->w = getWord(ri->tokenValue);
    } else {
        w->word = character;
        w->length = 1;
        n->cc->cond->w = w;
    }
    return n;
}

// <charClass> ::= '['<literal>'-'<literal>']'
/**
 * Parse the body of a character class. On entry ri->ptr points just past
 * the opening '[' (token() has already consumed it).
 *
 * The text up to, but not including, the closing ']' is copied into a
 * NUL-terminated word stored in n->cc->cond->w. If the body contains
 * "x-y", n->cc->begin / n->cc->end are set to x and y; when several ranges
 * are present the last one wins. A '-' that is the first or last character
 * of the body is not treated as a range operator.
 *
 * On return ri->ptr points past the closing ']'. If the class is not
 * terminated, ri->ptr is left on the string's terminating NUL instead of
 * running past it.
 *
 * Returns the new node. On allocation failure mallocFailedMessage() is
 * called and, should that call return, NULL is returned with ri untouched.
 */
static
NodePtr charClass(RegexInfoPtr ri) {
    NodePtr n = allocateNode();
    if (n == NULL) {
        mallocFailedMessage();
        return NULL;
    }

    // Measure the class body; also stop at NUL so a missing ']' cannot
    // make the scan run off the end of the pattern.
    int len = 0;
    while (ri->ptr[len] != ']' && ri->ptr[len] != '\0') {
        len++;
    }

    WordPtr w = (WordPtr)malloc(sizeof(Word));
    unsigned char *text = (unsigned char*)malloc((size_t)len + 1);
    if (w == NULL || text == NULL) {
        mallocFailedMessage();
        free(text);
        free(w);
        free(n->cc->cond);
        free(n->cc);
        free(n);
        return NULL;
    }

    n->tokenType = ri->tokenType;
    n->left = NULL;
    n->right = NULL;
    n->nodeNumber = ri->nodeNumber;
    ri->nodeNumber++;

    // No range seen yet. NOTE(review): 0/0 is used as the "no range"
    // value; confirm this matches what consumers of begin/end expect.
    n->cc->begin = 0;
    n->cc->end = 0;

    // Only a '-' with a character on both sides inside the body is a range.
    for (int i = 1; i + 1 < len; i++) {
        if (ri->ptr[i] == '-') {
            n->cc->begin = ri->ptr[i-1];
            n->cc->end = ri->ptr[i+1];
        }
    }
            // TODO literal support

    memcpy(text, ri->ptr, (size_t)len);
    text[len] = '\0';
    w->word = text;
    w->length = len;
    n->cc->cond->w = w;

    ri->ptr += len;
    if (ri->ptr[0] == ']') {
        ri->ptr++;      // consume the closing bracket
    }

    return n;
}

// <literal> ::= [a-z][A-Z][0-9]
/**
 * Build a childless node for the current token.
 * ri->ptr is handed to createNode() as the character buffer; when the
 * current token type is 'a', createNode() takes the word from
 * ri->tokenValue instead and does not use that buffer.
 */
static
NodePtr literal(RegexInfoPtr ri) {
    return createNode(ri, ri->ptr, NULL, NULL);
}

// <group> ::= '('<regex>')'
/**
 * Parse the inside of a group by delegating to regex().
 * This function consumes no parentheses itself; regex() returns as soon
 * as it reads a ')' token.
 */
static
NodePtr group(RegexInfoPtr ri) {
    NodePtr inner = regex(ri);
    return inner;
}

/**
 * Read the next token from ri->ptr and record it in ri.
 *
 * tokenType is set to:
 *     '('  '|'  '*'   the operator itself (tokenValue = NULL)
 *     ')'             close paren (tokenValue = position after it)
 *     'c'             '[' was read; tokenValue and ri->ptr point at the
 *                     first character of the class body
 *     'a'             literal; tokenValue points at its first character
 *
 * A literal is either a run of alphanumeric characters or exactly one
 * other character. Every call on a non-empty input consumes at least one
 * character, so callers looping on ri->ptr[0] always make progress.
 *
 * At end of input nothing is changed: tokenType and tokenValue keep their
 * previous values.
 */
static
void token(RegexInfoPtr ri) {
    if (ri->ptr[0] == '\0') {
        return;
    }

    switch (ri->ptr[0]) {
    case '(':
        ri->ptr++;
        ri->tokenType = '(';
        ri->tokenValue = NULL;
        return;
    case ')':
        ri->ptr++;
        ri->tokenType = ')';
        ri->tokenValue = ri->ptr;
        return;
    case '[':
        ri->ptr++;
        ri->tokenType = 'c';
        ri->tokenValue = ri->ptr;
        return;
    case '|':
        ri->ptr++;
        ri->tokenType = '|';
        ri->tokenValue = NULL;
        return;
    case '*':
        ri->ptr++;
        ri->tokenType = '*';
        ri->tokenValue = NULL;
        return;
    case '\\':
        // need more processing
        /*
            \277
            \0xa5
            \[
            \\
            \utf-8 etc...
        */
        // Until the forms above are implemented, "\x" is a one-character
        // literal x. A backslash at the very end of the pattern stands
        // for itself.
        // NOTE(review): getWord() receives a pointer to the escaped
        // character; confirm it copes with non-alphanumeric input.
        if (ri->ptr[1] != '\0') {
            ri->ptr++;
        }
        ri->tokenType = 'a';
        ri->tokenValue = ri->ptr;
        ri->ptr++;
        return;
    default:
        ri->tokenType = 'a';
        ri->tokenValue = ri->ptr;
        if (isalnum(ri->ptr[0])) {
            while (isalnum(ri->ptr[0])) {
                ri->ptr++;
            }
        } else {
            // Consume a single non-alphanumeric character so the caller
            // cannot see the same position again.
            ri->ptr++;
        }
        return;
    }
}

// <regexAtom> ::= <literal>|<charClass>
/**
 * Read one token and, if it opens a character class ('c'), parse the class.
 * Returns the class node, or NULL for every other token type.
 * NOTE(review): the grammar above lists <literal>, but literal tokens
 * currently produce NULL here.
 */
static
NodePtr regexAtom(RegexInfoPtr ri) {
    token(ri);

    if (ri->tokenType != 'c') {
        return NULL;
    }
    return charClass(ri);
}

/**
 * Create an operator node whose word is the single character `op`.
 * The character is stored in a freshly allocated, NUL-terminated buffer.
 * On allocation failure mallocFailedMessage() is called and, should that
 * call return, NULL is returned.
 */
static
NodePtr operatorNode(RegexInfoPtr ri, unsigned char op, NodePtr left, NodePtr right) {
    unsigned char *syntax = (unsigned char*)malloc(2 * sizeof(unsigned char));
    if (syntax == NULL) {
        mallocFailedMessage();
        return NULL;
    }
    syntax[0] = op;
    syntax[1] = '\0';

    NodePtr n = createNode(ri,syntax,left,right);
    if (n == NULL) {
        free(syntax);
    }
    return n;
}

// <regex> ::= <regexAtom> | <regexAtom><regex>'*' | <regexAtom>'*' | <regexAtom>'|'<regex> | <regexAtom><regex> | '(' regex ')'
/**
 * Parse the pattern at ri->ptr into a tree of operator nodes.
 *
 * Tokens are read until the input is exhausted or a ')' is seen:
 *     'a'    a literal node is appended with a '+' (concatenation) node
 *     '*'    the right child of the current tree is wrapped in a '*' node
 *            and re-attached under a new '+' node
 *     '|'    the rest of the input is parsed recursively and joined to
 *            the current tree with a '|' node
 *     ')'    the tree built so far is returned
 *     other  (including '(') the following input is parsed recursively
 *            and appended with a '+' node
 *
 * A '*' with nothing before it is ignored rather than dereferencing an
 * empty tree.
 *
 * NOTE(review): 'c' tokens take the "other" path; charClass() is not
 * called from here. A ')' is consumed by the innermost recursive call, so
 * enclosing calls never see it. Both behaviours are kept as they were.
 *
 * Returns the root of the tree, or NULL for empty input.
 */
NodePtr regex(RegexInfoPtr ri) {
    NodePtr n = NULL;
    while (ri->ptr[0]) {
        token(ri);
        if (ri->tokenType == '*') {
            // TODO literal support
            if (n == NULL) {
                continue;   // nothing to repeat yet
            }
            NodePtr star = operatorNode(ri,'*',n->right,NULL);
            n = operatorNode(ri,'+',n->left,star);
        } else if (ri->tokenType == '|') {
            NodePtr alternative = regex(ri);
            n = operatorNode(ri,'|',n,alternative);
        } else if (ri->tokenType == ')') {
            return n;
        } else {
            // Literals become leaf nodes. Every other token, including
            // '(', is handled by parsing the following input recursively.
            // (Original author's note on the fallback case: "return NULL".)
            NodePtr operand = (ri->tokenType == 'a') ? literal(ri) : regex(ri);
            n = operatorNode(ri,'+',n,operand);
        }
    }
    return n;
}