view c/regexParser/main.cc @ 71:58d2b10988c9

move ptr++ in token()
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Tue, 25 Aug 2015 20:56:26 +0900
parents 87dff3a124ab
children c4b934048e2a
line wrap: on
line source

/*
 * <literal> ::= [a-z][A-Z][0-9]
 * <charClass> ::= '['<literal>'-'<literal>']'
 * <group> ::= '('<regex>')'
 * <regexAtom> ::= <literal>|<charClass>|<group>
 * <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct charClass {
    unsigned char table[256];
    struct utf8Range {
        unsigned char *begin;
        unsigned char *end;
        struct utf8Range *next;
    } *rangeList;
} CharClass, *CharClassPtr;

typedef struct node {
    unsigned char type;
    union value {
        charClass *cc;
        unsigned char character;
    } Value, *ValuePtr;
    struct node *self;
    struct node *parent;
    struct node *left;
    struct node *right;
} Node, *NodePtr;

unsigned char *ptr;
unsigned char tokenType;
int tokenValue;
NodePtr regexHeadNode;

NodePtr charClass();
NodePtr group();
NodePtr orexp();
NodePtr asterisk();
NodePtr regex();
NodePtr createNode(unsigned char,NodePtr,NodePtr);
extern void token();
extern NodePtr regexAtom();


bool isLiteral(char c) {
    if (*ptr > 0x7f) return true;
    else if (*ptr == '(') return false;
    else if (*ptr == '[') return false;
    else if (*ptr == '|') return false;
    else if (*ptr == '*') return false;
    return true;
}

void printNodeDate(NodePtr n) {
    puts("---------------------");
    printf("Self  Node char : %c\n", n->Value.character);
    printf("Self  Node addr : %p\n", n->self);
    printf("left  Node addr : %p\n", n->left);
    printf("right Node addr : %p\n", n->right);
    puts("---------------------");
    puts("");
}

NodePtr createNode(unsigned char character, NodePtr left, NodePtr right) {
    NodePtr n = (NodePtr)malloc(sizeof(Node));
    n->self = n;
    n->Value.character = character;
    n->left = left;
    n->right = right;

    printNodeDate(n);
    return n;
}

// <charClass> ::= '['<literal>'-'<literal>']'
NodePtr charClass() {
    NodePtr n = (NodePtr)malloc(sizeof(Node));
    unsigned char startChar = *ptr;
    while (*ptr == '-') {
        ptr++;
    }
    unsigned char endChar = *ptr;
    unsigned char *charTable = (unsigned char*)malloc(sizeof(char)*256);

    return n;
}

// <literal> ::= [a-z][A-Z][0-9]
NodePtr literal() {
    NodePtr n = createNode(*ptr,0,0);
    ptr++;
    return n;
}

// <group> ::= '('<regex>')'
NodePtr group() {
    NodePtr n = regex();
    if (*ptr == ')') {
        n = createNode('(',n,0);
    } else {
        // ) reqiured
    }
    return n;
}



void token() {
    while (*ptr != '\0') {
        if ((*ptr == '(') || (*ptr == ')')) {
            ptr++;
            tokenType = '(';
            tokenValue = 0;
            if (ptr[1] == ')') {
                ptr++;
            }
            return;
        } else if (*ptr == '[') {
            ptr++;
            tokenType = '[';
            tokenValue = *ptr;
            if (ptr[1] == ']') {
                ptr++;
            }
            return;
        } else if (*ptr == '|'){
            ptr++;
            tokenType = '|';
            tokenValue = 0;
            return;
        } else if (*ptr == '*'){
            ptr++;
            tokenType = '*';
            tokenValue = 0;
            return;
        }

        tokenType = 'a';
        tokenValue = *ptr;
        return;

        if (*ptr == '\\') ptr++; // need more proccesing 
        /*
            \277
            \0xa5
            \[
            \\
            \utf-8 etc...
        */

    }

    tokenType = 0;
    tokenValue = 0;
    return;
}

// <regexAtom> ::= <literal>|<charClass>|<group>
NodePtr regexAtom() {

    token();
    NodePtr n = NULL;
    if (tokenType == 'a') n = literal();
    else if (tokenType == '[') n = charClass();
    else if (tokenType == '(') n = group();

    return n;
}

// <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
NodePtr regex() {
    NodePtr n = regexAtom();
    while (*ptr) {
        token();
        if (tokenType == '*') {
            n = createNode('*',n,0);
        } else if (tokenType == '|') {
            NodePtr n1 = regex();
            n = createNode('|',n,n1);
        } else {
            NodePtr n1 = regex();
            n = createNode('+',n,n1);
        }
    } return n;
}
/*
 * e.g.
 *
 * % ./regexParser -regex abc
 *     c
 *   +
 *     b
 * +
 *   a
 *
 * % ./regexParser -regex (a*|bc)d
 *
 *     d
 * +
 *       c
 *     +
 *       b
 *   |
 *     *
 *       a
 */

void descendTree(NodePtr n) {
    static int d = 0;
    if (n->right != NULL) {
        d++;
        descendTree(n->right);
        d--;
    }
    printf("%*c%c\n",d*4, ' ',n->Value.character);
    if (n->left != NULL) {
        d++;
        descendTree(n->left);
        d--;
    }
}

void printTree(NodePtr n) {
    descendTree(n);
}


int main(int argc, char **argv)
{
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i],"-regex") == 0) {
            ptr = (unsigned char*)argv[i+1]; i++;
        }
    }

    printf("regex : %s\n",ptr);
    NodePtr n = regex();
    printTree(n);
    return 0;
}