comparison c/regexParser/regexParser.cc @ 115:ca30f8334741 pairPro

rename createRegexParser.cc to regexParser.cc
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Tue, 24 Nov 2015 14:38:26 +0900
parents c/regexParser/createRegexParser.cc@ec485345daf9
children 66c633575b53
comparison
equal deleted inserted replaced
114:c82e7a7ef8d9 115:ca30f8334741
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include "regexParser.h"
4 #include "error.h"
5
/* Parser state handed to every parsing routine in this file. */
struct regexInfo {
    unsigned char *ptr;       /* current read position in the pattern text */
    unsigned char tokenType;  /* kind of the most recent token; 0 when there is none */
    int tokenValue;           /* character recorded with the most recent token, or 0 */
    int nodeNumber;           /* number given to the next literal node created */
};
typedef struct regexInfo RegexInfo;
typedef struct regexInfo *RegexInfoPtr;
12
13 static NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr);
14 static NodePtr charClass(RegexInfoPtr);
15 static NodePtr group(RegexInfoPtr);
16 static void token(RegexInfoPtr);
17 static NodePtr regexAtom(RegexInfoPtr);
18 NodePtr regex(RegexInfoPtr);
19
20 /**
21 * Create a node of regex parse tree.
22 * tokenType
23 * regexPosition(state)
24 * stateTransitionTable
25 */
26
27 static
28 NodePtr createNode(RegexInfoPtr ri,unsigned char character, NodePtr left, NodePtr right) {
29 NodePtr n = (NodePtr)malloc(sizeof(Node));
30 if (n == NULL) {
31 mallocFailedMessage();
32 }
33
34 n->tokenType = ri->tokenType;
35 n->cc->conditionList->character = character;
36 n->left = left;
37 n->right = right;
38
39 if (ri->tokenType == 'a') {
40 n->nodeNumber = ri->nodeNumber;
41 ri->nodeNumber++;
42 ri->tokenType = 0;
43 }
44 return n;
45 }
46
47 // <charClass> ::= '['<literal>'-'<literal>']'
48 static
49 NodePtr charClass(RegexInfoPtr ri) {
50 NodePtr n = (NodePtr)malloc(sizeof(Node));
51 if (n == NULL) {
52 mallocFailedMessage();
53 }
54 while (ri->ptr[0] == '-') {
55 ri->ptr++;
56 }
57 return n;
58 }
59
60 // <literal> ::= [a-z][A-Z][0-9]
61 static
62 NodePtr literal(RegexInfoPtr ri) {
63 NodePtr n = createNode(ri,ri->ptr[0],0,0);
64 ri->ptr++;
65 return n;
66 }
67
68 // <group> ::= '('<regex>')'
69 static
70 NodePtr group(RegexInfoPtr ri) {
71 return regex(ri);
72 }
73
74 static
75 void token(RegexInfoPtr ri) {
76 while (ri->ptr[0] != '\0') {
77 if (ri->ptr[0] == '('){
78 ri->ptr++;
79 ri->tokenType = '(';
80 ri->tokenValue = 0;
81 if (ri->ptr[1] == ')') {
82 ri->ptr++;
83 }
84 return;
85 } else if (ri->ptr[0] == ')') {
86 ri->ptr++;
87 ri->tokenType = ')';
88 ri->tokenValue = ri->ptr[0];
89 return;
90 } else if (ri->ptr[0] == '[') {
91 ri->ptr++;
92 ri->tokenType = '[';
93 ri->tokenValue = ri->ptr[0];
94 if (ri->ptr[1] == ']') {
95 ri->ptr++;
96 }
97 return;
98 } else if (ri->ptr[0] == '|'){
99 ri->ptr++;
100 ri->tokenType = '|';
101 ri->tokenValue = 0;
102 return;
103 } else if (ri->ptr[0] == '*'){
104 ri->ptr++;
105 ri->tokenType = '*';
106 ri->tokenValue = 0;
107 return;
108 } else if (ri->ptr[0] == '\\'){
109 // need more proccesing
110 /*
111 \277
112 \0xa5
113 \[
114 \\
115 \utf-8 etc...
116 */
117 } else {
118 ri->tokenType = 'a';
119 ri->tokenValue = ri->ptr[0];
120 return;
121 }
122 }
123 ri->tokenType = 0;
124 ri->tokenValue = 0;
125 return;
126 }
127
128 // <regexAtom> ::= <literal>|<charClass>|<group>
129 static
130 NodePtr regexAtom(RegexInfoPtr ri) {
131
132 token(ri);
133 NodePtr n = NULL;
134 if (ri->tokenType == 'a') n = literal(ri);
135 else if (ri->tokenType == '[') n = charClass(ri);
136 else if (ri->tokenType == '(') n = group(ri);
137
138 return n;
139 }
140
141 // <regex> ::= <regexAtom>|<regexAtom>'*'|<regexAtom>'|'<regex>|<regexAtom><regex>
142 NodePtr regex(RegexInfoPtr ri) {
143 NodePtr n = regexAtom(ri);
144 while (ri->ptr[0]) {
145 token(ri);
146 if (ri->tokenType == '*') {
147 n = createNode(ri,'*',n,0);
148 } else if (ri->tokenType == '|') {
149 NodePtr n1 = regex(ri);
150 n = createNode(ri,'|',n,n1);
151 } else if (ri->tokenType == ')') {
152 return n;
153 } else {
154 NodePtr n1 = regex(ri);
155 n = createNode(ri,'+',n,n1);
156 }
157 } return n;
158 }