Mercurial > hg > Applications > Grep
comparison c/regexParser/regexParser.cc @ 134:dbafc753078e pairPro
fix concatenation & selection & grouping
author | masa |
---|---|
date | Fri, 04 Dec 2015 17:45:09 +0900 |
parents | ccc673449351 |
children | e1a262ec75f0 |
comparison
equal
deleted
inserted
replaced
133:ccc673449351 | 134:dbafc753078e |
---|---|
3 #include <string.h> | 3 #include <string.h> |
4 #include <ctype.h> | 4 #include <ctype.h> |
5 #include "regexParser.h" | 5 #include "regexParser.h" |
6 #include "error.h" | 6 #include "error.h" |
7 | 7 |
8 static NodePtr allocateNode(); | |
9 static NodePtr createNode(RegexInfoPtr,unsigned char,NodePtr,NodePtr); | |
10 static NodePtr charClass(RegexInfoPtr); | 8 static NodePtr charClass(RegexInfoPtr); |
11 static NodePtr group(RegexInfoPtr); | |
12 static void token(RegexInfoPtr); | 9 static void token(RegexInfoPtr); |
13 static NodePtr regexAtom(RegexInfoPtr); | 10 static NodePtr regexAtom(RegexInfoPtr); |
14 NodePtr regex(RegexInfoPtr); | 11 NodePtr regex(RegexInfoPtr); |
15 | 12 |
16 /** | 13 /** |
35 cc->type = 'a'; | 32 cc->type = 'a'; |
36 cc->cond = NEW(Condition); | 33 cc->cond = NEW(Condition); |
37 cc->cond->w = NEW(Word); | 34 cc->cond->w = NEW(Word); |
38 cc->cond->w->word = ri->tokenValue; | 35 cc->cond->w->word = ri->tokenValue; |
39 cc->cond->w->length = ri->ptr - ri->tokenValue; | 36 cc->cond->w->length = ri->ptr - ri->tokenValue; |
37 token(ri); | |
40 | 38 |
41 return cc; | 39 return cc; |
42 } | 40 } |
43 | 41 |
44 static | 42 static |
45 NodePtr createNode(RegexInfoPtr ri,unsigned char type, NodePtr left, NodePtr right) { | 43 NodePtr createNode(RegexInfoPtr ri,unsigned char type,CharClassPtr cc, NodePtr left, NodePtr right) { |
46 NodePtr n = allocateNode(); | 44 NodePtr n = allocateNode(); |
47 | 45 |
48 n->tokenType = type; | 46 n->tokenType = type; |
47 n->cc = cc; | |
49 n->left = left; | 48 n->left = left; |
50 n->right = right; | 49 n->right = right; |
51 n->nodeNumber = ri->nodeNumber; | 50 n->nodeNumber = ri->nodeNumber; |
52 ri->nodeNumber++; | 51 ri->nodeNumber++; |
53 | |
54 if (type == 'a') { | |
55 n->cc = createCharClassWord(ri); | |
56 } | |
57 | 52 |
58 return n; | 53 return n; |
59 } | 54 } |
60 | 55 |
61 | 56 |
62 // <charClass> ::= '['<literal>'-'<literal>']' | 57 // <charClass> ::= '['<literal>'-'<literal>']' |
63 static | 58 static |
64 NodePtr charClass(RegexInfoPtr ri) { | 59 NodePtr charClass(RegexInfoPtr ri) { |
65 NodePtr n = allocateNode(); | |
66 | |
67 n->tokenType = 'c'; | |
68 n->nodeNumber = ri->nodeNumber; | |
69 ri->nodeNumber++; | |
70 | 60 |
71 CharClassPtr cc = NEW(CharClass); | 61 CharClassPtr cc = NEW(CharClass); |
72 cc->type = 'r'; | 62 cc->type = 'r'; |
73 cc->cond = NEW(Condition); | 63 cc->cond = NEW(Condition); |
74 cc->cond->range = NEW(RangeList); | 64 cc->cond->range = NEW(RangeList); |
91 i++; | 81 i++; |
92 } | 82 } |
93 // TODO literal support | 83 // TODO literal support |
94 | 84 |
95 rangeList->end = ri->ptr + i - 1; | 85 rangeList->end = ri->ptr + i - 1; |
96 | 86 NodePtr n = createNode(ri,'c',cc,0,0); |
87 token(ri); | |
97 return n; | 88 return n; |
98 } | 89 } |
99 | 90 |
100 // <literal> ::= [a-z][A-Z][0-9] | 91 // <literal> ::= [a-z][A-Z][0-9] |
101 static | 92 static |
102 NodePtr literal(RegexInfoPtr ri) { | 93 NodePtr literal(RegexInfoPtr ri) { |
103 NodePtr n = createNode(ri,'a',0,0); | 94 CharClassPtr cc = createCharClassWord(ri); |
95 NodePtr n = createNode(ri,'a',cc,0,0); | |
104 return n; | 96 return n; |
105 } | |
106 | |
107 // <group> ::= '('<regex>')' | |
108 static | |
109 NodePtr group(RegexInfoPtr ri) { | |
110 return regex(ri); | |
111 } | |
112 | |
113 static | |
114 void asterCheck(RegexInfoPtr ri) { | |
115 if (ri->ptr[0] == '*') { | |
116 ri->asterFlag = true; | |
117 } | |
118 return; | |
119 } | 97 } |
120 | 98 |
121 static | 99 static |
122 void token(RegexInfoPtr ri) { | 100 void token(RegexInfoPtr ri) { |
123 while (ri->ptr[0] != '\0') { | 101 while (ri->ptr[0] != '\0') { |
128 return; | 106 return; |
129 } else if (ri->ptr[0] == ')') { | 107 } else if (ri->ptr[0] == ')') { |
130 ri->ptr++; | 108 ri->ptr++; |
131 ri->tokenType = ')'; | 109 ri->tokenType = ')'; |
132 ri->tokenValue = ri->ptr; | 110 ri->tokenValue = ri->ptr; |
133 asterCheck(ri); | |
134 return; | 111 return; |
135 } else if (ri->ptr[0] == '[') { | 112 } else if (ri->ptr[0] == '[') { |
136 ri->ptr++; | 113 ri->ptr++; |
137 ri->tokenType = 'c'; | 114 ri->tokenType = 'c'; |
138 ri->tokenValue = ri->ptr; | 115 ri->tokenValue = ri->ptr; |
158 \utf-8 etc... | 135 \utf-8 etc... |
159 */ | 136 */ |
160 } else { | 137 } else { |
161 ri->tokenType = 'a'; | 138 ri->tokenType = 'a'; |
162 ri->tokenValue = ri->ptr; | 139 ri->tokenValue = ri->ptr; |
163 while (isalnum(ri->ptr[0])) { | 140 if (isalnum(ri->ptr[0])) { |
164 ri->ptr++; | 141 ri->ptr++; |
165 } | 142 } |
166 asterCheck(ri); | |
167 return; | 143 return; |
168 } | 144 } |
169 } | 145 } |
146 ri->tokenType = 0; | |
147 ri->tokenValue = NULL; | |
170 return; | 148 return; |
171 } | 149 } |
172 | 150 |
173 // <regexAtom> ::= <literal>|<charClass>|<group> | 151 // <regexAtom> ::= <literal>|<charClass>|<group> |
174 static | 152 static |
175 NodePtr regexAtom(RegexInfoPtr ri) { | 153 NodePtr regexAtom(RegexInfoPtr ri) { |
176 | 154 |
177 token(ri); | |
178 NodePtr n = NULL; | 155 NodePtr n = NULL; |
179 if (ri->tokenType == 'c') n = charClass(ri); | 156 if (ri->tokenType == 'c') n = charClass(ri); |
180 if (ri->tokenType == 'a') n = literal(ri); | 157 else if (ri->tokenType == 'a') n = literal(ri); |
181 if (ri->tokenType == '(') n = group(ri); | 158 else if (ri->tokenType == '(') { |
159 n = regex(ri); | |
160 if (ri->tokenType != ')') { | |
161 // error | |
162 } | |
163 token(ri); | |
164 } | |
165 if (ri->tokenType == '*') { | |
166 n = createNode(ri,'*',0,n,0); | |
167 token(ri); | |
168 } | |
182 | 169 |
183 return n; | 170 return n; |
184 } | 171 } |
185 | 172 |
186 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> | 173 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> |
187 NodePtr regex(RegexInfoPtr ri) { | 174 NodePtr regex(RegexInfoPtr ri) { |
175 token(ri); | |
188 NodePtr n = regexAtom(ri); | 176 NodePtr n = regexAtom(ri); |
189 while (ri->ptr[0]) { | 177 while (ri->tokenType) { |
190 token(ri); | |
191 if (ri->tokenType == '*') { | 178 if (ri->tokenType == '*') { |
192 n = createNode(ri,'*',n,0); | 179 n = createNode(ri,'*',0,n,0); |
193 ri->asterFlag = false; | 180 token(ri); |
181 return n; | |
194 } else if (ri->tokenType == '|') { | 182 } else if (ri->tokenType == '|') { |
183 n = createNode(ri,'|',0,n,0); | |
195 NodePtr n1 = regex(ri); | 184 NodePtr n1 = regex(ri); |
196 n = createNode(ri,'|',n,n1); | 185 n->right = n1; |
197 } else if (ri->tokenType == '(') { | |
198 ri->ptr--; | |
199 NodePtr n1 = regex(ri); | |
200 if (ri->asterFlag == true) { | |
201 n1 = createNode(ri,'*',n1,0); | |
202 ri->asterFlag = false; | |
203 ri->ptr++; | |
204 } | |
205 n = createNode(ri,'+',n,n1); | |
206 } else if (ri->tokenType == ')') { | 186 } else if (ri->tokenType == ')') { |
207 if (ri->orNum != 0 && ri->ptr[0] != ')') { | |
208 ri->ptr--; | |
209 ri->orNum--; | |
210 } | |
211 return n; | 187 return n; |
212 } else { | 188 } else { |
213 NodePtr n1 = NULL; | 189 n = createNode(ri,'+',0,n,0); |
214 if (ri->asterFlag == true) { | 190 NodePtr n1 = regexAtom(ri); |
215 ri->ptr = ri->tokenValue; | 191 n->right = n1; |
216 NodePtr n1 = regexAtom(ri); | |
217 n1 = createNode(ri,'*',n1,0); | |
218 ri->asterFlag = false; | |
219 ri->ptr++; | |
220 } else { | |
221 n1 = regex(ri); | |
222 } | |
223 n = createNode(ri,'+',n,n1); | |
224 } | 192 } |
225 } return n; | 193 } |
194 return n; | |
226 } | 195 } |