Mercurial > hg > Applications > Grep
comparison regexParser/subsetConstruction.cc @ 216:4852bfa85db4
spell fix
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 30 Dec 2015 15:05:06 +0900 |
parents | regexParser/subsetConstraction.cc@63e9224c7b2b |
children | a9e3512120e2 |
comparison
equal
deleted
inserted
replaced
215:63e9224c7b2b | 216:4852bfa85db4 |
---|---|
1 #include <stdio.h> | |
2 #include <stdlib.h> | |
3 #include <ctype.h> | |
4 | |
5 #include "regexParser.h" | |
6 #include "subsetConstruction.h" | |
7 #include "node.h" | |
8 #include "BitVector.h" | |
9 #include "error.h" | |
10 | |
11 CharClassPtr createCharClassRange(unsigned long begin, unsigned long end,unsigned long state, CharClassPtr left, CharClassPtr right) { | |
12 CharClassPtr cc = NEW(CharClass); | |
13 cc->type = 'r'; | |
14 cc->cond.range.begin = begin; | |
15 cc->cond.range.end = end; | |
16 cc->cond.range.next = NULL; | |
17 cc->left = left; | |
18 cc->right = right; | |
19 cc->nextState.bitContainer = state; | |
20 return cc; | |
21 } | |
22 | |
23 CharClassPtr mergeCCTree(CharClassPtr cc,unsigned char mBegin,unsigned char mEnd,BitVector nextState) { | |
24 CharClassPtr cc1; | |
25 if (cc) { | |
26 cc1 = charClassMerge(cc,mBegin,mEnd,nextState); | |
27 } else { | |
28 cc1 = createCharClassRange(mBegin,mEnd,nextState.bitContainer,NULL,NULL); | |
29 } | |
30 return cc1; | |
31 } | |
32 | |
33 CharClassPtr charClassMerge(CharClassPtr cc,unsigned long begin, unsigned long end, BitVector nextState) { | |
34 // 重なっているccの領域を分割する | |
35 // 必要ならばnextStateを重ねあわせる | |
36 // 変更があった場合は新しくリストを作って返す | |
37 if (end < cc->cond.range.begin ) { // 1 | |
38 if (cc->left) { | |
39 return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,charClassMerge(cc->left,begin,end,nextState),cc->right); | |
40 } else { | |
41 return createCharClassRange(begin,end,nextState.bitContainer,NULL,cc); | |
42 } | |
43 } else if (end == cc->cond.range.begin && begin != end ) { // 2 | |
44 CharClassPtr cc1 = mergeCCTree(cc->left,begin,end-1,nextState); | |
45 if (cc->cond.range.begin == cc->cond.range.end) { | |
46 return createCharClassRange(cc->cond.range.begin,cc->cond.range.end, | |
47 cc->nextState.bitContainer | nextState.bitContainer, cc1,cc->right); | |
48 } | |
49 CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right); | |
50 return createCharClassRange(cc->cond.range.begin,cc->cond.range.begin, | |
51 cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); | |
52 } else if (end < cc->cond.range.end) { // range.begin < end | |
53 if (begin < cc->cond.range.begin) { // 3 | |
54 CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); | |
55 CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,cc->left,cc->right); | |
56 return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); | |
57 } | |
58 if (begin == cc->cond.range.begin) { // 6 | |
59 CharClassPtr cc2 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right); | |
60 return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc2); | |
61 } | |
62 // 9 | |
63 CharClassPtr cc2 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL); | |
64 CharClassPtr cc3 = createCharClassRange(end+1,cc->cond.range.end,cc->nextState.bitContainer,NULL,cc->right); | |
65 return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc2,cc3); | |
66 } else if (end == cc->cond.range.end) { | |
67 if (begin == cc->cond.range.begin) { // 7 | |
68 return createCharClassRange(begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc->right); | |
69 } else if (begin < cc->cond.range.begin) { // 4 | |
70 CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); | |
71 return createCharClassRange(cc->cond.range.begin,end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc->right); | |
72 } | |
73 // 10 cond.range.begin < begin | |
74 CharClassPtr cc2 = createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,NULL,cc->right); | |
75 return createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,cc2); | |
76 } | |
77 if (begin > cc->cond.range.end ) { // 13 | |
78 if (cc->right) { | |
79 return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,nextState.bitContainer,cc->left,charClassMerge(cc->right,begin,end,nextState)); | |
80 } else { | |
81 return createCharClassRange(begin,end,nextState.bitContainer,cc,NULL); | |
82 } | |
83 } | |
84 if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { | |
85 if (end > cc->cond.range.end) { // cond.range.end < end | |
86 if (begin == cc->cond.range.begin) { // 8 | |
87 CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); | |
88 return createCharClassRange(cc->cond.range.begin,cc->cond.range.end, | |
89 cc->nextState.bitContainer | nextState.bitContainer,cc->left,cc1); | |
90 } | |
91 if (begin > cc->cond.range.begin) { // 11,12 | |
92 CharClassPtr cc1 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); | |
93 CharClassPtr cc3 = createCharClassRange(cc->cond.range.begin,begin-1,cc->nextState.bitContainer,cc->left,NULL); | |
94 return createCharClassRange(begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc3,cc1); | |
95 } | |
96 } | |
97 } else if (begin < cc->cond.range.begin) { // 5 | |
98 CharClassPtr cc1 = mergeCCTree(cc->left,begin,cc->cond.range.begin-1,nextState); | |
99 CharClassPtr cc3 = mergeCCTree(cc->right,cc->cond.range.end+1,end,nextState); | |
100 return createCharClassRange(cc->cond.range.begin,cc->cond.range.end,cc->nextState.bitContainer | nextState.bitContainer,cc1,cc3); | |
101 } else { | |
102 printf("charClassMerge Error : begin %lu end %lu cc->begin %lu cc->end %lu\n", begin,end,cc->cond.range.begin,cc->cond.range.end); | |
103 } | |
104 return cc; | |
105 } | |
106 | |
107 void findLeftMost(CharClassPtr next,CharClassWalkerPtr walk) { | |
108 while (next->left) { | |
109 CharClassStackPtr ccs = NEW(CharClassStack); | |
110 ccs->next = walk->stack; | |
111 ccs->turn = walk->turn; | |
112 walk->turn = LEFT; | |
113 ccs->cc = next; | |
114 walk->stack = ccs; | |
115 next = next->left; | |
116 } | |
117 walk->turn = SELF; | |
118 walk->next = next; | |
119 } | |
120 | |
121 CharClassWalkerPtr createCharClassWalker (CharClassPtr next) { | |
122 CharClassWalkerPtr walk = NEW(CharClassWalker); | |
123 walk->next = NULL; | |
124 walk->stack = NULL; | |
125 if (!next) return walk; | |
126 findLeftMost(next,walk); | |
127 return walk; | |
128 } | |
129 | |
130 bool hasNext(CharClassWalkerPtr walk) { | |
131 return walk->next != NULL; | |
132 } | |
133 | |
134 CharClassStackPtr charClassStackPop(CharClassWalkerPtr walk) { | |
135 CharClassStackPtr prev = walk->stack->next; | |
136 walk->turn = walk->stack->turn; | |
137 free(walk->stack); | |
138 walk->stack = prev; | |
139 return prev; | |
140 } | |
141 | |
142 CharClassPtr getNext(CharClassWalkerPtr walk) { | |
143 CharClassPtr current = walk->next; | |
144 walk->next = NULL; | |
145 if (walk->turn == SELF) { | |
146 if (current->right) { | |
147 walk->turn = RIGHT; | |
148 findLeftMost(current->right,walk); | |
149 return current; | |
150 } | |
151 } | |
152 while (walk->stack) { | |
153 walk->next = walk->stack->cc; | |
154 charClassStackPop(walk); | |
155 if (walk->turn == LEFT) { | |
156 walk->turn = SELF; | |
157 return current; | |
158 } | |
159 } | |
160 walk->next = NULL; | |
161 return current; | |
162 } | |
163 | |
164 void setState(CharClassPtr cc, BitVector bi) { | |
165 cc->nextState = bi; | |
166 if (cc->left) { | |
167 setState(cc->left,bi); | |
168 } | |
169 cc->nextState = bi; | |
170 if (cc->right) { | |
171 setState(cc->right,bi); | |
172 } | |
173 } | |
174 | |
175 CharClassPtr mergeTransition(StatePtr x,CharClassPtr y) { | |
176 if (x->cc == NULL) { | |
177 return y; | |
178 } | |
179 CharClassWalkerPtr walk = createCharClassWalker(x->cc); | |
180 CharClassPtr ccy = y; | |
181 BitVector bi; | |
182 while (hasNext(walk)) { | |
183 CharClassPtr cc = getNext(walk); | |
184 unsigned long begin = cc->cond.range.begin; | |
185 unsigned long end = cc->cond.range.end; | |
186 bi = cc->nextState; | |
187 ccy = charClassMerge(ccy,begin,end,bi); | |
188 } | |
189 free(walk); | |
190 return ccy; | |
191 } | |
192 | |
193 /** | |
194 作成する state を linked list | |
195 bitvector を index とした配列に BitVectorPtr を格納 | |
196 state に対応する NodePtr を | |
197 */ | |
198 StatePtr createState(TGValue tgv,NodePtr n) { | |
199 StatePtr s = NEW(State); | |
200 s->stateNum = n->stateNum = tgv.tg->totalStateCount; | |
201 s->next = tgv.tg->stateList; | |
202 tgv.tg->stateList = s; | |
203 s->node = n; | |
204 BitVector bi = createBitVector(n->stateNum); | |
205 s->bitState = bi; | |
206 tgv.tg->totalStateCount++; | |
207 s->cc = n->cc; | |
208 return s; | |
209 } | |
210 | |
211 /** | |
212 pass 1 : 正規表現に必要な状態を探して、それぞれに番号を割り振る | |
213 前が * でない + は新しく状態を作る | |
214 * があったら、次の状態はその時の先頭の状態になる | |
215 常に先頭の状態を返す | |
216 最後が*ならば、それを持ち歩く | |
217 pass 2: | |
218 割り当てられた状態に沿って charclass の行き先を書き換える | |
219 書き換えた charclass を merge する | |
220 前の部分に * がない + は新しい状態をつくるので、state を切り替えて生成する | |
221 */ | |
222 TGValue generateTransition(NodePtr n,TGValue tgv, int pass) { | |
223 if (n->tokenType == '+') { | |
224 TGValue tgvLeft = tgv; | |
225 tgvLeft.endState = n->right->state; | |
226 tgvLeft.asterisk = NULL; | |
227 tgvLeft = generateTransition(n->left,tgv,pass); | |
228 TGValue tgvRight = tgvLeft; | |
229 if (tgvLeft.asterisk) { | |
230 n->right->state = tgv.endState; | |
231 tgvRight.startState = tgvRight.asterisk; | |
232 tgvRight = generateTransition(n->right,tgvRight,pass); | |
233 tgvLeft.asterisk = tgvRight.asterisk; | |
234 return tgvLeft; | |
235 } | |
236 tgvRight.asterisk = NULL; | |
237 if (pass==1) { | |
238 n->right->state = tgvRight.startState = createState(tgvRight,n->right); | |
239 } else { | |
240 tgvLeft.startState = n->right->state; | |
241 tgvLeft.tg->stateArray[tgvLeft.startState->bitState.bitContainer] = tgvLeft.startState ; | |
242 } | |
243 tgvRight = generateTransition(n->right,tgvRight,pass); | |
244 tgvLeft.asterisk = tgvRight.asterisk; | |
245 return tgvLeft; | |
246 } else if (n->tokenType == '|') { | |
247 TGValue tgv1 = generateTransition(n->left,tgv,pass); | |
248 TGValue tgv2 = generateTransition(n->right,tgv1,pass); | |
249 return tgv2; | |
250 } else if (n->tokenType == '*') { | |
251 TGValue tgvAstah = tgv; | |
252 tgvAstah.endState = tgvAstah.startState; | |
253 tgvAstah = generateTransition(n->left,tgvAstah,pass); | |
254 tgvAstah.asterisk = tgvAstah.startState; | |
255 return tgvAstah; | |
256 } else if (n->tokenType == 'c' || n->tokenType == 'a'){ | |
257 TGValue tgv1 = tgv; | |
258 if (pass==1) { | |
259 n->stateNum = tgv.startState->stateNum; | |
260 n->nextStateNum = tgv.endState->stateNum; | |
261 n->state = tgv.startState;; | |
262 n->nextState = tgv.endState; | |
263 } else { | |
264 BitVector bi = createBitVector(n->nextStateNum); | |
265 setState(n->cc,bi); | |
266 tgv1.startState->cc = mergeTransition(tgv1.startState,n->cc); | |
267 } | |
268 return tgv1; | |
269 } else { | |
270 return tgv; | |
271 } | |
272 } | |
273 | |
274 TransitionGeneratorPtr createTransitionGenerator() { | |
275 TransitionGeneratorPtr tg = NEW(TransitionGenerator); | |
276 tg->totalStateCount = 0; | |
277 tg->stack = NULL; | |
278 tg->stateArray = NULL; | |
279 tg->stateList = NULL; | |
280 return tg; | |
281 } | |
282 | |
283 TGValue createTGValue() { | |
284 TGValue tgv; | |
285 tgv.startState = NULL; | |
286 tgv.endState = NULL; | |
287 tgv.asterisk = NULL; | |
288 tgv.tg = createTransitionGenerator(); | |
289 return tgv; | |
290 } | |
291 | |
292 TGValue generateTransitionList(NodePtr n) { | |
293 TGValue tgv = createTGValue(); | |
294 StatePtr startState = tgv.startState = createState(tgv,n); | |
295 NodePtr eof = createNode(NULL,'e',NULL,NULL,NULL); | |
296 StatePtr endState = tgv.endState = createState(tgv,eof); | |
297 tgv = generateTransition(n,tgv,1); | |
298 printTree(n); | |
299 if (tgv.tg->totalStateCount > BITBLOCK) { | |
300 errorMassege("StateMax > BITBLOCK",__LINE__,__FILE__); | |
301 } | |
302 BitVector bi = createBitVector(tgv.tg->totalStateCount); | |
303 tgv.tg->stateArray = (StatePtr*)calloc(bi.bitContainer*2,sizeof(StatePtr*)); | |
304 tgv.tg->stateArray[startState->bitState.bitContainer] = startState; | |
305 tgv.tg->stateArray[endState->bitState.bitContainer] = endState; | |
306 tgv = generateTransition(n,tgv,2); | |
307 return tgv; | |
308 } | |
309 | |
310 void printState(StatePtr state) { | |
311 printf("state : %lx\n",state->bitState.bitContainer); | |
312 long nodeNumber = 0; | |
313 if (state->node) { | |
314 printf("node : %c %lx -> %d\n",state->node->tokenType,state->bitState.bitContainer,state->node->nextStateNum); | |
315 if (state->node->state) | |
316 nodeNumber = state->node->state->bitState.bitContainer; | |
317 } | |
318 if (state->cc) { | |
319 printCharacterClass(state->cc,nodeNumber,4); | |
320 } | |
321 } | |
322 | |
323 void printState(TransitionGeneratorPtr tg) { | |
324 StatePtr state = tg->stateList; | |
325 for (;state;state = state->next) { | |
326 printState(state); | |
327 putchar('\n'); | |
328 } | |
329 } | |
330 | |
331 SCValue createSCValue(TGValue tgv) { | |
332 SCValue scv; | |
333 scv.stateTop = tgv.tg->stateList; | |
334 scv.stateEnd = scv.stateTop; | |
335 while (scv.stateEnd->next) { | |
336 scv.stateEnd = scv.stateEnd->next; | |
337 } | |
338 return scv; | |
339 } | |
340 | |
341 SCValue createState(SCValue scv,BitVector bi) { | |
342 StatePtr s = NEW(State); | |
343 s->stateNum = ++scv.tg->totalStateCount; | |
344 s->next = NULL; | |
345 scv.stateEnd->next = s; | |
346 scv.stateEnd = s; | |
347 s->bitState = bi; | |
348 s->cc = NULL; | |
349 return scv; | |
350 } | |
351 | |
352 /** | |
353 現在のステートに含まれる組み合わせ状態をとってくる | |
354 組み合わされた個々の charclass をmerge して新しい charclass をつくり、組み合わせ状態に登録する | |
355 生成した状態は stateArray に格納するA | |
356 新しい状態ができなくなったら終了 | |
357 | |
358 charClassMerge の段階で新しい状態をリストに登録したら、charclasswalk をする必要がなくなる | |
359 */ | |
360 SCValue subsetConstruction(SCValue scv) { | |
361 for (;scv.stateTop;scv.stateTop = scv.stateTop->next) { | |
362 CharClassWalkerPtr cw = createCharClassWalker(scv.stateTop->cc); | |
363 while (hasNext(cw)) { | |
364 CharClassPtr cc = getNext(cw); | |
365 BitVector bi = cc->nextState; | |
366 if (scv.stateArray[bi.bitContainer]) continue; | |
367 scv = createState(scv,bi); | |
368 StatePtr s = scv.stateEnd; | |
369 for (;bi.bitContainer;) { | |
370 int bitPosition = searchBit(bi); | |
371 unsigned long baseNum = 1 << bitPosition; | |
372 bi.bitContainer ^= baseNum; | |
373 StatePtr base = scv.stateArray[baseNum]; | |
374 if (base == NULL) {// error | |
375 continue; | |
376 } | |
377 CharClassPtr merge = mergeTransition(s,base->cc); | |
378 s->cc = merge; | |
379 } | |
380 scv.stateArray[bi.bitContainer] = s; | |
381 } | |
382 } | |
383 return scv; | |
384 } |