Mercurial > hg > Applications > Grep
comparison regexParser/regexParser.cc @ 178:5e8c6857934c pairPro
implement charClassMerge
author | Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 23 Dec 2015 19:17:36 +0900 |
parents | 3be0fbcd4b52 |
children | 6cf8252f3912 |
comparison
equal
deleted
inserted
replaced
177:8de9a33f6ae5 | 178:5e8c6857934c |
---|---|
41 } | 41 } |
42 | 42 |
43 return n; | 43 return n; |
44 } | 44 } |
45 | 45 |
46 CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) { | |
47 CharClassPtr cc = NEW(CharClass); | |
48 cc->type = 'r'; | |
49 cc->cond.range.begin = begin; | |
50 cc->cond.range.end = end; | |
51 cc->left = left; | |
52 cc->right = right; | |
53 cc->nextState.bitContainer = 0; | |
54 return cc; | |
55 } | |
56 | |
57 CharClassPtr createCharClassWord(RegexInfoPtr ri) { | 46 CharClassPtr createCharClassWord(RegexInfoPtr ri) { |
58 CharClassPtr cc = NEW(CharClass); | 47 CharClassPtr cc = NEW(CharClass); |
59 cc->type = 'a'; | 48 cc->type = 'a'; |
60 cc->cond.w.word = ri->tokenValue; | 49 cc->cond.w.word = ri->tokenValue; |
61 cc->cond.w.length = ri->ptr - ri->tokenValue; | 50 cc->cond.w.length = ri->ptr - ri->tokenValue; |
62 cc->nextState.bitContainer = 0; | 51 cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue; |
52 cc->nextState = ri->current->bitSet; | |
63 return cc; | 53 return cc; |
64 } | 54 } |
65 | 55 |
66 /* | 56 /* |
67 cond.range.begin cond.range.end | 57 cond.range.begin cond.range.end |
87 | 77 |
88 |----------------| | 78 |----------------| |
89 13. b--e | 79 13. b--e |
90 | 80 |
91 */ | 81 */ |
92 CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) { | 82 CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) { |
93 | 83 if (cc == NULL) { |
84 createCharClassRange(begin,end,nextState.bitContainer,0,0); | |
85 } | |
94 if (end < cc->cond.range.begin ) { // 1 | 86 if (end < cc->cond.range.begin ) { // 1 |
95 if (cc->left) { | 87 if (cc->left) { |
96 cc->left = insertCharClass(cc->left,begin,end); | 88 cc->left = insertCharClass(cc->left,begin,end,nextState); |
97 } else { | 89 } else { |
98 cc->left = createCharClassRange(begin,end,0,0); | 90 cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0); |
99 } | 91 } |
100 return cc; | 92 return cc; |
101 } else if (end == cc->cond.range.begin ) { // 2 | 93 } else if (end == cc->cond.range.begin ) { // 2 |
102 cc->cond.range.begin = begin; | 94 cc->cond.range.begin = begin; |
103 return cc; | 95 return cc; |
106 cc->cond.range.begin = begin; | 98 cc->cond.range.begin = begin; |
107 } | 99 } |
108 return cc; | 100 return cc; |
109 } else if (begin > cc->cond.range.end ) { // 13 | 101 } else if (begin > cc->cond.range.end ) { // 13 |
110 if (cc->right) { | 102 if (cc->right) { |
111 cc->right = insertCharClass(cc->right,begin,end); | 103 cc->right = insertCharClass(cc->right,begin,end,nextState); |
112 } else { | 104 } else { |
113 cc->right = createCharClassRange(begin,end,0,0); | 105 cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0); |
114 } | 106 } |
115 return cc; | 107 return cc; |
116 } | 108 } |
117 if (cc->right) { | 109 if (cc->right) { |
118 CharClassPtr right = cc->right; | 110 CharClassPtr right = cc->right; |
119 begin = cc->cond.range.begin; | 111 begin = cc->cond.range.begin; |
120 free(cc); | 112 free(cc); |
121 return insertCharClass(right,begin,end); | 113 return insertCharClass(right,begin,end,nextState); |
122 } | 114 } |
123 if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 | 115 if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 |
124 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 | 116 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 |
125 } else if (begin < cc->cond.range.begin) { // 5 | 117 } else if (begin < cc->cond.range.begin) { // 5 |
126 cc->cond.range.begin = begin; | 118 cc->cond.range.begin = begin; |
160 } | 152 } |
161 | 153 |
162 RangeListPtr r = cc->cond.range.next; | 154 RangeListPtr r = cc->cond.range.next; |
163 cc->cond.range.next = 0; | 155 cc->cond.range.next = 0; |
164 for (; r; r = r->next) { | 156 for (; r; r = r->next) { |
165 cc = insertCharClass(cc, r->begin, r->end); | 157 cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState); |
166 } | 158 } |
167 | 159 |
168 n->cc = cc; | 160 n->cc = ri->current->cc = mergeTransition(ri->current,cc); |
169 // TODO literal support | 161 // TODO literal support |
170 // merge rangeList here | 162 // merge rangeList here |
171 if (*ri->ptr) ri->ptr++; | 163 if (*ri->ptr) ri->ptr++; |
172 token(ri); | 164 token(ri); |
173 return n; | 165 return n; |
253 token(ri); | 245 token(ri); |
254 } | 246 } |
255 if (ri->tokenType == '*') { | 247 if (ri->tokenType == '*') { |
256 n = createNode(ri,'*',0,n,0); | 248 n = createNode(ri,'*',0,n,0); |
257 token(ri); | 249 token(ri); |
258 } | 250 ri->asterisk = true; |
259 | 251 ri->node = n; |
260 return n; | 252 } |
261 } | 253 return n; |
254 } | |
255 | |
256 RegexInfoPtr createRegexInfo (RegexInfoPtr ri) { | |
257 ri->stateNumber++; | |
258 ri->asterisk = false; | |
259 ri->current = NEW(State); | |
260 ri->current->bitState.bitContainer = 0 | |
261 bitSet(ri.current->bitState,ri->stateNumber); | |
262 ri->current->next = ri->states; | |
263 ri->current->cc = NULL; | |
264 ri->current->node = NULL; | |
265 ri->states = ri->current; | |
266 return ri; | |
267 } | |
268 | |
262 | 269 |
263 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> | 270 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> |
264 NodePtr regex(RegexInfoPtr ri) { | 271 NodePtr regex(RegexInfoPtr ri) { |
265 token(ri); | 272 token(ri); |
266 NodePtr n = regexAtom(ri); | 273 NodePtr n = regexAtom(ri); |
267 while (ri->tokenType) { | 274 while (ri->tokenType) { |
268 if (ri->tokenType == '*') { | 275 if (ri->tokenType == '*') { |
269 n = createNode(ri,'*',0,n,0); | 276 n = createNode(ri,'*',0,n,0); |
270 token(ri); | 277 token(ri); |
278 ri->asterisk = true; | |
279 ri->node = n; | |
271 return n; | 280 return n; |
272 } else if (ri->tokenType == '|') { | 281 } else if (ri->tokenType == '|') { |
273 n = createNode(ri,'|',0,n,0); | 282 n = createNode(ri,'|',0,n,0); |
274 NodePtr n1 = regex(ri); | 283 NodePtr n1 = regex(ri); |
275 n->right = n1; | 284 n->right = n1; |
285 n->node = n; | |
276 } else if (ri->tokenType == ')') { | 286 } else if (ri->tokenType == ')') { |
277 return n; | 287 return n; |
278 } else { | 288 } else { |
279 n = createNode(ri,'+',0,n,0); | 289 n = createNode(ri,'+',0,n,0); |
280 NodePtr n1 = regexAtom(ri); | 290 if (!ri->asterisk) { |
281 n->right = n1; | 291 StatePtr save = ri->current; |
282 } | 292 ri = createRegexInfo(ri); |
283 } | 293 NodePtr n1 = regexAtom(ri); |
284 return n; | 294 n->right = n1; |
285 } | 295 ri->current = save; |
296 } else { | |
297 NodePtr n1 = regexAtom(ri); | |
298 n->right = n1; | |
299 } | |
300 } | |
301 } | |
302 return n; | |
303 } |