comparison regexParser/regexParser.cc @ 178:5e8c6857934c pairPro

implement charClassMerge
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Wed, 23 Dec 2015 19:17:36 +0900
parents 3be0fbcd4b52
children 6cf8252f3912
comparison
equal deleted inserted replaced
177:8de9a33f6ae5 178:5e8c6857934c
41 } 41 }
42 42
43 return n; 43 return n;
44 } 44 }
45 45
46 CharClassPtr createCharClassRange(unsigned long begin, unsigned long end, CharClassPtr left, CharClassPtr right) {
47 CharClassPtr cc = NEW(CharClass);
48 cc->type = 'r';
49 cc->cond.range.begin = begin;
50 cc->cond.range.end = end;
51 cc->left = left;
52 cc->right = right;
53 cc->nextState.bitContainer = 0;
54 return cc;
55 }
56
57 CharClassPtr createCharClassWord(RegexInfoPtr ri) { 46 CharClassPtr createCharClassWord(RegexInfoPtr ri) {
58 CharClassPtr cc = NEW(CharClass); 47 CharClassPtr cc = NEW(CharClass);
59 cc->type = 'a'; 48 cc->type = 'a';
60 cc->cond.w.word = ri->tokenValue; 49 cc->cond.w.word = ri->tokenValue;
61 cc->cond.w.length = ri->ptr - ri->tokenValue; 50 cc->cond.w.length = ri->ptr - ri->tokenValue;
62 cc->nextState.bitContainer = 0; 51 cc->cond.range.begin = cc->cond.range.end = *ri->tokenValue;
52 cc->nextState = ri->current->bitSet;
63 return cc; 53 return cc;
64 } 54 }
65 55
66 /* 56 /*
67 cond.range.begin cond.range.end 57 cond.range.begin cond.range.end
87 77
88 |----------------| 78 |----------------|
89 13. b--e 79 13. b--e
90 80
91 */ 81 */
92 CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end) { 82 CharClassPtr insertCharClass(CharClassPtr cc, unsigned long begin, unsigned long end,BitVector nextState) {
93 83 if (cc == NULL) {
84 createCharClassRange(begin,end,nextState.bitContainer,0,0);
85 }
94 if (end < cc->cond.range.begin ) { // 1 86 if (end < cc->cond.range.begin ) { // 1
95 if (cc->left) { 87 if (cc->left) {
96 cc->left = insertCharClass(cc->left,begin,end); 88 cc->left = insertCharClass(cc->left,begin,end,nextState);
97 } else { 89 } else {
98 cc->left = createCharClassRange(begin,end,0,0); 90 cc->left = createCharClassRange(begin,end,nextState.bitContainer,0,0);
99 } 91 }
100 return cc; 92 return cc;
101 } else if (end == cc->cond.range.begin ) { // 2 93 } else if (end == cc->cond.range.begin ) { // 2
102 cc->cond.range.begin = begin; 94 cc->cond.range.begin = begin;
103 return cc; 95 return cc;
106 cc->cond.range.begin = begin; 98 cc->cond.range.begin = begin;
107 } 99 }
108 return cc; 100 return cc;
109 } else if (begin > cc->cond.range.end ) { // 13 101 } else if (begin > cc->cond.range.end ) { // 13
110 if (cc->right) { 102 if (cc->right) {
111 cc->right = insertCharClass(cc->right,begin,end); 103 cc->right = insertCharClass(cc->right,begin,end,nextState);
112 } else { 104 } else {
113 cc->right = createCharClassRange(begin,end,0,0); 105 cc->right = createCharClassRange(begin,end,nextState.bitContainer,0,0);
114 } 106 }
115 return cc; 107 return cc;
116 } 108 }
117 if (cc->right) { 109 if (cc->right) {
118 CharClassPtr right = cc->right; 110 CharClassPtr right = cc->right;
119 begin = cc->cond.range.begin; 111 begin = cc->cond.range.begin;
120 free(cc); 112 free(cc);
121 return insertCharClass(right,begin,end); 113 return insertCharClass(right,begin,end,nextState);
122 } 114 }
123 if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12 115 if (begin >= cc->cond.range.begin && begin <= cc->cond.range.end) { // 12
124 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8 116 if (end > cc->cond.range.end) cc->cond.range.end = end; // 11,8
125 } else if (begin < cc->cond.range.begin) { // 5 117 } else if (begin < cc->cond.range.begin) { // 5
126 cc->cond.range.begin = begin; 118 cc->cond.range.begin = begin;
160 } 152 }
161 153
162 RangeListPtr r = cc->cond.range.next; 154 RangeListPtr r = cc->cond.range.next;
163 cc->cond.range.next = 0; 155 cc->cond.range.next = 0;
164 for (; r; r = r->next) { 156 for (; r; r = r->next) {
165 cc = insertCharClass(cc, r->begin, r->end); 157 cc = insertCharClass(cc, r->begin, r->end,ri->current->bitState);
166 } 158 }
167 159
168 n->cc = cc; 160 n->cc = ri->current->cc = mergeTransition(ri->current,cc);
169 // TODO literal support 161 // TODO literal support
170 // merge rangeList here 162 // merge rangeList here
171 if (*ri->ptr) ri->ptr++; 163 if (*ri->ptr) ri->ptr++;
172 token(ri); 164 token(ri);
173 return n; 165 return n;
253 token(ri); 245 token(ri);
254 } 246 }
255 if (ri->tokenType == '*') { 247 if (ri->tokenType == '*') {
256 n = createNode(ri,'*',0,n,0); 248 n = createNode(ri,'*',0,n,0);
257 token(ri); 249 token(ri);
258 } 250 ri->asterisk = true;
259 251 ri->node = n;
260 return n; 252 }
261 } 253 return n;
254 }
255
256 RegexInfoPtr createRegexInfo (RegexInfoPtr ri) {
257 ri->stateNumber++;
258 ri->asterisk = false;
259 ri->current = NEW(State);
260 ri->current->bitState.bitContainer = 0
261 bitSet(ri.current->bitState,ri->stateNumber);
262 ri->current->next = ri->states;
263 ri->current->cc = NULL;
264 ri->current->node = NULL;
265 ri->states = ri->current;
266 return ri;
267 }
268
262 269
263 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex> 270 // <regex> ::= <regexAtom> | <regexAtom>'*'<regex> | <regexAtom>'|'<regex> | <regexAtom><regexAtom>'*' | <regexAtom><regex>
264 NodePtr regex(RegexInfoPtr ri) { 271 NodePtr regex(RegexInfoPtr ri) {
265 token(ri); 272 token(ri);
266 NodePtr n = regexAtom(ri); 273 NodePtr n = regexAtom(ri);
267 while (ri->tokenType) { 274 while (ri->tokenType) {
268 if (ri->tokenType == '*') { 275 if (ri->tokenType == '*') {
269 n = createNode(ri,'*',0,n,0); 276 n = createNode(ri,'*',0,n,0);
270 token(ri); 277 token(ri);
278 ri->asterisk = true;
279 ri->node = n;
271 return n; 280 return n;
272 } else if (ri->tokenType == '|') { 281 } else if (ri->tokenType == '|') {
273 n = createNode(ri,'|',0,n,0); 282 n = createNode(ri,'|',0,n,0);
274 NodePtr n1 = regex(ri); 283 NodePtr n1 = regex(ri);
275 n->right = n1; 284 n->right = n1;
285 n->node = n;
276 } else if (ri->tokenType == ')') { 286 } else if (ri->tokenType == ')') {
277 return n; 287 return n;
278 } else { 288 } else {
279 n = createNode(ri,'+',0,n,0); 289 n = createNode(ri,'+',0,n,0);
280 NodePtr n1 = regexAtom(ri); 290 if (!ri->asterisk) {
281 n->right = n1; 291 StatePtr save = ri->current;
282 } 292 ri = createRegexInfo(ri);
283 } 293 NodePtr n1 = regexAtom(ri);
284 return n; 294 n->right = n1;
285 } 295 ri->current = save;
296 } else {
297 NodePtr n1 = regexAtom(ri);
298 n->right = n1;
299 }
300 }
301 }
302 return n;
303 }