comparison CbC-examples/regexp/dfareg.py @ 62:70947ddafad7

added test code, CbC-example/regexp
author ryoma <e075725@ie.u-ryukyu.ac.jp>
date Sun, 25 Apr 2010 17:46:01 +0900
parents
children
comparison
equal deleted inserted replaced
61:60c1b2f8487a 62:70947ddafad7
1 #!/usr/bin/env python
2
3 import copy
4 import sys
5
6 class NondeterministicFiniteAutomaton(object):
7 def __init__(self, transition, start, accepts, map_):
8 self.transition = transition
9 self.start = start
10 self.accepts = accepts
11 self.map = map_
12
13 def epsilonExpand(self, set_):
14 que = set(set_)
15 done = set()
16 while que:
17 stat = que.pop()
18 nexts = self.transition(stat, "")
19 done.add(stat)
20 for nextStat in nexts:
21 if not nextStat in done: que.add(nextStat)
22 return frozenset(done)
23
24 class DeterministicFiniteAutomaton(object):
25 def __init__(self, transition, start, accepts, map_):
26 self.transition = transition
27 self.start = start
28 self.accepts = accepts
29 self.map = map_
30 def getRuntime(self):
31 return DFARuntime(self)
32
33 class DFARuntime(object):
34 def __init__(self, DFA):
35 self.DFA = DFA
36 self.curState = self.DFA.start
37
38 def doTransition(self, char):
39 self.curState = self.DFA.transition(
40 self.curState, char)
41
42 def isAcceptState(self):
43 return self.curState in self.DFA.accepts
44
45 def doesAccept(self, input):
46 for alphabet in input:
47 self.doTransition(alphabet)
48 return self.isAcceptState()
49
50 class Token(object):
51 CHARACTER = 0
52 OPE_UNION = 1
53 OPE_STAR = 2
54 LPAREN = 3
55 RPAREN = 4
56 EOF = 5
57
58 def __init__(self, value, kind):
59 self.value = value
60 self.kind = kind
61
62
63 class Lexer(object):
64 def __init__(self, str):
65 self.stringList = list(str)
66
67 def scan(self):
68 if not self.stringList:
69 return Token(None, Token.EOF)
70
71 ch = self.stringList.pop(0)
72 if ch == '\\':
73 return Token(self.stringList.pop(0), Token.CHARACTER)
74 elif ch == '|':
75 return Token(ch, Token.OPE_UNION)
76 elif ch == '(':
77 return Token(ch, Token.LPAREN)
78 elif ch == ')':
79 return Token(ch, Token.RPAREN)
80 elif ch == '*':
81 return Token(ch, Token.OPE_STAR)
82 else:
83 return Token(ch, Token.CHARACTER)
84
85 """
86 Context-free Grammer:
87 (A) expression -> subexpr EOF
88 (B) subexpr -> seq '|' subexpr | seq
89 (C) seq -> subseq | ''
90 (D) subseq -> star subseq | star
91 (E) star -> factor '*' | factor
92 (F) factor -> '(' subexpr ')' | CHARACTER
93 """
94
95 class Parser(object):
96 def __init__(self, lexer):
97 self.lexer = lexer
98 self.look = None
99 self.move()
100
101 def match(self, tag):
102 if self.look.kind != tag:
103 raise Exeption("syntax error")
104 self.move()
105
106 def move(self):
107 self.look = self.lexer.scan()
108
109 def factor(self):
110 if self.look.kind == Token.LPAREN:
111 # factor -> '(' subexpr ')'
112 self.match(Token.LPAREN)
113 node = self.subexpr()
114 self.match(Token.RPAREN)
115 return node
116 else:
117 # factor -> CHARACTER
118 node = Character(self.look.value)
119 self.match(Token.CHARACTER)
120 return node
121
122 def star(self):
123 # star -> factor '*' | factor
124 node = self.factor()
125 if self.look.kind == Token.OPE_STAR:
126 self.match(Token.OPE_STAR)
127 node = Star(node)
128 return node
129
130 def seq(self):
131 if self.look.kind == Token.LPAREN \
132 or self.look.kind == Token.CHARACTER:
133 # seq -> subseq
134 return self.subseq()
135 else:
136 # seq -> ''
137 return Character("")
138
139 def subseq(self):
140 node1 = self.star()
141 if self.look.kind == Token.LPAREN \
142 or self.look.kind == Token.CHARACTER:
143 # subseq -> star subseq
144 node2 = self.subseq()
145 node = Concat(node1, node2)
146 return node
147 else:
148 # subseq -> star
149 return node1
150
151 def subexpr(self):
152 node1 = self.seq()
153 if self.look.kind == Token.OPE_UNION:
154 self.match(Token.OPE_UNION)
155 # subexpr -> seq '|' subexpr
156 node2 = self.subexpr()
157 node = Union(node1, node2)
158 return node
159 else:
160 # subexpr -> seq
161 return node1
162
163 def expression(self):
164 # expression -> subexpr EOF
165 node = self.subexpr()
166 self.match(Token.EOF)
167
168 # Create TREE, NFA
169 context = Context()
170 fragment = node.assemble(context)
171 return fragment.build()
172
173 class Context(object):
174 def __init__(self):
175 self.stateCount = 0
176
177 def newState(self):
178 self.stateCount += 1
179 return self.stateCount
180
181 class NFAFragment(object):
182 def __init__(self):
183 self.start = None # integer
184 self.accepts = None # frozenset
185 self.map = dict() # transition
186
187 def connect(self, from_, char, to):
188 slot = self.map.setdefault((from_, char), set())
189 slot.add(to)
190
191 def newSkelton(self):
192 # copy fragment (self), and it return
193 newFrag = NFAFragment();
194 newFrag.map = copy.deepcopy(self.map)
195 return newFrag
196
197 def __or__(self, frag):
198 newFrag = self.newSkelton()
199 for k, v in frag.map.iteritems():
200 newFrag.map[k] = v.copy()
201 return newFrag
202
203 def build(self):
204 map_ = self.map
205 def transition(state, char):
206 return frozenset(map_.get((state, char), []))
207
208 return NondeterministicFiniteAutomaton(
209 transition,
210 self.start,
211 self.accepts,
212 self.map
213 )
214
215 class Character(object):
216 def __init__(self, char):
217 self.char = char
218
219 def assemble(self, context):
220 frag = NFAFragment()
221 s1 = context.newState()
222 s2 = context.newState()
223 frag.connect(s1, self.char, s2)
224 frag.start = s1
225 frag.accepts = frozenset([s2])
226 return frag
227
228 class Union(object):
229 def __init__(self, op1, op2):
230 self.op1 = op1
231 self.op2 = op2
232
233 def assemble(self, context):
234 frag1 = self.op1.assemble(context)
235 frag2 = self.op2.assemble(context)
236 frag = frag1 | frag2
237 s = context.newState()
238 frag.connect(s, "", frag1.start)
239 frag.connect(s, "", frag2.start)
240 frag.start = s
241 frag.accepts = frag1.accepts | frag2.accepts
242 return frag
243
244 class Concat(object):
245 def __init__(self, op1, op2):
246 self.op1 = op1
247 self.op2 = op2
248
249 def assemble(self, context):
250 frag1 = self.op1.assemble(context)
251 frag2 = self.op2.assemble(context)
252 frag = frag1 | frag2
253
254 for state in frag1.accepts:
255 frag.connect(state, "", frag2.start)
256
257 frag.start = frag1.start
258 frag.accepts = frag2.accepts
259 return frag
260
261 class Star(object):
262 def __init__(self, op):
263 self.op = op
264
265 def assemble(self, context):
266 fragOrig = self.op.assemble(context)
267 frag = fragOrig.newSkelton()
268
269 for state in fragOrig.accepts:
270 frag.connect(state, "", fragOrig.start)
271
272 s = context.newState()
273 frag.connect(s, "", fragOrig.start)
274 frag.start = s
275 frag.accepts = fragOrig.accepts | frozenset([s])
276 return frag
277
278 class NonDisjoinSets(object):
279 def __init__(self, sub):
280 self.sub = sub
281 def __contains__(self, a_set):
282 return a_set & self.sub
283
284 def nfa2dfa(nfa):
285 def transition(set_, alpha):
286 ret = set()
287 for elem in set_:
288 ret |= nfa.transition(elem, alpha)
289 return nfa.epsilonExpand(frozenset(ret))
290
291 def stateSetTransition(stateSet, char):
292 trans = set()
293 for state in stateSet:
294 t = nfa.map.setdefault((state, char), None)
295 if not t == None: trans.update(nfa.epsilonExpand(t))
296 return frozenset(trans)
297
298 map_ = dict()
299 que = set(frozenset([nfa.epsilonExpand(set([nfa.start]))]))
300 done = set()
301
302 while que:
303 stateSet = que.pop()
304
305 for state in stateSet:
306 for k, v in nfa.map.iteritems():
307 if state == k[0] and k[1] != '':
308 slot = map_.setdefault((stateSet, k[1]), set())
309 slot.update(nfa.epsilonExpand(v))
310
311 done.add(stateSet)
312
313 for v in map_.itervalues():
314 if not v in done:
315 que.add(frozenset(v))
316
317 return DeterministicFiniteAutomaton(
318 transition,
319 nfa.epsilonExpand(frozenset([nfa.start])),
320 NonDisjoinSets(nfa.accepts),
321 map_
322 )
323
324 # emit CbC-souce, from dfa
325 def dfa2CbC(dfa, regexp):
326
327 # return state name from set
328 def set2name(set_):
329 l = list(set_)
330 l.sort()
331 return '_'.join(str(x) for x in l)
332
333 funcMap = dict()
334 funcMap[dfa.start] = set()
335
336 # dfa.map => {(frozenset([1, 15]), 'd'): frozenset([16]), ....}
337 # : frozenset(1, 15) x 'd' -> frozenset([16])
338
339 for v in dfa.map.itervalues():
340 funcMap[frozenset(v)] = set()
341
342 for key in dfa.map.iterkeys():
343 slot = funcMap.setdefault(key[0], set())
344 slot.add(key[1])
345
346 # emit cbc code
347 print "#include <stdio.h>\n"
348 print "#include <stdlib.h>\n"
349 for k in funcMap.iterkeys():
350 print '__code state_' + set2name(k) + "(char* s);"
351 print '__code accept();'
352 print '__code reject();'
353 print """
354 int main(int argc, char* argv[]) {
355 \tif (argc == 1) {
356 \t\tprintf(\"usage: %%s text\\n\", argv[0]);
357 \t\texit(0);
358 }
359
360 \tputs(\"regexp: %s\");
361 \tputs(\"number of state: %d\");
362 \tprintf(\"string: %%s\\n\", argv[1]);
363 \tgoto state_%s(argv[1]);
364 \treturn 0;
365 }\n""" % (regexp, len(funcMap), set2name(dfa.start))
366
367 for k, v in funcMap.iteritems():
368 print '__code state_' + set2name(k) + "(char* s) {"
369 print "\tswitch(*s++) {"
370 for case in v:
371 print "\t\tcase '%c': " % (case)
372 print "\t\t\tgoto state_%s(s);\n\t\t\tbreak;" % set2name(dfa.map[(frozenset(k), case)])
373 if k in dfa.accepts: print "\t\tcase '\\0': goto accept();\n\t\t\tbreak;"
374 print "\t\tdefault: goto reject();\n\t}"
375 print "}\n"
376
377 print """
378 __code accept() {
379 \tprintf(\"\\nstring matches regexp. \\n\\n\");
380 }\n"""
381 print """
382 __code reject() {
383 \tprintf(\"\\nstring does not match regexp. \\n\\n\");
384 }\n"""
385
386 class Regexp(object):
387 def __init__(self, regexp):
388 self.regexp = regexp
389 self.nfa = None
390 self.dfa = None
391 self._compile()
392
393 def _compile(self):
394 lexer_ = Lexer(self.regexp)
395 parser_ = Parser(lexer_)
396 self.nfa = parser_.expression()
397 self.dfa = nfa2dfa(self.nfa)
398
399 def emitCbC(self):
400 dfa2CbC(self.dfa, self.regexp)
401
402 def matches(self, string):
403 runtime = self.dfa.getRuntime()
404 return runtime.doesAccept(string)
405
406 def compile(regexp):
407 return Regexp(regexp)
408
409 def main(argv):
410 if len(argv) == 1 : exit("usage: dfareg.py regexp [> file]")
411 r = compile(argv[1])
412 r.emitCbC()
413
414 if __name__ == '__main__' : main(sys.argv)