# HG changeset patch # User Ryoma SHINYA # Date 1292162671 -32400 # Node ID 4d498b002de5a3bea3eb2d82da48ace6abc0ef5b # Parent 5db856953793bb62ba0b1e925f4b5eefa6e16309 refactor for data-structure (dict -> Transition). diff -r 5db856953793 -r 4d498b002de5 pyrect/regexp/__init__.py --- a/pyrect/regexp/__init__.py Sun Dec 12 19:02:37 2010 +0900 +++ b/pyrect/regexp/__init__.py Sun Dec 12 23:04:31 2010 +0900 @@ -5,7 +5,6 @@ from pyrect.regexp.nfa import NFA from pyrect.regexp.nfa_translator import NFATranslator from pyrect.regexp.dfa_translator import DFATranslator, SuffixDFATranslator, SuffixTrieTranslator -from pyrect.regexp.callgraph import CallGraph from pyrect.regexp.analyzer import Analyzer from pyrect.regexp.char_collector import CharCollector @@ -13,8 +12,8 @@ """Regexp is basic class in Pyrect. this contains regexp, dfa, nfa,, actually it's include all. >>> regexp = Regexp('(A|B)*C') - >>> regexp.dfacg.map - {'1': {}, '0': {(Character:'A'): '0', (Character:'B'): '0', (Character:'C'): '1'}} + >>> print(regexp.dfa.transition) + Transition: 0 x 'A' -> 0, 0 x 'B' -> 0, 0 x 'C' -> 1, >>> regexp.matches('ABC') True >>> regexp = Regexp('(a|b)*cd*e') @@ -29,8 +28,6 @@ self.ast = Parser().parse(regexp) self.nfa = NFATranslator().translate(self.ast) self.dfa = DFATranslator().translate(self.nfa) - self.nfacg = CallGraph(self.nfa) - self.dfacg = CallGraph(self.dfa) an = Analyzer() self.max_len, self.min_len, self.must_words =\ an.analyze(self.ast) diff -r 5db856953793 -r 4d498b002de5 pyrect/regexp/dfa.py --- a/pyrect/regexp/dfa.py Sun Dec 12 19:02:37 2010 +0900 +++ b/pyrect/regexp/dfa.py Sun Dec 12 23:04:31 2010 +0900 @@ -5,15 +5,13 @@ from fa import FA class DFA(FA): - def __init__(self, start, accepts, map_, states, assoc): - def transition(state, input): - return map_[state, input] - FA.__init__(self, transition, start, accepts, map_, states) + def __init__(self, transition, start, accepts, states, assoc): + FA.__init__(self, transition, start, accepts, states) self.assoc = assoc def copy_from(self, dfa): self.start = dfa.start - self.map = copy.deepcopy(dfa.map) + self.transition = copy.deepcopy(dfa.transition) self.accepts = copy.deepcopy(dfa.accepts) self.states = copy.deepcopy(dfa.states) self.assoc = copy.deepcopy(dfa.assoc) @@ -41,11 +39,11 @@ self.acceptable = collections.defaultdict(self.setdict) def search_acceptable(pcur, scur, searched, history, pstart): - for (pcur_, i), pnext in self.pdfa.map.iteritems(): - if pcur_ != pcur or (pnext, i) in searched: continue + for i, pnext in self.pdfa.transition[pcur].iteritems(): + if (pnext, i) in searched: continue searched_ = searched + [(pnext, i)] - snext = self.map[(scur, i)] - if scur not in history: + snext = self.transition[(scur, i)] + if scur not in history: history_ = history + (scur,) else: history_ = history @@ -62,8 +60,7 @@ self.curState = self.DFA.start def do_transition(self, char): - self.curState = self.DFA.transition( - self.curState, char) + self.curState = self.DFA.transition[self.curState, char] def is_accept_state(self): return self.curState in self.DFA.accepts diff -r 5db856953793 -r 4d498b002de5 pyrect/regexp/dfa_translator.py --- a/pyrect/regexp/dfa_translator.py Sun Dec 12 19:02:37 2010 +0900 +++ b/pyrect/regexp/dfa_translator.py Sun Dec 12 23:04:31 2010 +0900 @@ -3,6 +3,7 @@ from pyrect.regexp.parser import Parser from pyrect.regexp.ast import ASTWalker, AnyChar +from pyrect.regexp.fa import Transition from pyrect.regexp.nfa_translator import NFATranslator from pyrect.regexp.nfa import NFA, SuffixNFA, SuffixTrieNFA from pyrect.regexp.dfa import DFA, SuffixDFA @@ -12,12 +13,12 @@ actually DFATranslator use NFATranslator and it convert NFA into DFA. >>> prs = Parser() >>> dfat = DFATranslator() - >>> dfat.translate(prs.parse('(A|B)*C')).map - {(0, (Character:'C')): 1, (0, (Character:'A')): 0, (0, (Character:'B')): 0} + >>> print(dfat.translate(prs.parse('(A|B)*C')).transition) + Transition: 0 x 'A' -> 0, 0 x 'B' -> 0, 0 x 'C' -> 1, >>> dfat.translate(prs.parse('(A|B)*C')).accepts [1] - >>> dfat.translate(prs.parse('ほ?げ+')).map - {(2, (MBCharacter:'げ')): 2, (0, (MBCharacter:'げ')): 2, (0, (MBCharacter:'ほ')): 1, (1, (MBCharacter:'げ')): 2} + >>> print(dfat.translate(prs.parse('ほ?げ+')).transition) + Transition: 0 x \\x81 -> 1, 1 x \\x92 -> 5, 2 x \\x92 -> 5, 2 x \\xbb -> 3, 3 x \\xe3 -> 0, 4 x \\xe3 -> 6, 5 x \\xe3 -> 0, 6 x \\x81 -> 2, """ def __init__(self): @@ -34,61 +35,70 @@ return self.dfa def convert(self, nfa): - map_ = dict() - que = set(frozenset([nfa.epsilon_expand(set([nfa.start]))])) + transition = Transition() + que = set() done = set() + que.add(nfa.epsilon_expand(nfa.start)) while que: states = que.pop() anys = set() for state in states: - for k, v in nfa.map.iteritems(): - if state == k[0] and k[1] == AnyChar(): - anys.update(nfa.epsilon_expand(v)) + trans = nfa.transition[state] + if trans == None: continue + for i, n in trans.iteritems(): + if i == AnyChar(): + anys.update(nfa.epsilon_expand(n)) for state in states: - for k, v in nfa.map.iteritems(): - if state == k[0] and k[1] != '': + trans = nfa.transition[state] + if trans == None: continue + for i, n in trans.iteritems(): + if i != '': default = set(anys) - slot = map_.setdefault((states, k[1]), default) - slot.update(nfa.epsilon_expand(v)) - + if transition[states, i] == None: + slot = default + transition[states, i] = slot + else: + slot = transition[states, i] + slot.update(nfa.epsilon_expand(n)) done.add(states) - for v in map_.itervalues(): - if not v in done: - que.add(frozenset(v)) + for (_, _), n in transition.itertrans(): + if not n in done: + que.add(frozenset(n)) assoc = dict() states = set() + states.add(nfa.epsilon_expand(nfa.start)) - states.add(nfa.epsilon_expand(frozenset([nfa.start]))) - for state in map_.itervalues(): + for _, state in transition.itertrans(): states.add(frozenset(state)) for index, state in enumerate(states): - assoc[frozenset(state)] = index + assoc[state] = index states = list() for state in assoc.itervalues(): states.append(state) - start = assoc[nfa.epsilon_expand(frozenset([nfa.start]))] + start = assoc[nfa.epsilon_expand(nfa.start)] accepts = list() for state in assoc.iterkeys(): if state & set(nfa.accepts): accepts.append(assoc[state]) - map__ = dict() - for (s, c), v in map_.iteritems(): - map__[(assoc[frozenset(s)], c)] = assoc[frozenset(v)] + transition_ = Transition() + + for (s, i), n in transition.itertrans(): + transition_[(assoc[s], i)] = assoc[frozenset(n)] for k, v in assoc.items(): assoc[v] = k return DFA( + transition_, start, accepts, - map__, states, assoc ) diff -r 5db856953793 -r 4d498b002de5 pyrect/regexp/fa.py --- a/pyrect/regexp/fa.py Sun Dec 12 19:02:37 2010 +0900 +++ b/pyrect/regexp/fa.py Sun Dec 12 23:04:31 2010 +0900 @@ -1,11 +1,10 @@ #!/usr/bin/env python class FA(object): - def __init__(self, transition, start, accepts, map_, states=None): + def __init__(self, transition, start, accepts, states=None): self.transition = transition self.start = start self.accepts = accepts - self.map = map_ self.states = states def accept(self, laungage): @@ -39,12 +38,12 @@ def __str__(self): string = "Transition:" - for (s, i), ns in self.iterallitems(): + for (s, i), ns in self.itertrans(): string += " %s x %s -> %s," % (s, i, ns) return string def iterstates(self): - for s, t self.iteritems(): + for s, t in self.iteritems(): yield s, t def itertrans(self): diff -r 5db856953793 -r 4d498b002de5 pyrect/regexp/nfa.py --- a/pyrect/regexp/nfa.py Sun Dec 12 19:02:37 2010 +0900 +++ b/pyrect/regexp/nfa.py Sun Dec 12 23:04:31 2010 +0900 @@ -3,16 +3,20 @@ import curses.ascii as ascii import copy import collections -from pyrect.regexp.fa import FA +from pyrect.regexp.fa import FA, Transition class NFAFragment(object): def __init__(self): self.start = None # integer self.accepts = set() # frozenset - self.map = dict() # transition, char or special -> frozenset([states]) + self.transition = Transition() # transition, char or special -> frozenset([states]) def connect(self, from_, char, to): - slot = self.map.setdefault((from_, char), set()) + if self.transition[from_, char]: + slot = self.transition[from_, char] + else: + slot = set() + self.transition[from_, char] = slot slot.add(to) def accept_to(self, from_, char): @@ -26,46 +30,44 @@ def new_skelton(self): # copy fragment (self), and it return newFrag = NFAFragment(); - newFrag.map = copy.deepcopy(self.map) + newFrag.transition = copy.deepcopy(self.transition) return newFrag def __or__(self, frag): newFrag = self.new_skelton() - for k, v in frag.map.iteritems(): - newFrag.map[k] = v.copy() + for k, v in frag.transition.iterstates(): + newFrag.transition[k] = copy.deepcopy(v) return newFrag def __str__(self): return ("NFA: delta -> %s, start -> %s, accepts -> %s" % - (self.map, self.start, self.accepts)) + (self.transition, self.start, self.accepts)) def build(self): - map_ = self.map - def transition(state, char): - return frozenset(map_.get((state, char), [])) - states = set() - for (s, _) in self.map.iterkeys(): + for s in self.transition.iterkeys(): states.add(s) return NFA( - transition, + self.transition, self.start, self.accepts, - self.map, sorted(states) ) class NFA(FA): - def epsilon_expand(self, set_): - que = set(set_) + def epsilon_expand(self, states): + if not isinstance(states, set) and \ + not isinstance(states, list): + states = set([states]) + states = copy.deepcopy(states) done = set() - while que: - stat = que.pop() - nexts = self.transition(stat, "") + while states: + stat = states.pop() done.add(stat) - for nextStat in nexts: - if not nextStat in done: que.add(nextStat) + if self.transition[stat, ""] != None: + for n in self.transition[stat, ""]: + if not n in done: states.add(n) return frozenset(done) class SuffixNFA(NFA): @@ -73,40 +75,36 @@ self.start = nfa.start self.accepts = copy.deepcopy(nfa.accepts) self.states = copy.deepcopy(nfa.states) - self.map = self.nfa_map(nfa) + self.transition = self.nfa_trans(nfa) start = set(self.states)-set([self.start]) if self.map.has_key((self.start, '')): self.map[(self.start, '')].add(start) else: self.map[(self.start, '')] = start - def nfa_map(self, fa): + def nfa_trans(self, fa): if type(fa) == NFA: - return copy.deepcopy(fa.map) - nmap = dict() - for k, v in fa.map.items(): - nmap[k] = set([v]) - return nmap - - def transition(self, state, char): - return frozenset(self.map.get((state, char), [])) + return copy.deepcopy(fa.transition) + ntrans = dict() + for k, v in fa.map.itertrans(): + ntrans[k] = set([v]) + return ntrans class SuffixTrieNFA(NFA): def __init__(self, dfa): self.start = dfa.start self.pdfa = dfa - self.map = dict() self.states = copy.deepcopy(dfa.states) self.acceptable = collections.defaultdict(set) self.accepts = copy.deepcopy(set(dfa.accepts)) states = [self.start] + list(set(self.states)-set((self.start,))) - self.make_trie_map(states) + self.make_trie_trans(states) - def make_trie_map(self, states): + def make_trie_trans(self, states): add_states = set() offset = [x * len(states) for x in range(len(states))] for s, ind in zip(states, offset): - for (s_, i), n in self.pdfa.map.iteritems(): + for (s_, i), n in self.pdfa.trans.itertrans(): sind = s_ + ind nind = n + ind add_states.add((sind, nind)) @@ -120,7 +118,4 @@ starts.add(self.start + i + offset[i]) self.states = list(add_states) - self.map[(self.start, '')] = starts - - def transition(self, state, char): - return frozenset(self.map.get((state, char), [])) + self.trans[(self.start, '')] = starts diff -r 5db856953793 -r 4d498b002de5 pyrect/regexp/nfa_translator.py --- a/pyrect/regexp/nfa_translator.py Sun Dec 12 19:02:37 2010 +0900 +++ b/pyrect/regexp/nfa_translator.py Sun Dec 12 23:04:31 2010 +0900 @@ -11,10 +11,11 @@ >>> prs = Parser() >>> nfat = NFATranslator() >>> nfa = nfat.translate(prs.parse('(A|B)*C')) - >>> nfa.map - {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (4, (Character:'C')): set([5]), (0, (Character:'A')): set([3])} - >>> nfa = nfat.translate(prs.parse('A{2}B{3,}C{4,5}')) - >>> nfa.map + >>> print nfa.transition + Transition: 0 x 'A' -> set([3]), 1 x 'B' -> set([3]), 2 x -> set([0, 1]), 3 x -> set([2, 4]), 4 x 'C' -> set([5]), + >>> nfa = nfat.translate(prs.parse('[A-C]{2}D{3,}E{4,5}')) + >>> print nfa.transition + Transition: 0 x 'A' -> set([5]), 1 x 'B' -> set([5]), 2 x -> set([0, 1]), 3 x 'C' -> set([5]), 4 x -> set([2, 3]), 5 x REPEAT -> set([4]), 5 x ['A'-'C']{2} -> set([7]), 7 x 'D' -> set([8]), 8 x REPEAT -> set([7]), 8 x 'D'{3,} -> set([10]), 10 x 'E' -> set([11]), 11 x REPEAT -> set([10]), 11 x 'E'{4, 5} -> set([13]), """ def __init__(self):