# HG changeset patch # User Ryoma SHINYA # Date 1289675772 -32400 # Node ID d23f12ce0369a46ae2885e21129e0efc42a2e376 # Parent b34a900a3a0b92f17290ce9b4c9883ad236320dd add suffix-dfa, it's used to be parallel-matching-algorithm (not implement, yet). diff -r b34a900a3a0b -r d23f12ce0369 pyrect/converter.py --- a/pyrect/converter.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/converter.py Sun Nov 14 04:16:12 2010 +0900 @@ -1,7 +1,8 @@ #!/usr/bin/env python import sys -from pyrect.regexp import Regexp +from pyrect.regexp import Regexp, CallGraph, DFATranslator, SuffixDFATranslator, SuffixTrieTranslator +from pyrect.regexp.nfa import SuffixNFA from pyrect.translator import * from optparse import OptionParser @@ -14,6 +15,8 @@ psr.add_option("--Dot", action="store_true", dest="emit_dot", default=False, help="emit Dot-source") psr.add_option("--from-dfa", action="store_true", dest="dfa", default=True, help="translate from DFA") psr.add_option("--from-nfa", action="store_true", dest="nfa", default=False, help="translate from NFA") + psr.add_option("--from-snfa", action="store_true", dest="snfa", default=False, help="translate from SuffixNFA") + psr.add_option("--from-sdfa", action="store_true", dest="sdfa", default=False, help="translate from SuffixDFA") psr.add_option("-o", action="store", type="string", dest="output", default=False, help="output file", metavar="FILE") psr.add_option("-O", action="store_true", dest="optimize", default=False, help="do optimization (only in llvm).") psr.add_option("-g", action="store_true", dest="debug", default=False, help="embed debug info") @@ -29,6 +32,15 @@ if opts.nfa: fa = "NFA" + elif opts.snfa: + fa = "NFA" + reg.nfa = SuffixNFA(reg.nfa) + reg.nfacg = CallGraph(reg.nfa) + elif opts.sdfa: + fa = "DFA" + sdfa = SuffixDFATranslator().translate(reg.dfa) + reg.dfa = sdfa + reg.dfacg = CallGraph(sdfa) else: fa = "DFA" diff -r b34a900a3a0b -r d23f12ce0369 pyrect/regexp/__init__.py --- a/pyrect/regexp/__init__.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/regexp/__init__.py Sun Nov 14 04:16:12 2010 +0900 @@ -4,7 +4,7 @@ from pyrect.regexp.dfa import DFA from pyrect.regexp.nfa import NFA from pyrect.regexp.nfa_translator import NFATranslator -from pyrect.regexp.dfa_translator import DFATranslator +from pyrect.regexp.dfa_translator import DFATranslator, SuffixDFATranslator, SuffixTrieTranslator from pyrect.regexp.callgraph import CallGraph from pyrect.regexp.analyzer import Analyzer from pyrect.regexp.char_collector import CharCollector diff -r b34a900a3a0b -r d23f12ce0369 pyrect/regexp/dfa.py --- a/pyrect/regexp/dfa.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/regexp/dfa.py Sun Nov 14 04:16:12 2010 +0900 @@ -1,18 +1,61 @@ #!/usr/bin/env python import curses.ascii as ascii -import copy +import copy, collections from fa import FA class DFA(FA): - def __init__(self, start, accepts, map_, states): + def __init__(self, start, accepts, map_, states, assoc): def transition(state, input): return map_[state, input] FA.__init__(self, transition, start, accepts, map_, states) + self.assoc = assoc + + def copy_from(self, dfa): + self.start = dfa.start + self.map = copy.deepcopy(dfa.map) + self.accepts = copy.deepcopy(dfa.accepts) + self.states = copy.deepcopy(dfa.states) + self.assoc = copy.deepcopy(dfa.assoc) + + @classmethod + def copy(cls, dfa_): + dfa = DFA(dfa_) + dfa.copy_from(dfa) + return dfa def get_runtime(self): return DFARuntime(self) +class SuffixDFA(DFA): + def __init__(self, dfa, pdfa): + self.copy_from(dfa) + self.pdfa = pdfa + self.acceptable = dict() + self.make_acceptable() + + def setdict(self): + return collections.defaultdict(set) + + def make_acceptable(self): + self.acceptable = collections.defaultdict(self.setdict) + + def search_acceptable(pcur, scur, searched, history, pstart): + for (pcur_, i), pnext in self.pdfa.map.iteritems(): + if pcur_ != pcur or (pnext, i) in searched: continue + searched_ = searched + [(pnext, i)] + snext = self.map[(scur, i)] + if scur not in history: + history_ = history + (scur,) + else: + history_ = history + if pnext in self.pdfa.accepts: + self.acceptable[snext][history_].add(pstart) + search_acceptable(pnext, snext, searched_, history_, pstart) + + for s in self.pdfa.states: + search_acceptable(s, self.start, [], (), s) + class DFARuntime(object): def __init__(self, DFA): self.DFA = DFA diff -r b34a900a3a0b -r d23f12ce0369 pyrect/regexp/dfa_translator.py --- a/pyrect/regexp/dfa_translator.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/regexp/dfa_translator.py Sun Nov 14 04:16:12 2010 +0900 @@ -3,9 +3,9 @@ from pyrect.regexp.parser import Parser from pyrect.regexp.ast import ASTWalker, AnyChar -from pyrect.regexp.nfa import NFA from pyrect.regexp.nfa_translator import NFATranslator -from pyrect.regexp.dfa import DFA +from pyrect.regexp.nfa import NFA, SuffixNFA, SuffixTrieNFA +from pyrect.regexp.dfa import DFA, SuffixDFA class DFATranslator(object): """ Create DFA from AST with Visitor-Pattern. @@ -60,36 +60,57 @@ if not v in done: que.add(frozenset(v)) - name_hash = dict() + assoc = dict() states = set() states.add(nfa.epsilon_expand(frozenset([nfa.start]))) for state in map_.itervalues(): states.add(frozenset(state)) for index, state in enumerate(states): - name_hash[frozenset(state)] = index + assoc[frozenset(state)] = index states = list() - for state in name_hash.itervalues(): + for state in assoc.itervalues(): states.append(state) - start = name_hash[nfa.epsilon_expand(frozenset([nfa.start]))] + start = assoc[nfa.epsilon_expand(frozenset([nfa.start]))] accepts = list() - for state in name_hash.iterkeys(): + for state in assoc.iterkeys(): if state & set(nfa.accepts): - accepts.append(name_hash[state]) + accepts.append(assoc[state]) map__ = dict() - for key, value in map_.iteritems(): - map__[(name_hash[frozenset(key[0])],key[1])] = name_hash[frozenset(value)] + for (s, c), v in map_.iteritems(): + map__[(assoc[frozenset(s)], c)] = assoc[frozenset(v)] + for k, v in assoc.items(): + assoc[v] = k return DFA( start, accepts, map__, - states + states, + assoc ) +class SuffixDFATranslator(DFATranslator): + def __init__(self): + self.sdfa = None + + def translate(self, fa=None): + if fa: + self.sdfa = SuffixDFA(DFATranslator.translate(self, SuffixNFA(fa)), fa) + return self.sdfa + +class SuffixTrieTranslator(DFATranslator): + def __init__(self): + self.trie = None + + def translate(self, fa=None): + if fa: + self.trie = DFATranslator().translate(SuffixTrieNFA(fa)) + return self.trie + def test(): import doctest doctest.testmod() diff -r b34a900a3a0b -r d23f12ce0369 pyrect/regexp/nfa.py --- a/pyrect/regexp/nfa.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/regexp/nfa.py Sun Nov 14 04:16:12 2010 +0900 @@ -2,6 +2,7 @@ import curses.ascii as ascii import copy +import collections from pyrect.regexp.fa import FA class NFAFragment(object): @@ -43,11 +44,16 @@ def transition(state, char): return frozenset(map_.get((state, char), [])) + states = set() + for (s, _) in self.map.iterkeys(): + states.add(s) + return NFA( transition, self.start, self.accepts, - self.map + self.map, + sorted(states) ) class NFA(FA): @@ -61,3 +67,59 @@ for nextStat in nexts: if not nextStat in done: que.add(nextStat) return frozenset(done) + +class SuffixNFA(NFA): + def __init__(self, nfa): + self.start = nfa.start + self.accepts = copy.deepcopy(nfa.accepts) + self.states = copy.deepcopy(nfa.states) + self.map = self.nfa_map(nfa) + start = set(self.states)-set([self.start]) + if self.map.has_key((self.start, '')): + self.map[(self.start, '')].add(start) + else: + self.map[(self.start, '')] = start + + def nfa_map(self, fa): + if type(fa) == NFA: + return copy.deepcopy(fa.map) + nmap = dict() + for k, v in fa.map.items(): + nmap[k] = set([v]) + return nmap + + def transition(self, state, char): + return frozenset(self.map.get((state, char), [])) + +#TODO: bug-fix +class SuffixTrieNFA(NFA): + def __init__(self, dfa): + self.start = dfa.start + self.pdfa = dfa + self.map = dict() + self.states = copy.deepcopy(dfa.states) + self.acceptable = collections.defaultdict(set) + self.accepts = copy.deepcopy(set(dfa.accepts)) + states = [self.start] + list(set(self.states)-set((self.start,))) + self.make_trie_map(states) + + def make_trie_map(self, states): + add_states = set() + offset = [x * len(states) for x in range(len(states))] + for s, ind in zip(states, offset): + for (s_, i), n in self.pdfa.map.iteritems(): + if s_ != s: continue + sind = s + ind + nind = n + ind + add_states.add((sind, nind)) + if n in self.pdfa.accepts: + self.accepts.add(nind) + self.acceptable[nind].add(s) + self.map[(sind, i)] = set((nind,)) + + self.states = list(add_states) + self.map[(self.start, '')] = add_states.difference((self.start, )) + print self.map + + def transition(self, state, char): + return frozenset(self.map.get((state, char), [])) diff -r b34a900a3a0b -r d23f12ce0369 pyrect/regexp/nfa_translator.py --- a/pyrect/regexp/nfa_translator.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/regexp/nfa_translator.py Sun Nov 14 04:16:12 2010 +0900 @@ -10,11 +10,12 @@ AST (ast), is represented by Node-Tree. >>> prs = Parser() >>> nfat = NFATranslator() - >>> nfat.translate(prs.parse('(A|B)*C')).map - {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (0, (Character:'A')): set([3]), (4, (Character:'C')): set([5])} - >>> nfat.translate(prs.parse('(A|B)*C')).accepts - [5] - >>> nfat.translate(prs.parse('ほ?げ+')).map + >>> nfa = nfat.translate(prs.parse('(A|B)*C')) + >>> nfa.map + {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (4, (Character:'C')): set([5]), (0, (Character:'A')): set([3])} + >>> anfa = ANFA(nfa) + >>> anfa.map + {(1, (Character:'B')): set([3]), (0, (Character:'A')): set([3]), (3, ''): set([2, 4]), (6, ''): set([0, 1, 2, 3, 4, 5]), (2, ''): set([0, 1]), (4, (Character:'C')): set([5])} """ def __init__(self): diff -r b34a900a3a0b -r d23f12ce0369 pyrect/translator/dot_translator.py --- a/pyrect/translator/dot_translator.py Thu Nov 11 10:17:11 2010 +0900 +++ b/pyrect/translator/dot_translator.py Sun Nov 14 04:16:12 2010 +0900 @@ -4,6 +4,7 @@ from translator import Translator from pyrect.regexp import Regexp +from pyrect.regexp.dfa import SuffixDFA class DotTranslator(Translator): """ @@ -43,8 +44,7 @@ # edge to start state self.emit("start [shape=point]") - self.emit("start -> %s" % self.state_name(self.cg.start)) - self.emit() + self.emit("start -> %s" % self.state_name(self.cg.start), 2) for cur_state, trans in self.cg.map.iteritems(): for input, next_states in trans.iteritems(): @@ -53,8 +53,34 @@ for next_state in next_states: self.emit("%s -> %s [label=\"%s\"]" % (self.state_name(cur_state), self.state_name(next_state), input)) - self.dedent() - self.emit("}", 2) + + if type(self.regexp.dfa) == SuffixDFA: + sdfa = self.regexp.dfa + pdfa = sdfa.pdfa + color = "fillcolor=\"#99cc99\", style=filled, color = \"#000000\"" + arrowstyle = "arrowhead = odot, dir = both" + for s in sdfa.states: + s_is = sdfa.assoc[s] & set(pdfa.states) + self.emit("%s_is [shape=box,label=\"%s\", %s, %s]" + % (self.state_name(str(s)), ", ".join(map(self.state_name, map(str, s_is))), color, arrowstyle)) + self.emit("%s -> %s_is [label=\"is\"]" % (self.state_name(str(s)), (self.state_name(str(s))))) + + color = "fillcolor=\"#cc9999\", style=filled, color = \"#000000\"" + arrowstyle = "arrowhead = odot, dir = both" + for s in sdfa.acceptable.iterkeys(): + msg = list() + def pretty(hist, s): + def p1 (s): + return self.state_name(str(s)) + def p2 (s): + return "p"+str(s) + return " -> ".join(map(p1, hist)) + " := " + ", ".join(map(p2, s)) + for hist, ss in sdfa.acceptable[s].iteritems(): + msg.append(pretty(hist, ss)) + self.emit("%s_accept [shape=box,label=\"%s\", %s, %s]" % (self.state_name(str(s)), "\\n".join(msg), color, arrowstyle)) + self.emit("%s -> %s_accept [label=\"acceptable\"]" % (self.state_name(str(s)), (self.state_name(str(s))))) + + self.emitd("}", 2) def test(): import doctest