changeset 87:d23f12ce0369

add suffix-dfa, it's used to be parallel-matching-algorithm (not implement, yet).
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Sun, 14 Nov 2010 04:16:12 +0900
parents b34a900a3a0b
children dafb393108f3
files pyrect/converter.py pyrect/regexp/__init__.py pyrect/regexp/dfa.py pyrect/regexp/dfa_translator.py pyrect/regexp/nfa.py pyrect/regexp/nfa_translator.py pyrect/translator/dot_translator.py
diffstat 7 files changed, 190 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/pyrect/converter.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/converter.py	Sun Nov 14 04:16:12 2010 +0900
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 
 import sys
-from pyrect.regexp import Regexp
+from pyrect.regexp import Regexp, CallGraph, DFATranslator, SuffixDFATranslator, SuffixTrieTranslator
+from pyrect.regexp.nfa import SuffixNFA
 from pyrect.translator import *
 from optparse import OptionParser
 
@@ -14,6 +15,8 @@
     psr.add_option("--Dot", action="store_true", dest="emit_dot", default=False, help="emit Dot-source")
     psr.add_option("--from-dfa", action="store_true", dest="dfa", default=True, help="translate from DFA")
     psr.add_option("--from-nfa", action="store_true", dest="nfa", default=False, help="translate from NFA")
+    psr.add_option("--from-snfa", action="store_true", dest="snfa", default=False, help="translate from SuffixNFA")
+    psr.add_option("--from-sdfa", action="store_true", dest="sdfa", default=False, help="translate from SuffixDFA")
     psr.add_option("-o", action="store", type="string", dest="output", default=False, help="output file", metavar="FILE")
     psr.add_option("-O", action="store_true", dest="optimize", default=False, help="do optimization (only in llvm).")
     psr.add_option("-g", action="store_true", dest="debug", default=False, help="embed debug info")
@@ -29,6 +32,15 @@
 
     if opts.nfa:
         fa = "NFA"
+    elif opts.snfa:
+        fa = "NFA"
+        reg.nfa = SuffixNFA(reg.nfa)
+        reg.nfacg = CallGraph(reg.nfa)
+    elif opts.sdfa:
+        fa = "DFA"
+        sdfa = SuffixDFATranslator().translate(reg.dfa)
+        reg.dfa = sdfa
+        reg.dfacg = CallGraph(sdfa)
     else:
         fa = "DFA"
 
--- a/pyrect/regexp/__init__.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/regexp/__init__.py	Sun Nov 14 04:16:12 2010 +0900
@@ -4,7 +4,7 @@
 from pyrect.regexp.dfa import DFA
 from pyrect.regexp.nfa import NFA
 from pyrect.regexp.nfa_translator import NFATranslator
-from pyrect.regexp.dfa_translator import DFATranslator
+from pyrect.regexp.dfa_translator import DFATranslator, SuffixDFATranslator, SuffixTrieTranslator
 from pyrect.regexp.callgraph import CallGraph
 from pyrect.regexp.analyzer import Analyzer
 from pyrect.regexp.char_collector import CharCollector
--- a/pyrect/regexp/dfa.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/regexp/dfa.py	Sun Nov 14 04:16:12 2010 +0900
@@ -1,18 +1,61 @@
 #!/usr/bin/env python
 
 import curses.ascii as ascii
-import copy
+import copy, collections
 from fa import FA
 
 class DFA(FA):
-    def __init__(self, start, accepts, map_, states):
+    def __init__(self, start, accepts, map_, states, assoc):
         def transition(state, input):
             return map_[state, input]
         FA.__init__(self, transition, start, accepts, map_, states)
+        self.assoc = assoc
+
+    def copy_from(self, dfa):
+        self.start = dfa.start
+        self.map = copy.deepcopy(dfa.map)
+        self.accepts = copy.deepcopy(dfa.accepts)
+        self.states = copy.deepcopy(dfa.states)
+        self.assoc = copy.deepcopy(dfa.assoc)
+
+    @classmethod
+    def copy(cls, dfa_):
+        dfa = DFA(dfa_)
+        dfa.copy_from(dfa)
+        return dfa
 
     def get_runtime(self):
         return DFARuntime(self)
 
+class SuffixDFA(DFA):
+    def __init__(self, dfa, pdfa):
+        self.copy_from(dfa)
+        self.pdfa = pdfa
+        self.acceptable = dict()
+        self.make_acceptable()
+
+    def setdict(self):
+        return collections.defaultdict(set)
+
+    def make_acceptable(self):
+        self.acceptable = collections.defaultdict(self.setdict)
+
+        def search_acceptable(pcur, scur, searched, history, pstart):
+            for (pcur_, i), pnext in self.pdfa.map.iteritems():
+                if pcur_ != pcur or (pnext, i) in searched: continue
+                searched_ = searched + [(pnext, i)]
+                snext = self.map[(scur, i)]
+                if scur  not in history:
+                    history_ = history + (scur,)
+                else:
+                    history_ = history
+                if pnext in self.pdfa.accepts:
+                    self.acceptable[snext][history_].add(pstart)
+                search_acceptable(pnext, snext, searched_, history_, pstart)
+
+        for s in self.pdfa.states:
+            search_acceptable(s, self.start, [], (), s)
+
 class DFARuntime(object):
     def __init__(self, DFA):
         self.DFA = DFA
--- a/pyrect/regexp/dfa_translator.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/regexp/dfa_translator.py	Sun Nov 14 04:16:12 2010 +0900
@@ -3,9 +3,9 @@
 
 from pyrect.regexp.parser import Parser
 from pyrect.regexp.ast import ASTWalker, AnyChar
-from pyrect.regexp.nfa import NFA
 from pyrect.regexp.nfa_translator import NFATranslator
-from pyrect.regexp.dfa import DFA
+from pyrect.regexp.nfa import NFA, SuffixNFA, SuffixTrieNFA
+from pyrect.regexp.dfa import DFA, SuffixDFA
 
 class DFATranslator(object):
     """ Create DFA from AST with Visitor-Pattern.
@@ -60,36 +60,57 @@
                 if not v in done:
                     que.add(frozenset(v))
 
-        name_hash = dict()
+        assoc = dict()
         states = set()
 
         states.add(nfa.epsilon_expand(frozenset([nfa.start])))
         for state in map_.itervalues():
             states.add(frozenset(state))
         for index, state in enumerate(states):
-            name_hash[frozenset(state)] = index
+            assoc[frozenset(state)] = index
         states = list()
-        for state in name_hash.itervalues():
+        for state in assoc.itervalues():
             states.append(state)
 
-        start = name_hash[nfa.epsilon_expand(frozenset([nfa.start]))]
+        start = assoc[nfa.epsilon_expand(frozenset([nfa.start]))]
 
         accepts = list()
-        for state in name_hash.iterkeys():
+        for state in assoc.iterkeys():
             if state & set(nfa.accepts):
-                accepts.append(name_hash[state])
+                accepts.append(assoc[state])
 
         map__ = dict()
-        for key, value in map_.iteritems():
-            map__[(name_hash[frozenset(key[0])],key[1])] = name_hash[frozenset(value)]
+        for (s, c), v in map_.iteritems():
+            map__[(assoc[frozenset(s)], c)] = assoc[frozenset(v)]
+        for k, v in assoc.items():
+            assoc[v] = k
 
         return DFA(
             start,
             accepts,
             map__,
-            states
+            states,
+            assoc
             )
 
+class SuffixDFATranslator(DFATranslator):
+    def __init__(self):
+        self.sdfa = None
+
+    def translate(self, fa=None):
+        if fa:
+            self.sdfa = SuffixDFA(DFATranslator.translate(self, SuffixNFA(fa)), fa)
+        return self.sdfa
+
+class SuffixTrieTranslator(DFATranslator):
+    def __init__(self):
+        self.trie = None
+
+    def translate(self, fa=None):
+        if fa:
+            self.trie = DFATranslator().translate(SuffixTrieNFA(fa))
+        return self.trie
+
 def test():
     import doctest
     doctest.testmod()
--- a/pyrect/regexp/nfa.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/regexp/nfa.py	Sun Nov 14 04:16:12 2010 +0900
@@ -2,6 +2,7 @@
 
 import curses.ascii as ascii
 import copy
+import collections
 from pyrect.regexp.fa import FA
 
 class NFAFragment(object):
@@ -43,11 +44,16 @@
         def transition(state, char):
             return frozenset(map_.get((state, char), []))
 
+        states = set()
+        for (s, _) in self.map.iterkeys():
+            states.add(s)
+
         return NFA(
             transition,
             self.start,
             self.accepts,
-            self.map
+            self.map,
+            sorted(states)
             )
 
 class NFA(FA):
@@ -61,3 +67,59 @@
             for nextStat in nexts:
                 if not nextStat in done: que.add(nextStat)
         return frozenset(done)
+
+class SuffixNFA(NFA):
+    def __init__(self, nfa):
+        self.start = nfa.start
+        self.accepts = copy.deepcopy(nfa.accepts)
+        self.states = copy.deepcopy(nfa.states)
+        self.map = self.nfa_map(nfa)
+        start = set(self.states)-set([self.start])
+        if self.map.has_key((self.start, '')):
+            self.map[(self.start, '')].add(start)
+        else:
+            self.map[(self.start, '')] = start
+
+    def nfa_map(self, fa):
+        if type(fa) == NFA:
+            return copy.deepcopy(fa.map)
+        nmap = dict()
+        for k, v in fa.map.items():
+            nmap[k] = set([v])
+        return nmap
+
+    def transition(self, state, char):
+        return frozenset(self.map.get((state, char), []))
+
+#TODO: bug-fix
+class SuffixTrieNFA(NFA):
+    def __init__(self, dfa):
+        self.start = dfa.start
+        self.pdfa = dfa
+        self.map = dict()
+        self.states = copy.deepcopy(dfa.states)
+        self.acceptable = collections.defaultdict(set)
+        self.accepts = copy.deepcopy(set(dfa.accepts))
+        states = [self.start] + list(set(self.states)-set((self.start,)))
+        self.make_trie_map(states)
+
+    def make_trie_map(self, states):
+        add_states = set()
+        offset = [x * len(states) for x in range(len(states))]
+        for s, ind in zip(states, offset):
+            for (s_, i), n in self.pdfa.map.iteritems():
+                if s_ != s: continue
+                sind = s + ind
+                nind = n + ind
+                add_states.add((sind, nind))
+                if n in self.pdfa.accepts:
+                    self.accepts.add(nind)
+                    self.acceptable[nind].add(s)
+                self.map[(sind, i)] = set((nind,))
+
+        self.states = list(add_states)
+        self.map[(self.start, '')] = add_states.difference((self.start, ))
+        print self.map
+
+    def transition(self, state, char):
+        return frozenset(self.map.get((state, char), []))
--- a/pyrect/regexp/nfa_translator.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/regexp/nfa_translator.py	Sun Nov 14 04:16:12 2010 +0900
@@ -10,11 +10,12 @@
     AST (ast), is represented by Node-Tree.
     >>> prs  = Parser()
     >>> nfat = NFATranslator()
-    >>> nfat.translate(prs.parse('(A|B)*C')).map
-    {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (0, (Character:'A')): set([3]), (4, (Character:'C')): set([5])}
-    >>> nfat.translate(prs.parse('(A|B)*C')).accepts
-    [5]
-    >>> nfat.translate(prs.parse('ほ?げ+')).map
+    >>> nfa = nfat.translate(prs.parse('(A|B)*C'))
+    >>> nfa.map
+    {(3, ''): set([2, 4]), (2, ''): set([0, 1]), (1, (Character:'B')): set([3]), (4, (Character:'C')): set([5]), (0, (Character:'A')): set([3])}
+    >>> anfa = ANFA(nfa)
+    >>> anfa.map
+    {(1, (Character:'B')): set([3]), (0, (Character:'A')): set([3]), (3, ''): set([2, 4]), (6, ''): set([0, 1, 2, 3, 4, 5]), (2, ''): set([0, 1]), (4, (Character:'C')): set([5])}
     """
 
     def __init__(self):
--- a/pyrect/translator/dot_translator.py	Thu Nov 11 10:17:11 2010 +0900
+++ b/pyrect/translator/dot_translator.py	Sun Nov 14 04:16:12 2010 +0900
@@ -4,6 +4,7 @@
 
 from translator import Translator
 from pyrect.regexp import Regexp
+from pyrect.regexp.dfa import SuffixDFA
 
 class DotTranslator(Translator):
     """
@@ -43,8 +44,7 @@
 
         # edge to start state
         self.emit("start [shape=point]")
-        self.emit("start -> %s" % self.state_name(self.cg.start))
-        self.emit()
+        self.emit("start -> %s" % self.state_name(self.cg.start), 2)
 
         for cur_state, trans in self.cg.map.iteritems():
             for input, next_states in trans.iteritems():
@@ -53,8 +53,34 @@
                 for next_state in next_states:
                     self.emit("%s -> %s [label=\"%s\"]"
                               % (self.state_name(cur_state), self.state_name(next_state), input))
-        self.dedent()
-        self.emit("}", 2)
+
+        if type(self.regexp.dfa) == SuffixDFA:
+            sdfa = self.regexp.dfa
+            pdfa = sdfa.pdfa
+            color = "fillcolor=\"#99cc99\", style=filled, color = \"#000000\""
+            arrowstyle = "arrowhead = odot, dir = both"
+            for s in sdfa.states:
+                s_is = sdfa.assoc[s] & set(pdfa.states)
+                self.emit("%s_is [shape=box,label=\"%s\", %s, %s]"
+                          % (self.state_name(str(s)), ", ".join(map(self.state_name, map(str, s_is))), color, arrowstyle))
+                self.emit("%s -> %s_is [label=\"is\"]" % (self.state_name(str(s)), (self.state_name(str(s)))))
+
+            color = "fillcolor=\"#cc9999\", style=filled, color = \"#000000\""
+            arrowstyle = "arrowhead = odot, dir = both"
+            for s in sdfa.acceptable.iterkeys():
+                msg = list()
+                def pretty(hist, s):
+                    def p1 (s):
+                        return self.state_name(str(s))
+                    def p2 (s):
+                        return "p"+str(s)
+                    return " -> ".join(map(p1, hist)) + " := " + ", ".join(map(p2, s))
+                for hist, ss in sdfa.acceptable[s].iteritems():
+                    msg.append(pretty(hist, ss))
+                self.emit("%s_accept [shape=box,label=\"%s\", %s, %s]" % (self.state_name(str(s)), "\\n".join(msg), color, arrowstyle))
+                self.emit("%s -> %s_accept [label=\"acceptable\"]" % (self.state_name(str(s)), (self.state_name(str(s)))))
+
+        self.emitd("}", 2)
 
 def test():
     import doctest