grammar-inference-engine/bex/koa.py

106 lines
3 KiB
Python
Raw Normal View History

"""k-OA — k-Occurrence Automaton (Definition 4.1, arXiv 1004.2372).
A k-OA is like a SOA but each symbol appears at most k times as a state label.
"""
from .soa import SOA
from .expr import strip_k
class KOA(SOA):
"""k-Occurrence Automaton.
Same structure as SOA but each symbol may label up to k states.
"""
def __init__(self, k=1):
super().__init__()
self.k = k
self._symbol_count = {}
def add_state(self, label):
nid = super().add_state(label)
sym = strip_k(label)
self._symbol_count.setdefault(sym, 0)
self._symbol_count[sym] += 1
return nid
def remove_state(self, nid):
label = self._label.get(nid)
if label:
sym = strip_k(label)
self._symbol_count[sym] -= 1
super().rm_state(nid)
def count_symbol(self, symbol):
return self._symbol_count.get(strip_k(symbol), 0)
def symbol_ok(self, symbol):
return self.count_symbol(symbol) < self.k
def is_deterministic(self):
for n in self._succ:
label_map = {}
for t in self._succ[n]:
lab = self._label.get(t)
if lab:
base = strip_k(lab)
if base in label_map:
return False
label_map[base] = t
return True
def accept(self, w):
"""Accept using base symbols (strip k-markers from state labels)."""
cur = {self.src}
for sym in w:
nxt = set()
for s in cur:
for t in self._succ.get(s, set()):
lab = self._label.get(t)
if lab and strip_k(lab) == sym:
nxt.add(t)
if not nxt:
return False
cur = nxt
return any(self.sink in self._succ.get(s, set()) for s in cur)
def succ_labeled(self, nid, symbol):
return {t for t in self._succ.get(nid, set()) if strip_k(self._label.get(t) or '') == symbol}
def build_complete_koa(sequences, k):
"""Build complete k-OA Ck (Definition 4.2, arXiv 1004.2372).
For each a Σ(S), exactly k states labeled a (a_1 ... a_k).
- src connected to exactly one a_i for each a
- Every state has edge to every other state (except src)
- src sink edge (for ε)
"""
G = KOA(k=k)
alphabet = set()
for seq in sequences:
for token in seq:
alphabet.add(token)
symbol_states = {}
for sym in alphabet:
state_ids = []
for i in range(1, k + 1):
nid = G.add_state(f"{sym}_{i}")
state_ids.append(nid)
G.add_edge(G.src, nid)
symbol_states[sym] = state_ids
all_states = [n for n in G._succ if n not in (G.src, G.sink)]
for s in all_states:
for t in all_states:
if s != t and not G.has_edge(s, t):
G.add_edge(s, t)
if not G.has_edge(s, G.sink):
G.add_edge(s, G.sink)
G.add_edge(G.src, G.sink)
return G, symbol_states