grammar-inference-engine/bex/koa.py
tobjend 7c00c6713d Initial commit: BEX-based grammar inference engine
- CRX: direct CHARE inference (Algorithm 7, TODS 2010)
- iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010)
- RWR₀: SORE repair (Algorithm 6, TODS 2010)
- rwr²: k-ORE extraction (Algorithm 3, arXiv 2010)
- SOA, k-OA, iKoa, 2T-INF, Baum-Welch
- Ansible role grammar adapter
- Generic YAML key-path converter
- 28 tests, all passing
2026-07-01 08:01:16 +02:00

105 lines
3 KiB
Python

"""k-OA — k-Occurrence Automaton (Definition 4.1, arXiv 1004.2372).
A k-OA is like a SOA but each symbol appears at most k times as a state label.
"""
from .soa import SOA
from .expr import strip_k
class KOA(SOA):
"""k-Occurrence Automaton.
Same structure as SOA but each symbol may label up to k states.
"""
def __init__(self, k=1):
super().__init__()
self.k = k
self._symbol_count = {}
def add_state(self, label):
nid = super().add_state(label)
sym = strip_k(label)
self._symbol_count.setdefault(sym, 0)
self._symbol_count[sym] += 1
return nid
def remove_state(self, nid):
label = self._label.get(nid)
if label:
sym = strip_k(label)
self._symbol_count[sym] -= 1
super().rm_state(nid)
def count_symbol(self, symbol):
return self._symbol_count.get(strip_k(symbol), 0)
def symbol_ok(self, symbol):
return self.count_symbol(symbol) < self.k
def is_deterministic(self):
for n in self._succ:
label_map = {}
for t in self._succ[n]:
lab = self._label.get(t)
if lab:
base = strip_k(lab)
if base in label_map:
return False
label_map[base] = t
return True
def accept(self, w):
"""Accept using base symbols (strip k-markers from state labels)."""
cur = {self.src}
for sym in w:
nxt = set()
for s in cur:
for t in self._succ.get(s, set()):
lab = self._label.get(t)
if lab and strip_k(lab) == sym:
nxt.add(t)
if not nxt:
return False
cur = nxt
return any(self.sink in self._succ.get(s, set()) for s in cur)
def succ_labeled(self, nid, symbol):
return {t for t in self._succ.get(nid, set()) if strip_k(self._label.get(t) or '') == symbol}
def build_complete_koa(sequences, k):
"""Build complete k-OA Ck (Definition 4.2, arXiv 1004.2372).
For each a ∈ Σ(S), exactly k states labeled a (a_1 ... a_k).
- src connected to exactly one a_i for each a
- Every state has edge to every other state (except src)
- src → sink edge (for ε)
"""
G = KOA(k=k)
alphabet = set()
for seq in sequences:
for token in seq:
alphabet.add(token)
symbol_states = {}
for sym in alphabet:
state_ids = []
for i in range(1, k + 1):
nid = G.add_state(f"{sym}_{i}")
state_ids.append(nid)
G.add_edge(G.src, nid)
symbol_states[sym] = state_ids
all_states = [n for n in G._succ if n not in (G.src, G.sink)]
for s in all_states:
for t in all_states:
if s != t and not G.has_edge(s, t):
G.add_edge(s, t)
if not G.has_edge(s, G.sink):
G.add_edge(s, G.sink)
G.add_edge(G.src, G.sink)
return G, symbol_states