grammar-inference-engine/bex/soa.py
tobjend 7c00c6713d Initial commit: BEX-based grammar inference engine
- CRX: direct CHARE inference (Algorithm 7, TODS 2010)
- iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010)
- RWR₀: SORE repair (Algorithm 6, TODS 2010)
- rwr²: k-ORE extraction (Algorithm 3, arXiv 2010)
- SOA, k-OA, iKoa, 2T-INF, Baum-Welch
- Ansible role grammar adapter
- Generic YAML key-path converter
- 28 tests, all passing
2026-07-01 08:01:16 +02:00

193 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""SOA — Single Occurrence Automaton (Definition 6, TODS 2010)."""
import copy
from .expr import concat, disj, star, optional
class SOA:
    """Node-labeled single occurrence automaton (Definition 6, TODS 2010).

    V = {src, sink} ∪ symbol-labeled states.
    E ⊆ V × V, unlabeled edges.
    A walk src=v₁,v₂,...,vₙ₊₁=sink accepts the word lab(v₂)...lab(vₙ).
    States are proper SOREs, pairwise alphabet-disjoint (Definition 10).

    States are identified by consecutive integers handed out by ``_new``.
    ``src`` and ``sink`` are created first and carry the label ``None``.
    """

    def __init__(self):
        """Create an automaton containing only the src and sink states."""
        self._next = 0    # next unused state id
        self._succ = {}   # state id -> set of successor ids
        self._pred = {}   # state id -> set of predecessor ids
        self._label = {}  # state id -> label (None for src/sink)
        self.src = self._new()
        self.sink = self._new()

    def _new(self) -> int:
        """Allocate a fresh, unlabeled, unconnected state and return its id."""
        n = self._next
        self._next += 1
        self._succ[n] = set()
        self._pred[n] = set()
        self._label[n] = None
        return n

    def add_state(self, label) -> int:
        """Add a new state carrying ``label`` and return its id."""
        n = self._new()
        self._label[n] = label
        return n

    def add_edge(self, f, t):
        """Add the edge f→t. Both states must exist (KeyError otherwise)."""
        self._succ[f].add(t)
        self._pred[t].add(f)

    def rm_edge(self, f, t):
        """Remove the edge f→t if present. Both states must exist."""
        self._succ[f].discard(t)
        self._pred[t].discard(f)

    def rm_state(self, n):
        """Remove state ``n`` together with all edges touching it.

        ``src`` and ``sink`` are never removed. Unknown (for instance already
        removed) states are ignored, mirroring the tolerant lookups used by
        ``label``/``succ``/``pred``; this also lets ``contract(r, r, ...)``
        remove the same state twice without failing.
        """
        if n in (self.src, self.sink) or n not in self._succ:
            return
        # Iterate over snapshots: rm_edge mutates the underlying sets.
        for p in list(self._pred[n]):
            self.rm_edge(p, n)
        for s in list(self._succ[n]):
            self.rm_edge(n, s)
        del self._label[n]
        del self._succ[n]
        del self._pred[n]

    def label(self, n):
        """Return the label of ``n``, or ``None`` if unlabeled or unknown."""
        return self._label.get(n)

    def set_label(self, n, lab):
        """Set the label of state ``n`` to ``lab``."""
        self._label[n] = lab

    def succ(self, n) -> set:
        """Return a copy of the successor set of ``n`` (empty if unknown)."""
        return set(self._succ.get(n, ()))

    def pred(self, n) -> set:
        """Return a copy of the predecessor set of ``n`` (empty if unknown)."""
        return set(self._pred.get(n, ()))

    def has_edge(self, f, t) -> bool:
        """Return True if the edge f→t exists."""
        return t in self._succ.get(f, ())

    def states(self) -> list:
        """Return the ids of all labeled states other than src and sink.

        States whose label is ``None`` are skipped. The order is the
        insertion order of the internal successor map.
        """
        special = (self.src, self.sink)
        return [
            n for n in self._succ
            if n not in special and self._label.get(n) is not None
        ]

    def _pred_plus(self, n) -> set:
        """Return the predecessors of ``n``, plus ``n`` if its label ends in '+'.

        NOTE(review): unlike ``epsilon_closure``, a label ending in '+?' is
        not treated as self-looping here; confirm this is intended.
        """
        r = set(self._pred.get(n, ()))
        lab = self._label.get(n)
        if lab and lab.endswith('+'):
            r.add(n)
        return r

    def _succ_plus(self, n) -> set:
        """Return the successors of ``n``, plus ``n`` if its label ends in '+'.

        NOTE(review): unlike ``epsilon_closure``, a label ending in '+?' is
        not treated as self-looping here; confirm this is intended.
        """
        r = set(self._succ.get(n, ()))
        lab = self._label.get(n)
        if lab and lab.endswith('+'):
            r.add(n)
        return r

    def copy(self) -> "SOA":
        """Return an independent deep copy of this automaton."""
        return copy.deepcopy(self)

    def accept(self, w) -> bool:
        """Return True if the automaton accepts the symbol sequence ``w``.

        Starting from ``{src}``, each symbol moves to every successor whose
        label equals that symbol. The word is accepted if, after the last
        symbol, some current state has an edge to ``sink``. Labels are
        compared for equality with the symbols of ``w``.
        """
        cur = {self.src}
        for sym in w:
            cur = {
                t
                for s in cur
                for t in self._succ.get(s, ())
                if self._label.get(t) == sym
            }
            if not cur:
                return False
        return any(self.sink in self._succ.get(s, ()) for s in cur)

    def sink_reachable(self) -> bool:
        """Return True if ``sink`` can be reached from ``src`` along edges."""
        seen = set()
        stack = [self.src]
        while stack:
            s = stack.pop()
            if s == self.sink:
                return True
            if s in seen:
                continue
            seen.add(s)
            stack.extend(self._succ.get(s, ()))
        return False

    def num_non_special(self) -> int:
        """Return the number of states other than src and sink.

        Unlike ``states()``, unlabeled states are counted too.
        """
        special = (self.src, self.sink)
        return sum(1 for n in self._succ if n not in special)

    def is_final(self) -> bool:
        """Return True if exactly one labeled state remains, wired src→state→sink."""
        ns = self.states()
        if len(ns) != 1:
            return False
        only = ns[0]
        return self.has_edge(self.src, only) and self.has_edge(only, self.sink)

    def expression(self):
        """Return the label of the single remaining state, or None if not final."""
        ns = self.states()
        if (
            len(ns) == 1
            and self.has_edge(self.src, ns[0])
            and self.has_edge(ns[0], self.sink)
        ):
            return self._label[ns[0]]
        return None

    def contract(self, r, s, new_label) -> int:
        """State contraction G[r,s ⇒ t] (Definition 11, TODS 2010).

        (1) Add t as a new state labeled ``new_label``.
        (2) Every v in (Pred(r) ∪ Pred(s)) minus {r, s} becomes a
            predecessor of t.
        (3) Every w in (Succ(r) ∪ Succ(s)) minus {r, s} becomes a
            successor of t.
        (4) Add the loop t→t if r ∈ Succ(s).
        (5) Remove r and s together with all their edges.

        Returns the id of t. ``r`` and ``s`` are expected to be ordinary
        states: ``rm_state`` never removes src or sink.

        NOTE(review): the implementation uses the predecessors and
        successors of both r and s, which is broader than Pred(r) / Succ(s)
        alone; confirm against Definition 11.
        """
        merged = {r, s}
        # Collect everything before mutating: add_edge changes the edge sets.
        preds = (self._pred.get(r, set()) | self._pred.get(s, set())) - merged
        succs = (self._succ.get(r, set()) | self._succ.get(s, set())) - merged
        loops = r in self._succ.get(s, ())

        t = self._new()
        self._label[t] = new_label
        for v in preds:
            self.add_edge(v, t)
        for w in succs:
            self.add_edge(t, w)
        if loops:
            self.add_edge(t, t)
        self.rm_state(r)
        self.rm_state(s)
        return t

    def contract_single(self, r, new_label) -> int:
        """Single-state substitution G[r ⇒ t] (Definition 11 note).

        Replaces ``r`` by a new state t labeled ``new_label`` that inherits
        r's predecessors, successors and self-loop (if any). src and sink
        are left untouched and returned as-is. Returns the id of t.
        """
        if r in (self.src, self.sink):
            return r
        preds = self._pred.get(r, set()) - {r}
        succs = self._succ.get(r, set()) - {r}
        loops = r in self._succ.get(r, ())

        t = self._new()
        self._label[t] = new_label
        for v in preds:
            self.add_edge(v, t)
        for w in succs:
            self.add_edge(t, w)
        if loops:
            self.add_edge(t, t)
        self.rm_state(r)
        return t

    def epsilon_closure(self) -> "SOA":
        """G* (Definition 25, TODS 2010).

        Works on a copy and returns it; ``self`` is not modified. Repeats
        until nothing changes:

        * every state whose label ends in '+' or '+?' gets a self-loop;
        * for every edge n→m where m is labeled 'ε', every successor mp of
          m (other than n itself) becomes a direct successor of n.
        """
        G = self.copy()
        changed = True
        while changed:
            changed = False
            for n in list(G._succ):
                lab = G._label.get(n)
                if lab and lab.endswith(('+', '+?')) and not G.has_edge(n, n):
                    G.add_edge(n, n)
                    changed = True
            for n in list(G._succ):
                # Snapshot the successors: edges are added while scanning.
                for m in list(G._succ.get(n, ())):
                    if G._label.get(m) != 'ε':
                        continue
                    for mp in list(G._succ.get(m, ())):
                        if mp != n and not G.has_edge(n, mp):
                            G.add_edge(n, mp)
                            changed = True
        return G

    def __repr__(self):
        """Return a short summary: total state count and non-src/sink count."""
        return (
            f"SOA(nodes={len(self._succ)}, "
            f"non_special={self.num_non_special()})"
        )