grammar-inference-engine/bex/crx.py
tobjend 7c00c6713d Initial commit: BEX-based grammar inference engine
- CRX: direct CHARE inference (Algorithm 7, TODS 2010)
- iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010)
- RWR₀: SORE repair (Algorithm 6, TODS 2010)
- rwr²: k-ORE extraction (Algorithm 3, arXiv 2010)
- SOA, k-OA, iKoa, 2T-INF, Baum-Welch
- Ansible role grammar adapter
- Generic YAML key-path converter
- 28 tests, all passing
2026-07-01 08:01:16 +02:00

191 lines
6.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""CRX — Direct CHARE inference (Algorithm 7, TODS 2010)."""
from collections import defaultdict
from .expr import concat
class CRX:
"""
|———— Algorithm 7: CRX ————|
Input: sample S (list of token lists)
Output: CHARE r such that S ⊆ L(r)
"""
def infer(self, sequences):
S = [list(s) for s in sequences if s]
if not S:
return 'ε'
sigma = set()
for w in S:
for a in w:
sigma.add(a)
if not sigma:
return 'ε'
# Step 1: Compute ImmedPred and equivalence classes ≈_S
immed = set()
for w in S:
for i in range(len(w) - 1):
immed.add((w[i], w[i + 1]))
# Reachability: →_S (reflexive, transitive closure)
closure = self._transitive_closure(sigma, immed)
# Equivalence: a ≈_S b iff a →*_S b and b →*_S a
eq = self._equivalence(sigma, closure)
# Build class map: symbol → class index
sym_to_cls = {}
classes = []
for cls_syms in eq:
idx = len(classes)
for sym in cls_syms:
sym_to_cls[sym] = idx
classes.append(set(cls_syms))
# Step 2-3: Preserve only singleton nodes? No, the algorithm says merge singletons
# that share Pred/Succ in the Hasse diagram. But actually, looking at the algorithm
# more carefully:
#
# "while a maximal set of singleton nodes γ₁,...,γ_ such that
# Pred_HS(γ₁)=···=Pred_HS(γ_) and Succ_HS(γ₁)=···=Succ_HS(γ_) exists do
# Replace γ₁,...,γ_ by γ := ∪ⱼ γⱼ"
#
# This merges singleton equivalence classes (classes with exactly one symbol)
# that have the same Pred and Succ sets in the Hasse diagram.
changed = True
while changed:
changed = False
singleton_ids = [i for i, c in enumerate(classes) if len(c) == 1]
# Compute Pred and Succ for each singleton (considering ALL symbols in each class)
hs_pred = {}
hs_succ = {}
for i in singleton_ids:
hs_pred[i] = set()
hs_succ[i] = set()
sym_i = next(iter(classes[i]))
for j, c in enumerate(classes):
if i == j:
continue
if any((sym_j, sym_i) in immed for sym_j in c):
hs_pred[i].add(j)
if any((sym_i, sym_j) in immed for sym_j in c):
hs_succ[i].add(j)
# Group by same (Pred, Succ)
groups = defaultdict(list)
for i in singleton_ids:
groups[(frozenset(hs_pred[i]), frozenset(hs_succ[i]))].append(i)
for (pred_set, succ_set), group in groups.items():
if len(group) >= 2:
merged = set()
for i in group:
merged.update(classes[i])
new_id = len(classes)
classes.append(merged)
for i in sorted(group, reverse=True):
classes.pop(i)
changed = True
break
# After merging, rebuild sym_to_cls to map to new class indices
sym_to_cls = {}
for idx, cls in enumerate(classes):
for sym in cls:
sym_to_cls[sym] = idx
# Step 5: Topological sort of the Hasse diagram
adj = {i: set() for i in range(len(classes))}
indeg = {i: 0 for i in range(len(classes))}
for a, b in immed:
ca, cb = sym_to_cls.get(a), sym_to_cls.get(b)
if ca is not None and cb is not None and ca != cb:
if cb not in adj[ca]:
adj[ca].add(cb)
indeg[cb] += 1
# Topological sort (Kahn's algorithm)
order = []
q = [i for i in range(len(classes)) if indeg[i] == 0]
while q:
i = q.pop(0)
order.append(i)
for j in adj[i]:
indeg[j] -= 1
if indeg[j] == 0:
q.append(j)
remaining = set(range(len(classes))) - set(order)
order.extend(remaining)
# Step 6-16: Assign chain factors (Algorithm 7 lines 7-14)
def count_in_class(w, syms):
return sum(1 for a in w if a in syms)
parts = []
for i in order:
syms = classes[i]
counts = [count_in_class(w, syms) for w in S]
all_exactly_one = all(c == 1 for c in counts)
all_at_most_one = all(c <= 1 for c in counts)
all_at_least_one = all(c >= 1 for c in counts)
some_two_or_more = any(c >= 2 for c in counts)
sym_list = sorted(syms)
factor = '+'.join(sym_list)
if len(sym_list) > 1:
factor = '(' + factor + ')'
if all_exactly_one:
pass # (a₁+···+aₙ)
elif all_at_most_one:
factor += '?' # (a₁+···+aₙ)?
elif all_at_least_one and some_two_or_more:
factor += '+' # (a₁+···+aₙ)+
else:
factor += '+?' # (a₁+···+aₙ)+?
parts.append(factor)
if not parts:
return 'ε'
return '.'.join(parts)
def _transitive_closure(self, sigma, immed):
"""Compute reflexive, transitive closure of immed relation."""
closure = {(a, b) for (a, b) in immed}
for a in sigma:
closure.add((a, a))
changed = True
while changed:
changed = False
for a in sigma:
for b in sigma:
for c in sigma:
if (a, b) in closure and (b, c) in closure and (a, c) not in closure:
closure.add((a, c))
changed = True
return closure
def _equivalence(self, sigma, closure):
"""Compute equivalence classes of ≈_S."""
remaining = set(sigma)
classes = []
while remaining:
a = remaining.pop()
cls = {a}
added = True
while added:
added = False
for b in list(remaining):
if (a, b) in closure and (b, a) in closure:
if b not in cls:
cls.add(b)
remaining.discard(b)
added = True
classes.append(cls)
return classes