grammar-inference-engine/bex/crx.py

192 lines
6.6 KiB
Python
Raw Permalink Normal View History

"""CRX — Direct CHARE inference (Algorithm 7, TODS 2010)."""
from collections import defaultdict
from .expr import concat
class CRX:
"""
| Algorithm 7: CRX |
Input: sample S (list of token lists)
Output: CHARE r such that S L(r)
"""
def infer(self, sequences):
S = [list(s) for s in sequences if s]
if not S:
return 'ε'
sigma = set()
for w in S:
for a in w:
sigma.add(a)
if not sigma:
return 'ε'
# Step 1: Compute ImmedPred and equivalence classes ≈_S
immed = set()
for w in S:
for i in range(len(w) - 1):
immed.add((w[i], w[i + 1]))
# Reachability: →_S (reflexive, transitive closure)
closure = self._transitive_closure(sigma, immed)
# Equivalence: a ≈_S b iff a →*_S b and b →*_S a
eq = self._equivalence(sigma, closure)
# Build class map: symbol → class index
sym_to_cls = {}
classes = []
for cls_syms in eq:
idx = len(classes)
for sym in cls_syms:
sym_to_cls[sym] = idx
classes.append(set(cls_syms))
# Step 2-3: Preserve only singleton nodes? No, the algorithm says merge singletons
# that share Pred/Succ in the Hasse diagram. But actually, looking at the algorithm
# more carefully:
#
# "while a maximal set of singleton nodes γ₁,...,γ_ such that
# Pred_HS(γ₁)=···=Pred_HS(γ_) and Succ_HS(γ₁)=···=Succ_HS(γ_) exists do
# Replace γ₁,...,γ_ by γ := ∪ⱼ γⱼ"
#
# This merges singleton equivalence classes (classes with exactly one symbol)
# that have the same Pred and Succ sets in the Hasse diagram.
changed = True
while changed:
changed = False
singleton_ids = [i for i, c in enumerate(classes) if len(c) == 1]
# Compute Pred and Succ for each singleton (considering ALL symbols in each class)
hs_pred = {}
hs_succ = {}
for i in singleton_ids:
hs_pred[i] = set()
hs_succ[i] = set()
sym_i = next(iter(classes[i]))
for j, c in enumerate(classes):
if i == j:
continue
if any((sym_j, sym_i) in immed for sym_j in c):
hs_pred[i].add(j)
if any((sym_i, sym_j) in immed for sym_j in c):
hs_succ[i].add(j)
# Group by same (Pred, Succ)
groups = defaultdict(list)
for i in singleton_ids:
groups[(frozenset(hs_pred[i]), frozenset(hs_succ[i]))].append(i)
for (pred_set, succ_set), group in groups.items():
if len(group) >= 2:
merged = set()
for i in group:
merged.update(classes[i])
new_id = len(classes)
classes.append(merged)
for i in sorted(group, reverse=True):
classes.pop(i)
changed = True
break
# After merging, rebuild sym_to_cls to map to new class indices
sym_to_cls = {}
for idx, cls in enumerate(classes):
for sym in cls:
sym_to_cls[sym] = idx
# Step 5: Topological sort of the Hasse diagram
adj = {i: set() for i in range(len(classes))}
indeg = {i: 0 for i in range(len(classes))}
for a, b in immed:
ca, cb = sym_to_cls.get(a), sym_to_cls.get(b)
if ca is not None and cb is not None and ca != cb:
if cb not in adj[ca]:
adj[ca].add(cb)
indeg[cb] += 1
# Topological sort (Kahn's algorithm)
order = []
q = [i for i in range(len(classes)) if indeg[i] == 0]
while q:
i = q.pop(0)
order.append(i)
for j in adj[i]:
indeg[j] -= 1
if indeg[j] == 0:
q.append(j)
remaining = set(range(len(classes))) - set(order)
order.extend(remaining)
# Step 6-16: Assign chain factors (Algorithm 7 lines 7-14)
def count_in_class(w, syms):
return sum(1 for a in w if a in syms)
parts = []
for i in order:
syms = classes[i]
counts = [count_in_class(w, syms) for w in S]
all_exactly_one = all(c == 1 for c in counts)
all_at_most_one = all(c <= 1 for c in counts)
all_at_least_one = all(c >= 1 for c in counts)
some_two_or_more = any(c >= 2 for c in counts)
sym_list = sorted(syms)
factor = '+'.join(sym_list)
if len(sym_list) > 1:
factor = '(' + factor + ')'
if all_exactly_one:
pass # (a₁+···+aₙ)
elif all_at_most_one:
factor += '?' # (a₁+···+aₙ)?
elif all_at_least_one and some_two_or_more:
factor += '+' # (a₁+···+aₙ)+
else:
factor += '+?' # (a₁+···+aₙ)+?
parts.append(factor)
if not parts:
return 'ε'
return '.'.join(parts)
def _transitive_closure(self, sigma, immed):
"""Compute reflexive, transitive closure of immed relation."""
closure = {(a, b) for (a, b) in immed}
for a in sigma:
closure.add((a, a))
changed = True
while changed:
changed = False
for a in sigma:
for b in sigma:
for c in sigma:
if (a, b) in closure and (b, c) in closure and (a, c) not in closure:
closure.add((a, c))
changed = True
return closure
def _equivalence(self, sigma, closure):
"""Compute equivalence classes of ≈_S."""
remaining = set(sigma)
classes = []
while remaining:
a = remaining.pop()
cls = {a}
added = True
while added:
added = False
for b in list(remaining):
if (a, b) in closure and (b, a) in closure:
if b not in cls:
cls.add(b)
remaining.discard(b)
added = True
classes.append(cls)
return classes