grammar-inference-engine/bex/crx.py

"""CRX — Direct CHARE inference (Algorithm 7, TODS 2010)."""

from collections import defaultdict
from .expr import concat


class CRX:
    """
    |———— Algorithm 7: CRX ————|
    Input:  sample S (list of token lists)
    Output: CHARE r such that S ⊆ L(r)
    """

    def infer(self, sequences):
        S = [list(s) for s in sequences if s]
        if not S:
            return 'ε'

        sigma = set()
        for w in S:
            for a in w:
                sigma.add(a)
        if not sigma:
            return 'ε'

        # Step 1: Compute ImmedPred and equivalence classes ≈_S
        immed = set()
        for w in S:
            for i in range(len(w) - 1):
                immed.add((w[i], w[i + 1]))

        # Reachability: →_S (reflexive, transitive closure)
        closure = self._transitive_closure(sigma, immed)

        # Equivalence: a ≈_S b iff a →*_S b and b →*_S a
        eq = self._equivalence(sigma, closure)

        # Build class map: symbol → class index
        sym_to_cls = {}
        classes = []
        for cls_syms in eq:
            idx = len(classes)
            for sym in cls_syms:
                sym_to_cls[sym] = idx
            classes.append(set(cls_syms))

        # Step 2-3: Preserve only singleton nodes? No, the algorithm says merge singletons
        # that share Pred/Succ in the Hasse diagram. But actually, looking at the algorithm
        # more carefully:
        #
        # "while a maximal set of singleton nodes γ₁,...,γ_ℓ such that
        #  Pred_HS(γ₁)=···=Pred_HS(γ_ℓ) and Succ_HS(γ₁)=···=Succ_HS(γ_ℓ) exists do
        #   Replace γ₁,...,γ_ℓ by γ := ∪ⱼ γⱼ"
        #
        # This merges singleton equivalence classes (classes with exactly one symbol)
        # that have the same Pred and Succ sets in the Hasse diagram.

        changed = True
        while changed:
            changed = False
            singleton_ids = [i for i, c in enumerate(classes) if len(c) == 1]

            # Compute Pred and Succ for each singleton (considering ALL symbols in each class)
            hs_pred = {}
            hs_succ = {}
            for i in singleton_ids:
                hs_pred[i] = set()
                hs_succ[i] = set()
                sym_i = next(iter(classes[i]))
                for j, c in enumerate(classes):
                    if i == j:
                        continue
                    if any((sym_j, sym_i) in immed for sym_j in c):
                        hs_pred[i].add(j)
                    if any((sym_i, sym_j) in immed for sym_j in c):
                        hs_succ[i].add(j)

            # Group by same (Pred, Succ)
            groups = defaultdict(list)
            for i in singleton_ids:
                groups[(frozenset(hs_pred[i]), frozenset(hs_succ[i]))].append(i)

            for (pred_set, succ_set), group in groups.items():
                if len(group) >= 2:
                    merged = set()
                    for i in group:
                        merged.update(classes[i])
                    new_id = len(classes)
                    classes.append(merged)
                    for i in sorted(group, reverse=True):
                        classes.pop(i)
                    changed = True
                    break

        # After merging, rebuild sym_to_cls to map to new class indices
        sym_to_cls = {}
        for idx, cls in enumerate(classes):
            for sym in cls:
                sym_to_cls[sym] = idx

        # Step 5: Topological sort of the Hasse diagram
        adj = {i: set() for i in range(len(classes))}
        indeg = {i: 0 for i in range(len(classes))}
        for a, b in immed:
            ca, cb = sym_to_cls.get(a), sym_to_cls.get(b)
            if ca is not None and cb is not None and ca != cb:
                if cb not in adj[ca]:
                    adj[ca].add(cb)
                    indeg[cb] += 1

        # Topological sort (Kahn's algorithm)
        order = []
        q = [i for i in range(len(classes)) if indeg[i] == 0]
        while q:
            i = q.pop(0)
            order.append(i)
            for j in adj[i]:
                indeg[j] -= 1
                if indeg[j] == 0:
                    q.append(j)
        remaining = set(range(len(classes))) - set(order)
        order.extend(remaining)

        # Step 6-16: Assign chain factors (Algorithm 7 lines 7-14)
        def count_in_class(w, syms):
            return sum(1 for a in w if a in syms)

        parts = []
        for i in order:
            syms = classes[i]
            counts = [count_in_class(w, syms) for w in S]

            all_exactly_one = all(c == 1 for c in counts)
            all_at_most_one = all(c <= 1 for c in counts)
            all_at_least_one = all(c >= 1 for c in counts)
            some_two_or_more = any(c >= 2 for c in counts)

            sym_list = sorted(syms)
            factor = '+'.join(sym_list)
            if len(sym_list) > 1:
                factor = '(' + factor + ')'

            if all_exactly_one:
                pass  # (a₁+···+aₙ)
            elif all_at_most_one:
                factor += '?'  # (a₁+···+aₙ)?
            elif all_at_least_one and some_two_or_more:
                factor += '+'  # (a₁+···+aₙ)+
            else:
                factor += '+?'  # (a₁+···+aₙ)+?

            parts.append(factor)

        if not parts:
            return 'ε'
        return '.'.join(parts)

    def _transitive_closure(self, sigma, immed):
        """Compute reflexive, transitive closure of immed relation."""
        closure = {(a, b) for (a, b) in immed}
        for a in sigma:
            closure.add((a, a))
        changed = True
        while changed:
            changed = False
            for a in sigma:
                for b in sigma:
                    for c in sigma:
                        if (a, b) in closure and (b, c) in closure and (a, c) not in closure:
                            closure.add((a, c))
                            changed = True
        return closure

    def _equivalence(self, sigma, closure):
        """Compute equivalence classes of ≈_S."""
        remaining = set(sigma)
        classes = []
        while remaining:
            a = remaining.pop()
            cls = {a}
            added = True
            while added:
                added = False
                for b in list(remaining):
                    if (a, b) in closure and (b, a) in closure:
                        if b not in cls:
                            cls.add(b)
                            remaining.discard(b)
                            added = True
            classes.append(cls)
        return classes