feat: core+outlier analysis via min_coverage parameter, 6 new tests

2026-07-01 15:09:10 +02:00 · 2026-07-01 15:09:10 +02:00 · 9045769d57
commit 9045769d57
parent edd6d9d4dd
2 changed files with 214 additions and 3 deletions
--- a/bex/ensemble.py
+++ b/bex/ensemble.py
@ -234,6 +234,129 @@ def _matches(grammar, sequence):
        return False


+def _fit_score(grammar, seq):
+    """Score a fully matching sequence by how obligatory the grammar is.
+
+    Only the first node returned by _parse_parts is inspected. Symbols
+    quantified with '' or '+' count as obligatory; any other symbol, and
+    every alternative of a disjunction, counts as optional. The score is
+    1.0 - optional / (obligatory + optional), so it is the same for every
+    sequence the grammar fully matches.
+
+    Returns 0.0 for an empty seq, for a grammar that parses to nothing or
+    starts with an 'empty' node, for a seq the grammar does not consume
+    completely, and whenever any step raises. Returns 0.5 when the first
+    node contributes no symbols at all.
+    """
+    if not seq:
+        return 0.0
+
+    def _tally(node):
+        """Count (obligatory, optional) symbols beneath one parsed node."""
+        kind, payload, quant = node
+        if kind == 'symbol':
+            return (1, 0) if quant in ('', '+') else (0, 1)
+        if kind == 'disj':
+            # Every alternative is treated as optional.
+            return (0, len(payload))
+        if kind != 'concat':
+            return (0, 0)
+        parts = [_tally(child) for child in payload if child[0] != 'empty']
+        return (sum(p[0] for p in parts), sum(p[1] for p in parts))
+
+    try:
+        tokens = _parse_parts(grammar.strip())
+        if not tokens or tokens[0][0] == 'empty':
+            return 0.0
+
+        obligatory, optional = _tally(tokens[0])
+        counted = obligatory + optional
+        if counted == 0:
+            return 0.5
+
+        # The sequence must be consumed in full to receive any credit.
+        consumed = _match_tokens(tokens, seq)
+        fully_matched = consumed is not None and consumed == len(seq)
+        if not fully_matched:
+            return 0.0
+
+        # Grammars that lean on optional parts score lower.
+        penalty = optional / counted
+        return max(0.0, 1.0 - penalty)
+    except Exception:
+        return 0.0
+
+
+def _symbol_rarity_score(seq, all_sequences):
+    """Score a sequence by how common its symbols are across the dataset.
+
+    Each symbol's frequency is its total number of occurrences in
+    all_sequences divided by the number of sequences, capped at 1.0. The
+    score is the mean of these per-symbol frequencies.
+
+    Args:
+        seq: sequence of hashable symbols to score.
+        all_sequences: the dataset the frequencies are measured against.
+
+    Returns:
+        A float in [0.0, 1.0]: 1.0 = all symbols are common, 0.0 = only
+        unseen symbols. An empty seq or an empty all_sequences gives 0.0.
+    """
+    from collections import Counter
+    n = len(all_sequences)
+    if n == 0:
+        # No reference data: every symbol is unseen. Returning here also
+        # avoids dividing by a zero sequence count below.
+        return 0.0
+    occurrences = Counter()
+    for s in all_sequences:
+        occurrences.update(s)
+    scores = [min(occurrences[sym] / n, 1.0) for sym in seq]
+    return sum(scores) / len(scores) if scores else 0.0
+
+
+def _find_core(sequences, min_coverage=0.8):
+    """Find the core subset of sequences by removing rare-symbol outliers.
+
+    A symbol is "rare" when it occurs in fewer than 30% of the input
+    sequences. Each sequence is scored by the fraction of its symbols
+    that are rare; the highest-scoring sequence is dropped repeatedly
+    until one of the stop conditions below holds, then CRX is inferred
+    on what remains.
+
+    Removal stops when:
+      * the remaining count reaches ceil(len(sequences) * min_coverage),
+        so at least min_coverage of the input is always kept;
+      * fewer than 3 sequences remain;
+      * all remaining sequences have the same rarity score;
+      * 50 sequences have been removed.
+
+    Args:
+        sequences: list of symbol sequences (symbols must be hashable).
+        min_coverage: minimum fraction of sequences to keep. Values
+            >= 1.0 disable outlier removal.
+
+    Returns:
+        (core_grammar, core_sequences, outliers, fit_scores) where
+        outliers is the list of removed sequences in removal order and
+        fit_scores is currently always an empty list.
+    """
+    if not sequences or min_coverage >= 1.0:
+        return CRX().infer(sequences), sequences, [], []
+
+    import math
+    from collections import Counter
+
+    n = len(sequences)
+
+    # Count each symbol once per sequence, so a symbol repeated many times
+    # inside a single sequence is not mistaken for a common one.
+    seq_freq = Counter()
+    for s in sequences:
+        seq_freq.update(set(s))
+
+    def _rarity(seq):
+        rare = sum(1 for sym in seq if seq_freq.get(sym, 0) / n < 0.3)
+        return rare / max(len(seq), 1)
+
+    # Round up so the kept fraction never drops below min_coverage; the
+    # small epsilon absorbs float noise such as 0.55 * 100 > 55.
+    target = max(math.ceil(n * min_coverage - 1e-9), 1)
+
+    working = list(sequences)
+    outliers = []
+
+    for _ in range(50):
+        if len(working) < 3 or len(working) <= target:
+            break
+
+        rarities = [_rarity(seq) for seq in working]
+        worst = max(rarities)
+
+        # If all sequences have the same score there is no outlier left.
+        if worst == min(rarities):
+            break
+
+        # index() picks the earliest sequence among equally rare ones.
+        outliers.append(working.pop(rarities.index(worst)))
+
+    core_g = CRX().infer(working) if working else None
+    return core_g, working, outliers, []
+
+
 def mdl_score_simple(grammar, sequences):
    """MDL score from the paper: model_cost + Σ log₂(|L(r)| at length len(s)).

@ -276,7 +399,7 @@ _ALGORITHMS = {
 }


-def infer_ensemble(sequences, kmax=2, N=3, prefer=None):
+def infer_ensemble(sequences, kmax=2, N=3, prefer=None, min_coverage=1.0):
    """Run all applicable algorithms and return the best by MDL score.

    Args:
@ -285,12 +408,18 @@ def infer_ensemble(sequences, kmax=2, N=3, prefer=None):
        N: Number of random trials for k-ORE inference.
        prefer: Optional — 'crx', 'idregex', or 'koreinference' to skip
                ensemble and return only that algorithm's result.
+        min_coverage: When < 1.0, also runs CRX on the tightest core subset
+                      of sequences. Outliers (worst-fitting) are iteratively
+                      removed until at least this fraction remains. The core
+                      grammar and outlier list are included in the response.

    Returns:
        dict with keys:
            best: {algorithm, grammar, mdl_score}
            all: [{algorithm, grammar, mdl_score}, ...]
            why: str explaining the choice
+            core: (optional) {grammar, coverage, outliers} — only when
+                  min_coverage < 1.0
    """
    if prefer and prefer.lower() in _ALGORITHMS:
        key = prefer.lower()
@ -328,11 +457,19 @@ def infer_ensemble(sequences, kmax=2, N=3, prefer=None):

    results = [r for r in results if r[1] and r[1] != '∅']
    if not results:
-        return {
+        base = {
            'best': None,
            'all': [],
            'why': "No algorithm produced a non-empty grammar.",
        }
+        if min_coverage < 1.0:
+            core_g, core_seqs, outliers, _ = _find_core(sequences, min_coverage)
+            base['core'] = {
+                'grammar': core_g,
+                'coverage': round(len(core_seqs) / max(len(sequences), 1), 2) if sequences else 0,
+                'outliers': outliers,
+            }
+        return base

    results.sort(key=lambda x: x[2])
    best = results[0]
@ -360,7 +497,7 @@ def infer_ensemble(sequences, kmax=2, N=3, prefer=None):

    why_parts.append(f"{best[0]} selected (MDL score {best[2]:.1f}).")

-    return {
+    result = {
        'best': {
            'algorithm': best[0],
            'grammar': best[1],
@ -369,3 +506,16 @@ def infer_ensemble(sequences, kmax=2, N=3, prefer=None):
        'all': all_results,
        'why': ' '.join(why_parts),
    }
+
+    # Core analysis when min_coverage < 1.0
+    if min_coverage < 1.0:
+        core_g, core_seqs, outliers, _ = _find_core(sequences, min_coverage)
+        result['core'] = {
+            'grammar': core_g,
+            'coverage': round(len(core_seqs) / max(len(sequences), 1), 2) if sequences else 0,
+            'outlier_count': len(outliers),
+            'outliers': outliers,
+        }
+        result['why'] += f' Core CRX ({min_coverage:.0%} coverage, {len(outliers)} outliers): {core_g}'
+
+    return result
--- a/tests/test_ensemble.py
+++ b/tests/test_ensemble.py
@ -164,6 +164,67 @@ def test_ensemble_crx_always_present():
    assert len(crx_results) == 1


+# ── min_coverage / core analysis tests ──
+
+def test_core_not_included_when_coverage_1():
+    """min_coverage=1.0 must not add a 'core' entry to the result."""
+    result = infer_ensemble([['a', 'b'], ['a', 'b', 'c']], min_coverage=1.0)
+    assert 'core' not in result
+
+
+def test_core_included_when_coverage_lt_1():
+    """min_coverage below 1.0 adds a 'core' entry with all expected keys."""
+    result = infer_ensemble([['a', 'b'], ['a', 'b', 'c']], min_coverage=0.8)
+    assert 'core' in result
+    core = result['core']
+    for key in ('grammar', 'coverage', 'outliers', 'outlier_count'):
+        assert key in core
+
+
+def test_core_outlier_detection():
+    """Sequences with extra trailing symbols produce at least one outlier."""
+    common = ['fail', 'package', 'file', 'service']
+    seqs = [
+        list(common),
+        list(common),
+        common + ['npm'],
+        common + ['npm', 'pip'],
+    ]
+    result = infer_ensemble(seqs, min_coverage=0.7)
+    assert 'core' in result
+    core = result['core']
+    assert core['outlier_count'] >= 1
+    grammar = core['grammar']
+    assert any(sym in grammar for sym in ('npm', 'service'))
+
+
+def test_core_all_identical():
+    """Ten identical sequences give a core with no outliers."""
+    result = infer_ensemble([['a', 'b', 'c']] * 10, min_coverage=0.8)
+    assert 'core' in result
+    core = result['core']
+    assert core['outlier_count'] == 0
+    assert 'a' in core['grammar']
+
+
+def test_core_coverage_ratio():
+    """Core analysis must run and keep at least min_coverage of the input."""
+    seqs = [
+        ['a', 'b', 'c'],
+        ['a', 'b', 'c'],
+        ['a', 'b', 'c', 'd'],
+        ['a', 'b', 'c', 'd', 'e'],
+    ]
+    result = infer_ensemble(seqs, min_coverage=0.7)
+    # Assert presence outright: guarding with `if 'core' in result` would
+    # let the test pass without checking anything when the key is missing.
+    assert 'core' in result
+    c = result['core']
+    assert c['outlier_count'] >= 1
+    assert len(c['outliers']) == c['outlier_count']
+    # The reported coverage must honour the requested minimum.
+    assert c['coverage'] >= 0.7
+
+
+def test_core_empty_sequences():
+    """An empty input still reports a 'core' entry with a non-None grammar."""
+    result = infer_ensemble(sequences=[], min_coverage=0.8)
+    assert 'core' in result
+    grammar = result['core']['grammar']
+    assert grammar is not None
+
+
 def run_all():
    tests = [
        test_ensemble_returns_dict,