# grammar-inference-engine/bex/ensemble.py

"""Ensemble grammar inference — run multiple algorithms, pick best by MDL scoring."""

import re
from .crx import CRX
from .idregex import idregex
from .kore import kOREInference
from .expr import alphabet
from .mdl import model_cost, mdl_score


def _parse_parts(expr):
    """Parse expression into a list of tokens for matching.

    Each token: (type, value, quantifier)
      type: 'symbol' | 'disj' | 'concat' | 'empty'
      quantifier: '' | '?' | '+' | '+?'
    """
    if not expr or expr == '∅':
        return [('empty', '', '')]
    if expr == 'ε':
        return [('empty', '', '+?')]

    # 1. Check if it's a concatenation (split outermost by '.')
    # Must check BEFORE stripping trailing quantifier, because
    # quantifiers belong to individual parts (e.g., a?.b+)
    concat_parts = _split_outer(expr.strip(), '.')
    if len(concat_parts) > 1:
        children = []
        for p in concat_parts:
            children.extend(_parse_parts(p.strip()))
        return [('concat', children, '')]

    # 2. Now handle quantifier suffix on this single part
    quantifier = ''
    if expr.endswith('+?'):
        quantifier = '+?'
        expr = expr[:-2]
    elif expr.endswith('*'):
        quantifier = '*'
        expr = expr[:-1]
    elif expr.endswith('?'):
        quantifier = '?'
        expr = expr[:-1]
    elif expr.endswith('+'):
        quantifier = '+'
        expr = expr[:-1]

    # 3. Disjunction group: (a+b+c) for CRX or (a|b|c) for iDRegEx
    if expr.startswith('(') and expr.endswith(')'):
        inner = expr[1:-1]
        # Try CRX-style (+) first, then iDRegEx-style (|)
        disj_parts = _split_outer(inner, '+')
        if len(disj_parts) <= 1:
            disj_parts = _split_outer(inner, '|')
        if len(disj_parts) > 1:
            children = []
            for p in disj_parts:
                p = p.strip()
                # Parse as a flat symbol (don't split dots — they're part of
                # the symbol name, e.g. "community.docker.docker_image")
                children.append(_parse_flat_symbol(p))
            return [('disj', children, quantifier)]
        # Single element inside parens: treat as flat symbol
        return [_parse_flat_symbol(inner)]

    # 4. Single symbol
    if expr and expr not in ('∅', 'ε'):
        return [('symbol', expr, quantifier)]

    return []


def _parse_flat_symbol(s):
    """Parse a single symbol with optional quantifier, no dot splitting.

    Unlike _parse_parts, this treats dots as part of the symbol name
    (e.g. 'community.docker.docker_image' stays as one symbol).
    """
    s = s.strip()
    quantifier = ''
    if s.endswith('+?'):
        quantifier = '+?'
        s = s[:-2]
    elif s.endswith('*'):
        quantifier = '*'
        s = s[:-1]
    elif s.endswith('?'):
        quantifier = '?'
        s = s[:-1]
    elif s.endswith('+'):
        quantifier = '+'
        s = s[:-1]
    if s and s not in ('∅', 'ε'):
        return ('symbol', s, quantifier)
    return ('empty', '', quantifier)


def _split_outer(s, sep):
    """Split on `sep` at the top level (not inside parentheses)."""
    depth = 0
    parts = []
    cur = []
    for ch in s:
        if ch == '(':
            depth += 1
            cur.append(ch)
        elif ch == ')':
            depth -= 1
            cur.append(ch)
        elif ch == sep and depth == 0:
            parts.append(''.join(cur))
            cur = []
        else:
            cur.append(ch)
    parts.append(''.join(cur))
    return parts


def _match_possible(token, seq, pos):
    """Return all possible end positions after matching this token starting at pos."""
    ttype, tval, tquant = token
    positions = []

    if ttype == 'empty':
        positions.append(pos)

    elif ttype == 'symbol':
        if tquant in ('', '?'):
            if pos < len(seq) and seq[pos] == tval:
                positions.append(pos + 1)
            if tquant == '?':
                positions.append(pos)
        elif tquant in ('+?', '*'):
            positions.append(pos)
            cnt = pos
            while cnt < len(seq) and seq[cnt] == tval:
                cnt += 1
                positions.append(cnt)
        elif tquant == '+':
            if pos < len(seq) and seq[pos] == tval:
                cnt = pos + 1
                positions.append(cnt)
                while cnt < len(seq) and seq[cnt] == tval:
                    cnt += 1
                    positions.append(cnt)

    elif ttype == 'disj':
        if tquant in ('', '?'):
            for child in tval:
                for ep in _match_possible(child, seq, pos):
                    positions.append(ep)
            if tquant == '?':
                positions.append(pos)
        elif tquant in ('+?', '*'):
            positions.append(pos)
            for child in tval:
                for ep in _match_possible(child, seq, pos):
                    if ep > pos:
                        positions.append(ep)
                        # After consuming one, recurse to try more
                        for ep2 in _match_possible(token, seq, ep):
                            if ep2 > ep:
                                positions.append(ep2)
        elif tquant == '+':
            for child in tval:
                for ep in _match_possible(child, seq, pos):
                    if ep > pos:
                        positions.append(ep)
                        for ep2 in _match_possible(token, seq, ep):
                            if ep2 > ep:
                                positions.append(ep2)

    elif ttype == 'concat':
        # Match all children sequentially
        def _match_seq(children, start):
            cur = [start]
            for child in children:
                next_cur = []
                for p in cur:
                    next_cur.extend(_match_possible(child, seq, p))
                cur = next_cur
                if not cur:
                    break
            return cur
        if tquant in ('', '?'):
            positions.extend(_match_seq(tval, pos))
            if tquant == '?':
                positions.append(pos)
        elif tquant in ('+?', '*'):
            positions.append(pos)
            inner_end = _match_seq(tval, pos)
            for ep in inner_end:
                if ep > pos:
                    positions.append(ep)
                    for ep2 in _match_possible(token, seq, ep):
                        if ep2 > ep:
                            positions.append(ep2)
        elif tquant == '+':
            inner_end = _match_seq(tval, pos)
            for ep in inner_end:
                if ep > pos:
                    positions.append(ep)
                    for ep2 in _match_possible(token, seq, ep):
                        if ep2 > ep:
                            positions.append(ep2)

    return positions


def _match_tokens(tokens, seq, pos=0):
    """Try to match tokens against seq starting at pos. Returns max position or None."""
    cur = [pos]
    for token in tokens:
        next_cur = []
        for p in cur:
            next_cur.extend(_match_possible(token, seq, p))
        cur = next_cur
        if not cur:
            return None
    return max(cur) if cur else pos


def _matches(grammar, sequence):
    """Check if a sequence matches the grammar."""
    try:
        tokens = _parse_parts(grammar.strip())
        if not tokens:
            return False
        end = _match_tokens(tokens, sequence)
        if end is None:
            return False
        return end == len(sequence)
    except Exception:
        return False


def mdl_score_simple(grammar, sequences):
    """Return the MDL score of `grammar` for `sequences`; lower is better.

    This is a thin wrapper that forwards both arguments unchanged to
    `mdl_score` from the sibling `mdl` module. According to the original
    author's note, that score follows Bex et al.:
    model_cost + Σ log₂(|L(r)|), where model_cost counts alphabet symbol
    occurrences in the expression and the data cost penalizes overly general
    grammars (implementation not visible here — confirm in `mdl.py`).
    """
    score = mdl_score(grammar, sequences)
    return score


def _run_idregex(sequences, kmax, N):
    """Run standalone iDRegEx and score its grammar.

    Returns:
        (grammar, mdl_score) when `idregex` yields a non-empty grammar other
        than '∅'; otherwise (None, inf).
    """
    grammar = idregex(sequences, kmax=kmax, N=N)
    if not grammar or grammar == '∅':
        return None, float('inf')
    return grammar, mdl_score_simple(grammar, sequences)


def _run_kore(sequences, kmax, N):
    """Run kOREInference (Algorithm 4 with MDL) and score its expression.

    Returns:
        (expression, mdl_score) when `infer` returns a truthy result, which
        is unpacked as a 3-tuple whose middle element is the expression;
        otherwise (None, inf).
    """
    outcome = kOREInference(k_max=kmax, N=N).infer(sequences)
    if not outcome:
        return None, float('inf')
    _head, expr, _tail = outcome
    return expr, mdl_score_simple(expr, sequences)


_ALGO_NAMES = {
    'crx': 'CRX',
    'idregex': 'iDRegEx',
    'koreinference': 'kOREInference',
}


def _run_crx(sequences, kmax, N):
    """Run CRX once, return (grammar, score) or (None, inf).

    `kmax` and `N` are accepted only so the signature matches the other
    runners in `_ALGORITHMS`; they are not passed to CRX. An empty or '∅'
    result is never scored, mirroring `_run_idregex`.
    """
    g = CRX().infer(sequences)
    if g and g != '∅':
        return g, mdl_score_simple(g, sequences)
    return None, float('inf')


# Dispatch table: lowercase algorithm key -> runner(sequences, kmax, N)
# returning (grammar, score), or (None, inf) when no grammar was found.
_ALGORITHMS = {
    'crx': _run_crx,
    'idregex': _run_idregex,
    'koreinference': _run_kore,
}


def infer_ensemble(sequences, kmax=2, N=3, prefer=None):
    """Run all applicable algorithms and return the best by MDL score.

    Args:
        sequences: List of sequences, each a list of strings.
        kmax: Maximum k for k-ORE inference (iDRegEx, kOREInference).
        N: Number of random trials for k-ORE inference.
        prefer: Optional — 'crx', 'idregex', or 'koreinference'
                (case-insensitive) to skip the ensemble and return only that
                algorithm's result. Any other value is ignored and the full
                ensemble runs.

    Returns:
        dict with keys:
            best: {algorithm, grammar, mdl_score}, or None when no grammar
                  was produced
            all: [{algorithm, grammar, mdl_score}, ...] sorted by ascending
                 score (empty when no grammar was produced)
            why: str explaining the choice
    """
    def _entry(algorithm, grammar, score):
        # One result record as exposed to callers; scores are rounded to 2 dp.
        return {'algorithm': algorithm, 'grammar': grammar, 'mdl_score': round(score, 2)}

    if prefer and prefer.lower() in _ALGORITHMS:
        key = prefer.lower()
        algo_name = _ALGO_NAMES.get(key, key)
        g, score = _ALGORITHMS[key](sequences, kmax, N)
        if g and g != '∅':
            return {
                'best': _entry(algo_name, g, score),
                'all': [_entry(algo_name, g, score)],
                'why': f"Requested {algo_name} only.",
            }
        return {
            'best': None,
            'all': [],
            'why': f"{algo_name} returned ∅ (no grammar found).",
        }

    # Collect (algorithm, grammar, score) for every algorithm that produced a
    # usable grammar; empty / '∅' grammars are never scored or reported.
    results = []

    # 1. CRX
    crx_g = CRX().infer(sequences)
    if crx_g and crx_g != '∅':
        results.append(('CRX', crx_g, mdl_score_simple(crx_g, sequences)))

    # 2. iDRegEx (standalone, langsize-based)
    # 3. kOREInference (Algorithm 4 with MDL scoring)
    for algo_name, runner in (('iDRegEx', _run_idregex), ('kOREInference', _run_kore)):
        g, score = runner(sequences, kmax, N)
        if g and g != '∅':
            results.append((algo_name, g, score))

    if not results:
        return {
            'best': None,
            'all': [],
            'why': "No algorithm produced a non-empty grammar.",
        }

    # Lower MDL score is better; the sort is stable, so ties keep the
    # CRX / iDRegEx / kOREInference order.
    results.sort(key=lambda x: x[2])
    best = results[0]

    why_parts = []
    if len(results) == 1:
        why_parts.append(f"Only {results[0][0]} produced a result.")
    else:
        scores_str = ', '.join(f"{r[0]}={r[2]:.1f}" for r in results)
        why_parts.append(f"Scores: {scores_str}.")

    # Informational only: how many input sequences each grammar accepts.
    match_strs = []
    for r_algo, r_grammar, _ in results:
        m = sum(1 for s in sequences if _matches(r_grammar, s))
        match_strs.append(f"{r_algo}={m}/{len(sequences)}")
    why_parts.append(f"Match rates: {', '.join(match_strs)}.")

    why_parts.append(f"{best[0]} selected (MDL score {best[2]:.1f}).")

    return {
        'best': _entry(*best),
        'all': [_entry(a, g, s) for a, g, s in results],
        'why': ' '.join(why_parts),
    }