"""Ensemble grammar inference — run multiple algorithms, pick best by MDL scoring.""" import re from .crx import CRX from .idregex import idregex from .kore import kOREInference from .expr import alphabet from .mdl import model_cost, mdl_score def _parse_parts(expr): """Parse expression into a list of tokens for matching. Each token: (type, value, quantifier) type: 'symbol' | 'disj' | 'concat' | 'empty' quantifier: '' | '?' | '+' | '+?' """ if not expr or expr == '∅': return [('empty', '', '')] if expr == 'ε': return [('empty', '', '+?')] # 1. Check if it's a concatenation (split outermost by '.') # Must check BEFORE stripping trailing quantifier, because # quantifiers belong to individual parts (e.g., a?.b+) concat_parts = _split_outer(expr.strip(), '.') if len(concat_parts) > 1: children = [] for p in concat_parts: children.extend(_parse_parts(p.strip())) return [('concat', children, '')] # 2. Now handle quantifier suffix on this single part quantifier = '' if expr.endswith('+?'): quantifier = '+?' expr = expr[:-2] elif expr.endswith('*'): quantifier = '*' expr = expr[:-1] elif expr.endswith('?'): quantifier = '?' expr = expr[:-1] elif expr.endswith('+'): quantifier = '+' expr = expr[:-1] # 3. Disjunction group: (a+b+c) for CRX or (a|b|c) for iDRegEx if expr.startswith('(') and expr.endswith(')'): inner = expr[1:-1] # Try CRX-style (+) first, then iDRegEx-style (|) disj_parts = _split_outer(inner, '+') if len(disj_parts) <= 1: disj_parts = _split_outer(inner, '|') if len(disj_parts) > 1: children = [] for p in disj_parts: p = p.strip() # Parse as a flat symbol (don't split dots — they're part of # the symbol name, e.g. "community.docker.docker_image") children.append(_parse_flat_symbol(p)) return [('disj', children, quantifier)] # Single element inside parens: treat as flat symbol return [_parse_flat_symbol(inner)] # 4. Single symbol if expr and expr not in ('∅', 'ε'): return [('symbol', expr, quantifier)] return [] def _parse_flat_symbol(s): """Parse a single symbol with optional quantifier, no dot splitting. Unlike _parse_parts, this treats dots as part of the symbol name (e.g. 'community.docker.docker_image' stays as one symbol). """ s = s.strip() quantifier = '' if s.endswith('+?'): quantifier = '+?' s = s[:-2] elif s.endswith('*'): quantifier = '*' s = s[:-1] elif s.endswith('?'): quantifier = '?' s = s[:-1] elif s.endswith('+'): quantifier = '+' s = s[:-1] if s and s not in ('∅', 'ε'): return ('symbol', s, quantifier) return ('empty', '', quantifier) def _split_outer(s, sep): """Split on `sep` at the top level (not inside parentheses).""" depth = 0 parts = [] cur = [] for ch in s: if ch == '(': depth += 1 cur.append(ch) elif ch == ')': depth -= 1 cur.append(ch) elif ch == sep and depth == 0: parts.append(''.join(cur)) cur = [] else: cur.append(ch) parts.append(''.join(cur)) return parts def _match_possible(token, seq, pos): """Return all possible end positions after matching this token starting at pos.""" ttype, tval, tquant = token positions = [] if ttype == 'empty': positions.append(pos) elif ttype == 'symbol': if tquant in ('', '?'): if pos < len(seq) and seq[pos] == tval: positions.append(pos + 1) if tquant == '?': positions.append(pos) elif tquant in ('+?', '*'): positions.append(pos) cnt = pos while cnt < len(seq) and seq[cnt] == tval: cnt += 1 positions.append(cnt) elif tquant == '+': if pos < len(seq) and seq[pos] == tval: cnt = pos + 1 positions.append(cnt) while cnt < len(seq) and seq[cnt] == tval: cnt += 1 positions.append(cnt) elif ttype == 'disj': if tquant in ('', '?'): for child in tval: for ep in _match_possible(child, seq, pos): positions.append(ep) if tquant == '?': positions.append(pos) elif tquant in ('+?', '*'): positions.append(pos) for child in tval: for ep in _match_possible(child, seq, pos): if ep > pos: positions.append(ep) # After consuming one, recurse to try more for ep2 in _match_possible(token, seq, ep): if ep2 > ep: positions.append(ep2) elif tquant == '+': for child in tval: for ep in _match_possible(child, seq, pos): if ep > pos: positions.append(ep) for ep2 in _match_possible(token, seq, ep): if ep2 > ep: positions.append(ep2) elif ttype == 'concat': # Match all children sequentially def _match_seq(children, start): cur = [start] for child in children: next_cur = [] for p in cur: next_cur.extend(_match_possible(child, seq, p)) cur = next_cur if not cur: break return cur if tquant in ('', '?'): positions.extend(_match_seq(tval, pos)) if tquant == '?': positions.append(pos) elif tquant in ('+?', '*'): positions.append(pos) inner_end = _match_seq(tval, pos) for ep in inner_end: if ep > pos: positions.append(ep) for ep2 in _match_possible(token, seq, ep): if ep2 > ep: positions.append(ep2) elif tquant == '+': inner_end = _match_seq(tval, pos) for ep in inner_end: if ep > pos: positions.append(ep) for ep2 in _match_possible(token, seq, ep): if ep2 > ep: positions.append(ep2) return positions def _match_tokens(tokens, seq, pos=0): """Try to match tokens against seq starting at pos. Returns max position or None.""" cur = [pos] for token in tokens: next_cur = [] for p in cur: next_cur.extend(_match_possible(token, seq, p)) cur = next_cur if not cur: return None return max(cur) if cur else pos def _matches(grammar, sequence): """Check if a sequence matches the grammar.""" try: tokens = _parse_parts(grammar.strip()) if not tokens: return False end = _match_tokens(tokens, sequence) if end is None: return False return end == len(sequence) except Exception: return False def mdl_score_simple(grammar, sequences): """MDL score from the paper: model_cost + Σ log₂(|L(r)| at length len(s)). Lower is better. Uses the paper's definition from Bex et al. model_cost = number of alphabet symbol occurrences in the expression. data_cost = Σ log₂(|L(r)|) — penalizes overly general grammars. """ return mdl_score(grammar, sequences) def _run_idregex(sequences, kmax, N): """Run standalone iDRegEx, return (grammar, score) or (None, inf).""" g = idregex(sequences, kmax=kmax, N=N) if g and g != '∅': return g, mdl_score_simple(g, sequences) return None, float('inf') def _run_kore(sequences, kmax, N): """Run kOREInference (Algorithm 4 with MDL), return (grammar, score) or (None, inf).""" kore = kOREInference(k_max=kmax, N=N) result = kore.infer(sequences) if result: _, expr, _ = result return expr, mdl_score_simple(expr, sequences) return None, float('inf') _ALGO_NAMES = { 'crx': 'CRX', 'idregex': 'iDRegEx', 'koreinference': 'kOREInference', } _ALGORITHMS = { 'crx': lambda s, k, n: (CRX().infer(s), mdl_score_simple(CRX().infer(s), s)), 'idregex': _run_idregex, 'koreinference': _run_kore, } def infer_ensemble(sequences, kmax=2, N=3, prefer=None): """Run all applicable algorithms and return the best by MDL score. Args: sequences: List of sequences, each a list of strings. kmax: Maximum k for k-ORE inference (iDRegEx, kOREInference). N: Number of random trials for k-ORE inference. prefer: Optional — 'crx', 'idregex', or 'koreinference' to skip ensemble and return only that algorithm's result. Returns: dict with keys: best: {algorithm, grammar, mdl_score} all: [{algorithm, grammar, mdl_score}, ...] why: str explaining the choice """ if prefer and prefer.lower() in _ALGORITHMS: key = prefer.lower() fn = _ALGORITHMS[key] algo_name = _ALGO_NAMES.get(key, key) g, score = fn(sequences, kmax, N) if g and g != '∅': return { 'best': {'algorithm': algo_name, 'grammar': g, 'mdl_score': round(score, 2)}, 'all': [{'algorithm': algo_name, 'grammar': g, 'mdl_score': round(score, 2)}], 'why': f"Requested {algo_name} only.", } return { 'best': None, 'all': [], 'why': f"{algo_name} returned ∅ (no grammar found).", } results = [] # 1. CRX (always fast, always produces a result) crx_g = CRX().infer(sequences) crx_score = mdl_score_simple(crx_g, sequences) if crx_g and crx_g != '∅' else float('inf') results.append(('CRX', crx_g if crx_g and crx_g != '∅' else '∅', crx_score)) # 2. iDRegEx (standalone, langsize-based) idr_g, idr_score = _run_idregex(sequences, kmax, N) if idr_g: results.append(('iDRegEx', idr_g, idr_score)) # 3. kOREInference (Algorithm 4 with MDL scoring) kore_g, kore_score = _run_kore(sequences, kmax, N) if kore_g: results.append(('kOREInference', kore_g, kore_score)) results = [r for r in results if r[1] and r[1] != '∅'] if not results: return { 'best': None, 'all': [], 'why': "No algorithm produced a non-empty grammar.", } results.sort(key=lambda x: x[2]) best = results[0] all_results = [ {'algorithm': a, 'grammar': g, 'mdl_score': round(s, 2)} for a, g, s in results ] active = {r[0] for r in results} why_parts = [] if len(results) == 1: why_parts.append(f"Only {results[0][0]} produced a result.") else: scores_str = ', '.join(f"{r[0]}={r[2]:.1f}" for r in results) why_parts.append(f"Scores: {scores_str}.") match_strs = [] for r_algo, r_grammar, _ in results: if r_grammar and r_grammar != '∅': m = sum(1 for s in sequences if _matches(r_grammar, s)) match_strs.append(f"{r_algo}={m}/{len(sequences)}") if match_strs: why_parts.append(f"Match rates: {', '.join(match_strs)}.") why_parts.append(f"{best[0]} selected (MDL score {best[2]:.1f}).") return { 'best': { 'algorithm': best[0], 'grammar': best[1], 'mdl_score': round(best[2], 2), }, 'all': all_results, 'why': ' '.join(why_parts), }