2026-07-01 09:51:41 +02:00
|
|
|
"""Ensemble grammar inference — run multiple algorithms, pick best by MDL scoring."""
|
|
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
from .crx import CRX
|
|
|
|
|
from .idregex import idregex
|
2026-07-01 14:50:09 +02:00
|
|
|
from .kore import kOREInference
|
2026-07-01 09:51:41 +02:00
|
|
|
from .expr import alphabet
|
|
|
|
|
from .mdl import model_cost, mdl_score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_parts(expr):
|
|
|
|
|
"""Parse expression into a list of tokens for matching.
|
|
|
|
|
|
|
|
|
|
Each token: (type, value, quantifier)
|
|
|
|
|
type: 'symbol' | 'disj' | 'concat' | 'empty'
|
|
|
|
|
quantifier: '' | '?' | '+' | '+?'
|
|
|
|
|
"""
|
|
|
|
|
if not expr or expr == '∅':
|
|
|
|
|
return [('empty', '', '')]
|
|
|
|
|
if expr == 'ε':
|
|
|
|
|
return [('empty', '', '+?')]
|
|
|
|
|
|
|
|
|
|
# 1. Check if it's a concatenation (split outermost by '.')
|
|
|
|
|
# Must check BEFORE stripping trailing quantifier, because
|
|
|
|
|
# quantifiers belong to individual parts (e.g., a?.b+)
|
|
|
|
|
concat_parts = _split_outer(expr.strip(), '.')
|
|
|
|
|
if len(concat_parts) > 1:
|
|
|
|
|
children = []
|
|
|
|
|
for p in concat_parts:
|
|
|
|
|
children.extend(_parse_parts(p.strip()))
|
|
|
|
|
return [('concat', children, '')]
|
|
|
|
|
|
|
|
|
|
# 2. Now handle quantifier suffix on this single part
|
|
|
|
|
quantifier = ''
|
|
|
|
|
if expr.endswith('+?'):
|
|
|
|
|
quantifier = '+?'
|
|
|
|
|
expr = expr[:-2]
|
|
|
|
|
elif expr.endswith('*'):
|
|
|
|
|
quantifier = '*'
|
|
|
|
|
expr = expr[:-1]
|
|
|
|
|
elif expr.endswith('?'):
|
|
|
|
|
quantifier = '?'
|
|
|
|
|
expr = expr[:-1]
|
|
|
|
|
elif expr.endswith('+'):
|
|
|
|
|
quantifier = '+'
|
|
|
|
|
expr = expr[:-1]
|
|
|
|
|
|
|
|
|
|
# 3. Disjunction group: (a+b+c) for CRX or (a|b|c) for iDRegEx
|
|
|
|
|
if expr.startswith('(') and expr.endswith(')'):
|
|
|
|
|
inner = expr[1:-1]
|
|
|
|
|
# Try CRX-style (+) first, then iDRegEx-style (|)
|
|
|
|
|
disj_parts = _split_outer(inner, '+')
|
|
|
|
|
if len(disj_parts) <= 1:
|
|
|
|
|
disj_parts = _split_outer(inner, '|')
|
|
|
|
|
if len(disj_parts) > 1:
|
|
|
|
|
children = []
|
|
|
|
|
for p in disj_parts:
|
|
|
|
|
p = p.strip()
|
|
|
|
|
# Parse as a flat symbol (don't split dots — they're part of
|
|
|
|
|
# the symbol name, e.g. "community.docker.docker_image")
|
|
|
|
|
children.append(_parse_flat_symbol(p))
|
|
|
|
|
return [('disj', children, quantifier)]
|
|
|
|
|
# Single element inside parens: treat as flat symbol
|
|
|
|
|
return [_parse_flat_symbol(inner)]
|
|
|
|
|
|
|
|
|
|
# 4. Single symbol
|
|
|
|
|
if expr and expr not in ('∅', 'ε'):
|
|
|
|
|
return [('symbol', expr, quantifier)]
|
|
|
|
|
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_flat_symbol(s):
|
|
|
|
|
"""Parse a single symbol with optional quantifier, no dot splitting.
|
|
|
|
|
|
|
|
|
|
Unlike _parse_parts, this treats dots as part of the symbol name
|
|
|
|
|
(e.g. 'community.docker.docker_image' stays as one symbol).
|
|
|
|
|
"""
|
|
|
|
|
s = s.strip()
|
|
|
|
|
quantifier = ''
|
|
|
|
|
if s.endswith('+?'):
|
|
|
|
|
quantifier = '+?'
|
|
|
|
|
s = s[:-2]
|
|
|
|
|
elif s.endswith('*'):
|
|
|
|
|
quantifier = '*'
|
|
|
|
|
s = s[:-1]
|
|
|
|
|
elif s.endswith('?'):
|
|
|
|
|
quantifier = '?'
|
|
|
|
|
s = s[:-1]
|
|
|
|
|
elif s.endswith('+'):
|
|
|
|
|
quantifier = '+'
|
|
|
|
|
s = s[:-1]
|
|
|
|
|
if s and s not in ('∅', 'ε'):
|
|
|
|
|
return ('symbol', s, quantifier)
|
|
|
|
|
return ('empty', '', quantifier)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _split_outer(s, sep):
|
|
|
|
|
"""Split on `sep` at the top level (not inside parentheses)."""
|
|
|
|
|
depth = 0
|
|
|
|
|
parts = []
|
|
|
|
|
cur = []
|
|
|
|
|
for ch in s:
|
|
|
|
|
if ch == '(':
|
|
|
|
|
depth += 1
|
|
|
|
|
cur.append(ch)
|
|
|
|
|
elif ch == ')':
|
|
|
|
|
depth -= 1
|
|
|
|
|
cur.append(ch)
|
|
|
|
|
elif ch == sep and depth == 0:
|
|
|
|
|
parts.append(''.join(cur))
|
|
|
|
|
cur = []
|
|
|
|
|
else:
|
|
|
|
|
cur.append(ch)
|
|
|
|
|
parts.append(''.join(cur))
|
|
|
|
|
return parts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _match_possible(token, seq, pos):
|
|
|
|
|
"""Return all possible end positions after matching this token starting at pos."""
|
|
|
|
|
ttype, tval, tquant = token
|
|
|
|
|
positions = []
|
|
|
|
|
|
|
|
|
|
if ttype == 'empty':
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
|
|
|
|
|
elif ttype == 'symbol':
|
|
|
|
|
if tquant in ('', '?'):
|
|
|
|
|
if pos < len(seq) and seq[pos] == tval:
|
|
|
|
|
positions.append(pos + 1)
|
|
|
|
|
if tquant == '?':
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
elif tquant in ('+?', '*'):
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
cnt = pos
|
|
|
|
|
while cnt < len(seq) and seq[cnt] == tval:
|
|
|
|
|
cnt += 1
|
|
|
|
|
positions.append(cnt)
|
|
|
|
|
elif tquant == '+':
|
|
|
|
|
if pos < len(seq) and seq[pos] == tval:
|
|
|
|
|
cnt = pos + 1
|
|
|
|
|
positions.append(cnt)
|
|
|
|
|
while cnt < len(seq) and seq[cnt] == tval:
|
|
|
|
|
cnt += 1
|
|
|
|
|
positions.append(cnt)
|
|
|
|
|
|
|
|
|
|
elif ttype == 'disj':
|
|
|
|
|
if tquant in ('', '?'):
|
|
|
|
|
for child in tval:
|
|
|
|
|
for ep in _match_possible(child, seq, pos):
|
|
|
|
|
positions.append(ep)
|
|
|
|
|
if tquant == '?':
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
elif tquant in ('+?', '*'):
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
for child in tval:
|
|
|
|
|
for ep in _match_possible(child, seq, pos):
|
|
|
|
|
if ep > pos:
|
|
|
|
|
positions.append(ep)
|
|
|
|
|
# After consuming one, recurse to try more
|
|
|
|
|
for ep2 in _match_possible(token, seq, ep):
|
|
|
|
|
if ep2 > ep:
|
|
|
|
|
positions.append(ep2)
|
|
|
|
|
elif tquant == '+':
|
|
|
|
|
for child in tval:
|
|
|
|
|
for ep in _match_possible(child, seq, pos):
|
|
|
|
|
if ep > pos:
|
|
|
|
|
positions.append(ep)
|
|
|
|
|
for ep2 in _match_possible(token, seq, ep):
|
|
|
|
|
if ep2 > ep:
|
|
|
|
|
positions.append(ep2)
|
|
|
|
|
|
|
|
|
|
elif ttype == 'concat':
|
|
|
|
|
# Match all children sequentially
|
|
|
|
|
def _match_seq(children, start):
|
|
|
|
|
cur = [start]
|
|
|
|
|
for child in children:
|
|
|
|
|
next_cur = []
|
|
|
|
|
for p in cur:
|
|
|
|
|
next_cur.extend(_match_possible(child, seq, p))
|
|
|
|
|
cur = next_cur
|
|
|
|
|
if not cur:
|
|
|
|
|
break
|
|
|
|
|
return cur
|
|
|
|
|
if tquant in ('', '?'):
|
|
|
|
|
positions.extend(_match_seq(tval, pos))
|
|
|
|
|
if tquant == '?':
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
elif tquant in ('+?', '*'):
|
|
|
|
|
positions.append(pos)
|
|
|
|
|
inner_end = _match_seq(tval, pos)
|
|
|
|
|
for ep in inner_end:
|
|
|
|
|
if ep > pos:
|
|
|
|
|
positions.append(ep)
|
|
|
|
|
for ep2 in _match_possible(token, seq, ep):
|
|
|
|
|
if ep2 > ep:
|
|
|
|
|
positions.append(ep2)
|
|
|
|
|
elif tquant == '+':
|
|
|
|
|
inner_end = _match_seq(tval, pos)
|
|
|
|
|
for ep in inner_end:
|
|
|
|
|
if ep > pos:
|
|
|
|
|
positions.append(ep)
|
|
|
|
|
for ep2 in _match_possible(token, seq, ep):
|
|
|
|
|
if ep2 > ep:
|
|
|
|
|
positions.append(ep2)
|
|
|
|
|
|
|
|
|
|
return positions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _match_tokens(tokens, seq, pos=0):
|
|
|
|
|
"""Try to match tokens against seq starting at pos. Returns max position or None."""
|
|
|
|
|
cur = [pos]
|
|
|
|
|
for token in tokens:
|
|
|
|
|
next_cur = []
|
|
|
|
|
for p in cur:
|
|
|
|
|
next_cur.extend(_match_possible(token, seq, p))
|
|
|
|
|
cur = next_cur
|
|
|
|
|
if not cur:
|
|
|
|
|
return None
|
|
|
|
|
return max(cur) if cur else pos
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _matches(grammar, sequence):
|
|
|
|
|
"""Check if a sequence matches the grammar."""
|
|
|
|
|
try:
|
|
|
|
|
tokens = _parse_parts(grammar.strip())
|
|
|
|
|
if not tokens:
|
|
|
|
|
return False
|
|
|
|
|
end = _match_tokens(tokens, sequence)
|
|
|
|
|
if end is None:
|
|
|
|
|
return False
|
|
|
|
|
return end == len(sequence)
|
|
|
|
|
except Exception:
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def mdl_score_simple(grammar, sequences):
    """Score `grammar` against `sequences` by delegating to `mdl_score`.

    The value comes straight from `mdl_score` (imported from `.mdl`);
    callers in this module treat a lower score as better.

    NOTE(review): the earlier docstring described the score as the paper's
    MDL definition (Bex et al.): model cost (number of alphabet symbol
    occurrences in the expression) plus data cost (Σ log₂ |L(r)| at the
    length of each sequence). That formula is not visible from this
    function; confirm against `.mdl`.
    """
    score = mdl_score(grammar, sequences)
    return score
|
|
|
|
|
|
|
|
|
|
|
2026-07-01 14:50:09 +02:00
|
|
|
def _run_idregex(sequences, kmax, N):
    """Run standalone iDRegEx and score what it finds.

    Returns:
        ``(grammar, score)`` when `idregex` yields a non-empty grammar other
        than '∅', otherwise ``(None, inf)``.
    """
    grammar = idregex(sequences, kmax=kmax, N=N)
    if not grammar or grammar == '∅':
        return None, float('inf')
    return grammar, mdl_score_simple(grammar, sequences)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _run_kore(sequences, kmax, N):
    """Run kOREInference (Algorithm 4 with MDL) and score its expression.

    Args:
        sequences: Sequences handed to ``kOREInference.infer``.
        kmax: Passed to ``kOREInference`` as ``k_max``.
        N: Passed to ``kOREInference`` as ``N``.

    Returns:
        ``(grammar, score)``, or ``(None, inf)`` when inference yields no
        result or an empty / '∅' expression.
    """
    kore = kOREInference(k_max=kmax, N=N)
    result = kore.infer(sequences)
    if not result:
        return None, float('inf')

    # Only the middle element of the result triple (the expression) is used.
    _, expr, _ = result
    # Same contract as _run_idregex: never score or return an empty grammar.
    if not expr or expr == '∅':
        return None, float('inf')
    return expr, mdl_score_simple(expr, sequences)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_ALGO_NAMES = {
|
|
|
|
|
'crx': 'CRX',
|
|
|
|
|
'idregex': 'iDRegEx',
|
|
|
|
|
'koreinference': 'kOREInference',
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _run_crx(sequences, kmax, N):
    """Run CRX once, return (grammar, score) or (None, inf).

    `kmax` and `N` are unused; they are accepted so that every entry of
    `_ALGORITHMS` can be called with the same three arguments.
    """
    # Infer a single time and score that exact grammar; the previous lambda
    # ran the inference twice.
    g = CRX().infer(sequences)
    if g and g != '∅':
        return g, mdl_score_simple(g, sequences)
    return None, float('inf')


# Runner for each lower-case algorithm key. Every runner takes
# (sequences, kmax, N) and returns (grammar, score) or (None, inf).
_ALGORITHMS = {
    'crx': _run_crx,
    'idregex': _run_idregex,
    'koreinference': _run_kore,
}
|
|
|
|
|
|
|
|
|
|
|
2026-07-01 09:51:41 +02:00
|
|
|
def infer_ensemble(sequences, kmax=2, N=3, prefer=None):
    """Run all applicable algorithms and return the best by MDL score.

    Args:
        sequences: List of sequences, each a list of strings.
        kmax: Maximum k for k-ORE inference (iDRegEx, kOREInference).
        N: Number of random trials for k-ORE inference.
        prefer: Optional — 'crx', 'idregex', or 'koreinference'
            (case-insensitive) to skip the ensemble and return only that
            algorithm's result. Any other value is ignored and the full
            ensemble runs.

    Returns:
        dict with keys:
            best: {algorithm, grammar, mdl_score}, or None when no grammar
                was produced
            all: [{algorithm, grammar, mdl_score}, ...] sorted by ascending
                score (empty when no grammar was produced)
            why: str explaining the choice
    """
    if prefer and prefer.lower() in _ALGORITHMS:
        key = prefer.lower()
        algo_name = _ALGO_NAMES.get(key, key)
        g, score = _ALGORITHMS[key](sequences, kmax, N)
        if g and g != '∅':
            entry = {'algorithm': algo_name, 'grammar': g, 'mdl_score': round(score, 2)}
            return {
                'best': entry,
                # A separate dict, so mutating one entry leaves the other intact.
                'all': [dict(entry)],
                'why': f"Requested {algo_name} only.",
            }
        return {
            'best': None,
            'all': [],
            'why': f"{algo_name} returned ∅ (no grammar found).",
        }

    results = []

    # 1. CRX
    crx_g = CRX().infer(sequences)
    if crx_g and crx_g != '∅':
        results.append(('CRX', crx_g, mdl_score_simple(crx_g, sequences)))

    # 2. iDRegEx (standalone, langsize-based)
    idr_g, idr_score = _run_idregex(sequences, kmax, N)
    if idr_g:
        results.append(('iDRegEx', idr_g, idr_score))

    # 3. kOREInference (Algorithm 4 with MDL scoring)
    kore_g, kore_score = _run_kore(sequences, kmax, N)
    if kore_g:
        results.append(('kOREInference', kore_g, kore_score))

    # Safety net: drop any '∅' grammar a runner may still have handed back.
    results = [r for r in results if r[1] and r[1] != '∅']
    if not results:
        return {
            'best': None,
            'all': [],
            'why': "No algorithm produced a non-empty grammar.",
        }

    # Lowest score wins; the sort is stable, so ties keep the order above.
    results.sort(key=lambda x: x[2])
    best = results[0]
    all_results = [
        {'algorithm': a, 'grammar': g, 'mdl_score': round(s, 2)}
        for a, g, s in results
    ]

    why_parts = []
    if len(results) == 1:
        why_parts.append(f"Only {results[0][0]} produced a result.")
    else:
        scores_str = ', '.join(f"{name}={score:.1f}" for name, _, score in results)
        why_parts.append(f"Scores: {scores_str}.")

        # Every remaining grammar is non-empty, so each gets a match rate.
        match_strs = []
        for r_algo, r_grammar, _ in results:
            m = sum(1 for s in sequences if _matches(r_grammar, s))
            match_strs.append(f"{r_algo}={m}/{len(sequences)}")
        why_parts.append(f"Match rates: {', '.join(match_strs)}.")

        why_parts.append(f"{best[0]} selected (MDL score {best[2]:.1f}).")

    return {
        'best': {
            'algorithm': best[0],
            'grammar': best[1],
            'mdl_score': round(best[2], 2),
        },
        'all': all_results,
        'why': ' '.join(why_parts),
    }
|