""" kOREInference — Algorithm 4: iDRegEx (arXiv 1004.2372). Implements the full iDRegEx pipeline: 1. For k = 1..kmax, for n = 1..N: a. iKoa (Algorithm 1) — build a deterministic k-OA from S b. rwr² (Algorithm 3) — translate k-OA to k-ORE expression c. Validate determinism and k-occurrence 2. Score all valid candidates by MDL (model cost + data cost) 3. Return the best k-ORE Unlike the PTA→Shrink→Repair approach from Bex 2008, this follows the journal paper (arXiv 1004.2372) exactly. """ from .ikoa import ikoa from .rwrsq import rwr_sq from .idregex import is_deterministic from .mdl import mdl_score def validate_k_ore(expr, k, alphabet_set=None): """ Check if a k-ORE satisfies the k-occurrence condition. The k-occurrence condition: for every subexpression (r|s), each alphabet symbol appears at most k times across all alternatives combined. Simplified implementation: count raw alphabet symbol occurrences in the expression string. A symbol appearing more than k times violates the condition. Returns: (bool, str): (passes, explanation) """ if not expr or expr in ('∅', 'ε'): return True, "OK" from .expr import alphabet syms = alphabet_set or alphabet(expr) counts = {} for sym in syms: import re count = len(re.findall(rf'(? 0: counts[sym] = count violations = [f"{s}:{c}" for s, c in sorted(counts.items()) if c > k] if violations: return False, f"k={k} violations: {', '.join(violations)}" return True, "OK" class kOREInference: """ |———— Algorithm 4: iDRegEx ————| Require: sample S, kmax Ensure: k-ORE r 1: C ← ∅ 2: for k = 1 to kmax do 3: for n = 1 to N do 4: G ← iKoa(S, k) 5: if rwr²(G) is deterministic then 6: add rwr²(G) to C 7: return best(C) by MDL """ def __init__(self, k_max=5, N=5): self.k_max = k_max self.N = N def infer(self, sequences): """ Infer the best k-ORE for the given sequences. Returns: (koa_automaton, expression_string, best_k) or None if no valid k-ORE can be inferred. """ sequences = [s for s in sequences if s] if not sequences: return None candidates = [] for k in range(1, self.k_max + 1): for _ in range(self.N): G = ikoa(sequences, k, num_trials=1) if G is None: continue expr = rwr_sq(G) if expr and expr not in ('∅', 'ε'): if is_deterministic(expr): valid, _ = validate_k_ore(expr, k) if valid: candidates.append((G, expr, k)) if not candidates: return None return min(candidates, key=lambda c: mdl_score(c[1], sequences))