"""iDRegEx — Algorithm 4 (arXiv 1004.2372).""" from .ikoa import ikoa from .rwrsq import rwr_sq from .expr import alphabet def is_deterministic(expr): """Check if a k-ORE is deterministic (Glushkov determinism). A k-ORE is deterministic iff for every subexpression (r|s), first(r) ∩ first(s) = ∅. """ if not expr or expr == '∅' or expr == 'ε': return True return _check_det(expr) def _check_det(expr): """Recursive determinism check.""" depth = 0 i = 0 while i < len(expr): if expr[i] == '(': if depth == 0: start = i depth += 1 elif expr[i] == ')': depth -= 1 if depth == 0: inner = expr[start + 1:i] if '|' in inner: alts = _split_or(inner) first_sets = [] for alt in alts: fs = _first_set(alt.strip()) first_sets.append(fs) for j, fs1 in enumerate(first_sets): for fs2 in first_sets[j + 1:]: if fs1 & fs2: return False for alt in alts: if not _check_det(alt.strip()): return False else: if not _check_det(inner): return False elif expr[i] == '+': pass elif expr[i] == '?': pass i += 1 return True def _first_set(expr): """Compute first(r) — set of alphabet symbols that can appear at the start of a word in L(r).""" if not expr or expr == '∅': return set() if expr == 'ε': return set() alpha = alphabet(expr) if expr in alpha: return {expr} if expr.endswith('?') or expr.endswith('+'): inner = expr.rstrip('+?') return _first_set(inner) if '.' in expr: parts = expr.split('.') return _first_set(parts[0]) if expr.startswith('(') and '|' in expr: inner = expr[1:-1] alts = _split_or(inner) result = set() for a in alts: result |= _first_set(a.strip()) return result return alpha def _split_or(s): """Split disjunction string at top-level | operators.""" depth = 0 parts = [] cur = [] for ch in s: if ch == '(': depth += 1 cur.append(ch) elif ch == ')': depth -= 1 cur.append(ch) elif ch == '|' and depth == 0: parts.append(''.join(cur)) cur = [] else: cur.append(ch) parts.append(''.join(cur)) return parts def _lang_size(expr, n=None): """|L(r)≤n| — number of words of length ≤ n in L(r). n = 2m + 1 where m = |r| excluding operators. Uses simple structural approximation. """ if not expr or expr == '∅': return 0 if expr == 'ε': return 1 m = len(alphabet(expr)) if n is None: n = 2 * m + 1 total = 0 for length in range(n + 1): total += _count_len(expr, length) return total def _count_len(expr, length): if length < 0: return 0 if not expr or expr == '∅': return 0 if expr == 'ε': return 1 if length == 0 else 0 alpha = alphabet(expr) if expr in alpha: return 1 if length == 1 else 0 if expr.endswith('+'): inner = expr[:-1] if inner.endswith('?'): inner = inner[:-1] total = 0 for rep in range(1, length + 1): total += _count_repeat(inner, rep, length) return total if expr.endswith('?'): inner = expr[:-1] return _count_len(inner, length) + (1 if length == 0 else 0) if '.' in expr: parts = expr.split('.') return _count_concat(parts, length, 0) if expr.startswith('(') and '|' in expr: inner = expr[1:-1] alts = _split_or(inner) return sum(_count_len(a.strip(), length) for a in alts) return 0 def _count_concat(parts, length, idx): if idx >= len(parts): return 1 if length == 0 else 0 total = 0 for take in range(length + 1): cnt = _count_len(parts[idx], take) if cnt: total += cnt * _count_concat(parts, length - take, idx + 1) return total def _count_repeat(inner, rep, length): if rep == 0: return 1 if length == 0 else 0 total = 0 for take in range(length + 1): cnt = _count_len(inner, take) if cnt: total += cnt * _count_repeat(inner, rep - 1, length - take) return total def idregex(sequences, kmax=4, N=5, criterion='langsize'): """ |———— Algorithm 4: iDRegEx ————| Require: sample S Ensure: k-ORE r 1: C ← ∅ 2: for k = 1 to kmax do 3: for n = 1 to N do 4: G ← iKoa(S, k) 5: if rwr²(G) is deterministic then 6: add rwr²(G) to C 7: return best(C) """ C = set() for k in range(1, kmax + 1): for _ in range(N): G = ikoa(sequences, k, num_trials=1) if G is None: continue expr = rwr_sq(G) if expr and expr not in ('∅', 'ε'): if is_deterministic(expr): C.add(expr) if not C: return None if criterion == 'langsize': return min(C, key=lambda e: (_lang_size(e), len(e))) return min(C, key=lambda e: len(e))