104 lines
3 KiB
Python
104 lines
3 KiB
Python
"""
|
|
kOREInference — Algorithm 4: iDRegEx (arXiv 1004.2372).
|
|
|
|
Implements the full iDRegEx pipeline:
|
|
1. For k = 1..kmax, for n = 1..N:
|
|
a. iKoa (Algorithm 1) — build a deterministic k-OA from S
|
|
b. rwr² (Algorithm 3) — translate k-OA to k-ORE expression
|
|
c. Validate determinism and k-occurrence
|
|
2. Score all valid candidates by MDL (model cost + data cost)
|
|
3. Return the best k-ORE
|
|
|
|
Unlike the PTA→Shrink→Repair approach from Bex 2008, this follows
|
|
the journal paper (arXiv 1004.2372) exactly.
|
|
"""
|
|
|
|
from .ikoa import ikoa
|
|
from .rwrsq import rwr_sq
|
|
from .idregex import is_deterministic
|
|
from .mdl import mdl_score
|
|
|
|
|
|
def validate_k_ore(expr, k, alphabet_set=None):
|
|
"""
|
|
Check if a k-ORE satisfies the k-occurrence condition.
|
|
|
|
The k-occurrence condition: for every subexpression (r|s),
|
|
each alphabet symbol appears at most k times across all
|
|
alternatives combined.
|
|
|
|
Simplified implementation: count raw alphabet symbol
|
|
occurrences in the expression string. A symbol appearing
|
|
more than k times violates the condition.
|
|
|
|
Returns:
|
|
(bool, str): (passes, explanation)
|
|
"""
|
|
if not expr or expr in ('∅', 'ε'):
|
|
return True, "OK"
|
|
|
|
from .expr import alphabet
|
|
syms = alphabet_set or alphabet(expr)
|
|
|
|
counts = {}
|
|
for sym in syms:
|
|
import re
|
|
count = len(re.findall(rf'(?<![a-zA-Z_/]){re.escape(sym)}(?![a-zA-Z_/])', expr))
|
|
if count > 0:
|
|
counts[sym] = count
|
|
|
|
violations = [f"{s}:{c}" for s, c in sorted(counts.items()) if c > k]
|
|
if violations:
|
|
return False, f"k={k} violations: {', '.join(violations)}"
|
|
return True, "OK"
|
|
|
|
|
|
class kOREInference:
|
|
"""
|
|
|———— Algorithm 4: iDRegEx ————|
|
|
Require: sample S, kmax
|
|
Ensure: k-ORE r
|
|
|
|
1: C ← ∅
|
|
2: for k = 1 to kmax do
|
|
3: for n = 1 to N do
|
|
4: G ← iKoa(S, k)
|
|
5: if rwr²(G) is deterministic then
|
|
6: add rwr²(G) to C
|
|
7: return best(C) by MDL
|
|
"""
|
|
|
|
def __init__(self, k_max=5, N=5):
|
|
self.k_max = k_max
|
|
self.N = N
|
|
|
|
def infer(self, sequences):
|
|
"""
|
|
Infer the best k-ORE for the given sequences.
|
|
|
|
Returns:
|
|
(koa_automaton, expression_string, best_k) or None if no valid
|
|
k-ORE can be inferred.
|
|
"""
|
|
sequences = [s for s in sequences if s]
|
|
if not sequences:
|
|
return None
|
|
|
|
candidates = []
|
|
|
|
for k in range(1, self.k_max + 1):
|
|
for _ in range(self.N):
|
|
G = ikoa(sequences, k, num_trials=1)
|
|
if G is None:
|
|
continue
|
|
expr = rwr_sq(G)
|
|
if expr and expr not in ('∅', 'ε'):
|
|
if is_deterministic(expr):
|
|
valid, _ = validate_k_ore(expr, k)
|
|
if valid:
|
|
candidates.append((G, expr, k))
|
|
|
|
if not candidates:
|
|
return None
|
|
|
|
return min(candidates, key=lambda c: mdl_score(c[1], sequences))
|