- CRX: direct CHARE inference (Algorithm 7, TODS 2010) - iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010) - RWR₀: SORE repair (Algorithm 6, TODS 2010) - rwr²: k-ORE extraction (Algorithm 3, arXiv 2010) - SOA, k-OA, iKoa, 2T-INF, Baum-Welch - Ansible role grammar adapter - Generic YAML key-path converter - 28 tests, all passing
202 lines
5.5 KiB
Python
202 lines
5.5 KiB
Python
"""iDRegEx — Algorithm 4 (arXiv 1004.2372)."""
|
|
|
|
from .ikoa import ikoa
|
|
from .rwrsq import rwr_sq
|
|
from .expr import alphabet
|
|
|
|
|
|
def is_deterministic(expr):
|
|
"""Check if a k-ORE is deterministic (Glushkov determinism).
|
|
|
|
A k-ORE is deterministic iff for every subexpression (r|s),
|
|
first(r) ∩ first(s) = ∅.
|
|
"""
|
|
if not expr or expr == '∅' or expr == 'ε':
|
|
return True
|
|
return _check_det(expr)
|
|
|
|
|
|
def _check_det(expr):
|
|
"""Recursive determinism check."""
|
|
depth = 0
|
|
i = 0
|
|
while i < len(expr):
|
|
if expr[i] == '(':
|
|
if depth == 0:
|
|
start = i
|
|
depth += 1
|
|
elif expr[i] == ')':
|
|
depth -= 1
|
|
if depth == 0:
|
|
inner = expr[start + 1:i]
|
|
if '|' in inner:
|
|
alts = _split_or(inner)
|
|
first_sets = []
|
|
for alt in alts:
|
|
fs = _first_set(alt.strip())
|
|
first_sets.append(fs)
|
|
for j, fs1 in enumerate(first_sets):
|
|
for fs2 in first_sets[j + 1:]:
|
|
if fs1 & fs2:
|
|
return False
|
|
for alt in alts:
|
|
if not _check_det(alt.strip()):
|
|
return False
|
|
else:
|
|
if not _check_det(inner):
|
|
return False
|
|
elif expr[i] == '+':
|
|
pass
|
|
elif expr[i] == '?':
|
|
pass
|
|
i += 1
|
|
return True
|
|
|
|
|
|
def _first_set(expr):
|
|
"""Compute first(r) — set of alphabet symbols that can appear at the start of a word in L(r)."""
|
|
if not expr or expr == '∅':
|
|
return set()
|
|
if expr == 'ε':
|
|
return set()
|
|
alpha = alphabet(expr)
|
|
if expr in alpha:
|
|
return {expr}
|
|
if expr.endswith('?') or expr.endswith('+'):
|
|
inner = expr.rstrip('+?')
|
|
return _first_set(inner)
|
|
if '.' in expr:
|
|
parts = expr.split('.')
|
|
return _first_set(parts[0])
|
|
if expr.startswith('(') and '|' in expr:
|
|
inner = expr[1:-1]
|
|
alts = _split_or(inner)
|
|
result = set()
|
|
for a in alts:
|
|
result |= _first_set(a.strip())
|
|
return result
|
|
return alpha
|
|
|
|
|
|
def _split_or(s):
|
|
"""Split disjunction string at top-level | operators."""
|
|
depth = 0
|
|
parts = []
|
|
cur = []
|
|
for ch in s:
|
|
if ch == '(':
|
|
depth += 1
|
|
cur.append(ch)
|
|
elif ch == ')':
|
|
depth -= 1
|
|
cur.append(ch)
|
|
elif ch == '|' and depth == 0:
|
|
parts.append(''.join(cur))
|
|
cur = []
|
|
else:
|
|
cur.append(ch)
|
|
parts.append(''.join(cur))
|
|
return parts
|
|
|
|
|
|
def _lang_size(expr, n=None):
|
|
"""|L(r)≤n| — number of words of length ≤ n in L(r).
|
|
|
|
n = 2m + 1 where m = |r| excluding operators.
|
|
Uses simple structural approximation.
|
|
"""
|
|
if not expr or expr == '∅':
|
|
return 0
|
|
if expr == 'ε':
|
|
return 1
|
|
m = len(alphabet(expr))
|
|
if n is None:
|
|
n = 2 * m + 1
|
|
total = 0
|
|
for length in range(n + 1):
|
|
total += _count_len(expr, length)
|
|
return total
|
|
|
|
|
|
def _count_len(expr, length):
|
|
if length < 0:
|
|
return 0
|
|
if not expr or expr == '∅':
|
|
return 0
|
|
if expr == 'ε':
|
|
return 1 if length == 0 else 0
|
|
alpha = alphabet(expr)
|
|
if expr in alpha:
|
|
return 1 if length == 1 else 0
|
|
if expr.endswith('+'):
|
|
inner = expr[:-1]
|
|
if inner.endswith('?'):
|
|
inner = inner[:-1]
|
|
total = 0
|
|
for rep in range(1, length + 1):
|
|
total += _count_repeat(inner, rep, length)
|
|
return total
|
|
if expr.endswith('?'):
|
|
inner = expr[:-1]
|
|
return _count_len(inner, length) + (1 if length == 0 else 0)
|
|
if '.' in expr:
|
|
parts = expr.split('.')
|
|
return _count_concat(parts, length, 0)
|
|
if expr.startswith('(') and '|' in expr:
|
|
inner = expr[1:-1]
|
|
alts = _split_or(inner)
|
|
return sum(_count_len(a.strip(), length) for a in alts)
|
|
return 0
|
|
|
|
|
|
def _count_concat(parts, length, idx):
|
|
if idx >= len(parts):
|
|
return 1 if length == 0 else 0
|
|
total = 0
|
|
for take in range(length + 1):
|
|
cnt = _count_len(parts[idx], take)
|
|
if cnt:
|
|
total += cnt * _count_concat(parts, length - take, idx + 1)
|
|
return total
|
|
|
|
|
|
def _count_repeat(inner, rep, length):
|
|
if rep == 0:
|
|
return 1 if length == 0 else 0
|
|
total = 0
|
|
for take in range(length + 1):
|
|
cnt = _count_len(inner, take)
|
|
if cnt:
|
|
total += cnt * _count_repeat(inner, rep - 1, length - take)
|
|
return total
|
|
|
|
|
|
def idregex(sequences, kmax=4, N=5, criterion='langsize'):
|
|
"""
|
|
|———— Algorithm 4: iDRegEx ————|
|
|
Require: sample S
|
|
Ensure: k-ORE r
|
|
|
|
1: C ← ∅
|
|
2: for k = 1 to kmax do
|
|
3: for n = 1 to N do
|
|
4: G ← iKoa(S, k)
|
|
5: if rwr²(G) is deterministic then
|
|
6: add rwr²(G) to C
|
|
7: return best(C)
|
|
"""
|
|
C = set()
|
|
for k in range(1, kmax + 1):
|
|
for _ in range(N):
|
|
G = ikoa(sequences, k, num_trials=1)
|
|
if G is None:
|
|
continue
|
|
expr = rwr_sq(G)
|
|
if expr and expr not in ('∅', 'ε'):
|
|
if is_deterministic(expr):
|
|
C.add(expr)
|
|
if not C:
|
|
return None
|
|
if criterion == 'langsize':
|
|
return min(C, key=lambda e: (_lang_size(e), len(e)))
|
|
return min(C, key=lambda e: len(e))
|