grammar-inference-engine/bex/idregex.py
tobjend 7c00c6713d Initial commit: BEX-based grammar inference engine
- CRX: direct CHARE inference (Algorithm 7, TODS 2010)
- iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010)
- RWR₀: SORE repair (Algorithm 6, TODS 2010)
- rwr²: k-ORE extraction (Algorithm 3, arXiv 2010)
- SOA, k-OA, iKoa, 2T-INF, Baum-Welch
- Ansible role grammar adapter
- Generic YAML key-path converter
- 28 tests, all passing
2026-07-01 08:01:16 +02:00

202 lines
5.5 KiB
Python

"""iDRegEx — Algorithm 4 (arXiv 1004.2372)."""
from .ikoa import ikoa
from .rwrsq import rwr_sq
from .expr import alphabet
def is_deterministic(expr):
"""Check if a k-ORE is deterministic (Glushkov determinism).
A k-ORE is deterministic iff for every subexpression (r|s),
first(r) ∩ first(s) = ∅.
"""
if not expr or expr == '' or expr == 'ε':
return True
return _check_det(expr)
def _check_det(expr):
"""Recursive determinism check."""
depth = 0
i = 0
while i < len(expr):
if expr[i] == '(':
if depth == 0:
start = i
depth += 1
elif expr[i] == ')':
depth -= 1
if depth == 0:
inner = expr[start + 1:i]
if '|' in inner:
alts = _split_or(inner)
first_sets = []
for alt in alts:
fs = _first_set(alt.strip())
first_sets.append(fs)
for j, fs1 in enumerate(first_sets):
for fs2 in first_sets[j + 1:]:
if fs1 & fs2:
return False
for alt in alts:
if not _check_det(alt.strip()):
return False
else:
if not _check_det(inner):
return False
elif expr[i] == '+':
pass
elif expr[i] == '?':
pass
i += 1
return True
def _first_set(expr):
"""Compute first(r) — set of alphabet symbols that can appear at the start of a word in L(r)."""
if not expr or expr == '':
return set()
if expr == 'ε':
return set()
alpha = alphabet(expr)
if expr in alpha:
return {expr}
if expr.endswith('?') or expr.endswith('+'):
inner = expr.rstrip('+?')
return _first_set(inner)
if '.' in expr:
parts = expr.split('.')
return _first_set(parts[0])
if expr.startswith('(') and '|' in expr:
inner = expr[1:-1]
alts = _split_or(inner)
result = set()
for a in alts:
result |= _first_set(a.strip())
return result
return alpha
def _split_or(s):
"""Split disjunction string at top-level | operators."""
depth = 0
parts = []
cur = []
for ch in s:
if ch == '(':
depth += 1
cur.append(ch)
elif ch == ')':
depth -= 1
cur.append(ch)
elif ch == '|' and depth == 0:
parts.append(''.join(cur))
cur = []
else:
cur.append(ch)
parts.append(''.join(cur))
return parts
def _lang_size(expr, n=None):
"""|L(r)≤n| — number of words of length ≤ n in L(r).
n = 2m + 1 where m = |r| excluding operators.
Uses simple structural approximation.
"""
if not expr or expr == '':
return 0
if expr == 'ε':
return 1
m = len(alphabet(expr))
if n is None:
n = 2 * m + 1
total = 0
for length in range(n + 1):
total += _count_len(expr, length)
return total
def _count_len(expr, length):
if length < 0:
return 0
if not expr or expr == '':
return 0
if expr == 'ε':
return 1 if length == 0 else 0
alpha = alphabet(expr)
if expr in alpha:
return 1 if length == 1 else 0
if expr.endswith('+'):
inner = expr[:-1]
if inner.endswith('?'):
inner = inner[:-1]
total = 0
for rep in range(1, length + 1):
total += _count_repeat(inner, rep, length)
return total
if expr.endswith('?'):
inner = expr[:-1]
return _count_len(inner, length) + (1 if length == 0 else 0)
if '.' in expr:
parts = expr.split('.')
return _count_concat(parts, length, 0)
if expr.startswith('(') and '|' in expr:
inner = expr[1:-1]
alts = _split_or(inner)
return sum(_count_len(a.strip(), length) for a in alts)
return 0
def _count_concat(parts, length, idx):
if idx >= len(parts):
return 1 if length == 0 else 0
total = 0
for take in range(length + 1):
cnt = _count_len(parts[idx], take)
if cnt:
total += cnt * _count_concat(parts, length - take, idx + 1)
return total
def _count_repeat(inner, rep, length):
if rep == 0:
return 1 if length == 0 else 0
total = 0
for take in range(length + 1):
cnt = _count_len(inner, take)
if cnt:
total += cnt * _count_repeat(inner, rep - 1, length - take)
return total
def idregex(sequences, kmax=4, N=5, criterion='langsize'):
"""
|———— Algorithm 4: iDRegEx ————|
Require: sample S
Ensure: k-ORE r
1: C ← ∅
2: for k = 1 to kmax do
3: for n = 1 to N do
4: G ← iKoa(S, k)
5: if rwr²(G) is deterministic then
6: add rwr²(G) to C
7: return best(C)
"""
C = set()
for k in range(1, kmax + 1):
for _ in range(N):
G = ikoa(sequences, k, num_trials=1)
if G is None:
continue
expr = rwr_sq(G)
if expr and expr not in ('', 'ε'):
if is_deterministic(expr):
C.add(expr)
if not C:
return None
if criterion == 'langsize':
return min(C, key=lambda e: (_lang_size(e), len(e)))
return min(C, key=lambda e: len(e))