# grammar-inference-engine/bex/idregex.py

"""iDRegEx — Algorithm 4 (arXiv 1004.2372)."""

from .ikoa import ikoa
from .rwrsq import rwr_sq
from .expr import alphabet


def is_deterministic(expr):
    """Report whether the k-ORE string ``expr`` passes the determinism check.

    A falsy expression, ``'∅'`` and ``'ε'`` are accepted outright. Anything
    else is delegated to ``_check_det``, which requires the alternatives of
    every disjunction (r|s) to start with disjoint symbols, i.e.
    first(r) ∩ first(s) = ∅.
    """
    trivial = not expr or expr in ('∅', 'ε')
    return True if trivial else _check_det(expr)


def _check_det(expr):
    """Recursive determinism check."""
    depth = 0
    i = 0
    while i < len(expr):
        if expr[i] == '(':
            if depth == 0:
                start = i
            depth += 1
        elif expr[i] == ')':
            depth -= 1
            if depth == 0:
                inner = expr[start + 1:i]
                if '|' in inner:
                    alts = _split_or(inner)
                    first_sets = []
                    for alt in alts:
                        fs = _first_set(alt.strip())
                        first_sets.append(fs)
                    for j, fs1 in enumerate(first_sets):
                        for fs2 in first_sets[j + 1:]:
                            if fs1 & fs2:
                                return False
                    for alt in alts:
                        if not _check_det(alt.strip()):
                            return False
                else:
                    if not _check_det(inner):
                        return False
        elif expr[i] == '+':
            pass
        elif expr[i] == '?':
            pass
        i += 1
    return True


def _first_set(expr):
    """Compute first(r) — set of alphabet symbols that can appear at the start of a word in L(r)."""
    if not expr or expr == '∅':
        return set()
    if expr == 'ε':
        return set()
    alpha = alphabet(expr)
    if expr in alpha:
        return {expr}
    if expr.endswith('?') or expr.endswith('+'):
        inner = expr.rstrip('+?')
        return _first_set(inner)
    if '.' in expr:
        parts = expr.split('.')
        return _first_set(parts[0])
    if expr.startswith('(') and '|' in expr:
        inner = expr[1:-1]
        alts = _split_or(inner)
        result = set()
        for a in alts:
            result |= _first_set(a.strip())
        return result
    return alpha


def _split_or(s):
    """Split disjunction string at top-level | operators."""
    depth = 0
    parts = []
    cur = []
    for ch in s:
        if ch == '(':
            depth += 1
            cur.append(ch)
        elif ch == ')':
            depth -= 1
            cur.append(ch)
        elif ch == '|' and depth == 0:
            parts.append(''.join(cur))
            cur = []
        else:
            cur.append(ch)
    parts.append(''.join(cur))
    return parts


def _lang_size(expr, n=None):
    """|L(r)≤n| — number of words of length ≤ n in L(r).

    n = 2m + 1 where m = |r| excluding operators.
    Uses simple structural approximation.
    """
    if not expr or expr == '∅':
        return 0
    if expr == 'ε':
        return 1
    m = len(alphabet(expr))
    if n is None:
        n = 2 * m + 1
    total = 0
    for length in range(n + 1):
        total += _count_len(expr, length)
    return total


def _count_len(expr, length):
    if length < 0:
        return 0
    if not expr or expr == '∅':
        return 0
    if expr == 'ε':
        return 1 if length == 0 else 0
    alpha = alphabet(expr)
    if expr in alpha:
        return 1 if length == 1 else 0
    if expr.endswith('+'):
        inner = expr[:-1]
        if inner.endswith('?'):
            inner = inner[:-1]
        total = 0
        for rep in range(1, length + 1):
            total += _count_repeat(inner, rep, length)
        return total
    if expr.endswith('?'):
        inner = expr[:-1]
        return _count_len(inner, length) + (1 if length == 0 else 0)
    if '.' in expr:
        parts = expr.split('.')
        return _count_concat(parts, length, 0)
    if expr.startswith('(') and '|' in expr:
        inner = expr[1:-1]
        alts = _split_or(inner)
        return sum(_count_len(a.strip(), length) for a in alts)
    return 0


def _count_concat(parts, length, idx):
    if idx >= len(parts):
        return 1 if length == 0 else 0
    total = 0
    for take in range(length + 1):
        cnt = _count_len(parts[idx], take)
        if cnt:
            total += cnt * _count_concat(parts, length - take, idx + 1)
    return total


def _count_repeat(inner, rep, length):
    if rep == 0:
        return 1 if length == 0 else 0
    total = 0
    for take in range(length + 1):
        cnt = _count_len(inner, take)
        if cnt:
            total += cnt * _count_repeat(inner, rep - 1, length - take)
    return total


def idregex(sequences, kmax=4, N=5, criterion='langsize'):
    """
    |———— Algorithm 4: iDRegEx ————|
    Require: sample S
    Ensure: k-ORE r

    1: C ← ∅
    2: for k = 1 to kmax do
    3:   for n = 1 to N do
    4:     G ← iKoa(S, k)
    5:     if rwr²(G) is deterministic then
    6:       add rwr²(G) to C
    7: return best(C)

    Parameters:
        sequences: the sample S, passed unchanged to ``ikoa``.
        kmax: largest k tried; k runs over 1..kmax.
        N: number of ``ikoa`` attempts per k, each with ``num_trials=1``.
        criterion: ``'langsize'`` picks the candidate with the smallest
            ``_lang_size``, breaking ties by shorter expression; any other
            value picks the shortest expression.

    Returns:
        The best candidate expression, or None when no attempt produced a
        deterministic expression other than ``'∅'`` / ``'ε'``.

    Candidates are collected in a set, whose iteration order is not stable
    between runs. The expression text is therefore the final tie-breaker,
    so the same candidate set always yields the same result.
    """
    candidates = set()
    for k in range(1, kmax + 1):
        for _ in range(N):
            G = ikoa(sequences, k, num_trials=1)
            if G is None:
                continue
            expr = rwr_sq(G)
            if not expr or expr in ('∅', 'ε'):
                # Empty and trivial results are never candidates.
                continue
            if is_deterministic(expr):
                candidates.add(expr)
    if not candidates:
        return None
    if criterion == 'langsize':
        return min(candidates, key=lambda e: (_lang_size(e), len(e), e))
    return min(candidates, key=lambda e: (len(e), e))