- CRX: direct CHARE inference (Algorithm 7, TODS 2010) - iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010) - RWR₀: SORE repair (Algorithm 6, TODS 2010) - rwr²: k-ORE extraction (Algorithm 3, arXiv 2010) - SOA, k-OA, iKoa, 2T-INF, Baum-Welch - Ansible role grammar adapter - Generic YAML key-path converter - 28 tests, all passing
164 lines
4.6 KiB
Python
164 lines
4.6 KiB
Python
"""Expression utilities for SOREs and k-OREs."""
|
|
|
|
import re
|
|
|
|
|
|
def sym(s):
|
|
"""Create a simple symbol expression."""
|
|
return s
|
|
|
|
|
|
def concat(*parts):
|
|
"""Create concatenation expression."""
|
|
parts = [p for p in parts if p and p != 'ε']
|
|
if not parts:
|
|
return 'ε'
|
|
if len(parts) == 1:
|
|
return parts[0]
|
|
return '.'.join(parts)
|
|
|
|
|
|
def disj(*parts):
|
|
"""Create disjunction expression."""
|
|
parts = [p for p in parts if p and p != '∅']
|
|
if not parts:
|
|
return '∅'
|
|
if len(parts) == 1:
|
|
return parts[0]
|
|
return '(' + '|'.join(parts) + ')'
|
|
|
|
|
|
def star(expr):
|
|
"""Create iteration expression (one or more, r+)."""
|
|
if not expr or expr in ('∅', 'ε'):
|
|
return expr
|
|
if len(expr) == 1 or (expr.startswith('(') and expr.endswith(')')):
|
|
return expr + '+'
|
|
return '(' + expr + ')+'
|
|
|
|
|
|
def optional(expr):
|
|
"""Create optional expression (r?)."""
|
|
if not expr or expr in ('∅', 'ε'):
|
|
return 'ε'
|
|
if len(expr) == 1 or (expr.startswith('(') and expr.endswith(')')):
|
|
return expr + '?'
|
|
return '(' + expr + ')?'
|
|
|
|
|
|
def alphabet(expr):
|
|
"""Return set of alphabet symbols in expression."""
|
|
cleaned = re.sub(r'[+?*().|]', ' ', expr)
|
|
result = set()
|
|
for token in cleaned.split():
|
|
token = token.strip('_0123456789')
|
|
if token and token not in ('ε', '∅'):
|
|
result.add(token)
|
|
return result
|
|
|
|
|
|
def strip_k(s):
|
|
"""Remove k-ORE markers: a_1 → a, b^(2) → b."""
|
|
result = re.sub(r'_\d+', '', s)
|
|
result = re.sub(r'\^\(\d+\)', '', result)
|
|
result = re.sub(r'^\(|\)$', '', result)
|
|
return result
|
|
|
|
|
|
def has_repeats(expr, symbol):
|
|
"""Check if a symbol appears more than once in expression."""
|
|
return expr.count(symbol) > 1
|
|
|
|
|
|
def lang_size_at_most(expr, n, alphabet_symbols=None):
|
|
"""Compute |L(r)<=n| — number of words of length ≤ n in L(r)."""
|
|
if alphabet_symbols is None:
|
|
alphabet_symbols = alphabet(expr)
|
|
if not alphabet_symbols:
|
|
return 1 if 'ε' in expr else 0
|
|
size = 0
|
|
for length in range(n + 1):
|
|
size += _count_words(expr, length, alphabet_symbols)
|
|
return size
|
|
|
|
|
|
def _count_words(expr, length, alphabet_symbols):
|
|
if length < 0:
|
|
return 0
|
|
if not expr or expr == '∅':
|
|
return 0
|
|
if expr == 'ε':
|
|
return 1 if length == 0 else 0
|
|
if expr in alphabet_symbols:
|
|
return 1 if length == 1 else 0
|
|
if '+' in expr:
|
|
inner = expr.rstrip('+')
|
|
if inner.endswith('?'):
|
|
inner = inner[:-1]
|
|
return _count_star_words(inner, length, alphabet_symbols, 1)
|
|
if expr.endswith('?'):
|
|
inner = expr[:-1]
|
|
return _count_words(inner, length, alphabet_symbols) + (1 if length == 0 else 0)
|
|
if expr.startswith('(') and '|' in expr:
|
|
inner = expr[1:-1]
|
|
parts = _split_disjunction(inner)
|
|
return sum(_count_words(p, length, alphabet_symbols) for p in parts)
|
|
if '.' in expr:
|
|
parts = expr.split('.')
|
|
return _count_concat_words(parts, length, alphabet_symbols, 0)
|
|
if ')' in expr or '(' in expr:
|
|
return 0
|
|
return 0
|
|
|
|
|
|
def _count_concat_words(parts, length, alphabet_symbols, idx):
|
|
if idx >= len(parts):
|
|
return 1 if length == 0 else 0
|
|
total = 0
|
|
for take in range(length + 1):
|
|
cnt = _count_words(parts[idx], take, alphabet_symbols)
|
|
if cnt > 0:
|
|
rest = _count_concat_words(parts, length - take, alphabet_symbols, idx + 1)
|
|
total += cnt * rest
|
|
return total
|
|
|
|
|
|
def _count_star_words(inner, length, alphabet_symbols, min_count):
|
|
total = 0
|
|
for repeat in range(min_count, length + 1):
|
|
if repeat == 0:
|
|
continue
|
|
total += _count_repeat_words(inner, repeat, length, alphabet_symbols)
|
|
return total
|
|
|
|
|
|
def _count_repeat_words(inner, repeat, length, alphabet_symbols):
|
|
if repeat == 0:
|
|
return 1 if length == 0 else 0
|
|
total = 0
|
|
for take in range(length + 1):
|
|
cnt = _count_words(inner, take, alphabet_symbols)
|
|
if cnt > 0:
|
|
rest = _count_repeat_words(inner, repeat - 1, length - take, alphabet_symbols)
|
|
total += cnt * rest
|
|
return total
|
|
|
|
|
|
def _split_disjunction(s):
|
|
depth = 0
|
|
parts = []
|
|
current = []
|
|
for ch in s:
|
|
if ch == '(':
|
|
depth += 1
|
|
current.append(ch)
|
|
elif ch == ')':
|
|
depth -= 1
|
|
current.append(ch)
|
|
elif ch == '|' and depth == 0:
|
|
parts.append(''.join(current))
|
|
current = []
|
|
else:
|
|
current.append(ch)
|
|
parts.append(''.join(current))
|
|
return parts
|