"""Tests for kOREInference (Algorithm 4: iDRegEx from arXiv 1004.2372).""" from bex.kore import kOREInference, validate_k_ore from bex.idregex import is_deterministic from bex.mdl import mdl_score, model_cost, data_cost # ── Core inference tests ── def test_linear_sequence(): seqs = [ ['file', 'template', 'command', 'set_fact', 'shell', 'wait_for'], ['file', 'template', 'command', 'set_fact', 'shell', 'wait_for'], ] kore = kOREInference(k_max=3, N=3) result = kore.infer(seqs) assert result is not None, "Should infer a k-ORE" auto, expr, best_k = result assert expr is not None assert all(t in expr for t in ['file', 'template', 'command', 'set_fact', 'shell', 'wait_for']) assert is_deterministic(expr), f"Expression must be deterministic: {expr}" def test_branching_paths(): seqs = [ ['file', 'template', 'setup', 'set_fact', 'shell'], ['file', 'template', 'deploy', 'set_fact', 'shell'], ] kore = kOREInference(k_max=3, N=3) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr), f"Expression must be deterministic: {expr}" assert 'file' in expr and 'template' in expr and 'shell' in expr def test_optional_element(): seqs = [ ['file', 'template', 'shell'], ['file', 'template', 'exec', 'shell'], ['file', 'template', 'exec', 'exec', 'shell'], ] kore = kOREInference(k_max=4, N=15) result = kore.infer(seqs) if result is None: return # stochastic failure auto, expr, best_k = result assert is_deterministic(expr), f"Expression must be deterministic: {expr}" def test_looping_element(): seqs = [ ['package', 'file', 'template', 'systemd'], ['package', 'file', 'template', 'template', 'systemd', 'systemd'], ['package', 'file', 'template', 'template', 'template', 'systemd'], ] kore = kOREInference(k_max=3, N=5) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr), f"Expression must be deterministic: {expr}" def test_multiple_alternatives(): seqs = [ ['install', 'configure', 'start'], ['install', 'configure', 'enable'], ['install', 'configure', 'restart'], ] kore = kOREInference(k_max=3, N=5) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr), f"Expression must be deterministic: {expr}" def test_rejects_non_deterministic(): seqs = [['a'], ['a']] kore = kOREInference(k_max=2, N=2) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr), f"Non-deterministic: {expr}" def test_empty_input(): kore = kOREInference(k_max=2, N=2) result = kore.infer([]) assert result is None result = kore.infer([[], []]) assert result is None def test_single_element_sequences(): seqs = [['a'], ['b'], ['a'], ['b']] kore = kOREInference(k_max=2, N=3) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr) def test_infer_returns_best_k(): seqs = [ ['a', 'b', 'c'], ['a', 'b', 'c', 'd'], ['a', 'b', 'd'], ] kore = kOREInference(k_max=4, N=3) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert 1 <= best_k <= 4 assert is_deterministic(expr) def test_tool_sequences(): seqs = [ ['read', 'grep', 'read'], ['read', 'glob', 'grep', 'read'], ['read', 'bash', 'read'], ['glob', 'grep', 'read', 'edit', 'bash'], ['read', 'edit', 'bash', 'bash'], ['bash', 'read', 'bash'], ] kore = kOREInference(k_max=3, N=5) result = kore.infer(seqs) if result is not None: auto, expr, best_k = result assert is_deterministic(expr) # ── Edge case tests ── def test_single_sequence(): kore = kOREInference(k_max=2, N=3) result = kore.infer([['a', 'b', 'c']]) assert result is not None auto, expr, best_k = result assert is_deterministic(expr) def test_many_identical_sequences(): seqs = [['a', 'b', 'c']] * 20 kore = kOREInference(k_max=2, N=3) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr) assert 'a' in expr and 'b' in expr and 'c' in expr def test_xml_like_structured(): seqs = [ ['header', 'body', 'footer'], ['header', 'body', 'body', 'footer'], ['header', 'body', 'body', 'body', 'footer'], ['header', 'footer'], ] kore = kOREInference(k_max=3, N=10) result = kore.infer(seqs) if result is not None: auto, expr, best_k = result assert is_deterministic(expr) assert 'header' in expr and 'footer' in expr def test_disjoint_symbols(): seqs = [ ['alpha', 'beta'], ['gamma', 'delta'], ] kore = kOREInference(k_max=2, N=3) result = kore.infer(seqs) if result is not None: auto, expr, best_k = result assert is_deterministic(expr) def test_k1_vs_k2_selection(): seqs = [ ['a', 'a', 'b'], ['a', 'b'], ['a', 'a', 'a', 'b'], ] kore = kOREInference(k_max=3, N=5) result = kore.infer(seqs) assert result is not None auto, expr, best_k = result assert is_deterministic(expr) def test_all_same_symbol(): seqs = [ ['a', 'a'], ['a', 'a', 'a'], ['a'], ] kore = kOREInference(k_max=2, N=5) result = kore.infer(seqs) if result is not None: auto, expr, best_k = result assert is_deterministic(expr) def test_long_sequence(): seqs = [ ['a', 'b', 'c', 'd', 'e', 'f', 'g'], ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], ] kore = kOREInference(k_max=2, N=5) result = kore.infer(seqs) if result is not None: auto, expr, best_k = result assert is_deterministic(expr) def test_infer_returns_koa(): kore = kOREInference(k_max=2, N=3) result = kore.infer([['a', 'b'], ['a', 'b', 'c']]) assert result is not None auto, expr, best_k = result assert hasattr(auto, '_succ'), "Should return a KOA automaton" assert hasattr(auto, 'src') assert hasattr(auto, 'sink') def test_different_kmax(): seqs = [['a', 'b', 'c', 'd', 'e'], ['a', 'b', 'c']] kore1 = kOREInference(k_max=1, N=5) kore2 = kOREInference(k_max=3, N=5) r1 = kore1.infer(seqs) r2 = kore2.infer(seqs) assert r1 is not None or r2 is not None # ── validate_k_ore tests ── def test_validate_k_ore_basic(): valid, reason = validate_k_ore('a.b.c', 2) assert valid, f"a.b.c should be valid for k=2: {reason}" def test_validate_k_ore_exceeds_k(): valid, reason = validate_k_ore('a.a.a', 1) assert not valid, "a.a.a should fail for k=1" def test_validate_k_ore_epsilon(): valid, reason = validate_k_ore('ε', 1) assert valid def test_validate_k_ore_empty(): valid, reason = validate_k_ore('', 1) assert valid def test_validate_k_ore_disjunction(): valid, reason = validate_k_ore('(a|b|c)', 2) assert valid, f"(a|b|c) should be valid for k=2: {reason}" def test_validate_k_ore_loop(): valid, reason = validate_k_ore('a+', 1) assert valid, "a+ should be valid for k=1" def test_validate_k_ore_k0(): valid, reason = validate_k_ore('a', 0) assert not valid, "a should fail for k=0" # ── MDL scoring tests ── def test_mdl_model_cost(): assert model_cost('a.b.c') == 3 assert model_cost('(a|b)+.c') >= 2 assert model_cost('ε') >= 0 def test_mdl_data_cost(): # General expression (a|b)+ has multiple words of length 1+: non-zero cost dc = data_cost('(a|b)+', [['a', 'b'], ['b', 'a'], ['a']]) assert dc > 0, f"data_cost should be > 0 for general expression, got {dc}" # Exact expression has cost 0 (log2(1) = 0) dc_exact = data_cost('a.b.c', [['a', 'b', 'c']]) assert dc_exact == 0.0, f"data_cost for exact match should be 0, got {dc_exact}" def test_mdl_score_lower_is_better(): score_specific = mdl_score('a.b.c', [['a', 'b', 'c']]) score_general = mdl_score('(a|b|c)+?', [['a', 'b', 'c']]) assert score_specific > 0 and score_general > 0 def test_mdl_empty_sequences(): score = mdl_score('a.b.c', []) assert score == model_cost('a.b.c') # ── Algorithm 4 paper-faithful tests ── def test_infer_returns_deterministic(): for _ in range(5): seqs = [['x', 'y'], ['x', 'z']] kore = kOREInference(k_max=2, N=2) result = kore.infer(seqs) if result: _, expr, _ = result assert is_deterministic(expr), f"Non-deterministic: {expr}" def test_infer_obeys_k_occurrence(): seqs = [['a', 'b'], ['a', 'b', 'c']] for k in range(1, 4): kore = kOREInference(k_max=k, N=5) result = kore.infer(seqs) if result: _, expr, best_k = result valid, _ = validate_k_ore(expr, best_k) assert valid, f"k={best_k} expression {expr} violates k-occurrence" def run_all(): tests = [ test_linear_sequence, test_branching_paths, test_optional_element, test_looping_element, test_multiple_alternatives, test_rejects_non_deterministic, test_empty_input, test_single_element_sequences, test_infer_returns_best_k, test_tool_sequences, test_single_sequence, test_many_identical_sequences, test_xml_like_structured, test_disjoint_symbols, test_k1_vs_k2_selection, test_all_same_symbol, test_long_sequence, test_infer_returns_koa, test_different_kmax, test_validate_k_ore_basic, test_validate_k_ore_exceeds_k, test_validate_k_ore_epsilon, test_validate_k_ore_empty, test_validate_k_ore_disjunction, test_validate_k_ore_loop, test_validate_k_ore_k0, test_mdl_model_cost, test_mdl_data_cost, test_mdl_score_lower_is_better, test_mdl_empty_sequences, test_infer_returns_deterministic, test_infer_obeys_k_occurrence, ] passed = 0 failed = 0 for t in tests: try: t() passed += 1 except Exception as e: import traceback print(f" FAIL {t.__name__}: {e}") traceback.print_exc() failed += 1 print(f"\n{passed} passed, {failed} failed") if __name__ == '__main__': run_all()