- Ensemble inference (infer_ensemble) runs both CRX and iDRegEx, picks best by MDL - CRX: CRX algorithm for wide coverage (accepts all sequences, large vocabulary) - iDRegEx: iDRegEx for minimal core grammar (tightest common pattern) - MDL scoring: fixed model_cost to count alphabet symbol occurrences, fixed dispatch order in _count_words_fast - Fixed _match_tokens: rewritten as _match_possible with proper backtracking - Fixed _parse_parts disjunction: children use _parse_flat_symbol to avoid dot-splitting - MCP server: infer_best_grammar and infer_grammar tools - Added prefer parameter (crx/idregex) to skip ensemble - 28 passing tests - SHOWCASE.md with Geerlingguy Galaxy demonstration - blog_post.md with full technical deep-dive
175 lines
5.7 KiB
Python
175 lines
5.7 KiB
Python
"""Grammar Inference Engine — MCP server.

Provides tools to infer regular expression grammars from example sequences.

Run as: python -m bex.mcp_server
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from mcp.server.fastmcp import FastMCP
|
|
|
|
from .crx import CRX
|
|
from .idregex import idregex
|
|
from .ensemble import infer_ensemble, _matches
|
|
from .yaml_to_seq import yaml_file_to_sequence, sequences_to_crx
|
|
|
|
# Module-level FastMCP server instance named "grammar-inference", created with
# log_level="ERROR". The @mcp.tool() decorators below register their functions
# on this instance, and main() starts it via mcp.run().
mcp = FastMCP("grammar-inference", log_level="ERROR")
|
|
|
|
|
|
@mcp.tool()
def infer_grammar(
    sequences: list[list[str]],
    method: str = "crx",
    kmax: int = 2,
    N: int = 3,
) -> str:
    """Infer a grammar (regular expression) from example sequences.

    Args:
        sequences: List of sequences, each a list of symbols (strings).
        method: Algorithm to use — 'crx' (fast, deterministic) or 'idregex' (probabilistic, handles noise better).
        kmax: Maximum k for k-ORE inference (iDRegEx only).
        N: Number of EM iterations (iDRegEx only).

    Returns:
        A regular expression string describing the inferred grammar.

    Raises:
        ValueError: If method is neither 'crx' nor 'idregex'.
    """
    # NOTE(review): FastMCP uses the docstring as the tool description shown to
    # clients, so the wording above is kept close to the original — confirm
    # before rephrasing it.
    if method == "crx":
        return CRX().infer(sequences)
    if method != "idregex":
        raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")

    inferred = idregex(sequences, kmax=kmax, N=N)
    # A falsy result from idregex is reported as the "∅" placeholder.
    return inferred if inferred else "∅"
|
|
|
|
|
|
@mcp.tool()
def infer_best_grammar(
    sequences: list[list[str]],
    prefer: str = "",
    kmax: int = 2,
    N: int = 3,
) -> str:
    """Infer a compact grammar from example sequences. Use this when you
    need to generate structured content (Ansible roles, CI configs, Helm
    values, YAML configs, etc.) and have existing examples to learn from.

    The grammar compresses N examples into ~100 chars — far fewer tokens
    than passing all examples. Pass the existing sequences, get back a
    pattern you can follow to generate new instances.

    Args:
        sequences: List of sequences, each a list of strings (symbols in
            the order they appear). Example: [["file","copy","command"],
            ["file","template","command"]].
        prefer: Optional — 'crx' for full coverage (accepts all examples),
            'idregex' for minimal core (only what every example shares).
            Default: runs both and picks best by MDL score.
        kmax: Maximum k for iDRegEx k-ORE inference.
        N: Number of EM iterations for iDRegEx.

    Returns:
        A formatted string with the best grammar, scores, and explanation.
        Grammar notation: a.b = a then b, (a+b) = a or b, r? = optional,
        r+ = one or more, r+? = zero or more.
    """
    # NOTE(review): FastMCP uses the docstring as the tool description shown to
    # clients, so the wording above is kept as written — confirm before editing.

    # An empty `prefer` string is forwarded to infer_ensemble as None.
    outcome = infer_ensemble(sequences, kmax=kmax, N=N, prefer=prefer or None)

    best = outcome['best']
    if best is None:
        return f"No grammar found. {outcome['why']}"

    report = [
        f"Best: {best['algorithm']} (MDL {best['mdl_score']})",
        f"Grammar: {best['grammar']}",
        "",
    ]

    # The per-algorithm comparison table is only shown when more than one
    # candidate grammar was produced.
    candidates = outcome['all']
    if len(candidates) > 1:
        total = len(sequences)
        for cand in candidates:
            # Count how many input sequences the candidate grammar accepts.
            matched = sum(1 for seq in sequences if _matches(cand['grammar'], seq))
            report.append(f" {cand['algorithm']:10s} MDL={cand['mdl_score']:>8.2f} match={matched}/{total}")
        report.append("")

    report.append(f"Why: {outcome['why']}")
    return "\n".join(report)
|
|
|
|
|
|
@mcp.tool()
def infer_yaml_grammar(
    yaml_dir: str,
    pattern: str = "**/*.yml",
    method: str = "crx",
) -> str:
    """Infer a grammar from YAML files by converting them to key-path sequences.

    Each YAML file is converted to a sequence of key paths (DFS traversal).
    The selected algorithm then learns the common pattern across all files.

    Args:
        yaml_dir: Root directory to search for YAML files.
        pattern: Glob pattern for YAML files (default: **/*.yml).
        method: Algorithm to use ('crx' or 'idregex').

    Returns:
        A regular expression grammar describing the YAML structure, or
        "ε (no sequences found)" when no file yields a non-empty sequence.

    Raises:
        ValueError: If method is neither 'crx' nor 'idregex'.
    """
    # Validate before touching the filesystem so a misspelled method fails
    # fast, matching infer_grammar, rather than silently running iDRegEx.
    if method not in ("crx", "idregex"):
        raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")

    sequences = []
    # Sorted so the order of sequences does not depend on directory listing order.
    for path in sorted(Path(yaml_dir).rglob(pattern)):
        if not path.is_file():
            continue
        try:
            seq = yaml_file_to_sequence(path)
        except Exception:
            # Deliberate best-effort: a file that cannot be converted is
            # skipped so it does not abort inference over the remaining files.
            continue
        if seq:
            sequences.append(seq)

    if not sequences:
        return "ε (no sequences found)"

    if method == "crx":
        return CRX().infer(sequences)
    # A falsy result from idregex is reported as the "∅" placeholder.
    return idregex(sequences, kmax=2, N=3) or "∅"
|
|
|
|
|
|
@mcp.tool()
def infer_ansible_role_grammar(roles_dir: str = ".") -> str:
    """Infer grammars from Ansible role task module sequences.

    Reads tasks/main.yml from each role, extracts the sequence of
    Ansible module names, groups roles by category prefix, and learns
    a per-category grammar.

    Args:
        roles_dir: Path to the Ansible roles directory.

    Returns:
        A formatted report with per-category grammars and role listings.
    """
    # Imported lazily so the server still loads when the optional
    # role_grammar module is missing; the tool then reports that instead.
    try:
        from .role_grammar import collect_all_role_sequences, learn_grammar
    except ImportError:
        return "role_grammar module not available"

    all_roles, by_category = collect_all_role_sequences(roles_dir)
    if not all_roles:
        return "No roles found."

    lines = [f"Found {len(all_roles)} roles in {len(by_category)} categories\n"]
    for cat in sorted(by_category):
        # Each item is unpacked below as a (role name, sequence) pair.
        items = by_category[cat]
        lines.append(f"── {cat} ({len(items)} roles) ──")
        # A grammar is only learned when the category has more than one role.
        if len(items) > 1:
            grammar = learn_grammar([seq for _, seq in items])
            lines.append(f" Grammar: {grammar}")
        lines.append(f" Roles: {', '.join(name for name, _ in items)}")
        lines.append("")
    return "\n".join(lines)
|
|
|
|
|
|
def main() -> None:
    """Entry point: start the module-level MCP server via ``mcp.run()``."""
    mcp.run()
|
|
|
|
|
|
# Start the server only when this module is executed as a script
# (e.g. ``python -m bex.mcp_server``), not when it is imported.
if __name__ == "__main__":
    main()
|