"""Grammar Inference Engine — MCP server. Provides tools to infer regular expression grammars from example sequences. Run as: python -m bex.mcp_server """ import json import sys from pathlib import Path from typing import Any from mcp.server.fastmcp import FastMCP from .crx import CRX from .idregex import idregex from .ensemble import infer_ensemble, _matches from .yaml_to_seq import yaml_file_to_sequence, sequences_to_crx mcp = FastMCP("grammar-inference", log_level="ERROR") @mcp.tool() def infer_grammar( sequences: list[list[str]], method: str = "crx", kmax: int = 2, N: int = 3, ) -> str: """Infer a grammar (regular expression) from example sequences. Args: sequences: List of sequences, each a list of symbols (strings). method: Algorithm to use — 'crx' (fast, deterministic) or 'idregex' (probabilistic, handles noise better). kmax: Maximum k for k-ORE inference (iDRegEx only). N: Number of EM iterations (iDRegEx only). Returns: A regular expression string describing the inferred grammar. """ if method == "crx": return CRX().infer(sequences) elif method == "idregex": result = idregex(sequences, kmax=kmax, N=N) return result or "∅" else: raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.") @mcp.tool() def infer_best_grammar( sequences: list[list[str]], prefer: str = "", kmax: int = 2, N: int = 3, ) -> str: """Infer a compact grammar from example sequences. Use this when you need to generate structured content (Ansible roles, CI configs, Helm values, YAML configs, etc.) and have existing examples to learn from. The grammar compresses N examples into ~100 chars — far fewer tokens than passing all examples. Pass the existing sequences, get back a pattern you can follow to generate new instances. Args: sequences: List of sequences, each a list of strings (symbols in the order they appear). Example: [["file","copy","command"], ["file","template","command"]]. prefer: Optional — 'crx' for full coverage (accepts all examples), 'idregex' for minimal core (only what every example shares). Default: runs both and picks best by MDL score. kmax: Maximum k for iDRegEx k-ORE inference. N: Number of EM iterations for iDRegEx. Returns: A formatted string with the best grammar, scores, and explanation. Grammar notation: a.b = a then b, (a+b) = a or b, r? = optional, r+ = one or more, r+? = zero or more. """ pref = prefer if prefer else None result = infer_ensemble(sequences, kmax=kmax, N=N, prefer=pref) if result['best'] is None: return f"No grammar found. {result['why']}" lines = [f"Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})", f"Grammar: {result['best']['grammar']}", ""] if len(result['all']) > 1: for r in result['all']: m = sum(1 for s in sequences if _matches(r['grammar'], s)) lines.append(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}") lines.append("") lines.append(f"Why: {result['why']}") return "\n".join(lines) @mcp.tool() def infer_yaml_grammar( yaml_dir: str, pattern: str = "**/*.yml", method: str = "crx", ) -> str: """Infer a grammar from YAML files by converting them to key-path sequences. Each YAML file is converted to a sequence of key paths (DFS traversal). CRX then learns the common pattern across all files. Args: yaml_dir: Root directory to search for YAML files. pattern: Glob pattern for YAML files (default: **/*.yml). method: Algorithm to use ('crx' or 'idregex'). Returns: A regular expression grammar describing the YAML structure. """ files = sorted(Path(yaml_dir).rglob(pattern)) sequences = [] for f in files: if f.is_file(): try: seq = yaml_file_to_sequence(f) if seq: sequences.append(seq) except Exception: continue if not sequences: return "ε (no sequences found)" if method == "crx": return CRX().infer(sequences) else: result = idregex(sequences, kmax=2, N=3) return result or "∅" @mcp.tool() def infer_ansible_role_grammar(roles_dir: str = ".") -> str: """Infer grammars from Ansible role task module sequences. Reads tasks/main.yml from each role, extracts the sequence of Ansible module names, groups roles by category prefix, and learns a per-category grammar. Args: roles_dir: Path to the Ansible roles directory. Returns: A formatted report with per-category grammars and role listings. """ try: from .role_grammar import collect_all_role_sequences, learn_grammar except ImportError: return "role_grammar module not available" all_roles, by_category = collect_all_role_sequences(roles_dir) if not all_roles: return "No roles found." lines = [f"Found {len(all_roles)} roles in {len(by_category)} categories\n"] for cat in sorted(by_category.keys()): items = by_category[cat] seqs = [s for _, s in items] lines.append(f"── {cat} ({len(items)} roles) ──") if len(items) > 1: g = learn_grammar(seqs) lines.append(f" Grammar: {g}") name, seq = items[0] lines.append(f" Roles: {', '.join(n for n, _ in items)}") lines.append("") return "\n".join(lines) def main(): mcp.run() if __name__ == "__main__": main()