"""Dervish — MCP server. Provides tools to infer regular expression grammars from example sequences. Run as: python -m bex.mcp_server """ from mcp.server.fastmcp import FastMCP from .ensemble import infer_ensemble, _matches mcp = FastMCP("grammar-inference", log_level="ERROR") @mcp.tool() def infer_best_grammar( sequences: list[list[str]], prefer: str = "", kmax: int = 2, N: int = 3, min_coverage: float = 1.0, ) -> str: """Infer a compact grammar from example sequences. Use this when you have examples of sequential data and want to learn the pattern. The grammar compresses N examples into ~100 chars — far fewer tokens than passing all examples. Pass the existing sequences, get back a pattern you can follow to generate new instances. Args: sequences: List of sequences, each a list of strings (symbols in the order they appear). Example: [["file","copy","command"], ["file","template","command"]]. prefer: Optional — 'crx' for full vocabulary (accepts all examples), 'idregex' for deterministic minimal core. Omit to auto-pick by MDL. kmax: Context depth for k-ORE inference. Default 2. N: Random trials for k-ORE inference (higher = better, slower). min_coverage: (Expert) When < 1.0, also runs a **core+outlier analysis**: iteratively removes outlier sequences (those with rarest symbols) until at least this fraction remain. Returns the core grammar for the majority, plus a list of which sequences were removed and why. Default 1.0 = no core analysis. Set to 0.8 to find the tight pattern shared by ~80% of examples while flagging the other ~20% as variations. Returns: A formatted string with the best grammar, scores, and explanation. When min_coverage < 1.0, includes the core grammar and outlier info. Grammar notation: a.b = a then b, (a+b) = a or b, r? = optional, r+ = one or more, r+? = zero or more. """ pref = prefer if prefer else None result = infer_ensemble(sequences, kmax=kmax, N=N, prefer=pref, min_coverage=min_coverage) if result['best'] is None: return f"No grammar found. {result['why']}" lines = [f"Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})", f"Grammar: {result['best']['grammar']}", ""] if len(result['all']) > 1: for r in result['all']: m = sum(1 for s in sequences if _matches(r['grammar'], s)) lines.append(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}") lines.append("") lines.append(f"Why: {result['why']}") if 'core' in result and result['core']: c = result['core'] lines.append(f"\nCore CRX ({c['coverage']:.0%} coverage, {c['outlier_count']} outliers): {c['grammar']}") if c['outliers']: lines.append(f" Outlier sequences:") for i, o in enumerate(c['outliers'], 1): lines.append(f" {i}. {' → '.join(str(x) for x in o[:8])}{'...' if len(o) > 8 else ''}") return "\n".join(lines) def main(): mcp.run() if __name__ == "__main__": main()