# grammar-inference-engine/bex/mcp_server.py

"""Grammar Inference Engine — MCP server.

Provides tools to infer regular expression grammars from example sequences.
Run as: python -m bex.mcp_server
"""

import json
import sys
from pathlib import Path
from typing import Any

from mcp.server.fastmcp import FastMCP

from .crx import CRX
from .idregex import idregex
from .ensemble import infer_ensemble, _matches
from .yaml_to_seq import yaml_file_to_sequence, sequences_to_crx

# Shared FastMCP server instance for this module: the @mcp.tool() decorators
# below attach each tool function to it, and main() runs it.
mcp = FastMCP("grammar-inference", log_level="ERROR")


@mcp.tool()
def infer_grammar(
    sequences: list[list[str]],
    method: str = "crx",
    kmax: int = 2,
    N: int = 3,
) -> str:
    """Infer a grammar (regular expression) from example sequences.

    Args:
        sequences: List of sequences, each a list of symbols (strings).
        method: Algorithm to use — 'crx' (fast, deterministic) or 'idregex'
            (probabilistic, handles noise better).
        kmax: Maximum k for k-ORE inference (iDRegEx only).
        N: Number of EM iterations (iDRegEx only).

    Returns:
        A regular expression string describing the inferred grammar. With
        'idregex', the placeholder "∅" is returned when the algorithm
        produces an empty (falsy) result.

    Raises:
        ValueError: If ``method`` is neither 'crx' nor 'idregex'.
    """
    # Guard clause: reject unsupported algorithm names before doing any work.
    if method not in ("crx", "idregex"):
        raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")

    if method == "crx":
        # kmax and N are not used by CRX.
        return CRX().infer(sequences)

    inferred = idregex(sequences, kmax=kmax, N=N)
    # A falsy result is reported with the "∅" placeholder instead.
    return inferred if inferred else "∅"


@mcp.tool()
def infer_best_grammar(
    sequences: list[list[str]],
    prefer: str = "",
    kmax: int = 2,
    N: int = 3,
) -> str:
    """Infer a compact grammar from example sequences. Use this when you
    need to generate structured content (Ansible roles, CI configs, Helm
    values, YAML configs, etc.) and have existing examples to learn from.

    The grammar compresses N examples into ~100 chars — far fewer tokens
    than passing all examples. Pass the existing sequences, get back a
    pattern you can follow to generate new instances.

    Args:
        sequences: List of sequences, each a list of strings (symbols in
            the order they appear). Example: [["file","copy","command"],
            ["file","template","command"]].
        prefer: Optional — 'crx' for full coverage (accepts all examples),
            'idregex' for minimal core (only what every example shares).
            Default: runs both and picks best by MDL score.
        kmax: Maximum k for iDRegEx k-ORE inference.
        N: Number of EM iterations for iDRegEx.

    Returns:
        A formatted string with the best grammar, scores, and explanation,
        or a "No grammar found." message when the ensemble has no best
        candidate.
        Grammar notation: a.b = a then b, (a+b) = a or b, r? = optional,
        r+ = one or more, r+? = zero or more.
    """
    # An empty ``prefer`` means "no preference"; the ensemble receives None.
    outcome = infer_ensemble(sequences, kmax=kmax, N=N, prefer=prefer or None)

    best = outcome['best']
    if best is None:
        return f"No grammar found. {outcome['why']}"

    report = [
        f"Best: {best['algorithm']} (MDL {best['mdl_score']})",
        f"Grammar: {best['grammar']}",
        "",
    ]

    # The per-algorithm comparison table is only shown when there is
    # more than one candidate to compare.
    candidates = outcome['all']
    if len(candidates) > 1:
        total = len(sequences)
        for cand in candidates:
            # Count how many of the input sequences this candidate accepts.
            accepted = sum(1 for seq in sequences if _matches(cand['grammar'], seq))
            report.append(
                f"  {cand['algorithm']:10s}  MDL={cand['mdl_score']:>8.2f}  match={accepted}/{total}"
            )

    report += ["", f"Why: {outcome['why']}"]
    return "\n".join(report)


@mcp.tool()
def infer_yaml_grammar(
    yaml_dir: str,
    pattern: str = "**/*.yml",
    method: str = "crx",
) -> str:
    """Infer a grammar from YAML files by converting them to key-path sequences.

    Each YAML file is converted to a sequence of key paths (DFS traversal).
    The selected algorithm then learns the common pattern across all files.

    Args:
        yaml_dir: Root directory to search for YAML files.
        pattern: Glob pattern for YAML files (default: **/*.yml). It is
            passed to ``Path.rglob``, so the search is always recursive
            below ``yaml_dir``.
        method: Algorithm to use ('crx' or 'idregex').

    Returns:
        A regular expression grammar describing the YAML structure,
        "ε (no sequences found)" when no file yields a non-empty sequence,
        or "∅" when iDRegEx produces an empty (falsy) result.

    Raises:
        ValueError: If ``method`` is neither 'crx' nor 'idregex'.
    """
    # Validate before touching the filesystem, consistent with infer_grammar.
    # Previously any value other than 'crx' (including typos) silently ran
    # iDRegEx.
    if method not in ("crx", "idregex"):
        raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")

    sequences = []
    # Sorted so the sequences are collected in a stable order across runs.
    for path in sorted(Path(yaml_dir).rglob(pattern)):
        if not path.is_file():
            continue
        try:
            seq = yaml_file_to_sequence(path)
        except Exception:
            # Deliberate best-effort: a file that cannot be converted is
            # skipped so it does not abort inference over the others.
            continue
        if seq:
            sequences.append(seq)

    if not sequences:
        return "ε (no sequences found)"

    if method == "crx":
        return CRX().infer(sequences)

    # iDRegEx runs with fixed kmax=2, N=3 here; a falsy result becomes "∅".
    result = idregex(sequences, kmax=2, N=3)
    return result or "∅"


@mcp.tool()
def infer_ansible_role_grammar(roles_dir: str = ".") -> str:
    """Infer grammars from Ansible role task module sequences.

    Reads tasks/main.yml from each role, extracts the sequence of
    Ansible module names, groups roles by category prefix, and learns
    a per-category grammar. Collection and grouping are delegated to
    ``role_grammar.collect_all_role_sequences``; a grammar is learned only
    for categories that contain more than one role.

    Args:
        roles_dir: Path to the Ansible roles directory.

    Returns:
        A formatted report with per-category grammars and role listings,
        "No roles found." when nothing was collected, or
        "role_grammar module not available" when the helper module cannot
        be imported.
    """
    # Imported lazily so the other tools keep working when this optional
    # helper module is absent.
    try:
        from .role_grammar import collect_all_role_sequences, learn_grammar
    except ImportError:
        return "role_grammar module not available"

    all_roles, by_category = collect_all_role_sequences(roles_dir)
    if not all_roles:
        return "No roles found."

    lines = [f"Found {len(all_roles)} roles in {len(by_category)} categories\n"]
    for cat in sorted(by_category):
        # Each item is a (role name, module sequence) pair.
        items = by_category[cat]
        lines.append(f"── {cat} ({len(items)} roles) ──")
        if len(items) > 1:
            grammar = learn_grammar([seq for _, seq in items])
            lines.append(f"  Grammar: {grammar}")
        lines.append(f"  Roles: {', '.join(name for name, _ in items)}")
        lines.append("")
    return "\n".join(lines)


def main() -> None:
    """Run the module-level FastMCP server.

    ``run()`` is called without arguments, so the server uses whatever
    defaults FastMCP applies.
    """
    mcp.run()


# Start the server only when executed as a script
# (e.g. ``python -m bex.mcp_server``), not when the module is imported.
if __name__ == "__main__":
    main()