grammar-inference-engine/bex/mcp_server.py

"""Grammar Inference Engine — MCP server.

Provides tools to infer regular expression grammars from example sequences.
Run as: python -m bex.mcp_server
"""

import json
import sys
from pathlib import Path
from typing import Any

from mcp.server.fastmcp import FastMCP

from .crx import CRX
from .idregex import idregex
from .ensemble import infer_ensemble, _matches
from .yaml_to_seq import yaml_file_to_sequence, sequences_to_crx

# Module-level FastMCP server named "grammar-inference"; the @mcp.tool()
# decorators below attach each tool function to this instance, and main()
# starts it via mcp.run().
mcp = FastMCP("grammar-inference", log_level="ERROR")


@mcp.tool()
def infer_grammar(
    sequences: list[list[str]],
    method: str = "crx",
    kmax: int = 2,
    N: int = 3,
) -> str:
    """Infer a grammar (regular expression) from example sequences.

    Args:
        sequences: List of sequences, each a list of symbols (strings).
        method: Algorithm to use — 'crx' (fast, deterministic) or 'idregex' (probabilistic, handles noise better).
        kmax: Maximum k for k-ORE inference (iDRegEx only).
        N: Number of EM iterations (iDRegEx only).

    Returns:
        A regular expression string describing the inferred grammar.
    """
    # Reject anything that is not one of the two supported algorithm names
    # before doing any work; callers see a ValueError naming the valid choices.
    if method not in ("crx", "idregex"):
        raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")

    # CRX takes no tuning parameters here: kmax and N are ignored on this path.
    if method == "crx":
        return CRX().infer(sequences)

    # iDRegEx path: a falsy result (e.g. None or an empty string) is reported
    # as the empty-set symbol rather than being returned as-is.
    inferred = idregex(sequences, kmax=kmax, N=N)
    return inferred if inferred else "∅"


@mcp.tool()
def infer_best_grammar(
    sequences: list[list[str]],
    prefer: str = "",
    kmax: int = 2,
    N: int = 3,
) -> str:
    """Infer a compact grammar from example sequences. Use this when you
    need to generate structured content (Ansible roles, CI configs, Helm
    values, YAML configs, etc.) and have existing examples to learn from.

    The grammar compresses N examples into ~100 chars — far fewer tokens
    than passing all examples. Pass the existing sequences, get back a
    pattern you can follow to generate new instances.

    Args:
        sequences: List of sequences, each a list of strings (symbols in
            the order they appear). Example: [["file","copy","command"],
            ["file","template","command"]].
        prefer: Optional — 'crx' for full coverage (accepts all examples),
            'idregex' for minimal core (only what every example shares).
            Default: runs both and picks best by MDL score.
        kmax: Maximum k for iDRegEx k-ORE inference.
        N: Number of EM iterations for iDRegEx.

    Returns:
        A formatted string with the best grammar, scores, and explanation.
        Grammar notation: a.b = a then b, (a+b) = a or b, r? = optional,
        r+ = one or more, r+? = zero or more.
    """
    # An empty `prefer` string means "no preference"; the ensemble receives
    # None in that case.
    outcome = infer_ensemble(sequences, kmax=kmax, N=N, prefer=prefer or None)

    winner = outcome['best']
    if winner is None:
        return f"No grammar found. {outcome['why']}"

    # Header: winning algorithm with its MDL score, the grammar itself, then
    # a blank separator line.
    report = [
        f"Best: {winner['algorithm']} (MDL {winner['mdl_score']})",
        f"Grammar: {winner['grammar']}",
        "",
    ]

    # The per-candidate comparison table is only emitted when more than one
    # candidate grammar was produced.
    candidates = outcome['all']
    if len(candidates) > 1:
        total = len(sequences)
        for cand in candidates:
            # Count how many of the input sequences this candidate accepts.
            hits = sum(1 for seq in sequences if _matches(cand['grammar'], seq))
            report.append(f"  {cand['algorithm']:10s}  MDL={cand['mdl_score']:>8.2f}  match={hits}/{total}")

    # Trailer: blank line followed by the ensemble's explanation.
    report.extend(["", f"Why: {outcome['why']}"])
    return "\n".join(report)


@mcp.tool()
def infer_yaml_grammar(
    yaml_dir: str,
    pattern: str = "**/*.yml",
    method: str = "crx",
) -> str:
    """Infer a grammar from YAML files by converting them to key-path sequences.

    Each YAML file is converted to a sequence of key paths (DFS traversal).
    The selected algorithm then learns the common pattern across all files.

    Args:
        yaml_dir: Root directory to search for YAML files.
        pattern: Glob pattern for YAML files (default: **/*.yml). The
            pattern is applied recursively below yaml_dir. Note that the
            default does not match files ending in .yaml.
        method: Algorithm to use ('crx' or 'idregex').

    Returns:
        A regular expression grammar describing the YAML structure, or
        "ε (no sequences found)" when no file yielded a non-empty sequence.

    Raises:
        ValueError: If method is neither 'crx' nor 'idregex'.
    """
    # Validate up front, with the same message infer_grammar uses, so that a
    # misspelled method is reported instead of silently running iDRegEx.
    if method not in ("crx", "idregex"):
        raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")

    sequences = []
    # Sorted so the order of sequences handed to the learner is stable
    # across runs regardless of directory iteration order.
    for path in sorted(Path(yaml_dir).rglob(pattern)):
        if not path.is_file():
            continue
        try:
            seq = yaml_file_to_sequence(path)
        except Exception:
            # Deliberate best-effort: a file that cannot be converted is
            # skipped so one bad file does not abort the whole scan.
            continue
        # Files that convert to an empty sequence carry no information.
        if seq:
            sequences.append(seq)

    if not sequences:
        return "ε (no sequences found)"

    if method == "crx":
        return CRX().infer(sequences)
    # iDRegEx runs with fixed kmax=2, N=3 here; a falsy result is reported
    # as the empty-set symbol.
    return idregex(sequences, kmax=2, N=3) or "∅"


@mcp.tool()
def infer_ansible_role_grammar(roles_dir: str = ".") -> str:
    """Infer grammars from Ansible role task module sequences.

    Reads tasks/main.yml from each role, extracts the sequence of
    Ansible module names, groups roles by category prefix, and learns
    a per-category grammar. A grammar is only learned for categories
    that contain more than one role.

    Args:
        roles_dir: Path to the Ansible roles directory.

    Returns:
        A formatted report with per-category grammars and role listings,
        "No roles found." when nothing was collected, or
        "role_grammar module not available" when the helper module cannot
        be imported.
    """
    # Imported lazily so the server can still start (and this tool can
    # answer with a plain message) when role_grammar is unavailable.
    # Collection and learning are delegated entirely to that module.
    try:
        from .role_grammar import collect_all_role_sequences, learn_grammar
    except ImportError:
        return "role_grammar module not available"

    all_roles, by_category = collect_all_role_sequences(roles_dir)
    if not all_roles:
        return "No roles found."

    lines = [f"Found {len(all_roles)} roles in {len(by_category)} categories\n"]
    # Categories are reported in sorted order for stable output.
    for cat in sorted(by_category):
        # Each item is a (role_name, sequence) pair.
        items = by_category[cat]
        lines.append(f"── {cat} ({len(items)} roles) ──")
        # A single example gives nothing to generalize from, so the grammar
        # line is omitted for one-role categories.
        if len(items) > 1:
            grammar = learn_grammar([seq for _, seq in items])
            lines.append(f"  Grammar: {grammar}")
        lines.append(f"  Roles: {', '.join(name for name, _ in items)}")
        lines.append("")
    return "\n".join(lines)


def main() -> None:
    """Start the MCP server by calling ``mcp.run()``."""
    mcp.run()


# Start the server when the module is executed directly
# (e.g. ``python -m bex.mcp_server``).
if __name__ == "__main__":
    main()
Add MCP server: grammar inference via FastMCP - bex/mcp_server.py: FastMCP server with 3 tools: * infer_grammar(sequences, method='crx'\|'idregex') * infer_yaml_grammar(yaml_dir, pattern, method) * infer_ansible_role_grammar(roles_dir) - pyproject.toml: add bex-mcp console_scripts entry point 2026-07-01 08:03:10 +02:00			`"""Grammar Inference Engine — MCP server.`

			`Provides tools to infer regular expression grammars from example sequences.`
			`Run as: python -m bex.mcp_server`
			`"""`

			`import json`
			`import sys`
			`from pathlib import Path`
			`from typing import Any`

			`from mcp.server.fastmcp import FastMCP`

			`from .crx import CRX`
			`from .idregex import idregex`
Grammar inference engine: CRX + iDRegEx ensemble with MDL scoring, MCP server, showcase, and blog post - Ensemble inference (infer_ensemble) runs both CRX and iDRegEx, picks best by MDL - CRX: CRX algorithm for wide coverage (accepts all sequences, large vocabulary) - iDRegEx: iDRegEx for minimal core grammar (tightest common pattern) - MDL scoring: fixed model_cost to count alphabet symbol occurrences, fixed dispatch order in _count_words_fast - Fixed _match_tokens: rewritten as _match_possible with proper backtracking - Fixed _parse_parts disjunction: children use _parse_flat_symbol to avoid dot-splitting - MCP server: infer_best_grammar and infer_grammar tools - Added prefer parameter (crx/idregex) to skip ensemble - 28 passing tests - SHOWCASE.md with Geerlingguy Galaxy demonstration - blog_post.md with full technical deep-dive 2026-07-01 09:51:41 +02:00			`from .ensemble import infer_ensemble, _matches`
Add MCP server: grammar inference via FastMCP - bex/mcp_server.py: FastMCP server with 3 tools: * infer_grammar(sequences, method='crx'\|'idregex') * infer_yaml_grammar(yaml_dir, pattern, method) * infer_ansible_role_grammar(roles_dir) - pyproject.toml: add bex-mcp console_scripts entry point 2026-07-01 08:03:10 +02:00			`from .yaml_to_seq import yaml_file_to_sequence, sequences_to_crx`

			`mcp = FastMCP("grammar-inference", log_level="ERROR")`


			`@mcp.tool()`
			`def infer_grammar(`
			`sequences: list[list[str]],`
			`method: str = "crx",`
			`kmax: int = 2,`
			`N: int = 3,`
			`) -> str:`
			`"""Infer a grammar (regular expression) from example sequences.`

			`Args:`
			`sequences: List of sequences, each a list of symbols (strings).`
			`method: Algorithm to use — 'crx' (fast, deterministic) or 'idregex' (probabilistic, handles noise better).`
			`kmax: Maximum k for k-ORE inference (iDRegEx only).`
			`N: Number of EM iterations (iDRegEx only).`

			`Returns:`
			`A regular expression string describing the inferred grammar.`
			`"""`
			`if method == "crx":`
			`return CRX().infer(sequences)`
			`elif method == "idregex":`
			`result = idregex(sequences, kmax=kmax, N=N)`
			`return result or "∅"`
			`else:`
			`raise ValueError(f"Unknown method: {method}. Use 'crx' or 'idregex'.")`


Grammar inference engine: CRX + iDRegEx ensemble with MDL scoring, MCP server, showcase, and blog post - Ensemble inference (infer_ensemble) runs both CRX and iDRegEx, picks best by MDL - CRX: CRX algorithm for wide coverage (accepts all sequences, large vocabulary) - iDRegEx: iDRegEx for minimal core grammar (tightest common pattern) - MDL scoring: fixed model_cost to count alphabet symbol occurrences, fixed dispatch order in _count_words_fast - Fixed _match_tokens: rewritten as _match_possible with proper backtracking - Fixed _parse_parts disjunction: children use _parse_flat_symbol to avoid dot-splitting - MCP server: infer_best_grammar and infer_grammar tools - Added prefer parameter (crx/idregex) to skip ensemble - 28 passing tests - SHOWCASE.md with Geerlingguy Galaxy demonstration - blog_post.md with full technical deep-dive 2026-07-01 09:51:41 +02:00			`@mcp.tool()`
			`def infer_best_grammar(`
			`sequences: list[list[str]],`
			`prefer: str = "",`
			`kmax: int = 2,`
			`N: int = 3,`
			`) -> str:`
			`"""Infer a compact grammar from example sequences. Use this when you`
			`need to generate structured content (Ansible roles, CI configs, Helm`
			`values, YAML configs, etc.) and have existing examples to learn from.`

			`The grammar compresses N examples into ~100 chars — far fewer tokens`
			`than passing all examples. Pass the existing sequences, get back a`
			`pattern you can follow to generate new instances.`

			`Args:`
			`sequences: List of sequences, each a list of strings (symbols in`
			`the order they appear). Example: [["file","copy","command"],`
			`["file","template","command"]].`
			`prefer: Optional — 'crx' for full coverage (accepts all examples),`
			`'idregex' for minimal core (only what every example shares).`
			`Default: runs both and picks best by MDL score.`
			`kmax: Maximum k for iDRegEx k-ORE inference.`
			`N: Number of EM iterations for iDRegEx.`

			`Returns:`
			`A formatted string with the best grammar, scores, and explanation.`
			`Grammar notation: a.b = a then b, (a+b) = a or b, r? = optional,`
			`r+ = one or more, r+? = zero or more.`
			`"""`
			`pref = prefer if prefer else None`
			`result = infer_ensemble(sequences, kmax=kmax, N=N, prefer=pref)`
			`if result['best'] is None:`
			`return f"No grammar found. {result['why']}"`
			`lines = [f"Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})",`
			`f"Grammar: {result['best']['grammar']}",`
			`""]`
			`if len(result['all']) > 1:`
			`for r in result['all']:`
			`m = sum(1 for s in sequences if _matches(r['grammar'], s))`
			`lines.append(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}")`
			`lines.append("")`
			`lines.append(f"Why: {result['why']}")`
			`return "\n".join(lines)`


Add MCP server: grammar inference via FastMCP - bex/mcp_server.py: FastMCP server with 3 tools: * infer_grammar(sequences, method='crx'\|'idregex') * infer_yaml_grammar(yaml_dir, pattern, method) * infer_ansible_role_grammar(roles_dir) - pyproject.toml: add bex-mcp console_scripts entry point 2026-07-01 08:03:10 +02:00			`@mcp.tool()`
			`def infer_yaml_grammar(`
			`yaml_dir: str,`
			`pattern: str = "**/*.yml",`
			`method: str = "crx",`
			`) -> str:`
			`"""Infer a grammar from YAML files by converting them to key-path sequences.`

			`Each YAML file is converted to a sequence of key paths (DFS traversal).`
			`CRX then learns the common pattern across all files.`

			`Args:`
			`yaml_dir: Root directory to search for YAML files.`
			`pattern: Glob pattern for YAML files (default: **/*.yml).`
			`method: Algorithm to use ('crx' or 'idregex').`

			`Returns:`
			`A regular expression grammar describing the YAML structure.`
			`"""`
			`files = sorted(Path(yaml_dir).rglob(pattern))`
			`sequences = []`
			`for f in files:`
			`if f.is_file():`
			`try:`
			`seq = yaml_file_to_sequence(f)`
			`if seq:`
			`sequences.append(seq)`
			`except Exception:`
			`continue`
			`if not sequences:`
			`return "ε (no sequences found)"`
			`if method == "crx":`
			`return CRX().infer(sequences)`
			`else:`
			`result = idregex(sequences, kmax=2, N=3)`
			`return result or "∅"`


			`@mcp.tool()`
			`def infer_ansible_role_grammar(roles_dir: str = ".") -> str:`
			`"""Infer grammars from Ansible role task module sequences.`

			`Reads tasks/main.yml from each role, extracts the sequence of`
			`Ansible module names, groups roles by category prefix, and learns`
			`a per-category grammar.`

			`Args:`
			`roles_dir: Path to the Ansible roles directory.`

			`Returns:`
			`A formatted report with per-category grammars and role listings.`
			`"""`
			`try:`
			`from .role_grammar import collect_all_role_sequences, learn_grammar`
			`except ImportError:`
			`return "role_grammar module not available"`

			`all_roles, by_category = collect_all_role_sequences(roles_dir)`
			`if not all_roles:`
			`return "No roles found."`

			`lines = [f"Found {len(all_roles)} roles in {len(by_category)} categories\n"]`
			`for cat in sorted(by_category.keys()):`
			`items = by_category[cat]`
			`seqs = [s for _, s in items]`
			`lines.append(f"── {cat} ({len(items)} roles) ──")`
			`if len(items) > 1:`
			`g = learn_grammar(seqs)`
			`lines.append(f" Grammar: {g}")`
			`name, seq = items[0]`
			`lines.append(f" Roles: {', '.join(n for n, _ in items)}")`
			`lines.append("")`
			`return "\n".join(lines)`


			`def main():`
			`mcp.run()`


			`if __name__ == "__main__":`
			`main()`