79 lines
3.2 KiB
Python
79 lines
3.2 KiB
Python
"""Dervish — MCP server.
|
|
|
|
Provides tools to infer regular expression grammars from example sequences.
|
|
Run as: python -m bex.mcp_server
|
|
"""
|
|
|
|
from mcp.server.fastmcp import FastMCP
|
|
|
|
from .ensemble import infer_ensemble, _matches
|
|
|
|
# Module-level FastMCP server instance named "grammar-inference"; the tool
# below registers itself on it through the @mcp.tool() decorator.
# NOTE(review): log_level="ERROR" presumably keeps routine server logging
# quiet — confirm the intent before changing it.
mcp = FastMCP("grammar-inference", log_level="ERROR")
|
|
|
|
|
|
# The docstring below doubles as the tool description shown to MCP clients,
# so it is written for the caller of the tool, not only for maintainers.
@mcp.tool()
def infer_best_grammar(
    sequences: list[list[str]],
    prefer: str = "",
    kmax: int = 2,
    N: int = 3,
    min_coverage: float = 1.0,
) -> str:
    """Infer a compact grammar from example sequences.

    Use this when you have examples of sequential data and want to learn
    the pattern they share. The grammar condenses the examples into roughly
    100 characters — far fewer tokens than passing every example. Pass in
    the existing sequences and get back a pattern you can follow to
    generate new instances.

    Args:
        sequences: List of sequences, each a list of strings (symbols in
            the order they appear). Example: [["file","copy","command"],
            ["file","template","command"]].
        prefer: Optional — 'crx' for full vocabulary (accepts all
            examples), 'idregex' for deterministic minimal core. Leave
            empty to auto-pick by MDL.
        kmax: Context depth for k-ORE inference. Default 2.
        N: Random trials for k-ORE inference (higher = better, slower).
            Default 3.
        min_coverage: (Expert) When < 1.0, also runs a **core+outlier
            analysis**: iteratively removes outlier sequences (those with
            the rarest symbols) until at least this fraction remain.
            Returns the core grammar for the majority, plus a list of
            which sequences were removed and why. Default 1.0 = no core
            analysis. Set to 0.8 to find the tight pattern shared by ~80%
            of examples while flagging the other ~20% as variations.

    Returns:
        A formatted string with the best grammar, scores, and explanation.
        When min_coverage < 1.0, includes the core grammar and outlier
        info. Grammar notation: a.b = a then b, (a+b) = a or b,
        r? = optional, r+ = one or more, r+? = zero or more.
    """
    # An empty `prefer` means "no preference"; the ensemble expects None then.
    outcome = infer_ensemble(
        sequences,
        kmax=kmax,
        N=N,
        prefer=prefer or None,
        min_coverage=min_coverage,
    )

    winner = outcome['best']
    if winner is None:
        return f"No grammar found. {outcome['why']}"

    report = [
        f"Best: {winner['algorithm']} (MDL {winner['mdl_score']})",
        f"Grammar: {winner['grammar']}",
        "",
    ]

    # Comparison table: only worth showing when several candidates competed.
    candidates = outcome['all']
    if len(candidates) > 1:
        total = len(sequences)
        for candidate in candidates:
            # Count how many of the input sequences this candidate accepts.
            matched = 0
            for seq in sequences:
                if _matches(candidate['grammar'], seq):
                    matched += 1
            report.append(
                f" {candidate['algorithm']:10s} MDL={candidate['mdl_score']:>8.2f} match={matched}/{total}"
            )
        report.append("")

    report.append(f"Why: {outcome['why']}")

    # Optional core+outlier section; the key may be absent or empty.
    core = outcome['core'] if 'core' in outcome else None
    if core:
        report.append(
            f"\nCore CRX ({core['coverage']:.0%} coverage, {core['outlier_count']} outliers): {core['grammar']}"
        )
        outliers = core['outliers']
        if outliers:
            report.append(" Outlier sequences:")
            for rank, outlier in enumerate(outliers, start=1):
                # Show at most the first 8 symbols; mark truncation with "...".
                shown = ' → '.join(str(sym) for sym in outlier[:8])
                suffix = '...' if len(outlier) > 8 else ''
                report.append(f" {rank}. {shown}{suffix}")

    return "\n".join(report)
|
|
|
|
|
|
def main():
    """Start the MCP server, blocking until it exits.

    Calls ``run()`` on the module-level ``mcp`` instance with no arguments,
    so the server uses whatever transport FastMCP selects by default.
    """
    mcp.run()
|
|
|
|
|
|
# Start the server only when executed as a script or via ``python -m``;
# importing the module must not launch it.
if __name__ == "__main__":
    main()
|