purge make_charts.py, examples/, full-text papers, blog_post (moved to ~/Desktop/kesai/); translate German CLI to English
This commit is contained in:
parent
c8a49f0149
commit
6bf7a681ce
9 changed files with 45 additions and 5270 deletions
50
bex/cli.py
50
bex/cli.py
|
|
@ -19,7 +19,7 @@ from .ilocal import iLocal, extract_contexts_from_file, reduce_contexts
|
||||||
|
|
||||||
|
|
||||||
def find_yaml_files(directory):
|
def find_yaml_files(directory):
|
||||||
"""Findet alle YAML-Dateien in einem Verzeichnis (rekursiv)."""
|
"""Find all YAML files in a directory (recursive)."""
|
||||||
patterns = ['**/*.yml', '**/*.yaml']
|
patterns = ['**/*.yml', '**/*.yaml']
|
||||||
files = []
|
files = []
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
|
|
@ -32,37 +32,37 @@ def main():
|
||||||
description='bex — BEX-based YAML Grammar Inference',
|
description='bex — BEX-based YAML Grammar Inference',
|
||||||
)
|
)
|
||||||
parser.add_argument('--dir', type=str, default='roles/',
|
parser.add_argument('--dir', type=str, default='roles/',
|
||||||
help='Verzeichnis mit YAML-Dateien (default: roles/)')
|
help='Directory with YAML files (default: roles/)')
|
||||||
parser.add_argument('--k-max', type=int, default=5,
|
parser.add_argument('--k-max', type=int, default=5,
|
||||||
help='Max k für k-ORE-Inferenz (default: 5)')
|
help='Max k for k-ORE inference (default: 5)')
|
||||||
parser.add_argument('--context', type=str, default=None,
|
parser.add_argument('--context', type=str, default=None,
|
||||||
help='Auf spezifischen Container-Key beschränken (z.B. tasks)')
|
help='Restrict to specific container key (e.g. tasks)')
|
||||||
parser.add_argument('--output', type=str, default=None,
|
parser.add_argument('--output', type=str, default=None,
|
||||||
help='Output-Datei für Template (default: stdout)')
|
help='Output file for template (default: stdout)')
|
||||||
parser.add_argument('--ilocal', action='store_true',
|
parser.add_argument('--ilocal', action='store_true',
|
||||||
help='iLocal-Kontextanalyse durchführen')
|
help='Run iLocal context analysis')
|
||||||
parser.add_argument('--crx', action='store_true',
|
parser.add_argument('--crx', action='store_true',
|
||||||
help='CRX (direct CHARE inference) verwenden')
|
help='Use CRX (direct CHARE inference)')
|
||||||
parser.add_argument('--verbose', '-v', action='store_true',
|
parser.add_argument('--verbose', '-v', action='store_true',
|
||||||
help='Ausführliche Ausgabe')
|
help='Verbose output')
|
||||||
parser.add_argument('--stats', action='store_true',
|
parser.add_argument('--stats', action='store_true',
|
||||||
help='Zeige Token-Statistiken')
|
help='Show token statistics')
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not os.path.isdir(args.dir):
|
if not os.path.isdir(args.dir):
|
||||||
print(f"Fehler: Verzeichnis '{args.dir}' nicht gefunden.", file=sys.stderr)
|
print(f"Error: directory '{args.dir}' not found.", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
yaml_files = find_yaml_files(args.dir)
|
yaml_files = find_yaml_files(args.dir)
|
||||||
if not yaml_files:
|
if not yaml_files:
|
||||||
print(f"Keine YAML-Dateien in '{args.dir}' gefunden.", file=sys.stderr)
|
print(f"No YAML files found in '{args.dir}'.", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
print(f"Gefundene YAML-Dateien: {len(yaml_files)}", file=sys.stderr)
|
print(f"Found YAML files: {len(yaml_files)}", file=sys.stderr)
|
||||||
|
|
||||||
if args.ilocal:
|
if args.ilocal:
|
||||||
print("\n=== iLocal: Kontext-Extraktion ===", file=sys.stderr)
|
print("\n=== iLocal: Context Extraction ===", file=sys.stderr)
|
||||||
all_contexts = {}
|
all_contexts = {}
|
||||||
for f in yaml_files:
|
for f in yaml_files:
|
||||||
contexts = extract_contexts_from_file(f)
|
contexts = extract_contexts_from_file(f)
|
||||||
|
|
@ -72,11 +72,11 @@ def main():
|
||||||
all_contexts[ctx].extend(seqs)
|
all_contexts[ctx].extend(seqs)
|
||||||
|
|
||||||
reduced = reduce_contexts(all_contexts)
|
reduced = reduce_contexts(all_contexts)
|
||||||
print(f" Kontexte gefunden: {len(reduced)}", file=sys.stderr)
|
print(f" Contexts found: {len(reduced)}", file=sys.stderr)
|
||||||
for ctx, seqs in sorted(reduced.items()):
|
for ctx, seqs in sorted(reduced.items()):
|
||||||
lengths = [len(s) for s in seqs]
|
lengths = [len(s) for s in seqs]
|
||||||
print(f" {ctx}: {len(seqs)} Sequenzen, "
|
print(f" {ctx}: {len(seqs)} sequences, "
|
||||||
f"Längen {min(lengths)}-{max(lengths)}, "
|
f"lengths {min(lengths)}-{max(lengths)}, "
|
||||||
f"unique_seqs={len(set(tuple(s) for s in seqs))}",
|
f"unique_seqs={len(set(tuple(s) for s in seqs))}",
|
||||||
file=sys.stderr)
|
file=sys.stderr)
|
||||||
|
|
||||||
|
|
@ -94,30 +94,30 @@ def main():
|
||||||
print(f" {os.path.relpath(f)}: {seq}", file=sys.stderr)
|
print(f" {os.path.relpath(f)}: {seq}", file=sys.stderr)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
print(f" Fehler in {f}: {e}", file=sys.stderr)
|
print(f" Error in {f}: {e}", file=sys.stderr)
|
||||||
|
|
||||||
if not all_sequences:
|
if not all_sequences:
|
||||||
print("Keine Sequenzen extrahiert.", file=sys.stderr)
|
print("No sequences extracted.", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
print(f" Sequenzen extrahiert: {len(all_sequences)}", file=sys.stderr)
|
print(f" Sequences extracted: {len(all_sequences)}", file=sys.stderr)
|
||||||
lengths = [len(s) for s in all_sequences]
|
lengths = [len(s) for s in all_sequences]
|
||||||
print(f" Längen: min={min(lengths)}, max={max(lengths)}, "
|
print(f" Lengths: min={min(lengths)}, max={max(lengths)}, "
|
||||||
f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr)
|
f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr)
|
||||||
|
|
||||||
if args.stats:
|
if args.stats:
|
||||||
stats = tokenizer.get_statistics()
|
stats = tokenizer.get_statistics()
|
||||||
print("\n=== Token-Statistiken ===", file=sys.stderr)
|
print("\n=== Token Statistics ===", file=sys.stderr)
|
||||||
for token, count in list(stats.items())[:30]:
|
for token, count in list(stats.items())[:30]:
|
||||||
print(f" {token}: {count}", file=sys.stderr)
|
print(f" {token}: {count}", file=sys.stderr)
|
||||||
|
|
||||||
print("\n=== k-ORE Inferenz ===", file=sys.stderr)
|
print("\n=== k-ORE Inference ===", file=sys.stderr)
|
||||||
kore = kOREInference(k_max=args.k_max)
|
kore = kOREInference(k_max=args.k_max)
|
||||||
|
|
||||||
if args.crx:
|
if args.crx:
|
||||||
result = kore.infer_with_crx(all_sequences)
|
result = kore.infer_with_crx(all_sequences)
|
||||||
_, expr, method = result
|
_, expr, method = result
|
||||||
print(f" Methode: {method}", file=sys.stderr)
|
print(f" Method: {method}", file=sys.stderr)
|
||||||
else:
|
else:
|
||||||
result = kore.infer(all_sequences)
|
result = kore.infer(all_sequences)
|
||||||
if result:
|
if result:
|
||||||
|
|
@ -127,7 +127,7 @@ def main():
|
||||||
expr = "∅"
|
expr = "∅"
|
||||||
print(" Kein Ergebnis", file=sys.stderr)
|
print(" Kein Ergebnis", file=sys.stderr)
|
||||||
|
|
||||||
print(f" Inferierter Ausdruck: {expr}", file=sys.stderr)
|
print(f" Inferred expression: {expr}", file=sys.stderr)
|
||||||
|
|
||||||
print("\n=== One-Shot Template ===", file=sys.stderr)
|
print("\n=== One-Shot Template ===", file=sys.stderr)
|
||||||
print(file=sys.stderr)
|
print(file=sys.stderr)
|
||||||
|
|
@ -136,7 +136,7 @@ def main():
|
||||||
if args.output:
|
if args.output:
|
||||||
with open(args.output, 'w') as f:
|
with open(args.output, 'w') as f:
|
||||||
f.write(template)
|
f.write(template)
|
||||||
print(f"Template geschrieben nach: {args.output}", file=sys.stderr)
|
print(f"Template written to: {args.output}", file=sys.stderr)
|
||||||
else:
|
else:
|
||||||
print(template)
|
print(template)
|
||||||
|
|
||||||
|
|
|
||||||
263
blog_post.md
263
blog_post.md
|
|
@ -1,263 +0,0 @@
|
||||||
# Dervish: Discovering Unwritten Conventions with Grammar Inference
|
|
||||||
|
|
||||||
<p align="left"><img src="dervish-logo.png" alt="Dervish" width="180"></p>
|
|
||||||
|
|
||||||
**How we turned 36 Ansible roles into a 200-character grammar — and why
|
|
||||||
it matters for LLM agents.**
|
|
||||||
|
|
||||||
## The problem
|
|
||||||
|
|
||||||
Every codebase has unwritten conventions. Your team's Docker Compose
|
|
||||||
files always put `image` before `ports` before `volumes`. Your Ansible
|
|
||||||
deploy roles always start with `assert`, then `file`, then `template`.
|
|
||||||
Your CI pipelines always run `lint` before `test` before `deploy`.
|
|
||||||
|
|
||||||
Nobody writes these down. They're emergent — copied from role to role,
|
|
||||||
file to file, until they become a tacit standard.
|
|
||||||
|
|
||||||
When an LLM agent needs to generate new content that follows these
|
|
||||||
conventions, you have two options:
|
|
||||||
|
|
||||||
1. **Stuff every existing file into context** — 36 deploy roles = 15,000
|
|
||||||
tokens. You'll hit the context window on your third example.
|
|
||||||
2. **Give it one or two examples and hope** — the LLM will guess the
|
|
||||||
pattern, and it will often guess wrong.
|
|
||||||
|
|
||||||
Neither is good. The first is wasteful. The second is unreliable.
|
|
||||||
|
|
||||||
What you really want is the **compiled convention** — the minimal
|
|
||||||
description of what all 36 roles share, expressed in ~200 tokens. An
|
|
||||||
LLM can follow a rule in 200 tokens far more reliably than it can
|
|
||||||
infer a pattern from 36 examples.
|
|
||||||
|
|
||||||
This is grammar inference.
|
|
||||||
|
|
||||||
## The approach
|
|
||||||
|
|
||||||
Given a set of example sequences over some alphabet (e.g., Ansible
|
|
||||||
module names, Docker Compose keys, CI job names), learn a regular
|
|
||||||
expression that describes the general pattern.
|
|
||||||
|
|
||||||
We implemented two algorithms from Bex et al., a pair of papers from
|
|
||||||
TODS 2010 and arXiv 2010:
|
|
||||||
|
|
||||||
- **CRX** (TODS 2010 §6): A single-pass algorithm that builds a
|
|
||||||
predecessor relation over symbols, computes equivalence classes,
|
|
||||||
and emits a Chain Regular Expression (CHARE) that matches ALL
|
|
||||||
input sequences. Fast, deterministic, captures the full vocabulary.
|
|
||||||
|
|
||||||
- **iDRegEx** (arXiv 2010): A probabilistic algorithm using k-testable
|
|
||||||
Observation Automata (k-OA) trained with Baum-Welch EM. It finds
|
|
||||||
only the *minimal common core* — the symbols that appear in every
|
|
||||||
example. Robust against noise, but fails (returns ∅) when the
|
|
||||||
examples are too diverse.
|
|
||||||
|
|
||||||
Both run in the **ensemble**: CRX produces a permissive grammar (full
|
|
||||||
vocabulary, many optional parts), iDRegEx produces a strict grammar
|
|
||||||
(minimal core). A Minimum Description Length (MDL) score picks the
|
|
||||||
winner: the grammar that compresses the data best.
|
|
||||||
|
|
||||||
## The algorithms, briefly
|
|
||||||
|
|
||||||
### CRX — Chain Regular Expression inference
|
|
||||||
|
|
||||||
CRX (Algorithm 7, TODS 2010) works in four steps:
|
|
||||||
|
|
||||||
1. **Build the immediate-predecessor relation.** For every adjacent
|
|
||||||
pair (x, y) across all sequences, record that x precedes y. If
|
|
||||||
symbol `assert` always appears before `file`, record
|
|
||||||
`assert → file`.
|
|
||||||
|
|
||||||
2. **Compute equivalence classes.** Take the reflexive-transitive
|
|
||||||
closure of the predecessor relation. The strongly connected
|
|
||||||
components are *equivalence classes* — groups of symbols that can
|
|
||||||
appear in the same position. If `copy` and `template` both follow
|
|
||||||
`file` and precede `command`, they're in the same class.
|
|
||||||
|
|
||||||
3. **Merge singleton classes.** A class with one symbol that shares
|
|
||||||
the same predecessor/successor sets as another singleton class
|
|
||||||
gets merged. This handles symbols that always appear in the
|
|
||||||
same structural position.
|
|
||||||
|
|
||||||
4. **Topological sort.** The equivalence classes are sorted by their
|
|
||||||
position in the Hasse diagram of the predecessor relation. Each
|
|
||||||
class becomes a factor in the output, annotated with a quantifier:
|
|
||||||
- `+` (one or more) if the class forms a cycle
|
|
||||||
- `+?` (zero or more) if the class appears variably
|
|
||||||
- `?` (optional) if the class can be absent
|
|
||||||
- (exact) if the class always appears exactly once
|
|
||||||
|
|
||||||
The result is a CHARE: a sequence of factors where each factor is a
|
|
||||||
disjunction of equivalent symbols with a quantifier.
|
|
||||||
|
|
||||||
### iDRegEx — k-optimal regular expression inference
|
|
||||||
|
|
||||||
iDRegEx (Algorithm 4, arXiv 2010) uses a probabilistic automaton:
|
|
||||||
|
|
||||||
1. **Build a complete k-OA.** A k-testable Observation Automaton
|
|
||||||
records all k-grams (subsequences of length k) from the input
|
|
||||||
sequences. The automaton's states represent (k-1)-grams.
|
|
||||||
|
|
||||||
2. **Train with Baum-Welch.** EM iterations assign probabilities to
|
|
||||||
transitions, learning which paths through the automaton are most
|
|
||||||
likely given the data.
|
|
||||||
|
|
||||||
3. **Disambiguate.** Remove nondeterministic transitions — for any
|
|
||||||
state and symbol, keep only the most probable next state.
|
|
||||||
|
|
||||||
4. **Prune.** Remove low-probability edges and unreachable states,
|
|
||||||
leaving only the most likely paths.
|
|
||||||
|
|
||||||
5. **Extract with rwr².** The REWRITE-SQUARED algorithm (rwr²,
|
|
||||||
Algorithm 3) collapses the pruned automaton into a k-optimal
|
|
||||||
regular expression — the minimal common core.
|
|
||||||
|
|
||||||
### MDL scoring — picking the right level of specificity
|
|
||||||
|
|
||||||
The Minimum Description Length principle (Rissanen 1978) says: the
|
|
||||||
best grammar is the one that minimizes the sum of its own size and
|
|
||||||
the cost of encoding the data using it.
|
|
||||||
|
|
||||||
```
|
|
||||||
MDL = model_cost + data_cost
|
|
||||||
```
|
|
||||||
|
|
||||||
**model_cost** = the number of alphabet symbol occurrences in the
|
|
||||||
grammar. A grammar with 5 unique symbols used once each has
|
|
||||||
model_cost = 5.
|
|
||||||
|
|
||||||
**data_cost** = Σ log₂(|L(r)|) across all sequences, where |L(r)| is
|
|
||||||
the number of strings of length len(s) that the grammar accepts.
|
|
||||||
A grammar like `(a+b+c+...+z)+` accepts 19 possible symbols at each
|
|
||||||
position, so for a sequence of length 120, the data cost is
|
|
||||||
120 × log₂(19) ≈ 510 bits. A grammar like `a.b.c.d.e` accepts only
|
|
||||||
1 string of length 5, so data cost is 0.
|
|
||||||
|
|
||||||
The ensemble picks the grammar with the lowest total MDL. This
|
|
||||||
automatically balances specificity against coverage: a grammar that
|
|
||||||
matches only 1 sequence but does so perfectly (low data cost) can
|
|
||||||
beat a grammar that matches all sequences but is extremely permissive
|
|
||||||
(high data cost).
|
|
||||||
|
|
||||||
## The results
|
|
||||||
|
|
||||||
### Ansible deploy roles — 36 roles from companyweb
|
|
||||||
|
|
||||||
Your own deploy roles cover everything from AdGuard Home to
|
|
||||||
Woodpecker CI. They have NO schema — each is a free-form script.
|
|
||||||
|
|
||||||
```
|
|
||||||
Grammar: docker_volume+?.group?.docker_container?.user?.apt?.npm?.
|
|
||||||
(assert+...+command+copy+file+template+set_fact+...+wait_for)+?.
|
|
||||||
(cron+firewalld)?
|
|
||||||
Match: 36/36
|
|
||||||
MDL: 2186.28
|
|
||||||
```
|
|
||||||
|
|
||||||
Bottleneck analysis: optional docker setup (volume, group, container,
|
|
||||||
user, apt, npm), then a large disjunction of ~25 task modules (one or
|
|
||||||
more), then optional cron/firewalld at the end. This captures the
|
|
||||||
convention precisely.
|
|
||||||
|
|
||||||
**Compression: 36 roles (15,000 tokens) → 200 tokens (75×)**
|
|
||||||
|
|
||||||
### Geerlingguy Galaxy roles — 15 popular roles
|
|
||||||
|
|
||||||
Jeff Geerling's roles are the most popular on Ansible Galaxy. He has
|
|
||||||
never documented their structural pattern. Yet every one of the 15
|
|
||||||
follows the same arc:
|
|
||||||
|
|
||||||
```
|
|
||||||
Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+.
|
|
||||||
include+?.(npm+pip)+?.lineinfile?
|
|
||||||
Match: 15/15
|
|
||||||
MDL: 596.64
|
|
||||||
```
|
|
||||||
|
|
||||||
Check prerequisites, OS-specific variables, install packages,
|
|
||||||
configure with templates, start services, optionally run sub-tasks,
|
|
||||||
install npm/pip packages, and optionally tweak config lines.
|
|
||||||
|
|
||||||
**This is the first explicit description of the geerlingguy role
|
|
||||||
module ordering convention.** It took 15 roles and a grammar inference
|
|
||||||
algorithm to write it down.
|
|
||||||
|
|
||||||
**Compression: 15 roles (5,000 tokens) → 60 tokens (83×)**
|
|
||||||
|
|
||||||
### Ensemble dynamics
|
|
||||||
|
|
||||||
The ensemble (CRX + iDRegEx + MDL) selects different winners
|
|
||||||
depending on the data:
|
|
||||||
|
|
||||||
| Dataset | Winner | Why |
|
|
||||||
|---------|--------|-----|
|
|
||||||
| Ansible galaxy (15 roles) | CRX | iDRegEx returned ∅ (too diverse) |
|
|
||||||
| Helm prom-stack (6 configs) | **iDRegEx** | Finds minimal core across all configs |
|
|
||||||
| Terraform modules (8) | CRX | iDRegEx returned ∅ (no common core across domains) |
|
|
||||||
| Terraform modules (8) | CRX | Every resource type optional across domains |
|
|
||||||
| GitHub Actions Go lint (6) | CRX | Tight pattern, all match |
|
|
||||||
|
|
||||||
iDRegEx wins when the data has a clear common core. CRX wins when
|
|
||||||
there's no single shared subsequence (the roles share the *vocabulary*
|
|
||||||
but not the *order*).
|
|
||||||
|
|
||||||
## The MCP
|
|
||||||
|
|
||||||
The engine is exposed as an MCP server:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from bex.mcp_server import infer_best_grammar
|
|
||||||
|
|
||||||
# Full coverage
|
|
||||||
output = infer_best_grammar(
|
|
||||||
sequences=role_sequences,
|
|
||||||
prefer="crx",
|
|
||||||
)
|
|
||||||
# Returns:
|
|
||||||
# Best: CRX (MDL 288)
|
|
||||||
# Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+
|
|
||||||
# .include+?.(npm+pip)+?.lineinfile?
|
|
||||||
|
|
||||||
# Ensemble — let MDL pick
|
|
||||||
output = infer_best_grammar(sequences=role_sequences)
|
|
||||||
```
|
|
||||||
|
|
||||||
An agent workflow:
|
|
||||||
|
|
||||||
1. Agent needs to write an Ansible role
|
|
||||||
2. Finds 15 existing geerlingguy roles, extracts their task module sequences
|
|
||||||
3. Calls `infer_best_grammar(sequences=..., prefer='crx')`
|
|
||||||
4. Gets back the grammar in ~60 tokens
|
|
||||||
5. Generates a new role that follows the structural pattern
|
|
||||||
|
|
||||||
Without the MCP: 15 role files in context (5,000 tokens), or guesswork.
|
|
||||||
With the MCP: one grammar rule (~60 tokens), known to match 15/15 roles.
|
|
||||||
|
|
||||||
## What it means
|
|
||||||
|
|
||||||
Grammar inference turns **examples** into **rules**. The rule is a
|
|
||||||
compressed description of the structural convention — and for
|
|
||||||
schema-less content like the geerlingguy role module ordering, this is
|
|
||||||
the *first time* the convention has been written down at all.
|
|
||||||
|
|
||||||
For LLM agents, this changes the trade-off between context and
|
|
||||||
accuracy. Instead of flooding the context window with examples, the
|
|
||||||
agent can call the MCP, get the rule in ~60 tokens, and follow it.
|
|
||||||
The rule is more reliable than guessing from examples, and it costs
|
|
||||||
less than the first example would have.
|
|
||||||
|
|
||||||
The algorithm doesn't need to understand what a deploy role does. It
|
|
||||||
doesn't know that `file` creates directories and `template` renders
|
|
||||||
Jinja2. It only needs to see 36 sequences of module names and find
|
|
||||||
the pattern they all share. The structural convention is in the data
|
|
||||||
— you just have to extract it.
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- Bex, G. J., Gelade, W., Neven, F., & Vansummeren, S. (2010).
|
|
||||||
[*Learning Deterministic Regular Expressions for the Web.*](https://doi.org/10.1145/1806907.1806911) TODS 2010.
|
|
||||||
- Bex, G. J., Gelade, W., Martens, W., & Neven, F. (2010).
|
|
||||||
[*Simplifying XML Schema: Single-Type Approximations of Regular
|
|
||||||
Expressions.*](https://arxiv.org/abs/1004.2372) arXiv:1004.2372.
|
|
||||||
- Rissanen, J. (1978). *Modeling by shortest data description.*
|
|
||||||
Automatica 14(5).
|
|
||||||
|
|
@ -1,111 +0,0 @@
|
||||||
"""Extract Ansible role task module sequences and learn per-group grammars."""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
import yaml
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from .crx import CRX
|
|
||||||
from .expr import strip_k
|
|
||||||
|
|
||||||
|
|
||||||
IGNORE_MODULES = frozenset({'name', 'tags', 'when', 'register', 'no_log',
|
|
||||||
'changed_when', 'failed_when', 'ignore_errors',
|
|
||||||
'run_once', 'delegate_to', 'loop', 'loop_control',
|
|
||||||
'until', 'retries', 'delay', 'poll', 'async',
|
|
||||||
'become', 'become_user', 'become_flags',
|
|
||||||
'check_mode', 'diff', 'environment',
|
|
||||||
'vars', 'notify', 'args',
|
|
||||||
'block', 'rescue', 'always', 'include_tasks'})
|
|
||||||
|
|
||||||
|
|
||||||
def extract_module_name(task):
|
|
||||||
"""Extract the Ansible module name from a task dict.
|
|
||||||
|
|
||||||
The module is the key that is NOT a known non-module key.
|
|
||||||
Returns 'skip' for non-task entries like block/rescue/always.
|
|
||||||
"""
|
|
||||||
if not isinstance(task, dict):
|
|
||||||
return None
|
|
||||||
# Check for block/rescue/always — these contain nested tasks
|
|
||||||
for key in ('block', 'rescue', 'always'):
|
|
||||||
if key in task:
|
|
||||||
nested = task[key]
|
|
||||||
if isinstance(nested, list):
|
|
||||||
return [extract_module_name(t) for t in nested]
|
|
||||||
return None
|
|
||||||
# Find the module key (not name, not meta-keys)
|
|
||||||
for key, value in task.items():
|
|
||||||
if key in ('name',):
|
|
||||||
continue
|
|
||||||
if key in IGNORE_MODULES:
|
|
||||||
continue
|
|
||||||
if isinstance(value, (dict, list, str, bool, int, float)):
|
|
||||||
# It's the module name (venv or fqcn)
|
|
||||||
return strip_k(key)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def flatten_nested(seq):
|
|
||||||
"""Flatten nested lists into a single list."""
|
|
||||||
result = []
|
|
||||||
for item in seq:
|
|
||||||
if isinstance(item, list):
|
|
||||||
result.extend(flatten_nested(item))
|
|
||||||
elif item is not None and item != 'skip':
|
|
||||||
result.append(item)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
def get_role_category(role_name):
|
|
||||||
"""Extract category from role name like deploy_foo → deploy."""
|
|
||||||
parts = role_name.split('_')
|
|
||||||
if len(parts) >= 2:
|
|
||||||
return parts[0]
|
|
||||||
return 'other'
|
|
||||||
|
|
||||||
|
|
||||||
def load_role_module_sequence(role_dir):
|
|
||||||
"""Load a role's task file and extract the module sequence."""
|
|
||||||
task_file = role_dir / 'tasks' / 'main.yml'
|
|
||||||
if not task_file.exists():
|
|
||||||
return None, None
|
|
||||||
with open(task_file) as f:
|
|
||||||
data = yaml.safe_load(f)
|
|
||||||
if not isinstance(data, list):
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
modules = []
|
|
||||||
for task in data:
|
|
||||||
result = extract_module_name(task)
|
|
||||||
if isinstance(result, list):
|
|
||||||
modules.extend(flatten_nested(result))
|
|
||||||
elif result is not None:
|
|
||||||
modules.append(result)
|
|
||||||
|
|
||||||
return role_dir.name, modules
|
|
||||||
|
|
||||||
|
|
||||||
def collect_all_role_sequences(roles_dir='roles'):
|
|
||||||
"""Collect module sequences from all roles, grouped by category."""
|
|
||||||
by_category = defaultdict(list)
|
|
||||||
all_roles = []
|
|
||||||
for role_dir in sorted(Path(roles_dir).glob('*/tasks/main.yml')):
|
|
||||||
role_name = role_dir.parent.parent.name
|
|
||||||
name, seq = load_role_module_sequence(role_dir.parent.parent)
|
|
||||||
if seq:
|
|
||||||
cat = get_role_category(role_name)
|
|
||||||
by_category[cat].append((role_name, seq))
|
|
||||||
all_roles.append((role_name, seq))
|
|
||||||
return all_roles, by_category
|
|
||||||
|
|
||||||
|
|
||||||
def learn_grammar(sequences):
|
|
||||||
"""Run CRX on a list of sequences."""
|
|
||||||
if len(sequences) < 2:
|
|
||||||
seqs = [sequences[0]] if sequences else []
|
|
||||||
else:
|
|
||||||
seqs = sequences
|
|
||||||
if not seqs:
|
|
||||||
return 'ε'
|
|
||||||
crx = CRX()
|
|
||||||
return crx.infer(seqs)
|
|
||||||
|
|
@ -1,81 +0,0 @@
|
||||||
"""Convert YAML files to key-path sequences for BEX grammar inference."""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
import yaml
|
|
||||||
|
|
||||||
|
|
||||||
def yaml_to_keypath_sequence(data, prefix=""):
|
|
||||||
"""Convert parsed YAML data to a sequence of key paths (DFS traversal).
|
|
||||||
|
|
||||||
Each leaf (scalar) emits its full key path as a symbol.
|
|
||||||
Lists use a generic `[]` marker (no indices).
|
|
||||||
Values are NOT included — only key paths.
|
|
||||||
"""
|
|
||||||
seq = []
|
|
||||||
if isinstance(data, dict):
|
|
||||||
for key, value in data.items():
|
|
||||||
path = f"{prefix}.{key}" if prefix else key
|
|
||||||
if isinstance(value, (dict, list)):
|
|
||||||
seq.extend(yaml_to_keypath_sequence(value, path))
|
|
||||||
else:
|
|
||||||
seq.append(path)
|
|
||||||
elif isinstance(data, list):
|
|
||||||
for item in data:
|
|
||||||
list_prefix = f"{prefix}[]" if prefix else "[]"
|
|
||||||
if isinstance(item, (dict, list)):
|
|
||||||
seq.extend(yaml_to_keypath_sequence(item, list_prefix))
|
|
||||||
else:
|
|
||||||
seq.append(list_prefix)
|
|
||||||
return seq
|
|
||||||
|
|
||||||
|
|
||||||
def yaml_file_to_sequence(filepath):
|
|
||||||
"""Load a YAML file and convert to a key-path sequence."""
|
|
||||||
with open(filepath) as f:
|
|
||||||
data = yaml.safe_load(f)
|
|
||||||
if data is None:
|
|
||||||
return []
|
|
||||||
return yaml_to_keypath_sequence(data)
|
|
||||||
|
|
||||||
|
|
||||||
def is_vault_file(filepath):
|
|
||||||
"""Check if a file is an Ansible vault file (encrypted)."""
|
|
||||||
try:
|
|
||||||
with open(filepath) as f:
|
|
||||||
first = f.read(100)
|
|
||||||
return '$ANSIBLE_VAULT' in first or first.startswith('!vault |')
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def collect_all_sequences(root_dir=".", include_vault=False):
|
|
||||||
"""Collect key-path sequences from all YAML files.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list of (filepath, sequence) tuples.
|
|
||||||
"""
|
|
||||||
results = []
|
|
||||||
for path in sorted(Path(root_dir).rglob("*.yml")):
|
|
||||||
parts = path.parts
|
|
||||||
if any(d in parts for d in ('node_modules', '.venv', '__pycache__', '.git')):
|
|
||||||
continue
|
|
||||||
skippable = ('vault.yml' in path.name or 'vault' in path.name)
|
|
||||||
if not include_vault and (skippable or is_vault_file(path)):
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
seq = yaml_file_to_sequence(path)
|
|
||||||
if seq:
|
|
||||||
results.append((path, seq))
|
|
||||||
except Exception as e:
|
|
||||||
print(f" SKIP {path}: {e}")
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def sequences_to_crx(result_list):
|
|
||||||
"""Run CRX on collected sequences."""
|
|
||||||
from .crx import CRX
|
|
||||||
sequences = [seq for _, seq in result_list]
|
|
||||||
if not sequences:
|
|
||||||
return 'ε'
|
|
||||||
crx = CRX()
|
|
||||||
return crx.infer(sequences)
|
|
||||||
|
|
@ -1,71 +0,0 @@
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
plt.xkcd(scale=0.7, length=60, randomness=2)
|
|
||||||
|
|
||||||
FIG_W = 8
|
|
||||||
FIG_H = 5
|
|
||||||
|
|
||||||
# ── Chart 1: Context cost vs examples ──
|
|
||||||
fig1, ax1 = plt.subplots(figsize=(FIG_W, FIG_H))
|
|
||||||
|
|
||||||
N = [1, 5, 15, 36]
|
|
||||||
raw = [100, 500, 1500, 3600] # ~100 tokens/example
|
|
||||||
dervish = [40, 60, 60, 200] # grammar grows only when diversity grows
|
|
||||||
|
|
||||||
x = np.arange(len(N))
|
|
||||||
w = 0.35
|
|
||||||
|
|
||||||
bars1 = ax1.bar(x - w/2, raw, w, label='Raw examples', color='#e74c3c', alpha=0.85)
|
|
||||||
bars2 = ax1.bar(x + w/2, dervish, w, label='Dervish grammar', color='#3498db', alpha=0.85)
|
|
||||||
|
|
||||||
ax1.set_xticks(x)
|
|
||||||
ax1.set_xticklabels([f'{n} examples' for n in N])
|
|
||||||
ax1.set_ylabel('Tokens needed in context')
|
|
||||||
ax1.set_title('Context cost: raw examples vs Dervish grammar')
|
|
||||||
ax1.legend(frameon=False)
|
|
||||||
|
|
||||||
for bar in bars1:
|
|
||||||
ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 80,
|
|
||||||
f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=9)
|
|
||||||
for bar in bars2:
|
|
||||||
ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 80,
|
|
||||||
f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=9)
|
|
||||||
|
|
||||||
ax1.set_ylim(0, 4500)
|
|
||||||
fig1.tight_layout()
|
|
||||||
fig1.savefig('chart_context_cost.png', dpi=200)
|
|
||||||
plt.close(fig1)
|
|
||||||
|
|
||||||
# ── Chart 2: Tokens — Without vs With Dervish (per dataset) ──
|
|
||||||
fig2, ax2 = plt.subplots(figsize=(FIG_W, FIG_H))
|
|
||||||
|
|
||||||
datasets = ['Ansible Galaxy\n(15 roles)', 'Helm\n(6 configs)', 'Go lint\n(6 jobs)']
|
|
||||||
without = [5000, 3000, 900]
|
|
||||||
with_derv = [60, 40, 30]
|
|
||||||
ratios = [f'{int(w/d)}×' for w, d in zip(without, with_derv)]
|
|
||||||
|
|
||||||
x2 = np.arange(len(datasets))
|
|
||||||
w2 = 0.3
|
|
||||||
|
|
||||||
bw = ax2.bar(x2 - w2/2, without, w2, label='Without Dervish', color='#e74c3c', alpha=0.85)
|
|
||||||
bd = ax2.bar(x2 + w2/2, with_derv, w2, label='With Dervish', color='#3498db', alpha=0.85)
|
|
||||||
|
|
||||||
ax2.set_xticks(x2)
|
|
||||||
ax2.set_xticklabels(datasets)
|
|
||||||
ax2.set_ylabel('Tokens')
|
|
||||||
ax2.set_title('Token savings per dataset')
|
|
||||||
ax2.legend(frameon=False)
|
|
||||||
ax2.set_yscale('log')
|
|
||||||
ax2.set_ylim(5, 30000)
|
|
||||||
|
|
||||||
# Label compression ratios
|
|
||||||
for i, (r, wbar, dbar) in enumerate(zip(ratios, bw, bd)):
|
|
||||||
ax2.text(x2[i], without[i] * 1.3, r, ha='center', va='bottom', fontsize=11, fontweight='bold',
|
|
||||||
bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='gray', alpha=0.8))
|
|
||||||
|
|
||||||
fig2.tight_layout()
|
|
||||||
fig2.savefig('chart_token_savings.png', dpi=200)
|
|
||||||
plt.close(fig2)
|
|
||||||
|
|
||||||
print("Charts saved: chart_context_cost.png, chart_token_savings.png")
|
|
||||||
6
papers/README.md
Normal file
6
papers/README.md
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
# Papers
|
||||||
|
|
||||||
|
The Dervish algorithms are based on two papers by Bex et al.:
|
||||||
|
|
||||||
|
- **CRX** — [*Learning Deterministic Regular Expressions for the Web*](https://doi.org/10.1145/1806907.1806911) (TODS 2010)
|
||||||
|
- **iDRegEx** — [*Simplifying XML Schema: Single-Type Approximations of Regular Expressions*](https://arxiv.org/abs/1004.2372) (arXiv:1004.2372)
|
||||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -1,8 +1,5 @@
|
||||||
"""Tests for BEX paper algorithm implementations."""
|
"""Tests for BEX paper algorithm implementations."""
|
||||||
|
|
||||||
import sys
|
|
||||||
sys.path.insert(0, '/home/tobi/Desktop/kesai/ProjectManagement/companyweb')
|
|
||||||
|
|
||||||
from bex.soa import SOA
|
from bex.soa import SOA
|
||||||
from bex.twotinf import build_soa
|
from bex.twotinf import build_soa
|
||||||
from bex.rwr0 import rwr0
|
from bex.rwr0 import rwr0
|
||||||
|
|
@ -273,7 +270,7 @@ def run_all():
|
||||||
|
|
||||||
# ── Integration tests with real Ansible task data ──
|
# ── Integration tests with real Ansible task data ──
|
||||||
|
|
||||||
def test_integration_quartz_deploy():
|
def test_integration_linear_sequence():
|
||||||
"""Simple linear sequence — all tasks always in same order."""
|
"""Simple linear sequence — all tasks always in same order."""
|
||||||
seqs = [
|
seqs = [
|
||||||
['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'],
|
['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'],
|
||||||
|
|
@ -283,11 +280,11 @@ def test_integration_quartz_deploy():
|
||||||
result = crx.infer(seqs)
|
result = crx.infer(seqs)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'])
|
assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'])
|
||||||
print(f" PASS quartz_deploy: {result}")
|
print(f" PASS linear_sequence: {result}")
|
||||||
|
|
||||||
|
|
||||||
def test_integration_validate_system():
|
def test_integration_optional_tasks():
|
||||||
"""Optional shell tasks."""
|
"""Optional tasks — some sequences have more of the same."""
|
||||||
seqs = [
|
seqs = [
|
||||||
['shell', 'debug', 'shell', 'debug'],
|
['shell', 'debug', 'shell', 'debug'],
|
||||||
['shell', 'debug', 'shell', 'debug', 'shell', 'debug'],
|
['shell', 'debug', 'shell', 'debug', 'shell', 'debug'],
|
||||||
|
|
@ -297,11 +294,11 @@ def test_integration_validate_system():
|
||||||
result = crx.infer(seqs)
|
result = crx.infer(seqs)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
assert 'shell' in result and 'debug' in result
|
assert 'shell' in result and 'debug' in result
|
||||||
print(f" PASS validate_system: {result}")
|
print(f" PASS optional_tasks: {result}")
|
||||||
|
|
||||||
|
|
||||||
def test_integration_docker_detect_branch():
|
def test_integration_branching_paths():
|
||||||
"""Branching: docker compose v2 check or v1 fallback."""
|
"""Branching: one path or an alternative."""
|
||||||
seqs = [
|
seqs = [
|
||||||
['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'],
|
['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'],
|
||||||
['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'],
|
['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'],
|
||||||
|
|
@ -310,11 +307,11 @@ def test_integration_docker_detect_branch():
|
||||||
result = crx.infer(seqs)
|
result = crx.infer(seqs)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
assert 'file' in result and 'template' in result and 'shell' in result
|
assert 'file' in result and 'template' in result and 'shell' in result
|
||||||
print(f" PASS docker_detect: {result}")
|
print(f" PASS branching_paths: {result}")
|
||||||
|
|
||||||
|
|
||||||
def test_integration_firewall_gating():
|
def test_integration_conditional_tasks():
|
||||||
"""Conditional firewall rule sequence (gated)."""
|
"""Tasks that sometimes appear, sometimes not."""
|
||||||
seqs = [
|
seqs = [
|
||||||
['assert', 'file', 'template', 'shell', 'wait_for'],
|
['assert', 'file', 'template', 'shell', 'wait_for'],
|
||||||
['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'],
|
['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'],
|
||||||
|
|
@ -324,7 +321,7 @@ def test_integration_firewall_gating():
|
||||||
result = crx.infer(seqs)
|
result = crx.infer(seqs)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
assert 'assert' in result and 'file' in result
|
assert 'assert' in result and 'file' in result
|
||||||
print(f" PASS firewall_gating: {result}")
|
print(f" PASS conditional_tasks: {result}")
|
||||||
|
|
||||||
|
|
||||||
def test_integration_idregex_linear():
|
def test_integration_idregex_linear():
|
||||||
|
|
@ -361,8 +358,8 @@ def test_integration_ikoa_linear():
|
||||||
print(f" PASS ikoa_linear: {expr}")
|
print(f" PASS ikoa_linear: {expr}")
|
||||||
|
|
||||||
|
|
||||||
def test_integration_backup_restic():
|
def test_integration_looping_tasks():
|
||||||
"""Sequence with loop (systemd enable)."""
|
"""Sequence with loop (repeated tasks)."""
|
||||||
seqs = [
|
seqs = [
|
||||||
['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'],
|
['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'],
|
||||||
['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'],
|
['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'],
|
||||||
|
|
@ -370,7 +367,7 @@ def test_integration_backup_restic():
|
||||||
crx = CRX()
|
crx = CRX()
|
||||||
result = crx.infer(seqs)
|
result = crx.infer(seqs)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
print(f" PASS backup_restic: {result}")
|
print(f" PASS looping_tasks: {result}")
|
||||||
|
|
||||||
|
|
||||||
def run_all():
|
def run_all():
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue