purge make_charts.py, examples/, full-text papers, blog_post (moved to ~/Desktop/kesai/); translate German CLI to English

This commit is contained in:
tobjend 2026-07-01 11:28:42 +02:00
parent c8a49f0149
commit 6bf7a681ce
9 changed files with 45 additions and 5270 deletions

View file

@ -19,7 +19,7 @@ from .ilocal import iLocal, extract_contexts_from_file, reduce_contexts
def find_yaml_files(directory): def find_yaml_files(directory):
"""Findet alle YAML-Dateien in einem Verzeichnis (rekursiv).""" """Find all YAML files in a directory (recursive)."""
patterns = ['**/*.yml', '**/*.yaml'] patterns = ['**/*.yml', '**/*.yaml']
files = [] files = []
for pattern in patterns: for pattern in patterns:
@ -32,37 +32,37 @@ def main():
description='bex — BEX-based YAML Grammar Inference', description='bex — BEX-based YAML Grammar Inference',
) )
parser.add_argument('--dir', type=str, default='roles/', parser.add_argument('--dir', type=str, default='roles/',
help='Verzeichnis mit YAML-Dateien (default: roles/)') help='Directory with YAML files (default: roles/)')
parser.add_argument('--k-max', type=int, default=5, parser.add_argument('--k-max', type=int, default=5,
help='Max k für k-ORE-Inferenz (default: 5)') help='Max k for k-ORE inference (default: 5)')
parser.add_argument('--context', type=str, default=None, parser.add_argument('--context', type=str, default=None,
help='Auf spezifischen Container-Key beschränken (z.B. tasks)') help='Restrict to specific container key (e.g. tasks)')
parser.add_argument('--output', type=str, default=None, parser.add_argument('--output', type=str, default=None,
help='Output-Datei für Template (default: stdout)') help='Output file for template (default: stdout)')
parser.add_argument('--ilocal', action='store_true', parser.add_argument('--ilocal', action='store_true',
help='iLocal-Kontextanalyse durchführen') help='Run iLocal context analysis')
parser.add_argument('--crx', action='store_true', parser.add_argument('--crx', action='store_true',
help='CRX (direct CHARE inference) verwenden') help='Use CRX (direct CHARE inference)')
parser.add_argument('--verbose', '-v', action='store_true', parser.add_argument('--verbose', '-v', action='store_true',
help='Ausführliche Ausgabe') help='Verbose output')
parser.add_argument('--stats', action='store_true', parser.add_argument('--stats', action='store_true',
help='Zeige Token-Statistiken') help='Show token statistics')
args = parser.parse_args() args = parser.parse_args()
if not os.path.isdir(args.dir): if not os.path.isdir(args.dir):
print(f"Fehler: Verzeichnis '{args.dir}' nicht gefunden.", file=sys.stderr) print(f"Error: directory '{args.dir}' not found.", file=sys.stderr)
sys.exit(1) sys.exit(1)
yaml_files = find_yaml_files(args.dir) yaml_files = find_yaml_files(args.dir)
if not yaml_files: if not yaml_files:
print(f"Keine YAML-Dateien in '{args.dir}' gefunden.", file=sys.stderr) print(f"No YAML files found in '{args.dir}'.", file=sys.stderr)
sys.exit(1) sys.exit(1)
print(f"Gefundene YAML-Dateien: {len(yaml_files)}", file=sys.stderr) print(f"Found YAML files: {len(yaml_files)}", file=sys.stderr)
if args.ilocal: if args.ilocal:
print("\n=== iLocal: Kontext-Extraktion ===", file=sys.stderr) print("\n=== iLocal: Context Extraction ===", file=sys.stderr)
all_contexts = {} all_contexts = {}
for f in yaml_files: for f in yaml_files:
contexts = extract_contexts_from_file(f) contexts = extract_contexts_from_file(f)
@ -72,11 +72,11 @@ def main():
all_contexts[ctx].extend(seqs) all_contexts[ctx].extend(seqs)
reduced = reduce_contexts(all_contexts) reduced = reduce_contexts(all_contexts)
print(f" Kontexte gefunden: {len(reduced)}", file=sys.stderr) print(f" Contexts found: {len(reduced)}", file=sys.stderr)
for ctx, seqs in sorted(reduced.items()): for ctx, seqs in sorted(reduced.items()):
lengths = [len(s) for s in seqs] lengths = [len(s) for s in seqs]
print(f" {ctx}: {len(seqs)} Sequenzen, " print(f" {ctx}: {len(seqs)} sequences, "
f"Längen {min(lengths)}-{max(lengths)}, " f"lengths {min(lengths)}-{max(lengths)}, "
f"unique_seqs={len(set(tuple(s) for s in seqs))}", f"unique_seqs={len(set(tuple(s) for s in seqs))}",
file=sys.stderr) file=sys.stderr)
@ -94,30 +94,30 @@ def main():
print(f" {os.path.relpath(f)}: {seq}", file=sys.stderr) print(f" {os.path.relpath(f)}: {seq}", file=sys.stderr)
except Exception as e: except Exception as e:
if args.verbose: if args.verbose:
print(f" Fehler in {f}: {e}", file=sys.stderr) print(f" Error in {f}: {e}", file=sys.stderr)
if not all_sequences: if not all_sequences:
print("Keine Sequenzen extrahiert.", file=sys.stderr) print("No sequences extracted.", file=sys.stderr)
sys.exit(1) sys.exit(1)
print(f" Sequenzen extrahiert: {len(all_sequences)}", file=sys.stderr) print(f" Sequences extracted: {len(all_sequences)}", file=sys.stderr)
lengths = [len(s) for s in all_sequences] lengths = [len(s) for s in all_sequences]
print(f" Längen: min={min(lengths)}, max={max(lengths)}, " print(f" Lengths: min={min(lengths)}, max={max(lengths)}, "
f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr) f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr)
if args.stats: if args.stats:
stats = tokenizer.get_statistics() stats = tokenizer.get_statistics()
print("\n=== Token-Statistiken ===", file=sys.stderr) print("\n=== Token Statistics ===", file=sys.stderr)
for token, count in list(stats.items())[:30]: for token, count in list(stats.items())[:30]:
print(f" {token}: {count}", file=sys.stderr) print(f" {token}: {count}", file=sys.stderr)
print("\n=== k-ORE Inferenz ===", file=sys.stderr) print("\n=== k-ORE Inference ===", file=sys.stderr)
kore = kOREInference(k_max=args.k_max) kore = kOREInference(k_max=args.k_max)
if args.crx: if args.crx:
result = kore.infer_with_crx(all_sequences) result = kore.infer_with_crx(all_sequences)
_, expr, method = result _, expr, method = result
print(f" Methode: {method}", file=sys.stderr) print(f" Method: {method}", file=sys.stderr)
else: else:
result = kore.infer(all_sequences) result = kore.infer(all_sequences)
if result: if result:
@ -127,7 +127,7 @@ def main():
expr = "" expr = ""
print(" Kein Ergebnis", file=sys.stderr) print(" Kein Ergebnis", file=sys.stderr)
print(f" Inferierter Ausdruck: {expr}", file=sys.stderr) print(f" Inferred expression: {expr}", file=sys.stderr)
print("\n=== One-Shot Template ===", file=sys.stderr) print("\n=== One-Shot Template ===", file=sys.stderr)
print(file=sys.stderr) print(file=sys.stderr)
@ -136,7 +136,7 @@ def main():
if args.output: if args.output:
with open(args.output, 'w') as f: with open(args.output, 'w') as f:
f.write(template) f.write(template)
print(f"Template geschrieben nach: {args.output}", file=sys.stderr) print(f"Template written to: {args.output}", file=sys.stderr)
else: else:
print(template) print(template)

View file

@ -1,263 +0,0 @@
# Dervish: Discovering Unwritten Conventions with Grammar Inference
<p align="left"><img src="dervish-logo.png" alt="Dervish" width="180"></p>
**How we turned 36 Ansible roles into a 200-character grammar — and why
it matters for LLM agents.**
## The problem
Every codebase has unwritten conventions. Your team's Docker Compose
files always put `image` before `ports` before `volumes`. Your Ansible
deploy roles always start with `assert`, then `file`, then `template`.
Your CI pipelines always run `lint` before `test` before `deploy`.
Nobody writes these down. They're emergent — copied from role to role,
file to file, until they become a tacit standard.
When an LLM agent needs to generate new content that follows these
conventions, you have two options:
1. **Stuff every existing file into context** — 36 deploy roles = 15,000
tokens. You'll hit the context window on your third example.
2. **Give it one or two examples and hope** — the LLM will guess the
pattern, and it will often guess wrong.
Neither is good. The first is wasteful. The second is unreliable.
What you really want is the **compiled convention** — the minimal
description of what all 36 roles share, expressed in ~200 tokens. An
LLM can follow a rule in 200 tokens far more reliably than it can
infer a pattern from 36 examples.
This is grammar inference.
## The approach
Given a set of example sequences over some alphabet (e.g., Ansible
module names, Docker Compose keys, CI job names), learn a regular
expression that describes the general pattern.
We implemented two algorithms by Bex et al., taken from a pair of 2010
papers (one in TODS, one on arXiv):
- **CRX** (TODS 2010 §6): A single-pass algorithm that builds a
predecessor relation over symbols, computes equivalence classes,
and emits a Chain Regular Expression (CHARE) that matches ALL
input sequences. Fast, deterministic, captures the full vocabulary.
- **iDRegEx** (arXiv 2010): A probabilistic algorithm using k-testable
Observation Automata (k-OA) trained with Baum-Welch EM. It finds
only the *minimal common core* — the symbols that appear in every
example. Robust against noise, but fails (returns ∅) when the
examples are too diverse.
Both run in the **ensemble**: CRX produces a permissive grammar (full
vocabulary, many optional parts), iDRegEx produces a strict grammar
(minimal core). A Minimum Description Length (MDL) score picks the
winner: the grammar that compresses the data best.
## The algorithms, briefly
### CRX — Chain Regular Expression inference
CRX (Algorithm 7, TODS 2010) works in four steps:
1. **Build the immediate-predecessor relation.** For every adjacent
pair (x, y) across all sequences, record that x precedes y. If
symbol `assert` always appears before `file`, record
`assert → file`.
2. **Compute equivalence classes.** Take the reflexive-transitive
closure of the predecessor relation. The strongly connected
components are *equivalence classes* — groups of symbols that can
appear in the same position. If `copy` and `template` both follow
`file` and precede `command`, they're in the same class.
3. **Merge singleton classes.** A class with one symbol that shares
the same predecessor/successor sets as another singleton class
gets merged. This handles symbols that always appear in the
same structural position.
4. **Topological sort.** The equivalence classes are sorted by their
position in the Hasse diagram of the predecessor relation. Each
class becomes a factor in the output, annotated with a quantifier:
- `+` (one or more) if the class forms a cycle
- `+?` (zero or more) if the class appears variably
- `?` (optional) if the class can be absent
- (exact) if the class always appears exactly once
The result is a CHARE: a sequence of factors where each factor is a
disjunction of equivalent symbols with a quantifier.
### iDRegEx — k-optimal regular expression inference
iDRegEx (Algorithm 4, arXiv 2010) uses a probabilistic automaton:
1. **Build a complete k-OA.** A k-testable Observation Automaton
records all k-grams (subsequences of length k) from the input
sequences. The automaton's states represent (k-1)-grams.
2. **Train with Baum-Welch.** EM iterations assign probabilities to
transitions, learning which paths through the automaton are most
likely given the data.
3. **Disambiguate.** Remove nondeterministic transitions — for any
state and symbol, keep only the most probable next state.
4. **Prune.** Remove low-probability edges and unreachable states,
leaving only the most likely paths.
5. **Extract with rwr².** The REWRITE-SQUARED algorithm (rwr²,
Algorithm 3) collapses the pruned automaton into a k-optimal
regular expression — the minimal common core.
### MDL scoring — picking the right level of specificity
The Minimum Description Length principle (Rissanen 1978) says: the
best grammar is the one that minimizes the sum of its own size and
the cost of encoding the data using it.
```
MDL = model_cost + data_cost
```
**model_cost** = the number of alphabet symbol occurrences in the
grammar. A grammar with 5 unique symbols used once each has
model_cost = 5.
**data_cost** = Σ log₂(|L(r)|) across all sequences, where |L(r)| is
the number of strings of length len(s) that the grammar accepts.
A grammar like `(a+b+c+...+s)+` — a disjunction of 19 symbols — accepts any of the 19 at each
position, so for a sequence of length 120, the data cost is
120 × log₂(19) ≈ 510 bits. A grammar like `a.b.c.d.e` accepts only
1 string of length 5, so data cost is 0.
The ensemble picks the grammar with the lowest total MDL. This
automatically balances specificity against coverage: a grammar that
matches only 1 sequence but does so perfectly (low data cost) can
beat a grammar that matches all sequences but is extremely permissive
(high data cost).
## The results
### Ansible deploy roles — 36 roles from companyweb
Our own deploy roles cover everything from AdGuard Home to
Woodpecker CI. They have NO schema — each is a free-form script.
```
Grammar: docker_volume+?.group?.docker_container?.user?.apt?.npm?.
(assert+...+command+copy+file+template+set_fact+...+wait_for)+?.
(cron+firewalld)?
Match: 36/36
MDL: 2186.28
```
Bottleneck analysis: optional docker setup (volume, group, container,
user, apt, npm), then a large disjunction of ~25 task modules (zero or
more, per the `+?` quantifier), then optional cron/firewalld at the end. This captures the
convention precisely.
**Compression: 36 roles (15,000 tokens) → 200 tokens (75×)**
### Geerlingguy Galaxy roles — 15 popular roles
Jeff Geerling's roles are the most popular on Ansible Galaxy. He has
never documented their structural pattern. Yet every one of the 15
follows the same arc:
```
Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+.
include+?.(npm+pip)+?.lineinfile?
Match: 15/15
MDL: 596.64
```
Check prerequisites, OS-specific variables, install packages,
configure with templates, start services, optionally run sub-tasks,
install npm/pip packages, and optionally tweak config lines.
**This is the first explicit description of the geerlingguy role
module ordering convention.** It took 15 roles and a grammar inference
algorithm to write it down.
**Compression: 15 roles (5,000 tokens) → 60 tokens (83×)**
### Ensemble dynamics
The ensemble (CRX + iDRegEx + MDL) selects different winners
depending on the data:
| Dataset | Winner | Why |
|---------|--------|-----|
| Ansible galaxy (15 roles) | CRX | iDRegEx returned ∅ (too diverse) |
| Helm prom-stack (6 configs) | **iDRegEx** | Finds minimal core across all configs |
| Terraform modules (8) | CRX | iDRegEx returned ∅ (no common core: every resource type is optional across domains) |
| GitHub Actions Go lint (6) | CRX | Tight pattern, all match |
iDRegEx wins when the data has a clear common core. CRX wins when
there's no single shared subsequence (the roles share the *vocabulary*
but not the *order*).
## The MCP
The engine is exposed as an MCP server:
```python
from bex.mcp_server import infer_best_grammar
# Full coverage
output = infer_best_grammar(
sequences=role_sequences,
prefer="crx",
)
# Returns:
# Best: CRX (MDL 288)
# Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+
# .include+?.(npm+pip)+?.lineinfile?
# Ensemble — let MDL pick
output = infer_best_grammar(sequences=role_sequences)
```
An agent workflow:
1. Agent needs to write an Ansible role
2. Finds 15 existing geerlingguy roles, extracts their task module sequences
3. Calls `infer_best_grammar(sequences=..., prefer='crx')`
4. Gets back the grammar in ~60 tokens
5. Generates a new role that follows the structural pattern
Without the MCP: 15 role files in context (5,000 tokens), or guesswork.
With the MCP: one grammar rule (~60 tokens), known to match 15/15 roles.
## What it means
Grammar inference turns **examples** into **rules**. The rule is a
compressed description of the structural convention — and for
schema-less content like the geerlingguy role module ordering, this is
the *first time* the convention has been written down at all.
For LLM agents, this changes the trade-off between context and
accuracy. Instead of flooding the context window with examples, the
agent can call the MCP, get the rule in ~60 tokens, and follow it.
The rule is more reliable than guessing from examples, and it costs
less than the first example would have.
The algorithm doesn't need to understand what a deploy role does. It
doesn't know that `file` creates directories and `template` renders
Jinja2. It only needs to see 36 sequences of module names and find
the pattern they all share. The structural convention is in the data
— you just have to extract it.
## References
- Bex, G. J., Gelade, W., Neven, F., & Vansummeren, S. (2010).
[*Learning Deterministic Regular Expressions for the Web.*](https://doi.org/10.1145/1806907.1806911) TODS 2010.
- Bex, G. J., Gelade, W., Martens, W., & Neven, F. (2010).
[*Simplifying XML Schema: Single-Type Approximations of Regular
Expressions.*](https://arxiv.org/abs/1004.2372) arXiv:1004.2372.
- Rissanen, J. (1978). *Modeling by shortest data description.*
Automatica 14(5).

View file

@ -1,111 +0,0 @@
"""Extract Ansible role task module sequences and learn per-group grammars."""
from pathlib import Path
import yaml
from collections import defaultdict
from .crx import CRX
from .expr import strip_k
# Task-level keys that can sit next to the module key in a task dict and must
# never be reported as the module itself.  Despite the name, the entries are
# Ansible task keywords, not modules.  'include_tasks' is listed on purpose so
# that include-only tasks contribute no symbol.
# 'action' / 'local_action' are deliberately NOT listed: they stand in for the
# module key, so ignoring them would make such tasks disappear entirely.
IGNORE_MODULES = frozenset({
    # identification and conditions
    'name', 'tags', 'when', 'register', 'no_log',
    'changed_when', 'failed_when', 'ignore_errors', 'ignore_unreachable',
    'any_errors_fatal',
    # delegation and loops
    'run_once', 'delegate_to', 'delegate_facts', 'loop', 'loop_control',
    'until', 'retries', 'delay', 'poll', 'async', 'throttle', 'timeout',
    # privilege escalation and connection
    'become', 'become_user', 'become_flags', 'become_method', 'become_exe',
    'connection', 'remote_user', 'port',
    # execution environment
    'check_mode', 'diff', 'environment', 'debugger',
    'vars', 'notify', 'args', 'collections', 'module_defaults',
    # structural keys handled (or intentionally skipped) by the extractor
    'block', 'rescue', 'always', 'include_tasks',
})
def extract_module_name(task):
    """Extract the Ansible module name from a task dict.

    The module is the first key that is neither listed in
    ``IGNORE_MODULES`` nor a legacy ``with_<lookup>`` loop keyword.

    Args:
        task: One parsed entry of a task list.

    Returns:
        * ``None`` when ``task`` is not a dict, when no module key is found,
          or when the entry is a block whose ``block``/``rescue``/``always``
          values are all non-lists.
        * A list with one (possibly nested) result per nested task when the
          entry is a block.  Sections are visited in the order ``block``,
          ``rescue``, ``always``.
        * Otherwise ``strip_k(key)`` for the module key.
    """
    if not isinstance(task, dict):
        return None

    # `rescue` and `always` only ever appear next to `block`, so returning
    # after the first section found would silently drop them.  Collect every
    # section that is present instead.
    sections = [task[key] for key in ('block', 'rescue', 'always') if key in task]
    if sections:
        task_lists = [section for section in sections if isinstance(section, list)]
        if not task_lists:
            return None
        return [extract_module_name(nested)
                for task_list in task_lists
                for nested in task_list]

    for key, value in task.items():
        if key in IGNORE_MODULES:
            continue
        # with_items, with_dict, ... are loop keywords, not modules.
        if isinstance(key, str) and key.startswith('with_'):
            continue
        # `value is None` covers modules invoked without arguments
        # (e.g. a bare `setup:`), which YAML parses to None.
        if value is None or isinstance(value, (dict, list, str, bool, int, float)):
            # NOTE(review): strip_k presumably normalises the key
            # (short name vs. FQCN) — confirm against .expr.
            return strip_k(key)
    return None
def flatten_nested(seq):
    """Return the items of ``seq`` as one flat list.

    Nested lists are expanded depth-first in order.  ``None`` entries and
    the literal ``'skip'`` are left out; every other item is kept as is.
    """
    def _walk(items):
        for entry in items:
            if isinstance(entry, list):
                yield from _walk(entry)
            elif entry is not None and entry != 'skip':
                yield entry

    return list(_walk(seq))
def get_role_category(role_name):
    """Return the text before the first underscore of ``role_name``.

    Example: ``deploy_foo`` gives ``deploy``.  Names without any underscore
    fall into the category ``'other'``.
    """
    prefix, underscore, _rest = role_name.partition('_')
    return prefix if underscore else 'other'
def load_role_module_sequence(role_dir):
    """Load a role's ``tasks/main.yml`` and extract its module sequence.

    Args:
        role_dir: ``pathlib.Path`` of the role directory (it is combined
            with ``/`` and its ``.name`` is read).

    Returns:
        ``(role_dir.name, modules)`` where ``modules`` is the flat list of
        module names in task order, or ``(None, None)`` when the task file
        does not exist or its top level is not a list.
    """
    task_file = role_dir / 'tasks' / 'main.yml'
    if not task_file.exists():
        return None, None
    # Binary mode: let the YAML loader detect the encoding (UTF-8 unless a
    # BOM says otherwise) instead of decoding with the locale's default.
    with open(task_file, 'rb') as f:
        data = yaml.safe_load(f)
    if not isinstance(data, list):
        return None, None
    modules = []
    for task in data:
        result = extract_module_name(task)
        if isinstance(result, list):
            # Block entries come back as nested lists of names.
            modules.extend(flatten_nested(result))
        elif result is not None:
            modules.append(result)
    return role_dir.name, modules
def collect_all_role_sequences(roles_dir='roles'):
    """Gather the module sequence of every role below ``roles_dir``.

    A role is any directory that has a ``tasks/main.yml``.  Roles whose
    sequence is empty or could not be loaded are left out.

    Returns:
        ``(all_roles, by_category)``: a list of ``(role_name, sequence)``
        tuples in sorted path order, and a ``defaultdict`` mapping each
        category (see ``get_role_category``) to the same kind of tuples.
    """
    all_roles = []
    by_category = defaultdict(list)
    task_files = sorted(Path(roles_dir).glob('*/tasks/main.yml'))
    for task_file in task_files:
        # <roles_dir>/<role>/tasks/main.yml -> <roles_dir>/<role>
        role_root = task_file.parent.parent
        _, modules = load_role_module_sequence(role_root)
        if not modules:
            continue
        entry = (role_root.name, modules)
        by_category[get_role_category(role_root.name)].append(entry)
        all_roles.append(entry)
    return all_roles, by_category
def learn_grammar(sequences):
    """Infer a grammar for ``sequences`` with CRX.

    Returns ``'ε'`` when there are no sequences; otherwise whatever
    ``CRX().infer`` returns for them.
    """
    count = len(sequences)
    if count == 0:
        return 'ε'
    # A single sequence is handed over wrapped in a fresh one-element list.
    batch = [sequences[0]] if count == 1 else sequences
    return CRX().infer(batch)

View file

@ -1,81 +0,0 @@
"""Convert YAML files to key-path sequences for BEX grammar inference."""
from pathlib import Path
import yaml
def yaml_to_keypath_sequence(data, prefix=""):
    """Convert parsed YAML data to a sequence of key paths (DFS traversal).

    Each scalar leaf emits its full key path as one symbol.  Dict levels
    are joined with ``.``; list levels use a generic ``[]`` marker (no
    indices).  Values are NOT included, only key paths.  Empty dicts and
    empty lists emit nothing, and a top-level scalar yields an empty list.

    Args:
        data: Parsed YAML (dict, list or scalar).
        prefix: Key path of ``data`` itself; empty at the top level.

    Returns:
        List of key-path strings in document order.
    """
    seq = []
    if isinstance(data, dict):
        for key, value in data.items():
            # str(key): YAML keys may be ints, bools or None.  Nested keys
            # are already stringified by the f-string, so do the same at the
            # top level to keep every symbol a string.
            path = f"{prefix}.{key}" if prefix else str(key)
            if isinstance(value, (dict, list)):
                seq.extend(yaml_to_keypath_sequence(value, path))
            else:
                seq.append(path)
    elif isinstance(data, list):
        # The marker is the same for every item, so build it once.
        list_prefix = f"{prefix}[]" if prefix else "[]"
        for item in data:
            if isinstance(item, (dict, list)):
                seq.extend(yaml_to_keypath_sequence(item, list_prefix))
            else:
                seq.append(list_prefix)
    return seq
def yaml_file_to_sequence(filepath):
    """Load a YAML file and convert it to a key-path sequence.

    Args:
        filepath: Path of the YAML file.

    Returns:
        The key-path list from ``yaml_to_keypath_sequence``; an empty list
        for an empty document.

    Raises:
        Whatever ``open`` or ``yaml.safe_load`` raise for unreadable or
        malformed files; nothing is caught here.
    """
    # Binary mode: let the YAML loader detect the encoding (UTF-8 unless a
    # BOM says otherwise) instead of decoding with the locale's default.
    with open(filepath, 'rb') as f:
        data = yaml.safe_load(f)
    if data is None:
        return []
    return yaml_to_keypath_sequence(data)
def is_vault_file(filepath):
    """Check if a file is an Ansible vault file (encrypted).

    Only the first 100 bytes are inspected: the file counts as a vault if
    they contain ``$ANSIBLE_VAULT`` or start with ``!vault |``.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        ``True`` for a vault file, ``False`` otherwise, including when the
        file cannot be opened or read.
    """
    try:
        # Read raw bytes: both markers are ASCII, and undecodable bytes
        # after the header must not turn a vault file into a "no".
        with open(filepath, 'rb') as f:
            head = f.read(100)
    except (OSError, ValueError):
        # Missing or unreadable file, or an invalid path value.
        return False
    return b'$ANSIBLE_VAULT' in head or head.startswith(b'!vault |')
def collect_all_sequences(root_dir=".", include_vault=False):
    """Collect key-path sequences from all YAML files below ``root_dir``.

    Both ``*.yml`` and ``*.yaml`` files are visited, in sorted path order.
    Anything inside ``node_modules``, ``.venv``, ``__pycache__`` or ``.git``
    is skipped.  Files that fail to load are reported on stdout and skipped;
    files that yield an empty sequence are left out.

    Args:
        root_dir: Directory to search recursively.
        include_vault: When false, skip files whose name contains ``vault``
            and files that ``is_vault_file`` recognises.

    Returns:
        list of (filepath, sequence) tuples.
    """
    skip_dirs = {'node_modules', '.venv', '__pycache__', '.git'}
    root = Path(root_dir)
    candidates = sorted(
        path
        for pattern in ('*.yml', '*.yaml')
        for path in root.rglob(pattern)
    )

    results = []
    for path in candidates:
        if skip_dirs.intersection(path.parts):
            continue
        # 'vault' in the name already covers 'vault.yml'.
        if not include_vault and ('vault' in path.name or is_vault_file(path)):
            continue
        try:
            seq = yaml_file_to_sequence(path)
        except Exception as e:
            # Best effort: one unreadable or malformed file must not abort
            # the whole scan.
            print(f" SKIP {path}: {e}")
            continue
        if seq:
            results.append((path, seq))
    return results
def sequences_to_crx(result_list):
    """Infer a CRX grammar from ``(filepath, sequence)`` pairs.

    Only the sequences are used.  Returns ``'ε'`` when ``result_list`` holds
    no pairs, otherwise the result of ``CRX().infer``.
    """
    from .crx import CRX

    keypath_runs = [keypaths for _source, keypaths in result_list]
    if len(keypath_runs) == 0:
        return 'ε'
    return CRX().infer(keypath_runs)

View file

@ -1,71 +0,0 @@
import matplotlib.pyplot as plt
import numpy as np
# Switch matplotlib to its xkcd sketch style before any figure is created.
plt.xkcd(scale=0.7, length=60, randomness=2)

# Figure size (inches) used by the charts in this script.
FIG_W = 8
FIG_H = 5

# ── Chart 1: Context cost vs examples ──
# Grouped bars: tokens needed for N raw examples vs. for the inferred grammar.
fig1, ax1 = plt.subplots(figsize=(FIG_W, FIG_H))

N = [1, 5, 15, 36]
raw = [100, 500, 1500, 3600]  # ~100 tokens/example
dervish = [40, 60, 60, 200]  # grammar grows only when diversity grows

x = np.arange(len(N))
w = 0.35  # bar width; each pair is centred on its tick

bars1 = ax1.bar(x - w/2, raw, w, label='Raw examples', color='#e74c3c', alpha=0.85)
bars2 = ax1.bar(x + w/2, dervish, w, label='Dervish grammar', color='#3498db', alpha=0.85)

ax1.set_xticks(x)
ax1.set_xticklabels([f'{n} examples' for n in N])
ax1.set_ylabel('Tokens needed in context')
ax1.set_title('Context cost: raw examples vs Dervish grammar')
ax1.legend(frameon=False)

# Write each bar's value just above it: raw-example bars first, then grammar bars.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    centre = bar.get_x() + bar.get_width()/2
    ax1.text(centre, height + 80, str(int(height)),
             ha='center', va='bottom', fontsize=9)

ax1.set_ylim(0, 4500)
fig1.tight_layout()
fig1.savefig('chart_context_cost.png', dpi=200)
plt.close(fig1)
# ── Chart 2: Tokens — Without vs With Dervish (per dataset) ──
# Grouped bars on a log axis, with the compression ratio printed per dataset.
fig2, ax2 = plt.subplots(figsize=(FIG_W, FIG_H))

datasets = ['Ansible Galaxy\n(15 roles)', 'Helm\n(6 configs)', 'Go lint\n(6 jobs)']
without = [5000, 3000, 900]
with_derv = [60, 40, 30]
# Whole-number compression factor per dataset, e.g. 5000 / 60 -> '83×'.
ratios = [f'{int(before/after)}×' for before, after in zip(without, with_derv)]

x2 = np.arange(len(datasets))
w2 = 0.3  # bar width

bw = ax2.bar(x2 - w2/2, without, w2, label='Without Dervish', color='#e74c3c', alpha=0.85)
bd = ax2.bar(x2 + w2/2, with_derv, w2, label='With Dervish', color='#3498db', alpha=0.85)

ax2.set_xticks(x2)
ax2.set_xticklabels(datasets)
ax2.set_ylabel('Tokens')
ax2.set_title('Token savings per dataset')
ax2.legend(frameon=False)
ax2.set_yscale('log')
ax2.set_ylim(5, 30000)

# Label compression ratios, centred on each tick and placed above the taller bar.
for xpos, tokens, ratio in zip(x2, without, ratios):
    ax2.text(xpos, tokens * 1.3, ratio, ha='center', va='bottom', fontsize=11, fontweight='bold',
             bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='gray', alpha=0.8))

fig2.tight_layout()
fig2.savefig('chart_token_savings.png', dpi=200)
plt.close(fig2)

print("Charts saved: chart_context_cost.png, chart_token_savings.png")

6
papers/README.md Normal file
View file

@ -0,0 +1,6 @@
# Papers
The Dervish algorithms are based on two papers by Bex et al.:
- **CRX** — [*Learning Deterministic Regular Expressions for the Web*](https://doi.org/10.1145/1806907.1806911) (TODS 2010)
- **iDRegEx** — [*Simplifying XML Schema: Single-Type Approximations of Regular Expressions*](https://arxiv.org/abs/1004.2372) (arXiv:1004.2372)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,8 +1,5 @@
"""Tests for BEX paper algorithm implementations.""" """Tests for BEX paper algorithm implementations."""
import sys
sys.path.insert(0, '/home/tobi/Desktop/kesai/ProjectManagement/companyweb')
from bex.soa import SOA from bex.soa import SOA
from bex.twotinf import build_soa from bex.twotinf import build_soa
from bex.rwr0 import rwr0 from bex.rwr0 import rwr0
@ -273,7 +270,7 @@ def run_all():
# ── Integration tests with real Ansible task data ── # ── Integration tests with real Ansible task data ──
def test_integration_quartz_deploy(): def test_integration_linear_sequence():
"""Simple linear sequence — all tasks always in same order.""" """Simple linear sequence — all tasks always in same order."""
seqs = [ seqs = [
['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'], ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'],
@ -283,11 +280,11 @@ def test_integration_quartz_deploy():
result = crx.infer(seqs) result = crx.infer(seqs)
assert result is not None assert result is not None
assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for']) assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'])
print(f" PASS quartz_deploy: {result}") print(f" PASS linear_sequence: {result}")
def test_integration_validate_system(): def test_integration_optional_tasks():
"""Optional shell tasks.""" """Optional tasks — some sequences have more of the same."""
seqs = [ seqs = [
['shell', 'debug', 'shell', 'debug'], ['shell', 'debug', 'shell', 'debug'],
['shell', 'debug', 'shell', 'debug', 'shell', 'debug'], ['shell', 'debug', 'shell', 'debug', 'shell', 'debug'],
@ -297,11 +294,11 @@ def test_integration_validate_system():
result = crx.infer(seqs) result = crx.infer(seqs)
assert result is not None assert result is not None
assert 'shell' in result and 'debug' in result assert 'shell' in result and 'debug' in result
print(f" PASS validate_system: {result}") print(f" PASS optional_tasks: {result}")
def test_integration_docker_detect_branch(): def test_integration_branching_paths():
"""Branching: docker compose v2 check or v1 fallback.""" """Branching: one path or an alternative."""
seqs = [ seqs = [
['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'], ['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'],
['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'], ['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'],
@ -310,11 +307,11 @@ def test_integration_docker_detect_branch():
result = crx.infer(seqs) result = crx.infer(seqs)
assert result is not None assert result is not None
assert 'file' in result and 'template' in result and 'shell' in result assert 'file' in result and 'template' in result and 'shell' in result
print(f" PASS docker_detect: {result}") print(f" PASS branching_paths: {result}")
def test_integration_firewall_gating(): def test_integration_conditional_tasks():
"""Conditional firewall rule sequence (gated).""" """Tasks that sometimes appear, sometimes not."""
seqs = [ seqs = [
['assert', 'file', 'template', 'shell', 'wait_for'], ['assert', 'file', 'template', 'shell', 'wait_for'],
['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'], ['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'],
@ -324,7 +321,7 @@ def test_integration_firewall_gating():
result = crx.infer(seqs) result = crx.infer(seqs)
assert result is not None assert result is not None
assert 'assert' in result and 'file' in result assert 'assert' in result and 'file' in result
print(f" PASS firewall_gating: {result}") print(f" PASS conditional_tasks: {result}")
def test_integration_idregex_linear(): def test_integration_idregex_linear():
@ -361,8 +358,8 @@ def test_integration_ikoa_linear():
print(f" PASS ikoa_linear: {expr}") print(f" PASS ikoa_linear: {expr}")
def test_integration_backup_restic(): def test_integration_looping_tasks():
"""Sequence with loop (systemd enable).""" """Sequence with loop (repeated tasks)."""
seqs = [ seqs = [
['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'], ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'],
['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'], ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'],
@ -370,7 +367,7 @@ def test_integration_backup_restic():
crx = CRX() crx = CRX()
result = crx.infer(seqs) result = crx.infer(seqs)
assert result is not None assert result is not None
print(f" PASS backup_restic: {result}") print(f" PASS looping_tasks: {result}")
def run_all(): def run_all():