# grammar-inference-engine/examples/yaml_to_seq.py
#
# 81 lines
# 2.5 KiB
# Python

"""Convert YAML files to key-path sequences for BEX grammar inference."""
from pathlib import Path
import yaml
def yaml_to_keypath_sequence(data, prefix=""):
    """Convert parsed YAML data to a sequence of key paths (DFS traversal).

    Each leaf (scalar) emits its full dotted key path as one string symbol,
    in document order.  List items share a generic ``[]`` marker (no
    indices), and values are NOT included -- only key paths.  Empty
    mappings, empty lists and a bare top-level scalar emit nothing.

    Args:
        data: Parsed YAML value (dict, list or scalar).
        prefix: Key path of ``data`` inside the document; empty at the root.

    Returns:
        List of key-path strings.
    """
    seq = []
    if isinstance(data, dict):
        for key, value in data.items():
            # YAML keys may be ints, bools or None.  Always build the path as
            # a string so root-level symbols have a uniform type and a falsy
            # key such as 0 does not make its children drop the prefix.
            path = f"{prefix}.{key}" if prefix else str(key)
            if isinstance(value, (dict, list)):
                seq.extend(yaml_to_keypath_sequence(value, path))
            else:
                seq.append(path)
    elif isinstance(data, list):
        # Every item of a list shares the same marker, so compute it once.
        list_prefix = f"{prefix}[]" if prefix else "[]"
        for item in data:
            if isinstance(item, (dict, list)):
                seq.extend(yaml_to_keypath_sequence(item, list_prefix))
            else:
                seq.append(list_prefix)
    return seq
def yaml_file_to_sequence(filepath):
    """Load a YAML file and convert it to a key-path sequence.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        The key-path sequence produced by ``yaml_to_keypath_sequence`` for
        the parsed document, or an empty list when the document is empty.

    Errors raised by ``open`` or ``yaml.safe_load`` are not handled here
    and propagate to the caller.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding,
    # so the same file parses the same way on every machine.
    with open(filepath, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty document.
    if data is None:
        return []
    return yaml_to_keypath_sequence(data)
def is_vault_file(filepath):
    """Check if a file is an Ansible vault file (encrypted).

    Only the first 100 characters are inspected: the file counts as a vault
    when they contain ``$ANSIBLE_VAULT`` or when the file starts with
    ``!vault |``.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        True when a vault marker is found, False otherwise -- including
        when the file cannot be opened or read.
    """
    try:
        # Both markers are plain ASCII, so decode leniently: an undecodable
        # byte elsewhere in the file must not hide a vault header, and the
        # result must not depend on the platform's locale encoding.
        with open(filepath, encoding="utf-8", errors="replace") as f:
            first = f.read(100)
    except OSError:
        # Best-effort check: an unreadable file is simply "not a vault".
        return False
    return '$ANSIBLE_VAULT' in first or first.startswith('!vault |')
def collect_all_sequences(root_dir=".", include_vault=False, patterns=("*.yml",)):
    """Collect key-path sequences from the YAML files under ``root_dir``.

    Files are searched recursively and processed in sorted path order.
    Anything located below a ``node_modules``, ``.venv``, ``__pycache__``
    or ``.git`` directory (relative to ``root_dir``) is ignored.  Files
    that fail to load are reported on stdout and skipped, and files whose
    sequence is empty are left out of the result.

    Args:
        root_dir: Directory to search.
        include_vault: When False, skip files whose name contains ``vault``
            and files that ``is_vault_file`` recognises as encrypted.
        patterns: Glob pattern, or iterable of glob patterns, selecting the
            files to load.  The default matches ``*.yml`` only; pass
            ``("*.yml", "*.yaml")`` to pick up both extensions.

    Returns:
        list of (filepath, sequence) tuples.
    """
    skip_dirs = {'node_modules', '.venv', '__pycache__', '.git'}
    if isinstance(patterns, str):
        patterns = (patterns,)
    root = Path(root_dir)
    # A set removes duplicates when several patterns match the same file.
    candidates = {path for pattern in patterns for path in root.rglob(pattern)}

    results = []
    for path in sorted(candidates):
        # Look only at the part of the path below the root, so a root that
        # itself lives inside e.g. ``.venv`` does not exclude every file.
        if not skip_dirs.isdisjoint(path.relative_to(root).parts):
            continue
        # The cheap name test runs first; the file is opened only if needed.
        if not include_vault and ('vault' in path.name or is_vault_file(path)):
            continue
        try:
            seq = yaml_file_to_sequence(path)
        except Exception as e:
            # Best-effort collection: report the bad file and keep going.
            print(f" SKIP {path}: {e}")
            continue
        if seq:
            results.append((path, seq))
    return results
def sequences_to_crx(result_list):
    """Infer a CRX expression from collected key-path sequences.

    Args:
        result_list: Iterable of (filepath, sequence) tuples; only the
            sequences are used.

    Returns:
        ``'ε'`` when there are no sequences, otherwise whatever
        ``CRX().infer`` returns for the list of sequences.
    """
    # Imported lazily, inside the function, exactly as the module expects.
    from .crx import CRX

    symbol_lists = []
    for _source, symbols in result_list:
        symbol_lists.append(symbols)

    if symbol_lists:
        engine = CRX()
        return engine.infer(symbol_lists)
    return 'ε'