"""Convert YAML files to key-path sequences for BEX grammar inference.""" from pathlib import Path import yaml def yaml_to_keypath_sequence(data, prefix=""): """Convert parsed YAML data to a sequence of key paths (DFS traversal). Each leaf (scalar) emits its full key path as a symbol. Lists use a generic `[]` marker (no indices). Values are NOT included — only key paths. """ seq = [] if isinstance(data, dict): for key, value in data.items(): path = f"{prefix}.{key}" if prefix else key if isinstance(value, (dict, list)): seq.extend(yaml_to_keypath_sequence(value, path)) else: seq.append(path) elif isinstance(data, list): for item in data: list_prefix = f"{prefix}[]" if prefix else "[]" if isinstance(item, (dict, list)): seq.extend(yaml_to_keypath_sequence(item, list_prefix)) else: seq.append(list_prefix) return seq def yaml_file_to_sequence(filepath): """Load a YAML file and convert to a key-path sequence.""" with open(filepath) as f: data = yaml.safe_load(f) if data is None: return [] return yaml_to_keypath_sequence(data) def is_vault_file(filepath): """Check if a file is an Ansible vault file (encrypted).""" try: with open(filepath) as f: first = f.read(100) return '$ANSIBLE_VAULT' in first or first.startswith('!vault |') except Exception: return False def collect_all_sequences(root_dir=".", include_vault=False): """Collect key-path sequences from all YAML files. Returns: list of (filepath, sequence) tuples. """ results = [] for path in sorted(Path(root_dir).rglob("*.yml")): parts = path.parts if any(d in parts for d in ('node_modules', '.venv', '__pycache__', '.git')): continue skippable = ('vault.yml' in path.name or 'vault' in path.name) if not include_vault and (skippable or is_vault_file(path)): continue try: seq = yaml_file_to_sequence(path) if seq: results.append((path, seq)) except Exception as e: print(f" SKIP {path}: {e}") return results def sequences_to_crx(result_list): """Run CRX on collected sequences.""" from .crx import CRX sequences = [seq for _, seq in result_list] if not sequences: return 'ε' crx = CRX() return crx.infer(sequences)