grammar-inference-engine/bex/ilocal.py
tobjend dc559a4aee
All checks were successful
ci/woodpecker/push/woodpecker Pipeline was successful
fix badge position; purge remaining German user-reference comments
2026-07-01 13:28:55 +02:00

166 lines
5 KiB
Python

"""
iLocal — context-based inference (Bex 2007).
Per Bex et al. 2007: "Inferring XML Schema Definitions from XML Data"
Extracts (context, sequence) pairs from YAML trees, where context is
the YAML key (container key).
Adapted for YAML:
- Context = YAML key whose value is a list (e.g. tasks, steps)
- Sequence = item keys within that list (e.g. apt, template, service)
Uses container keys directly instead of file paths (design decision:
no path overhead).
"""
import yaml
def extract_contexts_from_yaml(data, context_prefix=None):
    """
    Extract (context, sequence) pairs from parsed YAML.

    Every key whose value is a non-empty list becomes a context, and the
    list is turned into one sequence with one symbol per item:

    - dict item: its first key that is not ``name`` and does not start
      with ``_``; if there is no such key, ``"named:"`` followed by the
      first 20 characters of the ``name`` value (or of ``str(item)`` when
      the item has no ``name``).
    - any other item: ``str(item)``.

    Nested contexts are joined with dots (e.g. ``tasks.block``). The same
    context key seen several times collects one sequence per occurrence.

    Args:
        data: Parsed YAML data (dict or list). Any other type yields ``{}``.
        context_prefix: Optional prefix prepended, dot-separated, to every
            context key found in ``data``. Falsy values mean "no prefix".

    Returns:
        dict: {context_key: [sequence1, sequence2, ...]}
    """
    contexts = {}

    def symbol_for(item):
        """Return the sequence symbol (always a str) for one list item."""
        if not isinstance(item, dict):
            return str(item)
        # YAML permits non-string keys (ints, bools, null), so keys are
        # stringified before the prefix test and before being emitted.
        item_key = next(
            (str(k) for k in item if k != 'name' and not str(k).startswith('_')),
            None,
        )
        if item_key:
            return item_key
        # Only 'name' / underscore keys present: fall back to a label built
        # from the name. The value may be a non-string scalar (or None).
        named = item.get('name', str(item))
        return f"named:{str(named)[:20]}"

    def walk(node, prefix=None):
        """Recursively collect contexts below ``node``."""
        if isinstance(node, dict):
            for key, value in node.items():
                full_key = f"{prefix}.{key}" if prefix else str(key)
                if isinstance(value, list) and value:
                    contexts.setdefault(full_key, []).append(
                        [symbol_for(item) for item in value]
                    )
                    # Descend into the items to pick up nested contexts.
                    for item in value:
                        walk(item, full_key)
                elif isinstance(value, dict):
                    walk(value, full_key)
                # Empty lists and scalars contribute nothing.
        elif isinstance(node, list):
            # A bare list adds no context level of its own.
            for item in node:
                walk(item, prefix)

    walk(data, context_prefix)
    return contexts
def extract_contexts_from_yaml_string(yaml_string):
    """
    Parse a YAML string and extract its context sequences.

    Parsing is best-effort: text that fails to parse and documents that
    load as ``None`` (e.g. empty input) both produce an empty result
    instead of an exception.

    Args:
        yaml_string: YAML text.

    Returns:
        dict: {context_key: [sequence1, sequence2, ...]}
    """
    try:
        parsed = yaml.safe_load(yaml_string)
    except yaml.YAMLError:
        # Unparseable input is treated like an empty document.
        parsed = None
    return {} if parsed is None else extract_contexts_from_yaml(parsed)
def extract_contexts_from_file(filepath):
    """
    Extract context sequences from a YAML file.

    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's locale encoding.

    Args:
        filepath: Path to the YAML file.

    Returns:
        dict: {context_key: [sequence1, sequence2, ...]}

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, encoding="utf-8") as f:
        return extract_contexts_from_yaml_string(f.read())
def reduce_contexts(context_groups):
    """
    reduce — generalization step after Bex 2007 (algorithm "reduce").

    Contexts whose sequences share the same structural signature are
    considered equivalent and merged into one generalized context. The
    signature of a context is the sorted set of
    ``(length, first element, last element)`` over its sequences.

    Args:
        context_groups: dict of {context_key: [sequences]}

    Returns:
        dict: {generalized_context: [sequences]} where the key is the
        sorted member context keys joined by ``|`` and the value is the
        concatenation of the members' sequences.
    """
    if not context_groups:
        return {}

    def fingerprint(sequences):
        # An empty sequence contributes (0, "", "").
        shapes = {
            (len(seq), seq[0] if seq else "", seq[-1] if seq else "")
            for seq in sequences
        }
        return tuple(sorted(shapes))

    # Bucket the context keys by fingerprint, preserving first-seen order.
    by_fingerprint = {}
    for name, sequences in context_groups.items():
        by_fingerprint.setdefault(fingerprint(sequences), []).append(name)

    # Each bucket becomes one generalized context.
    reduced = {}
    for names in by_fingerprint.values():
        pooled = []
        for name in names:
            pooled += context_groups[name]
        reduced["|".join(sorted(names))] = pooled
    return reduced
def iLocal(yaml_documents):
    """
    iLocal — context inference after Bex 2007.

    Collects the (context, sequence) pairs of all documents and then
    generalizes them with ``reduce_contexts``.

    Each entry is classified by a simple heuristic: a ``str`` containing a
    line break (``\\n`` or ``\\r``) is parsed as YAML text; anything else
    (a single-line string, a ``pathlib.Path``, ...) is opened as a file
    path. Note that a single-line YAML string is therefore treated as a
    path.

    Args:
        yaml_documents: Iterable of YAML strings or file paths.

    Returns:
        dict: {generalized_context: [sequences]}

    Raises:
        OSError: If an entry treated as a file path cannot be opened.
    """
    all_contexts = {}
    for doc in yaml_documents:
        # Only str entries are probed for line breaks; `'\n' in doc` would
        # raise TypeError for path objects.
        is_yaml_text = isinstance(doc, str) and ('\n' in doc or '\r' in doc)
        if is_yaml_text:
            contexts = extract_contexts_from_yaml_string(doc)
        else:
            contexts = extract_contexts_from_file(doc)
        for ctx, seqs in contexts.items():
            all_contexts.setdefault(ctx, []).extend(seqs)
    return reduce_contexts(all_contexts)