- CRX: direct CHARE inference (Algorithm 7, TODS 2010) - iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010) - RWR₀: SORE repair (Algorithm 6, TODS 2010) - rwr²: k-ORE extraction (Algorithm 3, arXiv 2010) - SOA, k-OA, iKoa, 2T-INF, Baum-Welch - Ansible role grammar adapter - Generic YAML key-path converter - 28 tests, all passing
166 lines
5 KiB
Python
166 lines
5 KiB
Python
"""
iLocal — context-based inference (Bex 2007).

After Bex et al. 2007: "Inferring XML Schema Definitions from XML Data".
Extracts (context, sequence) pairs from YAML trees, where the context
is the YAML key (container key).

Adapted for YAML:
- Context = a YAML key whose value is a list (e.g. tasks, steps)
- Sequence = the item keys inside that list (e.g. apt, template, service)

Instead of using file paths (as in the XML setting), we work with the
container keys directly (user requirement: no file-path ballast).
"""
|
|
|
|
import os

import yaml
|
|
|
|
|
|
def extract_contexts_from_yaml(data, context_prefix=None):
    """Collect (context, sequence) pairs from parsed YAML data.

    A context is the dot-joined key path of a mapping entry whose value is a
    non-empty list. Its sequence contains one symbol per list item:

    - for a mapping item, the first key that is not ``name`` and does not
      start with ``_``;
    - for a mapping item without such a key, ``named:<first 20 chars>`` of
      its ``name`` value (or of the item's string form if it has no name);
    - for any other item, ``str(item)``.

    Keys and names are converted with ``str`` before inspection, so
    non-string YAML keys (e.g. integers) and non-string names are handled
    instead of raising.

    Args:
        data: Parsed YAML data (dict or list). Other values yield no contexts.
        context_prefix: Optional prefix that is prepended, dot-separated, to
            every context key found in ``data``.

    Returns:
        dict: {context_key: [sequence1, sequence2, ...]}
    """
    contexts = {}

    def item_symbol(item):
        # Map one list item to the symbol that represents it in a sequence.
        if not isinstance(item, dict):
            return str(item)
        item_key = next(
            (
                text
                for text in map(str, item)
                if text != 'name' and not text.startswith('_')
            ),
            None,
        )
        # An empty-string key counts as "no usable key", like a missing one.
        if item_key:
            return item_key
        named = str(item.get('name', item))
        return f"named:{named[:20]}"

    def walk(node, prefix):
        if isinstance(node, list):
            # Lists do not extend the key path; their items keep the prefix.
            for child in node:
                walk(child, prefix)
            return
        if not isinstance(node, dict):
            return
        for key, value in node.items():
            full_key = f"{prefix}.{key}" if prefix else str(key)
            if isinstance(value, dict):
                walk(value, full_key)
            elif isinstance(value, list) and value:
                # Record this list's sequence before descending, so outer
                # contexts are registered ahead of nested ones.
                contexts.setdefault(full_key, []).append(
                    [item_symbol(item) for item in value]
                )
                for item in value:
                    walk(item, full_key)

    walk(data, context_prefix)
    return contexts
|
|
|
|
|
|
def extract_contexts_from_yaml_string(yaml_string):
    """Extract context sequences from a YAML string.

    Parsing is best-effort: input that fails to parse, or that parses to
    nothing (``None``), produces an empty result instead of an exception.

    Args:
        yaml_string: YAML text to parse.

    Returns:
        dict: {context_key: [sequence1, sequence2, ...]}
    """
    try:
        parsed = yaml.safe_load(yaml_string)
    except yaml.YAMLError:
        # Unparseable input is treated like an empty document.
        parsed = None

    return {} if parsed is None else extract_contexts_from_yaml(parsed)
|
|
|
|
|
|
def extract_contexts_from_file(filepath):
    """Extract context sequences from a YAML file.

    Args:
        filepath: Path to the YAML file.

    Returns:
        dict: {context_key: [sequence1, sequence2, ...]}

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would mis-read non-ASCII YAML on some systems.
    with open(filepath, encoding="utf-8") as f:
        return extract_contexts_from_yaml_string(f.read())
|
|
|
|
|
|
def reduce_contexts(context_groups):
    """Merge contexts whose sequences share the same structural signature.

    Generalisation step after Bex 2007 (algorithm "reduce"). Each sequence is
    summarised as ``(length, first element, last element)``; a context's
    signature is the sorted set of these summaries. Contexts with identical
    signatures are merged into one entry whose key is the sorted context
    names joined by ``|`` and whose value is the concatenation of their
    sequences.

    Args:
        context_groups: dict of {context_key: [sequences]}

    Returns:
        dict: {generalized_context: [sequences]} (reduced)
    """
    if not context_groups:
        return {}

    def summarise(sequence):
        # Empty sequences get a placeholder for their first/last element.
        if sequence:
            return (len(sequence), sequence[0], sequence[-1])
        return (len(sequence), "∅", "∅")

    # signature -> names of the contexts that share it, in first-seen order
    buckets = {}
    for name, sequences in context_groups.items():
        fingerprint = tuple(sorted({summarise(seq) for seq in sequences}))
        buckets.setdefault(fingerprint, []).append(name)

    reduced = {}
    for members in buckets.values():
        label = "|".join(sorted(members))
        reduced[label] = [
            seq for member in members for seq in context_groups[member]
        ]
    return reduced
|
|
|
|
|
|
def iLocal(yaml_documents):
    """Run iLocal context inference (after Bex 2007) over several documents.

    Each document's contexts are extracted, sequences for the same context
    key are pooled across documents, and the pooled result is generalised
    with ``reduce_contexts``.

    Args:
        yaml_documents: Iterable of YAML strings or file paths. An
            ``os.PathLike`` object (e.g. ``pathlib.Path``) is always read as
            a file. A string is treated as YAML text if it contains a line
            break, and as a file path otherwise, so single-line YAML text
            must end with a newline.

    Returns:
        dict: {generalized_context: [sequences]}
    """
    all_contexts = {}
    for doc in yaml_documents:
        if isinstance(doc, os.PathLike):
            # Path objects do not support the substring test used below.
            contexts = extract_contexts_from_file(doc)
        elif '\n' in doc or '\r' in doc:
            contexts = extract_contexts_from_yaml_string(doc)
        else:
            contexts = extract_contexts_from_file(doc)

        for ctx, seqs in contexts.items():
            all_contexts.setdefault(ctx, []).extend(seqs)

    return reduce_contexts(all_contexts)
|