From 7c00c6713d7b2e274c3f8fc73643298fc382ea59 Mon Sep 17 00:00:00 2001 From: tobjend Date: Wed, 1 Jul 2026 08:01:16 +0200 Subject: [PATCH] Initial commit: BEX-based grammar inference engine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CRX: direct CHARE inference (Algorithm 7, TODS 2010) - iDRegEx: k-ORE inference (Algorithm 4, arXiv 2010) - RWR₀: SORE repair (Algorithm 6, TODS 2010) - rwr²: k-ORE extraction (Algorithm 3, arXiv 2010) - SOA, k-OA, iKoa, 2T-INF, Baum-Welch - Ansible role grammar adapter - Generic YAML key-path converter - 28 tests, all passing --- .gitignore | 8 + AGENTS.md | 45 + README.md | 132 ++ bex/__init__.py | 26 + bex/__main__.py | 3 + bex/automaton.py | 130 ++ bex/baum_welch.py | 192 +++ bex/cli.py | 145 +++ bex/crx.py | 191 +++ bex/expr.py | 164 +++ bex/idregex.py | 202 +++ bex/ikoa.py | 139 ++ bex/ilocal.py | 166 +++ bex/koa.py | 105 ++ bex/kore.py | 432 +++++++ bex/marking.py | 46 + bex/mdl.py | 143 +++ bex/pta.py | 62 + bex/repair.py | 167 +++ bex/role_grammar.py | 111 ++ bex/rwr0.py | 224 ++++ bex/rwrsq.py | 31 + bex/shrink.py | 267 ++++ bex/soa.py | 193 +++ bex/template.py | 154 +++ bex/tokenizer.py | 194 +++ bex/twotinf.py | 35 + bex/yaml_to_seq.py | 81 ++ papers/paper_arxiv2010.txt | 2210 ++++++++++++++++++++++++++++++++ papers/paper_tods2010.txt | 2492 ++++++++++++++++++++++++++++++++++++ pyproject.toml | 13 + requirements.txt | 5 + tests/test_bex.py | 420 ++++++ 33 files changed, 8928 insertions(+) create mode 100644 .gitignore create mode 100644 AGENTS.md create mode 100644 README.md create mode 100644 bex/__init__.py create mode 100644 bex/__main__.py create mode 100644 bex/automaton.py create mode 100644 bex/baum_welch.py create mode 100644 bex/cli.py create mode 100644 bex/crx.py create mode 100644 bex/expr.py create mode 100644 bex/idregex.py create mode 100644 bex/ikoa.py create mode 100644 bex/ilocal.py create mode 100644 bex/koa.py create mode 100644 bex/kore.py create mode 100644 bex/marking.py create mode 100644 bex/mdl.py create mode 100644 bex/pta.py create mode 100644 bex/repair.py create mode 100644 bex/role_grammar.py create mode 100644 bex/rwr0.py create mode 100644 bex/rwrsq.py create mode 100644 bex/shrink.py create mode 100644 bex/soa.py create mode 100644 bex/template.py create mode 100644 bex/tokenizer.py create mode 100644 bex/twotinf.py create mode 100644 bex/yaml_to_seq.py create mode 100644 papers/paper_arxiv2010.txt create mode 100644 papers/paper_tods2010.txt create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 tests/test_bex.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c2f4095 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.env +.venv +venv/ +*.egg-info/ +dist/ +build/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..c19c1be --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,45 @@ +# Grammar Inference Engine — Agent Guide + +## Overview +This repo implements the BEX family of algorithms for inferring regular expression grammars +from example sequences. Use it whenever you need to discover the pattern behind a set of +strings or structured sequences. + +## Quick Start for Agents + +```python +# Fast pattern inference +from bex.crx import CRX +g = CRX().infer([['a','b','c'], ['a','b'], ['a','c']]) # a.(b+c)? + +# Probabilistic k-ORE inference (handles noise better) +from bex.idregex import idregex +g = idregex([['a','b','c'], ['a','b'], ['a','c']], kmax=2, N=3) +``` + +## Use Cases +1. **Ansible role patterns** — extract module sequences from tasks/main.yml, learn per-category grammars +2. **Log analysis** — find common patterns in event sequences +3. **API call patterns** — learn the typical order of API operations +4. **Configuration structure** — discover the schema behind YAML files +5. **Workflow mining** — extract the typical task flow from process logs + +## Architecture + +Two inference pipelines: + +| Pipeline | When to use | +|----------|-------------| +| CRX (fast) | Many examples, need speed, CHAREs output | +| iDRegEx (robust) | Few/noisy examples, need probabilistic handling | + +## Running Tests +```bash +python tests/test_bex.py +``` + +## MCP Roadmap +- [ ] Standalone MCP server wrapping CRX + iDRegEx +- [ ] Tool: `infer_grammar(sequences, method="crx")` +- [ ] Tool: `ansible_role_grammar(roles_dir)` +- [ ] Tool: `yaml_to_sequences(yaml_path)` diff --git a/README.md b/README.md new file mode 100644 index 0000000..27583b8 --- /dev/null +++ b/README.md @@ -0,0 +1,132 @@ +# Grammar Inference Engine + +Infer **regular expression grammars** from example sequences using the BEX family of algorithms. Given a set of example sequences (strings over some alphabet), the engine learns a compact regular expression that describes the general pattern. + +## Quick Start + +```bash +pip install pyyaml +python -m bex +``` + +```python +from bex.crx import CRX + +seqs = [ + ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'], + ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell'], +] +crx = CRX() +grammar = crx.infer(seqs) +print(grammar) +# file.template.docker_image.command.set_fact.shell.(wait_for)? +``` + +## Algorithms + +| Algorithm | What it learns | Paper | Use case | +|-----------|---------------|-------|----------| +| **CRX** | CHAREs (single-pass, deterministic) | TODS 2010 §6 | Fast inference from many sequences | +| **iDRegEx** | k-OREs (probabilistic, Baum-Welch) | arXiv 2010 | Handles noise, learns from few examples | +| **RWR₀** | SOREs (iterative repair) | TODS 2010 §5.2 | Builds regex from a single automaton | +| **rwr²** | k-ORE from k-OA | arXiv 2010 | Post-processing for k-ORE extraction | + +### Pipeline 1: Direct CHARE Inference (fast) + +``` +Example sequences → CRX → CHAREs grammar +``` + +### Pipeline 2: Probabilistic k-ORE Inference (robust) + +``` +Example sequences → Complete k-OA → Baum-Welch (EM) + → Disambiguate → Prune → rwr² → k-ORE grammar +``` + +## Architecture + +``` +bex/ +├── crx.py # CRX: direct CHARE inference (Algorithm 7, TODS) +├── idregex.py # iDRegEx: k-ORE inference (Algorithm 4, arXiv) +├── rwr0.py # RWR₀: SORE repair (Algorithm 6, TODS) +├── rwrsq.py # rwr²: k-ORE extraction (Algorithm 3, arXiv) +├── soa.py # SOA: Symbolic Observation Automaton core +├── koa.py # k-OA: k-testable Observation Automaton +├── ikoa.py # iKoa: k-OA inference (Algorithm 1, arXiv) +├── twotinf.py # 2T-INF: 2-testable inference (Algorithm 1, TODS) +├── baum_welch.py # Baum-Welch EM training for k-OA +├── expr.py # Expression utilities (concat, disj, star, strip) +├── marking.py # State marking for determinism +├── yaml_to_seq.py # Generic YAML → key-path sequence converter +├── role_grammar.py # Ansible role → module-sequence extractor +└── ... +``` + +## Domain: Ansible Role Grammar + +The engine includes a domain adapter for Ansible roles. It extracts module names from `tasks/main.yml` files and learns per-category grammars: + +```bash +python -c " +from bex.role_grammar import collect_all_role_sequences, learn_grammar +all_roles, by_category = collect_all_role_sequences('path/to/roles') +for cat, items in sorted(by_category.items()): + seqs = [s for _, s in items] + print(f'{cat}: {learn_grammar(seqs)}') +" +``` + +### Example Output + +``` +── restore (2 roles) ── + Grammar: file.copy.unarchive+.command + +── validate (5 roles) ── + Grammar: hosts?.shell?.(copy+debug+fail+set_fact+uri)+? + +── configure (4 roles) ── + Grammar: (assert+debug+set_fact+uri)+?.include_role? +``` + +**Grammar notation:** +- `a.b` — `a` followed by `b` (concatenation) +- `(a+b)` — either `a` or `b` (disjunction) +- `r?` — zero or one (optional) +- `r+` — one or more (iteration) +- `r+?` — zero or more (varies across examples) + +## Domain: Generic YAML + +The engine can convert any YAML file into key-path sequences for grammar inference: + +```python +from bex.yaml_to_seq import yaml_file_to_sequence, sequences_to_crx + +grammar = sequences_to_crx(yaml_file_to_sequence('config.yml')) +``` + +## Papers + +- **Bex et al.** *"Inferring Deterministic Regular Expressions from Positive Data"* — TODS 2010 +- **Bex et al.** *"Inferring k-optimal REs from Positive Data"* — arXiv:1004.2372 + +See `papers/` for extracted text and the original references. + +## Tests + +```bash +python -m pytest tests/ +# or +python tests/test_bex.py +``` + +## MCP Server + +A Model Context Protocol server for grammar inference is planned. See `AGENTS.md` for the roadmap. + +## License + +MIT diff --git a/bex/__init__.py b/bex/__init__.py new file mode 100644 index 0000000..9d21478 --- /dev/null +++ b/bex/__init__.py @@ -0,0 +1,26 @@ +""" +bex — Paper-faithful implementation of BEX inference algorithms. + +Papers: + - Bex et al. 2010 (TODS): Inference of Concise Regular Expressions and DTDs + - Bex et al. 2010 (arXiv 1004.2372): Learning Deterministic Regular Expressions + +Algorithms implemented: + TODS 2010: 2T-INF, REWRITE, RWR, RWR², RWR₀, CRX + arXiv 2010: iKoa, Disambiguate, rwr², iDRegEx +""" + +from .soa import SOA +from .twotinf import build_soa +from .rwr0 import rwr0 +from .crx import CRX +from .ikoa import ikoa +from .rwrsq import rwr_sq +from .idregex import idregex +from .koa import KOA, build_complete_koa +from .expr import concat, disj, star, optional, alphabet, strip_k +from .marking import mark_koa +from .tokenizer import YAMLTokenizer +from .template import generate_template + +__version__ = "0.2.0" diff --git a/bex/__main__.py b/bex/__main__.py new file mode 100644 index 0000000..4e28416 --- /dev/null +++ b/bex/__main__.py @@ -0,0 +1,3 @@ +from .cli import main + +main() diff --git a/bex/automaton.py b/bex/automaton.py new file mode 100644 index 0000000..e18b4e6 --- /dev/null +++ b/bex/automaton.py @@ -0,0 +1,130 @@ +""" +Automaton — Graph representation for BEX algorithms. + +Ein Automaton ist ein gerichteter Graph mit beschrifteten Kanten (Labels = Token). +Dient als Basis für: + - Prefix-Tree Automaton (aus Beispielsequenzen) + - SORE/CHARE Transformation via shrink-Rewrite-Regeln + - Determinism-Check und repair + +Die Implementierung folgt der Struktur aus Bex et al. 2010 (TWEB): + - Nodes: Menge der Zustände + - Edges: Liste von (from, to, label, prob) — prob optional für HMM + - start: Startzustand + - accepts: Menge akzeptierender Zustände +""" + + +class Automaton: + def __init__(self, start=None): + self.nodes = set() + self.edges = [] + self.start = start + self.accepts = set() + + def add_node(self, node): + self.nodes.add(node) + + def add_edge(self, u, v, label, prob=None): + self.edges.append({ + 'from': u, + 'to': v, + 'label': label, + 'prob': prob, + }) + self.add_node(u) + self.add_node(v) + + def remove_edge(self, u, v, label): + self.edges = [ + e for e in self.edges + if not (e['from'] == u and e['to'] == v and e['label'] == label) + ] + + def remove_all_edges_between(self, u, v): + self.edges = [ + e for e in self.edges + if not (e['from'] == u and e['to'] == v) + ] + + def set_start(self, node): + self.start = node + self.add_node(node) + + def add_accept(self, node): + self.accepts.add(node) + self.add_node(node) + + def outgoing(self, node): + return [e for e in self.edges if e['from'] == node] + + def incoming(self, node): + return [e for e in self.edges if e['to'] == node] + + def successors(self, node): + return {(e['to'], e['label']) for e in self.outgoing(node)} + + def has_edge(self, u, v, label): + return any( + e['from'] == u and e['to'] == v and e['label'] == label + for e in self.edges + ) + + def has_self_loop(self, node): + return any(e['from'] == node and e['to'] == node for e in self.edges) + + def labels_on_edge(self, u, v): + return [e['label'] for e in self.edges if e['from'] == u and e['to'] == v] + + def is_deterministic(self): + """Prüft ob der Automat deterministisch ist (keine zwei Kanten mit gleichem Label von einem Zustand).""" + for node in self.nodes: + seen = set() + for e in self.outgoing(node): + if e['label'] in seen: + return False + seen.add(e['label']) + return True + + def merge_nodes(self, target, source): + """Vereinigt source in target: Alle Kanten von/zu source werden auf target umgeleitet.""" + new_edges = [] + for e in self.edges: + if e['from'] == source and e['to'] == source: + new_edges.append({'from': target, 'to': target, 'label': e['label']}) + elif e['from'] == source: + new_edges.append({'from': target, 'to': e['to'], 'label': e['label']}) + elif e['to'] == source: + new_edges.append({'from': e['from'], 'to': target, 'label': e['label']}) + else: + new_edges.append(e) + self.edges = new_edges + if source in self.accepts: + self.accepts.add(target) + if source in self.accepts: + self.accepts.discard(source) + if source in self.nodes: + self.nodes.discard(source) + + def copy(self): + import copy + return copy.deepcopy(self) + + def __repr__(self): + return (f"Automaton(nodes={len(self.nodes)}, edges={len(self.edges)}, " + f"start={self.start}, accepts={self.accepts})") + + def to_dot(self): + lines = ["digraph Automaton {"] + lines.append(" rankdir=LR;") + lines.append(f' start [shape=point];') + lines.append(f' start -> {self.start};') + for n in self.nodes: + shape = "doublecircle" if n in self.accepts else "circle" + lines.append(f' {n} [shape={shape}];') + for e in self.edges: + label = e['label'].replace('"', '\\"') + prob = f" [{e['prob']:.2f}]" if e['prob'] is not None else "" + lines.append(f' {e["from"]} -> {e["to"]} [label="{label}{prob}"];') + lines.append("}") + return '\n'.join(lines) diff --git a/bex/baum_welch.py b/bex/baum_welch.py new file mode 100644 index 0000000..22cc400 --- /dev/null +++ b/bex/baum_welch.py @@ -0,0 +1,192 @@ +"""Baum-Welch for POMM on k-OA — standard forward-backward (Rabiner 1989).""" + +import random +import math + + +def init_probabilities(G, sequences): + """Initialize α per iKoa init (Algorithm 1, line 1). + + — α(src, sink) = fraction of empty words in S + — α(src, s) = fraction of words starting with lab(s), split equally + among all k copies of that symbol + — α(s, t) for s ≠ src: chosen randomly, normalized to sum to 1 + """ + total = len(sequences) + if total == 0: + total = 1 + empty_count = sum(1 for s in sequences if not s) + + start_counts = {} + for seq in sequences: + if seq: + start_counts[seq[0]] = start_counts.get(seq[0], 0) + 1 + + prob = {} + for s in G._succ: + if s == G.sink: + continue + succ = list(G._succ[s]) + if not succ: + prob[s] = {} + continue + vals = [] + for t in succ: + if s == G.src: + if t == G.sink: + v = empty_count / total + else: + lab = G.label(t) + base = lab.rsplit('_', 1)[0] if '_' in lab else lab + count = start_counts.get(base, 0) + copies = sum(1 for u in succ if G.label(u) == lab) + v = (count / total) / max(copies, 1) + vals.append(v) + else: + vals.append(random.random()) + s_total = sum(vals) + if s_total == 0: + vals = [1.0 / len(vals)] * len(vals) + else: + vals = [v / s_total for v in vals] + prob[s] = {t: v for t, v in zip(succ, vals)} + + for s in prob: + for t in prob[s]: + if prob[s][t] < 1e-10: + prob[s][t] = 0.0 + + return prob + + +def bw_iteration(prob, sequences, node_to_idx, n_states, all_nodes, G): + """Single Baum-Welch iteration over all sequences.""" + total_num = {} + total_denom = {} + + for seq in sequences: + if not seq: + continue + T = len(seq) + obs = seq + + # which states can emit each observation? (keyed by base symbol) + emit = {} + for n in all_nodes: + lab = G.label(n) + if lab: + base = lab.rsplit('_', 1)[0] if '_' in lab else lab + emit.setdefault(base, []).append(n) + # sink emits nothing + sink = G.sink + + # Forward pass + alpha = [{} for _ in range(T + 1)] + alpha[0][G.src] = 1.0 + + for t in range(T): + sym = obs[t] + possible = emit.get(sym, []) + for j in possible: + total = 0.0 + for i in alpha[t]: + p_trans = prob.get(i, {}).get(j, 0.0) + if p_trans > 0: + total += alpha[t][i] * p_trans + if total > 0: + alpha[t + 1][j] = total + + # P(O | λ) + po = 0.0 + for i in alpha[T]: + po += alpha[T][i] * prob.get(i, {}).get(sink, 0.0) + if po == 0: + continue + + # Backward pass + beta = [{} for _ in range(T + 1)] + for i in all_nodes: + if prob.get(i, {}).get(sink, 0.0) > 0: + beta[T][i] = prob[i][sink] + + for t in range(T - 1, -1, -1): + sym = obs[t] if t < T else None + possible = emit.get(sym, []) if sym else [] + for i in alpha[t]: + total = 0.0 + for j in possible: + p_trans = prob.get(i, {}).get(j, 0.0) + if p_trans > 0 and j in beta[t + 1]: + total += p_trans * beta[t + 1][j] + if total > 0: + beta[t][i] = total + + # Accumulate ξ and γ + for t in range(T): + sym_nxt = obs[t] + possible = emit.get(sym_nxt, []) + for i in alpha[t]: + if i not in beta[t] or beta[t][i] == 0: + continue + for j in possible: + p_trans = prob.get(i, {}).get(j, 0.0) + if p_trans == 0 or j not in beta[t + 1] or beta[t + 1][j] == 0: + continue + xi = alpha[t][i] * p_trans * beta[t + 1][j] / po + if xi > 1e-15: + key = (i, j) + total_num[key] = total_num.get(key, 0.0) + xi + total_denom[i] = total_denom.get(i, 0.0) + xi + + # M-step: update probabilities + for s in prob: + for t in prob[s]: + key = (s, t) + d = total_denom.get(s, 0.0) + if d > 1e-15 and key in total_num: + prob[s][t] = total_num[key] / d + else: + prob[s][t] = 0.0 + + # Renormalize + for s in prob: + row_sum = sum(prob[s].values()) + if row_sum > 1e-10: + for t in prob[s]: + prob[s][t] /= row_sum + else: + n_succ = len(prob[s]) + for t in prob[s]: + prob[s][t] = 1.0 / n_succ + + return prob + + +def baum_welch(G, prob, sequences, iterations=10): + """Baum-Welch EM training. + + Args: + G: k-OA graph + prob: dict[s][t] = transition probabilities + sequences: list of token lists (bag, not set) + iterations: number of EM iterations (full convergence) + + Returns: + Updated prob dict + """ + all_nodes = list(G._succ.keys()) + node_to_idx = {n: i for i, n in enumerate(all_nodes)} + n_states = len(all_nodes) + + for _ in range(iterations): + prob = bw_iteration(prob, sequences, node_to_idx, n_states, all_nodes, G) + + return prob + + +def baum_welch_fixed(G, prob, sequences, iterations=2): + """Baum-Welch with fixed small iteration count (for Disambiguate). + + ℓ = 2 for |Σ| ≤ 7, ℓ = 3 for |Σ| > 7. + """ + return baum_welch(G, prob, sequences, iterations) diff --git a/bex/cli.py b/bex/cli.py new file mode 100644 index 0000000..f69d530 --- /dev/null +++ b/bex/cli.py @@ -0,0 +1,145 @@ +""" +CLI — Command-Line Interface for bex YAML Grammar Inference. + +Usage: + python -m bex --dir roles/ --k-max 5 + python -m bex --dir playbooks/ --context tasks + python -m bex --dir roles/ --output template.yaml +""" + +import argparse +import os +import sys +import glob + +from .tokenizer import YAMLTokenizer +from .kore import kOREInference +from .template import generate_template +from .ilocal import iLocal, extract_contexts_from_file, reduce_contexts + + +def find_yaml_files(directory): + """Findet alle YAML-Dateien in einem Verzeichnis (rekursiv).""" + patterns = ['**/*.yml', '**/*.yaml'] + files = [] + for pattern in patterns: + files.extend(glob.glob(os.path.join(directory, pattern), recursive=True)) + return sorted(files) + + +def main(): + parser = argparse.ArgumentParser( + description='bex — BEX-based YAML Grammar Inference', + ) + parser.add_argument('--dir', type=str, default='roles/', + help='Verzeichnis mit YAML-Dateien (default: roles/)') + parser.add_argument('--k-max', type=int, default=5, + help='Max k für k-ORE-Inferenz (default: 5)') + parser.add_argument('--context', type=str, default=None, + help='Auf spezifischen Container-Key beschränken (z.B. tasks)') + parser.add_argument('--output', type=str, default=None, + help='Output-Datei für Template (default: stdout)') + parser.add_argument('--ilocal', action='store_true', + help='iLocal-Kontextanalyse durchführen') + parser.add_argument('--crx', action='store_true', + help='CRX (direct CHARE inference) verwenden') + parser.add_argument('--verbose', '-v', action='store_true', + help='Ausführliche Ausgabe') + parser.add_argument('--stats', action='store_true', + help='Zeige Token-Statistiken') + + args = parser.parse_args() + + if not os.path.isdir(args.dir): + print(f"Fehler: Verzeichnis '{args.dir}' nicht gefunden.", file=sys.stderr) + sys.exit(1) + + yaml_files = find_yaml_files(args.dir) + if not yaml_files: + print(f"Keine YAML-Dateien in '{args.dir}' gefunden.", file=sys.stderr) + sys.exit(1) + + print(f"Gefundene YAML-Dateien: {len(yaml_files)}", file=sys.stderr) + + if args.ilocal: + print("\n=== iLocal: Kontext-Extraktion ===", file=sys.stderr) + all_contexts = {} + for f in yaml_files: + contexts = extract_contexts_from_file(f) + for ctx, seqs in contexts.items(): + if ctx not in all_contexts: + all_contexts[ctx] = [] + all_contexts[ctx].extend(seqs) + + reduced = reduce_contexts(all_contexts) + print(f" Kontexte gefunden: {len(reduced)}", file=sys.stderr) + for ctx, seqs in sorted(reduced.items()): + lengths = [len(s) for s in seqs] + print(f" {ctx}: {len(seqs)} Sequenzen, " + f"Längen {min(lengths)}-{max(lengths)}, " + f"unique_seqs={len(set(tuple(s) for s in seqs))}", + file=sys.stderr) + + print("\n=== Tokenisierung ===", file=sys.stderr) + tokenizer = YAMLTokenizer(resolve_includes=False) + all_sequences = [] + container_sequences = {} + + for f in yaml_files: + try: + seq = tokenizer.tokenize_file(f) + if seq: + all_sequences.append(seq) + if args.verbose: + print(f" {os.path.relpath(f)}: {seq}", file=sys.stderr) + except Exception as e: + if args.verbose: + print(f" Fehler in {f}: {e}", file=sys.stderr) + + if not all_sequences: + print("Keine Sequenzen extrahiert.", file=sys.stderr) + sys.exit(1) + + print(f" Sequenzen extrahiert: {len(all_sequences)}", file=sys.stderr) + lengths = [len(s) for s in all_sequences] + print(f" Längen: min={min(lengths)}, max={max(lengths)}, " + f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr) + + if args.stats: + stats = tokenizer.get_statistics() + print("\n=== Token-Statistiken ===", file=sys.stderr) + for token, count in list(stats.items())[:30]: + print(f" {token}: {count}", file=sys.stderr) + + print("\n=== k-ORE Inferenz ===", file=sys.stderr) + kore = kOREInference(k_max=args.k_max) + + if args.crx: + result = kore.infer_with_crx(all_sequences) + _, expr, method = result + print(f" Methode: {method}", file=sys.stderr) + else: + result = kore.infer(all_sequences) + if result: + _, expr, k = result + print(f" Bestes k: {k}", file=sys.stderr) + else: + expr = "∅" + print(" Kein Ergebnis", file=sys.stderr) + + print(f" Inferierter Ausdruck: {expr}", file=sys.stderr) + + print("\n=== One-Shot Template ===", file=sys.stderr) + print(file=sys.stderr) + template = generate_template(expr, context_key=args.context) + + if args.output: + with open(args.output, 'w') as f: + f.write(template) + print(f"Template geschrieben nach: {args.output}", file=sys.stderr) + else: + print(template) + + +if __name__ == '__main__': + main() diff --git a/bex/crx.py b/bex/crx.py new file mode 100644 index 0000000..51692ab --- /dev/null +++ b/bex/crx.py @@ -0,0 +1,191 @@ +"""CRX — Direct CHARE inference (Algorithm 7, TODS 2010).""" + +from collections import defaultdict +from .expr import concat + + +class CRX: + """ + |———— Algorithm 7: CRX ————| + Input: sample S (list of token lists) + Output: CHARE r such that S ⊆ L(r) + """ + + def infer(self, sequences): + S = [list(s) for s in sequences if s] + if not S: + return 'ε' + + sigma = set() + for w in S: + for a in w: + sigma.add(a) + if not sigma: + return 'ε' + + # Step 1: Compute ImmedPred and equivalence classes ≈_S + immed = set() + for w in S: + for i in range(len(w) - 1): + immed.add((w[i], w[i + 1])) + + # Reachability: →_S (reflexive, transitive closure) + closure = self._transitive_closure(sigma, immed) + + # Equivalence: a ≈_S b iff a →*_S b and b →*_S a + eq = self._equivalence(sigma, closure) + + # Build class map: symbol → class index + sym_to_cls = {} + classes = [] + for cls_syms in eq: + idx = len(classes) + for sym in cls_syms: + sym_to_cls[sym] = idx + classes.append(set(cls_syms)) + + # Step 2-3: Preserve only singleton nodes? No, the algorithm says merge singletons + # that share Pred/Succ in the Hasse diagram. But actually, looking at the algorithm + # more carefully: + # + # "while a maximal set of singleton nodes γ₁,...,γ_ℓ such that + # Pred_HS(γ₁)=···=Pred_HS(γ_ℓ) and Succ_HS(γ₁)=···=Succ_HS(γ_ℓ) exists do + # Replace γ₁,...,γ_ℓ by γ := ∪ⱼ γⱼ" + # + # This merges singleton equivalence classes (classes with exactly one symbol) + # that have the same Pred and Succ sets in the Hasse diagram. + + changed = True + while changed: + changed = False + singleton_ids = [i for i, c in enumerate(classes) if len(c) == 1] + + # Compute Pred and Succ for each singleton (considering ALL symbols in each class) + hs_pred = {} + hs_succ = {} + for i in singleton_ids: + hs_pred[i] = set() + hs_succ[i] = set() + sym_i = next(iter(classes[i])) + for j, c in enumerate(classes): + if i == j: + continue + if any((sym_j, sym_i) in immed for sym_j in c): + hs_pred[i].add(j) + if any((sym_i, sym_j) in immed for sym_j in c): + hs_succ[i].add(j) + + # Group by same (Pred, Succ) + groups = defaultdict(list) + for i in singleton_ids: + groups[(frozenset(hs_pred[i]), frozenset(hs_succ[i]))].append(i) + + for (pred_set, succ_set), group in groups.items(): + if len(group) >= 2: + merged = set() + for i in group: + merged.update(classes[i]) + new_id = len(classes) + classes.append(merged) + for i in sorted(group, reverse=True): + classes.pop(i) + changed = True + break + + # After merging, rebuild sym_to_cls to map to new class indices + sym_to_cls = {} + for idx, cls in enumerate(classes): + for sym in cls: + sym_to_cls[sym] = idx + + # Step 5: Topological sort of the Hasse diagram + adj = {i: set() for i in range(len(classes))} + indeg = {i: 0 for i in range(len(classes))} + for a, b in immed: + ca, cb = sym_to_cls.get(a), sym_to_cls.get(b) + if ca is not None and cb is not None and ca != cb: + if cb not in adj[ca]: + adj[ca].add(cb) + indeg[cb] += 1 + + # Topological sort (Kahn's algorithm) + order = [] + q = [i for i in range(len(classes)) if indeg[i] == 0] + while q: + i = q.pop(0) + order.append(i) + for j in adj[i]: + indeg[j] -= 1 + if indeg[j] == 0: + q.append(j) + remaining = set(range(len(classes))) - set(order) + order.extend(remaining) + + # Step 6-16: Assign chain factors (Algorithm 7 lines 7-14) + def count_in_class(w, syms): + return sum(1 for a in w if a in syms) + + parts = [] + for i in order: + syms = classes[i] + counts = [count_in_class(w, syms) for w in S] + + all_exactly_one = all(c == 1 for c in counts) + all_at_most_one = all(c <= 1 for c in counts) + all_at_least_one = all(c >= 1 for c in counts) + some_two_or_more = any(c >= 2 for c in counts) + + sym_list = sorted(syms) + factor = '+'.join(sym_list) + if len(sym_list) > 1: + factor = '(' + factor + ')' + + if all_exactly_one: + pass # (a₁+···+aₙ) + elif all_at_most_one: + factor += '?' # (a₁+···+aₙ)? + elif all_at_least_one and some_two_or_more: + factor += '+' # (a₁+···+aₙ)+ + else: + factor += '+?' # (a₁+···+aₙ)+? + + parts.append(factor) + + if not parts: + return 'ε' + return '.'.join(parts) + + def _transitive_closure(self, sigma, immed): + """Compute reflexive, transitive closure of immed relation.""" + closure = {(a, b) for (a, b) in immed} + for a in sigma: + closure.add((a, a)) + changed = True + while changed: + changed = False + for a in sigma: + for b in sigma: + for c in sigma: + if (a, b) in closure and (b, c) in closure and (a, c) not in closure: + closure.add((a, c)) + changed = True + return closure + + def _equivalence(self, sigma, closure): + """Compute equivalence classes of ≈_S.""" + remaining = set(sigma) + classes = [] + while remaining: + a = remaining.pop() + cls = {a} + added = True + while added: + added = False + for b in list(remaining): + if (a, b) in closure and (b, a) in closure: + if b not in cls: + cls.add(b) + remaining.discard(b) + added = True + classes.append(cls) + return classes diff --git a/bex/expr.py b/bex/expr.py new file mode 100644 index 0000000..474b488 --- /dev/null +++ b/bex/expr.py @@ -0,0 +1,164 @@ +"""Expression utilities for SOREs and k-OREs.""" + +import re + + +def sym(s): + """Create a simple symbol expression.""" + return s + + +def concat(*parts): + """Create concatenation expression.""" + parts = [p for p in parts if p and p != 'ε'] + if not parts: + return 'ε' + if len(parts) == 1: + return parts[0] + return '.'.join(parts) + + +def disj(*parts): + """Create disjunction expression.""" + parts = [p for p in parts if p and p != '∅'] + if not parts: + return '∅' + if len(parts) == 1: + return parts[0] + return '(' + '|'.join(parts) + ')' + + +def star(expr): + """Create iteration expression (one or more, r+).""" + if not expr or expr in ('∅', 'ε'): + return expr + if len(expr) == 1 or (expr.startswith('(') and expr.endswith(')')): + return expr + '+' + return '(' + expr + ')+' + + +def optional(expr): + """Create optional expression (r?).""" + if not expr or expr in ('∅', 'ε'): + return 'ε' + if len(expr) == 1 or (expr.startswith('(') and expr.endswith(')')): + return expr + '?' + return '(' + expr + ')?' + + +def alphabet(expr): + """Return set of alphabet symbols in expression.""" + cleaned = re.sub(r'[+?*().|]', ' ', expr) + result = set() + for token in cleaned.split(): + token = token.strip('_0123456789') + if token and token not in ('ε', '∅'): + result.add(token) + return result + + +def strip_k(s): + """Remove k-ORE markers: a_1 → a, b^(2) → b.""" + result = re.sub(r'_\d+', '', s) + result = re.sub(r'\^\(\d+\)', '', result) + result = re.sub(r'^\(|\)$', '', result) + return result + + +def has_repeats(expr, symbol): + """Check if a symbol appears more than once in expression.""" + return expr.count(symbol) > 1 + + +def lang_size_at_most(expr, n, alphabet_symbols=None): + """Compute |L(r)<=n| — number of words of length ≤ n in L(r).""" + if alphabet_symbols is None: + alphabet_symbols = alphabet(expr) + if not alphabet_symbols: + return 1 if 'ε' in expr else 0 + size = 0 + for length in range(n + 1): + size += _count_words(expr, length, alphabet_symbols) + return size + + +def _count_words(expr, length, alphabet_symbols): + if length < 0: + return 0 + if not expr or expr == '∅': + return 0 + if expr == 'ε': + return 1 if length == 0 else 0 + if expr in alphabet_symbols: + return 1 if length == 1 else 0 + if '+' in expr: + inner = expr.rstrip('+') + if inner.endswith('?'): + inner = inner[:-1] + return _count_star_words(inner, length, alphabet_symbols, 1) + if expr.endswith('?'): + inner = expr[:-1] + return _count_words(inner, length, alphabet_symbols) + (1 if length == 0 else 0) + if expr.startswith('(') and '|' in expr: + inner = expr[1:-1] + parts = _split_disjunction(inner) + return sum(_count_words(p, length, alphabet_symbols) for p in parts) + if '.' in expr: + parts = expr.split('.') + return _count_concat_words(parts, length, alphabet_symbols, 0) + if ')' in expr or '(' in expr: + return 0 + return 0 + + +def _count_concat_words(parts, length, alphabet_symbols, idx): + if idx >= len(parts): + return 1 if length == 0 else 0 + total = 0 + for take in range(length + 1): + cnt = _count_words(parts[idx], take, alphabet_symbols) + if cnt > 0: + rest = _count_concat_words(parts, length - take, alphabet_symbols, idx + 1) + total += cnt * rest + return total + + +def _count_star_words(inner, length, alphabet_symbols, min_count): + total = 0 + for repeat in range(min_count, length + 1): + if repeat == 0: + continue + total += _count_repeat_words(inner, repeat, length, alphabet_symbols) + return total + + +def _count_repeat_words(inner, repeat, length, alphabet_symbols): + if repeat == 0: + return 1 if length == 0 else 0 + total = 0 + for take in range(length + 1): + cnt = _count_words(inner, take, alphabet_symbols) + if cnt > 0: + rest = _count_repeat_words(inner, repeat - 1, length - take, alphabet_symbols) + total += cnt * rest + return total + + +def _split_disjunction(s): + depth = 0 + parts = [] + current = [] + for ch in s: + if ch == '(': + depth += 1 + current.append(ch) + elif ch == ')': + depth -= 1 + current.append(ch) + elif ch == '|' and depth == 0: + parts.append(''.join(current)) + current = [] + else: + current.append(ch) + parts.append(''.join(current)) + return parts diff --git a/bex/idregex.py b/bex/idregex.py new file mode 100644 index 0000000..814c82b --- /dev/null +++ b/bex/idregex.py @@ -0,0 +1,202 @@ +"""iDRegEx — Algorithm 4 (arXiv 1004.2372).""" + +from .ikoa import ikoa +from .rwrsq import rwr_sq +from .expr import alphabet + + +def is_deterministic(expr): + """Check if a k-ORE is deterministic (Glushkov determinism). + + A k-ORE is deterministic iff for every subexpression (r|s), + first(r) ∩ first(s) = ∅. + """ + if not expr or expr == '∅' or expr == 'ε': + return True + return _check_det(expr) + + +def _check_det(expr): + """Recursive determinism check.""" + depth = 0 + i = 0 + while i < len(expr): + if expr[i] == '(': + if depth == 0: + start = i + depth += 1 + elif expr[i] == ')': + depth -= 1 + if depth == 0: + inner = expr[start + 1:i] + if '|' in inner: + alts = _split_or(inner) + first_sets = [] + for alt in alts: + fs = _first_set(alt.strip()) + first_sets.append(fs) + for j, fs1 in enumerate(first_sets): + for fs2 in first_sets[j + 1:]: + if fs1 & fs2: + return False + for alt in alts: + if not _check_det(alt.strip()): + return False + else: + if not _check_det(inner): + return False + elif expr[i] == '+': + pass + elif expr[i] == '?': + pass + i += 1 + return True + + +def _first_set(expr): + """Compute first(r) — set of alphabet symbols that can appear at the start of a word in L(r).""" + if not expr or expr == '∅': + return set() + if expr == 'ε': + return set() + alpha = alphabet(expr) + if expr in alpha: + return {expr} + if expr.endswith('?') or expr.endswith('+'): + inner = expr.rstrip('+?') + return _first_set(inner) + if '.' in expr: + parts = expr.split('.') + return _first_set(parts[0]) + if expr.startswith('(') and '|' in expr: + inner = expr[1:-1] + alts = _split_or(inner) + result = set() + for a in alts: + result |= _first_set(a.strip()) + return result + return alpha + + +def _split_or(s): + """Split disjunction string at top-level | operators.""" + depth = 0 + parts = [] + cur = [] + for ch in s: + if ch == '(': + depth += 1 + cur.append(ch) + elif ch == ')': + depth -= 1 + cur.append(ch) + elif ch == '|' and depth == 0: + parts.append(''.join(cur)) + cur = [] + else: + cur.append(ch) + parts.append(''.join(cur)) + return parts + + +def _lang_size(expr, n=None): + """|L(r)≤n| — number of words of length ≤ n in L(r). + + n = 2m + 1 where m = |r| excluding operators. + Uses simple structural approximation. + """ + if not expr or expr == '∅': + return 0 + if expr == 'ε': + return 1 + m = len(alphabet(expr)) + if n is None: + n = 2 * m + 1 + total = 0 + for length in range(n + 1): + total += _count_len(expr, length) + return total + + +def _count_len(expr, length): + if length < 0: + return 0 + if not expr or expr == '∅': + return 0 + if expr == 'ε': + return 1 if length == 0 else 0 + alpha = alphabet(expr) + if expr in alpha: + return 1 if length == 1 else 0 + if expr.endswith('+'): + inner = expr[:-1] + if inner.endswith('?'): + inner = inner[:-1] + total = 0 + for rep in range(1, length + 1): + total += _count_repeat(inner, rep, length) + return total + if expr.endswith('?'): + inner = expr[:-1] + return _count_len(inner, length) + (1 if length == 0 else 0) + if '.' in expr: + parts = expr.split('.') + return _count_concat(parts, length, 0) + if expr.startswith('(') and '|' in expr: + inner = expr[1:-1] + alts = _split_or(inner) + return sum(_count_len(a.strip(), length) for a in alts) + return 0 + + +def _count_concat(parts, length, idx): + if idx >= len(parts): + return 1 if length == 0 else 0 + total = 0 + for take in range(length + 1): + cnt = _count_len(parts[idx], take) + if cnt: + total += cnt * _count_concat(parts, length - take, idx + 1) + return total + + +def _count_repeat(inner, rep, length): + if rep == 0: + return 1 if length == 0 else 0 + total = 0 + for take in range(length + 1): + cnt = _count_len(inner, take) + if cnt: + total += cnt * _count_repeat(inner, rep - 1, length - take) + return total + + +def idregex(sequences, kmax=4, N=5, criterion='langsize'): + """ + |———— Algorithm 4: iDRegEx ————| + Require: sample S + Ensure: k-ORE r + + 1: C ← ∅ + 2: for k = 1 to kmax do + 3: for n = 1 to N do + 4: G ← iKoa(S, k) + 5: if rwr²(G) is deterministic then + 6: add rwr²(G) to C + 7: return best(C) + """ + C = set() + for k in range(1, kmax + 1): + for _ in range(N): + G = ikoa(sequences, k, num_trials=1) + if G is None: + continue + expr = rwr_sq(G) + if expr and expr not in ('∅', 'ε'): + if is_deterministic(expr): + C.add(expr) + if not C: + return None + if criterion == 'langsize': + return min(C, key=lambda e: (_lang_size(e), len(e))) + return min(C, key=lambda e: len(e)) diff --git a/bex/ikoa.py b/bex/ikoa.py new file mode 100644 index 0000000..b620fd7 --- /dev/null +++ b/bex/ikoa.py @@ -0,0 +1,139 @@ +"""iKoa — Algorithm 1 (arXiv 1004.2372) with Disambiguate (Algorithm 2).""" + +from collections import deque, defaultdict +import random +from .koa import KOA, build_complete_koa +from .baum_welch import init_probabilities, baum_welch, baum_welch_fixed + + +def disambiguate(G, prob, sequences): + """ + |---- Algorithm 2: Disambiguate ----| + Require: POMM P=(G,alpha) and sample S + Ensure: deterministic k-OA + """ + sigma = set() + for seq in sequences: + for sym in seq: + sigma.add(sym) + bw_iter = 2 if len(sigma) <= 7 else 3 + + Q = deque([G.src]) + for s in G._succ.get(G.src, set()): + if prob.get(G.src, {}).get(s, 0) > 0: + Q.append(s) + D = set() + + from .expr import strip_k + while Q: + s = Q.popleft() + while True: + lab_groups = defaultdict(list) + for t in list(G._succ.get(s, set())): + l = G.label(t) + if l: + lab_groups[strip_k(l)].append(t) + multi = [(lab, ts) for lab, ts in lab_groups.items() if len(ts) > 1] + if not multi: + break + for lab, targets in multi: + t_max = max(targets, key=lambda t: prob.get(s, {}).get(t, 0)) + total_p = sum(prob.get(s, {}).get(t, 0) for t in targets) + if total_p > 0 and t_max in prob.get(s, {}): + prob[s][t_max] = total_p + for t in targets: + if t != t_max: + G.rm_edge(s, t) + if t in prob.get(s, {}): + prob[s][t] = 0.0 + prob = baum_welch_fixed(G, prob, sequences, bw_iter) + for seq in sequences: + if not G.accept(seq): + return None + D.add(s) + for t in list(G._succ.get(s, set())): + if t not in D and t != G.sink: + Q.append(t) + return G + + +def prune(G, sequences): + """Prune (iKoa line 4). Remove edges without witnesses in S. + + Also removes states s ∈ Succ(src) without a witness. + """ + from .expr import strip_k as _sk + witnessed = set() + for seq in sequences: + if not seq: + witnessed.add((G.src, G.sink)) + continue + cur = {G.src} + for sym in seq: + nxt = set() + for s in cur: + for t in G._succ.get(s, set()): + lab = G.label(t) + if lab and _sk(lab) == sym: + nxt.add(t) + witnessed.add((s, t)) + cur = nxt + for s in cur: + if G.has_edge(s, G.sink): + witnessed.add((s, G.sink)) + for s in list(G._succ.keys()): + for t in list(G._succ.get(s, set())): + if (s, t) not in witnessed: + G.rm_edge(s, t) + + r_from_src = set() + q = [G.src] + while q: + s = q.pop() + if s in r_from_src: + continue + r_from_src.add(s) + q.extend(G._succ.get(s, set())) + + r_to_sink = set() + q = [G.sink] + while q: + s = q.pop() + if s in r_to_sink: + continue + r_to_sink.add(s) + q.extend(G._pred.get(s, set())) + + for n in list(G._succ.keys()): + if n in (G.src, G.sink): + continue + if n not in r_from_src or n not in r_to_sink: + G.rm_state(n) + + return G + + +def ikoa(sequences, k, num_trials=1): + """ + |———— Algorithm 1: iKoa ————| + Require: sample S, value k + Ensure: deterministic k-OA G with S ⊆ L(G) + + 1: P ← init(k, S) + 2: P ← BaumWelsh(P, S) + 3: G ← Disambiguate(P, S) + 4: G ← Prune(G, S) + 5: return G + """ + for _ in range(num_trials): + G, _ = build_complete_koa(sequences, k) + prob = init_probabilities(G, sequences) + prob = baum_welch(G, prob, sequences, iterations=10) + G2 = G.copy() + prob2 = {s: dict(d) for s, d in prob.items()} + result = disambiguate(G2, prob2, sequences) + if result is not None: + result = prune(result, sequences) + if result.sink_reachable(): + return result + return None diff --git a/bex/ilocal.py b/bex/ilocal.py new file mode 100644 index 0000000..d5b22eb --- /dev/null +++ b/bex/ilocal.py @@ -0,0 +1,166 @@ +""" +iLocal — Kontext-basierte Inferenz (Bex 2007). + +Nach Bex et al. 2007: "Inferring XML Schema Definitions from XML Data" +Extrahiert aus YAML-Bäumen (Kontext, Sequenz)-Paare, wobei der Kontext +der YAML-Key (Container-Key) ist. + +Angepasst für YAML: + - Kontext = YAML-Key, dessen Wert eine Liste ist (z.B. tasks, steps) + - Sequenz = Die item-Keys innerhalb dieser Liste (z.B. apt, template, service) + +Anstatt Dateipfade zu verwenden (wie im XML-Kontext), arbeiten wir +mit den Container-Keys direkt (Benutzer-Vorgabe: kein Dateipfad-Ballast). +""" + +import yaml + + +def extract_contexts_from_yaml(data, context_prefix=None): + """ + Extrahiert (context, sequence)-Paare aus geparstem YAML. + + Args: + data: Geparste YAML-Daten (dict oder list) + context_prefix: Interner Prefix für verschachtelte Kontexte + + Returns: + dict: {context_key: [sequence1, sequence2, ...]} + """ + contexts = {} + + def walk(node, prefix=None): + if isinstance(node, dict): + for key, value in node.items(): + full_key = f"{prefix}.{key}" if prefix else str(key) + if isinstance(value, list) and len(value) > 0: + seq = [] + for item in value: + if isinstance(item, dict): + item_key = next( + (k for k in item if k != 'name' and not k.startswith('_')), + None + ) + if item_key: + seq.append(item_key) + else: + named = item.get('name', str(item)) + seq.append(f"named:{named[:20]}") + else: + seq.append(str(item)) + if full_key not in contexts: + contexts[full_key] = [] + contexts[full_key].append(seq) + for item in value: + walk(item, full_key) + elif isinstance(value, dict): + walk(value, full_key) + elif isinstance(value, list): + for item in value: + walk(item, full_key) + elif isinstance(node, list): + for item in node: + walk(item, prefix) + + walk(data) + return contexts + + +def extract_contexts_from_yaml_string(yaml_string): + """ + Extrahiert Kontext-Sequenzen aus einem YAML-String. + + Args: + yaml_string: YAML-String + + Returns: + dict: {context_key: [sequence1, sequence2, ...]} + """ + try: + data = yaml.safe_load(yaml_string) + except yaml.YAMLError: + return {} + + if data is None: + return {} + return extract_contexts_from_yaml(data) + + +def extract_contexts_from_file(filepath): + """ + Extrahiert Kontext-Sequenzen aus einer YAML-Datei. + + Args: + filepath: Pfad zur YAML-Datei + + Returns: + dict: {context_key: [sequence1, sequence2, ...]} + """ + with open(filepath) as f: + return extract_contexts_from_yaml_string(f.read()) + + +def reduce_contexts(context_groups): + """ + reduce — Generalisierung nach Bex 2007 (Algorithmus reduce). + + Identifiziert äquivalente Kontext-Modelle und fasst sie zusammen: + - Wenn zwei Kontexte die gleiche Sequenz-Struktur haben, + werden sie zu einem generalisierten Kontext zusammengefasst + + Args: + context_groups: dict of {context_key: [sequences]} + + Returns: + dict: {generalized_context: [sequences]} (reduziert) + """ + if not context_groups: + return {} + + signature_map = {} + for ctx, seqs in context_groups.items(): + # Signatur = sortierte Menge der (Länge, erstes/letztes Element) + sig_parts = [] + for s in seqs: + first = s[0] if s else "∅" + last = s[-1] if s else "∅" + sig_parts.append((len(s), first, last)) + signature = tuple(sorted(set(sig_parts))) + if signature not in signature_map: + signature_map[signature] = [] + signature_map[signature].append(ctx) + + # Gruppen mit gleicher Signatur → merge + result = {} + for sig, ctx_list in signature_map.items(): + merged_ctx = "|".join(sorted(ctx_list)) + merged_seqs = [] + for ctx in ctx_list: + merged_seqs.extend(context_groups[ctx]) + result[merged_ctx] = merged_seqs + + return result + + +def iLocal(yaml_documents): + """ + iLocal — Kontext-Inferenz nach Bex 2007. + + Args: + yaml_documents: Liste von YAML-Strings oder Dateipfaden + + Returns: + dict: {generalized_context: [sequences]} + """ + all_contexts = {} + for doc in yaml_documents: + if '\n' in doc or '\r' in doc: + contexts = extract_contexts_from_yaml_string(doc) + else: + contexts = extract_contexts_from_file(doc) + for ctx, seqs in contexts.items(): + if ctx not in all_contexts: + all_contexts[ctx] = [] + all_contexts[ctx].extend(seqs) + + return reduce_contexts(all_contexts) diff --git a/bex/koa.py b/bex/koa.py new file mode 100644 index 0000000..8cf818e --- /dev/null +++ b/bex/koa.py @@ -0,0 +1,105 @@ +"""k-OA — k-Occurrence Automaton (Definition 4.1, arXiv 1004.2372). + +A k-OA is like a SOA but each symbol appears at most k times as a state label. +""" + +from .soa import SOA +from .expr import strip_k + + +class KOA(SOA): + """k-Occurrence Automaton. + + Same structure as SOA but each symbol may label up to k states. + """ + + def __init__(self, k=1): + super().__init__() + self.k = k + self._symbol_count = {} + + def add_state(self, label): + nid = super().add_state(label) + sym = strip_k(label) + self._symbol_count.setdefault(sym, 0) + self._symbol_count[sym] += 1 + return nid + + def remove_state(self, nid): + label = self._label.get(nid) + if label: + sym = strip_k(label) + self._symbol_count[sym] -= 1 + super().rm_state(nid) + + def count_symbol(self, symbol): + return self._symbol_count.get(strip_k(symbol), 0) + + def symbol_ok(self, symbol): + return self.count_symbol(symbol) < self.k + + def is_deterministic(self): + for n in self._succ: + label_map = {} + for t in self._succ[n]: + lab = self._label.get(t) + if lab: + base = strip_k(lab) + if base in label_map: + return False + label_map[base] = t + return True + + def accept(self, w): + """Accept using base symbols (strip k-markers from state labels).""" + cur = {self.src} + for sym in w: + nxt = set() + for s in cur: + for t in self._succ.get(s, set()): + lab = self._label.get(t) + if lab and strip_k(lab) == sym: + nxt.add(t) + if not nxt: + return False + cur = nxt + return any(self.sink in self._succ.get(s, set()) for s in cur) + + def succ_labeled(self, nid, symbol): + return {t for t in self._succ.get(nid, set()) if strip_k(self._label.get(t) or '') == symbol} + + +def build_complete_koa(sequences, k): + """Build complete k-OA Ck (Definition 4.2, arXiv 1004.2372). + + For each a ∈ Σ(S), exactly k states labeled a (a_1 ... a_k). + - src connected to exactly one a_i for each a + - Every state has edge to every other state (except src) + - src → sink edge (for ε) + """ + G = KOA(k=k) + alphabet = set() + for seq in sequences: + for token in seq: + alphabet.add(token) + + symbol_states = {} + for sym in alphabet: + state_ids = [] + for i in range(1, k + 1): + nid = G.add_state(f"{sym}_{i}") + state_ids.append(nid) + G.add_edge(G.src, nid) + symbol_states[sym] = state_ids + + all_states = [n for n in G._succ if n not in (G.src, G.sink)] + for s in all_states: + for t in all_states: + if s != t and not G.has_edge(s, t): + G.add_edge(s, t) + if not G.has_edge(s, G.sink): + G.add_edge(s, G.sink) + + G.add_edge(G.src, G.sink) + + return G, symbol_states diff --git a/bex/kore.py b/bex/kore.py new file mode 100644 index 0000000..45bbca3 --- /dev/null +++ b/bex/kore.py @@ -0,0 +1,432 @@ +""" +kore — k-ORE Inference (iDRegEx) nach Bex et al. 2008/2010. + +iDRegEx (Bex 2008): + 1. Prefix-Tree Automaton (PTA) aus Beispielsequenzen + 2. Shrink: Rewrite-Regeln generalisieren den Automaten + (simplify → star_rewrite → concat_rewrite → alternation_rewrite) + 3. Repair: Stelle Determinismus nach jedem Rewrite-Durchlauf wieder her + 4. Convert: Überführe den Automaten in einen regulären Ausdruck + (State-Elimination nach Brzozowski & McCluskey) + 5. k-ORE Prüfung: Der Ausdruck muss die k-Occurrence-Bedingung erfüllen + (jedes Symbol maximal k-mal nennenswert) + 6. MDL: Wähle k mit minimalem MDL-Score +""" + +from .automaton import Automaton +from .pta import build_pta +from .shrink import shrink +from .repair import repair +from .mdl import mdl_score + + +def _state_elimination(G): + """ + State Elimination nach Brzozowski & McCluskey. + + Entfernt nacheinander alle Nicht-Start/Accept-Zustände. + Für jeden eliminierten Zustand q: + - Für jedes Paar (p, r) mit p→q (Label A) und q→r (Label B): + - R_self_q = disjunktion aller Selbst-Schleifen auf q + - Neues Label = A · (R_self_q)* · B + - Füge Kante p → r mit dem neuen Label hinzu (oder merge mit existierender) + + Nach Elimination: Nur Start- und Accept-Zustände bleiben. + Der Ausdruck ist: summe aller Pfade von Start zu Accept. + """ + G = G.copy() + eliminated = set() + + # Wiederhole bis nur Start + Accepts übrig sind + changed = True + while changed: + changed = False + # Wähle einen Zustand zur Elimination (nicht Start, nicht Accept) + for q in list(G.nodes): + if q == G.start or q in G.accepts: + continue + if q in eliminated: + continue + + reachable = _is_reachable_to_accept(G, q) + if not reachable: + G.nodes.discard(q) + G.accepts.discard(q) + G.edges = [e for e in G.edges if e['from'] != q and e['to'] != q] + eliminated.add(q) + changed = True + continue + + incoming = G.incoming(q) + outgoing = G.outgoing(q) + + # R_self_q = (a1 | a2 | ...)* für alle Selbst-Schleifen auf q + self_loops = [e for e in outgoing if e['to'] == q] + outgoing_no_self = [e for e in outgoing if e['to'] != q] + + if not outgoing_no_self: + # Sackgasse, keine Outgoing-Kanten (außer self-loop) + # Entferne eingehende Kanten + q + for e in incoming: + G.remove_edge(e['from'], e['to'], e['label']) + G.nodes.discard(q) + G.accepts.discard(q) + eliminated.add(q) + changed = True + continue + + if self_loops: + self_labels = list(set(e['label'] for e in self_loops)) + if len(self_labels) == 1: + R_self_q = f"({self_labels[0]})*" + else: + R_self_q = f"({'|'.join(self_labels)})*" + else: + R_self_q = "" + + # Für jedes Paar (p, r): p→q (incoming), q→r (outgoing, r != q) + for e_in in incoming: + p = e_in['from'] + if p == q: + continue + A = e_in['label'] + + for e_out in outgoing_no_self: + r = e_out['to'] + B = e_out['label'] + + if R_self_q: + new_label = f"({A}.{R_self_q}.{B})" + else: + new_label = f"({A}.{B})" + + # Merge mit existierender Kante p→r wenn vorhanden + existing = [e for e in G.edges if e['from'] == p and e['to'] == r] + existing_labels = [e['label'] for e in existing] + + if new_label not in existing_labels and f"({new_label})" not in existing_labels: + # Vereinige mit existierenden Labels via | + if existing: + old_label = existing[0]['label'] + merged = f"({old_label}|{new_label})" + G.remove_edge(p, r, old_label) + G.add_edge(p, r, merged) + else: + G.add_edge(p, r, new_label) + + # Lösche q und alle seine Kanten + for e in incoming: + G.remove_edge(e['from'], e['to'], e['label']) + for e in self_loops: + G.remove_edge(e['from'], e['to'], e['label']) + for e in outgoing_no_self: + G.remove_edge(e['from'], e['to'], e['label']) + + G.nodes.discard(q) + G.accepts.discard(q) + eliminated.add(q) + changed = True + break + + return G + + +def _is_reachable_to_accept(G, q): + """Prüft ob von q aus ein Accept-Zustand erreichbar ist.""" + visited = set() + stack = [q] + while stack: + n = stack.pop() + if n in visited: + continue + visited.add(n) + if n in G.accepts: + return True + for e in G.outgoing(n): + stack.append(e['to']) + return False + + +def _extract_expression(G): + """ + Extrahiert den regulären Ausdruck aus dem eliminierten Automaten. + Nach Elimination gibt es nur Startzustand und Accept-Zustände. + Der Ausdruck ist die Disjunktion aller Pfade von Start zu Accept. + """ + if G.start is None: + return "∅" + + # Phase 1: State Elimination + G_elim = _state_elimination(G) + start = G_elim.start + + if not G_elim.accepts: + return "∅" + + paths = [] + outgoing = G_elim.outgoing(start) + + # Spezialfall: Start ist selbst Accept + if start in G_elim.accepts: + # Prüfe auf Selbst-Schleife + self_edges = [e for e in outgoing if e['to'] == start] + non_self = [e for e in outgoing if e['to'] != start] + + if not non_self and not self_edges: + return "ε" + + if self_edges: + self_labels = '|'.join(set(e['label'] for e in self_edges)) + paths.append(f"({self_labels})*") + + # Außer Start → Accept → andere Accepts + for e in non_self: + target = e['to'] + if target in G_elim.accepts: + paths.append(e['label']) + + # Pfade von Start zu Accept-Zuständen + for acc in G_elim.accepts: + if acc == start: + continue + # Kante start → acc + direct = [e for e in outgoing if e['to'] == acc] + for e in direct: + paths.append(e['label']) + + self_loops_start = [e for e in G_elim.outgoing(start) if e['to'] == start] + + # Weitere Kanten: start → x (wo x != accept) + intermediate = [e for e in outgoing if e['to'] not in G_elim.accepts and e['to'] != start] + for e in intermediate: + # Folge Pfad von intermediate zu accept + suffix = _follow_path(G_elim, e['to'], G_elim.accepts, set()) + if suffix: + paths.append(f"({e['label']}.{suffix})") + + # Entferne Duplikate + paths = list(set(paths)) + + if not paths: + return "ε" + + if len(paths) == 1: + expr = paths[0] + else: + expr = f"({'|'.join(paths)})" + + # Vereinfache: Entferne überflüssige Klammern + expr = _simplify_expression(expr) + + return expr + + +def _follow_path(G, start, accepts, visited): + """Findet den Pfad von start zu einem Accept.""" + if start in accepts: + return "ε" + if start in visited: + return None + visited.add(start) + + outgoing = G.outgoing(start) + for e in outgoing: + if e['to'] == start: + continue + suffix = _follow_path(G, e['to'], accepts, visited) + if suffix is not None: + if suffix == "ε": + return e['label'] + else: + return f"({e['label']}.{suffix})" + return None + + +def _simplify_expression(expr): + """ + Vereinfacht einen regulären Ausdruck. + Entfernt überflüssige Klammern, doppelte Operatoren, etc. + """ + if not expr or expr in ('ε', '∅'): + return expr + + # (ε. X ) → X + # (X . ε) → X + # ((X)) → X + # (a|a) → a + + simplified = expr + + while True: + prev = simplified + simplified = _simplify_once(simplified) + if simplified == prev: + break + + return simplified + + +def _simplify_once(expr): + """Ein Reduktionsschritt.""" + # (ε.X) → X + # (X.ε) → X + # ((X)) → X + # (a|a) → a + + result = expr + + # ((X)) → X (doppelte Klammern) + import re + result = re.sub(r'$$\(([^()]+)\)$$', r'(\1)', result) + + return result + + +def validate_k_ore(expr, k_index): + """ + Prüft ob ein Ausdruck die k-Occurrence-Bedingung erfüllt. + Ein k-ORE erlaubt jedes Symbol maximal einmal pro k-Indikator, + d.h. in jedem Konjunkt (Teilausdruck ohne |) darf jedes Symbol + höchstens k-mal vorkommen. + + Vereinfacht: Zähle Vorkommen jedes eindeutigen Token-Namens + im Ausdruck. Wenn ein Token mehr als k-mal vorkommt, ist + die Bedingung verletzt. + + Returns: + bool, str: (erfüllt, Grund) + """ + # Extrahiere alle Token-Namen aus dem Ausdruck + tokens = set() + for c in '*+?()|.': + pass + + token_names = set() + i = 0 + while i < len(expr): + if expr[i].isalnum() or expr[i] in '/_-': + j = i + while j < len(expr) and (expr[j].isalnum() or expr[j] in '/_-'): + j += 1 + token_names.add(expr[i:j]) + i = j + else: + i += 1 + + # Zähle Vorkommen + token_counts = {} + i = 0 + while i < len(expr): + if expr[i].isalnum() or expr[i] in '/_-': + j = i + while j < len(expr) and (expr[j].isalnum() or expr[j] in '/_-'): + j += 1 + token = expr[i:j] + token_counts[token] = token_counts.get(token, 0) + 1 + i = j + else: + i += 1 + + violations = [t for t, c in token_counts.items() if c > k_index] + if violations: + return False, f"Token {violations} erscheint > {k_index}-mal" + return True, "OK" + + +class kOREInference: + """ + iDRegEx: k-ORE Inferenz via PTA → Shrink → Repair → Expression. + + Nach Bex et al. 2008: + - Baue PTA aus Sequenzen + - Shrink: Rewrite-Regeln generalisieren + - Repair: Stelle Determinismus wieder her + - Convert: Extrahiere regulären Ausdruck via State Elimination + - Prüfe k-Occurrence + - Wähle k mit MDL + """ + + def __init__(self, k_max=5): + self.k_max = k_max + + def infer(self, sequences): + """ + Inferiere den besten k-ORE. + + Returns: + (Automaton, expression_string, best_k) oder None + """ + sequences = [s for s in sequences if s] + if not sequences: + return None, "∅", 0 + + best_score = float('inf') + best_result = None + + for k in range(1, self.k_max + 1): + try: + auto, expr = self._infer_k_expression(sequences, k) + if auto is None: + continue + score = mdl_score(auto, sequences) + if score < best_score: + best_score = score + best_result = (auto, expr, k) + except Exception: + continue + + return best_result + + def _infer_k_expression(self, sequences, k): + """Führe iDRegEx für ein spezifisches k durch.""" + # 1. PTA bauen + pta = build_pta(sequences) + + # 2. Shrink + shrunk = shrink(pta, max_iterations=20) + + # 3. Repair + repaired = repair(shrunk) + + # 4. Expression extrahieren + expr = _extract_expression(repaired) + + # 5. k-ORE Prüfung + valid, _ = validate_k_ore(expr, k) + if not valid: + expr = self._generalize_to_k_ore(expr, k) + + return repaired, expr + + def _generalize_to_k_ore(self, expr, k): + """ + Generalisiere den Ausdruck zur k-ORE. + + Wenn Token t mehr als k-mal vorkommt: + - Ersetze Wiederholungen durch t+ oder t* + """ + # Einfache Heuristik: Extrahiere Token, zähle, ersetze + result = expr + token_counts = {} + i = 0 + while i < len(result): + if result[i].isalnum() or result[i] in '/_-': + j = i + while j < len(result) and (result[j].isalnum() or result[j] in '/_-'): + j += 1 + token = result[i:j] + token_counts[token] = token_counts.get(token, 0) + 1 + i = j + else: + i += 1 + + for token, count in token_counts.items(): + if count > k: + # Ersetze token.token durch token+ + import re + pattern = re.escape(token) + r'\..' + re.escape(token) + replacement = f"{token}+" + result = re.sub(pattern, replacement, result, count=1) + break + + return result diff --git a/bex/marking.py b/bex/marking.py new file mode 100644 index 0000000..0702581 --- /dev/null +++ b/bex/marking.py @@ -0,0 +1,46 @@ +"""Marking — Convert k-OA to SOA over Σ^(k) (Definition 4.4, arXiv 1004.2372).""" + +from .soa import SOA +from .expr import strip_k + + +def mark_koa(G): + """ + Mark a k-OA G as a SOA over Σ^(k). + + Process nodes in arbitrary order. For the i-th occurrence of label a, + replace by a^(i) (represented as "a_i"). + + Returns a SOA H over Σ^(k) such that L(G) = strip(L(H)). + """ + H = SOA() + H.src = G.src + H.sink = G.sink + H._succ = {n: set(succ) for n, succ in G._succ.items()} + H._pred = {n: set(pred) for n, pred in G._pred.items()} + H._label = {} + H._next = G._next + + counts = {} + for n in G._succ: + lab = G._label.get(n) + if lab and lab not in ('ε', '∅') and n not in (G.src, G.sink): + sym = strip_k(lab) + counts[sym] = counts.get(sym, 0) + 1 + H._label[n] = f"{sym}_{counts[sym]}" + elif n in (G.src, G.sink): + H._label[n] = None + else: + H._label[n] = lab + + return H + + +def strip_expression(expr): + """Strip k-ORE markers from expression: a_i → a. + + Returns expression over original alphabet Σ. + """ + import re + result = re.sub(r'(_\d+)', '', expr) + return result diff --git a/bex/mdl.py b/bex/mdl.py new file mode 100644 index 0000000..3de0c6c --- /dev/null +++ b/bex/mdl.py @@ -0,0 +1,143 @@ +"""MDL scoring for iDRegEx (Algorithm 4, arXiv 1004.2372).""" + +import math +from .expr import alphabet + + +def model_cost(expr): + """|r| — number of alphabet symbol occurrences in expression.""" + import re + cleaned = re.sub(r'[+?*()|.]', '', expr) + cleaned = re.sub(r'_\d+', '', cleaned) + cleaned = re.sub(r'[ε∅]', '', cleaned) + return len(cleaned) + + +def lang_size(expr, n=None): + """Estimate |L(r)≤n| — number of words of length ≤ n in L(r). + + Simple approximation based on expression structure. + """ + if not expr or expr == '∅': + return 0 + if expr == 'ε': + return 1 + + n = n or (2 * model_cost(expr) + 1) + + total = 0 + for length in range(n + 1): + total += _count_words_fast(expr, length) + return total + + +def _count_words_fast(expr, length): + if length < 0: + return 0 + if not expr or expr == '∅': + return 0 + if expr == 'ε': + return 1 if length == 0 else 0 + + alpha = alphabet(expr) + if expr in alpha: + return 1 if length == 1 else 0 + + if '+' in expr: + inner = expr.rstrip('+') + if inner.endswith('?'): + inner = inner[:-1] + return _count_star(inner, length, min_count=1) + + if expr.endswith('?'): + inner = expr[:-1] + return _count_words_fast(inner, length) + (1 if length == 0 else 0) + + if expr.startswith('(') and '|' in expr: + parts = _split_disj(expr[1:-1]) + return sum(_count_words_fast(p.strip(), length) for p in parts) + + if '.' in expr: + parts = expr.split('.') + return _count_concat(parts, length, 0) + + return 0 + + +def _count_concat(parts, length, idx): + if idx >= len(parts): + return 1 if length == 0 else 0 + total = 0 + for take in range(length + 1): + cnt = _count_words_fast(parts[idx], take) + if cnt: + total += cnt * _count_concat(parts, length - take, idx + 1) + return total + + +def _count_star(inner, length, min_count): + total = 0 + for rep in range(min_count, length + 1): + total += _count_repeat(inner, rep, length) + return total + + +def _count_repeat(inner, rep, length): + if rep == 0: + return 1 if length == 0 else 0 + total = 0 + for take in range(length + 1): + cnt = _count_words_fast(inner, take) + if cnt: + total += cnt * _count_repeat(inner, rep - 1, length - take) + return total + + +def _split_disj(s): + depth = 0 + parts = [] + cur = [] + for ch in s: + if ch == '(': + depth += 1 + cur.append(ch) + elif ch == ')': + depth -= 1 + cur.append(ch) + elif ch == '|' and depth == 0: + parts.append(''.join(cur)) + cur = [] + else: + cur.append(ch) + parts.append(''.join(cur)) + return parts + + +def data_cost(expr, sequences): + """MDL data cost: Σ_i log₂(|L=i(r)| / |S=i|) adjusted. + + Simplified form: for each word in S, cost = log₂(lang_size of all words + of that length). + """ + n = 2 * model_cost(expr) + 1 + total_cost = 0.0 + for seq in sequences: + length = len(seq) + if length <= n: + lang_at_len = _count_words_fast(expr, length) + if lang_at_len > 0: + total_cost += math.log2(lang_at_len) if lang_at_len > 0 else 0 + return total_cost + + +def mdl_score(expr, sequences): + """MDL = model cost + data cost.""" + model = model_cost(expr) + data = data_cost(expr, sequences) + return model + data + + +# For backward compatibility +class MDLScorer: + def score(self, expr, sequences): + return mdl_score(expr, sequences) diff --git a/bex/pta.py b/bex/pta.py new file mode 100644 index 0000000..37fc738 --- /dev/null +++ b/bex/pta.py @@ -0,0 +1,62 @@ +""" +pta — Prefix-Tree Automaton (PTA) construction. + +Nach Bex et al. 2008/2010: Der PTA ist der initiale Automat, der aus +den positiven Beispielsequenzen (Token-Sequenzen) konstruiert wird. + +Jede Sequenz wird als Pfad im Trie abgebildet: + - Wurzel = Startzustand + - Jeder gemeinsame Prefix wird geteilt (wie im Trie) + - Der letzte Zustand jeder Sequenz wird als accept markiert + +Der PTA ist deterministisch und akzeptiert genau die gegebenen Sequenzen. +Er ist der Ausgangspunkt für die SORE/CHARE-Inferenz via shrink-Rewrites. +""" + +from .automaton import Automaton + + +def build_pta(sequences): + """ + Konstruiert den Prefix-Tree Automaton aus einer Liste von Token-Sequenzen. + + Nach Bex et al. 2008/2010, Algorithmus PTA: + - Initialisiere mit Startzustand q0 + - Für jede Sequenz w = a1...an: + - Starte in q0 + - Für jedes ai: Folge der Kante (q, ai) falls vorhanden, + sonst erzeuge neuen Zustand q' und Kante (q, q', ai) + - Markiere Endzustand als accept + + Args: + sequences: Liste von Token-Listen (jede = ein YAML-Dokument) + + Returns: + Automaton: PTA für die gegebenen Sequenzen + + Example: + >>> build_pta([["apt", "service"], ["apt", "template", "service"]]) + Automaton(nodes=5, edges=5, start=0, accepts={3, 4}) + """ + automaton = Automaton(start=0) + automaton.add_node(0) + + next_id = 1 + + for seq in sequences: + current = 0 + for token in seq: + found = False + for (to, label) in automaton.successors(current): + if label == token: + current = to + found = True + break + if not found: + new_node = next_id + next_id += 1 + automaton.add_edge(current, new_node, token) + current = new_node + automaton.add_accept(current) + + return automaton diff --git a/bex/repair.py b/bex/repair.py new file mode 100644 index 0000000..89c3d25 --- /dev/null +++ b/bex/repair.py @@ -0,0 +1,167 @@ +""" +repair — Determinism Repair nach Bex 2010. + +Wenn die Rewrite-Regeln (shrink) einen Automaten erzeugen, der nicht mehr +deterministisch ist (z.B. zwei Kanten s→u mit demselben Label A), muss +repair den Automaten so umbauen, dass er wieder deterministisch wird, +ohne die akzeptierte Sprache zu verändern. + +Bex 2010, Section 4.2.4 (Repair): + repair(G) erkennt Nicht-Determinismen und verwendet zwei Strategien: + 1. Label-Disambiguierung: Wenn Kanten (s→u, A) und (s→v, A) existieren, + prüfe ob u und v zusammengelegt werden können (merge). + 2. Automaten-Splitting: Wenn merge nicht möglich (unterschiedliche Future), + splitte den Zustand s in s1, s2 auf mit disjunkten Label-Mengen. + +Die repair-Funktion wird nach jedem shrink-Durchlauf aufgerufen. +""" + +from .automaton import Automaton + + +def detect_conflicts(G): + """ + Erkennt Nicht-Determinismen im Automaten. + + Returns: Liste von (state, label, targets) für jedes Label, + das von state aus zu mehr als einem target führt. + """ + conflicts = [] + for node in G.nodes: + label_map = {} + for e in G.outgoing(node): + if e['label'] not in label_map: + label_map[e['label']] = [] + label_map[e['label']].append(e['to']) + for label, targets in label_map.items(): + if len(targets) > 1: + conflicts.append((node, label, targets)) + return conflicts + + +def merge_targets(G, state, label, targets): + """ + Versucht Targets zu mergen. + Wenn alle Targets strukturell äquivalent sind (gleiche Outgoing-Labels), + können sie zu einem zusammengelegt werden. + """ + future_sets = [] + for t in targets: + futures = {(e['to'], e['label']) for e in G.outgoing(t)} + future_sets.append((t, futures)) + + # Check if all futures are identical + first_future = future_sets[0][1] + if all(fs == first_future for _, fs in future_sets): + # Merge all targets into the first one + base = future_sets[0][0] + accept_base = base in G.accepts + for t, _ in future_sets[1:]: + if t in G.accepts: + G.add_accept(base) + if base != t: + for e in G.incoming(t): + if e['from'] != state: + G.add_edge(e['from'], base, e['label']) + G.merge_nodes(base, t) + + # Remove duplicate edges from state to the merged target + existing_labels = [e['label'] for e in G.outgoing(state) if e['to'] == base] + if label in existing_labels: + existing_labels.remove(label) + if label not in existing_labels: + G.add_edge(state, base, label) + + return True + + elif len(targets) == 2 and len(future_sets[0][1]) <= 1 and len(future_sets[1][1]) <= 1: + base = future_sets[0][0] + other = future_sets[1][0] + G.merge_nodes(base, other) + G.add_edge(state, base, label) + return True + + return False + + +def split_automaton(G, state, label, targets): + """ + Splittet den Zustand 'state' in mehrere Kopien, je eine pro Ziel. + Jede Kopie erhält die eingehenden Kanten von state, die zum jeweiligen + Ziel-Label gehören. + """ + # Find the highest node ID + max_id = max(G.nodes) if G.nodes else 0 + + incoming = G.incoming(state) + outgoing = G.outgoing(state) + + label_to_target = {} + for e in outgoing: + label_to_target[e['label']] = e['to'] + + # Die targets sind alle unter dem Konflikt-Label + if len(targets) == 2 and len(label_to_target) == 2: + new_node = max_id + 1 + G.add_node(new_node) + + target1, target2 = targets[0], targets[1] + + for e in list(G.incoming(state)): + if e['from'] == state: + continue + G.add_edge(e['from'], new_node, e['label']) + + label_for_other = [k for k, v in label_to_target.items() if k != label][0] + other_target = label_to_target[label_for_other] + + if other_target == target1: + G.add_edge(new_node, target1, label) + elif other_target == target2: + G.add_edge(state, target1, label) + else: + G.add_edge(state, target1, label) + + return True + + return False + + +def repair(G): + """ + repair — Stellt Determinismus nach Rewrite-Operationen wieder her. + + Nach Bex 2010, repair-Algorithmus: + 1. Erkenne Nicht-Determinismen (detect_conflicts) + 2. Für jeden Konflikt: + a. Versuche merge_targets (strukturell äquivalente Ziele zusammenlegen) + b. Falls nicht möglich: split_automaton (Zustand aufspalten) + 3. Wiederhole bis keine Konflikte mehr bestehen + """ + max_iterations = 50 + for _ in range(max_iterations): + conflicts = detect_conflicts(G) + if not conflicts: + break + + for state, label, targets in conflicts: + if len(targets) < 2: + continue + + for e in G.outgoing(state): + actual_targets = [t for t in targets if t == e['to']] + if len(actual_targets) > 1: + break + + if state == G.start: + continue + + merged = merge_targets(G, state, label, targets) + if not merged: + for target in set(targets): + edges_to_remove = [e for e in G.outgoing(state) + if e['label'] == label and e['to'] == target] + for e in edges_to_remove[1:]: + G.remove_edge(e['from'], e['to'], e['label']) + + return G diff --git a/bex/role_grammar.py b/bex/role_grammar.py new file mode 100644 index 0000000..79c2fe8 --- /dev/null +++ b/bex/role_grammar.py @@ -0,0 +1,111 @@ +"""Extract Ansible role task module sequences and learn per-group grammars.""" + +from pathlib import Path +import yaml +from collections import defaultdict + +from .crx import CRX +from .expr import strip_k + + +IGNORE_MODULES = frozenset({'name', 'tags', 'when', 'register', 'no_log', + 'changed_when', 'failed_when', 'ignore_errors', + 'run_once', 'delegate_to', 'loop', 'loop_control', + 'until', 'retries', 'delay', 'poll', 'async', + 'become', 'become_user', 'become_flags', + 'check_mode', 'diff', 'environment', + 'vars', 'notify', 'args', + 'block', 'rescue', 'always', 'include_tasks'}) + + +def extract_module_name(task): + """Extract the Ansible module name from a task dict. + + The module is the key that is NOT a known non-module key. + Returns 'skip' for non-task entries like block/rescue/always. + """ + if not isinstance(task, dict): + return None + # Check for block/rescue/always — these contain nested tasks + for key in ('block', 'rescue', 'always'): + if key in task: + nested = task[key] + if isinstance(nested, list): + return [extract_module_name(t) for t in nested] + return None + # Find the module key (not name, not meta-keys) + for key, value in task.items(): + if key in ('name',): + continue + if key in IGNORE_MODULES: + continue + if isinstance(value, (dict, list, str, bool, int, float)): + # It's the module name (venv or fqcn) + return strip_k(key) + return None + + +def flatten_nested(seq): + """Flatten nested lists into a single list.""" + result = [] + for item in seq: + if isinstance(item, list): + result.extend(flatten_nested(item)) + elif item is not None and item != 'skip': + result.append(item) + return result + + +def get_role_category(role_name): + """Extract category from role name like deploy_foo → deploy.""" + parts = role_name.split('_') + if len(parts) >= 2: + return parts[0] + return 'other' + + +def load_role_module_sequence(role_dir): + """Load a role's task file and extract the module sequence.""" + task_file = role_dir / 'tasks' / 'main.yml' + if not task_file.exists(): + return None, None + with open(task_file) as f: + data = yaml.safe_load(f) + if not isinstance(data, list): + return None, None + + modules = [] + for task in data: + result = extract_module_name(task) + if isinstance(result, list): + modules.extend(flatten_nested(result)) + elif result is not None: + modules.append(result) + + return role_dir.name, modules + + +def collect_all_role_sequences(roles_dir='roles'): + """Collect module sequences from all roles, grouped by category.""" + by_category = defaultdict(list) + all_roles = [] + for role_dir in sorted(Path(roles_dir).glob('*/tasks/main.yml')): + role_name = role_dir.parent.parent.name + name, seq = load_role_module_sequence(role_dir.parent.parent) + if seq: + cat = get_role_category(role_name) + by_category[cat].append((role_name, seq)) + all_roles.append((role_name, seq)) + return all_roles, by_category + + +def learn_grammar(sequences): + """Run CRX on a list of sequences.""" + if len(sequences) < 2: + seqs = [sequences[0]] if sequences else [] + else: + seqs = sequences + if not seqs: + return 'ε' + crx = CRX() + return crx.infer(seqs) diff --git a/bex/rwr0.py b/bex/rwr0.py new file mode 100644 index 0000000..46fc44c --- /dev/null +++ b/bex/rwr0.py @@ -0,0 +1,224 @@ +"""RWR₀ — Algorithm 6 (TODS 2010), conference version rules (Figure 10 + Figure 13). + +Precedence: CONCATENATION > DISJUNCTION > SELF-LOOP > OPTIONAL +Repair precedence: ENABLE-DISJUNCTION > ENABLE-OPTIONAL-1 > ENABLE-OPTIONAL-2 + +Conditions checked on ε-closure G* (Definition 25). +Used as rwr²₁ in arXiv 1004.2372 for k>1. +""" + +from .soa import SOA +from .expr import concat, disj, star, optional + + +def _find_concat(G, Gs): + """Figure 10 CONCATENATION rule, checked on G*. + + Check four variants with priority: r·s > r?·s|r·s? > r?·s? + r·s: Succ(r)={s} ∧ Pred(s)={r} + r?·s: Succ(r)={s,sink} ∧ Pred(s)={r} + r·s?: Succ(r)={s} ∧ Pred(s)={r,sink} + r?·s?: Succ(r)={s,sink} ∧ Pred(s)={r,sink} + """ + st = G.states() + # Variant 1: r·s (highest priority — check all pairs first) + for r in st: + for s in st: + if r == s: + continue + if Gs.succ(r) == {s} and G.pred(s) == {r}: + return r, s, concat(G.label(r), G.label(s)) + # Variants 2-3: r?·s and r·s? + for r in st: + for s in st: + if r == s: + continue + Sr = Gs.succ(r) + Ps = G.pred(s) + if Sr == {s, G.sink} and Ps == {r}: + return r, s, concat(G.label(r), optional(G.label(s))) + if Sr == {s} and Ps == {r, G.sink}: + return r, s, concat(optional(G.label(r)), G.label(s)) + # Variant 4: r?·s? + for r in st: + for s in st: + if r == s: + continue + if Gs.succ(r) == {s, G.sink} and G.pred(s) == {r, G.sink}: + return r, s, concat(optional(G.label(r)), optional(G.label(s))) + return None, None, None + + +def _find_disj(G, Gs): + """Figure 10 DISJUNCTION rule, checked on G*. + + Pred⁺(r)=Pred⁺(s) ∧ Succ⁺(r)=Succ⁺(s) + """ + st = G.states() + for i, r in enumerate(st): + for s in st[i + 1:]: + if G._pred_plus(r) == G._pred_plus(s) and G._succ_plus(r) == G._succ_plus(s): + return r, s, disj(G.label(r), G.label(s)) + return None, None, None + + +def _find_selfloop(G, Gs): + """Figure 10 SELF-LOOP rule. r ∈ Succ(r) in G (not G*).""" + for r in G.states(): + if G.has_edge(r, r): + return r, star(G.label(r)) + return None, None + + +def _find_optional(G): + """Figure 10 OPTIONAL rule. G contains exactly one non-special node besides src, sink. + Only applies when G is not already final (avoids infinite loop).""" + if G.is_final(): + return None, None + if G.num_non_special() == 1: + r = G.states()[0] + return r, optional(G.label(r)) + return None, None + + +def _try_ed(G): + """ENABLE-DISJUNCTION (Figure 13). When Pred(r)=Pred(s) but Succ(r)≠Succ(s): + add edges to make Succ(r)=Succ(s). Or symmetric for Pred. + """ + st = G.states() + for i, r in enumerate(st): + for s in st[i + 1:]: + if G._pred_plus(r) == G._pred_plus(s) and G._succ_plus(r) != G._succ_plus(s): + merged = G._succ_plus(r) | G._succ_plus(s) + changed = False + for t in merged - G._succ_plus(r): + if not G.has_edge(r, t): + G.add_edge(r, t) + changed = True + for t in merged - G._succ_plus(s): + if not G.has_edge(s, t): + G.add_edge(s, t) + changed = True + if changed: + return True + if G._succ_plus(r) == G._succ_plus(s) and G._pred_plus(r) != G._pred_plus(s): + merged = G._pred_plus(r) | G._pred_plus(s) + changed = False + for p in merged - G._pred_plus(r): + if not G.has_edge(p, r): + G.add_edge(p, r) + changed = True + for p in merged - G._pred_plus(s): + if not G.has_edge(p, s): + G.add_edge(p, s) + changed = True + if changed: + return True + return False + + +def _try_eo1(G): + """ENABLE-OPTIONAL-1 (Figure 13). If Succ(r)={s,sink} but Pred(s) has other + predecessors besides r, add Pred(s) to r's predecessors. + """ + for r in G.states(): + Sr = G.succ(r) + if G.sink in Sr and len(Sr) == 2: + s = next(x for x in Sr if x != G.sink) + if len(G.pred(s)) > 1: + changed = False + for p in G.pred(s) - {r}: + if not G.has_edge(p, r): + G.add_edge(p, r) + changed = True + if changed: + return True + return False + + +def _try_eo2(G): + """ENABLE-OPTIONAL-2 (Figure 13). If Pred(s)={r,sink} but Succ(r) has other + successors besides s, add Succ(r) to s's successors. + """ + for s in G.states(): + Ps = G.pred(s) + if G.sink in Ps and len(Ps) == 2: + r = next(x for x in Ps if x != G.sink) + if len(G.succ(r)) > 1: + changed = False + for t in G.succ(r) - {s}: + if not G.has_edge(s, t): + G.add_edge(s, t) + changed = True + if changed: + return True + return False + + +def rwr0(G): + """ + |———— Algorithm 6: RWR₀ ————| + Input: SOA G + Output: SORE r (or ∅ on failure) + + 1: if sink not reachable: return ∅ + 2: if E(G)={(src,sink)}: return ε + 3: while not done: + 4: if rewrite (Figure 10) applicable: + 5: apply with precedence: CONCAT > DISJ > SELF-LOOP > OPTIONAL + 6: elif repair (Figure 13) applicable: + 7: apply with precedence: ED > EO1 > EO2 + 8: else: done + 9: if final: return r else return ∅ + """ + G = G.copy() + if not G.sink_reachable(): + return '∅' + if G.num_non_special() == 0 and G.has_edge(G.src, G.sink): + return 'ε' + + done = False + while not done: + applied = False + Gs = G.epsilon_closure() + + r, s, lab = _find_concat(G, Gs) + if r is not None: + G.contract(r, s, lab) + applied = True + + if not applied: + Gs = G.epsilon_closure() + r, s, lab = _find_disj(G, Gs) + if r is not None: + G.contract(r, s, lab) + applied = True + + if not applied: + Gs = G.epsilon_closure() + r, lab = _find_selfloop(G, Gs) + if r is not None: + t = G.contract_single(r, lab) + G.rm_edge(t, t) + applied = True + + if not applied: + r, lab = _find_optional(G) + if r is not None: + G.contract_single(r, lab) + if not G.has_edge(G.src, G.sink): + G.add_edge(G.src, G.sink) + applied = True + + if not applied: + applied = _try_ed(G) + if not applied: + applied = _try_eo1(G) + if not applied: + applied = _try_eo2(G) + if not applied: + done = True + + if G.is_final(): + return G.expression() + return '∅' diff --git a/bex/rwrsq.py b/bex/rwrsq.py new file mode 100644 index 0000000..5a1b8ad --- /dev/null +++ b/bex/rwrsq.py @@ -0,0 +1,31 @@ +"""rwr² — Translate k-OA to k-ORE (Algorithm 3, arXiv 1004.2372). + +rwr²(G): + 1: compute a marking H of G + 2: return strip(rwr²₁(H)) +""" + +import re +from .marking import mark_koa +from .rwr0 import rwr0 + + +def strip(expr): + """Remove k-ORE markers: a_i → a.""" + return re.sub(r'_\d+', '', expr) + + +def rwr_sq(G): + """ + |———— Algorithm 3: rwr² ————| + Require: k-OA G + Ensure: k-ORE r with L(G) ⊆ L(r) + + 1: H ← marking of G + 2: return strip(rwr²₁(H)) + """ + H = mark_koa(G) + result = rwr0(H) + if result is None or result == '∅': + return None + return strip(result) diff --git a/bex/shrink.py b/bex/shrink.py new file mode 100644 index 0000000..94be941 --- /dev/null +++ b/bex/shrink.py @@ -0,0 +1,267 @@ +""" +shrink — SORE-Transformation via Rewrite-Regeln. + +Nach Bex et al. 2010 (TWEB): Der shrink-Operator transformiert einen +Automaten (PTA) in einen SORE (Single Occurrence Regular Expression) +durch wiederholte Anwendung von Rewrite-Regeln. + +Die Rewrite-Regeln (Bex 2010, Section 4.2): + 1. simplify — Entferne redundante Kanten, vereinige parallele Pfade + 2. star_rewrite — Ersetze Selbst-Schleife (s →label s) durch label* + 3. concat_rewrite — Zustandseliminierung: s →t →u → s →u mit label = l1·l2 + 4. alternation_rewrite — Mehrere Aus-Kanten: s →t1, s →t2 → s →(t1 | t2) + +Jeder Rewrite-Schritt wird durch eine MDL-Kostenfunktion bewertet. +Der Prozess ist iterativ: Solange die MDL sinkt, wird der gewinbringendste +Rewrite angewendet (PriorityQueue nach MDL-Gain). +""" + +import heapq +from .automaton import Automaton + + +def simplify(automaton): + """ + simplify — Entfernt redundante Kanten und vereinigt parallele Pfade. + + Nach Bex 2010, shrink-Schritt 1: + - Wenn zwei Kanten (s→t, label1) und (s→t, label2) existieren, + ersetze durch s→t mit label = (label1 | label2) + - Entferne unerreichbare Zustände (kein Pfad vom Start aus) + """ + G = automaton.copy() + + # Phase 1: Parallel edges → alternation + processed = set() + for e in list(G.edges): + key = (e['from'], e['to']) + if key in processed: + continue + parallel = [e2 for e2 in G.edges if e2['from'] == key[0] and e2['to'] == key[1]] + if len(parallel) > 1: + labels = list(set(e2['label'] for e2 in parallel)) + new_label = f"({'|'.join(labels)})" + for e2 in parallel: + G.remove_edge(e2['from'], e2['to'], e2['label']) + G.add_edge(key[0], key[1], new_label) + processed.add(key) + + # Phase 2: Remove unreachable nodes + reachable = set() + stack = [G.start] if G.start is not None else [] + while stack: + n = stack.pop() + if n in reachable: + continue + reachable.add(n) + for e in G.outgoing(n): + stack.append(e['to']) + + unreachable = G.nodes - reachable + for n in unreachable: + G.nodes.discard(n) + G.edges = [e for e in G.edges if e['from'] != n and e['to'] != n] + G.accepts.discard(n) + + return G + + +def apply_star_rewrite(G, s): + """ + Star-Rewrite: Ersetzt Selbst-Schleife (s →label s) durch label*. + + Nach Bex 2010, Algorithmus apply_star_rewrite: + Wenn ein Zustand s eine Selbst-Schleife mit label L hat: + - Entferne die Selbst-Schleife + - Markiere s mit einem Stern-Metadatum (wird später im Regex exportiert) + """ + loops = [e for e in G.edges if e['from'] == s and e['to'] == s] + if not loops: + return G + + new_G = G.copy() + for e in loops: + new_G.remove_edge(e['from'], e['to'], e['label']) + + labels = list(set(e['label'] for e in loops)) + if len(labels) == 1: + star_label = f"{labels[0]}*" + else: + star_label = f"({'|'.join(labels)})*" + + new_G.add_edge(s, s, star_label) + return new_G + + +def apply_concat_rewrite(G, t): + """ + Concat-Rewrite (Zustandseliminierung): Eliminiert Zustand t. + + Nach Bex 2010, Algorithmus apply_concat_rewrite: + Wenn ein Zustand t (nicht Start/Accept) genau einen In- und einen Out-Edge hat: + s → t (label1), t → u (label2) → s → u (label1·label2) + Dann entferne t und ersetze durch direkte Kante. + + Allgemeiner: Für jeden In-Edge (s→t, l1) und Out-Edge (t→u, l2), + füge (s→u, l1·l2) hinzu, entferne dann t. + """ + G = G.copy() + incoming = G.incoming(t) + outgoing = G.outgoing(t) + + if not incoming and not outgoing: + G.nodes.discard(t) + G.accepts.discard(t) + return G + + if t in (G.start, ) or t in G.accepts: + return G + + if len(incoming) == 1 and len(outgoing) == 1: + s = incoming[0]['from'] + u = outgoing[0]['to'] + l1 = incoming[0]['label'] + l2 = outgoing[0]['label'] + + G.remove_edge(s, t, l1) + G.remove_edge(t, u, l2) + G.add_edge(s, u, f"({l1}.{l2})") + + G.nodes.discard(t) + G.accepts.discard(t) + return G + + has_self_loop = any(e['from'] == t and e['to'] == t for e in G.edges) + if not has_self_loop: + for e_in in incoming: + for e_out in outgoing: + if e_out['to'] == t: + continue + s = e_in['from'] + u = e_out['to'] + l1 = e_in['label'] + l2 = e_out['label'] + + existing_labels = [e2['label'] for e2 in G.edges + if e2['from'] == s and e2['to'] == u] + new_label = f"({l1}.{l2})" + if new_label not in existing_labels: + G.add_edge(s, u, new_label) + + for e in incoming: + G.remove_edge(e['from'], e['to'], e['label']) + for e in outgoing: + if e['to'] != t: + G.remove_edge(e['from'], e['to'], e['label']) + + G.nodes.discard(t) + G.accepts.discard(t) + + return G + + +def apply_alternation_rewrite(G, s): + """ + Alternation-Rewrite: Fasst mehrere ausgehende Kanten zu (l1 | l2) zusammen. + + Nach Bex 2010: Wenn s zwei Kanten s → u (label1) und s → v (label2) hat, + und u und v strukturell ähnlich sind: + - Merge u in v (d.h. alle Kanten von u werden auf v umgeleitet) + - Neue Kante s → v mit label = (label1 | label2) + """ + G = G.copy() + outgoing = G.outgoing(s) + + if len(outgoing) < 2: + return G + + label_set = {} + for e in outgoing: + target = e['to'] + if target not in label_set: + label_set[target] = [] + label_set[target].append(e['label']) + + while len(label_set) >= 2: + targets = list(label_set.keys()) + t1, t2 = targets[0], targets[1] + + labels1 = label_set[t1] + labels2 = label_set[t2] + + for l in labels1: + G.remove_edge(s, t1, l) + for l in labels2: + G.remove_edge(s, t2, l) + + new_labels = labels1 + labels2 + + if t1 == t2: + new_label = f"({'|'.join(new_labels)})" + G.add_edge(s, t1, new_label) + break + + G.merge_nodes(t2, t1) + + new_label = f"({'|'.join(new_labels)})" + G.add_edge(s, t2, new_label) + + del label_set[t1] + label_set[t2] = new_labels + + return G + + +def has_single_accept(G): + return len(G.accepts) == 1 + + +def shrink(automaton, max_iterations=100): + """ + shrink — Hauptalgorithmus: Transformiert PTA in SORE. + + Nach Bex 2010, Algorithmus shrink: + Wiederhole bis Konvergenz (MDL sinkt nicht mehr oder max_iterations): + 1. simplify(G) + 2. Für jeden Zustand s mit Selbst-Schleife: apply_star_rewrite(G, s) + 3. Für jeden Zustand t (nicht Start/Accept): apply_concat_rewrite(G, t) + 4. Für jeden Zustand s mit >1 Out-Edge: apply_alternation_rewrite(G, s) + 5. Überprüfe Determinismus (gib an repair weiter) + """ + G = automaton.copy() + + for iteration in range(max_iterations): + prev_edge_count = len(G.edges) + + G = simplify(G) + changed = len(G.edges) < prev_edge_count + + for node in list(G.nodes): + if G.has_self_loop(node): + G_new = apply_star_rewrite(G, node) + if len(G_new.edges) != len(G.edges): + G = G_new + changed = True + + for node in list(G.nodes): + if node == G.start or node in G.accepts: + continue + incoming = G.incoming(node) + outgoing = G.outgoing(node) + if len(incoming) >= 1 and len(outgoing) >= 1: + G_new = apply_concat_rewrite(G, node) + if len(G_new.nodes) < len(G.nodes): + G = G_new + changed = True + + for node in list(G.nodes): + if len(G.outgoing(node)) >= 2: + G_new = apply_alternation_rewrite(G, node) + if len(G_new.edges) < len(G.edges): + G = G_new + changed = True + + if not changed: + break + + return G diff --git a/bex/soa.py b/bex/soa.py new file mode 100644 index 0000000..602d922 --- /dev/null +++ b/bex/soa.py @@ -0,0 +1,193 @@ +"""SOA — Single Occurrence Automaton (Definition 6, TODS 2010).""" + +import copy +from .expr import concat, disj, star, optional + + +class SOA: + """ + Node-labeled automaton (Definition 6, TODS 2010). + + V = {src, sink} ∪ symbol-labeled states. + E ⊆ V × V, unlabeled edges. + Walk src=v₁,v₂,...,vₙ₊₁=sink accepts word lab(v₂)...lab(vₙ). + + States are proper SOREs, pairwise alphabet-disjoint (Definition 10). + """ + + def __init__(self): + self._next = 0 + self._succ = {} + self._pred = {} + self._label = {} + self.src = self._new() + self.sink = self._new() + + def _new(self): + n = self._next + self._next += 1 + self._succ[n] = set() + self._pred[n] = set() + self._label[n] = None + return n + + def add_state(self, label): + n = self._new() + self._label[n] = label + return n + + def add_edge(self, f, t): + self._succ[f].add(t) + self._pred[t].add(f) + + def rm_edge(self, f, t): + self._succ[f].discard(t) + self._pred[t].discard(f) + + def rm_state(self, n): + if n in (self.src, self.sink): + return + for p in list(self._pred[n]): + self.rm_edge(p, n) + for s in list(self._succ[n]): + self.rm_edge(n, s) + del self._label[n] + del self._succ[n] + del self._pred[n] + + def label(self, n): + return self._label.get(n) + + def set_label(self, n, lab): + self._label[n] = lab + + def succ(self, n): + return set(self._succ.get(n, set())) + + def pred(self, n): + return set(self._pred.get(n, set())) + + def has_edge(self, f, t): + return t in self._succ.get(f, set()) + + def states(self): + return [n for n in self._succ if n not in (self.src, self.sink) and self._label.get(n) is not None] + + def _pred_plus(self, n): + r = set(self._pred.get(n, set())) + if self._label.get(n) and self._label[n].endswith('+'): + r.add(n) + return r + + def _succ_plus(self, n): + r = set(self._succ.get(n, set())) + if self._label.get(n) and self._label[n].endswith('+'): + r.add(n) + return r + + def copy(self): + return copy.deepcopy(self) + + def accept(self, w): + cur = {self.src} + for sym in w: + nxt = set() + for s in cur: + for t in self._succ.get(s, set()): + if self._label.get(t) == sym: + nxt.add(t) + if not nxt: + return False + cur = nxt + return any(self.sink in self._succ.get(s, set()) for s in cur) + + def sink_reachable(self): + seen = set() + q = [self.src] + while q: + s = q.pop() + if s == self.sink: + return True + if s in seen: + continue + seen.add(s) + q.extend(self._succ.get(s, [])) + return False + + def num_non_special(self): + return sum(1 for n in self._succ if n not in (self.src, self.sink)) + + def is_final(self): + ns = self.states() + return len(ns) == 1 and self.has_edge(self.src, ns[0]) and self.has_edge(ns[0], self.sink) + + def expression(self): + if not self.is_final(): + return None + return self._label[self.states()[0]] + + def contract(self, r, s, new_label): + """ + State contraction G[r,s ⇒ t] (Definition 11, TODS 2010). + + (1) Add t as new state with label new_label. + (2) Every v ∈ Pred(r) − {r,s} → predecessor of t. + (3) Every w ∈ Succ(s) − {r,s} → successor of t. [matching figures] + (4) Loop t→t if r ∈ Succ(s). + (5) Remove r, s and all edges. + """ + t = self._new() + self._label[t] = new_label + for v in self._pred.get(r, set()) - {r, s}: + self.add_edge(v, t) + for v in self._pred.get(s, set()) - {r, s}: + self.add_edge(v, t) + for w in self._succ.get(r, set()) - {r, s}: + self.add_edge(t, w) + for w in self._succ.get(s, set()) - {r, s}: + self.add_edge(t, w) + if r in self._succ.get(s, set()): + self.add_edge(t, t) + self.rm_state(r) + self.rm_state(s) + return t + + def contract_single(self, r, new_label): + """Single-state substitution G[r ⇒ t] (Definition 11 note).""" + if r in (self.src, self.sink): + return r + t = self._new() + self._label[t] = new_label + for v in self._pred.get(r, set()) - {r}: + self.add_edge(v, t) + for w in self._succ.get(r, set()) - {r}: + self.add_edge(t, w) + if r in self._succ.get(r, set()): + self.add_edge(t, t) + self.rm_state(r) + return t + + def epsilon_closure(self): + """G* (Definition 25, TODS 2010). Add self-loops for + states and ε-transitive closure.""" + G = self.copy() + changed = True + while changed: + changed = False + for n in list(G._succ.keys()): + lab = G._label.get(n) + if lab and (lab.endswith('+') or lab.endswith('+?')): + if not G.has_edge(n, n): + G.add_edge(n, n) + changed = True + for n in list(G._succ.keys()): + for m in list(G._succ.get(n, set())): + mlab = G._label.get(m) + if mlab == 'ε': + for mp in list(G._succ.get(m, set())): + if mp != n and not G.has_edge(n, mp): + G.add_edge(n, mp) + changed = True + return G + + def __repr__(self): + return f"SOA(nodes={len(self._succ)}, special={self.num_non_special()})" diff --git a/bex/template.py b/bex/template.py new file mode 100644 index 0000000..b9ceb4e --- /dev/null +++ b/bex/template.py @@ -0,0 +1,154 @@ +""" +template — One-Shot YAML Template Generator. + +Wandelt den inferierten k-ORE/SORE/CHARE regulären Ausdruck zurück +in ein menschenlesbares YAML-Skelett für LLM-Prompts. + +Der Generator erzeugt: + - Ein YAML-Grundgerüst mit Platzhaltern + - Kommentare mit Kardinalitätshinweisen: + * # PFLICHT: Genau 1 mal erforderlich + * # PFLICHT: 1 oder mehrmals erforderlich + * # OPTIONAL: 0 oder 1 mal (darf weggelassen werden) + * # OPTIONAL: 0 oder mehrmals + * # WAHLWEISE: alternatives Modul +""" + + +def parse_expression(expr): + """Zerlegt einen regulären Ausdruck in seine Bestandteile.""" + if not expr or expr in ('∅', 'ε', ''): + return [('empty', 'ε')] + + tokens = [] + i = 0 + while i < len(expr): + if expr[i] == '(': + depth = 1 + j = i + 1 + while j < len(expr) and depth > 0: + if expr[j] == '(': + depth += 1 + elif expr[j] == ')': + depth -= 1 + j += 1 + group = expr[i:j] + quantifier = '' + if j < len(expr) and expr[j] in '*+?': + quantifier = expr[j] + j += 1 + tokens.append(('group', group, quantifier)) + i = j + elif expr[i] == '|': + tokens.append(('pipe', '|')) + i += 1 + elif expr[i] == '.': + if i + 1 < len(expr) and expr[i + 1] == '.': + tokens.append(('concat', '..')) + i += 2 + else: + tokens.append(('concat', '.')) + i += 1 + elif expr[i] in '*+?': + if tokens and tokens[-1][0] == 'name': + name, val, _ = tokens[-1] + tokens[-1] = (name, val, expr[i]) + i += 1 + elif expr[i].isalnum() or expr[i] in '/_-': + j = i + while j < len(expr) and (expr[j].isalnum() or expr[j] in '/_-'): + j += 1 + name = expr[i:j] + tokens.append(('name', name, '')) + i = j + else: + i += 1 + + return tokens + + +def format_prompt_cardinality(quantifier): + """Gibt die deutsche Kardinalitätsbeschreibung für einen Quantifier zurück.""" + mapping = { + '': '# PFLICHT: Genau 1 mal erforderlich', + '+': '# PFLICHT: 1 oder mehrmals erforderlich', + '*': '# OPTIONAL: 0 oder mehrmals', + '?': '# OPTIONAL: 0 oder 1 mal (darf weggelassen werden)', + } + return mapping.get(quantifier, '') + + +def generate_template(expr, context_key=None, include_header=True): + """ + Generiert ein YAML-One-Shot-Template aus einem regulären Ausdruck. + + Args: + expr: Der inferierte Ausdruck (String) + context_key: Name des YAML-Container-Keys (z.B. 'tasks') + include_header: Ob der Header-Teil (name, hosts) eingefügt wird + + Returns: + String: YAML-Skelett mit Platzhaltern und Kardinalitätskommentaren + """ + if not expr or expr in ('∅', 'ε'): + return "# Keine Struktur inferiert (leere Sequenzen oder keine Beispiele)" + + if include_header: + lines = [ + "- name: ", + " hosts: # PFLICHT: Genau 1 mal erforderlich", + ] + if context_key: + lines.append(f" {context_key}:") + else: + lines.append(" tasks:") + indent = " " + else: + lines = [] + if context_key: + lines.append(f" {context_key}: # Container-Kontext: {context_key}") + else: + lines.append(" tasks:") + indent = " " + + tokens = parse_expression(expr) + task_index = 0 + skip_until_pipe = False + + alternatives = [] + in_alternatives = False + + i = 0 + while i < len(tokens): + token = tokens[i] + + if token[0] == 'group': + group_str = token[1] + quantifier = token[2] + card = format_prompt_cardinality(quantifier) + inner_expr = group_str[1:-1] + if '|' in inner_expr: + alts = inner_expr.split('|') + lines.append(f"{indent}# WAHLWEISE (eines auswählen):") + for alt in alts: + alt_clean = alt.strip() + lines.append(f"{indent}# - {alt_clean}: ") + if card: + lines[-1] = f"{lines[-1]} {card}" + else: + lines.append(f"{indent}- {inner_expr}: {card}") + task_index += 1 + + elif token[0] == 'name': + name = token[1] + quantifier = token[2] + card = format_prompt_cardinality(quantifier) + lines.append(f"{indent}- {name}: {card}") + task_index += 1 + + elif token[0] == 'pipe': + pass + + i += 1 + + return '\n'.join(lines) + '\n' diff --git a/bex/tokenizer.py b/bex/tokenizer.py new file mode 100644 index 0000000..39ff1dd --- /dev/null +++ b/bex/tokenizer.py @@ -0,0 +1,194 @@ +""" +YAMLTokenizer — Extrahiert Token-Sequenzen aus Ansible YAML-Dateien. + +Nach Bex 2007/2010 wird jedes YAML-Dokument in eine Sequenz von Symbolen +(Token) übersetzt. Für Ansible: + - Ein Playbook → eine Sequenz von Module-Namen (apt, service, template, ...) + - include_tasks wird als terminaler Token behandelt (nicht rekursiv aufgelöst) + - block/rescue/always: Der block-Container selbst wird als Token erfasst, + der Inhalt wird NICHT tokenisiert (zu variabel laut Benutzer-Vorgabe) + +Die extrahierten Sequenzen dienen als Eingabe für die Automaten-Konstruktion. +""" + +import os +import yaml + + +# Module-Namen, die als strukturelle Token erfasst werden +# (basierend auf Analyse von 56+ Rollen im Projekt) +MODULE_TOKENS = { + 'apt', 'service', 'template', 'copy', 'file', 'command', 'shell', + 'get_url', 'uri', 'debug', 'set_fact', 'assert', 'wait_for', + 'include_tasks', 'import_tasks', 'import_playbook', + 'systemd', 'cron', 'user', 'authorized_key', 'group', + 'docker_container', 'docker_volume', 'docker_network', 'docker_image', + 'pip', 'npm', 'package', + 'lineinfile', 'replace', 'blockinfile', + 'stat', 'fetch', 'slurp', + 'meta', 'fail', 'pause', + 'unarchive', 'archive', + 'git', 'hg', + 'mysql_db', 'mysql_user', + 'postgresql_db', 'postgresql_user', + 'certificate', 'openssl', + 'known_hosts', + 'iptables', 'ufw', + 'mount', 'filesystem', + 'sysctl', + 'ini_file', + 'composer', + 'make', + 'configure', + 'npm', + 'composer', + 'pear', + 'pip', + 'gem', + 'cargo', +} + +def is_module_name(key): + return key in MODULE_TOKENS or (isinstance(key, str) and not key.startswith('_')) + +class YAMLTokenizer: + def __init__(self, resolve_includes=False): + self.resolve_includes = resolve_includes + self._token_counts = {} + + def tokenize_file(self, filepath): + with open(filepath) as f: + content = f.read() + return self.tokenize_string(content, source=filepath) + + def tokenize_string(self, content, source=''): + try: + data = yaml.safe_load(content) + except yaml.YAMLError as e: + return [] + if data is None: + return [] + return self._tokenize(data, source=source) + + def _tokenize(self, data, source='', depth=0): + if isinstance(data, list): + return self._tokenize_list(data, source, depth) + elif isinstance(data, dict): + return self._tokenize_dict(data, source, depth) + return [] + + def _tokenize_list(self, lst, source, depth): + tokens = [] + for item in lst: + if isinstance(item, dict): + tokens.extend(self._tokenize_dict(item, source, depth)) + elif isinstance(item, str): + tokens.append(item) + return tokens + + def _tokenize_dict(self, d, source, depth): + tokens = [] + + if 'tasks' in d or 'block' in d or 'pre_tasks' in d or 'post_tasks' in d: + task_key = next(k for k in ['pre_tasks', 'tasks', 'post_tasks', 'block'] if k in d) + if task_key == 'block': + tokens.append('block_start') + for item in d.get('block', []): + tokens.extend(self._tokenize_task(item, source, depth + 1)) + if 'rescue' in d: + tokens.append('rescue_start') + for item in d['rescue']: + tokens.extend(self._tokenize_task(item, source, depth + 1)) + tokens.append('rescue_end') + if 'always' in d: + tokens.append('always_start') + for item in d['always']: + tokens.extend(self._tokenize_task(item, source, depth + 1)) + tokens.append('always_end') + tokens.append('block_end') + else: + for item in d.get(task_key, []): + tokens.extend(self._tokenize_task(item, source, depth + 1)) + + elif 'hosts' in d: + tokens.append('play_start') + for item in d.get('tasks', []): + tokens.extend(self._tokenize_task(item, source, depth + 1)) + tokens.append('play_end') + + elif 'roles' in d: + for role in d.get('roles', []): + tokens.append(f"role:{role if isinstance(role, str) else list(role.keys())[0]}") + + elif 'handlers' in d: + tokens.append('handlers_start') + for item in d.get('handlers', []): + tokens.extend(self._tokenize_task(item, source, depth + 1)) + tokens.append('handlers_end') + + elif 'name' in d and not any(k in d for k in ['tasks', 'block', 'hosts']): + tokens.extend(self._tokenize_task(d, source, depth)) + + return tokens + + def _tokenize_task(self, task, source, depth): + if not isinstance(task, dict): + return [] + + tokens = [] + + if 'include_tasks' in task or 'import_tasks' in task: + key = 'include_tasks' if 'include_tasks' in task else 'import_tasks' + tokens.append(key) + if self.resolve_includes: + inc_path = task[key] + if not os.path.isabs(inc_path): + base = os.path.dirname(source) if source != '' else '.' + inc_path = os.path.join(base, inc_path) + if os.path.exists(inc_path): + tokens.extend(self.tokenize_file(inc_path)) + return tokens + + if 'import_playbook' in task: + tokens.append('import_playbook') + return tokens + + if 'block' in task: + tokens.append('block_start') + for item in task.get('block', []): + tokens.extend(self._tokenize_task(item, source, depth)) + if 'rescue' in task: + tokens.append('rescue_start') + for item in task['rescue']: + tokens.extend(self._tokenize_task(item, source, depth)) + tokens.append('rescue_end') + if 'always' in task: + tokens.append('always_start') + for item in task['always']: + tokens.extend(self._tokenize_task(item, source, depth)) + tokens.append('always_end') + tokens.append('block_end') + return tokens + + if 'name' in task: + module_name = None + for key in task: + if key == 'name': + continue + if is_module_name(key) and isinstance(task[key], (str, dict, list, bool, int)): + module_name = key + break + if module_name: + tokens.append(module_name) + self._token_counts[module_name] = self._token_counts.get(module_name, 0) + 1 + elif 'ansible.builtin' in str(task): + for key in task: + if '.' in str(key): + module_name = str(key).split('.')[-1] + tokens.append(module_name) + break + + return tokens + + def get_statistics(self): + return dict(sorted(self._token_counts.items(), key=lambda x: -x[1])) diff --git a/bex/twotinf.py b/bex/twotinf.py new file mode 100644 index 0000000..1c95994 --- /dev/null +++ b/bex/twotinf.py @@ -0,0 +1,35 @@ +"""2T-INF — Build SOA from 2-grams (Algorithm 1, TODS 2010).""" + +from .soa import SOA + + +def build_soa(sequences): + """ + |———— Algorithm 1: 2T-INF ————| + Input: finite set of sample strings S + Output: SOA G such that S ⊆ L(G) + + For each string a₁...aₙ in S: + add edges (src, a₁), (a₁, a₂), ..., (aₙ, sink) + """ + G = SOA() + symbol_states = {} + + for seq in sequences: + if not seq: + if not G.has_edge(G.src, G.sink): + G.add_edge(G.src, G.sink) + continue + for i, token in enumerate(seq): + if token not in symbol_states: + symbol_states[token] = G.add_state(token) + if i == 0: + G.add_edge(G.src, symbol_states[token]) + if i == len(seq) - 1: + G.add_edge(symbol_states[token], G.sink) + if i + 1 < len(seq): + nxt = seq[i + 1] + if nxt not in symbol_states: + symbol_states[nxt] = G.add_state(nxt) + G.add_edge(symbol_states[token], symbol_states[nxt]) + return G diff --git a/bex/yaml_to_seq.py b/bex/yaml_to_seq.py new file mode 100644 index 0000000..f8937b0 --- /dev/null +++ b/bex/yaml_to_seq.py @@ -0,0 +1,81 @@ +"""Convert YAML files to key-path sequences for BEX grammar inference.""" + +from pathlib import Path +import yaml + + +def yaml_to_keypath_sequence(data, prefix=""): + """Convert parsed YAML data to a sequence of key paths (DFS traversal). + + Each leaf (scalar) emits its full key path as a symbol. + Lists use a generic `[]` marker (no indices). + Values are NOT included — only key paths. + """ + seq = [] + if isinstance(data, dict): + for key, value in data.items(): + path = f"{prefix}.{key}" if prefix else key + if isinstance(value, (dict, list)): + seq.extend(yaml_to_keypath_sequence(value, path)) + else: + seq.append(path) + elif isinstance(data, list): + for item in data: + list_prefix = f"{prefix}[]" if prefix else "[]" + if isinstance(item, (dict, list)): + seq.extend(yaml_to_keypath_sequence(item, list_prefix)) + else: + seq.append(list_prefix) + return seq + + +def yaml_file_to_sequence(filepath): + """Load a YAML file and convert to a key-path sequence.""" + with open(filepath) as f: + data = yaml.safe_load(f) + if data is None: + return [] + return yaml_to_keypath_sequence(data) + + +def is_vault_file(filepath): + """Check if a file is an Ansible vault file (encrypted).""" + try: + with open(filepath) as f: + first = f.read(100) + return '$ANSIBLE_VAULT' in first or first.startswith('!vault |') + except Exception: + return False + + +def collect_all_sequences(root_dir=".", include_vault=False): + """Collect key-path sequences from all YAML files. + + Returns: + list of (filepath, sequence) tuples. + """ + results = [] + for path in sorted(Path(root_dir).rglob("*.yml")): + parts = path.parts + if any(d in parts for d in ('node_modules', '.venv', '__pycache__', '.git')): + continue + skippable = ('vault.yml' in path.name or 'vault' in path.name) + if not include_vault and (skippable or is_vault_file(path)): + continue + try: + seq = yaml_file_to_sequence(path) + if seq: + results.append((path, seq)) + except Exception as e: + print(f" SKIP {path}: {e}") + return results + + +def sequences_to_crx(result_list): + """Run CRX on collected sequences.""" + from .crx import CRX + sequences = [seq for _, seq in result_list] + if not sequences: + return 'ε' + crx = CRX() + return crx.infer(sequences) diff --git a/papers/paper_arxiv2010.txt b/papers/paper_arxiv2010.txt new file mode 100644 index 0000000..7e8e0af --- /dev/null +++ b/papers/paper_arxiv2010.txt @@ -0,0 +1,2210 @@ +arXiv:1004.2372v1 [cs.DB] 14 Apr 2010 + +Learning Deterministic Regular Expressions for the +Inference of Schemas from XML Data +GEERT JAN BEX, WOUTER GELADE, FRANK NEVEN +Hasselt University and Transnational University of Limburg +and +STIJN VANSUMMEREN +Université Libre de Bruxelles + +Inferring an appropriate DTD or XML Schema Definition (XSD) for a given collection of XML +documents essentially reduces to learning deterministic regular expressions from sets of positive +example words. Unfortunately, there is no algorithm capable of learning the complete class of +deterministic regular expressions from positive examples only, as we will show. The regular expressions occurring in practical DTDs and XSDs, however, are such that every alphabet symbol +occurs only a small number of times. As such, in practice it suffices to learn the subclass of +deterministic regular expressions in which each alphabet symbol occurs at most k times, for some +small k. We refer to such expressions as k-occurrence regular expressions (k-OREs for short). +Motivated by this observation, we provide a probabilistic algorithm that learns k-OREs for increasing values of k, and selects the deterministic one that best describes the sample based on a +Minimum Description Length argument. The effectiveness of the method is empirically validated +both on real world and synthetic data. Furthermore, the method is shown to be conservative over +the simpler classes of expressions considered in previous work. +Categories and Subject Descriptors: F.4.3 [Mathematical Logic and Formal Languages]: +Formal Languages; I.2.6 [Artificial Intelligence]: Learning; I.7.2 [Document and Text Processing]: Document Preparation +General Terms: Algorithms, Languages, Theory +Additional Key Words and Phrases: regular expressions, schema inference, XML + +1. + +INTRODUCTION + +Recent studies stipulate that schemas accompanying collections of XML documents +are sparse and erroneous in practice. Indeed, Barbosa et al. [2005] and Mignet et al. +[2003] have shown that approximately half of the XML documents available on the +web do not refer to a schema. In addition, Bex et al. [2004] and Martens et al. +[2006] have noted that about two-thirds of XML Schema Definitions (XSDs) gathered from schema repositories and from the web at large are not valid with respect +to the W3C XML Schema specification [Thompson et al. 2001], rendering them +A preliminary version of this article appeared in the 17th International World Wide Web Conference (WWW 2008). +Permission to make digital/hard copy of all or part of this material without fee for personal +or classroom use provided that the copies are not made or distributed for profit or commercial +advantage, the ACM copyright/server notice, the title of the publication, and its date appear, and +notice is given that copying is by permission of the ACM, Inc. To copy otherwise, to republish, +to post on servers, or to redistribute to lists requires prior specific permission and/or a fee. +c 2024 ACM 0000-0000/2024/0000-0001 $5.00 +ACM Journal Name, Vol. V, No. N, November 2024, Pages 1–31. + + 2 + +· + +Geert Jan Bex et al. + + + + + + +Fig. 1. + +An example DTD. + +essentially useless for immedidate application. A similar observation was made by +Sahuguet [2000] concerning Document Type Definitions (DTDs). Nevertheless, the +presence of a schema strongly facilitates optimization of XML processing (cf., e.g., +[Benedikt et al. 2005; Che et al. 2006; Du et al. 2004; Freire et al. 2002; Koch et al. +2004; Manolescu et al. 2001; Neven and Schwentick 2006]) and various software +development tools such as Castor [cas ] and SUN’s JAXB [jax ] rely on schemas +as well to perform object-relational mappings for persistence. Additionally, the +existence of schemas is imperative when integrating (meta) data through schema +matching [Rahm and Bernstein 2001] and in the area of generic model management [Bernstein 2003]. +Based on the above described benefits of schemas and their unavailability in +practice, it is essential to devise algorithms that can infer a DTD or XSD for a +given collection of XML documents when none, or no syntactically correct one, is +present. This is also acknowledged by Florescu [2005] who emphasizes that in the +context of data integration +“We need to extract good-quality schemas automatically from existing +data and perform incremental maintenance of the generated schemas.” +As illustrated in Figure 1, a DTD is essentially a mapping d from element names +to regular expressions over element names. An XML document is valid with respect +to the DTD if for every occurrence of an element name e in the document, the +word formed by its children belongs to the language of the corresponding regular +expression d(e). For instance, the DTD in Figure 1 requires each store element +to have zero or more order children, which must be followed by a stock element. +Likewise, each order must have a customer child, which must be followed by one +or more item elements. +To infer a DTD from a corpus of XML documents C it hence suffices to look, +for each element name e that occurs in a document in C, at the set of element +name words that occur below e in C, and to infer from this set the corresponding +regular expression d(e). As such, the inference of DTDs reduces to the inference +of regular expressions from sets of positive example words. To illustrate, from the +words id price, id qty supplier, and id qty item item appearing under +elements in a sample XML corpus, we could derive the rule +item → (id, price + (qty, (supplier + item+ ))). +Although XSDs are more expressive than DTDs, and although XSD inference is +therefore more involved than DTD inference, derivation of regular expressions remains one of the main building blocks on which XSD inference algorithms are built. +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +In fact, apart from also inferring atomic data types, systems like Trang [Clark ] and +XStruct [Hegewald et al. 2006] simply infer DTDs in XSD syntax. The more recent +iXSD algorithm [Bex et al. 2007] does infer true XSD schemas by first deriving a +regular expression for every context in which an element name appears, where the +context is determined by the path from the root to that element, and subsequently +reduces the number of contexts by merging similar ones. +So, the effectiveness of DTD or XSD schema inference algorithms is strongly +determined by the accuracy of the employed regular expression inference method. +The present article presents a method to reliably learn regular expressions that +are far more complex than the classes of expressions previously considered in the +literature. +1.1 + +Problem setting + +In particular, let Σ be a fixed set of alphabet symbols (also called element names), +and let Σ∗ be the set of all words over Σ. +Definition 1.1 (Regular Expressions). Regular expressions are derived by the following grammar. +r, s ::= ∅ | ε | a | r . s | r + s | r? | r+ +Here, parentheses may be added to avoid ambiguity; ε denotes the empty word; +a ranges over symbols in Σ; r . s denotes concatenation; r + s denotes disjunction; +r+ denotes one-or-more repetitions; and r? denotes the optional regular expression. +That is, the language L(r) accepted by regular expression r is given by: +L(∅) = ∅ +L(a) = {a} +L(r + s) = L(r) ∪ L(s) + +L(ε) = {ε} +L(r . s) = {vw | v ∈ L(r), w ∈ L(s)} +L(r+ ) = {v1 . . . vn | n ≥ 1 and v1 , . . . , vn ∈ L(r)} + +L(r?) = L(r) ∪ {ε}. +Note that the Kleene star operator (denoting zero or more repititions as in r∗ ) is +not allowed by the above syntax. This is not a restriction, since r∗ can always be +represented as (r+ )? or (r?)+ . Conversely, the latter can always be rewritten into +the former for presentation to the user. +The class of all regular expressions is actually too large for our purposes, as both +DTDs and XSDs require the regular expressions occurring in them to be deterministic (also sometimes called one-unambiguous [Brüggemann-Klein and Wood +1998]). Intuitively, a regular expression is deterministic if, without looking ahead +in the input word, it allows to match each symbol of that word uniquely against a +position in the expression when processing the input in one pass from left to right. +For instance, (a + b)∗ a is not deterministic as already the first symbol in the word +aaa could be matched by either the first or the second a in the expression. Without +lookahead, it is impossible to know which one to choose. The equivalent expression +b∗ a(b∗ a)∗ , on the other hand, is deterministic. +Definition 1.2. Formally, let r stand for the regular expression obtained from r +by replacing the ith occurrence of alphabet symbol a in r by a(i) , for every i and ++ ++ +a. For example, for r = b+ a(ba+ )? we have r = b(1) a(1) (b(2) a(2) )?. A regular +ACM Journal Name, Vol. V, No. N, November 2024. + +3 + + 4 + +· + +Geert Jan Bex et al. + +expression r is deterministic if there are no words wa(i) v and wa(j) v 0 in L(r) such +that i 6= j. +Equivalently, an expression is deterministic if the Glushkov construction [BrüggemanKlein 1993] translates it into a deterministic finite automaton rather than a nondeterministic one [Brüggemann-Klein and Wood 1998]. Not every non-deterministic +regular expression is equivalent to a deterministic one [Brüggemann-Klein and +Wood 1998]. Thus, semantically, the class of deterministic regular expressions +forms a strict subclass of the class of all regular expressions. +For the purpose of inferring DTDs and XSDs from XML data, we are hence in +search of an algorithm that, given enough sample words of a target deterministic +regular expression r, returns a deterministic expression r0 equivalent to r. In the +framework of learning in the limit [Gold 1967], such an algorithm is said to learn +the deterministic regular expressions from positive data. +Definition 1.3. Define a sample to be a finite subset of Σ∗ and let R be a subclass +of the regular expressions. An algorithm M mapping samples to expressions in R +learns R in the limit from positive data if (1) S ⊆ L(M (S)) for every sample S and +(2) to every r ∈ R we can associate a so-called characteristic sample Sr ⊆ L(r) such +that, for each sample S with Sr ⊆ S ⊆ L(r), M (S) is equivalent to r. +Intuitively, the first condition says that M must be sound ; the second that M +must be complete, given enough data. A class of regular expressions R is learnable +in the limit from positive data if an algorithm exists that learns R. For the class of +all regular expressions, it was shown by Gold that no such algorithm exists [Gold +1967]. We extend this result to the class of deterministic expressions: +Theorem 1.4. The class of deterministic regular expressions is not learnable in +the limit from positive data. +Proof. It was shown by Gold [1967, Theorem I.8], that any class of regular +expressions that contains all non-empty finite languages as well as at least one +infinite language is not learnable in the limit from positive data. Since deterministic +regular expressions like a∗ define an infinite language, it suffices to show that every +non-empty finite language is definable by a deterministic expression. Hereto, let +S be a finite, non-empty set of words. Now consider the prefix tree T for S. For +example, if S = {a, aab, abc, aac}, we have the following prefix tree: +a +a +b c + +b +c + +Nodes for which the path from the root to that node forms a word in S are marked +by double circles. In particular, all leaf nodes are marked. +By viewing the internal nodes in T with two or more children as disjunctions; +internal nodes in T with one child as conjunctions; and adding a question mark for +every marked internal node in T , it is straightforward to transform T into a regular +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +expression. For example, with S and T as above we get r = a .(b . c + a .(b + c))?. +Clearly, L(r) = S. Moreover, since no node in T has two edges with the same label, +r must be deterministic. +Theorem 1.4 immediately excludes the possibility for an algorithm to infer the +full class of DTDs or XSDs. In practice, however, regular expressions occurring +in DTDs and XSDs are concise rather than arbitrarily complex. Indeed, a study +of 819 DTDs and XSDs gathered from the Cover Pages [Cover 2003] (including +many high-quality XML standards) as well as from the web at large, reveals that +regular expressions occurring in practical schemas are such that every alphabet +symbol occurs only a small number of times [Martens et al. 2006]. In practice, +therefore, it suffices to learn the subclass of deterministic regular expressions in +which each alphabet symbol occurs at most k times, for some small k. We refer to +such expressions as k-occurrence regular expressions. +Definition 1.5. A regular expression is k-occurrence if every alphabet symbol +occurs at most k times in it. +For example, the expressions customer . order+ and (school + institute)+ are +both 1-occurrence, while id .(qty+id) is 2-occurrence (as id occurs twice). Observe +that if r is k-occurrence, then it is also l-occurrence for every l ≥ k. To simplify +notation in what follows, we abbreviate ‘k-occurrence regular expression’ by k-ORE +and also refer to the 1-OREs as ‘single occurrence regular expressions’ or SOREs. +1.2 + +Outline and Contributions + +Actually, the above mentioned examination shows that in the majority of the cases +k = 1. Motivated by that observation, we have studied and suggested practical +learning algorithms for the class of deterministic SOREs in a companion article [Bex +et al. 2006]. These algorithms, however, can only output SOREs even when the +target regular expression is not. In that case they always return an approximation +of the target expressions. It is therefore desirable to also have learning algorithms +for the class of deterministic k-OREs with k ≥ 2. Furthermore, since the exact +k-value for the target expression, although small, is unknown in a schema inference +setting, we also require an algorithm capable of determining the best value of k +automatically. +We begin our study of this problem in Section 3 by showing that, for each fixed k, +the class of deterministic k-OREs is learnable in the limit from positive examples +only. We also argue, however, that this theoretical algorithm is unlikely to work +well in practice as it does not provide a method to automatically determine the +best value of k and needs samples whose size can be exponential in the size of the +alphabet to successfully learn some target expressions. +In view of these observations, we provide in Section 4 the practical algorithm +iDRegEx. Given a sample of words S, iDRegEx derives corresponding deterministic k-OREs for increasing values of k and selects from these candidate expressions +the expression that describes S best. To determine the “best” expression we propose two measures: (1) a Language Size measure and (2) a Minimum Description +Length measure based on the work of Adriaans and Vitányi [2006]. The main technical contribution lies in the subroutine used to derive the actual k-OREs for S. +ACM Journal Name, Vol. V, No. N, November 2024. + +5 + + 6 + +· + +Geert Jan Bex et al. + +Indeed, while for the special case where k = 1 one can derive a k-ORE by first +learning an automaton A for S using the inference algorithm of Garcia and Vidal +[1990], and by subsequently translating A into a 1-ORE (as shown in [Bex et al. +2006]), this approach does not work when k ≥ 2. In particular, the algorithm of +Garcia and Vidal only works when learning languages that are “n-testable” for +some fixed natural number n [Garcia and Vidal 1990]. Although every language +definable by a 1-ORE is 2-testable [Bex et al. 2006], there are languages definable +by a 2-ORE, for instance a∗ ba∗ , that are not n-testable for any n. We therefore +use a probabilistic method based on Hidden Markov Models to learn an automaton +for S, which is subsequently translated into a k-ORE. +The effectiveness of iDRegEx is empirically validated in Section 5 both on real +world and synthetic data. We compare the results of iDRegEx with those of +the algorithm presented in previous work [Bex et al. 2008], to which we refer as +iDRegEx(rwr0 ). +2. + +RELATED WORK + +Semi-structured data. In the context of semi-structured data, the inference of +schemas as defined in [Buneman et al. 1997; Quass et al. 1996] has been extensively studied [Goldman and Widom 1997; Nestorov et al. 1998]. No methods were +provided to translate the inferred types to regular expressions, however. +DTD and XSD inference. In the context of DTD inference, Bex et al. [2006] +gave in earlier work two inference algorithms: one for learning 1-OREs and one for +learning the subclass of 1-OREs known as chain regular expressions. The latter +class can also be learned using Trang [Clark ], state of the art software written +by James Clark that is primarily intended as a translator between the schema +languages DTD, Relax NG [Clark and Murata 2001], and XSD, but also infers a +schema for a set of XML documents. In contrast, our goal in this article is to infer +the more general class of deterministic expressions. xtract [Garofalakis et al. +2003] is another regular expression learning system with similar goals. We note +that xtract also uses the Minimum Description Length principle to choose the +best expression from a set of candidates. +Other relevant DTD inference research is [Sankey and Wong 2001] and [Chidlovskii +2001] that learn finite automata but do not consider the translation to deterministic +regular expressions. Also, in [Young-Lai and Tompa 2000] a method is proposed to +infer DTDs through stochastic grammars where right-hand sides of rules are represented by probabilistic automata. No method is provided to transform these into +regular expressions. Although Ahonen [1996] proposes such a translation, the effectiveness of her algorithm is only illustrated by a single case study of a dictionary +example; no experimental study is provided. +Also relevant are the XSD inference systems [Bex et al. 2007; Clark ; Hegewald +et al. 2006] that, as already mentioned, rely on the same methods for learning +regular expressions as DTD inference. +Regular expression inference. Most of the learning of regular languages from +positive examples in the computational learning community is directed towards inference of automata as opposed to inference of regular expressions [Angluin and +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +Smith 1983; Pitt 1989; Sakakibara 1997]. However, these approaches learn strict +subclasses of the regular languages which are incomparable to the subclasses considered here. Some approaches to inference of regular expressions for restricted cases +have been considered. For instance, [Brāzma 1993] showed that regular expressions +without union can be approximately learned in polynomial time from a set of examples satisfying some criteria. [Fernau 2005] provided a learning algorithm for +regular expressions that are finite unions of pairwise left-aligned union-free regular +expressions. The development is purely theoretical, no experimental validation has +been performed. +HMM learning. Although there has been work on Hidden Markov Model structure induction [Rabiner 1989; Freitag and McCallum 2000], the requirement in our +setting that the resulting automaton is deterministic is, to the best of our knowledge, unique. +3. + +BASIC RESULTS + +In this section we establish that, in contrast to the class of all deterministic expressions, the subclass of deterministic k-OREs can theoretically be learned in the limit +from positive data, for each fixed k. We also argue, however, that this theoretical +algorithm is unlikely to work well in practice. +Let Σ(r) denote the set of alphabet symbols that occur in a regular expression +r, and let Σ(S) be similarly defined for a sample S. Define the length of a regular expression r as the length of it string representation, including operators and +parenthesis. For example, the length of (a . b)+ ? + c is 9. +Theorem 3.1. For every k there exists an algorithm M that learns the class of +deterministic k-OREs from positive data. Furthermore, on input S, M runs in +time polynomial in the size of S, yet exponential in k and |Σ(S)|. +Proof. The algorithm M is based on the following observations. First observe +that every deterministic k-ORE r over a finite alphabet A ⊆ Σ can be simplified +into an equivalent deterministic k-ORE r0 of length at most 10k|A| by rewriting r +according to the following system of rewrite rules until no more rule is applicable: +((s)) → (s) +s?? → s? +s + ε → s? +s.ε → s +ε? → ε +s+∅ → s +s.∅ → ∅ +∅? → ∅ + +s?+ → s+ ? +s++ → s+ +ε + s → s? +ε.s → s +ε+ → ε +∅+s → s +∅.s → ∅ +∅+ → ∅ + +(The first rewrite rule removes redundant parenthesis in r.) Indeed, since each +rewrite rule clearly preserves determinism and language equivalence, r0 must be a +deterministic expression equivalent to r. Moreover, since none of the rewrite rules +duplicates a subexpression and since r is a k-ORE, so is r0 . Now note that, since +ACM Journal Name, Vol. V, No. N, November 2024. + +7 + + 8 + +· + +Geert Jan Bex et al. + +no rewrite rule applies to it, r0 is either ∅, ε, or generated by the following grammar +t ::= a | a? | a+ | a+ ? | (a) | (a)? | (a)+ | (a)+ ? +| t1 . t2 | (t1 . t2 ) | (t1 . t2 )? | (t1 . t2 )+ | (t1 . t2 )+ ? +| t1 + t2 | (t1 + t2 ) | (t1 + t2 )? | (t1 + t2 )+ | (t1 + t2 )+ ? +It is not difficult to verify by structural induction that any expression t produced +by this grammar has length +X +|t| ≤ −4 + 10 +rep(t, a), +a∈Σ(t) + +where rep(t, a) denotes the number of times alphabet symbol a occurs in t. For +instance, rep(b .(b + c), a) = 0 and rep(b .(b + c), b) = 2. Since rep(r0 , a) ≤ k for +every a ∈ Σ(r0 ), it readily follows that |r0 | ≤ 10k|A| − 4 ≤ 10k|A|. +Then observe that all possible regular expressions over A of length at most 10k|A| +can be enumerated in time exponential in k|A|. Since checking whether a regular expression is deterministic is decidable in polynomial time [Brüggemann-Klein +and Wood 1998]; and since equivalence of deterministic expressions is decidable in +polynomial time [Brüggemann-Klein and Wood 1998], it follows by the above observations that for each k and each finite alphabet A ⊆ Σ it is possible to compute +in time exponential in k|A| a finite set RA of pairwise non-equivalent deterministic +k-OREs over A such that +—every r ∈ RA is of size at most 10k|A|; and +—for every deterministic k-ORE r over A there exists an equivalent expression +r0 ∈ RA . +(Note that since RA is computable in time exponential in k|A|, it has at most an +exponential number of elements in k|A|.) Now fix, for each finite A ⊆ Σ an arbitrary +order ≺ on RA , subject to the provision that r ≺ s only if L(s) − L(r) 6= ∅. Such +an order always exists since RA does not contain equivalent expressions. +Then let M be the algorithm that, upon sample S, computes RΣ(S) and outputs +the first (according to ≺) expression r ∈ RΣ(S) for which S ⊆ L(r). Since RΣ(S) can +be computed in time exponential in k|Σ(S)|; since there are at most an exponential +number of expressions in RΣ(S) ; since each expression r ∈ RΣ(S) has size at most +10k|Σ(S)|; and since checking membership in L(r) of a single word w ∈ S can be +done in time polynomial in the size of w and r, it follows that M runs in time +polynomial in S and exponential in k|Σ(S)|. +Furthermore, we claim that M learns the class of deterministic k-OREs. Clearly, +S ⊆ L(M (S)) by definition. Hence, it remains to show completeness, i.e., that we +can associate to each deterministic k-ORE r a sample Sr ⊆ L(r) such that, for each +sample S with Sr ⊆ S ⊆ L(r), M (S) is equivalent to r. Note that, by definition of +RΣ(r) , there exists a deterministic k-ORE r0 ∈ RΣ(r) equivalent to r. Initialize Sr +to an arbitrary finite subset of L(r) = L(r0 ) such that each alphabet symbol of r +occurs at least once in S, i.e., Σ(Sr ) = Σ(r). Let r1 ≺ · · · ≺ rn be all predecessors of +r0 in RΣ(r) according to ≺. By definition of ≺, there exists a word wi ∈ L(r)−L(ri ) +for every 1 ≤ i ≤ n. Add all of these words to Sr . Then clearly, for every sample S +with Sr ⊆ S ⊆ L(r) we have Σ(S) = Σ(r) and S 6⊆ L(ri ) for every 1 ≤ i ≤ n. Since +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +M (S) is the first expression in RΣ(r) with S ⊆ L(r), we hence have M (S) = r0 ≡ r, +as desired. +While Theorem 3.1 shows that the class of deterministic k-OREs is better suited +for learning from positive data than the complete class of deterministic expressions, +it does not provide a useful practical algorithm, for the following reasons. +(1) First and foremost, M runs in time exponential in the size of the alphabet Σ(S), +which may be problematic for the inference of schema’s with many element +names. +(2) Second, while Theorem 3.1 shows that the class of deterministic k-OREs is +learnable in the limit for each fixed k, the schema inference setting is such that +we do not know k a priori. If we overestimate k then M (S) risks being an underapproximation of the target expression r, especially when S is incomplete. +To illustrate, consider the 1-ORE target expression r = a+ b+ and sample +S = {ab, abbb, aabb}. If we overestimate k to, say, 2 instead of 1, then M is free +to output aa?b+ as a sound answer. On the other hand, if we underestimate k +then M (S) risks being an over-approximation of r. Consider, for instance, the +2-ORE target expression r = aa?b+ and the same sample S = {ab, abbb, aabb}. +If we underestimate k to be 1 instead of 2, then M can only output 1-OREs, +and needs to output at least a+ b+ in order to be sound. In summary: we need +a method to determine the most suitable value of k. +(3) Third, the notion of learning in the limit is a very liberal one: correct expressions need only be derived when sufficient data is provided, i.e., when the input +sample is a superset of the characteristic sample for the target expression r. +The following theorem shows that there are reasonably simple expressions r +such that characteristic sample Sr of any sound and complete learning algorithm is at least exponential in the size of r. As such, it is unlikely for any +sound and complete learning algorithm to behave well on real-world samples, +which are typically incomplete and hence unlikely to contain all words of the +characteristic sample. +Theorem 3.2. Let A = {a1 , . . . , an } ⊆ Σ consist of n distinct element names. +Let r1 = (a1 a2 + a3 + · · · + an )+ , and let r2 = (a2 + · · · + an )+ a1 (a2 + · · · + an )+ . +For any algorithm that learns the class of deterministic (2n +Pn+ 3)-OREs and any +sample S that is characteristic for r1 or r2 we have |S| ≥ i=1 (n − 2)i . +Proof. First consider r1 = (a1 a2 + a3 + · · · + an )+ . Observe that there exist +an exponential number of deterministic (2n + 3)-OREs that differ from r1 in only +a single word. Indeed, let B = A − {a1 , a2 } and let W consist of all non-empty +words w over B of length at most n. Define, for every word w = b1 . . . bm ∈ W the +deterministic (2n + 3)-ORE rw such that L(rw ) = L(r1 ) − {w} as follows. First, +i +that accepts all words in +define, for every 1 ≤ i ≤ m the deterministic 2-ORE rw +L(r1 ) that do not start with bi : +i +rw +:= (a1 a2 + (B − {bi })) .(a1 a2 + a3 + · · · + an )∗ + +Clearly, v ∈ L(r1 ) − {w} if, and only if, v ∈ L(r1 ) and there is some 0 ≤ i ≤ m +such that v agrees with w on the first i letters, but differs in the (i + 1)-th letter. +ACM Journal Name, Vol. V, No. N, November 2024. + +9 + + 10 + +· + +Geert Jan Bex et al. + +Hence, it suffices to take +1 +2 +3 +m +rw := rw ++ b1 (ε + rw ++ b2 (ε + rw ++ b3 (· · · + bm−1 (ε + rw ++ bm . r1 ) . . . ))) + +Now assume that algorithm M learns the class of deterministic (2n + 3)-OREs and +suppose that Sr1 is characteristic for r1 . In particular, Sr1 ⊆ L(r1 ). By definition, +M (S) is equivalent to r for every sample S with Sr1 ⊆ S ⊆ L(r1 ). We claim that +in order for M to have this property, W must be a subset +of Sr . Then, since W +Pn +contains all words over B of length at most n, |Sr1 | ≥ i=1 (n−2)i , as desired. The +intuitive argument why W must be a subset of Sr is that if there exists w in W −Sr , +then M cannot distinguish between r1 and rw . Indeed, suppose for the purpose +of contradiction that there is some w ∈ W with w 6∈ Sr1 . Then Sr1 is a subset of +L(rw ). Indeed, Sr1 = Sr1 − {w} ⊆ L(r1 ) − {w} = L(rw ). Furthermore, since M +learns the class of deterministic (2n + 3)-OREs, there must be some characteristic +sample Srw for rw . Now, consider the sample Sr1 ∪ Srw . It is included in both +L(r1 ) and L(rw ) and is a superset of both Sr1 and Srw . But then, by definition of +characteristic samples, M (Sr1 ∪ Srw ) must be equivalent to both r1 and rw . This +is absurd, however, since L(r1 ) 6= L(rw ) by construction. +A similar argument shows that the P +characteristic sample Sr2 of r2 = (a2 + · · · + +n +an )+ a1 (a2 + · · · + an )+ also requires i=1 (n − 2)i elements. In this case, we take +B = A − {a1 } and we take W to be the set of all non-empty words over B of +length at most n. For each w = b1 . . . bm ∈ W , we construct the deterministic +(2n + 3)-ORE rw such that L(rw ) accepts all words in L(r) that do not end with +i +be the 2-ORE that accepts all words in B + +a1 w, as follows. Let, for 1 ≤ i ≤ m, rw +that do not start with bi : +i +rw +:= (B − {bi }) . B ∗ + +Then it suffices to take +i +2 +m +rw := B + a1 (rw ++ b1 (ε + rw ++ b3 (· · · + bm−1 (ε + rw ++ bm B + ) . . . ))). + +A similar argument as for r1 then shows that the characteristic sample Sr2 of r2 +needs to contain, for +w ∈ W , at least one word of the form va1 w with v ∈ B + . +Peach +n +Therefore, |Sr2 | ≥ i=1 (n − 2)i , as desired. +4. + +THE LEARNING ALGORITHM + +In view of the observations made in Section 3, we present in this section a practical +learning algorithm that (1) works well on incomplete data and (2) automatically +determines the best value of k (see Section 5 for an experimental evaluation). Specifically, given a sample S, the algorithm derives deterministic k-OREs for increasing +values of k and selects from these candidate expressions the k-ORE that describes +S best. To determine the “best” expression we propose two measures: (1) a Language Size measure and (2) a Minimum Description Length measure based on the +work of Adriaans and Vitányi [2006]. +Our algorithm does not derive deterministic k-OREs for S directly, but uses, for +each fixed k, a probabilistic method to first learn an automaton for S, which is subsequently translated into a k-ORE. The following section (Section 4.1) explains how +the probabilistic method that learns an automaton from S works. Section 4.2 explains how the learned automaton is translated into a k-ORE. Finally, Section 4.3, +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +introduces the whole algorithm, together with the two measures to determine the +best candidate expression. +4.1 + +Probabilistically Learning a Deterministic Automaton + +In particular, the algorithm first learns a deterministic k-occurrence automaton +(deterministic k-OA) for S. This is a specific kind of finite state automaton in +which each alphabet symbol can occur at most k times. Figure 2(a) gives an +example. Note that in contrast to the classical definition of an automaton, no +edges are labeled: all incoming edges in a state s are assumed to be labeled by the +label of s. In other words, the 2-OA of Figure 2(a) accepts the same language as +aa?b+ . +Definition 4.1 (k-OA). An automaton is a node-labeled graph G = (V, E, lab) +where +—V is a finite set of nodes (also called states) with a distinguished source src ∈ V +and sink sink ∈ V ; +—the edge relation E is such that src has only outgoing edges; sink has only +incoming edges; and every state v ∈ V − {src, sink } is reachable by a walk from +src to sink ; +—lab : V − {src, sink } → Σ is the labeling function. +In this context, an accepting run for a word a1 . . . an is a walk src s1 . . . sn sink +from src to sink in G such that ai = lab(si ) for 1 ≤ i ≤ n. As usual, we denote +by L(G) the set of all words for which an accepting run exists. An automaton is +k-occurrence (a k-OA) if there are at most k states labeled by the same alphabet +symbol. If G uses only labels in A ⊆ Σ then G is an automaton over A. +In what follows, we write Succ(s) for the set {t | (s, t) ∈ E} of all direct successors +of state s in G, and Pred(s) for the set {t | (t, s) ∈ E} of all direct predecessors +of s in G. Furthermore, we write Succ(s, a) and Pred(s, a) for the set of states in +Succ(s) and Pred(s), respectively, that are labeled by a. As usual, an automaton G +is deterministic if Succ(s, a) contains at most one state, for every s ∈ V and a ∈ Σ. +For convenience, we will also refer to the 1-OAs as “single occurence automata” +or SOAs for short. +We learn a deterministic k-OA for a sample S as follows. First, recall from +Section 3 that Σ(S) is the set of alphabet symbols occurring in words in S. We view +S as the result of a stochastic process that generates words from Σ∗ by performing +random walks on the complete k-OA Ck over Σ(S). +Definition 4.2. Define the complete k-OA Ck over Σ(S) to be the k-OA G = +(V, E, lab) over Σ(S) in which each a ∈ Σ(S) labels exactly k states such that +—there is an edge from src to sink ; +—src is connected to exactly one state labeled by a, for every a ∈ Σ(S); and +—every state s ∈ V − {src, sink } has an outgoing edge to every other state except +src. +To illustrate, the complete 2-OA over {a, b} is shown in Figure 2(b). Clearly, +L(Ck ) = Σ(S)∗ . +ACM Journal Name, Vol. V, No. N, November 2024. + +11 + + 12 + +· + +Geert Jan Bex et al. + +a + +a + +b +(a) An example 2-OA. It accepts +the same language as aa?b+ +Fig. 2. + +a + +a + +b + +b + +(b) The complete +{a, b}. + +2-OA + +over + +Two 2-OAs. + +The stochastic process that generates words from Σ∗ by performing random walks +on Ck operates as follows. First, the process picks, among all states in Succ(src), +a state s1 with probability α(src, s1 ) and emits lab(s1 ). Then it picks, among +all states in Succ(s1 ) a state s2 with probability α(s1 , s2 ) and emits lab(s2 ). The +process continues moving to new states and emitting their labels until the final state +is reached (which does not emit a symbol). Of course, α must be a true probability +distribution, i.e., +X +α(s, t) ≥ 0; and +α(s, t) = 1 +(1) +t∈Succ(s) + +for all states s 6= sink and all states t. The probability of generating a particular +accepting run ~s = src s1 s2 . . . sn sink given the process P = (Ck , α) in this setting +is +P [~s | P] = α(src, s1 ) · α(s2 , s3 ) · α(s2 , s3 ) · · · α(sn , sink ), +and the probability of generating the word w = a1 . . . an is +X +P [w | P] = +P [~s | P]. +all accepting runs ~ +s of w in Ck + +Assuming independence, the probability of obtaining all words in the sample S is +then +Y +P [S | P] = +P [w | P]. +w∈S + +Clearly, the process that best explains the observation of S is the one in which the +probabilities α are such that they maximize P [S | P]. +To learn a deterministic k-OA for S we therefore first try to infer from S the +probability distribution α that maximizes P [S | P], and use this distribution to +determine the topology of the desired deterministic k-OA. In particular, we remove +from Ck the non-deterministic edges with the lowest probability as these are the +least likely to contribute to the generation of S, and are therefore the least likely +to be necessary for the acceptance of S. +The problem of inferring α from S is well-studied in Machine Learning, where +our stochastic process P corresponds to a particular kind of Hidden Markov Model +sometimes referred to as a Partially Observable Markov Model (POMM for short). +(For the readers familiar with Hidden Markov Models we note that the initial +state distribution π usually considered in Hidden Markov Models is absorbed in +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +Algorithm 1 iKoa +Require: a sample S, a value for k +Ensure: a deterministic k-OA G with S ⊆ L(G) +1: P ← init(k, S) +2: P ← BaumWelsh(P, S) +3: G ← Disambiguate(P, S) +4: G ← Prune(G, S) +5: return G +Algorithm 2 Disambiguate +Require: a POMM P = (G, α) and sample S +Ensure: a deterministic k-OA +1: Initialize queue Q to {s ∈ Succ(src) | α(src, s) > 0} +2: Initialize set of marked states D ← ∅ +3: while Q is non-empty do +4: +s ← first(Q) +5: +while some a ∈ Σ has | Succ(s, a)| > 1 do +0 +0 +6: +pick t ∈ Succ(s, +P a) with α(s, t) = max{α(s, t ) | t ∈ Succ(s, a)} +7: +set α(s, t) ← {α(s, t0 ) | t0 ∈ Succ(s, a)} +8: +for all t0 in Succ(s, a) \ {t} do +9: +delete edge (s, t0 ) from G +10: +set α(s, t0 ) ← 0 +11: +P ← BaumWelsh(P, S) +12: +if S 6⊆ L(G) then Fail +13: +add s to marked states D and pop s from Q +14: +enqueue all states in Succ(s) \ D to Q +15: return G +the state transition distribution α(src, ·) in our context.) Inference of α is generally +accomplished by the well-known Baum-Welsh algorithm [Rabiner 1989] that adjusts +initial values for α until a (possibly local) maximum is reached. +We use Baum-Welsh in our learning algorithm iKoa shown in Algorithm 1, which +operates as follows. In line 1, iKoa initializes the stochastic process P to the tuple +(Ck , α) where +—Ck is the complete k-OA over Σ(S); +—α(src, sink ) is the fraction of empty words in S; +—α(src, s) is the fraction of words in S that start with lab(s), for every s ∈ +Succ(src); and +—α(s, t) is chosen randomly for s 6= src, subject to the constraints in equation (1). +It is important to emphasize that, since we are trying to model a stochastic process, +multiple occurrences of the same word in S are important. A sample should therefore not be considered as a set in Algorithm 1, but as a bag. Line 2 then optimizes +the initial values of α using the Baum-Welsh algorithm. +With these probabilities in hand Disambiguate, shown in Algorithm 2, determines the topology of the desired deterministic k-OA for S. In a breadth-first +ACM Journal Name, Vol. V, No. N, November 2024. + +13 + + 14 + +· + +Geert Jan Bex et al. + +manner, it picks for each state s and each symbol a the state t ∈ Succ(s, a) with +the highest probability and deletes all other edges to states labeled by a. Line 7 +merely ensures that α continues to be a probability distribution after this removal +and line 11 adjusts α to the new topology. Line 12 is a sanity check that ensures +that we have not removed edges necessary to accept all words in S; Disambiguate +reports failure otherwise. The result of a successful run of Disambiguate is a +deterministic k-OA which nevertheless may have edges (s, t) for which there is no +witness in S (i.e., a word in S whose unique accepting run traverses (s, t)). The +function Prune in line 4 of iKoa removes all such edges. It also removes all states +s ∈ Succ(src) without a witness in S. Figure 3 illustrates a hypothetical run of +iKoa. +It should be noted that BaumWelsh, which iteratively refines α until a (possibly local) maximum is reached, is computationally quite expensive. For that +reason, our implementation only executes a fixed number of refinement iterations +of BaumWelsh in Line 11. Rather surprisingly, this cut-off actually improves the +precision of iDRegEx, as our experiments in Section 5 show, where it is discussed +in more detail. +4.2 + +Translating k-OAs into k-OREs + +Once we have learned a deterministic k-OA for a given sample S using iKoa +it remains to translate this k-OA into a deterministic k-ORE. An obvious approach in this respect would be to use the classical state elimination algorithm +(cf., e.g., [Hopcroft and Ullman 2007]). Unfortunately, as already hinted upon by +Fernau [2004; 2005] and as we illustrate below, it is very difficult to get concise +regular expressions from an automaton representation. For instance, the classical +state elimination algorithm applied to the SOA in Figure 4 yields the expression:1 +(aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + +aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ +(b + aa∗ b))∗ (aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d)))(aa∗ d + +(c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + aa∗ c)(c + +aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))∗ + +which is non-deterministic and differs quite a bit from the equivalent deterministic +SORE +((b?(a + c))+ d)+ e. +Actually, results by Ehrenfeucht and Zeiger [1976]; Gelade and Neven [2008]; and +Gruber and Holzer [2008] show that it is impossible in general to generate concise +regular expressions from automata: there are k-OAs (even for k = 1) for which the +number of occurrences of alphabet symbols in the smallest equivalent expression is +exponential in the size of the automaton. For such automata, an equivalent k-ORE +hence does not exist. +It is then natural to ask whether there is an algorithm that translates a given +k-OA into an equivalent k-ORE when such a k-ORE exists, and returns a k-ORE +super approximation of the input k-OA otherwise. Clearly, the above example +shows that the classical state elimination algorithm does not suffice for this purpose. +1 Transformation computed by JFLAP: www.jflap.org. + +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +α +src +a1 +a2 +b1 +b2 + +a1 + +a2 + +a1 + +a2 + +b1 + +b2 + +b1 + +b2 + +a1 +1 +0.2 +0.4 +0.1 +0.1 + +a2 +\ +0.3 +0.1 +0.3 +0.1 + +b1 +0 +0.3 +0.2 +0.3 +0.2 + +b2 +\ +0.1 +0.1 +0.2 +0.5 + +sink +0 +0.1 +0.2 +0.1 +0.1 + +α +src +a1 +a2 +b1 +b2 + +(a) Process P returned by init with random values for α. + +α +src +a1 +a2 +b1 +b2 + +a1 +1 +0 +0.01 +0.01 +0.01 + +a1 +1 +0.2 +0.01 +0.01 +0.01 + +a2 +\ +0.3 +0.01 +0.01 +0.01 + +b1 +0 +0.3 +0.6 +0.5 +0.33 + +(b) Process P after +BaumWelsh. + +first + +a1 + +a2 + +a1 + +a2 + +b1 + +b2 + +b1 + +b2 + +a2 +\ +0.5 +0.01 +0.01 +0.01 + +b1 +0 +0.49 +0.6 +0.5 +0.33 + +b2 +\ +0 +0.37 +0.28 +0.5 + +sink +0 +0.01 +0.01 +0.2 +0.15 + +α +src +a1 +a2 +b1 +b2 + +(c) Process P after first disambiguation step +(for a1 ). Edges to a1 and b2 are removed. + +a1 +1 +0 +0.01 +0.02 +0.01 + +a2 +\ +0.5 +0.01 +0 +0.01 + +b1 +0 +0.49 +0.6 +0.78 +0.38 + +a + +a + +b + +b + +b + +returned + +sink +0 +0.01 +0.01 +0.2 +0.15 + +training + +b2 +\ +0 +0.37 +0 +0.4 + +by + +sink +0 +0.01 +0.01 +0.2 +0.2 + +(d) Process P after second disambiguation step +(for b1 ). Edges to a2 and b2 are removed. + +a + +(e) Automaton +A +Disambiguate. + +b2 +\ +0.19 +0.37 +0.28 +0.5 + +· + +a + +(f) Automaton A returned by Prune. It +accepts the same language as aa?b+ . + +by + +Fig. 3. Example run of iKoa for k = 2 with target language aa?b+ . For the process +P in (c)-(f), the α values are listed in table-form. To distinguish different states +with the same label, we have indexed the labels. + +b + +a + +d + +c + +e + +Fig. 4. A SOA on which the classical state elimination algorithm returns a complicated expression. +ACM Journal Name, Vol. V, No. N, November 2024. + +15 + + 16 + +· + +Geert Jan Bex et al. +a(1) + +a(2) + +b(1) + +Fig. 5. + +An example marking + +For that reason, we have proposed in a companion article [Bex et al. ] a family +of algorithms {rwr, rwr21 , rwr22 , rwr23 , . . . } that translate SOAs into SOREs and +have exactly these properties: +Theorem 4.3 ([Bex et al. ]). Let G be a SOA and let T be any of the algorithms in the family {rwr, rwr21 , rwr22 , rwr23 , . . . }. If G is equivalent to a SORE +r, then T (G) returns a SORE equivalent to r. Otherwise, T (G) returns a SORE +that is a super approximation of G, L(G) ⊆ L(T (G)). +(Note that SOAs and SOREs are always deterministic by definition.) +These algorithms, in short, apply an inverse Glushkov translation. Starting from +a k-OA where each state is labeled by a symbol, they iteratively rewrite subautomata into equivalent regular expressions. In the end only one state remains and +the regular expression labeling this state is the output. +In this section, we show how the above algorithms can be used to translate k-OAs +into k-OREs. For simplicity of exposition, we will focus our discussion on rwr21 as +it is the concrete translation algorithm used in our experiments in Section 5, but +the same arguments apply to the other algorithms in the family. +Definition 4.4. First, let Σ(k) denote the alphabet that consists of k copies of +the symbols in Σ, where the first copy of a ∈ Σ is denoted by a(1) , the second by +a(2) , and so on: +Σ(k) := {a(i) | a ∈ Σ, 1 ≤ i ≤ k}. +Let strip be the function mapping copies to their original symbol, i.e., strip(a(i) ) = +a. We extend strip pointwise to words, languages, and regular expressions over +Σ(k) . +For example, strip({a(1) a(2) b(1) , a(2) a(2) c(2) }) = {aab, aac} and strip(a(1) . a(2) ? . ++ +b(1) ) = a . a? . b+ . +To see how we can use rwr21 , which translates SOAs into SOREs, to translate +a k-OA into a k-ORE, observe that we can always transform a k-OA G over Σ +into a SOA H over Σ(k) by processing the nodes of G in an arbitrary order and +replacing the ith occurrence of label a ∈ Σ by a(i) . To illustrate, the SOA over Σ(2) +obtained in this way from the 2-OA in Figure 2(a) is shown in Figure 5. Clearly, +L(G) = strip(L(H)). +Definition 4.5. We call a SOA H over Σ(k) obtained from a k-OA G in the above +manner a marking of G. +Note that, by Theorem 4.3, running rwr21 on H yields a SORE r over Σ(k) +with L(H) ⊆ L(r). For instance, with H as in Figure 5, rwr2 (H) returns r = +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +Algorithm 3 rwr2 +Require: a k-OA G +Ensure: a k-ORE r with L(G) ⊆ L(r) +1: compute a marking H of G. +2: return strip(rwr21 (H)) ++ + +a(1) . a(2) ? . b(1) . By subsequently stripping r, we always obtain a k-ORE over Σ. +Moreover, L(G) = strip(L(H)) ⊆ strip(L(r)) = L(strip(r)), so the k-ORE strip(r) +is always a super approximation of G. Algorithm 3, called rwr2 , summarizes the +translation. By our discussion, rwr2 is clearly sound: +Proposition 4.6. rwr2 (G) is a (possibly non-deterministic) k-ORE with L(G) ⊆ +L(rwr2 (G)), for every k-OA G. +Note, however, that even when G is deterministic and equivalent to a deterministic k-ORE r, rwr2 (G) need not be deterministic, nor equivalent to r. For instance, +consider the 2-OA G: +b + +a + +c + +b + +Clearly, G is equivalent to the deterministic 2-ORE bc?a(ba)+ ?. Now suppose for +the purpose of illustration that rwr2 constructs the following marking H of G. (It +does not matter which marking rwr2 constructs, they all result in the same final +expression.) +b(1) + +a(1) + +c(1) + +b(2) + +Since H is not equivalent to a SORE over Σ(k) , rwr21 (H) need not be equivalent +to L(H). In fact, rwr21 (H) returns ((b(1) c(1) ?a(1) )?b(2) ?)+ , which yields the nondeterministic ((bc?a)?b?)+ after stripping. Nevertheless, G is equivalent to the +deterministic 2-ORE bc?a(ba)+ ?. +So although rwr2 is always guaranteed to return a k-ORE, it does not provide +the same strong guarantees that rwr21 provides (Theorem 4.3). The following theorem shows, however, that if we can obtain G by applying the Glushkov construction +on r [Brüggeman-Klein 1993], rwr2 (G) is always equivalent to r. Moreover, if r +is deterministic, then so is rwr2 (G). So in this sense, rwr2 applies an inverse +Glushkov construction to r. Formally, the Glushkov construction is defined as +follows. +Definition 4.7. Let r be a k-ORE. Recall from Definition 1.2 that r is the regular +expression obtained from r by replacing the ith occurrence of alphabet symbol a +by a(i) , for every a ∈ Σ and every 1 ≤ i ≤ n. Let pos(r) denote the symbols in Σ(k) +that actually appear in r. Moreover, let the sets first(r), last(r), and follow (r, a(i) ) +be defined as shown in Figure 6. A k-OA G is a Glushkov translation of r if there +exists a one-to-one onto mapping ρ : (V (G) − {src, sink }) → pos(r) such that +ACM Journal Name, Vol. V, No. N, November 2024. + +17 + + 18 + +· + +Geert Jan Bex et al. +first(∅) +first(a(i) ) +first(r+ ) + += += += + +first(r . s) + += + +last(∅) +last(a(i) ) +last(r+ ) + += += += + +last(r . s) + += + +follow (a(i) , a(i) ) +follow (r?, a(i) ) + += += + +follow (r+ , a(i) ) + += + +follow (r + s, a(i) ) + += + +follow (r . s, a(i) ) + += + +Fig. 6. + +∅ +first(ε) +{a(i) } +first(r?) +first(r) +first(r + s) +( +first(r) +if ε ∈ +/ L(r), +first(r) ∪ first(s) otherwise. + += += += + +∅ +first(r) +first(r) ∪ first(s) + +∅ +{a(i) } +last(r) +( +last(s) +last(r) ∪ last(s) + += += += + +∅ +last(r) +last(r) ∪ last(s) + +last(ε) +last(r?) +last(r + s) +if ε ∈ +/ L(s), +otherwise. + +∅ +follow (r, a(i) ) +( +follow (r, a(i) ) +(i) +(follow (r, a ) ∪ first(r) +follow (r, a(i) ) +follow (s, a(i) ) + +(i) + +follow (r, a ) + +follow (r, a(i) ) ∪ first(s) + + +follow (s, a(i) ) + +if a(i) ∈ +/ last(r), +otherwise. +if a(i) ∈ pos(r), +otherwise. +if a(i) ∈ pos(r), a(i) ∈ +/ last(r), +if a(i) ∈ pos(r), a(i) ∈ last(r), +otherwise. + +Definition of first(r), last(r), and follow (r, a(i) ), for a(i) ∈ pos(r). + +(1) v ∈ Succ(src) ⇔ ρ(v) ∈ first(r); +(2) v ∈ Pred(sink ) ⇔ ρ(v) ∈ last(r); +(3) v ∈ Succ(w) ⇔ ρ(v) ∈ follow (r, ρ(w)); and +(4) strip(ρ(v)) = lab(v), +for all v, w ∈ V (G) − {src, sink }. +Theorem 4.8. If k-OA G is a Glushkov representation of a target k-ORE +r, then rwr2 (G) is equivalent to r. Moreover, if r is deterministic, then so is +rwr2 (G). +Proof. Since rwr2 (G) = strip(rwr21 (H)) for an arbitrarily chosen marking +H of G, it suffices to prove that strip(rwr21 (H)) is equivalent to r and that +strip(rwr21 (H)) is deterministic whenever r is deterministic, for every marking H +of G. Hereto, let H be an arbitrary but fixed marking of G. In particular, G and H +have the same set of nodes V and edges E, but differ in their labeling function. Let +lab G be the labeling function of G and let lab H the labeling function of H. Clearly, +lab G (v) = strip(lab H (v)) for every v ∈ V − {src, sink }. Since G is a Glushkov +translation of r, there is a one-to-one, onto mapping ρ : (V − {src, sink }) → pos(r) +satisfying properties (1)-(4) in Definition 4.7. Now let σ : pos(r) → Σ(k) be the +function that maps a(i) ∈ pos(r) to lab H (ρ−1 (a(i) )). Since lab H assigns a distinct +label to each state, σ is one-to-one and onto the subset of Σ(k) symbols used as +labels in H. Moreover, by property (4) and the fact that lab G (v) = strip(lab H (v)) +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +we have, +strip(a(i) ) = lab G (ρ−1 (a(i) )) = strip(lab H (ρ−1 (a(i) ))) = strip(σ(a(i) )) + +(?) + +(i) + +for each a ∈ pos(r). In other words, σ preserves (stripped) labels. Now let σ(r) +be the SORE obtained from r by replacing each a(i) ∈ pos(r) by σ(a(i) ). Since σ is +one-to-one and r is a SORE, so is σ(r). Moreover, we claim that L(H) = L(σ(r)). +Indeed, it is readily verified by induction on r that a word a1 (i1 ) . . . an (in ) ∈ L(r) +if, and only if, (i) a1 (i1 ) ∈ first(r); (ii) ap+1 (ip+1 ) ∈ follow (r, ap+1 (ip+1 ) ) for every +1 ≤ p < n; and (iii) an (in ) ∈ last(r). By properties (1)-(4) of Definition 4.7 we +hence obtain: +σ(a1 (i1 ) ) . . . σ(an (in ) ) ∈ L(σ(r)) +⇔ a1 (i1 ) . . . an (in ) ∈ L(r) +⇔ src, ρ−1 (a1 (i1 ) ), . . . , ρ−1 (an (in ) ), sink is a walk in G +⇔ src, ρ−1 (a1 (i1 ) ), . . . , ρ−1 (an (in ) ), sink is a walk in H +⇔ lab H (ρ−1 (a1 (i1 ) )) . . . , lab H (ρ−1 (an (in ) )) ∈ L(H) +⇔ σ(a1 (i1 ) ) . . . σ(an (in ) ) ∈ L(H) +Therefore, L(H) = L(σ(r)). +Hence, we have established that H is a SOA over Σ(k) equivalent to the SORE +σ(r) over Σ(k) . By Theorem 4.3, rwr21 (H) is hence equivalent to σ(r). Therefore, +strip(rwr21 (H)) is equivalent to strip(σ(r)), which by (?) above, is equivalent to +strip(r) = r, as desired. +Finally, to see that strip(rwr21 (H)) is deterministic if r is deterministic, let +s := strip(rwr21 (H)) and suppose for the purpose of contradiction that s is not +deterministic. Then there exists wa(i) v1 and wa(j) v2 in L(s) with i 6= j. It is +0 +0 +not hard to see that this can happen only if there exist w0 a(i ) v10 and w0 a(j ) v20 +in L(rwr21 (H)) with i0 6= j 0 . Since L(rwr21 (H)) = L(σ(r)) we know that hence +0 +0 +00 +0 +σ −1 (w0 a(i ) v10 ) ∈ L(r) and σ −1 (w0 a(j ) v20 ) ∈ L(r). Let w00 a(i ) v100 = σ −1 (w0 a(i ) v10 ) +00 +0 +and w00 a(j ) v200 = σ −1 (w0 a(i ) v20 ). Since σ is one-to-one and i0 6= j 0 , also i00 6= j 00 . +Therefore, r is not deterministic, which yields the desired contradiction. +4.3 + +The whole Algorithm + +Our deterministic regular expression inference algorithm iDRegEx combines iKoa +and rwr2 as shown in Algorithm 4. For increasing values of k until a maximum +kmax is reached, it first learns a deterministic k-OA G from the given sample S, +and subsequently translates that k-OA into a k-ORE using rwr2 . If the resulting +k-ORE is deterministic then it is added to the set C of deterministic candidate +expressions for S, otherwise it is discarded. From this set of candidate expressions, +iDRegEx returns the “best” regular expression best(C), which is determined according to one of the measures introduced below. Since it is well-known that, +depending on the initial value of α, BaumWelsh (and therefore iKoa) may converge to a local maximum that is not necessarily global, we apply iKoa a number +of times N with independently chosen random seed values for α to increase the +probability of correctly learning the target regular expression from S. +The observant reader may wonder whether we are always guaranteed to derive +at least one deterministic expression such that best(C) is defined. Indeed, Theorem 4.8 tells us that if we manage to learn from sample S a k-OA which is the +ACM Journal Name, Vol. V, No. N, November 2024. + +19 + + 20 + +· + +Geert Jan Bex et al. + +Algorithm 4 iDRegEx +Require: a sample S +Ensure: a k-ORE r +1: initialize candidate set C ← ∅ +2: for k = 1 to kmax do +3: +for n = 1 to N do +4: +G ← iKoa(S, k) +5: +if rwr2 (G) is deterministic then +6: +add rwr2 (G) to C +7: return best(C) +Glushkov representation of the target expression r, then rwr2 will always return +a deterministic k-ORE equivalent to r. When k > 1, there can be several k-OAs +representing the same language and we could therefore learn a non-Glushkov one. +In that case, rwr2 always returns a k-ORE which is a super approximation of the +target expression. Although that approximation can be non-deterministic, since we +derive k-OREs for increasing values of k and since for k = 1 the result of rwr2 is +always deterministic (as every SORE is deterministic), we always infer at least one +deterministic regular expression. In fact, in our experiments on 100 synthetic regular expressions, we derived for 96 of them a deterministic expression with k > 1, +and only for 4 expressions had to resort to a 1-ORE approximation. +4.3.1 A Language Size Measure for Determining the Best Candidate. Intuitively, +we want to select from C the simplest deterministic expression that “best” describes +S. Since each candidate expression in C accepts all words in S by construction, one +way to interpret “the best” is to select the expression that accepts the least number +of words (thereby adding the least number of words to S). Since an expression defines an infinite language in general, it is of course impossible to take all words into +account. We therefore only consider the words up to a length n, where n = 2m + 1 +with m the length of the candidate expression, excluding regular expression operators, ∅, and ε. For instance, if the candidate expression is a .(a + c+ )?, then m = 3 +and n = 7. Formally, for a language L, let |L≤n | denote the number of words in L +of length at most n. Then the best candidate in C is the one with the least value of +| L(r)≤n |. If there are multiple such candidates, we pick the shortest one (breaking +ties arbitrarily). It turns out that | L(r)≤n | can be computed quite efficiently; see +[Bex et al. ] for details. +4.3.2 A Minimum Description Length Measure for Determining the Best Candidate. An alternative measure to determine the best candidate is given by Adriaans +and Vitányi [2006], who compare the size of S with the size of the language of a +candidate r. Specifically, Adriaans and Vitányi define the data encoding cost of r +to be: + =i + +n +X +| L (r)| +datacost(r, S) := +2 · log2 i + log2 +, +|S =i | +i=0 +where n = 2m + 1 as before; |S =i | is the number of words in S that have length i; +and | L=i (r)| is the number of words in L(r) that have exactly length i. Although +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +the above formula is numerically difficult to compute, there is an easier estimation +procedure; see [Adriaans and Vitányi 2006] for details. +In this case, the model encoding cost is simply taken to be its length, thereby +preferring shorter expressions over longer ones. The best regular expression in the +candidate set C is then the one that minimizes both model and data encoding cost +(breaking ties arbitrarily). +We already mentioned that xtract [Garofalakis et al. 2003] also utilizes the +Minimum Description Length principle. However, their measure for data encoding +cost depends on the concrete structure of the regular expressions while ours only +depends on the language defined by them and is independent of the representation. +Therefore, in our setting, when two equivalent expressions are derived, the one with +the smallest model cost, that is, the simplest one, will always be taken. +5. + +EXPERIMENTS + +In this section we validate our approach by means of an experimental analysis. +Throughout the section, we say that a target k-ORE r is successfully derived when +a k-ORE s with L(r) = L(s) is generated. The success rate of our experiments +then is the percentage of successfully derived target regular expressions. +Our previous work [Bex et al. 2008] on this topic was based on a version of the +rwr0 algorithm [Bex et al. 2006], we refer to this algorithm as iDRegEx(rwr0 ). +Unfortunately, as detailed in [Bex et al. 2008], it is not known whether rwr0 is +complete on the class of all single occurrence regular expressions. Nevertheless, the +experiments in [Bex et al. 2008] which are revisited below show a good and reliable +performance. However, to obtain a theoretically complete algorithm, c.f.r. Theorem 4.8, we use the algorithm rwr2 which is sound and complete on single occurrence regular expressions. In the remainder we focus on iDRegEx, but compare +with the results for iDRegEx(rwr0 ). +As mentioned in Section 4.3.1, another new aspect of the results presented here is +the use of language size as an alternative measure over Minimum Description Length +(MDL) to compare candidates. The iDRegEx(rwr0 ) algorithm is only considered +with the MDL criterion. We note that for alphabet size 5, the success rate of +iDRegEx with the MDL criterion was only 21 %, while that of the language size +criterion is 98 %. The corpus used in this experiment is described in Section 5.3. +Therefore in the remainder of this section we only consider iDRegEx with the +language size criterion. +For all the experiments described below we take kmax = 4 and N = 10 in Algorithm 4. +5.1 + +Running times + +All experiments were performed using a prototype implementation of iDRegEx +and iDRegEx(rwr0 ) written in Java executed on Pentium M 2.0 GHz class machines equipped with 1GB RAM. For the BaumWelsh subroutine we have gratefully used Jean-Marc François’ Jahmm library [François 2006], which is a faithful +implementation of the algorithms described in Rabiner’s Hidden Markov Model tutorial [Rabiner 1989]. Since Jahmm strives for clarity rather than performance and +since only limited precautions are taken against underflows, our prototype should +be seen as a proof of concept rather than a polished product. In particular, underACM Journal Name, Vol. V, No. N, November 2024. + +21 + + 22 + +· + +Geert Jan Bex et al. + +flows currently limit us to target regular expressions whose total number of symbol +occurrences is at most 40. Here, the total number of symbol occurrences occ(r) of +a regular expression r is its length excluding the regular expression operators and +parenthesis. To illustrate, the total number of symbol occurrences in aa?b+ is 3. +Furthermore, the lack of optimization in Jahmm leads to average running times +ranging from 4 minutes for target expressions r with |Σ(r)| = 5 and occ(r) = 6 to +9 hours for targets expression with |Σ(r)| = 15 and occ(r) = 30. Running times for +iDRegEx and iDRegEx(rwr0 ) are similar. +As already mentioned in Section 4.3, one of the bottlenecks of iDRegEx is the application of BaumWelsh in Line 11 of Disambiguate (Algorithm 2). BaumWelsh +is an iterative procedure that is typically run until convergence, i.e., until the +computed probability distribution no longer change significantly. To improve the +running time, we only apply a fixed number ` of iteration steps when calling +BaumWelsh in Line 11 of Disambiguate. Experiments show that the running +time performance scales linear with ` as one expects, but, perhaps surprisingly, the +success rate improves as well for an optimal value of `. This optimal value for ` +depends on the alphabet size. These improved results can be explained as follows: +applying BaumWelsh in each disambiguation step until it converges guarantees +that the probability distribution for that step will have reached a local optimum. +However, we know that the search space for the algorithm contains many local optima, and that BaumWelsh is a local optimization algorithm, i.e., it will converge +to one of the local optima it can reach from its starting point by hill climbing. The +disambiguation procedure proceeds state by state, so fine tuning the probability +distribution for a disambiguation step may transform the search space so that certain local optima for the next iteration can no longer be reached by a local search +algorithm such as BaumWelsh. Table I shows the performance of the algorithm +for various number of BaumWelsh iterations ` for expressions of alphabet size 5, +10 and 15. These expressions are those described in Section 5.3. In this Table, +` = ∞ denotes the case where BaumWelsh is ran until convergence after each +disambiguation step. The Table illustrates that the success rate is actually higher +for small values of `. The running time performance gains increase rapidly with +the expressions’ alphabet size: for |Σ| = 5, we gain a factor of 3.5 (` = 2), for +|Σ| = 10, it is already a factor of 10 (` = 3) and for |Σ| = 15, we gain a factor +of 25 (` = 3). This brings the running time for the largest expressions we tested +down to 22 minutes, in contrast with 9 hours mentioned for iDRegEx(rwr0 ) and +iDRegEx. The algorithm with the optimal number of BaumWelsh steps in the +disambiguation process will be referred to as iDRegExfixed . In particular for small +alphabet sizes (|Σ| ≤ 7) we use ` = 2, for large alphabet size ` = 3 (|Σ| > 7). We +note that the alphabet size can easily be determined from the sample. +We should also note that Experience with Hidden Markov Model learning in bioinformatics [Finn et al. 2006] suggests that both the running time and the maximum +number of symbol occurrences that can be handled can be significantly improved +by moving to an industrial-strength BaumWelsh implementation. Our focus for +the rest of the section will therefore be on the precision of iDRegEx. +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data +` +1 +2 +3 +4 +∞ + +rate |Σ| = 5 +95 % +100 % +95 % +95 % +98 % + +rate |Σ| = 10 +80 % +75 % +84 % +77 % +75 % + +· + +rate |Σ| = 15 +40 % +50 % +60 % +50 % +50 % + +Table I. Success rate for a limited number of BaumWelsh iterations in the disambiguation procedure, ` = ∞ corresponds to iDRegEx, for ` = 1, . . . , 4 correspond to iDRegExfixed . + +5.2 + +Real-world target expressions and real-world samples + +We want to test how iDRegEx performs on real-world data. Since the number +of publicly available XML corpora with valid schemas is rather limited, we have +used as target expressions the 49 content models occurring in the XSD for XML +Schema Definitions [Thompson et al. 2001] and have drawn multiset samples for +these expressions from a large corpus of real-world XSDs harvested from the Cover +Pages [Cover 2003]. In other words, the goal of our first experiment is to derive, from +a corpus of XSD definitions, the regular expression content models in the schema +for XML Schema Definitions2 . As it turns out, the XSD regular expressions are all +single occurrence regular expressions. +The iDRegEx(rwr0 ) algorithm infers all these expressions correctly, showing +that it is conservative with respect to k since, as mentioned above, the algorithm +considers k values ranging from 1 to 4. In this setting, iDRegEx performs not +as well, deriving only 73 % of the regular expressions correctly. We note that for +each expression that was not derived exactly, always an expression was obtained +describing the input sample and which in addition is more specific than the target +expression. iDRegEx therefore seems to favor more specific regular expressions, +based on the available examples. +5.3 + +Synthetic target expressions + +Although the successful inference of the real-world expressions in Section 5.2 suggests that iDRegEx is applicable in real-world scenarios, we further test its behavior on a sizable and diverse set of regular expressions. Due to the lack of real-world +data, we have developed a synthetic regular expression generator that is parameterized for flexibility. +Synthetic expression generation. In particular, the occurrence of the regular +expression operators concatenation, disjunction (+), zero-or-one (?), zero-or-more +(∗ ), and one-or-more (+ ) in the generated expressions is determined by a userdefined probability distribution. We found that typical values yielding realistic +expressions are 1/10 for the unary operators and 7/20 for others. The alphabet +can be specified, as well as the number of times that each individual symbol should +occur. The maximum of these numbers determines the value k of the generated +k-ORE. +To ensure the validity of our experiments, we want to generate a wide range of +different expressions. To this end, we measure how much the language of a generated +2 This corpus was also used in [Bex et al. 2007] for XSD inference. + +ACM Journal Name, Vol. V, No. N, November 2024. + +23 + + 24 + +· + +Geert Jan Bex et al. + +((debab) + c)∗ a +((((c + b)b) + a)ca) + e + d +(((ea)∗ db) + b + a + c)+ +((b+ + c + e + d)aab)+ +((((eabh) + d + j + c + b)+ f ) + a + g + i)? +((((aa) + e)+ + c)b) + b + d +((((d + a)∗ eabcb) + c)a)? +((((ac) + b + d)eab) + c)∗ +(((((bab) + c)+ + e)?a) + d)+ +((((ecb)+ a) + b)+ + d + a)? +((bagbf eid) + c + a + j + h)∗ +((gdab) + a + i + c + j + e + f )+ hb +((h∗ cdf a) + j + e + g + b + i)∗ ab +((g + b + e + f + i + d)∗ aba) + h + j + c +((((h + b + c + j + f )+ + e)?aaidb) + g)? + +Fig. 7. + +(((((dbe)∗ cf ) + j)hac) + b + i)∗ gad +(((((ihaaj) + d)+ + g)b) + e + b + f + c)+ +(((ecgecd) + b + d + a + j + f )∗ ihaba)∗ +(l + c + d + m + n)∗ aojahbegcbf idke +(((c + b)ab) + d + i + a)+ + j + g + f + e + h +(((a?clf habgd) + b + n + o)iedjcem)∗ k +((a + k + f + c + m + e)+ bdieclbonjgda)∗ h +(((k?jghadf celif cjbhom)+ +b + g + a + e + i + n)+ + d)? +(((aedoadenhdbci) + h + k + m + j + g + b)∗ +f ccgelbif ja) +((a+ + f + d + o + g + n + h + c + b + j + i + e) +keacdlbm) +(((k + f + o + a + j)?edhldf hngicjmab)?cie)∗ bg +((((a?d)+ ba) + h + g + e + c)+ + j + i + b)?f + +A snapshot of the 100 generated expressions. + +expression overlaps with Σ∗ . The larger the overlap, the greater its language size +as defined in Section 4.3.1. +To ensure that the generated expressions do not impede readability by containing +redundant subexpressions (as in e.g., (a+ )+ ), the final step of our generator is to +syntactically simplify the generated expressions using the following straightforward +equivalences: +r∗ → r+ ? +r?? → r? +(r+ )+ → r+ +(r?)+ → r+ ? +(r1 · r2 ) · r3 → r1 · (r2 · r3 ) +r1 · (r2 · r3 ) → r1 · r2 · r3 +(r1 ? · r2 ?)? → r1 ? · r2 ? +(r1 + r2 ) + r3 → r1 + (r2 + r3 ) +r1 + (r2 + r3 ) → r1 + r2 + r3 +(r1 + r2+ )+ → (r1 + r2 )+ +(r1+ + r2+ ) → (r1 + r2 )+ +r1 + r2 ? → (r1 + r2 )? +Of course, the resulting expression is rejected if it is non-deterministic. +To obtain a diverse target set, we synthesized expressions with alphabet size 5 +(45 expressions), 10 (45 expressions), and 15 (10 expressions) with a variety of +symbol occurrences (k = 1, 2, 3). For each of the alphabet sizes, the expressions +were selected to cover language size ranging from 0 to 1. All in all, this yielded a +set of 100 deterministic target expressions. A snapshot is given in Figure 7. +Synthetic sample generation. For each of those 100 target expressions, we +generated synthetic samples by transforming the target expressions into stochastic +processes that perform random walks on the automata representing the expressions +(cf. Section 4). The probability distributions of these processes are derived from the +structure of the originating expression. In particular, each operand in a disjunction +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data +p + +r1 · · · rn + +p + +1 + +r1 + +1 + +··· + +1 + +rn + +· + +1 + +r1 +p/n +p + +r1 + · · · + rn + +1 + +1 +. +. +. +1 + +p/n +rn +p/2 +p +r? + +1 + +r +p/2 + +1 + +2/3 +p + +Fig. 8. + +r+ + +1 +p + +r +1/3 + +From a regular expression to a probabilistic automaton. + +is equally likely and the probability to have zero or one occurrences for the zeroor-one operator ? is 1/2 for each option. The probability to have n repetitions in +a one-or-more or zero-or-more operator (∗ and + ) is determined by the probability +that we choose to continue looping (2/3) or choose to leave the loop (1/3). The +latter values are based on observations of real-world corpora. Figure 8 illustrates +how we construct the desired stochastic process from a regular expression r: starting +from the following initial graph, +1 + +r + +1 + +we continue applying the rewrite rules shown until each internal node is an individual alphabet symbol. +Experiments on covering samples. Our first experiment is designed to test +how iDRegEx performs on samples that are at least large enough to cover the +target regular expression, in the following sense. +Definition 5.1. A sample S covers a deterministic automaton G if for every edge +(s, t) in G there is a word w ∈ S whose unique accepting run in G traverses (s, t). +Such a word w is called a witness for (s, t). A sample S covers a deterministic +regular expression r if it covers the automaton obtained from S using the Glushkov +construction for translating regular expressions into automata as defined in Definition 4.7. +Intuitively, if a sample does not cover a target regular expression r then there +will be parts of r that cannot be learned from S. In this sense, covering samples +are the minimal samples necessary to learn r. Note that such samples are far from +“complete” or “characteristic” in the sense of the theoretical framework of learning +in the limit, as some characteristic samples are bound to be of size exponential in +the size of r by Theorem 3.2, while samples of size at most quadratic in r suffice +to cover r. Indeed, the Glushkov construction always yields an automaton whose +number of states is bounded by the size of r. Therefore, this automaton can have +ACM Journal Name, Vol. V, No. N, November 2024. + +25 + + 26 + +· + +Geert Jan Bex et al. + +at most |r|2 edges, and hence |r|2 witness words suffice to cover r. +Table II shows how iDRegEx performs on covering samples, broken up by alphabet size of the target expressions. The size of the sample used is depicted as well. +The table demonstrates a remarkable precision. Out of a total of 100 expressions, +82 are derived exactly for iDRegEx. Although iDRegEx(rwr0 ) outperforms +iDRegEx with a success rate of 87 %, overall iDRegExfixed performs best with +89 %. The performance decreases with the alphabet size of the target expressions: +this is to be expected since the inference task’s complexity increases. It should +be emphasized that even if iDRegExfixed does not derive the target expression +exactly, it always yields an over-approximation, i.e., its language is a superset of +the target language. +Table III shows an alternative view on the results. It shows the success rate as a +function of the target expression’s language size, grouped in intervals. In particular, +it demonstrates that the method works well for all language sizes. +A final perspective is offered in Table IV which shows the success rate in function +of the average states per symbol κ for an expression. The latter quantity is defined +as the length of the regular expression excluding operators, divided by the alphabet size. For instance, for the expression a(a + b)+ cab, κ = 6/3 since its length +excluding operators is 6 and |Σ| = 3. It is clear that the learning task is harder +for increasing values of κ. To verify the latter, a few extra expressions with large κ +values were added to the target expressions. For the algorithm iDRegExfixed the +success rate is quite high for target expressions with a large value of κ. Conversely, +iDRegEx(rwr0 ) yields better results for κ < 1.6, while its success rate drops to +around 50 % for larger values of κ. This illustrates that neither iDRegEx(rwr0 ) +nor iDRegExfixed outperforms the other in all situations. +|Σ| +5 +10 +15 +total + +#regex +45 +45 +10 +100 + +iDRegEx(rwr0 ) +86 % +93 % +70 % +87 % + +iDRegEx +97 % +75 % +50 % +82 % + +iDRegExfixed +100 % +84 % +60 % +89 % + +|S| +300 +1000 +1500 + +Table II. Success rate on the target regular expressions and the sample size used per alphabet size +for the various algorithms. + +Density(r) +[0.0, 0.2[ +[0.2, 0.4[ +[0.4, 0.6[ +[0.6, 0.8[ +[0.8, 1.0] +Table III. + +#regex +24 +22 +20 +22 +12 + +iDRegEx(rwr0 ) +100 % +82 % +90 % +95 % +83 % + +iDRegEx +87 % +91 % +75 % +72 % +78 % + +iDRegExfixed +96 % +91 % +85 % +83 % +78 % + +Success rate on the target regular expressions, grouped by language size. + +It is also interesting to note that iDRegEx successfully derived the regular expression r1 = (a1 a2 + a3 + · · · + an )+ of Theorem 3.2 for n = 8, n = 10, and n = 12 +from covering samples of size 500, 800, and 1100, respectively. This is quite surprising considering that the characteristic samples for these expressions was proven to +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data +κ +[1.2, 1.4[ +[1.4, 1.6[ +[1.6, 1.8[ +[1.8, 2.0[ +[2.0, 2.5[ +[2.5, 3.0] + +#regex +29 +37 +24 +11 +12 +18 + +iDRegEx(rwr0 ) +96 % +100 % +91 % +54 % +41 % +66 % + +iDRegEx +72 % +89 % +92 % +91 % +50 % +71 % + +· + +iDRegExfixed +83 % +89 % +100 % +100 % +50 % +78 % + +Table IV. Success rate on the target regular expressions, grouped by κ, the average number of +states per symbol. + +be of size at least (n − 2)!, i.e., 720, 40320, and 3628800 respectively. The regular +expression r2 = (Σ \ a1 )+ a1 (Σ \ a1 )+ , in contrast, was not derivable by iDRegEx +from small samples. +Experiments on partially covering samples. Unfortunately, samples to learn +regular expressions from are often smaller than one would prefer. In an extreme, but +not uncommon case, the sample does not even entirely cover the target expression. +In this section we therefore test how iDRegEx performs on such samples. +Definition 5.2. The coverage of a target regular expression r by a sample S is +defined as the fraction of transitions in the corresponding Glushkov automaton for +r that have at least one witness in S. +Note that to successfully learn r from a partially covering sample, iDRegEx +needs to “guess” the edges for which there is no witness in S. This guessing capability is built into iDRegEx(rwr0 ) and iDRegEx in the form of repair rules [Bex +et al. 2006; Bex et al. 2008]. Our experiments show that for target expressions +with alphabet size |Σ| = 10, this is highly effective for iDRegEx(rwr0 ): even at a +coverage of 70%, half the target expressions can still be learned correctly as Table V +shows. The algorithm iDRegEx is performing very poorly in this setting, being +only successful occasionally for coverages close to 100 %. iDRegExfixed performs +better, although not as well as iDRegEx(rwr0 ). This again illustrates that both +algorithms have their merits. +coverage +1.0 +0.9 +0.8 +0.7 +0.6 + +iDRegEx(rwr0 ) +100 % +64 % +60 % +52 % +0% + +iDRegEx +80 % +20 % +0% +0% +0% + +iDRegExfixed +80 % +60 % +40 % +0% +0% + +Table V. Success rate for 25 target expressions for |Σ| = 10 for samples that provide partial +coverage of the target expressions. + +We also experimented with target expressions with alphabet size |Σ| = 5. In this +case, the results were not very promising for iDRegEx(rwr0 ), but as Table VI +illustrates, iDRegEx and iDRegExfixed performs better, on par with the target +expressions for |Σ| = 10 in the case of iDRegExfixed . This is interesting since +the absolute amount of information missing for smaller regular expressions is larger +than in the case of larger expressions. +ACM Journal Name, Vol. V, No. N, November 2024. + +27 + + 28 + +· + +Geert Jan Bex et al. +coverage +1.0 +0.9 +0.8 +0.7 +0.6 +0.5 + +Table VI. + +6. + +iDRegEx(rwr0 ) +100 % +25 % +16 % +8% +8% +0% + +iDRegEx +100 % +75 % +75 % +25 % +25 % +8% + +iDRegExfixed +100 % +66 % +41 % +33 % +17 % +17 % + +Success rate for 12 target expressions for |Σ| = 5 with partially covering samples. + +CONCLUSIONS + +We presented the algorithm iDRegEx for inferring a deterministic regular expression from a sample of words. Motivated by regular expressions occurring in practice, +we use a novel measure based on the number k of occurrences of the same alphabet +symbol and derive expressions for increasing values of k. We demonstrated the +remarkable effectiveness of iDRegEx on a large corpus of real-world and synthetic +regular expressions of different densities. +Our experiments show that iDRegEx(rwr0 ) performs better than iDRegEx +for target expressions with a κ < 1.6 and vice versa for larger values of κ. For +partially covering samples, iDRegEx(rwr0 ) is more robust than iDRegEx. As κ +values and sample coverage are not known in advance, it makes sense to run both +algorithms and select the smallest expression or the one with the smallest language +size, depending on the application at hand. +Some questions need further attention. First, in our experiments, iDRegEx +always derived the correct expression or a super-approximation of the target expression. It remains to investigate for which kind of input samples this behavior +can be formally proved. Second, it would also be interesting to characterize precisely which classes of expressions can be learned with our method. Although the +parameter κ explains this to some extend, we probably need more fine grained +measures. A last and obvious goal for future work is to speed up the inference of +the probabilistic automaton which forms the bottleneck of the proposed algorithm. +A possibility is to use an industrial strength implementation of the Baum-Welsh +algorithm as in [Finn et al. 2006] rather than a straightforward one or to explore +different methods for learning probabilistic automata. +Although iDRegEx can be directly plugged into the XSD inference engine iXSD +of [Bex et al. 2007], it would be interesting to investigate how to extend these +techniques to the more robust class of Relax NG schemas [Clark and Murata 2001]. +REFERENCES +Castor. www.castor.org. +SUN Microsystems JAXB. java.sun.com/webservices/jaxb. +Adriaans, P. and Vitányi, P. 2006. The Power and Perils of MDL. +Ahonen, H. 1996. Generating Grammars for structured documents using grammatical inference +methods. Report A-1996-4, Department of Computer Science, University of Finland. +Angluin, D. and Smith, C. H. 1983. Inductive Inference: Theory and Methods. ACM Computing +Surveys 15, 3, 237–269. +Barbosa, D., Mignet, L., and Veltri, P. 2005. Studying the XML Web: gathering statistics +from an XML sample. World Wide Web 8, 4, 413–438. +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +Benedikt, M., Fan, W., and Geerts, F. 2005. XPath satisfiability in the presence of DTDs. In +Proceedings of the Twenty-fourth ACM SIGACT-SIGMOD-SIGART Symposium on Principles +of Database Systems. 25–36. +Bernstein, P. A. 2003. Applying Model Management to Classical Meta Data Problems. In First +Biennial Conference on Innovative Data Systems Research. +Bex, G., Neven, F., Schwentick, T., and Vansummeren, S. Inference of Concise Regular +Expressions and DTDs. ACM TODS . To Appear. +Bex, G. J., Gelade, W., Neven, F., and Vansummeren, S. 2008. Learning deterministic regular +expressions for the inference of schemas from XML data. In WWW. Beijing, China, 825–834. +Accepted for WWW 2008. +Bex, G. J., Neven, F., Schwentick, T., and Tuyls, K. 2006. Inference of concise DTDs from +XML data. In Proceedings of the 32nd International Conference on Very Large Data Bases. +115–126. +Bex, G. J., Neven, F., Schwentick, T., and Vansummeren, S. 2008. Inference of Concise +Regular Expressions and DTDs. submitted to VLDB Journal. +Bex, G. J., Neven, F., and Van den Bussche, J. 2004. DTDs versus XML Schema: a practical +study. In Proceedings of the 7th International Workshop on the Web and Databases. 79–84. +Bex, G. J., Neven, F., and Vansummeren, S. 2007. Inferring XML Schema Definitions from +XML data. In Proceedings of the 33rd International Conference on Very Large Databases. +998–1009. +Brāzma, A. 1993. Efficient identification of regular expressions from representative examples. +In Proceedings of the 6th Annual ACM Conference on Computational Learning Theory. ACM +Press, 236–242. +Brüggeman-Klein, A. 1993. Regular expressions into finite automata. Theoretical Computer +Science 120, 2, 197–213. +Brüggemann-Klein, A. and Wood, D. 1998. One-unambiguous regular languages. Information +and computation 140, 2, 229–253. +Buneman, P., Davidson, S. B., Fernandez, M. F., and Suciu, D. 1997. Adding structure to +unstructured data. In Database Theory - ICDT ’97, 6th International Conference, F. N. Afrati +and P. G. Kolaitis, Eds. Lecture Notes in Computer Science, vol. 1186. Springer, 336–350. +Che, D., Aberer, K., and Özsu, M. T. 2006. Query optimization in XML structured-document +databases. VLDB Journal 15, 3, 263–289. +Chidlovskii, B. 2001. Schema extraction from XML: a grammatical inference approach. In +Proceedings of the 8th International Workshop on Knowledge Representation meets Databases. +Clark, J. Trang: Multi-format schema converter based on RELAX NG. http://www. +thaiopensource.com/relaxng/trang.html. +Clark, J. and Murata, M. 2001. RELAX NG Specification. OASIS. +Cover, R. 2003. The Cover Pages. http://xml.coverpages.org/. +Du, F., Amer-Yahia, S., and Freire, J. 2004. ShreX: Managing XML Documents in Relational +Databases. In Proceedings of the 30th International Conference on Very Large Data Bases. +1297–1300. +Ehrenfeucht, A. and Zeiger, P. 1976. Complexity measures for regular expressions. Journal +of computer and system sciences 12, 134–146. +Fernau, H. 2004. Extracting minimum length Document Type Definitions is NP-hard. In ICGI. +277–278. +Fernau, H. 2005. Algorithms for Learning Regular Expressions. In Algorithmic Learning Theory, +16th International Conference. 297–311. +Finn, R., Mistry, J., Schuster-Bckler, B., Griffiths-Jones, S., et al. 2006. Pfam: clans, +web tools and services. Nucleic Acids Research 34, D247–D251. +Florescu, D. 2005. Managing semi-structured data. ACM Queue 3, 8 (October). +François, J.-M. 2006. Jahmm. http://www.run.montefiore.ulg.ac.be/~francois/software/ +jahmm/. +ACM Journal Name, Vol. V, No. N, November 2024. + +29 + + 30 + +· + +Geert Jan Bex et al. + +Freire, J., Haritsa, J. R., Ramanath, M., Roy, P., and Siméon, J. 2002. StatiX: making XML +count. In SIGMOD Conference. 181–191. +Freitag, D. and McCallum, A. 2000. Information Extraction with HMM Structures Learned +by Stochastic Optimization. In AAAI/IAAI. AAAI Press / The MIT Press, 584–589. +Garcia, P. and Vidal, E. 1990. Inference of k-testable languages in the strict sense and application to syntactic pattern recognition. IEEE Transactions on Pattern Analysis and Machine +Intelligence 12, 9 (September), 920–925. +Garofalakis, M., Gionis, A., Rastogi, R., Seshadri, S., and Shim, K. 2003. XTRACT: learning document type descriptors from XML document collections. Data mining and knowledge +discovery 7, 23–56. +Gelade, W. and Neven, F. 2008. Succinctness of the Complement and Intersection of Regular +Expressions. In STACS. 325–336. +Gold, E. 1967. Language identification in the limit. Information and Control 10, 5 (May), +447–474. +Goldman, R. and Widom, J. 1997. DataGuides: Enabling Query Formulation and Optimization +in Semistructured Databases. In Proceedings of 23rd International Conference on Very Large +Data Bases. 436–445. +Gruber, H. and Holzer, M. 2008. Finite Automata, Digraph Connectivity, and Regular Expression Size. In ICALP (2). 39–50. +Hegewald, J., Naumann, F., and Weis, M. 2006. XStruct: efficient schema extraction from +multiple and large XML documents. In ICDE Workshops. 81. +Hopcroft, J. and Ullman, J. 2007. Introduction to automata theory, languages and computation. Addison-Wesley, Reading, MA. +Koch, C., Scherzinger, S., Schweikardt, N., and Stegmaier, B. 2004. Schema-based scheduling of event processors and buffer minimization for queries on structured data streams. In +Proceedings of the 30th International Conference on Very Large Data Bases. 228–239. +Manolescu, I., Florescu, D., and Kossmann, D. 2001. Answering XML Queries on Heterogeneous Data Sources. In Proceedings of 27th International Conference on Very Large Data +Bases. 241–250. +Martens, W., Neven, F., Schwentick, T., and Bex, G. J. 2006. Expressiveness and Complexity +of XML Schema. ACM Transactions on Database Systems 31, 3, 770–813. +Mignet, L., Barbosa, D., and Veltri, P. 2003. The XML web: a first study. In Proceedings of +the 12th International World Wide Web Conference. Budapest, Hungary, 500–510. +Nestorov, S., Abiteboul, S., and Motwani, R. 1998. Extracting Schema from Semistructured +Data. In International Conference on Management of Data. ACM Press, 295–306. +Neven, F. and Schwentick, T. 2006. On the complexity of XPath containment in the presence +of disjunction, DTDs, and variables. Logical Methods in Computer Science 2, 3. +Pitt, L. 1989. Inductive Inference, DFAs, and Computational Complexity. In Proceedings of +the International Workshop on Analogical and Inductive Inference, K. P. Jantke, Ed. Lecture +Notes in Computer Science, vol. 397. Springer-Verlag, 18–44. +Quass, D., Widom, J., Goldman, R., et al. 1996. LORE: a Lightweight Object REpository for +semistructured data. In Proceedings of the 1996 ACM SIGMOD International Conference on +Management of Data. 549. +Rabiner, L. 1989. A tutorial on Hidden Markov Models and selected applications in speech +recognition. Proc. IEEE 77, 2, 257–286. +Rahm, E. and Bernstein, P. A. 2001. A survey of approaches to automatic schema matching. +VLDB Journal 10, 4, 334–350. +Sahuguet, A. 2000. Everything You Ever Wanted to Know About DTDs, But Were Afraid to Ask +(Extended Abstract). In The World Wide Web and Databases, 3rd International Workshop, +D. Suciu and G. Vossen, Eds. Lecture Notes in Computer Science, vol. 1997. Springer, 171–183. +Sakakibara, Y. 1997. Recent advances of grammatical inference. Theoretical Computer Science 185, 1, 15–45. +ACM Journal Name, Vol. V, No. N, November 2024. + + Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data + +· + +Sankey, J. and Wong, R. K. 2001. Structural inference for semistructured data. In Proceedings +of the 10th international conference on Information and knowledge management. ACM Press, +159–166. +Thompson, H., Beech, D., Maloney, M., and Mendelsohn, N. 2001. XML Schema part 1: +structures. W3C. +Young-Lai, M. and Tompa, F. W. 2000. Stochastic Grammatical Inference of Text Database +Structure. Machine Learning 40, 2, 111–137. + +Received Month Year; revised Month Year; accepted Month Year + +ACM Journal Name, Vol. V, No. N, November 2024. + +31 + + \ No newline at end of file diff --git a/papers/paper_tods2010.txt b/papers/paper_tods2010.txt new file mode 100644 index 0000000..7822b57 --- /dev/null +++ b/papers/paper_tods2010.txt @@ -0,0 +1,2492 @@ +Inference of Concise Regular Expressions +and DTDs +GEERT JAN BEX and FRANK NEVEN +Hasselt University and Transnational University of Limburg +THOMAS SCHWENTICK +Dortmund University +and +STIJN VANSUMMEREN +Université Libre de Bruxelles + +We consider the problem of inferring a concise Document Type Definition (DTD) for a given set +of XML-documents, a problem that basically reduces to learning concise regular expressions from +positive examples strings. We identify two classes of concise regular expressions—the single occurrence regular expressions (SOREs) and the chain regular expressions (CHAREs)—that capture the +far majority of expressions used in practical DTDs. For the inference of SOREs we present several +algorithms that first infer an automaton for a given set of example strings and then translate that +automaton to a corresponding SORE, possibly repairing the automaton when no equivalent SORE +can be found. In the process, we introduce a novel automaton to regular expression rewrite technique which is of independent interest. When only a very small amount of XML data is available, +however (for instance when the data is generated by Web service requests or by answers to queries), +these algorithms produce regular expressions that are too specific. Therefore, we introduce a novel +learning algorithm CRX that directly infers CHAREs (which form a subclass of SOREs) without +going through an automaton representation. We show that CRX performs very well within its target +class on very small datasets. + +This research was done while S. Vansummeren was a Postdoctoral Fellow of the Research +Foundation-Flanders (FWO) at Hasselt University. +This work was funded by FWO-G.0821.09N and the Future and Emerging Technologies (FET) +programme within the Seventh Framework Programme for Research of the European Commision, +under the FET-Open grant agreement FOX, number FP7-ICT-233599. +Authors’ addresses: G. J. Bex and F. Neven, Database and Theoretical Computer Science Research Group, Hasselt University and Transnational University of Limburg, Agoralaan, gebouw D, +B-3590 Diepenbeek Belgium; email: {geertjan.bex, frank.neven}@uhasselt.be; T. Schwentick, TU +Dortmund, Fakultät für Informatik, Otto-Hahn-Str. 16, Raum 214, 44227 Dortmund, Germany. +email: thomas.schwentick@udo.edu; S. Vansummeren, Research Laboratory for Web and Information Technologies (WIT), Université Libre de Bruxelles, 50 Av. F. Roosevelt, CP 165/15 B-1050 +Brussels, Belgium; email: stijn.vansummeren@ulb.ac.be. +Permission to make digital or hard copies of part or all of this work for personal or classroom use +is granted without fee provided that copies are not made or distributed for profit or commercial +advantage and that copies show this notice on the first page or initial screen of a display along +with the full citation. Copyrights for components of this work owned by others than ACM must be +honored. Abstracting with credit is permitted. To copy otherwise, to republish, to post on servers, +to redistribute to lists, or to use any component of this work in other works requires prior specific +permission and/or a fee. Permissions may be requested from Publications Dept., ACM, Inc., 2 Penn +Plaza, Suite 701, New York, NY 10121-0701 USA, fax +1 (212) 869-0481, or permissions@acm.org. + 2010 ACM 0362-5915/2010/04-ART11 $10.00 +C +DOI 10.1145/1735886.1735890 http://doi.acm.org/10.1145/1735886.1735890 +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + +11 + + 11:2 + +• + +G. J. Bex et al. + +Categories and Subject Descriptors: F.4.3 [Mathematical Logic and Formal Languages]: +Formal Languages; H.2.1 [Database Management]: Logical Design; I.2.6 [Artificial Intelligence]: Learning; I.7.2 [Document and Text Processing]: Document Preparation +General Terms: Algorithms, Languages, Theory +Additional Key Words and Phrases: Regular expressions, schema inference, XML +ACM Reference Format: +Bex, G. J., Neven, F., Schwentick, T., and Vansummeren, S. 2010. Inference of concise regular +expressions and DTDs. ACM Trans. Datab. Syst, 35. 2, Article 11 (April 2010), 47 pages. +DOI = 10.1145/1735886.1735890 http://doi.acm.org/10.1145/1735886.1735890 + +1. INTRODUCTION +The eXtensible Markup Language (XML) serves as the lingua franca for data +exchange on the Internet [Abiteboul et al. 1999]. Because XML documents +in general can be of any form, most communities and applications impose +structural constraints on the documents that are to be exchanged or processed. +These constraints can be formally specified in a schema, which is written in a +schema language such as the Document Type Definitions (DTDs) or the XML +Schema Definitions (XSDs) [Thompson et al. 2004]. +The advantages offered by the presence of a fully specified schema are +numerous. First and foremost, a schema allows automatic validation of the +input document structure, which not only facilitates automatic processing but +also ensures soundness of the input. Unvalidated input data from Web requests +is considered as the number one vulnerability for Web applications [Open Web +Application Security Project Consortium 2004]. The presence of a schema also +allows for automation and optimization of search, integration, and processing +of XML data (refer to, e.g., Benedikt et al. [2008], Deutsch et al. [1999], Koch +et al. [2004], Manolescu et al. [2001], Neven and Schwentick [2006], Wang +et al. [2003]). Moreover, various software development tools such as Castor +[Castor] and SUN’s JAXB [Sun] rely on schemas to perform object-relational +mappings for persistence. Furthermore, the existence of schemas is imperative +when integrating (meta) data through schema matching [Rahm and Bernstein +2001] and in the area of generic model management [Bernstein 2003; Melnik +2004]. A final advantage of a schema is that it assigns meaning to the data. +That is, it provides a user with a concrete semantics of the document and +aids in the specification of meaningful queries over XML data. Although the +examples mentioned here just scrape the surface of current applications, +they already underscore the importance of schemas accompanying XML +data. +Unfortunately, in spite of the aforementioned advantages, the presence of +a schema is not mandatory and many XML documents are not accompanied +by one. For instance, in a recent study Mignet et al. [2003] and Barbosa et al. +[2006] have shown that approximately half of the XML documents available +on the Web do not refer to a schema. In another study Bex et al. [2004] and +Martens et al. [2006] have noted that about two-thirds of XSDs gathered from +schema repositories and from the Web are not valid with respect to the W3C +XML Schema specification [Thompson et al. 2004], rendering them essentially +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:3 + +useless for immedidate application. A similar observation was made by +Sahuguet [2000] concerning DTDs. +Based on the lack of schemas in practice, it is essential to devise algorithms +that can infer a schema for a given collection of XML documents when none, or +no syntactically correct one, is present. This is also acknowledged by Florescu +[2005] who emphasizes that in the context of data integration: +“We need to extract good-quality schemas automatically from existing data and perform incremental maintenance of the generated +schemas.” +In this article, we describe two novel schema inference algorithms outperforming existing systems in accuracy, conciseness, and speed. +It should be noted that even when a schema is already available, there +are situations where inference can be useful. One such situation is schema +cleaning: sometimes a schema is too general with respect to the XML data +that it is supposed to describe. In that case, it can be advantageous to infer a new schema based solely on the data at hand. This situation is nicely +illustrated by the following real-world example taken from the Protein Sequence Database DTD [Miklau 2002], which gives the following definition for +the refinfo-element. +authors, citation, volume?, month?, year, +pages?, (title | description)?, xrefs? +An analysis of the available XML corpus (683MB of data) with our inference +algorithms yields following more precise expression for the refinfo-element. +authors, citation, (volume | month), year, +pages?, (title | description)?, xrefs? +Note that the latter is more strict than the former, as it emphasizes that volume +and month do not occur together: either one specifies a month of publication for +a given journal article, or the volume that it has appeared in, but not both. +As this example illustrates, schema inference algorithms can hence be used to +better understand the semantics of a given XML dataset, making it possible to +adapt an existing schema when necessary. In general, schema inference can be +used to restrict schemas to a relevant subset of data needed by the application +at hand, thereby facilitating difficult tasks like schema matching and data +integration. Indeed, as argued by Hinkelman [2005], industry-level standards +are too loosely defined in general, which can result in XML schemas where +many business structures are formally specified as being optional. +The second situation where schema inference is useful even though a schema +already exists is in the presence of noisy XML data. In such a situation, part or +all of the data that needs to be processed is rejected by the existing schema. For +instance, we have harvested and investigated a corpus of XHTML documents +from the Web and found that an astonishing 89% of 2092 documents was not +valid with respect to the XHTML Transitional specification [W3C 2002]. In this +case, the inference of a new schema based on the corpus and its comparison +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:4 + +• + +G. J. Bex et al. + +Fig. 1. An example DTD. + +with the XHTML Transitional specification provides a uniform view of the kind +of errors made. Further, given that one often has no choice but to deal with such +noisy data, one may infer a new schema from a subset of the corpus (deleting +documents that make unacceptable errors) and work with that schema rather +than with the official specification to retain at least a minimal validation. +1.1 Problem Setting +Based on the previous observations, it is hence essential to devise algorithms +that can automatically infer a DTD or XSD from a given corpus of XML +documents. +As illustrated in Figure 1, a DTD is essentially a mapping d from element +names to regular expressions over element names. An XML document is valid +with respect to d if for every occurrence of an element name e in the document, +the word formed by its children belongs to the language of the corresponding +regular expression d(e). For instance, the DTD in Figure 1 requires each store +element to have zero or more order children, which must be followed by a +stock element. Likewise, each order must have a customer child, which must +be followed by one or more item elements. +To infer a DTD from a corpus of XML documents C it hence suffices to look, +for each element name e that occurs in a document in C, at the set of element +name words that occur below e in C, and to infer from this set the corresponding +regular expression d(e). As such, the inference of DTDs reduces to the inference of regular expressions from sets of positive example words. To illustrate, +from the words id price, id qty supplier, and id qty item item appearing under elements in a sample XML corpus, we could derive the following +rule. +item → (id, price | (qty, (supplier | item+ ))) +While the inference of XSDs is more complicated than the inference of DTDs, +recent characterizations [Martens et al. 2006] show that the structural core of +XML schema (that is, the sets of trees that are definable by XSDs) correspond +to DTDs extended with vertical regular expressions. Therefore, one cannot +hope to successfully infer XSDs without good algorithms for inferring regular +expressions. As such, we focus in this article on the inference of regular expressions (and therefore, by the preceding reduction, on the inference of DTDs). +The inference of XSDs, building on the algorithms presented here, is treated in +a companion article [Bex et al. 2007]. +In particular, let  be a fixed set of alphabet symbols (also called element +names), and let  ∗ be the set of all words over . +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:5 + +Definition 1 (Regular Expressions). In this article, we are interested in +learning regular expressions r, s of the form +r, s ::= ∅ | ε | a | r . s | r + s | r? | r + , +where parentheses may be added to avoid ambiguity. Here, ε denotes the empty +word; a ranges over symbols in ; r . s denotes concatenation; r + s denotes +disjunction; r + denotes one-or-more repetitions; and r? denotes the optional +regular expression. That is, the language L(r) accepted by regular expression +r is given by +L(∅) = ∅ +L(ε) = {ε} +L(a) = {a} +L(r . s) = {vw | v ∈ L(r), w ∈ L(s)} +L(r + s) = L(r) ∪ L(s) +L(r + ) = {v1 . . . vn | n ≥ 1 and v1 , . . . , vn ∈ L(r)} +L(r?) = L(r) ∪ {ε}. +For convenience, we sometimes omit the concatenation symbol, simply writing rs for r.s. Note that the Kleene star operator (denoting zero or more repititions as in r ∗ ) is not allowed by the preceding syntax. This is not a restriction, +since r ∗ can always be represented as (r + )? or (r?)+ . Conversely, the latter can +always be rewritten into the former for presentation to the user. Also note that +the previous syntax uses r + s, to denote disjunction rather than the vertical +bar notation r | s used by DTDs. The former notation should not be confused +with the one-ore-more repetition operator r + , where the plus symbol is used in +the exponent. +The class of all regular expressions is actually too large for our purposes, +as both DTDs and XSDs require the regular expressions occurring in them to +be deterministic (also sometimes called one-unambiguous [Brüggemann-Klein +and Wood 1998]). Intuitively, a regular expression is deterministic if, without +looking ahead in the input word, it allows to match each symbol of that word +uniquely against a position in the expression when processing the input in +one pass from left to right. For instance, (a + b)∗ a is not deterministic as already the first symbol in the word aaa could be matched by either the first or +the second a in the expression. Without lookahead, it is impossible to know +which one to choose. The equivalent expression b∗ a(b∗ a)∗ , on the other hand, is +deterministic. +Definition 2. Let r stand for the regular expression obtained from r by +replacing the ith occurrence of alphabet symbol a in r by a(i) , for every i and ++ ++ +a. For example, for r = b+ a(ba+ )? we have r = b(1) a(1) (b(2) a(2) )?. A regular +expression r is deterministic if there are no words wa(i) v and wa( j) v in L(r) +such that i = j. +Equivalently, an expression is deterministic if the so-called Glushkov construction [Brüggeman-Klein 1993] translates it into a deterministic finite +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:6 + +• + +G. J. Bex et al. + +automaton rather than a nondeterministic one [Brüggemann-Klein and Wood +1998]. Not every nondeterministic regular expression is equivalent to a deterministic one [Brüggemann-Klein and Wood 1998]. Thus, semantically, the class +of deterministic regular expressions forms a strict subclass of the class of all +regular expressions. +Learning in the limit. For the purpose of inferring DTDs from XML data, +we are hence in search of an algorithm that, given enough sample words of a +target deterministic regular expression r, returns a deterministic expression r +equivalent to r. In the framework of learning in the limit [Gold 1967], such an +algorithm is said to learn the deterministic regular expressions from positive +data. +Definition 3. Define a sample to be a finite subset of  ∗ and let R be +a subclass of the regular expressions. An algorithm M mapping samples to +expressions in R is said to learn R from positive data if: (1) S ⊆ L(M(S)) for +every sample Sand (2) to every r ∈ R we can associate a so-called characteristic +sample Sr ⊆ L(r) such that, for each sample S with Sr ⊆ S ⊆ L(r), M(S) is +equivalent to r. +Intuitively, the first condition says that M must be sound; the second that +M must be complete, given enough data. A class of regular expressions R is +learnable in the limit from positive data if an algorithm exists that learns R. +For the class of all regular expressions, it was shown by Gold [1967] that no +such algorithm exists. The same holds for the class of deterministic regular +expressions, as shown in our companion article [Bex et al. 2008]. +PROPOSITION 4 (BEX ET AL. 2008). The class of deterministic regular expressions is not learnable in the limit from positive data. +Proposition 4 immediately excludes the possibility for an algorithm to infer +the full class of DTDs. In practice, however, regular expressions occurring in +DTDs and XSDs are concise rather than arbitrarily complex. Indeed, a study +of 819 DTDs and XSDs gathered from the Cover Pages [Cover 2003] (including +many high-quality XML standards) as well as from the Web at large, revealed +that regular expressions occurring in practical schemas are such that every +alphabet symbol occurs at most k times, with k small. Actually, in 98% of the +cases k = 1. +Definition 5. A regular expression is k-occurrence if every alphabet symbol +occurs at most k times in it. +For example, the expressions customer . order+ and (school + institute)+ +are both 1-occurrence, while id .(qty + id) is 2-occurrence (as id occurs twice). +Observe that if r is k-occurrence, then it is also l-occurrence for every l ≥ k. +To simplify notation, we often abbreviate “k-occurrence regular expression” by +k-ORE and also refer to the 1-OREs as “single occurrence regular expressions” +or SOREs. +Note that, since every alphabet symbol can occur at most once in a SORE, +every SORE is necessarily deterministic. Indeed, we have the following strict +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:7 + +inclusion hierarchy among the various classes of regular expressions just +discussed. +SOREs +⊂ 2-OREs ⊂ 3-OREs ⊂ · · · ⊂ k-OREs +⊂ +⊂ +deterministic regex +⊂ +all regex +(For k ≥ 2, the classes of k-OREs and deterministic regular expressions are +incomparable.) Given their importance in practical schemas, we focus in this +article on the inference of SOREs. The inference of deterministic k-OREs for +k > 1 is treated in a companion article [Bex et al. 2008]. +1.2 Outline and Contributions +In particular, we show in Section 3 that the class of SOREs can be efficiently +learned in the limit from positive data by first constructing an automaton +representation of the target SORE using techniques of Garcı́a and Vidal [1990], +and by subsequently transforming this automaton into an equivalent SORE (if +such a SORE exists) using a novel polynomial-time algorithm called REWRITE. +For the general class of regular expressions the resulting expression can be of +exponential size, as we explain in more detail in Section 3. In Section 4, we +improve REWRITE to deal with real-world, and therefore incomplete, samples. In +contrast to REWRITE, which fails when its input automaton is not equivalent to +a SORE, the resulting improvement, called RWR, repairs the input automaton +until it becomes equivalent to a SORE. We also develop an extension of RWR, +called RWR2 , which improves the precision of RWR at the cost of increased running +time. +For the settings where extremely little XML data is available to infer a +schema from (for instance, when the data is returned as answers to queries or +Web service requests [Ngu et al. 2005; Oaks and ter Hofstede 2007]), we +introduce in Section 6 the algorithm CRX. CRX successfully learns the class +of CHAREs, a strict subclass of the SOREs that nevertheless holds great +practical importance. Indeed, the same investigation as before reveals that +more than 90% of the regular expressions occurring in practical schemas are +CHAREs [Martens et al. 2006]. +We experimentally validate RWR, RWR2 , and CRX in Section 7 on both small and +large samples drawn from real-world target DTDs whose regular expressions +fall both within the class of SOREs/CHAREs and outside of those classes. In +all settings, our algorithms outperform existing systems in accuracy, conciseness, and speed. Further, we assess the strong generalization ability of CRX by +establishing on average the minimal number of sample words needed to derive +optimal regular expressions. In Section 8 we discuss how to extend RWR and +CRX to incrementally compute the inferred regular expressions when new data +arrive, how to address noise, and how to deal with numerical predicates. We +begin in the next section with a discussion of related work, and conclude in +Section 9. +It is important to note that this article differs from its conference version [Bex +et al. 2006] in the following way. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:8 + +• + +G. J. Bex et al. + +—First and foremost, it corrects the results of Bex et al. [2006] by providing +a completely new algorithm for converting automata into equivalent SOREs +(provided such a SORE exists), and gives a full correctness proof (Section 3). +In contrast to what is claimed in Bex et al. [2006], the conversion algorithm +of Bex et al. [2006] does not always yield an equivalent SORE, as discussed +in Section 5. +—It introduces new heuristics (based on a language size criterion) for dealing +with real-world, and therefore incomplete datasets (Section 4). +—It adds new experiments that measure: (1) the impact of noise and (2) the +accuracy of our algorithms under various levels of missing data. +2. RELATED WORK +Schema inference. Schemas for semistructured data have been defined in +Buneman et al. [1997], Fernandez and Suciu [1998], and McHugh et al. +[1997] and their inference has been addressed in Goldman and Widom [1997], +and Nestorov et al. [1997, 1998]. The methods in Nestorov et al. [1997] and +Goldman and Widom [1997] focus on the derivation of a graph summary +structure (called full representative object or dataguide) for a semistructured +database. This data structure contains all paths in the database. Approximations of this structure are considered by restricting to paths of a certain length. +The latter then basically reduces to the derivation of an automaton from a set +of bounded length strings. Naively restricting the algorithms to trees rather +than graphs is inappropriate since no order is considered between the children +of a node so that DTD-like schemas cannot be derived. However, even the use +of more sophisticated encodings of the XML documents using edges between +siblings would be to no avail since no algorithms are given to translate the +obtained automata to regular expressions. In Nestorov et al. [1998], a schema +is a typing by means of a datalog program. Again, no algorithms are given +to transform datalog types into regular expressions. So, these approaches +can therefore not be used to derive DTDs, not even when the semistructured +database is tree-shaped. +DTD inference. In the context of DTD inference, Sankey and Wong [2001] +propose several approaches to generate probabilistic string automata to represent regular expressions. To transform these into actual regular expressions, +and hence to obtain DTDs, the authors refer to the methods of Ahonen [1996]. +The latter provides a method to translate one-unambiguous nonprobabilistic +string automata to regular expressions, as given by Brüggemann-Klein and +Wood [1998], followed by a post-processing simplification step. Apart from several case analyses based on a dictionary example, no systematic study of the +effectiveness of the approach is provided. In particular, in contrast to our results, no target class is given for which the set of transformations is complete. +There are only a few papers describing systems for direct DTD inference +[Garofalakis et al. 2003; Min et al. 2003; Chidlovskii 2001]. Only one of them is +available for testing: XTRACT [Garofalakis et al. 2003]. In Section 7, we make a +detailed comparison with our proposal. In contrast to our approach, the XTRACT +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:9 + +system generates for every separate string a regular expression while representing repeated subparts by introducing Kleene-*. In a second step, the system +factorizes common subexpressions of these candidate regular expressions using algorithms from the logic optimization literature. Finally, in the third step, +XTRACT applies the Minimum Description Length (MDL) principle to find the +best RE among the candidates. Although the approach has been shown to work +on real-world DTDs in Garofalakis et al. [2003] the XML data complying to +these DTDs was generated. We report in Section 7 that XTRACT has two kinds of +shortcomings on real-world XML data: (1) it generates large, long-winded, and +difficult to interpret regular expressions; and (2) it cannot handle large datasets (over 1000 strings). The latter is due to the NP-hard submodule in the +third step of the XTRACT algorithm [Fernau 2004]. The former problem seems +to be more fundamental. The final step results in expressions consisting of +disjunctions of regular expressions while in practice the large majority of regular expressions are concatenations of disjunctions [Martens et al. 2006]. As a +result, larger datasets result in larger regular expressions. +In Min et al. [2003] an adaptation of the XTRACT approach to a restricted +class of regular expressions which form a subclass of SOREs is described. Although the system, according to the experiments conducted in Min et al. [2003], +outperforms XTRACT in accuracy and efficiency, it seems that the two fundamental shortcomings described earlier remain. It would thus be surprising if the +system performed much better than XTRACT on real-world data. Similarly to +Ahonen [1996], the approach of Chidlovskii [2001] relies on the translation of +Glushkov automata to regular expressions which, in general, can lead to an +exponential size increase. +Trang [Clark ] is state-of-the-art software written by James Clark intended +as a schema translator for the schema languages DTDs, Relax NG, and XML +Schema. In addition, Trang allows to infer a schema for a given set of XML +documents. We discuss Trang further in Section 7.1. +Language inference. Learning of regular languages from positive examples in +the computational learning community is mostly directed towards inference of +automata as opposed to inference of regular expressions [Angluin and Smith +1983; Pitt 1989; Sakakibara 1997]. As noted by Fernau [2004] and argued +in the previous section, first using learning algorithms for deterministic automata and then transforming these into regular expressions in general leads +to unmanageable and long-winded regular expressions. Some approaches to +inference of regular expressions for restricted cases have been considered. For +instance, Brāzma [1993] showed that regular expressions without union can +be approximately learned in polynomial time from a set of examples satisfying +some criteria. Fernau [2009] provided a learning algorithm for finite unions +of pairwise left-aligned union-free regular expressions. These expressions are +different from the expressions we consider here: they are not included in the +class of SOREs and do not contain all CHAREs. The development is purely +theoretical, no experimental validation has been performed. +Automata to RE translation. Although heuristics for automata to RE translations [Delgado and Morais 2004; Han and Wood 2007] have been proposed, +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:10 + +• + +G. J. Bex et al. + +Fig. 2. (a) The SOA accepting the same language as the SORE a . b .(c+d+ ). (b) The SOA generated +by 2T-INF for the sample S = {bacacdacde, cbacdbacde, abccaadcde}. + +all of them are optimizations of the classical state elimination algorithm. In +particular, they investigate the best order to eliminate states when going from +automata to regular expressions. So, they focus on the class of all automata +for which, as explained in Section 3, an exponential increase in size cannot be +avoided in general. Further, the methods remain theoretical as no experimental +analysis has been performed. Caron and Ziadi [2000] devise an algorithm deciding whether an automaton is Glushkov. If so, the automaton can be rewritten +into a short equivalent regular expression. Their method works in a top-down +fashion, that is, it derives the top nodes of the parse tree corresponding to +the regular expression first, and subsequently proceeds downward in the tree. +Consequently, the method first derives the largest subexpressions of the expression, making it harder to devise heuristics in the presence of missing data. +In contrast, our approach is bottom-up, that is, starting from the leaf nodes of +the parse tree, composing them into the smallest subexpressions. +3. A COMPLETE ALGORITHM FOR INFERRING SORES +Our goal in this section is to infer a SORE s equivalent to a target SORE r +given only a finite sample S ⊆ L(r). To this end, we first learn from S a Single +Occurrence Automaton (SOA for short). A SOA is a specific kind of deterministic +finite state automaton in which all states, except for the initial and final state, +are element names. Figure 2(a) gives an example. Note that in contrast to the +classical definition of automata, no edges are labeled: all incoming edges in a +state a are assumed to be labeled by a. As such, a word a1 , . . . , an is accepted if +there is an edge from the initial state to a1 , an edge from a1 to a2 ,. . . , and an +edge from an to the final state. Thus, the SOA in Figure 2(a) accepts the same +language as a . b .(c + d+ ). +Definition 6 (SOA). Let src and sink be two special symbols, distinct from +the element names, that will serve as the initial and final state, respectively. A +single occurrence automaton is a finite directed graph G = (V, E) such that: +(1) {src, sink} ⊆ V and all nodes in V − {src, sink} are element names; and +(2) src has only outgoing edges; sink has only incoming edges; and every v ∈ +V − {src, sink} is visited during a walk from src to sink. +Note that V − {src, sink} can be empty. We write L(G) for the set of all words +accepted by G; V(G) for the set of G’s vertices, and E(G) for G’s edge relation. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:11 + +Algorithm 1. 2T-INF +Input: a finite set of sample strings S +Output: a SOA G such that S ⊆ L(G) +1: Let V be the set of states consisting of all element names occurring in S plus the +initial state src and final state sink +2: Initialize E := ∅ +3: for each string a1 . . . an in S do +4: +add the edges (src, a1 ), (a1 , a2 ), . . . , (an, sink) to E +5: end for +6: return G = (V, E) + +3.1 Learning an Automaton +Given a sample S, we can learn an automaton G that accepts all words in S by +means of the algorithm 2T-INF shown in Algorithm 1. Its behavior is illustrated +in Figure 2(a) on the sample S = {abc, abdd} and in Figure 2(b) on the sample +S = {bacacdacde, cbacdbacde, abccaadcde}. 2T-INF was introduced by Garcı́a and +Vidal [1990], who also proved the following proposition. +PROPOSITION 7 ([GARCÍA AND VIDAL 1990]). 2T-INF is sound, that is, S ⊆ +L(2T-INF(S)) for each sample S. Moreover, 2T-INF is minimal, that is, for each SOA +G with S ⊆ L(G), 2T-INF(S) is a subgraph of G and hence L(2T-INF(S)) ⊆ L(G). +It turns out that 2T-INF is also complete for building a SOA representation of +a target SORE r, provided that its input sample is representative with regard +to r. +Definition 8 (Representative Sample). A word v of length 2 is said to be a +2-gram of a set of words W if it occurs as a subword in some w ∈ W. A sample +S is representative of a SORE r if S ⊆ L(r) and the following statements hold: +(1) for every a ∈  starting a word in L(r) there is a word in S that starts with +a; +(2) for every a ∈  ending a word in L(r) there is a word in S that ends with a; +(3) every 2-gram of L(r) is a 2-gram of S. +If S is not representative of r, then we say that S does not cover r. +For instance, the sample {a, b, c} is representative for a + b + c but {a, c} +is not since it lacks a word starting with b. Furthermore, the sample +{bacacdacde, cbacdbacde, abccaadcde} is representative for ((b?(a + c)+ )d)+ e but +{bacacdacde, cbacdbacde} is not since it does not contain the 2-gram ab. +PROPOSITION 9. +L(r). + +If S is a representative sample of SORE r then L(2T-INF(S)) = + +PROOF. It is not hard to see that every SORE r can be transformed into an +equivalent SOA Gr : we take as nodes of Gr all element names occurring in r +plus the initial state src and the final state sink; for each alphabet symbol that +starts a word in L(r) we add the edge (src, a) to Gr ; for each alphabet symbol +that ends a word in L(r) we add an edge (a, sink) to Gr , and for each alphabet +symbol b that follows an alphabet symbol a in a word in L(r) we add the edge +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:12 + +• + +G. J. Bex et al. + +Fig. 3. A SOA not equivalent to any SORE. It accepts the same language as a(ba)+ . + +(a, b) to Gr . Now reason as follows. Clearly, S ⊆ L(r) = L(Gr ). Hence, 2T-INF(S) +is a subgraph of Gr by Proposition 7. Since S is a representative sample of r, +however, every edge of Gr must also be in 2T-INF(S). As such, 2T-INF(S) = Gr and +hence L(2T-INF(S)) = L(Gr ). +3.2 From SOA to SORE +Proposition 9 shows that it is possible to learn a SOA representation of a target +SORE r, provided that we are given enough data. To transform this SOA into +a regular expression, an obvious approach would be to use known techniques +such as the classical state elimination algorithm (refer to, e.g., Hopcroft and +Ullman [1979]). Unfortunately, as already hinted upon by Fernau [2004, 2009] +and as we illustrate shortly, it is very difficult to get concise regular expressions +from an automaton representation. For instance, the classical state elimination +algorithm applied to the SOA generated by 2T-INF in Figure 2(b) yields the +expression:1 +(aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + +aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ +(b + aa∗ b))∗ (aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d)))(aa∗ d + +(c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + aa∗ c)(c + +aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))∗ + +which differs quite a bit from the equivalent SORE +((b?(a + c))+ d)+ e + +(‡). + +Actually, results by Ehrenfeucht and Zeiger [1976], Gelade and Neven [2008], +and Gruber and Holzer [2008] show that it is impossible in general to generate +concise regular expressions from automata: there are automata, even SOAs as +generated by 2T-INF, for which the number of occurrences of alphabet symbols in +the smallest equivalent expression is exponential in the size of the automaton. +For such automata, a concise regular expression representation hence does not +exist. +These results imply that there are SOAs G for which an equivalent SORE +does not exist (Figure 3 gives a simple example). Note, however, that when +such a SORE r does exist, its size is always linearly bounded by the number of +states of G. Indeed, since every alphabet symbol can occur at most once in r, the +size of r is linearly bounded by the alphabet symbols that it mentions. Since G +and r are equivalent, these symbols are exactly the states of G (minus src and +sink). Hence, the SOREs constitute a well-behaved and concisely representable +subset of the regular languages. It is therefore natural to investigate how to +1 Transformation computed by JFLAP: www.jflap.org. + +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:13 + +transform a given SOA into an equivalent SORE when such a SORE exists. +Clearly, the previous example illustrates that the classical state elimination +algorithm does not suffice for this purpose. +For that reason, we introduce in this section a novel graph-rewriting approach for transforming SOAs into SOREs. While our approach is related to the +classical state-elimination algorithm for transforming an arbitrary automaton +into a regular expression, we do not eliminate states by introducing additional +edges (thereby duplicating subexpressions) but instead replace sets of states +by single states (taking care to avoid duplication). In addition, there are two +rewriting steps that only remove edges. +Just as the classical algorithm, it is necessary for the definition of the graph +rewrite rules to define a generalization of SOAs in which internal states are +allowed to be labeled by SOREs (as opposed to element names from ). This generalization is defined as follows. Call two regular expressions r and s alphabetdisjoint if r and s have no alphabet symbol in common. For example, (a+b)? and +c+ are alphabet-disjoint, whereas (a + b) and b?c+ are not. Call an expression +r proper if it accepts at least one nonempty word (i.e., it is not equivalent to ∅, +nor to ε). +Definition 10. A generalized Single Occurrence Automaton (generalized +SOA for short) is a finite graph G = (V, E) such that: +(1) {src, sink} ⊆ V and all vertices in V − {src, sink} are pairwise alphabetdisjoint proper SOREs; and +(2) the edge relation E is such that src has only outgoing edges; sink has only +incoming edges; and every v ∈ V is visited by a walk from src to sink. +A word w ∈  ∗ is accepted by G if there is a walk src r1 . . . rm sink in G and a +division of w into subwords w = w1 . . . wm such that wi ∈ L(ri ), for 1 ≤ i ≤ m. +Again, we write L(G) for the set of all words accepted by G. +Figure 7 shows some examples. Clearly, every SOA is also a generalized +SOA. In what follows, we write PredG (s) for the set of all direct predecessors of +a SORE s in G, and SuccG (s) for the set of all direct successors of s in G. +PredG (s) := {r | (r, s) ∈ E(G)}, +SuccG (s) := {t | (s, t) ∈ E(G)}. +− +Furthermore, we write Pred− +G (s) for PredG (s) − {s} and similarly SuccG (s) for +SuccG (s) − {s}. Finally, we write + +PredG (s) ∪ {s} if s = s + for some s ++ +PredG (s) := +PredG (s) +otherwise + +SuccG (s) ∪ {s} if s = s + for some s +(s) +:= +Succ+ +G +SuccG (s) +otherwise. + +Rewrite rules. Our system of rewrite rules consists of the seven rules shown +in Figures 4–6: one rule to introduce disjunction (r + s), four rules to introduce +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:14 + +• + +G. J. Bex et al. + +Fig. 4. Rewrite rules part 1. In the illustrations, P is the set PredG (r)−{r, s}. Sis the set SuccG (s)− ++ +{r, s}. The gray loops on r and s indicate that r ∈ Succ+ +G (r) and s ∈ SuccG (s), respectively. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:15 + +Fig. 5. Rewrite rules part 2. In the illustrations, P is the set PredG (r)−{r, s}. Sis the set SuccG (s)− ++ +{r, s}. The gray loops on r and s indicate that r ∈ Succ+ +G (r) and s ∈ SuccG (s), respectively. + +concatenation (r . s, r? . s, r . s?, and r? . s?), one rule to introduce iteration (r + ), +and one rule to introduce optionals (r?). At the basis of the first five rules lies +the contraction of two states r and s into a single new state t, which is defined +as follows. +Definition 11 (State Contraction). Let G be a generalized SOA; let r and s +be states in G; and let t be a state not in G. The contraction of r and s into t is +the generalized SOA G[r, s ⇒ t] obtained from G as follows: +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:16 + +• + +G. J. Bex et al. + +Fig. 6. Rewrite rules part 3. In the illustrations, P is the set PredG (r)−{r, s}. Sis the set SuccG (s)− +{r, s}. Note in particular that the rule OPTIONAL r? can only be applied when G contains only one +node besides src and sink. + +(1) Add t as a new state to G; +(2) make every v ∈ PredG (r) − {r, s} a predecessor of t; +(3) make every w ∈ SuccG (r) − {r, s} a successor of t; +(4) add a loop t → t if r ∈ SuccG (s); and +(5) remove r, s and all of their incoming and outgoing edges. +Note that state contraction is not symmetric. +To illustrate, the contraction G[a, c ⇒ a + c] of the generalized SOA G in +Figure 7(a) is shown in Figure 7(b). Similarly, the contraction G[b, a + c ⇒ +b? .(a + c)] of the generalized SOA G in Figure 7(b) is shown in Figure 7(c). Note +that if r = s, then G[r, s ⇒ t] is simply a substitution of r by the new state t. +To simplify notation, we simply write G[r ⇒ t] for such contractions in what +follows. +In addition to contraction, the rewrite rules also use the following +operation. +Definition 12. If G is a generalized SOA and r, s are states in G, then we +write G (r, s) to denote the generalized SOA obtained from G by removing the +edge from r to s, if present. +In what follows, we write G  H to indicate that G rewrites to H in a single +step according to the rewrite rules in Figures 4–6, and G ∗ H to indicate that +G rewrites to H in zero or more steps. +The following proposition shows that the rewrite rules are sound. +PROPOSITION 13. If G is a generalized SOA and G  H then H is also a +generalized SOA and L(G) = L(H). +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:17 + +PROOF. First observe that, since all states in a generalized SOA are pairwise +alphabet-disjoint proper SOREs, the new states r + s; r . s; r? . s; r . s?; r? . s?; r + ; +and r? introduced by the rewrite rules in Figures 4–6 must themselves be proper +SOREs alphabet-disjoint with the remaining states. As such, all states in H +are pairwise alphabet-disjoint proper SOREs. To show that H is a generalized +SOA, it hence remains to show that every state in H participates in a walk +from src to sink. Hereto, we distinguish the following three cases. +—H = G[r, s ⇒ t] for some t. Then, since G is a generalized SOA, and r and s +particpate in a walk from src to sink. In particular, there is a walk from src +to r in G, and a walk from s to sink. Then, by definition of state contraction, +there is a walk from src to t and from t to sink in H, that is, t participates in +a walk from src to sink in H. +—H = G[r ⇒ r + ] (r + , r + ). Then, by definition of state contraction and since +r participates in a walk from src to sink in G, r + must participate in a +walk from src to sink in G[r ⇒ r + ]. This walk can always be transformed +into a walk from src to sink in H by removing the edge (r + , r + ) should it +occur. +—H = G[r ⇒ r?] (src, sink). Then, by definition of state contraction and since +r participates in a walk from src to sink in G, r? must participate in a walk +from src to sink in G[r ⇒ r?]. Since the edge (src, sink) cannot occur in this +walk (recall that src has no incoming edges and sink has no outgoing edges), +r? also participates in a walk from src to sink in H. +To see that L(G) = L(H) we reason by a case analysis on the rewrite rule used +to transform G into H. For economy of space, we only illustrate this reasoning +for DISJUNCTION r + s; the other cases are similar. +So, suppose that G was rewritten into H by DISJUNCTION r + s, that is, H = +G[r, s ⇒ r+s]. Then r and s have the same (extended) predecessor and successor +set. From this, it follows that the following statements are equivalent. +(1) s ∈ SuccG (r); +(2) r ∈ SuccG (s); +(3) s ∈ Succ+ +G (s); +(4) r ∈ Succ+ +G (r). +For instance, s ∈ SuccG (r) ⇔ r ∈ SuccG (s) since: +s ∈ SuccG (r) ⇔ s ∈ SuccG (r) ∪ {r} +⇔ s ∈ Succ+ +G (r) ++ +⇔ s ∈ SuccG (s) +⇔ s ∈ Pred+ +G (s) ++ +⇔ s ∈ PredG (r) + +since r = s +by definition of Succ+ +G (r) ++ +since Succ+ +G (r) = SuccG (s) ++ +by definition of Succ+ +G (s) and PredG (s) ++ +since Pred+ +G (r) = PredG (s) + +⇔ s ∈ PredG (r) ∪ {r} +⇔ s ∈ PredG (r) + +by definition of Pred+ +G (r) +since r = s + +⇔ r ∈ SuccG (s) + +by definition of PredG (r) and SuccG (s) + +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:18 + +• + +G. J. Bex et al. + +The other equivalences can be similarly obtained. From these equivalences, +it follows that G must take one the two forms illustrated for rewrite rule +DISJUNCTION r + s in Figure 4. In both cases, the corresponding H is also shown. +Now suppose that w = w1 . . . wm ∈  ∗ is recognized by the walk src, t1 , . . . , +tm, sink in G with wi ∈ L(ti ) for 1 ≤ i ≤ m. Let the sequence src, t1 , . . . , tm, sink +be obtained from src, t1 , . . . , tm, sink by replacing every occurrence of r and s by +r + s. By inspection of the illustrations for rule DISJUNCTION r + s in Figure 4 it +is not difficult to see that src, t1 , . . . , tm, sink is a walk in H. Moreover, wi ∈ L(ti ) +by construction for 1 ≤ i ≤ m. Therefore, w ∈ L(H) and hence L(G) ⊆ L(H). +Conversely, suppose that w = w1 . . . wm ∈  ∗ is recognized by src, t1 , . . . , tm, sink +in H with wi ∈ L(ti ) for 1 ≤ i ≤ m. Determine vi as follows: +⎧ +⎪ +⎨ti if ti = r + s +ti = r if ti = r + s and wi ∈ L(r) +⎪ +⎩ +s if ti = r + s and wi ∈ L(s) +By inspection of the illustrations for rule DISJUNCTION r + s in Figure 4 it is +not difficult to see that src, t1 , . . . , tm, sink is a walk in G. Moreover, wi ∈ L(ti ) +for 1 ≤ i ≤ m. Therefore w ∈ L(G) and hence L(H) ⊆ L(G). As such, L(G) = +L(H). +Since each rewrite rule either contracts two states into a single state or +removes an edge from G, the size of H is always smaller than G. Therefore, we +have the next proposition. +PROPOSITION 14. The system of rewrite rules in Figures 4–6 is terminating: +there is no infinite sequence of rewrite steps G  H  I  . . . +Our algorithm REWRITE, shown in Algorithm 2, then operates as follows. First, +it checks whether the input SOA G corresponds to the empty language (∅) or +the empty word (ε) in lines 1–5. If so, it returns the corresponding regular +expression. Otherwise, it rewrites G until no further rules apply. It then checks +whether the resulting generalized SOA is final. +Definition 15. As generalized SOA G is final if E(G) = {(src, r), (r, sink)} +with r distinct from src and sink. In other words, G is final if it is a chain +consisting of the source, an arbitrary regular expression, and the sink. +If the resulting generalized SOA is indeed final, then clearly L(G) = L(r), +and r is returned as result. If the resulting generalized SOA is not final, then +G is not equivalent to a SORE (as we formally show further on), and REWRITE +fails. To illustrate, Figure 7 shows an example run of REWRITE on the example +SOA from Figure 2(b). +THEOREM 16. On input SOA G, REWRITE fails if and only if G is not equivalent +to a SORE. Otherwise, REWRITE returns a SORE equivalent to G. Moreover, +5 +REWRITE operates in time O(n ) where n is the number of states in G. +Note that the complexity O(n5 ) is reasonable since when we apply REWRITE to +the result of 2T-INF on a sample S, n corresponds to the (typically small) number +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:19 + +Algorithm 2. REWRITE +Input: a SOA G +Output: a SORE r such that L(r) = L(G) +1: if sink is not reachable from src in G then +2: +return ∅ +3: else if E(G) = {(src, sink)} then +4: +return ε +5: else +6: +while a rewrite rule from Figures 4–6 can be applied do +7: +perform the rewrite rule on G +8: +end while +9: +if G is final then +10: +return the corresponding regular expression +11: +else +12: +fail +13: +end if +14: end if + +of distinct element names occurring in S, not the total number or total length +of words in S. +The remainder of this section is devoted to the proof of Theorem 16, which +is divided into three steps. First, we show that REWRITE is sound. +PROPOSITION 17. If REWRITE(G) does not fail then it returns a SORE equivalent to G, for any SOA G. +PROOF. + +We distinguish three cases. + +(1) If sink is not reachable from src then REWRITE(G) = ∅ (clearly a SORE) and +L(G) = ∅ = L(∅), as desired. +(2) If E(G) = {(src, sink)} then REWRITE(G) = ε (again clearly a SORE), and +L(G) = {ε} = L(ε), as desired. +(3) Otherwise, G is rewritten into a final generalized SOA H with E(H) = +{(src, t), (t, sink)} (t distinct from src and sink) and REWRITE(G) = t. In +particular, t is a SORE. By Proposition 13, L(G) = L(H) and thus, since +E(H) = {(src, t), (t, sink)}, L(G) = L(H) = L(t) = L(REWRITE(G)), as desired. +Next, we show that REWRITE has the claimed complexity. +PROPOSITION 18. REWRITE operates in time O(n5 ), where n is the number of +states of its input G. +PROOF. We assume that checking whether there is an edge from state r +to state s can be done in constant time (for instance, using an adjacency matrix representation). To see that REWRITE runs in time O(n5 ) under this assumption, let us check that lines 1–4, lines 6–7, and lines 8–10 all run in +O(n5 ). +(Lines 1–4). Since G has at most n2 edges, checking whether sink is reachable +from src can be done in time O(n2 ) using depth-first search. Moreover, checking +whether E(G) = {(src, sink)} can also be done in time O(n2 ). +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:20 + +• + +G. J. Bex et al. + +Fig. 7. An execution of REWRITE on the example automaton in Figure 2(b). Step (1) applies DISJUNCTION r + s with r = a and s = b. Step (2) applies CONCATENATION r? . s with r = b and s = a + c. Step +(3) applies ITERATION r + with r = b? .(a+ c). Step (4) applies CONCATENATION r . s with r = (b? .(a+ c))+ +and s = d. Step (5) applies ITERATION r + with r = (b? .(a + c))+ . d. One more application of CON+ ++ +CATENATION r . s with r = ((b? .(a + c)) . d) and s = e (not shown) leads to the resulting expression +((b? .(a + c))+ . d)+ . e. + + = G1 , G2 , . . . , Gk is the sequence of generalized +(Lines 6–7). Suppose that G +SOAs produced by lines 6–7 when rewriting G = G1 until no further rewrite +rule applies. Since rewrite rules never introduce new states without also removing a state, every Gi has at most n states. Now reason as follows. + since the automaton +—The rule for optionals can be applied at most once in G +that it returns is always final, and since no rewrite rule applies to a final +generalized SOA. Checking the preconditions of the rule for optionals can be +done in time O(n2 ), and its action can be performed in time O(n). As such, the + on applying the rewrite rule for optionals is bounded +total time spent in G +2 +by O(n ). +—Since the rewrite rules for disjunction and concatenation contract two states +into a single one, these rewrite rules can be applied at most n times in  +G. +Since of all their preconditions can be checked in time O(n4 ) (by iterating +over all pairs of states r and s in the current automaton Gi and comparing +Pred(r), Pred(s), Succ(r), and Succ(s) as desired) and since state contraction + on the rewrite rules for +can be done in time O(n), the total time spent in G +disjunction and concatenation is bounded by O(n × n4 ) = O(n5 ). +—Since the rule for iteration removes the loop of the state to which it is applied, +and since each generalized SOA contains at most n loops, there can be at most +n consecutive applications of this rule before another rewrite rule is applied. +By the preceding remarks, there are at most n applications of the other +rewrite rules, so the rewrite rule for iteration can be applied at most n2 times + Since its precondition can be checked in constant time, and since its +in G. + on the rewrite rule +action can be done in time O(n), the total time spent in G +for iteration is bounded by O(n2 × n) = O(n3 ). +(Lines 8–11). Finally, checking whether a generalized SOA is final and extracting the corresponding regular expression can be done in time O(n2 ). +In summary, lines 1–4 run in time O(n2 ), lines 6–7 run in time O(n5 ), and +lines 8–11 run in time O(n2 ), yielding a total running time of O(n5 ). +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:21 + +Finally, we show that REWRITE(G) fails if and only if G is not equivalent +to a SORE, or equivalently, that REWRITE(G) does not fail if, and only if, G is +equivalent to a SORE. This is actually the most involved part of the proof of +Theorem 16. Proposition 17 already shows that if REWRITE(G) does not fail, then +G is equivalent to a SORE. Hence, we remain to show the next proposition. +PROPOSITION 19. +not fail. + +If SOA G is equivalent to a SORE, then REWRITE(G) does + +Essentially, we prove this proposition in two steps. Call a generalized SOA +proper if L(G) = ∅ and L(G) = {ε}. +(1) We first show that for any proper SOA G equivalent to a SORE there exists +a sequence of rewrite steps that ends in a final automaton (Corollary 46). +(2) In addition, we show that if proper G can be rewritten into a final automaton +by a particular sequence of rewrite steps, then any sequence of rewrite steps +on G ends in a final automaton (Corollary 54). +As such, REWRITE(G) cannot fail when G is equivalent to a SORE: either G is +not proper, in which case lines 1–4 of Algorithm 2 return a valid expression, or +G is proper and will hence be rewritten into a final automaton, in which case +line 9 returns a valid expression. The details may be found in Appendix A. +3.3 Discussion +It should be noted that while the result of REWRITE is always a SORE, this +SORE need not be easy to read (depending on the order of rewriting). For +instance, it is possible for REWRITE to generate an expression r .(s? . t?)?. Clearly, +the optional around (s? . t?) is redundant. Removing it leads to the simpler +r .(s? . t?). For presentation to the user, it is therefore advisable to postprocess +the result of REWRITE (and its variations in Section 4) using a regular expression +simplification algorithm. +4. DEALING WITH MISSING DATA +The results of Section 3 suggest the following method to infer a SORE from a +given sample S. +(1) First, use 2T-INF to learn from S an automaton representation G of the +target SORE r. +(2) Next, convert G into a SORE using REWRITE. +If S is a representative sample of r then G is equivalent to r by Proposition 9. +Therefore, REWRITE(G) does not fail by Theorem 16, and hence REWRITE(G) is +equivalent to r. +Unfortunately, real-world samples are rarely representative. For instance, +for target r = (a1 +· · ·+an)+ and increasing values of n, it is increasingly unlikely +that a sample bears witness to each of the n2 2-grams needed to represent r. +On such nonrepresentative samples, 2T-INF will construct an automaton for +which L(G) is a strict subset of L(r). In particular, this automaton need not be +equivalent to a SORE, and REWRITE(G) can fail. Figure 8 shows an example. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:22 + +• + +G. J. Bex et al. + +Fig. 8. The SOA generated by 2T-INF for the nonrepresentative sample S = {bacacdacde, +abccaadcde}. The only rewrite rules that can be applied are ITERATION a+ and ITERATION c+ , after which REWRITE gets stuck in a nonfinal automaton and fails. + +Fig. 9. Repair rules. + +For that reason, we present in this section two modifications of REWRITE +that “repair” G when rewriting gets stuck in a nonfinal automaton. The first +modification, RWR, picks a single repair when rewriting gets stuck, independent +of how the repair affects G. The second modification, RWR2 , in contrast, considers +multiple repair strategies and selects the one that extends G in a minimal way. +The repair rules used by both algorithms are shown in Figure 9. After a repair +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:23 + +Algorithm 3. RWR +Input: a SOA G +Output: a SORE r such that L(G) ⊆ L(r) if G is not equivalent to a SORE, and L(G) = +L(r) otherwise. +1: if sink is not reachable from src in G then +2: +return ∅ +3: else if E(G) = {(src, sink)} then +4: +return ε +5: else +6: +while G is not final do +7: +if a rewrite rule from Figures 4–6 can be applied then +8: +apply the rewrite rule on G +9: +else +10: +apply a repair rule from Figure 9 +11: +end if +12: +end while +13: +return the corresponding regular expression r +14: end if + +rule is applied, the automaton necessarily satisfies the precondition of the +corresponding rewrite rule. Now note the following. +PROPOSITION 20. Let G be a proper generalized SOA. If G is not final and no +rewrite rule applies to G, then at least one of the repair rules in Figure 9 applies +to G. +PROOF. Since G is proper, it recognizes at least one nonempty word. Clearly, +this can only happen when src has a successor r distinct from sink. We distinguish two cases. +—Either r has a successor s distinct from src, sink, and r. Clearly, REPAIR r? . s? +is then applicable to G. +—If r does not have such a successor s, then we claim that src has another +successor t, distinct from src, sink, and r. Indeed, suppose for the purpose +of contradiction that no such successor exists. Then, since every state in G +participates in a walk from src to sink, either E(G) = {(src, r), (r, sink)}, or +E(G) = {(src, r), (r, r), (r, sink)}. In the first case G is final, in the second we +can rewrite G using ITERATION r + —a contradiction in both cases. As such, +the claimed t exists. Then, since src ∈ PredG (r) ∩ PredG (t), REPAIR r + t is +applicable to G. +As such, we can always apply a repair rule if rewriting gets stuck in a +nonfinal automaton, after which rewriting can continue. +4.1 A Greedy Approach: RWR +An outline of RWR (short for REWRITE with REPAIRS) is shown in Algorithm 3. Like +REWRITE, it first checks whether its input G is equivalent to ∅ or ε. Otherwise, +G is rewritten using the rewrite rules in Figures 4–6 until a final automaton is +reached, arbitrarily selecting a repair rule when rewriting gets stuck. (In our +implementation we prefer repairs that make small extensions to the language +of the automaton over repairs that make larger extensions. In particular, we +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:24 + +• + +G. J. Bex et al. + +first check whether there are r and s for which REPAIR r . s? can be applied. Then +we check whether there are r and s for which REPAIR r? . s can be applied. Next, +we check for REPAIR r + s and finally for REPAIR r? . s?.) +Since the repair rules add edges to G, thereby increasing L(G), we may +conclude the following theorem. +THEOREM 21. For a SOA G, RWR always produces a SORE r with L(G) ⊆ +L(r). Moreover, if G is equivalent to a SORE, then L(G) = L(r). +(The second statement follows by Theorem 16.) Combined with Proposition 9, +we hence obtain the next corollary. +COROLLARY 22. + +Let M be the composition of 2T-INF with RWR, that is, M(S) := + +RWR(2T-INF(S)). Then M learns the class of SOREs from positive data. + +4.2 Exploring the Search Space: RWR2 +When rewriting gets stuck, RWR arbitrarily selects a repair rule (perhaps based +on some ordering of the rules as in our implementation), and discards the others. It should be clear, however, that when different repair rules are applicable, +one rule may have a smaller impact on the language of the automaton than +another. For that reason we present in this section a different modification +of REWRITE that, in contrast to RWR, tries the “best”  repair rules when there +are several candidates. Here, the “best” repair rules are those that add the +least number of words to the language. Since an automaton defines an infinite +language in general, it is of course impossible to take all added words into +account. We therefore only consider the words up to a length n, where n is twice +the number of alphabet symbols in the automaton. Formally, for a language L, +let |L≤n| denote the number of words in L of length at most n. Moreover, say +that generalized SOA H is a repair of generalized SOA G if H is obtained by +applying a repair rule on G. Then the repairs of the current automaton G are +ordered according to increasing values of | L(H)≤n|, and the best (i.e., first)  +among them are further investigated. +The resulting algorithm, called RWR2 (an abbreviation of REWRITE with  +best RANKED REPAIRS) is shown in Algorithm 4. Like REWRITE, it first checks +whether its input G is equivalent to ∅ or ε. Otherwise, RWR2 uses RWR2 -AUX to +Algorithm 4. RWR2 +Input: SOA G +Output: a SORE r such that L(G) ⊆ L(r) if G is not equivalent to a SORE, and L(G) = +L(r) otherwise. +1: if sink is not reachable from src in G then +2: +return ∅ +3: else if E(G) = {(src, sink)} then +4: +return ε +5: else +6: +initialize the final automaton Hopt to recognize (G)∗ +7: +return the SORE corresponding to the final automaton computed by +2 +RWR -AUX(G, Hopt ) +8: end if +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:25 + +Algorithm 5. RWR2 -AUX +Input: generalized SOAs G and Hopt +Output: final generalized SOA I such that L(G) ⊆ L(I) if G is not equivalent to a +SORE, and L(G) = L(I) otherwise. +1: while a rewrite rule from Figures 4–6 can be applied to G do +2: +perform the rewrite rule on G +3: end while +4: if G is final then +5: +return G +6: else +7: +compute the set R of all possible repairs H of G +8: +sort R in increasing order by | L(H)≤n| +9: +for each of the min(, |R|) best repairs H do +10: +if | L(H)≤n| < | L(Hopt )≤n| then +11: +recursively compute H := RWR2 -AUX(H, Hopt ) +12: +set Hopt := H if | L(H )≤n| < | L(Hopt )≤n| +13: +end if +14: +end for +15: +return Hopt +16: end if + +recursively rewrite and repair G until a final automaton is reached. During +this recursion, Hopt is the best final generalized SOA found so far. Initially, on +line 6 of RWR2 , Hopt is set to the final generalized SOA that accepts all words +over alphabet symbols mentioned in G. RWR2 -AUX then rewrites G in lines 1–2 +until no more rewrite rule is applicable. If the resulting G is final then it is +returned. Otherwise, RWR2 -AUX computes in line 6 all possible repairs H of G +and orders them according to increasing values of | L(H)≤n|. The algorithm then +recursively calls itself on the  best ranked repairs in lines 8–10. The test in +line 10 is an optimization: if the current repair is already worse than the best +final generalized SOA Hopt computed so far in terms of language size, then +further rewriting and repairing cannot yield a final generalized SOA that is +better than Hopt . Lines 11 and 12 update Hopt when appropriate. Finally, Hopt +is returned. +Given its definition, it is clear that RWR2 results in regular expressions with +a smaller language size for increasing values of , of course at the cost of +increased computation time. In the experiments (Section 7.2) the trade-off between precision and computation time of RWR and RWR2 , for increasing values +of , is investigated in more detail. +4.3 Efficiently Computing the Language Size +During its executing, RWR2 repeatedly needs to compute the language size of +the possible repairs. This computation can actually be done quite efficiently +for SOAs, as we show next. Of course, in general RWR2 needs to compute the +language size also for generalized SOAs, not just ordinary SOAs. Our implementation first expands such generalized SOAs into an equivalent SOA using +the Glushkov construction (similar to the ideas of the proof of Proposition 45 +in the online appendix that can be accessed in the ACM Digital Library), and +then invokes the language size computation procedure explained next. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:26 + +• + +G. J. Bex et al. + +Let |L=m| denote the number of words in L of length exactly m. Let G be a +SOA; and assume that V(G) − {src, sink} = {a1 , . . . , an}. Then consider the n × n +matrix D where for i, j ∈ {1, . . . , n} + +1 if (ai , a j ) ∈ E; and, +D[i, j] = +0 otherwise. +In addition, define the 1 × n and n× 1 matrices I and F, respectively, as follows: +for i, j ∈ {1, . . . , n} + +1 if (src, j) ∈ E; and, +I[1, j] = +0 otherwise; +and + + +F[i, 1] = + +1 if (i, sink) ∈ E; and, +0 otherwise. + +The following lemma is straightforward to prove by induction on n using +the fact that each walk from src to sink in G uniquely determines an accepted +word. Let Dm denote the m-times multiplication of D, with D0 the unit matrix. +LEMMA 23. + +Let m > 0 and let G be a SOA. Then | L(G)=m| = I · Dm−1 · F. + +Since for m = 0, we simply have | L(G)=m| = 1 if (src, sink) ∈ E, and +n +| L(G)=m|, we can deter| L(G)=m| = 0, otherwise and since | L(G)≤n| = m=0 +≤n +mine | L(G) | by iteratively computing the matrices D1 to Dm, and applying +Lemma 23. This immediately gives the following corollary. +COROLLARY 24. +time O(n|G|3 ). + +For each n > 0 and SOA G, | L(G)≤n| can be computed in + +5. CORRECTION +In the conference version of this article [Bex et al. 2006] we proposed a different set of rewrite and repair rules for transforming SOAs into SOREs. While +those rewrite rules were claimed in Bex et al. [2006] to possess the analog of +Proposition 19 (namely that they always produce a SORE equivalent to the +input SOA, provided that such a SORE exists), this claim is false, as we will +detail next. Readers unfamiliar with Bex et al. [2006] may freely skip this +section without endangering comprehension of the rest of the article. +To illustrate why the preceding claim is false, the rewrite rules of Bex et al. +[2006] are given in Figure 10, where G∗ refers to the ε-closure of G, defined as +follows. +Definition 25. Let G = (V, E) be a generalized SOA. The ε-closure G∗ of G +is the graph (V, E∗ ) where E∗ contains: +—all edges of E; +—all edges (r, r) with r = s+ or r = s+ ?; +—all edges (r, s) for which there is a path from r to s in G that passes only +through intermediate nodes t with ε ∈ L(t). +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:27 + +Fig. 10. Set of rewrite rules introduced in the conference version of this article [Bex et al. 2006]. + +Figure 11 shows a sequence of rewrite steps using these rules starting from +the SOA recognizing (a + b)+ ? or, equivalently, (a? . b?)+ . Note that the second +rewrite step, which introduces b?, causes the automaton to become disconnected: because a? ∈ PredG∗ (b) and sink ∈ SuccG∗ (b) − {b} it deletes (a?, sink)— +the only edge linking src to sink. As such, the accepted language changes from +L((a + b)+ ?) to ∅. This clearly illustrates that the OPTIONAL r? rule in Figure 10 +is unsound. For that reason, we have moved in this article to the new rewrite +rules in Figures 4–6. +It is peculiar, however, that we have extensively used the rewrite rules of +Figures 10 together with the repair rules in Figure 13 in a prototype implementation but have never encountered a situation where: +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:28 + +• + +G. J. Bex et al. + +Fig. 11. A problematic sequence of rewrite steps using the rules in Figure 10. The input SOA +accepts the same language as (a+b)+ ?, or, equivalently (a? . b?)+ . Note that the automaton resulting +from by the second rewrite step is disconnected and hence accepts the empty language. Rewriting +is therefore not sound. + +Fig. 12. A succesfull sequence of rewrite steps using the rules in Figure 10. The input SOA accepts +the same language as (a + b)+ ?, or, equivalently (a? . b?)+ . + +—we obtained a SORE r that failed to accept at least all words in the input +SOA G; or +—we obtained a SORE r that accepted a strict superset of L(G) when G was +equivalent to a SORE. +We suspect that this behavior is due to the strict order in which we apply the +rewrite rules in our implementation: first CONCATENATION, then DISJUNCTION, +then SELF-LOOP, and finally OPTIONAL. To illustrate, Figure 12 shows a successful +rewriting of the SOA accepting (a + b)+ ? under this order. +The inference algorithm of Bex et al. [2006], which we shall call RWR0 in this +article, is shown in Algorithm 6. It is based on the rewrite rules in Figure 10 +and the repair rules in Figure 13. The experiments in Section 7 indicate that +0 +2 +RWR has no benefits over RWR and RWR . Moreover, as we do not have a formal +soundness and completeness proof showing that rewriting always produces a +SORE equivalent to the input SOA (provided that such a SORE exists) under +this order, it does not make much sense to consider RWR0 for the class of SOREs. +In strong contrast, on the class of k-occurrence regular expressions (k > 1), RWR0 +can make a difference over RWR and RWR2 [Bex et al.]. So even without formal +guarantees, RWR0 still has its its merits. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:29 + +Algorithm 6. RWR0 +Input: a SOA G +Output: a SORE r +1: if sink is not reachable from src in G then +2: +return ∅ +3: else if E(G) = {(src, sink)} then +4: +return ε +5: else +6: +initialize done to false +7: +while not done do +8: +if there a rewrite rule in Figure 10 is applicable then +9: +rewrite G, giving precedence to CONCATENATION, then DISJUNCTION, then SELFLOOP, then OPTIONAL +10: +else if a repair rule in Figure 13 is applicable then +11: +repair G, giving precedence to ENABLE-DISJUNCTION, then ENABLE-OPTIONAL-1, +then ENABLE-OPTIONAL-2 +12: +else +13: +set done to true +14: +end if +15: +end while +16: +if G is final then +17: +return the corresponding regular expression r +18: +else +19: +return ∅ +20: +end if +21: end if + +6. INFERRING CHARES: CRX +In this section, we present the algorithm CRX for the inference of chain regular +expressions (CHAREs). +Definition 26 (CHAREs ). The class of chain regular expressions consists of +those SOREs of the form f1 · · · fn where every fi is a chain factor—an expression +of the form (a1 + · · · + ak), (a1 + · · · + ak)?, (a1 + · · · + ak)+ , or, (a1 + · · · + ak)+ ? with +k ≥ 1 and every ai is an alphabet symbol. +For instance, the expression a(b+c)+ ?d+ (e + f )? is a CHARE, while (ab+c)+ ? +and (a+ ? + b?)+ ? are not. +Since each CHARE is a concatenation of alphabet-disjoint chain factors, +every occurrence of an alphabet symbol in a word must be generated by the +same chain factor in the target CHARE. The positional relationships between +occurrences of alphabet symbols in a given sample then allow us to deduce +which chain factors are present in the target CHARE, and how they are ordered. +Example 27. Consider the sample S = {u, v, w} with u = abd, v = bcdee, +and w = cade. Clearly a occurs before b in u, b occurs before c in v, and c occurs +before a in w. In the target CHARE, therefore, a, b, and c must belong to the +same chain factor which can only be (a + b + c)+ or (a + b + c)+ ?. Since one of +{a, b, c} is present in every word of S, we choose (a + b + c)+ . Similarly, d and +e form chain factors by themselves. Whereas d occurs once in every word in S, +e can occur zero, one, or more times. Therefore, d is represented by the chain +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:30 + +• + +G. J. Bex et al. + +Fig. 13. Repair rules accompanying the rewrite rules in Figure 10. These rules are a correction +of the rules presented in Bex et al. [2006]. Repairs are tried in the order shown. In particular, +ENABLE-OPTIONAL-2 is only applied if none of the other rules is applicable. + +factor d, while e is represented by the chain factor e+ ?. Since a, b, c always occur +before d, which in turn always occurs before the e’s, the derived CHARE is then +(a + b + c)+ de+ ?. +So, in brief, CRX computes chain factors, orders them, and uses that order to +generate a CHARE. Of course, the order of the chain factors is not necessarily +linear. In that case, a linear order can be constructed by making the factors +optional. Some care has to be taken, however, to generate factors that are +disjunctions without repetitions. +Definition 28. Let S be a sample. We denote by → S the partial preorder on + such that a → S b if, and only if, a immediately precedes b in some w ∈ S. +(I.e., ab is a 2-gram of S.) We say that a occurs before b in S if a →∗S b, where +→∗S is the reflexive and transitive closure of → S. +For instance, Figure 14 illustrates → S when S = {abccde, cccad, bf egg, +bf ehi}. +Definition 29. Define a ≈ S b if a occurs before b in S and b occurs before a. +That is, a ≈ S b if a →∗S b and b →∗S a. +Clearly, ≈ S is an equivalence relation. Let  S denote the set of equivalence classes of ≈ S. In what follows, we denote such equivalence classes by, for +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:31 + +Fig. 14. The partial preorder → S for S = {abccde, cccad, bf egg, bf ehi}. + +Fig. 15. The Hasse diagram HS of the sample S = {abccde, cccad, bf egg, bf ehi}. The corresponding +partial preorder from which HS is derived is shown in Figure 14. + +example, [a1 , . . . , an]. As usual, an equivalence class of cardinality 1 is called a +singleton. +Definition 30. The Hasse diagram of S, denoted HS, is the graph over  S +in which there is an edge from equivalence class [a1 , . . . , an] to class [b1 , . . . , bm] +if: (1) [a1 , . . . , an] and [b1 , . . . , bm] are distinct and (2) there exists 1 ≤ i ≤ n and +1 ≤ j ≤ m such that ai → S b j . +For instance, the Hasse diagram of the sample S = {abccde, cccad, bf egg, +bf ehi} is shown in Figure 15. The operation of CRX is then shown in Algorithm 7 +and illustrated in the following example. +Example 31. Consider again the sample S = {abccde, cccad, bf egg, bf ehi} +and its corresponding Hasse diagram in Figure 15. Since Pred HS ([d]) = +Pred HS ([ f ]) and Succ HS ([d]) = Succ HS ([ f ]), line 3 applies to [d] and [ f ]. Although +Pred HS ([g]) = Pred HS ([h]), step 2 cannot be applied as Succ HS ([g]) = Succ HS ([h]). +Similarly [g] and [i] share successors, that is, ∅, but have different predecessors. +Hence, after the while loop in line 2 we obtain: + +A possible topological sort is [a, b, c], [d, f ], [e], [g], [h], [i]. Since at least one of +a, b, and c occurs once or more in every string of W, r([a, b, c]) = (a + b + c)+ is +the first factor; the second factor is (d + f ) since either d or f occurs exactly +once; the factor derived from [e] is e? since W contains a string without e +and similarly for those from [h] and [i]. Finally, g occurs multiple times in a +single string. Hence the simple regular expression derived by the algorithm is +(a + b + c)+ · (d + f ) · e? · g+ ? · h? · i? which completes step 6. +Note that the order of the chain factors in the CHARE depends on the +topological sort. +THEOREM 32. +L(S). + +Given a sample S, CRX computes a CHARE r such that S ⊆ + +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:32 + +• + +G. J. Bex et al. + +Algorithm 7. CRX +Input: a sample S +Output: a CHARE r such that S ⊆ L(r) +1: Compute the set  S of equivalence classes of ≈ S +2: while a maximal set of singleton nodes γ1 , . . . , γ such that Pred HS (γ1 ) = · · · = +Pred HS (γ ) and Succ HS (γ1 ) = · · · = Succ HS (γ ) exists do +3: +Replace γ1 , . . . , γ by γ := ∪j=1 γ j , and redirect all incoming and outgoing edges of +the γi to γ in HS +4: end while +5: Compute a topological sort γ1 , . . . , γk of the nodes +6: for all i ∈ {1, . . . , k} (γi = [a1 , . . . , an]) do +7: +if every w ∈ S contains exactly one occurrence of a symbol in {a1 , . . . , an} then +8: +r(γi ) := (a1 + · · · + an) +9: +else if every w ∈ S contains at most one occurrence of a symbol in {a1 , . . . , an} +then +10: +r(γi ) := (a1 + · · · + an)? +11: +else if every w ∈ S contains at least one of a1 , . . . , an and there is a word that +contains at least two occurrences of symbols then +12: +r(γi ) := (a1 + · · · + an)+ +13: +else +14: +r(γi ) := (a1 + · · · + an)+ ? +15: +end if +16: +return r(γ1 ) . r(γ2 ) . · · · . r(γk) +17: end for + +PROOF. The theorem follows almost immediately from the construction. +Clearly, CRX always outputs a CHARE. Moreover, observe that after step 5 +the computed topological sort is consistent with the order of the symbols in the +words in S. More precisely, there can not exist symbols a and b, such that a ∈ γi , +b ∈ γ j , i < j, and b →∗S a. Subsequently, for each γi a chain factor is chosen +in such a manner that it is consistent with all words w ∈ S. As these factors +are ordered consistently with the order of the symbols in S, this implies that +S ⊆ L(r). +Furthermore, on the class of CHAREs, CRX is complete. +THEOREM 33. +L(CRX(S)). + +For each CHARE r there is a sample S such that L(r) = + +PROOF. Denote by Sym(r) the set of alphabet symbols occurring in r. We also +abuse notation and, for a sample S, write Sym(S) to denote the set of alphabet +symbols occurring in S. Let r = f1 · · · fk be a CHARE, with each fi a chain +factor. We construct the sample S such that the CRX(S) is syntactically equal to +r, up to commutativity of +. The theorem then follows. +Thereto, for every 1 ≤ i ≤ k, let wi be a word in L( fi ). We construct S by +subsequently adding words to it. First, for all 1 ≤ i ≤ k − 1, a ∈ Sym( fi ), +b ∈ Sym( fi+1 ), we add w1 · · · wi−1 abwi+2 · · · wk to S. Further, for all 1 ≤ i ≤ k, +we add words to S, depending on the form of fi . Specifically, if fi is of the +form: +—(a1 + · · · + an), we add w1 · · · wi−1 a1 wi+1 · · · wk; +—(a1 + · · · + an)?, we add w1 · · · wi−1 wi+1 · · · wk, and w1 · · · wi−1 a1 wi+1 · · · wk; +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:33 + +—(a1 + · · · + an)+ , we add w1 · · · wi−1 a1 a1 wi+1 · · · wk; +—(a1 + · · · + an)+ ?, we add w1 · · · wi−1 wi+1 · · · wk, and w1 · · · wi−1 a1 a1 wi+1 · · · wk. +We now argue that given S, CRX indeed derives an expression syntactically +equal to r. First observe that already before step 3, CRX computes k nodes γ1 to +γk, which are linearly ordered, such that for each 1 ≤ i ≤ k, γi contains exactly +the alphabet symbols contained in fi . Then, due to the number of occurrences +of each symbol of the different chain factors, the algorithm will associate to +each γi exactly the factor fi , and hence CRX(S) is syntactically equivalent to r, +up to commutativity of +. +From Theorems 32 and 33 it readily follows that we have the next corollary. +COROLLARY 34. + +CRX learns the class of CHAREs from positive data. + +The experiments in Section 7.3 show that the number of words in S needed +in practice is very small. Actually, the prime feature that makes CRX much +more robust than RWR for very small datasets is its strong generalization ability. Indeed, consider an expression of the form (a1 + · · · + an)+ ?. While REWRITE +requires all n2 2-grams of the form ai a j for i, j ∈ {1, . . . , n} to be present, RWR +requires around (n2 − n) 2-grams. For CRX, however, the set {ε, a1 a2 , a2 a3 , . . . , +an−1 an, ana1 } of size O(n) will suffice. This point is illustrated in practice +by example3 and example4 in Table II where n has a value of 41 and 56, +respectively. Experiments illustrate that only 400  1682 and 500  3136 +2-grams are needed by CRX to learn example3 and example4, respectively. +The following theorem shows that CRX is optimal within the class of CHAREs +when the partial order  S is in fact a linear order. +THEOREM 35. For every sample S, if  S is a linear order then for every +CHARE r such that S ⊆ L(r) and L(r) ⊆ L(CRX(S)), we have r = CRX(S), that is, r +is syntactically equal to CRX(S) up to commutativity of +. +PROOF. Assume that CRX(S) = f1 · · · fk and r = g1 · · · gl . Clearly, +Sym(CRX(S)) = Sym(r) = Sym(S). We first argue that k = l. Thereto, assume +for the purpose of contradiction that k < l. Then, there is a chain factor f in +CRX(S) with a, b ∈ Sym( f ) and two chain factors g and g in r with a ∈ Sym(g) +and b ∈ Sym(g ). We distinguish two cases. +(1) If f is of the form (a1 + · · · + an) or (a1 + · · · + an)?, then L(r) ⊆ L(CRX(S)). +(2) If f is of the form (a1 + · · · + an)+ ? or (a1 + · · · + an)+ , by construction and +since  S is linearly ordered, there are words u1 , u2 ∈ S such that a →∗u1 b +and b →∗u2 a. However, since a and b are in different chain factors of r, +/ L(r) or u2 ∈ +/ L(r), and hence S ⊆ L(r). +either u1 ∈ +Conversely, assume k > l. Then, there are chain factors f, f in CRX(S) with +a ∈ Sym( f ) and b ∈ Sym( f ), and a chain factor g in r with a, b ∈ Sym(g). We +again distinguish two cases. +(1) If g is of the form (a1 + · · · + an)+ ? or (a1 + · · · + an)+ , then L(r) ⊆ L(CRX(S)). +(2) If g is of the form (a1 +· · ·+an) or (a1 +· · ·+an)?, by construction and since  S +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:34 + +• + +G. J. Bex et al. + +is linearly ordered, there are words u1 , . . . , um ∈ S, and symbols c1 , . . . , cm−1 +such that a →∗u1 c1 , cm →∗um b, and ci →ui+1 ci+1 , for all 1 ≤ i ≤ m − 1. +/ L(r) must +However, due to the form of g, for at least one of these ui , ui ∈ +hold and hence S ⊆ L(r). +Using the same kind of argument it can be shown that Sym( fi ) = Sym(gi ), +for all 1 ≤ i ≤ k. Further, since L(r) ⊆ L(CRX(S)), for every 1 ≤ i ≤ k, we +have L(gi ) ⊆ L( fi ). Since the different chain factors can only take a restricted +numbers of forms, it now suffices to show that L(gi ) = L( fi ), for all i, to show that +they are also syntactically equivalent. Hence, towards a contradiction, assume +L(gi )  L( fi ) for some 1 ≤ i ≤ k. This can only be the case if: (1) gi = (a1 +· · ·+an) +and fi = (a1 + · · · + an); (2) gi = (a1 + · · · + an)+ ? and fi = (a1 + · · · + an)+ ; or +(3) gi = (a1 + · · · an)? and fi is one of the three other forms. However, in each of +these cases, given the construction of the algorithm, one can find a word w ∈ S +such that w ∈ +/ L(r). Hence, for all i, L( fi ) = L(gi ), and thus r = CRX(S). +Note that this property does not hold when  S is not linear. For instance, on +S = {abc, ade, abe} CRX yields a·b?·d?·c?·e? whereas the CHARE a·(b+d)·(c +e) +is a better approximation of the target language. +CRX can be efficiently executed on very large datasets by only maintaining +HS and the multiplicities of occurrences of -symbols in words in S (needed for +lines 6–13). From this representation, lines 2–5 can be executed. Hence, it is +not necessary that the entire sample resides in main memory. The complexity +of the algorithm is O(m + n3 ), where m is the size of the sample and n the +number of alphabet symbols. +7. EXPERIMENTAL EVALUATION +In this section we validate our approach by means of experimental analysis. +Specifically, we assess the quality of the expressions returned by our algorithms +on real-world corpora and DTDs, and compare it with the quality of expressions +returned by XTRACT [Garofalakis et al. 2003] and Trang [Clark]. Next, we compare the quality of RWR0 (the algorithm found in the conference version of this +article), RWR, and RWR2 . Subsequently, we investigate the performance of the algorithms on incomplete and noisy data. Finally, we discuss their running time +performance. We abuse notation and simply write RWR for the application of +2T-INF followed by RWR, similarly for RWR0 and RWR2 . All experiments were performed using a prototype implementation of our algorithms in Java executed +on a 2.5 Ghz Pentium 4 machine with 1GB of RAM. +7.1 Real-World Examples +The number of publicly available XML corpora is rather limited. We employed +the XML Data repository maintained by Miklau [2002] as a testbed. Unfortunately, most of the corpora listed there are either very small, lack a DTD, +or contain a DTD with only trivial regular expressions. Nevertheless, two of +the listed corpora are interesting. Specifically, we compared XTRACT, RWR, and +CRX on the Protein Sequence Database (683Mb in size) and the Mondial corpus +[Miklau 2002], a database of information on various countries (1Mb in size). +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:35 + +Table I. Results of RWR, CRX and XTRACT on DTDs and Sample Data from +the Protein Description Database and the Mondial Corpora +Element +Original DTD +Sample +Result of CRX/ RWR +size +Result of XTRACT +ProteinE. +a1 a2 a3 a4 + ?a5 + ?a6 + ?a7 + ?a8 + ?a9 ?a10 ?a11 + ?a12 a13 +2458 +a1 a2 a3 a4 + a5 + ?a6 + ?a7 + ?a8 + ?a9 ?a10 ?a11 + ?a12 a13 +843 +an expression of 185 tokens +organism +a1 a2 ?a3 a4 ?a5 + ? +9 +a1 a2 ?a3 a4 ?a5 + ? +9 +a1 ((a2 a3 a4 ?+a3 a4 )a5 ?+a3 a5 + ?) +reference +a1 a2 + ?a3 + ?a4 + ? +45 +a1 a2 + ?a3 + ?a4 + ? +45 +a1 (a2 + ?(a4 + ?+a3 + ?)+a2 a3 + ?a4 a4 +a3 + ?a4 + ?) +refinfo +a1 a2 a3 ?a4 ?a5 a6 ?(a7 +a8 )?a9 ? +10 +a1 a2 (a3 +a4 )?a5 a6 ?a7 ?a9 ?a8 ? +10 +a1 a2 ((a3 a5 a6 a7 ?+a4 a5 )a9 ?+a5 (a7 +a8 )?+a4 a5 a8 ) +authors +a1 + +(a2 a3 ?) +54 +a1 + ?a2 ?a3 ? / +a1 + +(a2 a3 ) +54 +a1 + ?+a2 a3 +accinfo +a1 a2 + ?a3 + ?a4 ?a5 ?a6 ?a7 + ? +124 +a1 a2 + ?a3 + a4 ?a5 ?a6 ?a7 + ? +124 +an expression of 97 tokens +genetics +a1 + ?a2 ?a3 ?a4 ?a5 ?a6 ?a7 ?a8 ?a9 ?a10 ?a11 + ?a12 + ? +219 +a1 + ?a2 ?a3 ?a4 ?a5 ?a6 ?a7 ?a8 ?a9 ?a10 ?a12 + ? +219 +an expression of 329 tokens +function +a1 ?a2 + ?a3 + ? +26 +a1 ?a2 + ?a3 + ? +26 +(a1 (a2 ?a2 ?a3 + ?+a2 + ?(a3 a3 )+ ?+a2 a2 a2 a3 )+a2 (a2 a3 + ?+a3 + ?)) +city +a1 a2 + ?a3 + ? +9 +a1 a2 + ?a3 + ? +9 +a1 (a2 + ?a3 a3 ?+a2 (a3 + ?+a2 ))? +The left column gives element names, sample size for CRX/ RWR, and sample size for +XTRACT, respectively. The right column lists original DTD, inferred DTD by CRX/ RWR, +and the result of XTRACT, in that order. + +Since no real-world data could be obtained for SOREs that are not CHAREs, +we generated our own XML data for a number of real-world DTDs considered +in Bex et al. [2004] containing a number of sophisticated regular expressions +outside the class of CHAREs. +Real-world data. In this section, we only discuss RWR as RWR0 and RWR2 give +precisely the same results. Table I lists all nontrivial element definitions2 in +the aforementioned DTDs together with the results derived by the inference +algorithms RWR, CRX, and XTRACT. It is interesting to note that only the regular +expression for authors is not a CHARE. Moreover, no elements are repeated +in any of the definitions. This should not come as a surprise given the observations discussed in the Introduction on the content models occurring in practice. +The regular expression derived by the XTRACT algorithm is shown whenever +it fitted the table, otherwise the number of tokens it consists of is listed. For +better readability the actual output of XTRACT has been simplified by replacing +expressions such as (ai + ε) by ai ?. +2 It should be noted that the examples from the Mondial corpus are not valid according to their +DTD, so for the city element only valid elements were used as training examples. + +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:36 + +• + +G. J. Bex et al. + +It can be verified that all regular expressions in Table I are learned quite +satisfactory by RWR and CRX with respect to the examples extracted from the +XML corpus. The numbers in the first column refer to the size of the sample. +RWR and CRX always produce the same result except for authors where CRX +cannot derive the target expression as it is not a CHARE. We note that no +sample was representative of its target expression. As such, RWR always had to +apply repair rules. The expressions in the table indicate that the result of these +repairs are satisfactory. For a few expressions, for instance, ProteinE(ntry), +refinfo, and genetics, the expressions produced by CRX and RWR are more +strict than the corresponding one in the DTD. This is due to the data present +in the sample. For instance, for genetics, no a11 element occurs in the sample +so it obviously cannot be part of the derived expression. The element refinfo +illustrates that a3 and a4 are mutually exclusive in the sample and that a8 is +never followed by a9 . Inspecting the original DTD illustrates the underlying +semantics. +authors, citation, volume?, month?, year, +pages?, (title | description)?, xrefs? +Indeed, volume is used in the context of a journal, while month is used for a +conference publication. Apart from the authors element XTRACT either produces +a suboptimal expression or no expression at all. For instance, XTRACT crashes on +the ProteinE(ntry) sample due to excessive memory consumption (more than +1GB of RAM). Reducing the size of the sample to approximately 800 unique +words yields a complex expression of 185 tokens. +Real-world regular expressions. Table II lists the results of the algorithms on +a number of more sophisticated regular expressions extracted from real-world +DTDs discussed in Bex et al. [2004]. Since no real-world data was available +for those DTDs, we have randomly generated samples using ToXgene [Barbosa +et al. 2002], taking care that all relevant examples where present to ensure +the target expression could be learned. Again, we list the sample size in the +first column. As some of these numbers might seem artificially large, we note +that, for instance, the SOA corresponding to example3 already contains 1897 +edges. Hence, a random dataset of 5741 words is not unreasonably large. Note +that only the first three expressions in Table II are SOREs, none of them +is a CHARE. The table shows clearly that CRX yields fairly good and concise +super-approximations to the original expressions. In some cases, the results +produced by RWR are more precise. For XTRACT, the size of the sample had to be +limited to 300–500 in order to avoid a crash. As can be seen from the table, +XTRACT performed excellently on the first example, but failed to generate an +expression that fitted the table in all other cases on all the sample sets we +tried. +Trang. We ran Trang [Clark] on the XML data discussed in this section. +In all but one case, Trang produced exactly the same output as CRX, with a +notable exception: for example1 Trang’s output depends on the order in which +the examples are presented, yielding either a1 + ?a2 ?a3 + ? or a1 + + (a2 ?a3 + ). The +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:37 + +Table II. Results of RWR, CRX and XTRACT on +Nonsimple Real-World DTDs and Generated Data +Original DTD +Element +Result of CRX +Sample +Result of RWR +size +Result of XTRACT +example1 +a1 + + (a2 ?a3 + ) +48 +a1 + ?a2 ?a3 + ? +48 +a1 + + (a2 ?a3 + ) +48 +a1 + ? + (a2 ?a3 + ?) +example2 +(a1 a2 ?a3 ?)?a4 ?(a5 + · · · + a18 )+ ? +2210 +a1 ?a2 ?a3 ?a4 ?(a5 + · · · + a18 )+ ? +2210 +(a1 a2 ?a3 ?)?a4 ?(a5 + · · · + a18 )+ ? +300 +an expression of 252 tokens +example3 +a1 ?(a2 a3 ?)?(a4 + · · · + a44 )+ ?a45 + +5741 +a1 ?a2 ?a3 ?(a4 + · · · + a44 )+ ?a45 + +5741 +a1 ?(a2 a3 ?)?(a4 + · · · + a44 )+ ?a45 + +400 +an expression of 142 tokens +example4 a1 ?a2 a3 ?a4 ?(a5 + + ((a6 + · · · + a61 )+ a5 + ?)) +10000 +a1 ?a2 a3 ?a4 ?(a6 + · · · + a61 )+ ?a5 + ? +10000 +a1 ?a2 a3 ?a4 ?(a6 + · · · + a61 )+ ?a5 + ? +500 +an expression of 185 tokens ++ +example5 +a1 (a2 + a3 )+ ?(a4 (a2 + a3 + a5 )+ ?) ? ++ +1281 +a1 (a2 + a3 + a4 + a5 ) ? ++ +1281 +a1 ((a2 + a3 + a4 )+ a5 + ?) ? +500 +an expression of 85 tokens +The left column gives element names, sample size for CRX, +RWR and XTRACT, respectively. The right column lists original +DTD, inferred DTD by CRX, by RWR and the result of XTRACT, +in that order. + +former is the same output as CRX, the latter is the intended RE that cannot +be derived by CRX as it is outside the class of CHAREs. This inconsistency in +Trang’s output casts some doubt on its correctness and underscores the need +for a formal model as the cornerstone of an implementation. Indeed, there is no +article or manual available describing the machinery underlying Trang. A look +at the Java-code indicates that Trang is related to, but different from, CRX: it +uses 2T-INF to construct an automaton, eliminates cycles by merging all nodes +in the same strongly connected component, and then transforms the obtained +DAG into a regular expression. However, no target class of REs for which Trang +is complete, as is the case for CRX, is specified. As Trang is similar to CRX, it is +outperformed by RWR and RWR2 . +7.2 RWR versus RWR2 +We tested the results and performance of RWR versus RWR2 for various values +of the rank cut-off parameter . The SOAs used in this test were randomly +generated with 5 and 10 alphabet symbols. The results are summarized in +Table III(a). We computed the average language size of the SOAs, which is the +target size. It should be noted that since no SORE corresponds to these SOAs, +the target size can never be attained since the regular expression resulting +from RWR or RWR2 will necessarily be a generalization of the SOA’s language. +It is immediately clear from Table III(a) that results of RWR2 are on average +better than those for RWR, and that they improve with increasing values of . +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:38 + +• + +G. J. Bex et al. +Table III. +(a) +|| = 5 || = 10 +target size 0.52 +0.67 +0 + +RWR + +RWR + +0.88 +0.80 + +0.98 +0.96 + +0.76 +0.73 +0.725 +0.722 +0.721 +0.720 + +0.95 +0.92 +0.916 +0.911 +0.908 +N/A + +2 + +RWR + +1 +2 +3 +4 +5 +∞ + +(b) +RWR || = 5 || = 10 + +2 + +1 +2 +3 +4 +5 +∞ + +28.8% +7.6% +3.2% +1.3% +0.7% +24.6% + +46.3% +7.3% +1.2% +0.0% +0.0% +N/A + +(a) Average language size for RWR and RWR2 for various values of +.  = ∞ denotes an exhaustive exploration of all possible repairs. +(b) Percentage of target expressions for which RWR outperforms RWR2 . + +For expressions of alphabet size 5, we were able to consider all possible repairs, +resulting in the entry for  = ∞ in Table III(a). This represents the smallest +language that includes the SOA’s language and that can be expressed by a +SORE. +Of course, the results in Table III(a) are averaged over 1000 randomly chosen +SOAs. A more detailed analysis reveals that for a considerable number of SOAs, +2 +RWR actually outperforms RWR for  = 1. Table III(a) shows the number of +2 +times RWR outperforms RWR for various values of . The probability that RWR +outperforms RWR2 drops rapidly for increasing values of , especially for larger +alphabet sizes. The last line in Table III(b) lists the probability that RWR derives +the optimal result, that is, that the smallest language representable by a SORE +is obtained for expressions of alphabet size 5. +Although the RWR2 algorithm clearly outperforms RWR in terms of the language size of the derived expression, there is a compelling argument in the +latter’s favor. In terms of running time, RWR outperforms RWR2 with a few orders of magnitude as is discussed in Section 7.5. +7.3 Incomplete Data +Unfortunately, in a real-world setting an available sample may simply contain +too little information to learn the target regular expression. To formalize this, +we introduce the notion of coverage. +Definition 36. A sample S covers a deterministic automaton A if for every +edge (s, t) in A there is a word w ∈ S whose unique accepting run in A traverses (s, t). Such a word w is called a witness for (s, t). A sample S covers a +deterministic regular expression r if it covers the automaton obtained from S +using the Glushkov construction for translating regular expressions into automata [Brüggeman-Klein 1993]. +If a sample S does not contain a witness for an edge, it may seem as if +the target expression cannot be learned, even if it is a SORE since the SOA +derived from the data has an edge missing. However, the repair rules introduce +extra edges, so this part of the algorithm may actually alleviate the problem of +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:39 + +Table IV. Percentage of +Successfully Derived Expressions +at Various Values of Sample +Coverage for CRX, RWR0 , RWR and +2 +1 + +RWR + +coverage CRX RWR0 RWR RWR21 +25.0 +85% 56% 12% 73% +35.0 +87% 48% 32% 73% +45.0 +96% 60% 57% 74% +55.0 +87% 58% 63% 57% +65.0 +82% 48% 58% 59% +75.0 +80% 51% 51% 63% +85.0 +63% 48% 47% 53% +92.5 +57% 48% 47% 61% +97.5 +85% 74% 64% 73% +100.0 +100% 100% 100% 100% + +incomplete data. This is indeed confirmed experimentally. It turns out that even +with a substantial fraction of missing witnesses, the target regular expression +can be learned with an astonishing degree of success. To quantify the missing +information, we introduce the following definition: +Definition 37. The coverage of a sample with respect to a target expression +r is the ratio of the number of edges of the SOA derived from the sample and +the SOA representing the target expression r. +The tests were done on 100 real-world regular expressions of alphabet sizes +up to 10, for 10 independently selected samples of varying coverage. The results are presented in Table IV. The straightforward CRX clearly outperforms all +other algorithms, although this result should be approached with some caution: +to give CRX a fair chance, the target expressions for this algorithm were limited +to CHAREs, while the other algorithms were tested on general SOREs as well. +Note that approximately 90% of real-world expressions are in fact CHAREs, +hence its superior performance is not only due to simpler target expressions. +The robustness of RWR21 is quite remarkable since it tends to derive more specific +regular expressions than RWR0 and RWR. One would expect the generalization +ability to decrease for algorithms that yield more specific results. This expectation is borne out when one compares RWR0 and RWR, however, RWR21 ’s greedy +application of the repair rules seems to pay off in the context of incomplete data +as well. +7.4 Noise +As already noted in the Introduction, real-world samples (such as XHTML) +need not be valid with respect to its known schema. Errors crop up due to +all sorts of circumstances. This underscores the need for a robust inference +algorithm that can handle some noise in the input sample. +Noise can come in several forms. To generate a noisy subsample, we modify +the target expression either by replacing a symbol by a different one from the +target’s expression, or by replacing it by a symbol that is not in the alphabet of +the target expression. We than use the modified target expression to generate +a complete sample. We define the noise level as follows. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:40 + +• + +G. J. Bex et al. + +Definition 38. Given a target expression r, the noise level of a sample S is +the ratio |S− L(r)|/|S|. +Here we propose an approach to filter the sample S based on the probability +of its words being generated by a probabilistic automaton, as we already used +in previous work [Bex et al. 2008]. This probabilistic automaton has one state +for each alphabet symbol, and the transition probabilities are computed using +the Baum-Welsh algorithm [Rabiner 1989]. Given the probabilistic automaton, +it is straightforward to compute the probability for each w ∈ S, so that one can +rank the sample’s words. One expects words that contain noise, that is, that +would be rejected by the target regular expression, to have low probability if +their number is not excessively large compared to the sample’s size. +To filter the sample, hoping to exclude those words that contain noise, we +compute the mean μ and standard deviation σ of the sample’s probabilities. A +string w ∈ S with probability P(w) is excluded if P(w) < μ − ασ . The factor α +is a parameter of the algorithm. The filtered sample S is now used to derive +a regular expression. It is of course possible that in the generation of S some +words needed to derive the target expression were removed. Hence there is no +guarantee that the derived regular expression will be an overapproximation of +the target expression. +Since it was shown in previous sections that RWR21 has the best overall performance, we focus solely on this algorithm in this section. In order to investigate +how robust RWR21 is with respect to noise we applied the algorithm to samples S +with increasing noise levels with a range of values for the cut-off α. We compute +the precision and the recall for each individual expression and use the average +values over many expressions to compute the F-value for a given noise level +and cut-off so that the optimal cut-off point can be determined. +To define precision and recall, consider the sample S = Svalid ∪ Sinvalid , where +Svalid ⊆ S contains the words in S accepted by the target expression and Sinvalid +contains the words in S not accepted by the target expression. A true positive is +a word in Svalid that is accepted by the derived expression, while a false negative +is a word in Svalid that is rejected by the derived expression. Similarly, a false +positive is a word in Sinvalid that is accepted by the derived expression, while a +true negative is a word in Sinvalid that is rejected by the derived expression. We +denote by St.p. the set of true positives, by St.n. the set of true negatives, by Sf .p. +the set of false positives, and by Sf .n. the set of false negatives. +Definition 39. The precision p, recall r, and F-value of a derived regular +expression on a sample S are given by +p= + +|St.p. | +, +(|St.p. | + |Sf .p. |) + +r= + +|St.p. | +, +(|St.p. | + |Sf .n. |) + +F= + +2 pr +. +p+r + +Furthermore, we are interested in the fraction of derived regular expressions +that is equivalent to the target expression. +We average over 580 SOREs obtained from a corpus of real-world DTDs. +The results are shown in Figure 16(a). From the F-value we can conclude +that a cut-off value α F ≈ 0.7 yields the best balance between precision and +recall. Figure 16(b) shows the fraction of derived regular expressions that is +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:41 + +Fig. 16. (a) F-value as a function of the cut-off value α for noise levels of 0.01 (squares), 0.02 +(circles), and 0.05 (triangles). (b) Fraction of derived expressions equivalent to the target expression +as a function of the cut-off value α for noise levels of 0.01 (squares), 0.02 (circles), and 0.05 +(triangles). + +equivalent to the target expression. For noise levels increasing from 0.01 to +0.05, the F-value as well as the percentage of derived expressions equivalent +to the target expression gradually decreases, as is to be expected. It should be +noted that recall r < 1 implies that the language represented by the derived +regular expression is not a superset of the target’s language. For the cut-off α F , +and a noise level of 0.01, approximately 16% of the derived regular expressions +allow false negatives, while the value for a noise level of 0.05 is 15%. The fact +that the derived expression is not a super-approximation may or may not be +acceptable, depending on the application. +Another interesting observation is that the number of derived expressions +that is equivalent to the target expression increases beyond the cut-off value +α F ; see Figure 16(b). For a noise level of 0.01, this trend continues up to +cut-off values of αequiv. ≈ 0.3 where it reaches a maximum of approximately +53%. However, at this value 20% of the derived regular expressions are not +super-approximations to their target expressions. For α < αequiv. , the F-value +decreases rapidly. For higher noise levels, the optimal cut-off value αequiv. is +smaller, but since it is very unlikely that one knows the noise level, it is hard +to take advantage of this fact by tuning αequiv. to a specific noise level. The +overall best result will be obtained for αequiv. ≈ 0 for noise levels not exceeding +0.05. +It should be noted that for a noise level of 0.01 at αequiv. , out the 53% of derived +regular expression that are equivalent to the target expression, about 7% is +not covered by the sample. The latter illustrates once more the generalization +ability of the algorithms RWR2 as was discussed in Section 7.3. +7.5 Performance +As mentioned previously, the one advantage RWR has over RWR2 is that the +former’s running time is much lower than the latter’s. This is illustrated in +Table V(a) for 1000 target expressions of alphabet size 10. It also shows the +relative running time for RWR0 , illustrating that RWR outperforms both RWR0 and +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:42 + +• + +G. J. Bex et al. +Table V. +(a) +relative running time +0 +RWR +6 · 102 +2 + +RWR + +1 +2 +3 +4 +5 + +2 · 102 +2 · 103 +1 · 104 +4 · 104 +1 · 105 + +(b) +|| time (ms) +5 +2 +10 +5 +15 +15 +20 +33 +50 +616 +100 +7562 + +(a) Relative running times of RWR2 versus RWR for various +values of . (b) Average running times in milliseconds for RWR +as a function of alphabet size. + +2 +2 +RWR for any value of . However, it is interesting to note that RWR1 outperforms +0 +RWR by a factor of 3, and derives more specific regular expressions, again +illustrating the superiority of the new algorithms over RWR0 . + +The performance of RWR is excellent: on average it takes only ms to derive +an expression of alphabet size 10. Table V(b) shows actual running times as a +function of the target expressions’ alphabet size, averaged over 1000 random +expressions of that alphabet size. +With respect to the performance in terms of the number of examples, we +showed in the conference version of this article that RWR0 ’s was adequate to +deal with large datasets. Example4 with 61 symbols in Table II is derived from +10000 example words in 7 seconds while CRX only needs 3.2 seconds. More +typical expressions of about 10 symbols derived from a few hundred examples +take approximately a second. These figures include the time to initialize a +Java Virtual Machine while the tests are done on a 2.5 GHz P4 with 1GB +of RAM. Given that RWR and RWR21 outperform RWR0 and the time required to +start the virtual machine and parse the data is independent of the algorithm, +our new algorithms are adequate as well. For instance, RWR derived a DTD +for PubMed from 10000 articles with a total size of over 1.2GB in 264 seconds +(again including the time needed for Java initialization and parsing of the XML +data). Trang slightly outperforms CRX thanks to very efficient XML parsing. We +did not make a detailed comparison with XTRACT for the reason that XTRACT +cannot handle samples with more than 1000 words. +8. EXTENSIONS +Incremental computation. Especially in the setting of sparse data when over +time more XML data gets generated, for instance, by answers to queries or +results of calls to Web services, it is desirable to update an already generated +schema based on the newly arrived XML data only. Such an approach is possible +for both RWR and CRX: as both algorithms make use of an internal representation +(automata or partial orders), we only need to update that representation. So, for +every element name we store the corresponding internal graph representation, +which is only quadratic in the number of different element names, and we can +forget about the XML data that generated it. Actually, for CRX, to assign the +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:43 + +qualifiers ?, + and ∗, we also need to remember for each element name how +it occurs (always exactly once, always more than once, . . . ), but this is only a +constant amount of information. +Numerical predicates. An immediate drawback of SOREs is that they cannot count. For instance, they cannot express aabb+ specifying that a string +should start with two a’s followed by any number of b’s larger than 1. XML +Schema even uses dedicated attributes for expressing the desired number of +repetitions. + + + + + +In the same way, REs can be extended by numerical predicates: when r is +an RE and i is a natural number then r ≥i and r =i are also REs. They are +semantically equivalent to r i r ∗ and r i , respectively, where r i = r · r · · · · · r (i +times). The preceding expression can then be expressed as a=2 b≥2 . To both RWR +and CRX a post-processing step can be added that rewrites + and ∗ to numerical +values based on exact occurrences of element names in the XML data. +Generation of XSDs. While the inference of DTDs essentially reduces to the +inference of regular expressions from sets of sample words (as illustrated in +Section 1.1), the inference of XSDs is much more complex. +Indeed, first and foremost, the content model of an element can only depend +on the element’s name in a DTD. XML Schema, in contrast, has a typing +mechanism that allows the content model of an element to depend not only on +its name, but also on the context in which it is used. We refer the interested +reader to Martens et al. [2006, 2007] for an in-depth discussion on the XML +Schema typing mechanism and the extra expressive power that it provides with +respect to DTDs. It is important to note, however, that the study of Martens +et al. [2006] also shows that 85% of XSDs in practice does not use this additional +power, and are hence structurally equivalent to a DTD. Obviously, inferring +such XSDs is merely a matter of using the correct syntax. How to extend +schema inference to deal with real XSDs that do use the additional power of +the XML Schema typing system is studied in a companion article [Bex et al. +2007]. +Second, DTDs have essentially only one atomic data type to describe the +textual data found in XML documents: #PCDATA. XML Schema, in contrast, has +atomic data types for numbers, strings, dates, etc. The algorithms described +here can easily be extended with heuristics to recognize these atomic data +types, such as the ones described by Hegewald et al. [2006]. +Inference of k-OREs. As the vast majority of expressions used in practical +schemas are SOREs, we focused in this article on the inference of SOREs. In +a companion article [Bex et al. 2008] we study the derivation of k-OREs, for +small values of k, thus covering virtually all expressions occurring in practice. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:44 + +• + +G. J. Bex et al. + +9. CONCLUSION +We introduced novel algorithms for the inference of concise regular expressions +from positive data. For the inference of SOREs, RWR2 was shown to yield the best +experimental results. It is also quite robust when presented with incomplete +and noisy data. The quality of inferred expressions on real-world and synthetic +datasets outperforms those returned by XTRACT where CRX is similar to Trang. +CRX’ generalization ability makes it highly qualified in dealing with very small +datasets. Further, RWR, RWR2 , and CRX always infer succinct expressions by definition which can easily be interpreted by humans. Of independent interest, we +introduced a new algorithm to transform automata into short, readable regular +expressions. +ELECTRONIC APPENDIX +The electronic appendix for this article can be accessed in the ACM Digital +Library. +ACKNOWLEDGMENTS + +We thank the authors of Garofalakis et al. [2003] for making available +XTRACT’s source code, as well as Wouter Gelade for comments on a previous draft of this article. +REFERENCES +ABITEBOUL, S., BUNEMAN, P., AND SUCIU, D. 1999. Data on the Web. Morgan Kaufmann Publishers. +AHONEN, H. 1996. Generating grammars for structured documents using grammatical inference methods. Ph.D. thesis, Report A-1996-4. Department of Computer Science, University of +Helsinki. +ANGLUIN, D. AND SMITH, C. H. 1983. Inductive inference: Theory and methods. ACM Comput. +Surv. 15, 3, 237–269. +BARBOSA, D., MENDELZON, A. O., KEENLEYSIDE, J., AND LYONS, K. A. 2002. ToXgene: An extensible +template-based data generator for XML. In Proceedings of the 5th International Workshop on the +Web and Databases (WebDB 2002). 49–54. +BARBOSA, D., MIGNET, L., AND VELTRI, P. 2006. Studying the XML web: Gathering statistics from +an XML sample. World Wide Web 9, 2, 187–212. +BENEDIKT, M., FAN, W., AND GEERTS, F. 2008. XPath satisfiability in the presence of DTDs. J. +ACM 55, 2, 1–79. +BERNSTEIN, P. A. 2003. Applying model management to classical meta data problems. In Online +Proceedings of the 1st Biennal Conference on Innovative Data Systems Research (CIDR’03). +BEX, G. J., GELADE, W., NEVEN, F., AND VANSUMMEREN, S. Learning deterministic regular expressions +for the inference of schemas from XML data. http://arxiv.org/abs/1004.2372. +BEX, G. J., GELADE, W., NEVEN, F., AND VANSUMMEREN, S. 2008. Learning deterministic regular +expressions for the inference of schemas from XML data. In Proceeding of the 17th International +Conference on World Wide Web (WWW’08). 825–834. +BEX, G. J., NEVEN, F., AND DEN BUSSCHE, J. V. 2004. DTDs versus XML Schema: A practical study. +In Proceedings of the International Workshop on Web and Database (WebDB). S. Amer-Yahia and +L. Gravano, Eds. 79–84. +BEX, G. J., NEVEN, F., SCHWENTICK, T., AND TUYLS, K. 2006. Inference of concise DTDs from XML +data. In Proceedings of the International Conference on Database Theory (VLDB). U. Dayal, K.-Y. +Whang, D. B. Lomet, G. Alonso, G. M. Lohman, M. L. Kersten, S. K. Cha, and Y.-K. Kim, Eds. +ACM, 115–126. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:45 + +BEX, G. J., NEVEN, F., AND VANSUMMEREN, S. 2007. Inferring XML schema definitions from XML +data. In Proceedings of the 33rd International Conference on Very Large Data Bases (VLDB’07). +998–1009. +BRĀZMA, A. 1993. Efficient identification of regular expressions from representative examples. In +Proceedings of the 6th Annual Conference on Computational Learning Theory (COLT’93). ACM +Press, 236–242. +BRÜGGEMAN-KLEIN, A. 1993. Regular expressions into finite automata. Theor. Comput. Sci. 120, 2, +197–213. +BRÜGGEMANN-KLEIN, A. AND WOOD, D. 1998. One-Unambiguous regular languages. Inform. Comput. 140, 2, 229–253. +BUNEMAN, P., DAVIDSON, S. B., FERNANDEZ, M. F., AND SUCIU, D. 1997. Adding structure to unstructured data. In Proceedings of the International Conference on Database Theory (ICDT’97). +Lecture Notes in Computer Science, vol. 1186. Springer, 336–350. +CARON, P. AND ZIADI, D. 2000. Characterization of Glushkov automata. Theor. Comput. Sci. 233, 1– +2, 75–90. +Castor. The Castor project. www.castor.org. +CHIDLOVSKII, B. 2001. Schema extraction from XML: A grammatical inference approach. In +Proceedings of the 8th International Workshop on Knowledge Representation meets Databases +(KRDB’01). CEUR Workshop Proceedings, vol. 45. +CLARK, +J. +Trang: +Multi-Format +schema +converter +based +on +RELAX +NG. +www.thaiopensource.com/relaxng/trang.html. +COVER, R. 2003. The Cover Pages. xml.coverpages.org. +DELGADO, M. AND MORAIS, J. 2004. Approximation to the smallest regular expression for a given +regular language. In Proceedings of the, 9th International Conference on Implementation and +Application of Automata. Lecture Notes in Computer Science, vol. 3317. Springer, 312–314. +DEUTSCH, A., FERNANDEZ, M. F., AND SUCIU, D. 1999. Storing semistructured data with STORED. +In Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM +Press, 431–442. +EHRENFEUCHT, A. AND ZEIGER, P. 1976. Complexity measures for regular expressions. J. Comput. +Syst. Sci. 12, 134–146. +FERNANDEZ, M. F. AND SUCIU, D. 1998. Optimizing regular path expressions using graph schemas. +In Proceedings of the 14th International Conference on Data Engineering (ICDE’98). 14– +23. +FERNAU, H. 2004. Extracting minimum length document type definitions is NP-hard. In Proceedings of the 7th International Colloquium on Grammatical Inference: Algorithms and Applications. +Lecture Notes in Artificial Intelligence, vol. 3264. Springer, 277–278. +FERNAU, H. 2009. Algorithms for learning regular expressions from positive data. Inform. Comput. 207, 4, 521–541. +FLORESCU, D. 2005. Managing semi-structured data. ACMQueue 3, 8, 18–24. +GARCÍA, P. AND VIDAL, E. 1990. Inference of k-testable languages in the strict sense and application +to syntactic pattern recognition. IEEE Trans. Patt. Anal. Mach. Intell. 12, 9, 920–925. +GAROFALAKIS, M., GIONIS, A., RASTOGI, R., SESHADRI, S., AND SHIM, K. 2003. XTRACT: Learning +document type descriptors from XML document collections. Data Mining Knowl. Discov. 7, 23– +56. +GELADE, W. AND NEVEN, F. 2008. Succinctness of the complement and intersection of regular +expressions. In Proceedings of the 25th Annual Symposium on Theoretical Aspects of Computer +Science (STACS’08). Dagstuhl Seminar Proceedings, vol. 08001. 325–336. +GOLD, E. 1967. Language identification in the limit. Inform. Control 10, 5, 447–474. +GOLDMAN, R. AND WIDOM, J. 1997. DataGuides: Enabling query formulation and optimization in +semistructured databases. In Proceedings of the 23rd International Conference on Very Large +Data Bases (VLDB’97). 436–445. +GRUBER, H. AND HOLZER, M. 2008. Finite automata, digraph connectivity, and regular expression size. In Proceedings of the 35th International Colloquium on Automata, Languages and +Programming. Lecture Notes in Computer Science, vol. 5126. Springer, 39–50. +HAN, Y.-S. AND WOOD, D. 2007. Obtaining shorter regular expressions from finite-state automata. +Theor. Comput. Sci. 370, 1–3, 110–120. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + 11:46 + +• + +G. J. Bex et al. + +HEGEWALD, J., NAUMANN, F., AND WEIS, M. 2006. XStruct: Efficient schema extraction from multiple and large XML documents. In Proceedings of the 22nd International Conference on Data +Engineering Workshops (ICDEW’06). IEEE Computer Society, 81–97. +HINKELMAN, S. 2005. Business integration—Information conformance statements (BI-ICS). Tech. +rep., IBM DeveloperWorks. +HOPCROFT, J. AND ULLMAN, J. 1979. Introduction to Automata Theory, Languages and computation. +Addison-Wesley. +HUET, G. 1980. Confluent reductions: Abstract properties and applications to term rewriting +systems. J. ACM 27, 4, 797–821. +KOCH, C., SCHERZINGER, S., SCHWEIKARDT, N., AND STEGMAIER, B. 2004. Schema-Based scheduling of +event processors and buffer minimization for queries on structured data streams. In Proceedings +of the 30th International Conference on Very Large Data Bases (VLDB’04). 228–239. +MANOLESCU, I., FLORESCU, D., AND KOSSMANN, D. 2001. Answering XML queries on heterogeneous data sources. In Proceedings of 27th International Conference on Very Large Data Bases +(VLDB’01). 241–250. +MARTENS, W., NEVEN, F., AND SCHWENTICK, T. 2007. Simple off the shelf abstractions for XML +schema. SIGMOD Rec. 36, 3, 15–22. +MARTENS, W., NEVEN, F., SCHWENTICK, T., AND BEX, G. J. 2006. Expressiveness and complexity of +XML schema. ACM Trans. Data. Syst. 31, 3. +MCHUGH, J., ABITEBOUL, S., GOLDMAN, R., QUASS, D., AND WIDOM, J. 1997. Lore: A database management system for semistructured data. SIGMOD Rec. 26, 3, 54–66. +MELNIK, S. 2004. Generic model management: Concepts and algorithms. Ph.D. thesis, University +of Leipzig. +MIGNET, L., BARBOSA, D., AND VELTRI, P. 2003. The XML web: A first study. In Proceedings of the +12th International World Wide Web Conference. 500–510. +MIKLAU, G. 2002. XMLData repository. www.cs.washington.edu/research/xmldatasets. +MIN, J.-K., AHN, J.-Y., AND CHUNG, C.-W. 2003. Efficient extraction of schemas for XML documents. +Inform. Process. Lett. 85, 1, 7–12. +NESTOROV, S., ABITEBOUL, S., AND MOTWANI, R. 1998. Extracting schema from semistructured data. +In Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM +Press, 295–306. +NESTOROV, S., ULLMAN, J. D., WIENER, J. L., AND CHAWATHE, S. S. 1997. Representative objects: Concise representations of semistructured, hierarchial data. In Proceedings of the 13th International +Conference on Data Engineering. IEEE Computer Society, 79–90. +NEVEN, F. AND SCHWENTICK, T. 2006. On the complexity of XPath containment in the presence of +disjunction, DTDs, and variables. Logical Methods Comput. Sci. 2, 3. +NGU, A. H. H., ROCCO, D., CRITCHLOW, T., AND BUTTLER, D. 2005. Automatic discovery and inferencing of complex bioinformatics web interfaces. World Wide Web 8, 4, 463–493. +OAKS, P. AND TER HOFSTEDE, A. H. M. 2007. Guided interaction: A mechanism to enable ad hoc +service interaction. Inform. Syst. Frontiers 9, 1, 29–51. +OHLEBUSCH, E. 2001. Implementing conditional term rewriting by graph rewriting. Theor. Comput. Sci. 262, 1, 311–331. +OPEN WEB APPLICATION SECURITY PROJECT CONSORTIUM. 2004. The top ten most critical web application security vulnerabilities—2004 update. www.owasp.org. +PITT, L. 1989. Inductive inference, DFAs, and computational complexity. In Proceedings of the +International Workshop on Analogical and Inductive Inference (AII’89). Springer-Verlag, 18– +44. +RABINER, L. 1989. A tutorial on hidden Markov models and selected applications in speech +recognition. Proc. IEEE 77, 2, 257–286. +RAHM, E. AND BERNSTEIN, P. A. 2001. A survey of approaches to automatic schema matching. +VLDB J. 10, 4, 334–350. +SAHUGUET, A. 2000. Everything you ever wanted to know about DTDs, but were afraid to ask +(extended abstract). In Proceedings of the 3rd International Workshop on The World Wide Web +and Databases, (WebDB’00), Selected Papers. 171–183. +SAKAKIBARA, Y. 1997. Recent advances of grammatical inference. Theor. Comput. Sci. 185, 1, +15–45. +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + Inference of Concise Regular Expressions and DTDs + +• + +11:47 + +SANKEY, J. AND WONG, R. K. 2001. Structural inference for semistructured data. In Proceedings of +the International Conference on Information and Knowledge Management. ACM Press, 159–166. +Sun. Sun JAXB. java.sun.com/webservices/jaxb. +THOMPSON, H. S., BEECH, D., MALONEY, M., AND MENDELSOHN, N. 2004. XML Schema part 1: Structures 2nd Ed. World Wide Web Consortium, Recommendation REC-xmlschema-1-20041028. +W3C. 2002. XHTML 1.0 The Extensible HyperText Markup Language, 2nd Ed. W3C. +WANG, G., LIU, M., YU, J. X., SUN, B., YU, G., LV, J., AND LU, H. 2003. Effective schema-based XML +query optimization techniques. In Proceedings of the 7th International Database Engineering +and Applications Symposium. 230–235. +Received January 2009; revised July 2009; accepted November 2009 + +ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. + + \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..dddbe5a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] +requires = ["setuptools>=68.0"] +build-backend = "setuptools.backends._legacy:_Backend" + +[project] +name = "grammar-inference-engine" +version = "0.1.0" +description = "BEX-based grammar inference: learn regular expression patterns from example sequences" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "PyYAML>=6.0", +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3c8d506 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +# Core +PyYAML>=6.0 + +# Tests +pytest>=7.0 diff --git a/tests/test_bex.py b/tests/test_bex.py new file mode 100644 index 0000000..ad62471 --- /dev/null +++ b/tests/test_bex.py @@ -0,0 +1,420 @@ +"""Tests for BEX paper algorithm implementations.""" + +import sys +sys.path.insert(0, '/home/tobi/Desktop/kesai/ProjectManagement/companyweb') + +from bex.soa import SOA +from bex.twotinf import build_soa +from bex.rwr0 import rwr0 +from bex.crx import CRX +from bex.idregex import is_deterministic, idregex +from bex.expr import concat, disj, star, optional, alphabet, strip_k +from bex.koa import KOA, build_complete_koa +from bex.marking import mark_koa +from bex.rwrsq import rwr_sq, strip +from bex.ikoa import ikoa + + +def test_soa_basics(): + G = SOA() + a = G.add_state('a') + b = G.add_state('b') + G.add_edge(G.src, a) + G.add_edge(a, b) + G.add_edge(b, G.sink) + assert G.accept(['a', 'b']) + assert not G.accept(['a']) + assert not G.accept(['b']) + assert not G.accept(['a', 'b', 'c']) + print(" PASS test_soa_basics") + + +def test_soa_contract(): + G = SOA() + a = G.add_state('a') + b = G.add_state('b') + G.add_edge(G.src, a) + G.add_edge(a, b) + G.add_edge(b, G.sink) + G.contract(a, b, concat('a', 'b')) + assert G.is_final() + assert G.expression() == 'a.b' + print(" PASS test_soa_contract") + + +def test_soa_epsilon_closure(): + G = SOA() + a = G.add_state('a') + b = G.add_state('a+') + G.add_edge(G.src, a) + G.add_edge(a, b) + G.add_edge(b, G.sink) + G.add_edge(b, b) + Gs = G.epsilon_closure() + assert Gs.has_edge(b, b) + print(" PASS test_soa_epsilon_closure") + + +def test_twotinf(): + seqs = [['a', 'b', 'c'], ['a', 'c']] + G = build_soa(seqs) + assert G.accept(['a', 'b', 'c']) + assert G.accept(['a', 'c']) + assert not G.accept(['b', 'c']) + print(" PASS test_twotinf") + + +def test_rwr0_concat(): + G = SOA() + a = G.add_state('a') + b = G.add_state('b') + G.add_edge(G.src, a) + G.add_edge(a, b) + G.add_edge(b, G.sink) + result = rwr0(G) + assert result == 'a.b', f"Expected 'a.b', got {result}" + print(" PASS test_rwr0_concat") + + +def test_rwr0_disj(): + G = SOA() + a = G.add_state('a') + b = G.add_state('b') + G.add_edge(G.src, a) + G.add_edge(G.src, b) + G.add_edge(a, G.sink) + G.add_edge(b, G.sink) + result = rwr0(G) + assert result == '(a|b)', f"Expected '(a|b)', got {result}" + print(" PASS test_rwr0_disj") + + +def test_rwr0_iteration(): + G = SOA() + a = G.add_state('a') + G.add_edge(G.src, a) + G.add_edge(a, G.sink) + G.add_edge(a, a) + result = rwr0(G) + assert result == 'a+', f"Expected 'a+', got {result}" + print(" PASS test_rwr0_iteration") + + +def test_rwr0_optional(): + G = SOA() + a = G.add_state('a') + G.add_edge(G.src, a) + G.add_edge(a, G.sink) + result = rwr0(G) + # Single state src→a→sink: language is {a}, not {a,ε} + assert result == 'a', f"Expected 'a', got {result}" + print(" PASS test_rwr0_optional") + + +def test_rwr0_empty(): + G = SOA() + result = rwr0(G) + assert result == '∅', f"Expected '∅', got {result}" + print(" PASS test_rwr0_empty") + + +def test_rwr0_epsilon(): + G = SOA() + G.add_edge(G.src, G.sink) + result = rwr0(G) + assert result == 'ε', f"Expected 'ε', got {result}" + print(" PASS test_rwr0_epsilon") + + +def test_rwr0_complex_a(): + # {abc, ab, ac} is NOT a SORE language (c appears in two roles) + G = build_soa([['a', 'b', 'c'], ['a', 'b'], ['a', 'c']]) + result = rwr0(G) + assert result == '∅', f"Expected ∅ for non-SORE, got {result}" + print(" PASS test_rwr0_complex_a: ∅ (non-SORE)") + + +def test_rwr0_disj_concat(): + """a·b and a·c share Pred/Succ for b,c after processing.""" + G = build_soa([['a', 'b'], ['a', 'c']]) + result = rwr0(G) + assert result is not None + print(f" PASS test_rwr0_disj_concat: {result}") + + +def test_crx_simple(): + crx = CRX() + result = crx.infer([['a', 'b'], ['a', 'b', 'c']]) + assert result is not None and result != '∅' + assert 'a' in result + assert 'b' in result + print(f" PASS test_crx_simple: {result}") + + +def test_crx_example(): + """Example from TODS paper: S = {abccde, cccad, bfegg, bfehi}""" + crx = CRX() + S = [ + ['a', 'b', 'c', 'c', 'd', 'e'], + ['c', 'c', 'c', 'a', 'd'], + ['b', 'f', 'e', 'g', 'g'], + ['b', 'f', 'e', 'h', 'i'], + ] + result = crx.infer(S) + assert result is not None + assert '(' in result # should have disjunction factors + print(f" PASS test_crx_example: {result}") + + +def test_crx_cycle_class(): + """Symbols a,b,c form a cycle in S = {abc, bca, cab}.""" + crx = CRX() + S = [['a', 'b', 'c'], ['b', 'c', 'a'], ['c', 'a', 'b']] + result = crx.infer(S) + assert result is not None + assert 'a' in result and 'b' in result and 'c' in result + print(f" PASS test_crx_cycle_class: {result}") + + +def test_determinism_check(): + assert is_deterministic('a.b') + assert is_deterministic('a+') + assert is_deterministic('(a|b)') + assert not is_deterministic('(a|a)') + print(" PASS test_determinism_check") + + +def test_marking(): + G = KOA(k=2) + a1 = G.add_state('a_1') + a2 = G.add_state('a_2') + G.add_edge(G.src, a1) + G.add_edge(a1, a2) + G.add_edge(a2, G.sink) + H = mark_koa(G) + assert H.label(a1) == 'a_1' + assert H.label(a2) == 'a_2' + assert H.accept(['a_1', 'a_2']) + print(" PASS test_marking") + + +def test_strip(): + assert strip('a_1.b_1') == 'a.b' + assert strip('(a_1|b_1)+') == '(a|b)+' + print(" PASS test_strip") + + +def test_expr_utils(): + assert concat('a', 'b') == 'a.b' + assert disj('a', 'b') == '(a|b)' + assert star('a') == 'a+' + assert optional('a') == 'a?' + assert optional('a.b') == '(a.b)?' + assert alphabet('a.b') == {'a', 'b'} + assert alphabet('(a|b)+') == {'a', 'b'} + assert strip_k('a_1') == 'a' + print(" PASS test_expr_utils") + + +def test_idregex_deterministic(): + """iDRegEx should produce a deterministic expression for simple data.""" + seqs = [['a', 'b'], ['a'], ['a', 'b', 'c']] + result = idregex(seqs, kmax=2, N=2) + if result is None: + print(" SKIP test_idregex_deterministic (returned None)") + return + assert is_deterministic(result), f"Non-deterministic: {result}" + print(f" PASS test_idregex_deterministic: {result}") + + +def test_complete_koa(): + G, states = build_complete_koa([['a', 'b'], ['a']], k=2) + assert G.count_symbol('a') == 2 + assert G.count_symbol('b') == 2 + assert G.has_edge(G.src, G.sink) + print(" PASS test_complete_koa") + + +def run_all(): + tests = [ + test_soa_basics, + test_soa_contract, + test_soa_epsilon_closure, + test_twotinf, + test_rwr0_concat, + test_rwr0_disj, + test_rwr0_iteration, + test_rwr0_optional, + test_rwr0_empty, + test_rwr0_epsilon, + test_rwr0_complex_a, + test_rwr0_disj_concat, + test_crx_simple, + test_crx_example, + test_crx_cycle_class, + test_determinism_check, + test_marking, + test_strip, + test_expr_utils, + test_idregex_deterministic, + test_complete_koa, + ] + passed = 0 + failed = 0 + for t in tests: + try: + t() + passed += 1 + except Exception as e: + print(f" FAIL {t.__name__}: {e}") + failed += 1 + print(f"\n{passed} passed, {failed} failed") + + +# ── Integration tests with real Ansible task data ── + +def test_integration_quartz_deploy(): + """Simple linear sequence — all tasks always in same order.""" + seqs = [ + ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'], + ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'], + ] + crx = CRX() + result = crx.infer(seqs) + assert result is not None + assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for']) + print(f" PASS quartz_deploy: {result}") + + +def test_integration_validate_system(): + """Optional shell tasks.""" + seqs = [ + ['shell', 'debug', 'shell', 'debug'], + ['shell', 'debug', 'shell', 'debug', 'shell', 'debug'], + ['shell', 'debug'], + ] + crx = CRX() + result = crx.infer(seqs) + assert result is not None + assert 'shell' in result and 'debug' in result + print(f" PASS validate_system: {result}") + + +def test_integration_docker_detect_branch(): + """Branching: docker compose v2 check or v1 fallback.""" + seqs = [ + ['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'], + ['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'], + ] + crx = CRX() + result = crx.infer(seqs) + assert result is not None + assert 'file' in result and 'template' in result and 'shell' in result + print(f" PASS docker_detect: {result}") + + +def test_integration_firewall_gating(): + """Conditional firewall rule sequence (gated).""" + seqs = [ + ['assert', 'file', 'template', 'shell', 'wait_for'], + ['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'], + ['assert', 'file', 'template', 'command_fw', 'shell', 'wait_for'], + ] + crx = CRX() + result = crx.infer(seqs) + assert result is not None + assert 'assert' in result and 'file' in result + print(f" PASS firewall_gating: {result}") + + +def test_integration_idregex_linear(): + """iDRegEx on simple linear sequences.""" + seqs = [ + ['assert', 'file', 'template', 'command', 'set_fact', 'shell', 'wait_for'], + ['assert', 'file', 'template', 'command', 'set_fact', 'shell'], + ] + try: + result = idregex(seqs, kmax=2, N=3) + if result: + assert is_deterministic(result) + print(f" PASS idregex_linear: {result}") + else: + print(" SKIP idregex_linear (returned None)") + except Exception as e: + print(f" FAIL idregex_linear: {e}") + + +def test_integration_ikoa_linear(): + """iKoa + rwr² on simple linear sequences.""" + from bex.ikoa import ikoa + from bex.rwrsq import rwr_sq + seqs = [ + ['assert', 'file', 'template', 'command', 'set_fact', 'shell', 'wait_for'], + ['assert', 'file', 'template', 'command', 'set_fact', 'shell'], + ] + G = ikoa(seqs, k=3) + if G is None: + print(" SKIP ikoa_linear (returned None)") + return + expr = rwr_sq(G) + assert expr is not None + print(f" PASS ikoa_linear: {expr}") + + +def test_integration_backup_restic(): + """Sequence with loop (systemd enable).""" + seqs = [ + ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'], + ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'], + ] + crx = CRX() + result = crx.infer(seqs) + assert result is not None + print(f" PASS backup_restic: {result}") + + +def run_all(): + tests = [ + test_soa_basics, + test_soa_contract, + test_soa_epsilon_closure, + test_twotinf, + test_rwr0_concat, + test_rwr0_disj, + test_rwr0_iteration, + test_rwr0_optional, + test_rwr0_empty, + test_rwr0_epsilon, + test_rwr0_complex_a, + test_rwr0_disj_concat, + test_crx_simple, + test_crx_example, + test_crx_cycle_class, + test_determinism_check, + test_marking, + test_strip, + test_expr_utils, + test_idregex_deterministic, + test_complete_koa, + test_integration_quartz_deploy, + test_integration_validate_system, + test_integration_docker_detect_branch, + test_integration_firewall_gating, + test_integration_idregex_linear, + test_integration_ikoa_linear, + test_integration_backup_restic, + ] + passed = 0 + failed = 0 + for t in tests: + try: + t() + passed += 1 + except Exception as e: + print(f" FAIL {t.__name__}: {e}") + failed += 1 + print(f"\n{passed} passed, {failed} failed") + + +if __name__ == '__main__': + run_all()