From 6bf7a681ce114672fde815dab2a8703809fb1a73 Mon Sep 17 00:00:00 2001 From: tobjend Date: Wed, 1 Jul 2026 11:28:42 +0200 Subject: [PATCH] purge make_charts.py, examples/, full-text papers, blog_post (moved to ~/Desktop/kesai/); translate German CLI to English --- bex/cli.py | 50 +- blog_post.md | 263 ---- examples/role_grammar.py | 111 -- examples/yaml_to_seq.py | 81 -- make_charts.py | 71 - papers/README.md | 6 + papers/paper_arxiv2010.txt | 2210 -------------------------------- papers/paper_tods2010.txt | 2492 ------------------------------------ tests/test_bex.py | 31 +- 9 files changed, 45 insertions(+), 5270 deletions(-) delete mode 100644 blog_post.md delete mode 100644 examples/role_grammar.py delete mode 100644 examples/yaml_to_seq.py delete mode 100644 make_charts.py create mode 100644 papers/README.md delete mode 100644 papers/paper_arxiv2010.txt delete mode 100644 papers/paper_tods2010.txt diff --git a/bex/cli.py b/bex/cli.py index f69d530..7d60f67 100644 --- a/bex/cli.py +++ b/bex/cli.py @@ -19,7 +19,7 @@ from .ilocal import iLocal, extract_contexts_from_file, reduce_contexts def find_yaml_files(directory): - """Findet alle YAML-Dateien in einem Verzeichnis (rekursiv).""" + """Find all YAML files in a directory (recursive).""" patterns = ['**/*.yml', '**/*.yaml'] files = [] for pattern in patterns: @@ -32,37 +32,37 @@ def main(): description='bex — BEX-based YAML Grammar Inference', ) parser.add_argument('--dir', type=str, default='roles/', - help='Verzeichnis mit YAML-Dateien (default: roles/)') + help='Directory with YAML files (default: roles/)') parser.add_argument('--k-max', type=int, default=5, - help='Max k für k-ORE-Inferenz (default: 5)') + help='Max k for k-ORE inference (default: 5)') parser.add_argument('--context', type=str, default=None, - help='Auf spezifischen Container-Key beschränken (z.B. tasks)') + help='Restrict to specific container key (e.g. 
tasks)') parser.add_argument('--output', type=str, default=None, - help='Output-Datei für Template (default: stdout)') + help='Output file for template (default: stdout)') parser.add_argument('--ilocal', action='store_true', - help='iLocal-Kontextanalyse durchführen') + help='Run iLocal context analysis') parser.add_argument('--crx', action='store_true', - help='CRX (direct CHARE inference) verwenden') + help='Use CRX (direct CHARE inference)') parser.add_argument('--verbose', '-v', action='store_true', - help='Ausführliche Ausgabe') + help='Verbose output') parser.add_argument('--stats', action='store_true', - help='Zeige Token-Statistiken') + help='Show token statistics') args = parser.parse_args() if not os.path.isdir(args.dir): - print(f"Fehler: Verzeichnis '{args.dir}' nicht gefunden.", file=sys.stderr) + print(f"Error: directory '{args.dir}' not found.", file=sys.stderr) sys.exit(1) yaml_files = find_yaml_files(args.dir) if not yaml_files: - print(f"Keine YAML-Dateien in '{args.dir}' gefunden.", file=sys.stderr) + print(f"No YAML files found in '{args.dir}'.", file=sys.stderr) sys.exit(1) - print(f"Gefundene YAML-Dateien: {len(yaml_files)}", file=sys.stderr) + print(f"Found YAML files: {len(yaml_files)}", file=sys.stderr) if args.ilocal: - print("\n=== iLocal: Kontext-Extraktion ===", file=sys.stderr) + print("\n=== iLocal: Context Extraction ===", file=sys.stderr) all_contexts = {} for f in yaml_files: contexts = extract_contexts_from_file(f) @@ -72,11 +72,11 @@ def main(): all_contexts[ctx].extend(seqs) reduced = reduce_contexts(all_contexts) - print(f" Kontexte gefunden: {len(reduced)}", file=sys.stderr) + print(f" Contexts found: {len(reduced)}", file=sys.stderr) for ctx, seqs in sorted(reduced.items()): lengths = [len(s) for s in seqs] - print(f" {ctx}: {len(seqs)} Sequenzen, " - f"Längen {min(lengths)}-{max(lengths)}, " + print(f" {ctx}: {len(seqs)} sequences, " + f"lengths {min(lengths)}-{max(lengths)}, " f"unique_seqs={len(set(tuple(s) for s in 
seqs))}", file=sys.stderr) @@ -94,30 +94,30 @@ def main(): print(f" {os.path.relpath(f)}: {seq}", file=sys.stderr) except Exception as e: if args.verbose: - print(f" Fehler in {f}: {e}", file=sys.stderr) + print(f" Error in {f}: {e}", file=sys.stderr) if not all_sequences: - print("Keine Sequenzen extrahiert.", file=sys.stderr) + print("No sequences extracted.", file=sys.stderr) sys.exit(1) - print(f" Sequenzen extrahiert: {len(all_sequences)}", file=sys.stderr) + print(f" Sequences extracted: {len(all_sequences)}", file=sys.stderr) lengths = [len(s) for s in all_sequences] - print(f" Längen: min={min(lengths)}, max={max(lengths)}, " + print(f" Lengths: min={min(lengths)}, max={max(lengths)}, " f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr) if args.stats: stats = tokenizer.get_statistics() - print("\n=== Token-Statistiken ===", file=sys.stderr) + print("\n=== Token Statistics ===", file=sys.stderr) for token, count in list(stats.items())[:30]: print(f" {token}: {count}", file=sys.stderr) - print("\n=== k-ORE Inferenz ===", file=sys.stderr) + print("\n=== k-ORE Inference ===", file=sys.stderr) kore = kOREInference(k_max=args.k_max) if args.crx: result = kore.infer_with_crx(all_sequences) _, expr, method = result - print(f" Methode: {method}", file=sys.stderr) + print(f" Method: {method}", file=sys.stderr) else: result = kore.infer(all_sequences) if result: @@ -127,7 +127,7 @@ def main(): expr = "∅" print(" Kein Ergebnis", file=sys.stderr) - print(f" Inferierter Ausdruck: {expr}", file=sys.stderr) + print(f" Inferred expression: {expr}", file=sys.stderr) print("\n=== One-Shot Template ===", file=sys.stderr) print(file=sys.stderr) @@ -136,7 +136,7 @@ def main(): if args.output: with open(args.output, 'w') as f: f.write(template) - print(f"Template geschrieben nach: {args.output}", file=sys.stderr) + print(f"Template written to: {args.output}", file=sys.stderr) else: print(template) diff --git a/blog_post.md b/blog_post.md deleted file mode 100644 index 
954e266..0000000 --- a/blog_post.md +++ /dev/null @@ -1,263 +0,0 @@ -# Dervish: Discovering Unwritten Conventions with Grammar Inference - -

Dervish

- -**How we turned 36 Ansible roles into a 200-character grammar — and why -it matters for LLM agents.** - -## The problem - -Every codebase has unwritten conventions. Your team's Docker Compose -files always put `image` before `ports` before `volumes`. Your Ansible -deploy roles always start with `assert`, then `file`, then `template`. -Your CI pipelines always run `lint` before `test` before `deploy`. - -Nobody writes these down. They're emergent — copied from role to role, -file to file, until they become a tacit standard. - -When an LLM agent needs to generate new content that follows these -conventions, you have two options: - -1. **Stuff every existing file into context** — 36 deploy roles = 15,000 - tokens. You'll hit the context window on your third example. -2. **Give it one or two examples and hope** — the LLM will guess the - pattern, and it will often guess wrong. - -Neither is good. The first is wasteful. The second is unreliable. - -What you really want is the **compiled convention** — the minimal -description of what all 36 roles share, expressed in ~200 tokens. An -LLM can follow a rule in 200 tokens far more reliably than it can -infer a pattern from 36 examples. - -This is grammar inference. - -## The approach - -Given a set of example sequences over some alphabet (e.g., Ansible -module names, Docker Compose keys, CI job names), learn a regular -expression that describes the general pattern. - -We implemented two algorithms from Bex et al., a pair of papers from -TODS 2010 and arXiv 2010: - -- **CRX** (TODS 2010 §6): A single-pass algorithm that builds a - predecessor relation over symbols, computes equivalence classes, - and emits a Chain Regular Expression (CHARE) that matches ALL - input sequences. Fast, deterministic, captures the full vocabulary. - -- **iDRegEx** (arXiv 2010): A probabilistic algorithm using k-testable - Observation Automata (k-OA) trained with Baum-Welch EM. 
It finds - only the *minimal common core* — the symbols that appear in every - example. Robust against noise, but fails (returns ∅) when the - examples are too diverse. - -Both run in the **ensemble**: CRX produces a permissive grammar (full -vocabulary, many optional parts), iDRegEx produces a strict grammar -(minimal core). A Minimum Description Length (MDL) score picks the -winner: the grammar that compresses the data best. - -## The algorithms, briefly - -### CRX — Chain Regular Expression inference - -CRX (Algorithm 7, TODS 2010) works in four steps: - -1. **Build the immediate-predecessor relation.** For every adjacent - pair (x, y) across all sequences, record that x precedes y. If - symbol `assert` always appears before `file`, record - `assert → file`. - -2. **Compute equivalence classes.** Take the reflexive-transitive - closure of the predecessor relation. The strongly connected - components are *equivalence classes* — groups of symbols that can - appear in the same position. If `copy` and `template` both follow - `file` and precede `command`, they're in the same class. - -3. **Merge singleton classes.** A class with one symbol that shares - the same predecessor/successor sets as another singleton class - gets merged. This handles symbols that always appear in the - same structural position. - -4. **Topological sort.** The equivalence classes are sorted by their - position in the Hasse diagram of the predecessor relation. Each - class becomes a factor in the output, annotated with a quantifier: - - `+` (one or more) if the class forms a cycle - - `+?` (zero or more) if the class appears variably - - `?` (optional) if the class can be absent - - (exact) if the class always appears exactly once - -The result is a CHARE: a sequence of factors where each factor is a -disjunction of equivalent symbols with a quantifier. - -### iDRegEx — k-optimal regular expression inference - -iDRegEx (Algorithm 4, arXiv 2010) uses a probabilistic automaton: - -1. 
**Build a complete k-OA.** A k-testable Observation Automaton - records all k-grams (subsequences of length k) from the input - sequences. The automaton's states represent (k-1)-grams. - -2. **Train with Baum-Welch.** EM iterations assign probabilities to - transitions, learning which paths through the automaton are most - likely given the data. - -3. **Disambiguate.** Remove nondeterministic transitions — for any - state and symbol, keep only the most probable next state. - -4. **Prune.** Remove low-probability edges and unreachable states, - leaving only the most likely paths. - -5. **Extract with rwr².** The REWRITE-SQUARED algorithm (rwr², - Algorithm 3) collapses the pruned automaton into a k-optimal - regular expression — the minimal common core. - -### MDL scoring — picking the right level of specificity - -The Minimum Description Length principle (Rissanen 1978) says: the -best grammar is the one that minimizes the sum of its own size and -the cost of encoding the data using it. - -``` -MDL = model_cost + data_cost -``` - -**model_cost** = the number of alphabet symbol occurrences in the -grammar. A grammar with 5 unique symbols used once each has -model_cost = 5. - -**data_cost** = Σ log₂(|L(r)|) across all sequences, where |L(r)| is -the number of strings of length len(s) that the grammar accepts. -A grammar like `(a+b+c+...+z)+` accepts 19 possible symbols at each -position, so for a sequence of length 120, the data cost is -120 × log₂(19) ≈ 510 bits. A grammar like `a.b.c.d.e` accepts only -1 string of length 5, so data cost is 0. - -The ensemble picks the grammar with the lowest total MDL. This -automatically balances specificity against coverage: a grammar that -matches only 1 sequence but does so perfectly (low data cost) can -beat a grammar that matches all sequences but is extremely permissive -(high data cost). 
- -## The results - -### Ansible deploy roles — 36 roles from companyweb - -Your own deploy roles cover everything from AdGuard Home to -Woodpecker CI. They have NO schema — each is a free-form script. - -``` -Grammar: docker_volume+?.group?.docker_container?.user?.apt?.npm?. - (assert+...+command+copy+file+template+set_fact+...+wait_for)+?. - (cron+firewalld)? -Match: 36/36 -MDL: 2186.28 -``` - -Bottleneck analysis: optional docker setup (volume, group, container, -user, apt, npm), then a large disjunction of ~25 task modules (one or -more), then optional cron/firewalld at the end. This captures the -convention precisely. - -**Compression: 36 roles (15,000 tokens) → 200 tokens (75×)** - -### Geerlingguy Galaxy roles — 15 popular roles - -Jeff Geerling's roles are the most popular on Ansible Galaxy. He has -never documented their structural pattern. Yet every one of the 15 -follows the same arc: - -``` -Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+. - include+?.(npm+pip)+?.lineinfile? -Match: 15/15 -MDL: 596.64 -``` - -Check prerequisites, OS-specific variables, install packages, -configure with templates, start services, optionally run sub-tasks, -install npm/pip packages, and optionally tweak config lines. - -**This is the first explicit description of the geerlingguy role -module ordering convention.** It took 15 roles and a grammar inference -algorithm to write it down. 
- -**Compression: 15 roles (5,000 tokens) → 60 tokens (83×)** - -### Ensemble dynamics - -The ensemble (CRX + iDRegEx + MDL) selects different winners -depending on the data: - -| Dataset | Winner | Why | -|---------|--------|-----| -| Ansible galaxy (15 roles) | CRX | iDRegEx returned ∅ (too diverse) | -| Helm prom-stack (6 configs) | **iDRegEx** | Finds minimal core across all configs | -| Terraform modules (8) | CRX | iDRegEx returned ∅ (no common core across domains) | -| Terraform modules (8) | CRX | Every resource type optional across domains | -| GitHub Actions Go lint (6) | CRX | Tight pattern, all match | - -iDRegEx wins when the data has a clear common core. CRX wins when -there's no single shared subsequence (the roles share the *vocabulary* -but not the *order*). - -## The MCP - -The engine is exposed as an MCP server: - -```python -from bex.mcp_server import infer_best_grammar - -# Full coverage -output = infer_best_grammar( - sequences=role_sequences, - prefer="crx", -) -# Returns: -# Best: CRX (MDL 288) -# Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+ -# .include+?.(npm+pip)+?.lineinfile? - -# Ensemble — let MDL pick -output = infer_best_grammar(sequences=role_sequences) -``` - -An agent workflow: - -1. Agent needs to write an Ansible role -2. Finds 15 existing geerlingguy roles, extracts their task module sequences -3. Calls `infer_best_grammar(sequences=..., prefer='crx')` -4. Gets back the grammar in ~60 tokens -5. Generates a new role that follows the structural pattern - -Without the MCP: 15 role files in context (5,000 tokens), or guesswork. -With the MCP: one grammar rule (~60 tokens), known to match 15/15 roles. - -## What it means - -Grammar inference turns **examples** into **rules**. The rule is a -compressed description of the structural convention — and for -schema-less content like the geerlingguy role module ordering, this is -the *first time* the convention has been written down at all. 
- -For LLM agents, this changes the trade-off between context and -accuracy. Instead of flooding the context window with examples, the -agent can call the MCP, get the rule in ~60 tokens, and follow it. -The rule is more reliable than guessing from examples, and it costs -less than the first example would have. - -The algorithm doesn't need to understand what a deploy role does. It -doesn't know that `file` creates directories and `template` renders -Jinja2. It only needs to see 36 sequences of module names and find -the pattern they all share. The structural convention is in the data -— you just have to extract it. - -## References - -- Bex, G. J., Gelade, W., Neven, F., & Vansummeren, S. (2010). - [*Learning Deterministic Regular Expressions for the Web.*](https://doi.org/10.1145/1806907.1806911) TODS 2010. -- Bex, G. J., Gelade, W., Martens, W., & Neven, F. (2010). - [*Simplifying XML Schema: Single-Type Approximations of Regular - Expressions.*](https://arxiv.org/abs/1004.2372) arXiv:1004.2372. -- Rissanen, J. (1978). *Modeling by shortest data description.* - Automatica 14(5). diff --git a/examples/role_grammar.py b/examples/role_grammar.py deleted file mode 100644 index 79c2fe8..0000000 --- a/examples/role_grammar.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Extract Ansible role task module sequences and learn per-group grammars.""" - -from pathlib import Path -import yaml -from collections import defaultdict - -from .crx import CRX -from .expr import strip_k - - -IGNORE_MODULES = frozenset({'name', 'tags', 'when', 'register', 'no_log', - 'changed_when', 'failed_when', 'ignore_errors', - 'run_once', 'delegate_to', 'loop', 'loop_control', - 'until', 'retries', 'delay', 'poll', 'async', - 'become', 'become_user', 'become_flags', - 'check_mode', 'diff', 'environment', - 'vars', 'notify', 'args', - 'block', 'rescue', 'always', 'include_tasks'}) - - -def extract_module_name(task): - """Extract the Ansible module name from a task dict. 
- - The module is the key that is NOT a known non-module key. - Returns 'skip' for non-task entries like block/rescue/always. - """ - if not isinstance(task, dict): - return None - # Check for block/rescue/always — these contain nested tasks - for key in ('block', 'rescue', 'always'): - if key in task: - nested = task[key] - if isinstance(nested, list): - return [extract_module_name(t) for t in nested] - return None - # Find the module key (not name, not meta-keys) - for key, value in task.items(): - if key in ('name',): - continue - if key in IGNORE_MODULES: - continue - if isinstance(value, (dict, list, str, bool, int, float)): - # It's the module name (venv or fqcn) - return strip_k(key) - return None - - -def flatten_nested(seq): - """Flatten nested lists into a single list.""" - result = [] - for item in seq: - if isinstance(item, list): - result.extend(flatten_nested(item)) - elif item is not None and item != 'skip': - result.append(item) - return result - - -def get_role_category(role_name): - """Extract category from role name like deploy_foo → deploy.""" - parts = role_name.split('_') - if len(parts) >= 2: - return parts[0] - return 'other' - - -def load_role_module_sequence(role_dir): - """Load a role's task file and extract the module sequence.""" - task_file = role_dir / 'tasks' / 'main.yml' - if not task_file.exists(): - return None, None - with open(task_file) as f: - data = yaml.safe_load(f) - if not isinstance(data, list): - return None, None - - modules = [] - for task in data: - result = extract_module_name(task) - if isinstance(result, list): - modules.extend(flatten_nested(result)) - elif result is not None: - modules.append(result) - - return role_dir.name, modules - - -def collect_all_role_sequences(roles_dir='roles'): - """Collect module sequences from all roles, grouped by category.""" - by_category = defaultdict(list) - all_roles = [] - for role_dir in sorted(Path(roles_dir).glob('*/tasks/main.yml')): - role_name = 
role_dir.parent.parent.name - name, seq = load_role_module_sequence(role_dir.parent.parent) - if seq: - cat = get_role_category(role_name) - by_category[cat].append((role_name, seq)) - all_roles.append((role_name, seq)) - return all_roles, by_category - - -def learn_grammar(sequences): - """Run CRX on a list of sequences.""" - if len(sequences) < 2: - seqs = [sequences[0]] if sequences else [] - else: - seqs = sequences - if not seqs: - return 'ε' - crx = CRX() - return crx.infer(seqs) diff --git a/examples/yaml_to_seq.py b/examples/yaml_to_seq.py deleted file mode 100644 index f8937b0..0000000 --- a/examples/yaml_to_seq.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Convert YAML files to key-path sequences for BEX grammar inference.""" - -from pathlib import Path -import yaml - - -def yaml_to_keypath_sequence(data, prefix=""): - """Convert parsed YAML data to a sequence of key paths (DFS traversal). - - Each leaf (scalar) emits its full key path as a symbol. - Lists use a generic `[]` marker (no indices). - Values are NOT included — only key paths. 
- """ - seq = [] - if isinstance(data, dict): - for key, value in data.items(): - path = f"{prefix}.{key}" if prefix else key - if isinstance(value, (dict, list)): - seq.extend(yaml_to_keypath_sequence(value, path)) - else: - seq.append(path) - elif isinstance(data, list): - for item in data: - list_prefix = f"{prefix}[]" if prefix else "[]" - if isinstance(item, (dict, list)): - seq.extend(yaml_to_keypath_sequence(item, list_prefix)) - else: - seq.append(list_prefix) - return seq - - -def yaml_file_to_sequence(filepath): - """Load a YAML file and convert to a key-path sequence.""" - with open(filepath) as f: - data = yaml.safe_load(f) - if data is None: - return [] - return yaml_to_keypath_sequence(data) - - -def is_vault_file(filepath): - """Check if a file is an Ansible vault file (encrypted).""" - try: - with open(filepath) as f: - first = f.read(100) - return '$ANSIBLE_VAULT' in first or first.startswith('!vault |') - except Exception: - return False - - -def collect_all_sequences(root_dir=".", include_vault=False): - """Collect key-path sequences from all YAML files. - - Returns: - list of (filepath, sequence) tuples. 
- """ - results = [] - for path in sorted(Path(root_dir).rglob("*.yml")): - parts = path.parts - if any(d in parts for d in ('node_modules', '.venv', '__pycache__', '.git')): - continue - skippable = ('vault.yml' in path.name or 'vault' in path.name) - if not include_vault and (skippable or is_vault_file(path)): - continue - try: - seq = yaml_file_to_sequence(path) - if seq: - results.append((path, seq)) - except Exception as e: - print(f" SKIP {path}: {e}") - return results - - -def sequences_to_crx(result_list): - """Run CRX on collected sequences.""" - from .crx import CRX - sequences = [seq for _, seq in result_list] - if not sequences: - return 'ε' - crx = CRX() - return crx.infer(sequences) diff --git a/make_charts.py b/make_charts.py deleted file mode 100644 index 1553311..0000000 --- a/make_charts.py +++ /dev/null @@ -1,71 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np - -plt.xkcd(scale=0.7, length=60, randomness=2) - -FIG_W = 8 -FIG_H = 5 - -# ── Chart 1: Context cost vs examples ── -fig1, ax1 = plt.subplots(figsize=(FIG_W, FIG_H)) - -N = [1, 5, 15, 36] -raw = [100, 500, 1500, 3600] # ~100 tokens/example -dervish = [40, 60, 60, 200] # grammar grows only when diversity grows - -x = np.arange(len(N)) -w = 0.35 - -bars1 = ax1.bar(x - w/2, raw, w, label='Raw examples', color='#e74c3c', alpha=0.85) -bars2 = ax1.bar(x + w/2, dervish, w, label='Dervish grammar', color='#3498db', alpha=0.85) - -ax1.set_xticks(x) -ax1.set_xticklabels([f'{n} examples' for n in N]) -ax1.set_ylabel('Tokens needed in context') -ax1.set_title('Context cost: raw examples vs Dervish grammar') -ax1.legend(frameon=False) - -for bar in bars1: - ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 80, - f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=9) -for bar in bars2: - ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 80, - f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=9) - -ax1.set_ylim(0, 4500) -fig1.tight_layout() 
-fig1.savefig('chart_context_cost.png', dpi=200) -plt.close(fig1) - -# ── Chart 2: Tokens — Without vs With Dervish (per dataset) ── -fig2, ax2 = plt.subplots(figsize=(FIG_W, FIG_H)) - -datasets = ['Ansible Galaxy\n(15 roles)', 'Helm\n(6 configs)', 'Go lint\n(6 jobs)'] -without = [5000, 3000, 900] -with_derv = [60, 40, 30] -ratios = [f'{int(w/d)}×' for w, d in zip(without, with_derv)] - -x2 = np.arange(len(datasets)) -w2 = 0.3 - -bw = ax2.bar(x2 - w2/2, without, w2, label='Without Dervish', color='#e74c3c', alpha=0.85) -bd = ax2.bar(x2 + w2/2, with_derv, w2, label='With Dervish', color='#3498db', alpha=0.85) - -ax2.set_xticks(x2) -ax2.set_xticklabels(datasets) -ax2.set_ylabel('Tokens') -ax2.set_title('Token savings per dataset') -ax2.legend(frameon=False) -ax2.set_yscale('log') -ax2.set_ylim(5, 30000) - -# Label compression ratios -for i, (r, wbar, dbar) in enumerate(zip(ratios, bw, bd)): - ax2.text(x2[i], without[i] * 1.3, r, ha='center', va='bottom', fontsize=11, fontweight='bold', - bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='gray', alpha=0.8)) - -fig2.tight_layout() -fig2.savefig('chart_token_savings.png', dpi=200) -plt.close(fig2) - -print("Charts saved: chart_context_cost.png, chart_token_savings.png") diff --git a/papers/README.md b/papers/README.md new file mode 100644 index 0000000..69eb14d --- /dev/null +++ b/papers/README.md @@ -0,0 +1,6 @@ +# Papers + +The Dervish algorithms are based on two papers by Bex et al.: + +- **CRX** — [*Learning Deterministic Regular Expressions for the Web*](https://doi.org/10.1145/1806907.1806911) (TODS 2010) +- **iDRegEx** — [*Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data*](https://arxiv.org/abs/1004.2372) (arXiv:1004.2372) diff --git a/papers/paper_arxiv2010.txt b/papers/paper_arxiv2010.txt deleted file mode 100644 index 7e8e0af..0000000 --- a/papers/paper_arxiv2010.txt +++ /dev/null @@ -1,2210 +0,0 @@ -arXiv:1004.2372v1 [cs.DB] 14 Apr 2010 - -Learning Deterministic Regular
Expressions for the -Inference of Schemas from XML Data -GEERT JAN BEX, WOUTER GELADE, FRANK NEVEN -Hasselt University and Transnational University of Limburg -and -STIJN VANSUMMEREN -Université Libre de Bruxelles - -Inferring an appropriate DTD or XML Schema Definition (XSD) for a given collection of XML -documents essentially reduces to learning deterministic regular expressions from sets of positive -example words. Unfortunately, there is no algorithm capable of learning the complete class of -deterministic regular expressions from positive examples only, as we will show. The regular expressions occurring in practical DTDs and XSDs, however, are such that every alphabet symbol -occurs only a small number of times. As such, in practice it suffices to learn the subclass of -deterministic regular expressions in which each alphabet symbol occurs at most k times, for some -small k. We refer to such expressions as k-occurrence regular expressions (k-OREs for short). -Motivated by this observation, we provide a probabilistic algorithm that learns k-OREs for increasing values of k, and selects the deterministic one that best describes the sample based on a -Minimum Description Length argument. The effectiveness of the method is empirically validated -both on real world and synthetic data. Furthermore, the method is shown to be conservative over -the simpler classes of expressions considered in previous work. -Categories and Subject Descriptors: F.4.3 [Mathematical Logic and Formal Languages]: -Formal Languages; I.2.6 [Artificial Intelligence]: Learning; I.7.2 [Document and Text Processing]: Document Preparation -General Terms: Algorithms, Languages, Theory -Additional Key Words and Phrases: regular expressions, schema inference, XML - -1. - -INTRODUCTION - -Recent studies stipulate that schemas accompanying collections of XML documents -are sparse and erroneous in practice. Indeed, Barbosa et al. [2005] and Mignet et al. 
-[2003] have shown that approximately half of the XML documents available on the -web do not refer to a schema. In addition, Bex et al. [2004] and Martens et al. -[2006] have noted that about two-thirds of XML Schema Definitions (XSDs) gathered from schema repositories and from the web at large are not valid with respect -to the W3C XML Schema specification [Thompson et al. 2001], rendering them -A preliminary version of this article appeared in the 17th International World Wide Web Conference (WWW 2008). -Permission to make digital/hard copy of all or part of this material without fee for personal -or classroom use provided that the copies are not made or distributed for profit or commercial -advantage, the ACM copyright/server notice, the title of the publication, and its date appear, and -notice is given that copying is by permission of the ACM, Inc. To copy otherwise, to republish, -to post on servers, or to redistribute to lists requires prior specific permission and/or a fee. -c 2024 ACM 0000-0000/2024/0000-0001 $5.00 -ACM Journal Name, Vol. V, No. N, November 2024, Pages 1–31. - - 2 - -· - -Geert Jan Bex et al. - - - - - - -Fig. 1. - -An example DTD. - -essentially useless for immedidate application. A similar observation was made by -Sahuguet [2000] concerning Document Type Definitions (DTDs). Nevertheless, the -presence of a schema strongly facilitates optimization of XML processing (cf., e.g., -[Benedikt et al. 2005; Che et al. 2006; Du et al. 2004; Freire et al. 2002; Koch et al. -2004; Manolescu et al. 2001; Neven and Schwentick 2006]) and various software -development tools such as Castor [cas ] and SUN’s JAXB [jax ] rely on schemas -as well to perform object-relational mappings for persistence. Additionally, the -existence of schemas is imperative when integrating (meta) data through schema -matching [Rahm and Bernstein 2001] and in the area of generic model management [Bernstein 2003]. 
-Based on the above described benefits of schemas and their unavailability in -practice, it is essential to devise algorithms that can infer a DTD or XSD for a -given collection of XML documents when none, or no syntactically correct one, is -present. This is also acknowledged by Florescu [2005] who emphasizes that in the -context of data integration -“We need to extract good-quality schemas automatically from existing -data and perform incremental maintenance of the generated schemas.” -As illustrated in Figure 1, a DTD is essentially a mapping d from element names -to regular expressions over element names. An XML document is valid with respect -to the DTD if for every occurrence of an element name e in the document, the -word formed by its children belongs to the language of the corresponding regular -expression d(e). For instance, the DTD in Figure 1 requires each store element -to have zero or more order children, which must be followed by a stock element. -Likewise, each order must have a customer child, which must be followed by one -or more item elements. -To infer a DTD from a corpus of XML documents C it hence suffices to look, -for each element name e that occurs in a document in C, at the set of element -name words that occur below e in C, and to infer from this set the corresponding -regular expression d(e). As such, the inference of DTDs reduces to the inference -of regular expressions from sets of positive example words. To illustrate, from the -words id price, id qty supplier, and id qty item item appearing under -elements in a sample XML corpus, we could derive the rule -item → (id, price + (qty, (supplier + item+ ))). -Although XSDs are more expressive than DTDs, and although XSD inference is -therefore more involved than DTD inference, derivation of regular expressions remains one of the main building blocks on which XSD inference algorithms are built. -ACM Journal Name, Vol. V, No. N, November 2024. 
- - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -In fact, apart from also inferring atomic data types, systems like Trang [Clark ] and -XStruct [Hegewald et al. 2006] simply infer DTDs in XSD syntax. The more recent -iXSD algorithm [Bex et al. 2007] does infer true XSD schemas by first deriving a -regular expression for every context in which an element name appears, where the -context is determined by the path from the root to that element, and subsequently -reduces the number of contexts by merging similar ones. -So, the effectiveness of DTD or XSD schema inference algorithms is strongly -determined by the accuracy of the employed regular expression inference method. -The present article presents a method to reliably learn regular expressions that -are far more complex than the classes of expressions previously considered in the -literature. -1.1 - -Problem setting - -In particular, let Σ be a fixed set of alphabet symbols (also called element names), -and let Σ∗ be the set of all words over Σ. -Definition 1.1 (Regular Expressions). Regular expressions are derived by the following grammar. -r, s ::= ∅ | ε | a | r . s | r + s | r? | r+ -Here, parentheses may be added to avoid ambiguity; ε denotes the empty word; -a ranges over symbols in Σ; r . s denotes concatenation; r + s denotes disjunction; -r+ denotes one-or-more repetitions; and r? denotes the optional regular expression. -That is, the language L(r) accepted by regular expression r is given by: -L(∅) = ∅ -L(a) = {a} -L(r + s) = L(r) ∪ L(s) - -L(ε) = {ε} -L(r . s) = {vw | v ∈ L(r), w ∈ L(s)} -L(r+ ) = {v1 . . . vn | n ≥ 1 and v1 , . . . , vn ∈ L(r)} - -L(r?) = L(r) ∪ {ε}. -Note that the Kleene star operator (denoting zero or more repititions as in r∗ ) is -not allowed by the above syntax. This is not a restriction, since r∗ can always be -represented as (r+ )? or (r?)+ . 
Conversely, the latter can always be rewritten into -the former for presentation to the user. -The class of all regular expressions is actually too large for our purposes, as both -DTDs and XSDs require the regular expressions occurring in them to be deterministic (also sometimes called one-unambiguous [Brüggemann-Klein and Wood -1998]). Intuitively, a regular expression is deterministic if, without looking ahead -in the input word, it allows to match each symbol of that word uniquely against a -position in the expression when processing the input in one pass from left to right. -For instance, (a + b)∗ a is not deterministic as already the first symbol in the word -aaa could be matched by either the first or the second a in the expression. Without -lookahead, it is impossible to know which one to choose. The equivalent expression -b∗ a(b∗ a)∗ , on the other hand, is deterministic. -Definition 1.2. Formally, let $\bar{r}$ stand for the regular expression obtained from r -by replacing the ith occurrence of alphabet symbol a in r by $a^{(i)}$, for every i and -a. For example, for $r = b^+ a (b a^+)?$ we have $\bar{r} = {b^{(1)}}^+ a^{(1)} (b^{(2)} {a^{(2)}}^+)?$. A regular -ACM Journal Name, Vol. V, No. N, November 2024. - -3 - - 4 - -· - -Geert Jan Bex et al. - -expression r is deterministic if there are no words $w a^{(i)} v$ and $w a^{(j)} v'$ in $L(\bar{r})$ such -that $i \neq j$. -Equivalently, an expression is deterministic if the Glushkov construction [Brüggemann-Klein 1993] translates it into a deterministic finite automaton rather than a nondeterministic one [Brüggemann-Klein and Wood 1998]. Not every non-deterministic -regular expression is equivalent to a deterministic one [Brüggemann-Klein and -Wood 1998]. Thus, semantically, the class of deterministic regular expressions -forms a strict subclass of the class of all regular expressions. 
-For the purpose of inferring DTDs and XSDs from XML data, we are hence in -search of an algorithm that, given enough sample words of a target deterministic -regular expression r, returns a deterministic expression r0 equivalent to r. In the -framework of learning in the limit [Gold 1967], such an algorithm is said to learn -the deterministic regular expressions from positive data. -Definition 1.3. Define a sample to be a finite subset of Σ∗ and let R be a subclass -of the regular expressions. An algorithm M mapping samples to expressions in R -learns R in the limit from positive data if (1) S ⊆ L(M (S)) for every sample S and -(2) to every r ∈ R we can associate a so-called characteristic sample Sr ⊆ L(r) such -that, for each sample S with Sr ⊆ S ⊆ L(r), M (S) is equivalent to r. -Intuitively, the first condition says that M must be sound ; the second that M -must be complete, given enough data. A class of regular expressions R is learnable -in the limit from positive data if an algorithm exists that learns R. For the class of -all regular expressions, it was shown by Gold that no such algorithm exists [Gold -1967]. We extend this result to the class of deterministic expressions: -Theorem 1.4. The class of deterministic regular expressions is not learnable in -the limit from positive data. -Proof. It was shown by Gold [1967, Theorem I.8], that any class of regular -expressions that contains all non-empty finite languages as well as at least one -infinite language is not learnable in the limit from positive data. Since deterministic -regular expressions like a∗ define an infinite language, it suffices to show that every -non-empty finite language is definable by a deterministic expression. Hereto, let -S be a finite, non-empty set of words. Now consider the prefix tree T for S. 
For -example, if S = {a, aab, abc, aac}, we have the following prefix tree: -a -a -b c - -b -c - -Nodes for which the path from the root to that node forms a word in S are marked -by double circles. In particular, all leaf nodes are marked. -By viewing the internal nodes in T with two or more children as disjunctions; -internal nodes in T with one child as conjunctions; and adding a question mark for -every marked internal node in T , it is straightforward to transform T into a regular -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -expression. For example, with S and T as above we get r = a .(b . c + a .(b + c))?. -Clearly, L(r) = S. Moreover, since no node in T has two edges with the same label, -r must be deterministic. -Theorem 1.4 immediately excludes the possibility for an algorithm to infer the -full class of DTDs or XSDs. In practice, however, regular expressions occurring -in DTDs and XSDs are concise rather than arbitrarily complex. Indeed, a study -of 819 DTDs and XSDs gathered from the Cover Pages [Cover 2003] (including -many high-quality XML standards) as well as from the web at large, reveals that -regular expressions occurring in practical schemas are such that every alphabet -symbol occurs only a small number of times [Martens et al. 2006]. In practice, -therefore, it suffices to learn the subclass of deterministic regular expressions in -which each alphabet symbol occurs at most k times, for some small k. We refer to -such expressions as k-occurrence regular expressions. -Definition 1.5. A regular expression is k-occurrence if every alphabet symbol -occurs at most k times in it. -For example, the expressions customer . order+ and (school + institute)+ are -both 1-occurrence, while id .(qty+id) is 2-occurrence (as id occurs twice). Observe -that if r is k-occurrence, then it is also l-occurrence for every l ≥ k. 
To simplify -notation in what follows, we abbreviate ‘k-occurrence regular expression’ by k-ORE -and also refer to the 1-OREs as ‘single occurrence regular expressions’ or SOREs. -1.2 - -Outline and Contributions - -Actually, the above mentioned examination shows that in the majority of the cases -k = 1. Motivated by that observation, we have studied and suggested practical -learning algorithms for the class of deterministic SOREs in a companion article [Bex -et al. 2006]. These algorithms, however, can only output SOREs even when the -target regular expression is not. In that case they always return an approximation -of the target expressions. It is therefore desirable to also have learning algorithms -for the class of deterministic k-OREs with k ≥ 2. Furthermore, since the exact -k-value for the target expression, although small, is unknown in a schema inference -setting, we also require an algorithm capable of determining the best value of k -automatically. -We begin our study of this problem in Section 3 by showing that, for each fixed k, -the class of deterministic k-OREs is learnable in the limit from positive examples -only. We also argue, however, that this theoretical algorithm is unlikely to work -well in practice as it does not provide a method to automatically determine the -best value of k and needs samples whose size can be exponential in the size of the -alphabet to successfully learn some target expressions. -In view of these observations, we provide in Section 4 the practical algorithm -iDRegEx. Given a sample of words S, iDRegEx derives corresponding deterministic k-OREs for increasing values of k and selects from these candidate expressions -the expression that describes S best. To determine the “best” expression we propose two measures: (1) a Language Size measure and (2) a Minimum Description -Length measure based on the work of Adriaans and Vitányi [2006]. 
The main technical contribution lies in the subroutine used to derive the actual k-OREs for S. -ACM Journal Name, Vol. V, No. N, November 2024. - -5 - - 6 - -· - -Geert Jan Bex et al. - -Indeed, while for the special case where k = 1 one can derive a k-ORE by first -learning an automaton A for S using the inference algorithm of Garcia and Vidal -[1990], and by subsequently translating A into a 1-ORE (as shown in [Bex et al. -2006]), this approach does not work when k ≥ 2. In particular, the algorithm of -Garcia and Vidal only works when learning languages that are “n-testable” for -some fixed natural number n [Garcia and Vidal 1990]. Although every language -definable by a 1-ORE is 2-testable [Bex et al. 2006], there are languages definable -by a 2-ORE, for instance a∗ ba∗ , that are not n-testable for any n. We therefore -use a probabilistic method based on Hidden Markov Models to learn an automaton -for S, which is subsequently translated into a k-ORE. -The effectiveness of iDRegEx is empirically validated in Section 5 both on real -world and synthetic data. We compare the results of iDRegEx with those of -the algorithm presented in previous work [Bex et al. 2008], to which we refer as -iDRegEx(rwr0 ). -2. - -RELATED WORK - -Semi-structured data. In the context of semi-structured data, the inference of -schemas as defined in [Buneman et al. 1997; Quass et al. 1996] has been extensively studied [Goldman and Widom 1997; Nestorov et al. 1998]. No methods were -provided to translate the inferred types to regular expressions, however. -DTD and XSD inference. In the context of DTD inference, Bex et al. [2006] -gave in earlier work two inference algorithms: one for learning 1-OREs and one for -learning the subclass of 1-OREs known as chain regular expressions. 
The latter -class can also be learned using Trang [Clark ], state of the art software written -by James Clark that is primarily intended as a translator between the schema -languages DTD, Relax NG [Clark and Murata 2001], and XSD, but also infers a -schema for a set of XML documents. In contrast, our goal in this article is to infer -the more general class of deterministic expressions. xtract [Garofalakis et al. -2003] is another regular expression learning system with similar goals. We note -that xtract also uses the Minimum Description Length principle to choose the -best expression from a set of candidates. -Other relevant DTD inference research is [Sankey and Wong 2001] and [Chidlovskii -2001] that learn finite automata but do not consider the translation to deterministic -regular expressions. Also, in [Young-Lai and Tompa 2000] a method is proposed to -infer DTDs through stochastic grammars where right-hand sides of rules are represented by probabilistic automata. No method is provided to transform these into -regular expressions. Although Ahonen [1996] proposes such a translation, the effectiveness of her algorithm is only illustrated by a single case study of a dictionary -example; no experimental study is provided. -Also relevant are the XSD inference systems [Bex et al. 2007; Clark ; Hegewald -et al. 2006] that, as already mentioned, rely on the same methods for learning -regular expressions as DTD inference. -Regular expression inference. Most of the learning of regular languages from -positive examples in the computational learning community is directed towards inference of automata as opposed to inference of regular expressions [Angluin and -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -Smith 1983; Pitt 1989; Sakakibara 1997]. 
However, these approaches learn strict -subclasses of the regular languages which are incomparable to the subclasses considered here. Some approaches to inference of regular expressions for restricted cases -have been considered. For instance, [Brāzma 1993] showed that regular expressions -without union can be approximately learned in polynomial time from a set of examples satisfying some criteria. [Fernau 2005] provided a learning algorithm for -regular expressions that are finite unions of pairwise left-aligned union-free regular -expressions. The development is purely theoretical; no experimental validation has -been performed. -HMM learning. Although there has been work on Hidden Markov Model structure induction [Rabiner 1989; Freitag and McCallum 2000], the requirement in our -setting that the resulting automaton is deterministic is, to the best of our knowledge, unique. -3. - -BASIC RESULTS - -In this section we establish that, in contrast to the class of all deterministic expressions, the subclass of deterministic k-OREs can theoretically be learned in the limit -from positive data, for each fixed k. We also argue, however, that this theoretical -algorithm is unlikely to work well in practice. -Let Σ(r) denote the set of alphabet symbols that occur in a regular expression -r, and let Σ(S) be similarly defined for a sample S. Define the length of a regular expression r as the length of its string representation, including operators and -parentheses. For example, the length of (a . b)+ ? + c is 9. -Theorem 3.1. For every k there exists an algorithm M that learns the class of -deterministic k-OREs from positive data. Furthermore, on input S, M runs in -time polynomial in the size of S, yet exponential in k and |Σ(S)|. -Proof. The algorithm M is based on the following observations. 
First observe -that every deterministic k-ORE r over a finite alphabet A ⊆ Σ can be simplified -into an equivalent deterministic k-ORE r0 of length at most 10k|A| by rewriting r -according to the following system of rewrite rules until no more rule is applicable: -((s)) → (s) -s?? → s? -s + ε → s? -s.ε → s -ε? → ε -s+∅ → s -s.∅ → ∅ -∅? → ∅ - -s?+ → s+ ? -s++ → s+ -ε + s → s? -ε.s → s -ε+ → ε -∅+s → s -∅.s → ∅ -∅+ → ∅ - -(The first rewrite rule removes redundant parentheses in r.) Indeed, since each -rewrite rule clearly preserves determinism and language equivalence, r0 must be a -deterministic expression equivalent to r. Moreover, since none of the rewrite rules -duplicates a subexpression and since r is a k-ORE, so is r0 . Now note that, since -ACM Journal Name, Vol. V, No. N, November 2024. - -7 - - 8 - -· - -Geert Jan Bex et al. - -no rewrite rule applies to it, r0 is either ∅, ε, or generated by the following grammar -t ::= a | a? | a+ | a+ ? | (a) | (a)? | (a)+ | (a)+ ? -| t1 . t2 | (t1 . t2 ) | (t1 . t2 )? | (t1 . t2 )+ | (t1 . t2 )+ ? -| t1 + t2 | (t1 + t2 ) | (t1 + t2 )? | (t1 + t2 )+ | (t1 + t2 )+ ? -It is not difficult to verify by structural induction that any expression t produced -by this grammar has length -$|t| \le -4 + 10 \sum_{a \in \Sigma(t)} \mathrm{rep}(t, a)$, - -where rep(t, a) denotes the number of times alphabet symbol a occurs in t. For -instance, rep(b .(b + c), a) = 0 and rep(b .(b + c), b) = 2. Since rep(r0 , a) ≤ k for -every a ∈ Σ(r0 ), it readily follows that |r0 | ≤ 10k|A| − 4 ≤ 10k|A|. -Then observe that all possible regular expressions over A of length at most 10k|A| -can be enumerated in time exponential in k|A|. 
Since checking whether a regular expression is deterministic is decidable in polynomial time [Brüggemann-Klein -and Wood 1998]; and since equivalence of deterministic expressions is decidable in -polynomial time [Brüggemann-Klein and Wood 1998], it follows by the above observations that for each k and each finite alphabet A ⊆ Σ it is possible to compute -in time exponential in k|A| a finite set RA of pairwise non-equivalent deterministic -k-OREs over A such that -—every r ∈ RA is of size at most 10k|A|; and -—for every deterministic k-ORE r over A there exists an equivalent expression -r0 ∈ RA . -(Note that since RA is computable in time exponential in k|A|, it has at most an -exponential number of elements in k|A|.) Now fix, for each finite A ⊆ Σ an arbitrary -order ≺ on RA , subject to the provision that r ≺ s only if L(s) − L(r) 6= ∅. Such -an order always exists since RA does not contain equivalent expressions. -Then let M be the algorithm that, upon sample S, computes RΣ(S) and outputs -the first (according to ≺) expression r ∈ RΣ(S) for which S ⊆ L(r). Since RΣ(S) can -be computed in time exponential in k|Σ(S)|; since there are at most an exponential -number of expressions in RΣ(S) ; since each expression r ∈ RΣ(S) has size at most -10k|Σ(S)|; and since checking membership in L(r) of a single word w ∈ S can be -done in time polynomial in the size of w and r, it follows that M runs in time -polynomial in S and exponential in k|Σ(S)|. -Furthermore, we claim that M learns the class of deterministic k-OREs. Clearly, -S ⊆ L(M (S)) by definition. Hence, it remains to show completeness, i.e., that we -can associate to each deterministic k-ORE r a sample Sr ⊆ L(r) such that, for each -sample S with Sr ⊆ S ⊆ L(r), M (S) is equivalent to r. Note that, by definition of -RΣ(r) , there exists a deterministic k-ORE r0 ∈ RΣ(r) equivalent to r. 
Initialize Sr -to an arbitrary finite subset of L(r) = L(r0 ) such that each alphabet symbol of r -occurs at least once in S, i.e., Σ(Sr ) = Σ(r). Let r1 ≺ · · · ≺ rn be all predecessors of -r0 in RΣ(r) according to ≺. By definition of ≺, there exists a word wi ∈ L(r)−L(ri ) -for every 1 ≤ i ≤ n. Add all of these words to Sr . Then clearly, for every sample S -with Sr ⊆ S ⊆ L(r) we have Σ(S) = Σ(r) and S 6⊆ L(ri ) for every 1 ≤ i ≤ n. Since -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -M (S) is the first expression in RΣ(r) with S ⊆ L(r), we hence have M (S) = r0 ≡ r, -as desired. -While Theorem 3.1 shows that the class of deterministic k-OREs is better suited -for learning from positive data than the complete class of deterministic expressions, -it does not provide a useful practical algorithm, for the following reasons. -(1) First and foremost, M runs in time exponential in the size of the alphabet Σ(S), -which may be problematic for the inference of schema’s with many element -names. -(2) Second, while Theorem 3.1 shows that the class of deterministic k-OREs is -learnable in the limit for each fixed k, the schema inference setting is such that -we do not know k a priori. If we overestimate k then M (S) risks being an underapproximation of the target expression r, especially when S is incomplete. -To illustrate, consider the 1-ORE target expression r = a+ b+ and sample -S = {ab, abbb, aabb}. If we overestimate k to, say, 2 instead of 1, then M is free -to output aa?b+ as a sound answer. On the other hand, if we underestimate k -then M (S) risks being an over-approximation of r. Consider, for instance, the -2-ORE target expression r = aa?b+ and the same sample S = {ab, abbb, aabb}. -If we underestimate k to be 1 instead of 2, then M can only output 1-OREs, -and needs to output at least a+ b+ in order to be sound. 
In summary: we need -a method to determine the most suitable value of k. -(3) Third, the notion of learning in the limit is a very liberal one: correct expressions need only be derived when sufficient data is provided, i.e., when the input -sample is a superset of the characteristic sample for the target expression r. -The following theorem shows that there are reasonably simple expressions r -such that the characteristic sample Sr of any sound and complete learning algorithm is at least exponential in the size of r. As such, it is unlikely for any -sound and complete learning algorithm to behave well on real-world samples, -which are typically incomplete and hence unlikely to contain all words of the -characteristic sample. -Theorem 3.2. Let A = {a1 , . . . , an } ⊆ Σ consist of n distinct element names. -Let r1 = (a1 a2 + a3 + · · · + an )+ , and let r2 = (a2 + · · · + an )+ a1 (a2 + · · · + an )+ . -For any algorithm that learns the class of deterministic (2n + 3)-OREs and any -sample S that is characteristic for r1 or r2 we have $|S| \ge \sum_{i=1}^{n} (n-2)^i$. -Proof. First consider r1 = (a1 a2 + a3 + · · · + an )+ . Observe that there exist -an exponential number of deterministic (2n + 3)-OREs that differ from r1 in only -a single word. Indeed, let B = A − {a1 , a2 } and let W consist of all non-empty -words w over B of length at most n. Define, for every word w = b1 . . . bm ∈ W the -deterministic (2n + 3)-ORE rw such that L(rw ) = L(r1 ) − {w} as follows. First, -define, for every 1 ≤ i ≤ m the deterministic 2-ORE $r_w^i$ that accepts all words in -L(r1 ) that do not start with bi : -$r_w^i := (a_1 a_2 + (B - \{b_i\})) . (a_1 a_2 + a_3 + \cdots + a_n)^*$ - -Clearly, v ∈ L(r1 ) − {w} if, and only if, v ∈ L(r1 ) and there is some 0 ≤ i ≤ m -such that v agrees with w on the first i letters, but differs in the (i + 1)-th letter. -ACM Journal Name, Vol. V, No. N, November 2024. - -9 - - 10 - -· - -Geert Jan Bex et al. 
- -Hence, it suffices to take -$r_w := r_w^1 + b_1(\varepsilon + r_w^2 + b_2(\varepsilon + r_w^3 + b_3(\cdots + b_{m-1}(\varepsilon + r_w^m + b_m . r_1) \ldots)))$ - -Now assume that algorithm M learns the class of deterministic (2n + 3)-OREs and -suppose that Sr1 is characteristic for r1 . In particular, Sr1 ⊆ L(r1 ). By definition, -M (S) is equivalent to r1 for every sample S with Sr1 ⊆ S ⊆ L(r1 ). We claim that -in order for M to have this property, W must be a subset of Sr1 . Then, since W -contains all words over B of length at most n, $|S_{r_1}| \ge \sum_{i=1}^{n} (n-2)^i$, as desired. The -intuitive argument why W must be a subset of Sr1 is that if there exists w in W − Sr1 , -then M cannot distinguish between r1 and rw . Indeed, suppose for the purpose -of contradiction that there is some w ∈ W with w ∉ Sr1 . Then Sr1 is a subset of -L(rw ). Indeed, Sr1 = Sr1 − {w} ⊆ L(r1 ) − {w} = L(rw ). Furthermore, since M -learns the class of deterministic (2n + 3)-OREs, there must be some characteristic -sample Srw for rw . Now, consider the sample Sr1 ∪ Srw . It is included in both -L(r1 ) and L(rw ) and is a superset of both Sr1 and Srw . But then, by definition of -characteristic samples, M (Sr1 ∪ Srw ) must be equivalent to both r1 and rw . This -is absurd, however, since L(r1 ) ≠ L(rw ) by construction. -A similar argument shows that the characteristic sample Sr2 of r2 = (a2 + · · · + -an )+ a1 (a2 + · · · + an )+ also requires $\sum_{i=1}^{n} (n-2)^i$ elements. In this case, we take -B = A − {a1 } and we take W to be the set of all non-empty words over B of -length at most n. For each w = b1 . . . bm ∈ W , we construct the deterministic -(2n + 3)-ORE rw such that L(rw ) accepts all words in L(r2 ) that do not end with -a1 w, as follows. Let, for 1 ≤ i ≤ m, $r_w^i$ be the 2-ORE that accepts all words in B+ -that do not start with bi : -$r_w^i := (B - \{b_i\}) . 
B^*$ - -Then it suffices to take -$r_w := B^+ a_1 (r_w^1 + b_1(\varepsilon + r_w^2 + b_2(\cdots + b_{m-1}(\varepsilon + r_w^m + b_m B^+) \ldots)))$. 
- -A similar argument as for r1 then shows that the characteristic sample Sr2 of r2 -needs to contain, for each w ∈ W , at least one word of the form va1 w with v ∈ B + . -Therefore, $|S_{r_2}| \ge \sum_{i=1}^{n} (n-2)^i$, as desired. -4. - -THE LEARNING ALGORITHM - -In view of the observations made in Section 3, we present in this section a practical -learning algorithm that (1) works well on incomplete data and (2) automatically -determines the best value of k (see Section 5 for an experimental evaluation). Specifically, given a sample S, the algorithm derives deterministic k-OREs for increasing -values of k and selects from these candidate expressions the k-ORE that describes -S best. To determine the “best” expression we propose two measures: (1) a Language Size measure and (2) a Minimum Description Length measure based on the -work of Adriaans and Vitányi [2006]. -Our algorithm does not derive deterministic k-OREs for S directly, but uses, for -each fixed k, a probabilistic method to first learn an automaton for S, which is subsequently translated into a k-ORE. The following section (Section 4.1) explains how -the probabilistic method that learns an automaton from S works. Section 4.2 explains how the learned automaton is translated into a k-ORE. Finally, Section 4.3, -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -introduces the whole algorithm, together with the two measures to determine the -best candidate expression. -4.1 - -Probabilistically Learning a Deterministic Automaton - -In particular, the algorithm first learns a deterministic k-occurrence automaton -(deterministic k-OA) for S. This is a specific kind of finite state automaton in -which each alphabet symbol can occur at most k times. Figure 2(a) gives an -example. 
Note that in contrast to the classical definition of an automaton, no -edges are labeled: all incoming edges in a state s are assumed to be labeled by the -label of s. In other words, the 2-OA of Figure 2(a) accepts the same language as -aa?b+ . -Definition 4.1 (k-OA). An automaton is a node-labeled graph G = (V, E, lab) -where -—V is a finite set of nodes (also called states) with a distinguished source src ∈ V -and sink sink ∈ V ; -—the edge relation E is such that src has only outgoing edges; sink has only -incoming edges; and every state v ∈ V − {src, sink } is reachable by a walk from -src to sink ; -—lab : V − {src, sink } → Σ is the labeling function. -In this context, an accepting run for a word a1 . . . an is a walk src s1 . . . sn sink -from src to sink in G such that ai = lab(si ) for 1 ≤ i ≤ n. As usual, we denote -by L(G) the set of all words for which an accepting run exists. An automaton is -k-occurrence (a k-OA) if there are at most k states labeled by the same alphabet -symbol. If G uses only labels in A ⊆ Σ then G is an automaton over A. -In what follows, we write Succ(s) for the set {t | (s, t) ∈ E} of all direct successors -of state s in G, and Pred(s) for the set {t | (t, s) ∈ E} of all direct predecessors -of s in G. Furthermore, we write Succ(s, a) and Pred(s, a) for the set of states in -Succ(s) and Pred(s), respectively, that are labeled by a. As usual, an automaton G -is deterministic if Succ(s, a) contains at most one state, for every s ∈ V and a ∈ Σ. -For convenience, we will also refer to the 1-OAs as “single occurrence automata” -or SOAs for short. -We learn a deterministic k-OA for a sample S as follows. First, recall from -Section 3 that Σ(S) is the set of alphabet symbols occurring in words in S. We view -S as the result of a stochastic process that generates words from Σ∗ by performing -random walks on the complete k-OA Ck over Σ(S). -Definition 4.2. 
Define the complete k-OA Ck over Σ(S) to be the k-OA G = -(V, E, lab) over Σ(S) in which each a ∈ Σ(S) labels exactly k states such that -—there is an edge from src to sink ; -—src is connected to exactly one state labeled by a, for every a ∈ Σ(S); and -—every state s ∈ V − {src, sink } has an outgoing edge to every other state except -src. -To illustrate, the complete 2-OA over {a, b} is shown in Figure 2(b). Clearly, -L(Ck ) = Σ(S)∗ . -ACM Journal Name, Vol. V, No. N, November 2024. - -11 - - 12 - -· - -Geert Jan Bex et al. - -a - -a - -b -(a) An example 2-OA. It accepts -the same language as aa?b+ -Fig. 2. - -a - -a - -b - -b - -(b) The complete -{a, b}. - -2-OA - -over - -Two 2-OAs. - -The stochastic process that generates words from Σ∗ by performing random walks -on Ck operates as follows. First, the process picks, among all states in Succ(src), -a state s1 with probability α(src, s1 ) and emits lab(s1 ). Then it picks, among -all states in Succ(s1 ) a state s2 with probability α(s1 , s2 ) and emits lab(s2 ). The -process continues moving to new states and emitting their labels until the final state -is reached (which does not emit a symbol). Of course, α must be a true probability -distribution, i.e., -$\alpha(s, t) \ge 0$; and $\sum_{t \in \mathrm{Succ}(s)} \alpha(s, t) = 1$ (1) - -for all states s ≠ sink and all states t. The probability of generating a particular -accepting run $\vec{s}$ = src s1 s2 . . . sn sink given the process P = (Ck , α) in this setting -is -$P[\vec{s} \mid \mathcal{P}] = \alpha(\mathit{src}, s_1) \cdot \alpha(s_1, s_2) \cdot \alpha(s_2, s_3) \cdots \alpha(s_n, \mathit{sink})$, -and the probability of generating the word w = a1 . . . an is -$P[w \mid \mathcal{P}] = \sum_{\text{all accepting runs } \vec{s} \text{ of } w \text{ in } C_k} P[\vec{s} \mid \mathcal{P}]$. - -Assuming independence, the probability of obtaining all words in the sample S is -then -$P[S \mid \mathcal{P}] = \prod_{w \in S} P[w \mid \mathcal{P}]$. 
- -Clearly, the process that best explains the observation of S is the one in which the -probabilities α are such that they maximize P [S | P]. 
-To learn a deterministic k-OA for S we therefore first try to infer from S the -probability distribution α that maximizes P [S | P], and use this distribution to -determine the topology of the desired deterministic k-OA. In particular, we remove -from Ck the non-deterministic edges with the lowest probability as these are the -least likely to contribute to the generation of S, and are therefore the least likely -to be necessary for the acceptance of S. -The problem of inferring α from S is well-studied in Machine Learning, where -our stochastic process P corresponds to a particular kind of Hidden Markov Model -sometimes referred to as a Partially Observable Markov Model (POMM for short). -(For the readers familiar with Hidden Markov Models we note that the initial -state distribution π usually considered in Hidden Markov Models is absorbed in -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -Algorithm 1 iKoa -Require: a sample S, a value for k -Ensure: a deterministic k-OA G with S ⊆ L(G) -1: P ← init(k, S) -2: P ← BaumWelsh(P, S) -3: G ← Disambiguate(P, S) -4: G ← Prune(G, S) -5: return G -Algorithm 2 Disambiguate -Require: a POMM P = (G, α) and sample S -Ensure: a deterministic k-OA -1: Initialize queue Q to {s ∈ Succ(src) | α(src, s) > 0} -2: Initialize set of marked states D ← ∅ -3: while Q is non-empty do -4: s ← first(Q) -5: while some a ∈ Σ has |Succ(s, a)| > 1 do -6: pick t ∈ Succ(s, a) with $\alpha(s, t) = \max\{\alpha(s, t') \mid t' \in \mathrm{Succ}(s, a)\}$ -7: set $\alpha(s, t) \leftarrow \sum \{\alpha(s, t') \mid t' \in \mathrm{Succ}(s, a)\}$ -8: for all t′ in Succ(s, a) \ {t} do -9: delete edge (s, t′) from G -10: set α(s, t′) ← 0 -11: P ← BaumWelsh(P, S) -12: if S ⊈ L(G) then Fail -13: add s to marked states D and pop s from Q -14: enqueue all states in Succ(s) \ D to Q -15: return G -the state transition distribution α(src, ·) in our context.) 
Inference of α is generally -accomplished by the well-known Baum-Welsh algorithm [Rabiner 1989] that adjusts -initial values for α until a (possibly local) maximum is reached. -We use Baum-Welsh in our learning algorithm iKoa shown in Algorithm 1, which -operates as follows. In line 1, iKoa initializes the stochastic process P to the tuple -(Ck , α) where -—Ck is the complete k-OA over Σ(S); -—α(src, sink ) is the fraction of empty words in S; -—α(src, s) is the fraction of words in S that start with lab(s), for every s ∈ -Succ(src); and -—α(s, t) is chosen randomly for s 6= src, subject to the constraints in equation (1). -It is important to emphasize that, since we are trying to model a stochastic process, -multiple occurrences of the same word in S are important. A sample should therefore not be considered as a set in Algorithm 1, but as a bag. Line 2 then optimizes -the initial values of α using the Baum-Welsh algorithm. -With these probabilities in hand Disambiguate, shown in Algorithm 2, determines the topology of the desired deterministic k-OA for S. In a breadth-first -ACM Journal Name, Vol. V, No. N, November 2024. - -13 - - 14 - -· - -Geert Jan Bex et al. - -manner, it picks for each state s and each symbol a the state t ∈ Succ(s, a) with -the highest probability and deletes all other edges to states labeled by a. Line 7 -merely ensures that α continues to be a probability distribution after this removal -and line 11 adjusts α to the new topology. Line 12 is a sanity check that ensures -that we have not removed edges necessary to accept all words in S; Disambiguate -reports failure otherwise. The result of a successful run of Disambiguate is a -deterministic k-OA which nevertheless may have edges (s, t) for which there is no -witness in S (i.e., a word in S whose unique accepting run traverses (s, t)). The -function Prune in line 4 of iKoa removes all such edges. It also removes all states -s ∈ Succ(src) without a witness in S. 
Figure 3 illustrates a hypothetical run of -iKoa. -It should be noted that BaumWelsh, which iteratively refines α until a (possibly local) maximum is reached, is computationally quite expensive. For that -reason, our implementation only executes a fixed number of refinement iterations -of BaumWelsh in Line 11. Rather surprisingly, this cut-off actually improves the -precision of iDRegEx, as our experiments in Section 5 show, where it is discussed -in more detail. -4.2 - -Translating k-OAs into k-OREs - -Once we have learned a deterministic k-OA for a given sample S using iKoa -it remains to translate this k-OA into a deterministic k-ORE. An obvious approach in this respect would be to use the classical state elimination algorithm -(cf., e.g., [Hopcroft and Ullman 2007]). Unfortunately, as already hinted upon by -Fernau [2004; 2005] and as we illustrate below, it is very difficult to get concise -regular expressions from an automaton representation. For instance, the classical -state elimination algorithm applied to the SOA in Figure 4 yields the expression:1 -(aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + -aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ -(b + aa∗ b))∗ (aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d)))(aa∗ d + -(c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + aa∗ c)(c + -aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))∗ - -which is non-deterministic and differs quite a bit from the equivalent deterministic -SORE -((b?(a + c))+ d)+ e. -Actually, results by Ehrenfeucht and Zeiger [1976]; Gelade and Neven [2008]; and -Gruber and Holzer [2008] show that it is impossible in general to generate concise -regular expressions from automata: there are k-OAs (even for k = 1) for which the -number of occurrences of alphabet symbols in the smallest equivalent expression is -exponential in the size of the automaton. For such automata, an equivalent k-ORE -hence does not exist. 
-It is then natural to ask whether there is an algorithm that translates a given -k-OA into an equivalent k-ORE when such a k-ORE exists, and returns a k-ORE -super approximation of the input k-OA otherwise. Clearly, the above example -shows that the classical state elimination algorithm does not suffice for this purpose. -1 Transformation computed by JFLAP: www.jflap.org. - -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -α -src -a1 -a2 -b1 -b2 - -a1 - -a2 - -a1 - -a2 - -b1 - -b2 - -b1 - -b2 - -a1 -1 -0.2 -0.4 -0.1 -0.1 - -a2 -\ -0.3 -0.1 -0.3 -0.1 - -b1 -0 -0.3 -0.2 -0.3 -0.2 - -b2 -\ -0.1 -0.1 -0.2 -0.5 - -sink -0 -0.1 -0.2 -0.1 -0.1 - -α -src -a1 -a2 -b1 -b2 - -(a) Process P returned by init with random values for α. - -α -src -a1 -a2 -b1 -b2 - -a1 -1 -0 -0.01 -0.01 -0.01 - -a1 -1 -0.2 -0.01 -0.01 -0.01 - -a2 -\ -0.3 -0.01 -0.01 -0.01 - -b1 -0 -0.3 -0.6 -0.5 -0.33 - -(b) Process P after -BaumWelsh. - -first - -a1 - -a2 - -a1 - -a2 - -b1 - -b2 - -b1 - -b2 - -a2 -\ -0.5 -0.01 -0.01 -0.01 - -b1 -0 -0.49 -0.6 -0.5 -0.33 - -b2 -\ -0 -0.37 -0.28 -0.5 - -sink -0 -0.01 -0.01 -0.2 -0.15 - -α -src -a1 -a2 -b1 -b2 - -(c) Process P after first disambiguation step -(for a1 ). Edges to a1 and b2 are removed. - -a1 -1 -0 -0.01 -0.02 -0.01 - -a2 -\ -0.5 -0.01 -0 -0.01 - -b1 -0 -0.49 -0.6 -0.78 -0.38 - -a - -a - -b - -b - -b - -returned - -sink -0 -0.01 -0.01 -0.2 -0.15 - -training - -b2 -\ -0 -0.37 -0 -0.4 - -by - -sink -0 -0.01 -0.01 -0.2 -0.2 - -(d) Process P after second disambiguation step -(for b1 ). Edges to a2 and b2 are removed. - -a - -(e) Automaton -A -Disambiguate. - -b2 -\ -0.19 -0.37 -0.28 -0.5 - -· - -a - -(f) Automaton A returned by Prune. It -accepts the same language as aa?b+ . - -by - -Fig. 3. Example run of iKoa for k = 2 with target language aa?b+ . For the process -P in (c)-(f), the α values are listed in table-form. 
To distinguish different states -with the same label, we have indexed the labels. - -b - -a - -d - -c - -e - -Fig. 4. A SOA on which the classical state elimination algorithm returns a complicated expression. -ACM Journal Name, Vol. V, No. N, November 2024. - -15 - - 16 - -· - -Geert Jan Bex et al. -a(1) - -a(2) - -b(1) - -Fig. 5. - -An example marking - -For that reason, we have proposed in a companion article [Bex et al. ] a family -of algorithms {rwr, rwr21 , rwr22 , rwr23 , . . . } that translate SOAs into SOREs and -have exactly these properties: -Theorem 4.3 ([Bex et al. ]). Let G be a SOA and let T be any of the algorithms in the family {rwr, rwr21 , rwr22 , rwr23 , . . . }. If G is equivalent to a SORE -r, then T (G) returns a SORE equivalent to r. Otherwise, T (G) returns a SORE -that is a super approximation of G, L(G) ⊆ L(T (G)). -(Note that SOAs and SOREs are always deterministic by definition.) -These algorithms, in short, apply an inverse Glushkov translation. Starting from -a k-OA where each state is labeled by a symbol, they iteratively rewrite subautomata into equivalent regular expressions. In the end only one state remains and -the regular expression labeling this state is the output. -In this section, we show how the above algorithms can be used to translate k-OAs -into k-OREs. For simplicity of exposition, we will focus our discussion on rwr21 as -it is the concrete translation algorithm used in our experiments in Section 5, but -the same arguments apply to the other algorithms in the family. -Definition 4.4. First, let Σ(k) denote the alphabet that consists of k copies of -the symbols in Σ, where the first copy of a ∈ Σ is denoted by a(1) , the second by -a(2) , and so on: -Σ(k) := {a(i) | a ∈ Σ, 1 ≤ i ≤ k}. -Let strip be the function mapping copies to their original symbol, i.e., strip(a(i) ) = -a. We extend strip pointwise to words, languages, and regular expressions over -Σ(k) . 
-For example, $\mathit{strip}(\{a^{(1)} a^{(2)} b^{(1)},\ a^{(2)} a^{(2)} c^{(2)}\}) = \{aab, aac\}$ and $\mathit{strip}(a^{(1)} \cdot a^{(2)}? \cdot {b^{(1)}}^{+}) = a \cdot a? \cdot b^{+}$. -To see how we can use rwr21 , which translates SOAs into SOREs, to translate -a k-OA into a k-ORE, observe that we can always transform a k-OA G over Σ -into a SOA H over Σ(k) by processing the nodes of G in an arbitrary order and -replacing the ith occurrence of label a ∈ Σ by a(i) . To illustrate, the SOA over Σ(2) -obtained in this way from the 2-OA in Figure 2(a) is shown in Figure 5. Clearly, -L(G) = strip(L(H)). -Definition 4.5. We call a SOA H over Σ(k) obtained from a k-OA G in the above -manner a marking of G. -Note that, by Theorem 4.3, running rwr21 on H yields a SORE r over Σ(k) -with L(H) ⊆ L(r). For instance, with H as in Figure 5, rwr21 (H) returns r = -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -Algorithm 3 rwr2 -Require: a k-OA G -Ensure: a k-ORE r with L(G) ⊆ L(r) -1: compute a marking H of G. -2: return strip(rwr21 (H)) - -$a^{(1)} \cdot a^{(2)}? \cdot {b^{(1)}}^{+}$. By subsequently stripping r, we always obtain a k-ORE over Σ. -Moreover, L(G) = strip(L(H)) ⊆ strip(L(r)) = L(strip(r)), so the k-ORE strip(r) -is always a super approximation of G. Algorithm 3, called rwr2 , summarizes the -translation. By our discussion, rwr2 is clearly sound: -Proposition 4.6. rwr2 (G) is a (possibly non-deterministic) k-ORE with L(G) ⊆ -L(rwr2 (G)), for every k-OA G. -Note, however, that even when G is deterministic and equivalent to a deterministic k-ORE r, rwr2 (G) need not be deterministic, nor equivalent to r. For instance, -consider the 2-OA G: -b - -a - -c - -b - -Clearly, G is equivalent to the deterministic 2-ORE $bc?a(ba)^{+}?$. Now suppose for -the purpose of illustration that rwr2 constructs the following marking H of G. (It -does not matter which marking rwr2 constructs; they all result in the same final -expression.)
-b(1) - -a(1) - -c(1) - -b(2) - -Since H is not equivalent to a SORE over Σ(k) , rwr21 (H) need not be equivalent -to L(H). In fact, rwr21 (H) returns ((b(1) c(1) ?a(1) )?b(2) ?)+ , which yields the nondeterministic ((bc?a)?b?)+ after stripping. Nevertheless, G is equivalent to the -deterministic 2-ORE bc?a(ba)+ ?. -So although rwr2 is always guaranteed to return a k-ORE, it does not provide -the same strong guarantees that rwr21 provides (Theorem 4.3). The following theorem shows, however, that if we can obtain G by applying the Glushkov construction -on r [Brüggeman-Klein 1993], rwr2 (G) is always equivalent to r. Moreover, if r -is deterministic, then so is rwr2 (G). So in this sense, rwr2 applies an inverse -Glushkov construction to r. Formally, the Glushkov construction is defined as -follows. -Definition 4.7. Let r be a k-ORE. Recall from Definition 1.2 that r is the regular -expression obtained from r by replacing the ith occurrence of alphabet symbol a -by a(i) , for every a ∈ Σ and every 1 ≤ i ≤ n. Let pos(r) denote the symbols in Σ(k) -that actually appear in r. Moreover, let the sets first(r), last(r), and follow (r, a(i) ) -be defined as shown in Figure 6. A k-OA G is a Glushkov translation of r if there -exists a one-to-one onto mapping ρ : (V (G) − {src, sink }) → pos(r) such that -ACM Journal Name, Vol. V, No. N, November 2024. - -17 - - 18 - -· - -Geert Jan Bex et al. -first(∅) -first(a(i) ) -first(r+ ) - -= -= -= - -first(r . s) - -= - -last(∅) -last(a(i) ) -last(r+ ) - -= -= -= - -last(r . s) - -= - -follow (a(i) , a(i) ) -follow (r?, a(i) ) - -= -= - -follow (r+ , a(i) ) - -= - -follow (r + s, a(i) ) - -= - -follow (r . s, a(i) ) - -= - -Fig. 6. - -∅ -first(ε) -{a(i) } -first(r?) -first(r) -first(r + s) -( -first(r) -if ε ∈ -/ L(r), -first(r) ∪ first(s) otherwise. - -= -= -= - -∅ -first(r) -first(r) ∪ first(s) - -∅ -{a(i) } -last(r) -( -last(s) -last(r) ∪ last(s) - -= -= -= - -∅ -last(r) -last(r) ∪ last(s) - -last(ε) -last(r?) 
-last(r + s) -if ε ∈ -/ L(s), -otherwise. - -∅ -follow (r, a(i) ) -( -follow (r, a(i) ) -(i) -(follow (r, a ) ∪ first(r) -follow (r, a(i) ) -follow (s, a(i) ) - -(i) - -follow (r, a ) - -follow (r, a(i) ) ∪ first(s) - - -follow (s, a(i) ) - -if a(i) ∈ -/ last(r), -otherwise. -if a(i) ∈ pos(r), -otherwise. -if a(i) ∈ pos(r), a(i) ∈ -/ last(r), -if a(i) ∈ pos(r), a(i) ∈ last(r), -otherwise. - -Definition of first(r), last(r), and follow (r, a(i) ), for a(i) ∈ pos(r). - -(1) v ∈ Succ(src) ⇔ ρ(v) ∈ first(r); -(2) v ∈ Pred(sink ) ⇔ ρ(v) ∈ last(r); -(3) v ∈ Succ(w) ⇔ ρ(v) ∈ follow (r, ρ(w)); and -(4) strip(ρ(v)) = lab(v), -for all v, w ∈ V (G) − {src, sink }. -Theorem 4.8. If k-OA G is a Glushkov representation of a target k-ORE -r, then rwr2 (G) is equivalent to r. Moreover, if r is deterministic, then so is -rwr2 (G). -Proof. Since rwr2 (G) = strip(rwr21 (H)) for an arbitrarily chosen marking -H of G, it suffices to prove that strip(rwr21 (H)) is equivalent to r and that -strip(rwr21 (H)) is deterministic whenever r is deterministic, for every marking H -of G. Hereto, let H be an arbitrary but fixed marking of G. In particular, G and H -have the same set of nodes V and edges E, but differ in their labeling function. Let -lab G be the labeling function of G and let lab H the labeling function of H. Clearly, -lab G (v) = strip(lab H (v)) for every v ∈ V − {src, sink }. Since G is a Glushkov -translation of r, there is a one-to-one, onto mapping ρ : (V − {src, sink }) → pos(r) -satisfying properties (1)-(4) in Definition 4.7. Now let σ : pos(r) → Σ(k) be the -function that maps a(i) ∈ pos(r) to lab H (ρ−1 (a(i) )). Since lab H assigns a distinct -label to each state, σ is one-to-one and onto the subset of Σ(k) symbols used as -labels in H. Moreover, by property (4) and the fact that lab G (v) = strip(lab H (v)) -ACM Journal Name, Vol. V, No. N, November 2024. 
- - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -we have, -$$\mathit{strip}(a^{(i)}) = \mathit{lab}_G(\rho^{-1}(a^{(i)})) = \mathit{strip}(\mathit{lab}_H(\rho^{-1}(a^{(i)}))) = \mathit{strip}(\sigma(a^{(i)})) \qquad (\star)$$ -for each $a^{(i)} \in \mathit{pos}(\overline{r})$. In other words, σ preserves (stripped) labels. Now let $\sigma(\overline{r})$ -be the SORE obtained from $\overline{r}$ by replacing each $a^{(i)} \in \mathit{pos}(\overline{r})$ by $\sigma(a^{(i)})$. Since σ is -one-to-one and $\overline{r}$ is a SORE, so is $\sigma(\overline{r})$. Moreover, we claim that $L(H) = L(\sigma(\overline{r}))$. -Indeed, it is readily verified by induction on r that a word $a_1^{(i_1)} \cdots a_n^{(i_n)} \in L(\overline{r})$ -if, and only if, (i) $a_1^{(i_1)} \in \mathit{first}(\overline{r})$; (ii) $a_{p+1}^{(i_{p+1})} \in \mathit{follow}(\overline{r}, a_p^{(i_p)})$ for every -1 ≤ p < n; and (iii) $a_n^{(i_n)} \in \mathit{last}(\overline{r})$. By properties (1)-(4) of Definition 4.7 we -hence obtain: -$\sigma(a_1^{(i_1)}) \cdots \sigma(a_n^{(i_n)}) \in L(\sigma(\overline{r}))$ -⇔ $a_1^{(i_1)} \cdots a_n^{(i_n)} \in L(\overline{r})$ -⇔ $\mathit{src}, \rho^{-1}(a_1^{(i_1)}), \ldots, \rho^{-1}(a_n^{(i_n)}), \mathit{sink}$ is a walk in G -⇔ $\mathit{src}, \rho^{-1}(a_1^{(i_1)}), \ldots, \rho^{-1}(a_n^{(i_n)}), \mathit{sink}$ is a walk in H -⇔ $\mathit{lab}_H(\rho^{-1}(a_1^{(i_1)})) \cdots \mathit{lab}_H(\rho^{-1}(a_n^{(i_n)})) \in L(H)$ -⇔ $\sigma(a_1^{(i_1)}) \cdots \sigma(a_n^{(i_n)}) \in L(H)$ -Therefore, $L(H) = L(\sigma(\overline{r}))$. -Hence, we have established that H is a SOA over Σ(k) equivalent to the SORE -$\sigma(\overline{r})$ over Σ(k) . By Theorem 4.3, $\mathit{rwr}^2_1(H)$ is hence equivalent to $\sigma(\overline{r})$. Therefore, -$\mathit{strip}(\mathit{rwr}^2_1(H))$ is equivalent to $\mathit{strip}(\sigma(\overline{r}))$, which by $(\star)$ above, is equivalent to -$\mathit{strip}(\overline{r}) = r$, as desired. -Finally, to see that $\mathit{strip}(\mathit{rwr}^2_1(H))$ is deterministic if r is deterministic, let -$s := \mathit{strip}(\mathit{rwr}^2_1(H))$ and suppose for the purpose of contradiction that s is not -deterministic. Then there exist $w a^{(i)} v_1$ and $w a^{(j)} v_2$ in $L(\overline{s})$ with $i \neq j$. It is -not hard to see that this can happen only if there exist $w' a^{(i')} v_1'$ and $w' a^{(j')} v_2'$ -in $L(\mathit{rwr}^2_1(H))$ with $i' \neq j'$. Since $L(\mathit{rwr}^2_1(H)) = L(\sigma(\overline{r}))$ we know that hence -$\sigma^{-1}(w' a^{(i')} v_1') \in L(\overline{r})$ and $\sigma^{-1}(w' a^{(j')} v_2') \in L(\overline{r})$. Let $w'' a^{(i'')} v_1'' = \sigma^{-1}(w' a^{(i')} v_1')$ -and $w'' a^{(j'')} v_2'' = \sigma^{-1}(w' a^{(j')} v_2')$.
Since σ is one-to-one and $i' \neq j'$, also $i'' \neq j''$. -Therefore, r is not deterministic, which yields the desired contradiction. -4.3 - -The whole Algorithm - -Our deterministic regular expression inference algorithm iDRegEx combines iKoa -and rwr2 as shown in Algorithm 4. For increasing values of k until a maximum -kmax is reached, it first learns a deterministic k-OA G from the given sample S, -and subsequently translates that k-OA into a k-ORE using rwr2 . If the resulting -k-ORE is deterministic then it is added to the set C of deterministic candidate -expressions for S, otherwise it is discarded. From this set of candidate expressions, -iDRegEx returns the “best” regular expression best(C), which is determined according to one of the measures introduced below. Since it is well-known that, -depending on the initial value of α, BaumWelsh (and therefore iKoa) may converge to a local maximum that is not necessarily global, we apply iKoa a number -of times N with independently chosen random seed values for α to increase the -probability of correctly learning the target regular expression from S. -The observant reader may wonder whether we are always guaranteed to derive -at least one deterministic expression such that best(C) is defined. Indeed, Theorem 4.8 tells us that if we manage to learn from sample S a k-OA which is the -ACM Journal Name, Vol. V, No. N, November 2024. - -19 - - 20 - -· - -Geert Jan Bex et al. - -Algorithm 4 iDRegEx -Require: a sample S -Ensure: a k-ORE r -1: initialize candidate set C ← ∅ -2: for k = 1 to kmax do -3: for n = 1 to N do -4: G ← iKoa(S, k) -5: if rwr2 (G) is deterministic then -6: add rwr2 (G) to C -7: return best(C) -Glushkov representation of the target expression r, then rwr2 will always return -a deterministic k-ORE equivalent to r. When k > 1, there can be several k-OAs -representing the same language and we could therefore learn a non-Glushkov one.
-In that case, rwr2 always returns a k-ORE which is a super approximation of the -target expression. Although that approximation can be non-deterministic, since we -derive k-OREs for increasing values of k and since for k = 1 the result of rwr2 is -always deterministic (as every SORE is deterministic), we always infer at least one -deterministic regular expression. In fact, in our experiments on 100 synthetic regular expressions, we derived for 96 of them a deterministic expression with k > 1, -and only for 4 expressions had to resort to a 1-ORE approximation. -4.3.1 A Language Size Measure for Determining the Best Candidate. Intuitively, -we want to select from C the simplest deterministic expression that “best” describes -S. Since each candidate expression in C accepts all words in S by construction, one -way to interpret “the best” is to select the expression that accepts the least number -of words (thereby adding the least number of words to S). Since an expression defines an infinite language in general, it is of course impossible to take all words into -account. We therefore only consider the words up to a length n, where n = 2m + 1 -with m the length of the candidate expression, excluding regular expression operators, ∅, and ε. For instance, if the candidate expression is a .(a + c+ )?, then m = 3 -and n = 7. Formally, for a language L, let |L≤n | denote the number of words in L -of length at most n. Then the best candidate in C is the one with the least value of -| L(r)≤n |. If there are multiple such candidates, we pick the shortest one (breaking -ties arbitrarily). It turns out that | L(r)≤n | can be computed quite efficiently; see -[Bex et al. ] for details. -4.3.2 A Minimum Description Length Measure for Determining the Best Candidate. An alternative measure to determine the best candidate is given by Adriaans -and Vitányi [2006], who compare the size of S with the size of the language of a -candidate r. 
Specifically, Adriaans and Vitányi define the data encoding cost of r -to be: -$$\mathit{datacost}(r, S) := \sum_{i=0}^{n} \left( 2 \cdot \log_2 i + \log_2 \binom{|L^{=i}(r)|}{|S^{=i}|} \right),$$ -where n = 2m + 1 as before; $|S^{=i}|$ is the number of words in S that have length i; -and $|L^{=i}(r)|$ is the number of words in L(r) that have exactly length i. Although -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -the above formula is numerically difficult to compute, there is an easier estimation -procedure; see [Adriaans and Vitányi 2006] for details. -In this case, the model encoding cost is simply taken to be its length, thereby -preferring shorter expressions over longer ones. The best regular expression in the -candidate set C is then the one that minimizes both model and data encoding cost -(breaking ties arbitrarily). -We already mentioned that xtract [Garofalakis et al. 2003] also utilizes the -Minimum Description Length principle. However, their measure for data encoding -cost depends on the concrete structure of the regular expressions while ours only -depends on the language defined by them and is independent of the representation. -Therefore, in our setting, when two equivalent expressions are derived, the one with -the smallest model cost, that is, the simplest one, will always be taken. -5. - -EXPERIMENTS - -In this section we validate our approach by means of an experimental analysis. -Throughout the section, we say that a target k-ORE r is successfully derived when -a k-ORE s with L(r) = L(s) is generated. The success rate of our experiments -then is the percentage of successfully derived target regular expressions. -Our previous work [Bex et al. 2008] on this topic was based on a version of the -rwr0 algorithm [Bex et al. 2006]; we refer to this algorithm as iDRegEx(rwr0 ). -Unfortunately, as detailed in [Bex et al.
2008], it is not known whether rwr0 is -complete on the class of all single occurrence regular expressions. Nevertheless, the -experiments in [Bex et al. 2008] which are revisited below show a good and reliable -performance. However, to obtain a theoretically complete algorithm, c.f.r. Theorem 4.8, we use the algorithm rwr2 which is sound and complete on single occurrence regular expressions. In the remainder we focus on iDRegEx, but compare -with the results for iDRegEx(rwr0 ). -As mentioned in Section 4.3.1, another new aspect of the results presented here is -the use of language size as an alternative measure over Minimum Description Length -(MDL) to compare candidates. The iDRegEx(rwr0 ) algorithm is only considered -with the MDL criterion. We note that for alphabet size 5, the success rate of -iDRegEx with the MDL criterion was only 21 %, while that of the language size -criterion is 98 %. The corpus used in this experiment is described in Section 5.3. -Therefore in the remainder of this section we only consider iDRegEx with the -language size criterion. -For all the experiments described below we take kmax = 4 and N = 10 in Algorithm 4. -5.1 - -Running times - -All experiments were performed using a prototype implementation of iDRegEx -and iDRegEx(rwr0 ) written in Java executed on Pentium M 2.0 GHz class machines equipped with 1GB RAM. For the BaumWelsh subroutine we have gratefully used Jean-Marc François’ Jahmm library [François 2006], which is a faithful -implementation of the algorithms described in Rabiner’s Hidden Markov Model tutorial [Rabiner 1989]. Since Jahmm strives for clarity rather than performance and -since only limited precautions are taken against underflows, our prototype should -be seen as a proof of concept rather than a polished product. In particular, underACM Journal Name, Vol. V, No. N, November 2024. - -21 - - 22 - -· - -Geert Jan Bex et al. 
- -flows currently limit us to target regular expressions whose total number of symbol -occurrences is at most 40. Here, the total number of symbol occurrences occ(r) of -a regular expression r is its length excluding the regular expression operators and -parentheses. To illustrate, the total number of symbol occurrences in aa?b+ is 3. -Furthermore, the lack of optimization in Jahmm leads to average running times -ranging from 4 minutes for target expressions r with |Σ(r)| = 5 and occ(r) = 6 to -9 hours for target expressions with |Σ(r)| = 15 and occ(r) = 30. Running times for -iDRegEx and iDRegEx(rwr0 ) are similar. -As already mentioned in Section 4.3, one of the bottlenecks of iDRegEx is the application of BaumWelsh in Line 11 of Disambiguate (Algorithm 2). BaumWelsh -is an iterative procedure that is typically run until convergence, i.e., until the -computed probability distribution no longer changes significantly. To improve the -running time, we only apply a fixed number ℓ of iteration steps when calling -BaumWelsh in Line 11 of Disambiguate. Experiments show that the running -time performance scales linearly with ℓ as one expects, but, perhaps surprisingly, the -success rate improves as well for an optimal value of ℓ. This optimal value for ℓ -depends on the alphabet size. These improved results can be explained as follows: -applying BaumWelsh in each disambiguation step until it converges guarantees -that the probability distribution for that step will have reached a local optimum. -However, we know that the search space for the algorithm contains many local optima, and that BaumWelsh is a local optimization algorithm, i.e., it will converge -to one of the local optima it can reach from its starting point by hill climbing.
The -disambiguation procedure proceeds state by state, so fine tuning the probability -distribution for a disambiguation step may transform the search space so that certain local optima for the next iteration can no longer be reached by a local search -algorithm such as BaumWelsh. Table I shows the performance of the algorithm -for various numbers of BaumWelsh iterations ℓ for expressions of alphabet size 5, -10 and 15. These expressions are those described in Section 5.3. In this table, -ℓ = ∞ denotes the case where BaumWelsh is run until convergence after each -disambiguation step. The table illustrates that the success rate is actually higher -for small values of ℓ. The running time performance gains increase rapidly with -the expressions’ alphabet size: for |Σ| = 5, we gain a factor of 3.5 (ℓ = 2), for -|Σ| = 10, it is already a factor of 10 (ℓ = 3) and for |Σ| = 15, we gain a factor -of 25 (ℓ = 3). This brings the running time for the largest expressions we tested -down to 22 minutes, in contrast with 9 hours mentioned for iDRegEx(rwr0 ) and -iDRegEx. The algorithm with the optimal number of BaumWelsh steps in the -disambiguation process will be referred to as iDRegExfixed . In particular for small -alphabet sizes (|Σ| ≤ 7) we use ℓ = 2, for large alphabet size ℓ = 3 (|Σ| > 7). We -note that the alphabet size can easily be determined from the sample. -We should also note that experience with Hidden Markov Model learning in bioinformatics [Finn et al. 2006] suggests that both the running time and the maximum -number of symbol occurrences that can be handled can be significantly improved -by moving to an industrial-strength BaumWelsh implementation. Our focus for -the rest of the section will therefore be on the precision of iDRegEx. -ACM Journal Name, Vol. V, No. N, November 2024.
- - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data -` -1 -2 -3 -4 -∞ - -rate |Σ| = 5 -95 % -100 % -95 % -95 % -98 % - -rate |Σ| = 10 -80 % -75 % -84 % -77 % -75 % - -· - -rate |Σ| = 15 -40 % -50 % -60 % -50 % -50 % - -Table I. Success rate for a limited number of BaumWelsh iterations in the disambiguation procedure, ` = ∞ corresponds to iDRegEx, for ` = 1, . . . , 4 correspond to iDRegExfixed . - -5.2 - -Real-world target expressions and real-world samples - -We want to test how iDRegEx performs on real-world data. Since the number -of publicly available XML corpora with valid schemas is rather limited, we have -used as target expressions the 49 content models occurring in the XSD for XML -Schema Definitions [Thompson et al. 2001] and have drawn multiset samples for -these expressions from a large corpus of real-world XSDs harvested from the Cover -Pages [Cover 2003]. In other words, the goal of our first experiment is to derive, from -a corpus of XSD definitions, the regular expression content models in the schema -for XML Schema Definitions2 . As it turns out, the XSD regular expressions are all -single occurrence regular expressions. -The iDRegEx(rwr0 ) algorithm infers all these expressions correctly, showing -that it is conservative with respect to k since, as mentioned above, the algorithm -considers k values ranging from 1 to 4. In this setting, iDRegEx performs not -as well, deriving only 73 % of the regular expressions correctly. We note that for -each expression that was not derived exactly, always an expression was obtained -describing the input sample and which in addition is more specific than the target -expression. iDRegEx therefore seems to favor more specific regular expressions, -based on the available examples. 
-5.3 - -Synthetic target expressions - -Although the successful inference of the real-world expressions in Section 5.2 suggests that iDRegEx is applicable in real-world scenarios, we further test its behavior on a sizable and diverse set of regular expressions. Due to the lack of real-world -data, we have developed a synthetic regular expression generator that is parameterized for flexibility. -Synthetic expression generation. In particular, the occurrence of the regular -expression operators concatenation, disjunction (+), zero-or-one (?), zero-or-more -(∗ ), and one-or-more (+ ) in the generated expressions is determined by a userdefined probability distribution. We found that typical values yielding realistic -expressions are 1/10 for the unary operators and 7/20 for others. The alphabet -can be specified, as well as the number of times that each individual symbol should -occur. The maximum of these numbers determines the value k of the generated -k-ORE. -To ensure the validity of our experiments, we want to generate a wide range of -different expressions. To this end, we measure how much the language of a generated -2 This corpus was also used in [Bex et al. 2007] for XSD inference. - -ACM Journal Name, Vol. V, No. N, November 2024. - -23 - - 24 - -· - -Geert Jan Bex et al. - -((debab) + c)∗ a -((((c + b)b) + a)ca) + e + d -(((ea)∗ db) + b + a + c)+ -((b+ + c + e + d)aab)+ -((((eabh) + d + j + c + b)+ f ) + a + g + i)? -((((aa) + e)+ + c)b) + b + d -((((d + a)∗ eabcb) + c)a)? -((((ac) + b + d)eab) + c)∗ -(((((bab) + c)+ + e)?a) + d)+ -((((ecb)+ a) + b)+ + d + a)? -((bagbf eid) + c + a + j + h)∗ -((gdab) + a + i + c + j + e + f )+ hb -((h∗ cdf a) + j + e + g + b + i)∗ ab -((g + b + e + f + i + d)∗ aba) + h + j + c -((((h + b + c + j + f )+ + e)?aaidb) + g)? - -Fig. 7. 
- -(((((dbe)∗ cf ) + j)hac) + b + i)∗ gad -(((((ihaaj) + d)+ + g)b) + e + b + f + c)+ -(((ecgecd) + b + d + a + j + f )∗ ihaba)∗ -(l + c + d + m + n)∗ aojahbegcbf idke -(((c + b)ab) + d + i + a)+ + j + g + f + e + h -(((a?clf habgd) + b + n + o)iedjcem)∗ k -((a + k + f + c + m + e)+ bdieclbonjgda)∗ h -(((k?jghadf celif cjbhom)+ -b + g + a + e + i + n)+ + d)? -(((aedoadenhdbci) + h + k + m + j + g + b)∗ -f ccgelbif ja) -((a+ + f + d + o + g + n + h + c + b + j + i + e) -keacdlbm) -(((k + f + o + a + j)?edhldf hngicjmab)?cie)∗ bg -((((a?d)+ ba) + h + g + e + c)+ + j + i + b)?f - -A snapshot of the 100 generated expressions. - -expression overlaps with Σ∗ . The larger the overlap, the greater its language size -as defined in Section 4.3.1. -To ensure that the generated expressions do not impede readability by containing -redundant subexpressions (as in e.g., (a+ )+ ), the final step of our generator is to -syntactically simplify the generated expressions using the following straightforward -equivalences: -r∗ → r+ ? -r?? → r? -(r+ )+ → r+ -(r?)+ → r+ ? -(r1 · r2 ) · r3 → r1 · (r2 · r3 ) -r1 · (r2 · r3 ) → r1 · r2 · r3 -(r1 ? · r2 ?)? → r1 ? · r2 ? -(r1 + r2 ) + r3 → r1 + (r2 + r3 ) -r1 + (r2 + r3 ) → r1 + r2 + r3 -(r1 + r2+ )+ → (r1 + r2 )+ -(r1+ + r2+ ) → (r1 + r2 )+ -r1 + r2 ? → (r1 + r2 )? -Of course, the resulting expression is rejected if it is non-deterministic. -To obtain a diverse target set, we synthesized expressions with alphabet size 5 -(45 expressions), 10 (45 expressions), and 15 (10 expressions) with a variety of -symbol occurrences (k = 1, 2, 3). For each of the alphabet sizes, the expressions -were selected to cover language size ranging from 0 to 1. All in all, this yielded a -set of 100 deterministic target expressions. A snapshot is given in Figure 7. -Synthetic sample generation. 
For each of those 100 target expressions, we -generated synthetic samples by transforming the target expressions into stochastic -processes that perform random walks on the automata representing the expressions -(cf. Section 4). The probability distributions of these processes are derived from the -structure of the originating expression. In particular, each operand in a disjunction -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data -p - -r1 · · · rn - -p - -1 - -r1 - -1 - -··· - -1 - -rn - -· - -1 - -r1 -p/n -p - -r1 + · · · + rn - -1 - -1 -. -. -. -1 - -p/n -rn -p/2 -p -r? - -1 - -r -p/2 - -1 - -2/3 -p - -Fig. 8. - -r+ - -1 -p - -r -1/3 - -From a regular expression to a probabilistic automaton. - -is equally likely and the probability to have zero or one occurrences for the zeroor-one operator ? is 1/2 for each option. The probability to have n repetitions in -a one-or-more or zero-or-more operator (∗ and + ) is determined by the probability -that we choose to continue looping (2/3) or choose to leave the loop (1/3). The -latter values are based on observations of real-world corpora. Figure 8 illustrates -how we construct the desired stochastic process from a regular expression r: starting -from the following initial graph, -1 - -r - -1 - -we continue applying the rewrite rules shown until each internal node is an individual alphabet symbol. -Experiments on covering samples. Our first experiment is designed to test -how iDRegEx performs on samples that are at least large enough to cover the -target regular expression, in the following sense. -Definition 5.1. A sample S covers a deterministic automaton G if for every edge -(s, t) in G there is a word w ∈ S whose unique accepting run in G traverses (s, t). -Such a word w is called a witness for (s, t). 
A sample S covers a deterministic -regular expression r if it covers the automaton obtained from r using the Glushkov -construction for translating regular expressions into automata as defined in Definition 4.7. -Intuitively, if a sample does not cover a target regular expression r then there -will be parts of r that cannot be learned from S. In this sense, covering samples -are the minimal samples necessary to learn r. Note that such samples are far from -“complete” or “characteristic” in the sense of the theoretical framework of learning -in the limit, as some characteristic samples are bound to be of size exponential in -the size of r by Theorem 3.2, while samples of size at most quadratic in r suffice -to cover r. Indeed, the Glushkov construction always yields an automaton whose -number of states is bounded by the size of r. Therefore, this automaton can have -ACM Journal Name, Vol. V, No. N, November 2024. - -25 - - 26 - -· - -Geert Jan Bex et al. - -at most |r|^2 edges, and hence |r|^2 witness words suffice to cover r. -Table II shows how iDRegEx performs on covering samples, broken up by alphabet size of the target expressions. The size of the sample used is depicted as well. -The table demonstrates a remarkable precision. Out of a total of 100 expressions, -82 are derived exactly by iDRegEx. Although iDRegEx(rwr0 ) outperforms -iDRegEx with a success rate of 87 %, overall iDRegExfixed performs best with -89 %. The performance decreases with the alphabet size of the target expressions: -this is to be expected since the inference task’s complexity increases. It should -be emphasized that even if iDRegExfixed does not derive the target expression -exactly, it always yields an over-approximation, i.e., its language is a superset of -the target language. -Table III shows an alternative view on the results. It shows the success rate as a -function of the target expression’s language size, grouped in intervals.
In particular, -it demonstrates that the method works well for all language sizes. -A final perspective is offered in Table IV which shows the success rate in function -of the average states per symbol κ for an expression. The latter quantity is defined -as the length of the regular expression excluding operators, divided by the alphabet size. For instance, for the expression a(a + b)+ cab, κ = 6/3 since its length -excluding operators is 6 and |Σ| = 3. It is clear that the learning task is harder -for increasing values of κ. To verify the latter, a few extra expressions with large κ -values were added to the target expressions. For the algorithm iDRegExfixed the -success rate is quite high for target expressions with a large value of κ. Conversely, -iDRegEx(rwr0 ) yields better results for κ < 1.6, while its success rate drops to -around 50 % for larger values of κ. This illustrates that neither iDRegEx(rwr0 ) -nor iDRegExfixed outperforms the other in all situations. -|Σ| -5 -10 -15 -total - -#regex -45 -45 -10 -100 - -iDRegEx(rwr0 ) -86 % -93 % -70 % -87 % - -iDRegEx -97 % -75 % -50 % -82 % - -iDRegExfixed -100 % -84 % -60 % -89 % - -|S| -300 -1000 -1500 - -Table II. Success rate on the target regular expressions and the sample size used per alphabet size -for the various algorithms. - -Density(r) -[0.0, 0.2[ -[0.2, 0.4[ -[0.4, 0.6[ -[0.6, 0.8[ -[0.8, 1.0] -Table III. - -#regex -24 -22 -20 -22 -12 - -iDRegEx(rwr0 ) -100 % -82 % -90 % -95 % -83 % - -iDRegEx -87 % -91 % -75 % -72 % -78 % - -iDRegExfixed -96 % -91 % -85 % -83 % -78 % - -Success rate on the target regular expressions, grouped by language size. - -It is also interesting to note that iDRegEx successfully derived the regular expression r1 = (a1 a2 + a3 + · · · + an )+ of Theorem 3.2 for n = 8, n = 10, and n = 12 -from covering samples of size 500, 800, and 1100, respectively. This is quite surprising considering that the characteristic samples for these expressions was proven to -ACM Journal Name, Vol. 
V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data -κ -[1.2, 1.4[ -[1.4, 1.6[ -[1.6, 1.8[ -[1.8, 2.0[ -[2.0, 2.5[ -[2.5, 3.0] - -#regex -29 -37 -24 -11 -12 -18 - -iDRegEx(rwr0 ) -96 % -100 % -91 % -54 % -41 % -66 % - -iDRegEx -72 % -89 % -92 % -91 % -50 % -71 % - -· - -iDRegExfixed -83 % -89 % -100 % -100 % -50 % -78 % - -Table IV. Success rate on the target regular expressions, grouped by κ, the average number of -states per symbol. - -be of size at least (n − 2)!, i.e., 720, 40320, and 3628800 respectively. The regular -expression r2 = (Σ \ a1 )+ a1 (Σ \ a1 )+ , in contrast, was not derivable by iDRegEx -from small samples. -Experiments on partially covering samples. Unfortunately, samples to learn -regular expressions from are often smaller than one would prefer. In an extreme, but -not uncommon case, the sample does not even entirely cover the target expression. -In this section we therefore test how iDRegEx performs on such samples. -Definition 5.2. The coverage of a target regular expression r by a sample S is -defined as the fraction of transitions in the corresponding Glushkov automaton for -r that have at least one witness in S. -Note that to successfully learn r from a partially covering sample, iDRegEx -needs to “guess” the edges for which there is no witness in S. This guessing capability is built into iDRegEx(rwr0 ) and iDRegEx in the form of repair rules [Bex -et al. 2006; Bex et al. 2008]. Our experiments show that for target expressions -with alphabet size |Σ| = 10, this is highly effective for iDRegEx(rwr0 ): even at a -coverage of 70%, half the target expressions can still be learned correctly as Table V -shows. The algorithm iDRegEx is performing very poorly in this setting, being -only successful occasionally for coverages close to 100 %. iDRegExfixed performs -better, although not as well as iDRegEx(rwr0 ). This again illustrates that both -algorithms have their merits. 
-coverage -1.0 -0.9 -0.8 -0.7 -0.6 - -iDRegEx(rwr0 ) -100 % -64 % -60 % -52 % -0% - -iDRegEx -80 % -20 % -0% -0% -0% - -iDRegExfixed -80 % -60 % -40 % -0% -0% - -Table V. Success rate for 25 target expressions for |Σ| = 10 for samples that provide partial -coverage of the target expressions. - -We also experimented with target expressions with alphabet size |Σ| = 5. In this -case, the results were not very promising for iDRegEx(rwr0 ), but as Table VI -illustrates, iDRegEx and iDRegExfixed performs better, on par with the target -expressions for |Σ| = 10 in the case of iDRegExfixed . This is interesting since -the absolute amount of information missing for smaller regular expressions is larger -than in the case of larger expressions. -ACM Journal Name, Vol. V, No. N, November 2024. - -27 - - 28 - -· - -Geert Jan Bex et al. -coverage -1.0 -0.9 -0.8 -0.7 -0.6 -0.5 - -Table VI. - -6. - -iDRegEx(rwr0 ) -100 % -25 % -16 % -8% -8% -0% - -iDRegEx -100 % -75 % -75 % -25 % -25 % -8% - -iDRegExfixed -100 % -66 % -41 % -33 % -17 % -17 % - -Success rate for 12 target expressions for |Σ| = 5 with partially covering samples. - -CONCLUSIONS - -We presented the algorithm iDRegEx for inferring a deterministic regular expression from a sample of words. Motivated by regular expressions occurring in practice, -we use a novel measure based on the number k of occurrences of the same alphabet -symbol and derive expressions for increasing values of k. We demonstrated the -remarkable effectiveness of iDRegEx on a large corpus of real-world and synthetic -regular expressions of different densities. -Our experiments show that iDRegEx(rwr0 ) performs better than iDRegEx -for target expressions with a κ < 1.6 and vice versa for larger values of κ. For -partially covering samples, iDRegEx(rwr0 ) is more robust than iDRegEx. 
As κ -values and sample coverage are not known in advance, it makes sense to run both -algorithms and select the smallest expression or the one with the smallest language -size, depending on the application at hand. -Some questions need further attention. First, in our experiments, iDRegEx -always derived the correct expression or an over-approximation of the target expression. It remains to investigate for which kind of input samples this behavior -can be formally proved. Second, it would also be interesting to characterize precisely which classes of expressions can be learned with our method. Although the -parameter κ explains this to some extent, we probably need more fine-grained -measures. A last and obvious goal for future work is to speed up the inference of -the probabilistic automaton which forms the bottleneck of the proposed algorithm. -A possibility is to use an industrial strength implementation of the Baum-Welch -algorithm as in [Finn et al. 2006] rather than a straightforward one or to explore -different methods for learning probabilistic automata. -Although iDRegEx can be directly plugged into the XSD inference engine iXSD -of [Bex et al. 2007], it would be interesting to investigate how to extend these -techniques to the more robust class of Relax NG schemas [Clark and Murata 2001]. -REFERENCES -Castor. www.castor.org. -SUN Microsystems JAXB. java.sun.com/webservices/jaxb. -Adriaans, P. and Vitányi, P. 2006. The Power and Perils of MDL. -Ahonen, H. 1996. Generating Grammars for structured documents using grammatical inference -methods. Report A-1996-4, Department of Computer Science, University of Finland. -Angluin, D. and Smith, C. H. 1983. Inductive Inference: Theory and Methods. ACM Computing -Surveys 15, 3, 237–269. -Barbosa, D., Mignet, L., and Veltri, P. 2005. Studying the XML Web: gathering statistics -from an XML sample. World Wide Web 8, 4, 413–438. -ACM Journal Name, Vol. V, No. N, November 2024.
- - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -Benedikt, M., Fan, W., and Geerts, F. 2005. XPath satisfiability in the presence of DTDs. In -Proceedings of the Twenty-fourth ACM SIGACT-SIGMOD-SIGART Symposium on Principles -of Database Systems. 25–36. -Bernstein, P. A. 2003. Applying Model Management to Classical Meta Data Problems. In First -Biennial Conference on Innovative Data Systems Research. -Bex, G., Neven, F., Schwentick, T., and Vansummeren, S. Inference of Concise Regular -Expressions and DTDs. ACM TODS . To Appear. -Bex, G. J., Gelade, W., Neven, F., and Vansummeren, S. 2008. Learning deterministic regular -expressions for the inference of schemas from XML data. In WWW. Beijing, China, 825–834. -Accepted for WWW 2008. -Bex, G. J., Neven, F., Schwentick, T., and Tuyls, K. 2006. Inference of concise DTDs from -XML data. In Proceedings of the 32nd International Conference on Very Large Data Bases. -115–126. -Bex, G. J., Neven, F., Schwentick, T., and Vansummeren, S. 2008. Inference of Concise -Regular Expressions and DTDs. submitted to VLDB Journal. -Bex, G. J., Neven, F., and Van den Bussche, J. 2004. DTDs versus XML Schema: a practical -study. In Proceedings of the 7th International Workshop on the Web and Databases. 79–84. -Bex, G. J., Neven, F., and Vansummeren, S. 2007. Inferring XML Schema Definitions from -XML data. In Proceedings of the 33rd International Conference on Very Large Databases. -998–1009. -Brāzma, A. 1993. Efficient identification of regular expressions from representative examples. -In Proceedings of the 6th Annual ACM Conference on Computational Learning Theory. ACM -Press, 236–242. -Brüggeman-Klein, A. 1993. Regular expressions into finite automata. Theoretical Computer -Science 120, 2, 197–213. -Brüggemann-Klein, A. and Wood, D. 1998. One-unambiguous regular languages. Information -and computation 140, 2, 229–253. -Buneman, P., Davidson, S. B., Fernandez, M. F., and Suciu, D. 1997. 
Adding structure to -unstructured data. In Database Theory - ICDT ’97, 6th International Conference, F. N. Afrati -and P. G. Kolaitis, Eds. Lecture Notes in Computer Science, vol. 1186. Springer, 336–350. -Che, D., Aberer, K., and Özsu, M. T. 2006. Query optimization in XML structured-document -databases. VLDB Journal 15, 3, 263–289. -Chidlovskii, B. 2001. Schema extraction from XML: a grammatical inference approach. In -Proceedings of the 8th International Workshop on Knowledge Representation meets Databases. -Clark, J. Trang: Multi-format schema converter based on RELAX NG. http://www. -thaiopensource.com/relaxng/trang.html. -Clark, J. and Murata, M. 2001. RELAX NG Specification. OASIS. -Cover, R. 2003. The Cover Pages. http://xml.coverpages.org/. -Du, F., Amer-Yahia, S., and Freire, J. 2004. ShreX: Managing XML Documents in Relational -Databases. In Proceedings of the 30th International Conference on Very Large Data Bases. -1297–1300. -Ehrenfeucht, A. and Zeiger, P. 1976. Complexity measures for regular expressions. Journal -of computer and system sciences 12, 134–146. -Fernau, H. 2004. Extracting minimum length Document Type Definitions is NP-hard. In ICGI. -277–278. -Fernau, H. 2005. Algorithms for Learning Regular Expressions. In Algorithmic Learning Theory, -16th International Conference. 297–311. -Finn, R., Mistry, J., Schuster-Bckler, B., Griffiths-Jones, S., et al. 2006. Pfam: clans, -web tools and services. Nucleic Acids Research 34, D247–D251. -Florescu, D. 2005. Managing semi-structured data. ACM Queue 3, 8 (October). -François, J.-M. 2006. Jahmm. http://www.run.montefiore.ulg.ac.be/~francois/software/ -jahmm/. -ACM Journal Name, Vol. V, No. N, November 2024. - -29 - - 30 - -· - -Geert Jan Bex et al. - -Freire, J., Haritsa, J. R., Ramanath, M., Roy, P., and Siméon, J. 2002. StatiX: making XML -count. In SIGMOD Conference. 181–191. -Freitag, D. and McCallum, A. 2000. Information Extraction with HMM Structures Learned -by Stochastic Optimization. 
In AAAI/IAAI. AAAI Press / The MIT Press, 584–589. -Garcia, P. and Vidal, E. 1990. Inference of k-testable languages in the strict sense and application to syntactic pattern recognition. IEEE Transactions on Pattern Analysis and Machine -Intelligence 12, 9 (September), 920–925. -Garofalakis, M., Gionis, A., Rastogi, R., Seshadri, S., and Shim, K. 2003. XTRACT: learning document type descriptors from XML document collections. Data mining and knowledge -discovery 7, 23–56. -Gelade, W. and Neven, F. 2008. Succinctness of the Complement and Intersection of Regular -Expressions. In STACS. 325–336. -Gold, E. 1967. Language identification in the limit. Information and Control 10, 5 (May), -447–474. -Goldman, R. and Widom, J. 1997. DataGuides: Enabling Query Formulation and Optimization -in Semistructured Databases. In Proceedings of 23rd International Conference on Very Large -Data Bases. 436–445. -Gruber, H. and Holzer, M. 2008. Finite Automata, Digraph Connectivity, and Regular Expression Size. In ICALP (2). 39–50. -Hegewald, J., Naumann, F., and Weis, M. 2006. XStruct: efficient schema extraction from -multiple and large XML documents. In ICDE Workshops. 81. -Hopcroft, J. and Ullman, J. 2007. Introduction to automata theory, languages and computation. Addison-Wesley, Reading, MA. -Koch, C., Scherzinger, S., Schweikardt, N., and Stegmaier, B. 2004. Schema-based scheduling of event processors and buffer minimization for queries on structured data streams. In -Proceedings of the 30th International Conference on Very Large Data Bases. 228–239. -Manolescu, I., Florescu, D., and Kossmann, D. 2001. Answering XML Queries on Heterogeneous Data Sources. In Proceedings of 27th International Conference on Very Large Data -Bases. 241–250. -Martens, W., Neven, F., Schwentick, T., and Bex, G. J. 2006. Expressiveness and Complexity -of XML Schema. ACM Transactions on Database Systems 31, 3, 770–813. -Mignet, L., Barbosa, D., and Veltri, P. 2003. The XML web: a first study. 
In Proceedings of -the 12th International World Wide Web Conference. Budapest, Hungary, 500–510. -Nestorov, S., Abiteboul, S., and Motwani, R. 1998. Extracting Schema from Semistructured -Data. In International Conference on Management of Data. ACM Press, 295–306. -Neven, F. and Schwentick, T. 2006. On the complexity of XPath containment in the presence -of disjunction, DTDs, and variables. Logical Methods in Computer Science 2, 3. -Pitt, L. 1989. Inductive Inference, DFAs, and Computational Complexity. In Proceedings of -the International Workshop on Analogical and Inductive Inference, K. P. Jantke, Ed. Lecture -Notes in Computer Science, vol. 397. Springer-Verlag, 18–44. -Quass, D., Widom, J., Goldman, R., et al. 1996. LORE: a Lightweight Object REpository for -semistructured data. In Proceedings of the 1996 ACM SIGMOD International Conference on -Management of Data. 549. -Rabiner, L. 1989. A tutorial on Hidden Markov Models and selected applications in speech -recognition. Proc. IEEE 77, 2, 257–286. -Rahm, E. and Bernstein, P. A. 2001. A survey of approaches to automatic schema matching. -VLDB Journal 10, 4, 334–350. -Sahuguet, A. 2000. Everything You Ever Wanted to Know About DTDs, But Were Afraid to Ask -(Extended Abstract). In The World Wide Web and Databases, 3rd International Workshop, -D. Suciu and G. Vossen, Eds. Lecture Notes in Computer Science, vol. 1997. Springer, 171–183. -Sakakibara, Y. 1997. Recent advances of grammatical inference. Theoretical Computer Science 185, 1, 15–45. -ACM Journal Name, Vol. V, No. N, November 2024. - - Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data - -· - -Sankey, J. and Wong, R. K. 2001. Structural inference for semistructured data. In Proceedings -of the 10th international conference on Information and knowledge management. ACM Press, -159–166. -Thompson, H., Beech, D., Maloney, M., and Mendelsohn, N. 2001. XML Schema part 1: -structures. W3C. -Young-Lai, M. and Tompa, F. W. 2000. 
Stochastic Grammatical Inference of Text Database -Structure. Machine Learning 40, 2, 111–137. - -Received Month Year; revised Month Year; accepted Month Year - -ACM Journal Name, Vol. V, No. N, November 2024. - -31 - - \ No newline at end of file diff --git a/papers/paper_tods2010.txt b/papers/paper_tods2010.txt deleted file mode 100644 index 7822b57..0000000 --- a/papers/paper_tods2010.txt +++ /dev/null @@ -1,2492 +0,0 @@ -Inference of Concise Regular Expressions -and DTDs -GEERT JAN BEX and FRANK NEVEN -Hasselt University and Transnational University of Limburg -THOMAS SCHWENTICK -Dortmund University -and -STIJN VANSUMMEREN -Université Libre de Bruxelles - -We consider the problem of inferring a concise Document Type Definition (DTD) for a given set -of XML-documents, a problem that basically reduces to learning concise regular expressions from -positive examples strings. We identify two classes of concise regular expressions—the single occurrence regular expressions (SOREs) and the chain regular expressions (CHAREs)—that capture the -far majority of expressions used in practical DTDs. For the inference of SOREs we present several -algorithms that first infer an automaton for a given set of example strings and then translate that -automaton to a corresponding SORE, possibly repairing the automaton when no equivalent SORE -can be found. In the process, we introduce a novel automaton to regular expression rewrite technique which is of independent interest. When only a very small amount of XML data is available, -however (for instance when the data is generated by Web service requests or by answers to queries), -these algorithms produce regular expressions that are too specific. Therefore, we introduce a novel -learning algorithm CRX that directly infers CHAREs (which form a subclass of SOREs) without -going through an automaton representation. We show that CRX performs very well within its target -class on very small datasets. - -This research was done while S. 
Vansummeren was a Postdoctoral Fellow of the Research -Foundation-Flanders (FWO) at Hasselt University. -This work was funded by FWO-G.0821.09N and the Future and Emerging Technologies (FET) -programme within the Seventh Framework Programme for Research of the European Commision, -under the FET-Open grant agreement FOX, number FP7-ICT-233599. -Authors’ addresses: G. J. Bex and F. Neven, Database and Theoretical Computer Science Research Group, Hasselt University and Transnational University of Limburg, Agoralaan, gebouw D, -B-3590 Diepenbeek Belgium; email: {geertjan.bex, frank.neven}@uhasselt.be; T. Schwentick, TU -Dortmund, Fakultät für Informatik, Otto-Hahn-Str. 16, Raum 214, 44227 Dortmund, Germany. -email: thomas.schwentick@udo.edu; S. Vansummeren, Research Laboratory for Web and Information Technologies (WIT), Université Libre de Bruxelles, 50 Av. F. Roosevelt, CP 165/15 B-1050 -Brussels, Belgium; email: stijn.vansummeren@ulb.ac.be. -Permission to make digital or hard copies of part or all of this work for personal or classroom use -is granted without fee provided that copies are not made or distributed for profit or commercial -advantage and that copies show this notice on the first page or initial screen of a display along -with the full citation. Copyrights for components of this work owned by others than ACM must be -honored. Abstracting with credit is permitted. To copy otherwise, to republish, to post on servers, -to redistribute to lists, or to use any component of this work in other works requires prior specific -permission and/or a fee. Permissions may be requested from Publications Dept., ACM, Inc., 2 Penn -Plaza, Suite 701, New York, NY 10121-0701 USA, fax +1 (212) 869-0481, or permissions@acm.org. - 2010 ACM 0362-5915/2010/04-ART11 $10.00 -C -DOI 10.1145/1735886.1735890 http://doi.acm.org/10.1145/1735886.1735890 -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - -11 - - 11:2 - -• - -G. J. Bex et al. 
- -Categories and Subject Descriptors: F.4.3 [Mathematical Logic and Formal Languages]: -Formal Languages; H.2.1 [Database Management]: Logical Design; I.2.6 [Artificial Intelligence]: Learning; I.7.2 [Document and Text Processing]: Document Preparation -General Terms: Algorithms, Languages, Theory -Additional Key Words and Phrases: Regular expressions, schema inference, XML -ACM Reference Format: -Bex, G. J., Neven, F., Schwentick, T., and Vansummeren, S. 2010. Inference of concise regular -expressions and DTDs. ACM Trans. Datab. Syst, 35. 2, Article 11 (April 2010), 47 pages. -DOI = 10.1145/1735886.1735890 http://doi.acm.org/10.1145/1735886.1735890 - -1. INTRODUCTION -The eXtensible Markup Language (XML) serves as the lingua franca for data -exchange on the Internet [Abiteboul et al. 1999]. Because XML documents -in general can be of any form, most communities and applications impose -structural constraints on the documents that are to be exchanged or processed. -These constraints can be formally specified in a schema, which is written in a -schema language such as the Document Type Definitions (DTDs) or the XML -Schema Definitions (XSDs) [Thompson et al. 2004]. -The advantages offered by the presence of a fully specified schema are -numerous. First and foremost, a schema allows automatic validation of the -input document structure, which not only facilitates automatic processing but -also ensures soundness of the input. Unvalidated input data from Web requests -is considered as the number one vulnerability for Web applications [Open Web -Application Security Project Consortium 2004]. The presence of a schema also -allows for automation and optimization of search, integration, and processing -of XML data (refer to, e.g., Benedikt et al. [2008], Deutsch et al. [1999], Koch -et al. [2004], Manolescu et al. [2001], Neven and Schwentick [2006], Wang -et al. [2003]). 
Moreover, various software development tools such as Castor -[Castor] and SUN’s JAXB [Sun] rely on schemas to perform object-relational -mappings for persistence. Furthermore, the existence of schemas is imperative -when integrating (meta) data through schema matching [Rahm and Bernstein -2001] and in the area of generic model management [Bernstein 2003; Melnik -2004]. A final advantage of a schema is that it assigns meaning to the data. -That is, it provides a user with a concrete semantics of the document and -aids in the specification of meaningful queries over XML data. Although the -examples mentioned here just scratch the surface of current applications, -they already underscore the importance of schemas accompanying XML -data. -Unfortunately, in spite of the aforementioned advantages, the presence of -a schema is not mandatory and many XML documents are not accompanied -by one. For instance, in a recent study Mignet et al. [2003] and Barbosa et al. -[2006] have shown that approximately half of the XML documents available -on the Web do not refer to a schema. In another study Bex et al. [2004] and -Martens et al. [2006] have noted that about two-thirds of XSDs gathered from -schema repositories and from the Web are not valid with respect to the W3C -XML Schema specification [Thompson et al. 2004], rendering them essentially -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:3 - -useless for immediate application. A similar observation was made by -Sahuguet [2000] concerning DTDs. -Based on the lack of schemas in practice, it is essential to devise algorithms -that can infer a schema for a given collection of XML documents when none, or -no syntactically correct one, is present.
This is also acknowledged by Florescu -[2005] who emphasizes that in the context of data integration: -“We need to extract good-quality schemas automatically from existing data and perform incremental maintenance of the generated -schemas.” -In this article, we describe two novel schema inference algorithms outperforming existing systems in accuracy, conciseness, and speed. -It should be noted that even when a schema is already available, there -are situations where inference can be useful. One such situation is schema -cleaning: sometimes a schema is too general with respect to the XML data -that it is supposed to describe. In that case, it can be advantageous to infer a new schema based solely on the data at hand. This situation is nicely -illustrated by the following real-world example taken from the Protein Sequence Database DTD [Miklau 2002], which gives the following definition for -the refinfo-element. -authors, citation, volume?, month?, year, -pages?, (title | description)?, xrefs? -An analysis of the available XML corpus (683MB of data) with our inference -algorithms yields following more precise expression for the refinfo-element. -authors, citation, (volume | month), year, -pages?, (title | description)?, xrefs? -Note that the latter is more strict than the former, as it emphasizes that volume -and month do not occur together: either one specifies a month of publication for -a given journal article, or the volume that it has appeared in, but not both. -As this example illustrates, schema inference algorithms can hence be used to -better understand the semantics of a given XML dataset, making it possible to -adapt an existing schema when necessary. In general, schema inference can be -used to restrict schemas to a relevant subset of data needed by the application -at hand, thereby facilitating difficult tasks like schema matching and data -integration. 
Indeed, as argued by Hinkelman [2005], industry-level standards -are too loosely defined in general, which can result in XML schemas where -many business structures are formally specified as being optional. -The second situation where schema inference is useful even though a schema -already exists is in the presence of noisy XML data. In such a situation, part or -all of the data that needs to be processed is rejected by the existing schema. For -instance, we have harvested and investigated a corpus of XHTML documents -from the Web and found that an astonishing 89% of 2092 documents was not -valid with respect to the XHTML Transitional specification [W3C 2002]. In this -case, the inference of a new schema based on the corpus and its comparison -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:4 - -• - -G. J. Bex et al. - -Fig. 1. An example DTD. - -with the XHTML Transitional specification provides a uniform view of the kind -of errors made. Further, given that one often has no choice but to deal with such -noisy data, one may infer a new schema from a subset of the corpus (deleting -documents that make unacceptable errors) and work with that schema rather -than with the official specification to retain at least a minimal validation. -1.1 Problem Setting -Based on the previous observations, it is hence essential to devise algorithms -that can automatically infer a DTD or XSD from a given corpus of XML -documents. -As illustrated in Figure 1, a DTD is essentially a mapping d from element -names to regular expressions over element names. An XML document is valid -with respect to d if for every occurrence of an element name e in the document, -the word formed by its children belongs to the language of the corresponding -regular expression d(e). For instance, the DTD in Figure 1 requires each store -element to have zero or more order children, which must be followed by a -stock element. 
Likewise, each order must have a customer child, which must -be followed by one or more item elements. -To infer a DTD from a corpus of XML documents C it hence suffices to look, -for each element name e that occurs in a document in C, at the set of element -name words that occur below e in C, and to infer from this set the corresponding -regular expression d(e). As such, the inference of DTDs reduces to the inference of regular expressions from sets of positive example words. To illustrate, -from the words id price, id qty supplier, and id qty item item appearing under item elements in a sample XML corpus, we could derive the following -rule. -item → (id, price | (qty, (supplier | item+ ))) -While the inference of XSDs is more complicated than the inference of DTDs, -recent characterizations [Martens et al. 2006] show that the structural core of -XML schema (that is, the sets of trees that are definable by XSDs) corresponds -to DTDs extended with vertical regular expressions. Therefore, one cannot -hope to successfully infer XSDs without good algorithms for inferring regular -expressions. As such, we focus in this article on the inference of regular expressions (and therefore, by the preceding reduction, on the inference of DTDs). -The inference of XSDs, building on the algorithms presented here, is treated in -a companion article [Bex et al. 2007]. -In particular, let Σ be a fixed set of alphabet symbols (also called element -names), and let $\Sigma^{*}$ be the set of all words over Σ. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:5 - -Definition 1 (Regular Expressions). In this article, we are interested in -learning regular expressions r, s of the form -$r, s ::= \emptyset \mid \varepsilon \mid a \mid r \cdot s \mid r + s \mid r? \mid r^{+}$, -where parentheses may be added to avoid ambiguity. Here, ε denotes the empty -word; a ranges over symbols in Σ; r .
s denotes concatenation; r + s denotes -disjunction; $r^{+}$ denotes one-or-more repetitions; and r? denotes the optional -regular expression. That is, the language L(r) accepted by regular expression -r is given by -$L(\emptyset) = \emptyset$ -$L(\varepsilon) = \{\varepsilon\}$ -$L(a) = \{a\}$ -$L(r \cdot s) = \{vw \mid v \in L(r),\ w \in L(s)\}$ -$L(r + s) = L(r) \cup L(s)$ -$L(r^{+}) = \{v_1 \cdots v_n \mid n \geq 1 \text{ and } v_1, \ldots, v_n \in L(r)\}$ -$L(r?) = L(r) \cup \{\varepsilon\}$. -For convenience, we sometimes omit the concatenation symbol, simply writing rs for r.s. Note that the Kleene star operator (denoting zero or more repetitions as in $r^{*}$) is not allowed by the preceding syntax. This is not a restriction, -since $r^{*}$ can always be represented as $(r^{+})?$ or $(r?)^{+}$. Conversely, the latter can -always be rewritten into the former for presentation to the user. Also note that -the previous syntax uses r + s to denote disjunction rather than the vertical -bar notation r | s used by DTDs. The former notation should not be confused -with the one-or-more repetition operator $r^{+}$, where the plus symbol is used in -the exponent. -The class of all regular expressions is actually too large for our purposes, -as both DTDs and XSDs require the regular expressions occurring in them to -be deterministic (also sometimes called one-unambiguous [Brüggemann-Klein -and Wood 1998]). Intuitively, a regular expression is deterministic if, without -looking ahead in the input word, it allows each symbol of that word to be matched -uniquely against a position in the expression when processing the input in -one pass from left to right. For instance, $(a + b)^{*} a$ is not deterministic as already the first symbol in the word aaa could be matched by either the first or -the second a in the expression. Without lookahead, it is impossible to know -which one to choose. The equivalent expression $b^{*} a (b^{*} a)^{*}$, on the other hand, is -deterministic. -Definition 2.
Let $\bar{r}$ stand for the regular expression obtained from $r$ by -replacing the $i$th occurrence of alphabet symbol $a$ in $r$ by $a^{(i)}$, for every $i$ and -$a$. For example, for $r = b^{+} a (b a^{+})?$ we have $\bar{r} = (b^{(1)})^{+} a^{(1)} (b^{(2)} (a^{(2)})^{+})?$. A regular -expression $r$ is deterministic if there are no words $w a^{(i)} v$ and $w a^{(j)} v'$ in $L(\bar{r})$ -such that $i \neq j$. -Equivalently, an expression is deterministic if the so-called Glushkov construction [Brüggeman-Klein 1993] translates it into a deterministic finite -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:6 - -• - -G. J. Bex et al. - -automaton rather than a nondeterministic one [Brüggemann-Klein and Wood -1998]. Not every nondeterministic regular expression is equivalent to a deterministic one [Brüggemann-Klein and Wood 1998]. Thus, semantically, the class -of deterministic regular expressions forms a strict subclass of the class of all -regular expressions. -Learning in the limit. For the purpose of inferring DTDs from XML data, -we are hence in search of an algorithm that, given enough sample words of a -target deterministic regular expression $r$, returns a deterministic expression $r'$ -equivalent to $r$. In the framework of learning in the limit [Gold 1967], such an -algorithm is said to learn the deterministic regular expressions from positive -data. -Definition 3. Define a sample to be a finite subset of $\Sigma^{*}$ and let R be -a subclass of the regular expressions. An algorithm M mapping samples to -expressions in R is said to learn R from positive data if: (1) S ⊆ L(M(S)) for -every sample S and (2) to every r ∈ R we can associate a so-called characteristic -sample $S_r$ ⊆ L(r) such that, for each sample S with $S_r$ ⊆ S ⊆ L(r), M(S) is -equivalent to r. -Intuitively, the first condition says that M must be sound; the second that -M must be complete, given enough data. A class of regular expressions R is -learnable in the limit from positive data if an algorithm exists that learns R.
-For the class of all regular expressions, it was shown by Gold [1967] that no -such algorithm exists. The same holds for the class of deterministic regular -expressions, as shown in our companion article [Bex et al. 2008]. -PROPOSITION 4 (BEX ET AL. 2008). The class of deterministic regular expressions is not learnable in the limit from positive data. -Proposition 4 immediately excludes the possibility for an algorithm to infer -the full class of DTDs. In practice, however, regular expressions occurring in -DTDs and XSDs are concise rather than arbitrarily complex. Indeed, a study -of 819 DTDs and XSDs gathered from the Cover Pages [Cover 2003] (including -many high-quality XML standards) as well as from the Web at large, revealed -that regular expressions occurring in practical schemas are such that every -alphabet symbol occurs at most k times, with k small. Actually, in 98% of the -cases k = 1. -Definition 5. A regular expression is k-occurrence if every alphabet symbol -occurs at most k times in it. -For example, the expressions customer . order+ and (school + institute)+ -are both 1-occurrence, while id .(qty + id) is 2-occurrence (as id occurs twice). -Observe that if r is k-occurrence, then it is also l-occurrence for every l ≥ k. -To simplify notation, we often abbreviate “k-occurrence regular expression” by -k-ORE and also refer to the 1-OREs as “single occurrence regular expressions” -or SOREs. -Note that, since every alphabet symbol can occur at most once in a SORE, -every SORE is necessarily deterministic. Indeed, we have the following strict -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:7 - -inclusion hierarchy among the various classes of regular expressions just -discussed. 
-$\text{SOREs} \subset 2\text{-OREs} \subset 3\text{-OREs} \subset \cdots \subset k\text{-OREs} \subset \text{all regex}$ and $\text{SOREs} \subset \text{deterministic regex} \subset \text{all regex}$ -(For k ≥ 2, the classes of k-OREs and deterministic regular expressions are -incomparable.) Given their importance in practical schemas, we focus in this -article on the inference of SOREs. The inference of deterministic k-OREs for -k > 1 is treated in a companion article [Bex et al. 2008]. -1.2 Outline and Contributions -In particular, we show in Section 3 that the class of SOREs can be efficiently -learned in the limit from positive data by first constructing an automaton -representation of the target SORE using techniques of García and Vidal [1990], -and by subsequently transforming this automaton into an equivalent SORE (if -such a SORE exists) using a novel polynomial-time algorithm called REWRITE. -For the general class of regular expressions the resulting expression can be of -exponential size, as we explain in more detail in Section 3. In Section 4, we -improve REWRITE to deal with real-world, and therefore incomplete, samples. In -contrast to REWRITE, which fails when its input automaton is not equivalent to -a SORE, the resulting improvement, called RWR, repairs the input automaton -until it becomes equivalent to a SORE. We also develop an extension of RWR, -called RWR2, which improves the precision of RWR at the cost of increased running -time. -For the settings where extremely little XML data is available to infer a -schema from (for instance, when the data is returned as answers to queries or -Web service requests [Ngu et al. 2005; Oaks and ter Hofstede 2007]), we -introduce in Section 6 the algorithm CRX. CRX successfully learns the class -of CHAREs, a strict subclass of the SOREs that nevertheless holds great -practical importance. Indeed, the same investigation as before reveals that -more than 90% of the regular expressions occurring in practical schemas are -CHAREs [Martens et al. 2006].
-We experimentally validate RWR, RWR2 , and CRX in Section 7 on both small and -large samples drawn from real-world target DTDs whose regular expressions -fall both within the class of SOREs/CHAREs and outside of those classes. In -all settings, our algorithms outperform existing systems in accuracy, conciseness, and speed. Further, we assess the strong generalization ability of CRX by -establishing on average the minimal number of sample words needed to derive -optimal regular expressions. In Section 8 we discuss how to extend RWR and -CRX to incrementally compute the inferred regular expressions when new data -arrive, how to address noise, and how to deal with numerical predicates. We -begin in the next section with a discussion of related work, and conclude in -Section 9. -It is important to note that this article differs from its conference version [Bex -et al. 2006] in the following way. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:8 - -• - -G. J. Bex et al. - -—First and foremost, it corrects the results of Bex et al. [2006] by providing -a completely new algorithm for converting automata into equivalent SOREs -(provided such a SORE exists), and gives a full correctness proof (Section 3). -In contrast to what is claimed in Bex et al. [2006], the conversion algorithm -of Bex et al. [2006] does not always yield an equivalent SORE, as discussed -in Section 5. -—It introduces new heuristics (based on a language size criterion) for dealing -with real-world, and therefore incomplete datasets (Section 4). -—It adds new experiments that measure: (1) the impact of noise and (2) the -accuracy of our algorithms under various levels of missing data. -2. RELATED WORK -Schema inference. Schemas for semistructured data have been defined in -Buneman et al. [1997], Fernandez and Suciu [1998], and McHugh et al. -[1997] and their inference has been addressed in Goldman and Widom [1997], -and Nestorov et al. 
[1997, 1998]. The methods in Nestorov et al. [1997] and -Goldman and Widom [1997] focus on the derivation of a graph summary -structure (called full representative object or dataguide) for a semistructured -database. This data structure contains all paths in the database. Approximations of this structure are considered by restricting to paths of a certain length. -The latter then basically reduces to the derivation of an automaton from a set -of bounded length strings. Naively restricting the algorithms to trees rather -than graphs is inappropriate since no order is considered between the children -of a node so that DTD-like schemas cannot be derived. However, even the use -of more sophisticated encodings of the XML documents using edges between -siblings would be to no avail since no algorithms are given to translate the -obtained automata to regular expressions. In Nestorov et al. [1998], a schema -is a typing by means of a datalog program. Again, no algorithms are given -to transform datalog types into regular expressions. So, these approaches -can therefore not be used to derive DTDs, not even when the semistructured -database is tree-shaped. -DTD inference. In the context of DTD inference, Sankey and Wong [2001] -propose several approaches to generate probabilistic string automata to represent regular expressions. To transform these into actual regular expressions, -and hence to obtain DTDs, the authors refer to the methods of Ahonen [1996]. -The latter provides a method to translate one-unambiguous nonprobabilistic -string automata to regular expressions, as given by Brüggemann-Klein and -Wood [1998], followed by a post-processing simplification step. Apart from several case analyses based on a dictionary example, no systematic study of the -effectiveness of the approach is provided. In particular, in contrast to our results, no target class is given for which the set of transformations is complete. 
-There are only a few papers describing systems for direct DTD inference -[Garofalakis et al. 2003; Min et al. 2003; Chidlovskii 2001]. Only one of them is -available for testing: XTRACT [Garofalakis et al. 2003]. In Section 7, we make a -detailed comparison with our proposal. In contrast to our approach, the XTRACT -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:9 - -system generates for every separate string a regular expression while representing repeated subparts by introducing Kleene-*. In a second step, the system -factorizes common subexpressions of these candidate regular expressions using algorithms from the logic optimization literature. Finally, in the third step, -XTRACT applies the Minimum Description Length (MDL) principle to find the -best RE among the candidates. Although the approach has been shown to work -on real-world DTDs in Garofalakis et al. [2003] the XML data complying to -these DTDs was generated. We report in Section 7 that XTRACT has two kinds of -shortcomings on real-world XML data: (1) it generates large, long-winded, and -difficult to interpret regular expressions; and (2) it cannot handle large datasets (over 1000 strings). The latter is due to the NP-hard submodule in the -third step of the XTRACT algorithm [Fernau 2004]. The former problem seems -to be more fundamental. The final step results in expressions consisting of -disjunctions of regular expressions while in practice the large majority of regular expressions are concatenations of disjunctions [Martens et al. 2006]. As a -result, larger datasets result in larger regular expressions. -In Min et al. [2003] an adaptation of the XTRACT approach to a restricted -class of regular expressions which form a subclass of SOREs is described. Although the system, according to the experiments conducted in Min et al. 
[2003], -outperforms XTRACT in accuracy and efficiency, it seems that the two fundamental shortcomings described earlier remain. It would thus be surprising if the -system performed much better than XTRACT on real-world data. Similarly to -Ahonen [1996], the approach of Chidlovskii [2001] relies on the translation of -Glushkov automata to regular expressions which, in general, can lead to an -exponential size increase. -Trang [Clark ] is state-of-the-art software written by James Clark intended -as a schema translator for the schema languages DTDs, Relax NG, and XML -Schema. In addition, Trang allows to infer a schema for a given set of XML -documents. We discuss Trang further in Section 7.1. -Language inference. Learning of regular languages from positive examples in -the computational learning community is mostly directed towards inference of -automata as opposed to inference of regular expressions [Angluin and Smith -1983; Pitt 1989; Sakakibara 1997]. As noted by Fernau [2004] and argued -in the previous section, first using learning algorithms for deterministic automata and then transforming these into regular expressions in general leads -to unmanageable and long-winded regular expressions. Some approaches to -inference of regular expressions for restricted cases have been considered. For -instance, Brāzma [1993] showed that regular expressions without union can -be approximately learned in polynomial time from a set of examples satisfying -some criteria. Fernau [2009] provided a learning algorithm for finite unions -of pairwise left-aligned union-free regular expressions. These expressions are -different from the expressions we consider here: they are not included in the -class of SOREs and do not contain all CHAREs. The development is purely -theoretical, no experimental validation has been performed. -Automata to RE translation. 
Although heuristics for automata to RE translations [Delgado and Morais 2004; Han and Wood 2007] have been proposed, -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:10 - -• - -G. J. Bex et al. - -Fig. 2. (a) The SOA accepting the same language as the SORE a . b .(c+d+ ). (b) The SOA generated -by 2T-INF for the sample S = {bacacdacde, cbacdbacde, abccaadcde}. - -all of them are optimizations of the classical state elimination algorithm. In -particular, they investigate the best order to eliminate states when going from -automata to regular expressions. So, they focus on the class of all automata -for which, as explained in Section 3, an exponential increase in size cannot be -avoided in general. Further, the methods remain theoretical as no experimental -analysis has been performed. Caron and Ziadi [2000] devise an algorithm deciding whether an automaton is Glushkov. If so, the automaton can be rewritten -into a short equivalent regular expression. Their method works in a top-down -fashion, that is, it derives the top nodes of the parse tree corresponding to -the regular expression first, and subsequently proceeds downward in the tree. -Consequently, the method first derives the largest subexpressions of the expression, making it harder to devise heuristics in the presence of missing data. -In contrast, our approach is bottom-up, that is, starting from the leaf nodes of -the parse tree, composing them into the smallest subexpressions. -3. A COMPLETE ALGORITHM FOR INFERRING SORES -Our goal in this section is to infer a SORE s equivalent to a target SORE r -given only a finite sample S ⊆ L(r). To this end, we first learn from S a Single -Occurrence Automaton (SOA for short). A SOA is a specific kind of deterministic -finite state automaton in which all states, except for the initial and final state, -are element names. Figure 2(a) gives an example. 
Note that in contrast to the -classical definition of automata, no edges are labeled: all incoming edges in a -state a are assumed to be labeled by a. As such, a word a1 , . . . , an is accepted if -there is an edge from the initial state to a1 , an edge from a1 to a2 ,. . . , and an -edge from an to the final state. Thus, the SOA in Figure 2(a) accepts the same -language as a . b .(c + d+ ). -Definition 6 (SOA). Let src and sink be two special symbols, distinct from -the element names, that will serve as the initial and final state, respectively. A -single occurrence automaton is a finite directed graph G = (V, E) such that: -(1) {src, sink} ⊆ V and all nodes in V − {src, sink} are element names; and -(2) src has only outgoing edges; sink has only incoming edges; and every v ∈ -V − {src, sink} is visited during a walk from src to sink. -Note that V − {src, sink} can be empty. We write L(G) for the set of all words -accepted by G; V(G) for the set of G’s vertices, and E(G) for G’s edge relation. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:11 - -Algorithm 1. 2T-INF -Input: a finite set of sample strings S -Output: a SOA G such that S ⊆ L(G) -1: Let V be the set of states consisting of all element names occurring in S plus the -initial state src and final state sink -2: Initialize E := ∅ -3: for each string a1 . . . an in S do -4: -add the edges (src, a1 ), (a1 , a2 ), . . . , (an, sink) to E -5: end for -6: return G = (V, E) - -3.1 Learning an Automaton -Given a sample S, we can learn an automaton G that accepts all words in S by -means of the algorithm 2T-INF shown in Algorithm 1. Its behavior is illustrated -in Figure 2(a) on the sample S = {abc, abdd} and in Figure 2(b) on the sample -S = {bacacdacde, cbacdbacde, abccaadcde}. 2T-INF was introduced by Garcı́a and -Vidal [1990], who also proved the following proposition. 
-PROPOSITION 7 ([GARCÍA AND VIDAL 1990]). 2T-INF is sound, that is, S ⊆ -L(2T-INF(S)) for each sample S. Moreover, 2T-INF is minimal, that is, for each SOA -G with S ⊆ L(G), 2T-INF(S) is a subgraph of G and hence L(2T-INF(S)) ⊆ L(G). -It turns out that 2T-INF is also complete for building a SOA representation of -a target SORE r, provided that its input sample is representative with regard -to r. -Definition 8 (Representative Sample). A word v of length 2 is said to be a -2-gram of a set of words W if it occurs as a subword in some w ∈ W. A sample -S is representative of a SORE r if S ⊆ L(r) and the following statements hold: -(1) for every a ∈ Σ starting a word in L(r) there is a word in S that starts with -a; -(2) for every a ∈ Σ ending a word in L(r) there is a word in S that ends with a; -(3) every 2-gram of L(r) is a 2-gram of S. -If S is not representative of r, then we say that S does not cover r. -For instance, the sample {a, b, c} is representative for a + b + c but {a, c} -is not since it lacks a word starting with b. Furthermore, the sample -{bacacdacde, cbacdbacde, abccaadcde} is representative for $((b?(a + c))^{+} d)^{+} e$ but -{bacacdacde, cbacdbacde} is not since it does not contain the 2-gram ab. -PROPOSITION 9. If S is a representative sample of SORE r then L(2T-INF(S)) = L(r). - -PROOF. It is not hard to see that every SORE r can be transformed into an -equivalent SOA $G_r$: we take as nodes of $G_r$ all element names occurring in r -plus the initial state src and the final state sink; for each alphabet symbol a that -starts a word in L(r) we add the edge (src, a) to $G_r$; for each alphabet symbol a -that ends a word in L(r) we add an edge (a, sink) to $G_r$, and for each alphabet -symbol b that follows an alphabet symbol a in a word in L(r) we add the edge -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:12 - -• - -G. J. Bex et al. - -Fig. 3. A SOA not equivalent to any SORE.
It accepts the same language as a(ba)+ . - -(a, b) to Gr . Now reason as follows. Clearly, S ⊆ L(r) = L(Gr ). Hence, 2T-INF(S) -is a subgraph of Gr by Proposition 7. Since S is a representative sample of r, -however, every edge of Gr must also be in 2T-INF(S). As such, 2T-INF(S) = Gr and -hence L(2T-INF(S)) = L(Gr ). -3.2 From SOA to SORE -Proposition 9 shows that it is possible to learn a SOA representation of a target -SORE r, provided that we are given enough data. To transform this SOA into -a regular expression, an obvious approach would be to use known techniques -such as the classical state elimination algorithm (refer to, e.g., Hopcroft and -Ullman [1979]). Unfortunately, as already hinted upon by Fernau [2004, 2009] -and as we illustrate shortly, it is very difficult to get concise regular expressions -from an automaton representation. For instance, the classical state elimination -algorithm applied to the SOA generated by 2T-INF in Figure 2(b) yields the -expression:1 -(aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + -aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ -(b + aa∗ b))∗ (aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d)))(aa∗ d + -(c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + aa∗ c)(c + -aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))∗ - -which differs quite a bit from the equivalent SORE -((b?(a + c))+ d)+ e - -(‡). - -Actually, results by Ehrenfeucht and Zeiger [1976], Gelade and Neven [2008], -and Gruber and Holzer [2008] show that it is impossible in general to generate -concise regular expressions from automata: there are automata, even SOAs as -generated by 2T-INF, for which the number of occurrences of alphabet symbols in -the smallest equivalent expression is exponential in the size of the automaton. -For such automata, a concise regular expression representation hence does not -exist. 
-These results imply that there are SOAs G for which an equivalent SORE -does not exist (Figure 3 gives a simple example). Note, however, that when -such a SORE r does exist, its size is always linearly bounded by the number of -states of G. Indeed, since every alphabet symbol can occur at most once in r, the -size of r is linearly bounded by the alphabet symbols that it mentions. Since G -and r are equivalent, these symbols are exactly the states of G (minus src and -sink). Hence, the SOREs constitute a well-behaved and concisely representable -subset of the regular languages. It is therefore natural to investigate how to -1 Transformation computed by JFLAP: www.jflap.org. - -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:13 - -transform a given SOA into an equivalent SORE when such a SORE exists. -Clearly, the previous example illustrates that the classical state elimination -algorithm does not suffice for this purpose. -For that reason, we introduce in this section a novel graph-rewriting approach for transforming SOAs into SOREs. While our approach is related to the -classical state-elimination algorithm for transforming an arbitrary automaton -into a regular expression, we do not eliminate states by introducing additional -edges (thereby duplicating subexpressions) but instead replace sets of states -by single states (taking care to avoid duplication). In addition, there are two -rewriting steps that only remove edges. -Just as the classical algorithm, it is necessary for the definition of the graph -rewrite rules to define a generalization of SOAs in which internal states are -allowed to be labeled by SOREs (as opposed to element names from ). This generalization is defined as follows. Call two regular expressions r and s alphabetdisjoint if r and s have no alphabet symbol in common. For example, (a+b)? 
and -c+ are alphabet-disjoint, whereas (a + b) and b?c+ are not. Call an expression -r proper if it accepts at least one nonempty word (i.e., it is not equivalent to ∅, -nor to ε). -Definition 10. A generalized Single Occurrence Automaton (generalized -SOA for short) is a finite graph G = (V, E) such that: -(1) {src, sink} ⊆ V and all vertices in V − {src, sink} are pairwise alphabet-disjoint proper SOREs; and -(2) the edge relation E is such that src has only outgoing edges; sink has only -incoming edges; and every v ∈ V is visited by a walk from src to sink. -A word $w \in \Sigma^{*}$ is accepted by G if there is a walk src r1 . . . rm sink in G and a -division of w into subwords w = w1 . . . wm such that wi ∈ L(ri ), for 1 ≤ i ≤ m. -Again, we write L(G) for the set of all words accepted by G. -Figure 7 shows some examples. Clearly, every SOA is also a generalized -SOA. In what follows, we write PredG (s) for the set of all direct predecessors of -a SORE s in G, and SuccG (s) for the set of all direct successors of s in G. -PredG (s) := {r | (r, s) ∈ E(G)}, -SuccG (s) := {t | (s, t) ∈ E(G)}. -Furthermore, we write $\mathrm{Pred}^{-}_G(s)$ for $\mathrm{Pred}_G(s) - \{s\}$ and similarly $\mathrm{Succ}^{-}_G(s)$ for -$\mathrm{Succ}_G(s) - \{s\}$. Finally, we write - -$\mathrm{Pred}^{+}_G(s) := \begin{cases} \mathrm{Pred}_G(s) \cup \{s\} & \text{if } s = {s'}^{+} \text{ for some } s' \\ \mathrm{Pred}_G(s) & \text{otherwise} \end{cases}$ - -$\mathrm{Succ}^{+}_G(s) := \begin{cases} \mathrm{Succ}_G(s) \cup \{s\} & \text{if } s = {s'}^{+} \text{ for some } s' \\ \mathrm{Succ}_G(s) & \text{otherwise.} \end{cases}$ - -Rewrite rules. Our system of rewrite rules consists of the seven rules shown -in Figures 4–6: one rule to introduce disjunction (r + s), four rules to introduce -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:14 - -• - -G. J. Bex et al. - -Fig. 4. Rewrite rules part 1. In the illustrations, P is the set $\mathrm{Pred}_G(r) - \{r, s\}$. S is the set $\mathrm{Succ}_G(s) - \{r, s\}$. The gray loops on r and s indicate that $r \in \mathrm{Succ}^{+}_G(r)$ and $s \in \mathrm{Succ}^{+}_G(s)$, respectively. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
- - Inference of Concise Regular Expressions and DTDs - -• - -11:15 - -Fig. 5. Rewrite rules part 2. In the illustrations, P is the set $\mathrm{Pred}_G(r) - \{r, s\}$. S is the set $\mathrm{Succ}_G(s) - \{r, s\}$. The gray loops on r and s indicate that $r \in \mathrm{Succ}^{+}_G(r)$ and $s \in \mathrm{Succ}^{+}_G(s)$, respectively. - -concatenation (r . s, r? . s, r . s?, and r? . s?), one rule to introduce iteration ($r^{+}$), -and one rule to introduce optionals (r?). At the basis of the first five rules lies -the contraction of two states r and s into a single new state t, which is defined -as follows. -Definition 11 (State Contraction). Let G be a generalized SOA; let r and s -be states in G; and let t be a state not in G. The contraction of r and s into t is -the generalized SOA G[r, s ⇒ t] obtained from G as follows: -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:16 - -• - -G. J. Bex et al. - -Fig. 6. Rewrite rules part 3. In the illustrations, P is the set $\mathrm{Pred}_G(r) - \{r, s\}$. S is the set $\mathrm{Succ}_G(s) - \{r, s\}$. Note in particular that the rule OPTIONAL r? can only be applied when G contains only one -node besides src and sink. - -(1) Add t as a new state to G; -(2) make every v ∈ PredG (r) − {r, s} a predecessor of t; -(3) make every w ∈ SuccG (s) − {r, s} a successor of t; -(4) add a loop t → t if r ∈ SuccG (s); and -(5) remove r, s and all of their incoming and outgoing edges. -Note that state contraction is not symmetric: t inherits the predecessors of r but the successors of s. -To illustrate, the contraction G[a, c ⇒ a + c] of the generalized SOA G in -Figure 7(a) is shown in Figure 7(b). Similarly, the contraction G[b, a + c ⇒ -b? .(a + c)] of the generalized SOA G in Figure 7(b) is shown in Figure 7(c). Note -that if r = s, then G[r, s ⇒ t] is simply a substitution of r by the new state t. -To simplify notation, we simply write G[r ⇒ t] for such contractions in what -follows. -In addition to contraction, the rewrite rules also use the following -operation. -Definition 12.
If G is a generalized SOA and r, s are states in G, then we -write $G \setminus (r, s)$ to denote the generalized SOA obtained from G by removing the -edge from r to s, if present. -In what follows, we write G → H to indicate that G rewrites to H in a single -step according to the rewrite rules in Figures 4–6, and G →* H to indicate that -G rewrites to H in zero or more steps. -The following proposition shows that the rewrite rules are sound. -PROPOSITION 13. If G is a generalized SOA and G → H then H is also a -generalized SOA and L(G) = L(H). -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:17 - -PROOF. First observe that, since all states in a generalized SOA are pairwise -alphabet-disjoint proper SOREs, the new states r + s; r . s; r? . s; r . s?; r? . s?; $r^{+}$; -and r? introduced by the rewrite rules in Figures 4–6 must themselves be proper -SOREs alphabet-disjoint with the remaining states. As such, all states in H -are pairwise alphabet-disjoint proper SOREs. To show that H is a generalized -SOA, it hence remains to show that every state in H participates in a walk -from src to sink. To this end, we distinguish the following three cases. -—H = G[r, s ⇒ t] for some t. Then, since G is a generalized SOA, r and s -participate in a walk from src to sink. In particular, there is a walk from src -to r in G, and a walk from s to sink. Then, by definition of state contraction, -there is a walk from src to t and from t to sink in H, that is, t participates in -a walk from src to sink in H. -—$H = G[r \Rightarrow r^{+}] \setminus (r^{+}, r^{+})$. Then, by definition of state contraction and since -r participates in a walk from src to sink in G, $r^{+}$ must participate in a -walk from src to sink in $G[r \Rightarrow r^{+}]$. This walk can always be transformed -into a walk from src to sink in H by removing the edge $(r^{+}, r^{+})$ should it -occur. -—$H = G[r \Rightarrow r?] \setminus (\mathit{src}, \mathit{sink})$.
Then, by definition of state contraction and since -r participates in a walk from src to sink in G, r? must participate in a walk -from src to sink in G[r ⇒ r?]. Since the edge (src, sink) cannot occur in this -walk (recall that src has no incoming edges and sink has no outgoing edges), -r? also participates in a walk from src to sink in H. -To see that L(G) = L(H) we reason by a case analysis on the rewrite rule used -to transform G into H. For economy of space, we only illustrate this reasoning -for DISJUNCTION r + s; the other cases are similar. -So, suppose that G was rewritten into H by DISJUNCTION r + s, that is, H = -G[r, s ⇒ r+s]. Then r and s have the same (extended) predecessor and successor -set. From this, it follows that the following statements are equivalent. -(1) $s \in \mathrm{Succ}_G(r)$; -(2) $r \in \mathrm{Succ}_G(s)$; -(3) $s \in \mathrm{Succ}^{+}_G(s)$; -(4) $r \in \mathrm{Succ}^{+}_G(r)$. -For instance, $s \in \mathrm{Succ}_G(r) \Leftrightarrow r \in \mathrm{Succ}_G(s)$ since: -$s \in \mathrm{Succ}_G(r) \Leftrightarrow s \in \mathrm{Succ}_G(r) \cup \{r\}$ (since $r \neq s$) -$\Leftrightarrow s \in \mathrm{Succ}^{+}_G(r)$ (by definition of $\mathrm{Succ}^{+}_G(r)$) -$\Leftrightarrow s \in \mathrm{Succ}^{+}_G(s)$ (since $\mathrm{Succ}^{+}_G(r) = \mathrm{Succ}^{+}_G(s)$) -$\Leftrightarrow s \in \mathrm{Pred}^{+}_G(s)$ (by definition of $\mathrm{Succ}^{+}_G(s)$ and $\mathrm{Pred}^{+}_G(s)$) -$\Leftrightarrow s \in \mathrm{Pred}^{+}_G(r)$ (since $\mathrm{Pred}^{+}_G(r) = \mathrm{Pred}^{+}_G(s)$) -$\Leftrightarrow s \in \mathrm{Pred}_G(r) \cup \{r\}$ (by definition of $\mathrm{Pred}^{+}_G(r)$) -$\Leftrightarrow s \in \mathrm{Pred}_G(r)$ (since $r \neq s$) -$\Leftrightarrow r \in \mathrm{Succ}_G(s)$ (by definition of $\mathrm{Pred}_G(r)$ and $\mathrm{Succ}_G(s)$) - -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:18 - -• - -G. J. Bex et al. - -The other equivalences can be similarly obtained. From these equivalences, -it follows that G must take one of the two forms illustrated for rewrite rule -DISJUNCTION r + s in Figure 4. In both cases, the corresponding H is also shown. -Now suppose that $w = w_1 \ldots w_m \in \Sigma^{*}$ is recognized by the walk $\mathit{src}, t_1, \ldots, t_m, \mathit{sink}$ -in G with $w_i \in L(t_i)$ for 1 ≤ i ≤ m. Let the sequence $\mathit{src}, t'_1, \ldots, t'_m, \mathit{sink}$ -be obtained from $\mathit{src}, t_1, \ldots, t_m, \mathit{sink}$ by replacing every occurrence of r and s by -r + s.
By inspection of the illustrations for rule DISJUNCTION r + s in Figure 4 it -is not difficult to see that $\mathit{src}, t'_1, \ldots, t'_m, \mathit{sink}$ is a walk in H. Moreover, $w_i \in L(t'_i)$ -by construction for 1 ≤ i ≤ m. Therefore, w ∈ L(H) and hence L(G) ⊆ L(H). -Conversely, suppose that $w = w_1 \ldots w_m \in \Sigma^{*}$ is recognized by $\mathit{src}, t_1, \ldots, t_m, \mathit{sink}$ -in H with $w_i \in L(t_i)$ for 1 ≤ i ≤ m. Determine $t'_i$ as follows: -$t'_i = \begin{cases} t_i & \text{if } t_i \neq r + s \\ r & \text{if } t_i = r + s \text{ and } w_i \in L(r) \\ s & \text{if } t_i = r + s \text{ and } w_i \in L(s) \end{cases}$ -By inspection of the illustrations for rule DISJUNCTION r + s in Figure 4 it is -not difficult to see that $\mathit{src}, t'_1, \ldots, t'_m, \mathit{sink}$ is a walk in G. Moreover, $w_i \in L(t'_i)$ -for 1 ≤ i ≤ m. Therefore w ∈ L(G) and hence L(H) ⊆ L(G). As such, L(G) = -L(H). -Since each rewrite rule either contracts two states into a single state or -removes an edge from G, the size of H is always smaller than that of G. Therefore, we -have the next proposition. -PROPOSITION 14. The system of rewrite rules in Figures 4–6 is terminating: -there is no infinite sequence of rewrite steps G → H → I → . . . -Our algorithm REWRITE, shown in Algorithm 2, then operates as follows. First, -it checks whether the input SOA G corresponds to the empty language (∅) or -the empty word (ε) in lines 1–5. If so, it returns the corresponding regular -expression. Otherwise, it rewrites G until no further rules apply. It then checks -whether the resulting generalized SOA is final. -Definition 15. A generalized SOA G is final if E(G) = {(src, r), (r, sink)} -with r distinct from src and sink. In other words, G is final if it is a chain -consisting of the source, an arbitrary regular expression, and the sink. -If the resulting generalized SOA is indeed final, then clearly L(G) = L(r), -and r is returned as result. If the resulting generalized SOA is not final, then -G is not equivalent to a SORE (as we formally show further on), and REWRITE -fails.
To illustrate, Figure 7 shows an example run of REWRITE on the example -SOA from Figure 2(b). -THEOREM 16. On input SOA G, REWRITE fails if and only if G is not equivalent -to a SORE. Otherwise, REWRITE returns a SORE equivalent to G. Moreover, -REWRITE operates in time $O(n^5)$ where n is the number of states in G. -Note that the complexity $O(n^5)$ is reasonable since when we apply REWRITE to -the result of 2T-INF on a sample S, n corresponds to the (typically small) number -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:19 - -Algorithm 2. REWRITE -Input: a SOA G -Output: a SORE r such that L(r) = L(G) -1: if sink is not reachable from src in G then -2: return ∅ -3: else if E(G) = {(src, sink)} then -4: return ε -5: else -6: while a rewrite rule from Figures 4–6 can be applied do -7: perform the rewrite rule on G -8: end while -9: if G is final then -10: return the corresponding regular expression -11: else -12: fail -13: end if -14: end if - -of distinct element names occurring in S, not the total number or total length -of words in S. -The remainder of this section is devoted to the proof of Theorem 16, which -is divided into three steps. First, we show that REWRITE is sound. -PROPOSITION 17. If REWRITE(G) does not fail then it returns a SORE equivalent to G, for any SOA G. -PROOF. We distinguish three cases. - -(1) If sink is not reachable from src then REWRITE(G) = ∅ (clearly a SORE) and -L(G) = ∅ = L(∅), as desired. -(2) If E(G) = {(src, sink)} then REWRITE(G) = ε (again clearly a SORE), and -L(G) = {ε} = L(ε), as desired. -(3) Otherwise, G is rewritten into a final generalized SOA H with E(H) = -{(src, t), (t, sink)} (t distinct from src and sink) and REWRITE(G) = t. In -particular, t is a SORE. By Proposition 13, L(G) = L(H) and thus, since -E(H) = {(src, t), (t, sink)}, L(G) = L(H) = L(t) = L(REWRITE(G)), as desired.
-Next, we show that REWRITE has the claimed complexity. -PROPOSITION 18. REWRITE operates in time O(n5 ), where n is the number of -states of its input G. -PROOF. We assume that checking whether there is an edge from state r -to state s can be done in constant time (for instance, using an adjacency matrix representation). To see that REWRITE runs in time O(n5 ) under this assumption, let us check that lines 1–4, lines 6–7, and lines 8–11 all run in -O(n5 ). -(Lines 1–4). Since G has at most n2 edges, checking whether sink is reachable -from src can be done in time O(n2 ) using depth-first search. Moreover, checking -whether E(G) = {(src, sink)} can also be done in time O(n2 ). -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:20 - -• - -G. J. Bex et al. - -Fig. 7. An execution of REWRITE on the example automaton in Figure 2(b). Step (1) applies DISJUNCTION r + s with r = a and s = c. Step (2) applies CONCATENATION r? . s with r = b and s = a + c. Step -(3) applies ITERATION r + with r = b? .(a+ c). Step (4) applies CONCATENATION r . s with r = (b? .(a+ c))+ -and s = d. Step (5) applies ITERATION r + with r = (b? .(a + c))+ . d. One more application of -CONCATENATION r . s with r = ((b? .(a + c))+ . d)+ and s = e (not shown) leads to the resulting expression -((b? .(a + c))+ . d)+ . e. - -(Lines 6–7). Suppose that $\vec{G}$ = G1 , G2 , . . . , Gk is the sequence of generalized -SOAs produced by lines 6–7 when rewriting G = G1 until no further rewrite -rule applies. Since rewrite rules never introduce new states without also removing a state, every Gi has at most n states. Now reason as follows. -—The rule for optionals can be applied at most once in $\vec{G}$ since the automaton -that it returns is always final, and since no rewrite rule applies to a final -generalized SOA. Checking the preconditions of the rule for optionals can be -done in time O(n2 ), and its action can be performed in time O(n). 
As such, the total time spent in the rewrite sequence $\vec{G}$ = G1 , . . . , Gk on applying the rewrite rule for optionals is bounded -by O(n2 ). -—Since the rewrite rules for disjunction and concatenation contract two states -into a single one, these rewrite rules can be applied at most n times in $\vec{G}$. -Since all of their preconditions can be checked in time O(n4 ) (by iterating -over all pairs of states r and s in the current automaton Gi and comparing -Pred(r), Pred(s), Succ(r), and Succ(s) as desired) and since state contraction -can be done in time O(n), the total time spent in $\vec{G}$ on the rewrite rules for -disjunction and concatenation is bounded by O(n × n4 ) = O(n5 ). -—Since the rule for iteration removes the loop of the state to which it is applied, -and since each generalized SOA contains at most n loops, there can be at most -n consecutive applications of this rule before another rewrite rule is applied. -By the preceding remarks, there are at most n applications of the other -rewrite rules, so the rewrite rule for iteration can be applied at most n2 times -in $\vec{G}$. Since its precondition can be checked in constant time, and since its -action can be done in time O(n), the total time spent in $\vec{G}$ on the rewrite rule -for iteration is bounded by O(n2 × n) = O(n3 ). -(Lines 8–11). Finally, checking whether a generalized SOA is final and extracting the corresponding regular expression can be done in time O(n2 ). -In summary, lines 1–4 run in time O(n2 ), lines 6–7 run in time O(n5 ), and -lines 8–11 run in time O(n2 ), yielding a total running time of O(n5 ). -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:21 - -Finally, we show that REWRITE(G) fails if and only if G is not equivalent -to a SORE, or equivalently, that REWRITE(G) does not fail if, and only if, G is -equivalent to a SORE. This is actually the most involved part of the proof of -Theorem 16. 
Proposition 17 already shows that if REWRITE(G) does not fail, then -G is equivalent to a SORE. Hence, it remains to show the next proposition. -PROPOSITION 19. If SOA G is equivalent to a SORE, then REWRITE(G) does -not fail. - -Essentially, we prove this proposition in two steps. Call a generalized SOA -proper if L(G) ≠ ∅ and L(G) ≠ {ε}. -(1) We first show that for any proper SOA G equivalent to a SORE there exists -a sequence of rewrite steps that ends in a final automaton (Corollary 46). -(2) In addition, we show that if proper G can be rewritten into a final automaton -by a particular sequence of rewrite steps, then any sequence of rewrite steps -on G ends in a final automaton (Corollary 54). -As such, REWRITE(G) cannot fail when G is equivalent to a SORE: either G is -not proper, in which case lines 1–4 of Algorithm 2 return a valid expression, or -G is proper and will hence be rewritten into a final automaton, in which case -line 10 returns a valid expression. The details may be found in Appendix A. -3.3 Discussion -It should be noted that while the result of REWRITE is always a SORE, this -SORE need not be easy to read (depending on the order of rewriting). For -instance, it is possible for REWRITE to generate an expression r .(s? . t?)?. Clearly, -the optional around (s? . t?) is redundant. Removing it leads to the simpler -r .(s? . t?). For presentation to the user, it is therefore advisable to postprocess -the result of REWRITE (and its variations in Section 4) using a regular expression -simplification algorithm. -4. DEALING WITH MISSING DATA -The results of Section 3 suggest the following method to infer a SORE from a -given sample S. -(1) First, use 2T-INF to learn from S an automaton representation G of the -target SORE r. -(2) Next, convert G into a SORE using REWRITE. -If S is a representative sample of r then G is equivalent to r by Proposition 9. -Therefore, REWRITE(G) does not fail by Theorem 16, and hence REWRITE(G) is -equivalent to r. 
-Unfortunately, real-world samples are rarely representative. For instance, -for target r = (a1 +· · ·+an)+ and increasing values of n, it is increasingly unlikely -that a sample bears witness to each of the n2 2-grams needed to represent r. -On such nonrepresentative samples, 2T-INF will construct an automaton for -which L(G) is a strict subset of L(r). In particular, this automaton need not be -equivalent to a SORE, and REWRITE(G) can fail. Figure 8 shows an example. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:22 - -• - -G. J. Bex et al. - -Fig. 8. The SOA generated by 2T-INF for the nonrepresentative sample S = {bacacdacde, -abccaadcde}. The only rewrite rules that can be applied are ITERATION a+ and ITERATION c+ , after which REWRITE gets stuck in a nonfinal automaton and fails. - -Fig. 9. Repair rules. - -For that reason, we present in this section two modifications of REWRITE -that “repair” G when rewriting gets stuck in a nonfinal automaton. The first -modification, RWR, picks a single repair when rewriting gets stuck, independent -of how the repair affects G. The second modification, RWR2 , in contrast, considers -multiple repair strategies and selects the one that extends G in a minimal way. -The repair rules used by both algorithms are shown in Figure 9. After a repair -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:23 - -Algorithm 3. RWR -Input: a SOA G -Output: a SORE r such that L(G) ⊆ L(r) if G is not equivalent to a SORE, and L(G) = -L(r) otherwise. 
-1: if sink is not reachable from src in G then -2: -return ∅ -3: else if E(G) = {(src, sink)} then -4: -return ε -5: else -6: -while G is not final do -7: -if a rewrite rule from Figures 4–6 can be applied then -8: -apply the rewrite rule on G -9: -else -10: -apply a repair rule from Figure 9 -11: -end if -12: -end while -13: -return the corresponding regular expression r -14: end if - -rule is applied, the automaton necessarily satisfies the precondition of the -corresponding rewrite rule. Now note the following. -PROPOSITION 20. Let G be a proper generalized SOA. If G is not final and no -rewrite rule applies to G, then at least one of the repair rules in Figure 9 applies -to G. -PROOF. Since G is proper, it recognizes at least one nonempty word. Clearly, -this can only happen when src has a successor r distinct from sink. We distinguish two cases. -—Either r has a successor s distinct from src, sink, and r. Clearly, REPAIR r? . s? -is then applicable to G. -—If r does not have such a successor s, then we claim that src has another -successor t, distinct from src, sink, and r. Indeed, suppose for the purpose -of contradiction that no such successor exists. Then, since every state in G -participates in a walk from src to sink, either E(G) = {(src, r), (r, sink)}, or -E(G) = {(src, r), (r, r), (r, sink)}. In the first case G is final, in the second we -can rewrite G using ITERATION r + —a contradiction in both cases. As such, -the claimed t exists. Then, since src ∈ PredG (r) ∩ PredG (t), REPAIR r + t is -applicable to G. -As such, we can always apply a repair rule if rewriting gets stuck in a -nonfinal automaton, after which rewriting can continue. -4.1 A Greedy Approach: RWR -An outline of RWR (short for REWRITE with REPAIRS) is shown in Algorithm 3. Like -REWRITE, it first checks whether its input G is equivalent to ∅ or ε. 
Otherwise, -G is rewritten using the rewrite rules in Figures 4–6 until a final automaton is -reached, arbitrarily selecting a repair rule when rewriting gets stuck. (In our -implementation we prefer repairs that make small extensions to the language -of the automaton over repairs that make larger extensions. In particular, we -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:24 - -• - -G. J. Bex et al. - -first check whether there are r and s for which REPAIR r . s? can be applied. Then -we check whether there are r and s for which REPAIR r? . s can be applied. Next, -we check for REPAIR r + s and finally for REPAIR r? . s?.) -Since the repair rules add edges to G, thereby increasing L(G), we may -conclude the following theorem. -THEOREM 21. For a SOA G, RWR always produces a SORE r with L(G) ⊆ -L(r). Moreover, if G is equivalent to a SORE, then L(G) = L(r). -(The second statement follows by Theorem 16.) Combined with Proposition 9, -we hence obtain the next corollary. -COROLLARY 22. - -Let M be the composition of 2T-INF with RWR, that is, M(S) := - -RWR(2T-INF(S)). Then M learns the class of SOREs from positive data. - -4.2 Exploring the Search Space: RWR2 -When rewriting gets stuck, RWR arbitrarily selects a repair rule (perhaps based -on some ordering of the rules as in our implementation), and discards the others. It should be clear, however, that when different repair rules are applicable, -one rule may have a smaller impact on the language of the automaton than -another. For that reason we present in this section a different modification -of REWRITE that, in contrast to RWR, tries the “best”  repair rules when there -are several candidates. Here, the “best” repair rules are those that add the -least number of words to the language. Since an automaton defines an infinite -language in general, it is of course impossible to take all added words into -account. 
We therefore only consider the words up to a length n, where n is twice -the number of alphabet symbols in the automaton. Formally, for a language L, -let |L≤n| denote the number of words in L of length at most n. Moreover, say -that generalized SOA H is a repair of generalized SOA G if H is obtained by -applying a repair rule on G. Then the repairs of the current automaton G are -ordered according to increasing values of | L(H)≤n|, and the best (i.e., first)  -among them are further investigated. -The resulting algorithm, called RWR2 (an abbreviation of REWRITE with  -best RANKED REPAIRS) is shown in Algorithm 4. Like REWRITE, it first checks -whether its input G is equivalent to ∅ or ε. Otherwise, RWR2 uses RWR2 -AUX to -Algorithm 4. RWR2 -Input: SOA G -Output: a SORE r such that L(G) ⊆ L(r) if G is not equivalent to a SORE, and L(G) = -L(r) otherwise. -1: if sink is not reachable from src in G then -2: -return ∅ -3: else if E(G) = {(src, sink)} then -4: -return ε -5: else -6: -initialize the final automaton Hopt to recognize (G)∗ -7: -return the SORE corresponding to the final automaton computed by -2 -RWR -AUX(G, Hopt ) -8: end if -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:25 - -Algorithm 5. RWR2 -AUX -Input: generalized SOAs G and Hopt -Output: final generalized SOA I such that L(G) ⊆ L(I) if G is not equivalent to a -SORE, and L(G) = L(I) otherwise. 
-1: while a rewrite rule from Figures 4–6 can be applied to G do -2: -perform the rewrite rule on G -3: end while -4: if G is final then -5: -return G -6: else -7: -compute the set R of all possible repairs H of G -8: -sort R in increasing order by | L(H)≤n| -9: -for each of the min(ℓ, |R|) best repairs H do -10: -if | L(H)≤n| < | L(Hopt )≤n| then -11: -recursively compute H′ := RWR2 -AUX(H, Hopt ) -12: -set Hopt := H′ if | L(H′)≤n| < | L(Hopt )≤n| -13: -end if -14: -end for -15: -return Hopt -16: end if - -recursively rewrite and repair G until a final automaton is reached. During -this recursion, Hopt is the best final generalized SOA found so far. Initially, on -line 6 of RWR2 , Hopt is set to the final generalized SOA that accepts all words -over alphabet symbols mentioned in G. RWR2 -AUX then rewrites G in lines 1–2 -until no more rewrite rule is applicable. If the resulting G is final then it is -returned. Otherwise, RWR2 -AUX computes in lines 7–8 all possible repairs H of G -and orders them according to increasing values of | L(H)≤n|. The algorithm then -recursively calls itself on the ℓ best ranked repairs in lines 9–14. The test in -line 10 is an optimization: if the current repair is already worse than the best -final generalized SOA Hopt computed so far in terms of language size, then -further rewriting and repairing cannot yield a final generalized SOA that is -better than Hopt . Lines 11 and 12 update Hopt when appropriate. Finally, Hopt -is returned. -Given its definition, it is clear that RWR2 results in regular expressions with -a smaller language size for increasing values of ℓ, of course at the cost of -increased computation time. In the experiments (Section 7.2) the trade-off between precision and computation time of RWR and RWR2 , for increasing values -of ℓ, is investigated in more detail. -4.3 Efficiently Computing the Language Size -During its execution, RWR2 repeatedly needs to compute the language size of -the possible repairs. 
This computation can actually be done quite efficiently -for SOAs, as we show next. Of course, in general RWR2 needs to compute the -language size also for generalized SOAs, not just ordinary SOAs. Our implementation first expands such generalized SOAs into an equivalent SOA using -the Glushkov construction (similar to the ideas of the proof of Proposition 45 -in the online appendix that can be accessed in the ACM Digital Library), and -then invokes the language size computation procedure explained next. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:26 - -• - -G. J. Bex et al. - -Let |L=m| denote the number of words in L of length exactly m. Let G be a -SOA; and assume that V(G) − {src, sink} = {a1 , . . . , an}. Then consider the n × n -matrix D where for i, j ∈ {1, . . . , n} -$$D[i, j] = \begin{cases} 1 & \text{if } (a_i, a_j) \in E; \text{ and,} \\ 0 & \text{otherwise.} \end{cases}$$ -In addition, define the 1 × n and n× 1 matrices I and F, respectively, as follows: -for i, j ∈ {1, . . . , n} -$$I[1, j] = \begin{cases} 1 & \text{if } (\mathit{src}, a_j) \in E; \text{ and,} \\ 0 & \text{otherwise;} \end{cases}$$ -and -$$F[i, 1] = \begin{cases} 1 & \text{if } (a_i, \mathit{sink}) \in E; \text{ and,} \\ 0 & \text{otherwise.} \end{cases}$$ - -The following lemma is straightforward to prove by induction on n using -the fact that each walk from src to sink in G uniquely determines an accepted -word. Let Dm denote the m-times multiplication of D, with D0 the unit matrix. -LEMMA 23. Let m > 0 and let G be a SOA. Then | L(G)=m| = I · Dm−1 · F. - -Since for m = 0, we simply have | L(G)=m| = 1 if (src, sink) ∈ E, and -| L(G)=m| = 0 otherwise, and since $|L(G)^{\leq n}| = \sum_{m=0}^{n} |L(G)^{=m}|$, we can determine -| L(G)≤n| by iteratively computing the matrices D1 to Dm, and applying -Lemma 23. This immediately gives the following corollary. -COROLLARY 24. For each n > 0 and SOA G, | L(G)≤n| can be computed in -time O(n|G|3 ). - -5. CORRECTION -In the conference version of this article [Bex et al. 2006] we proposed a different set of rewrite and repair rules for transforming SOAs into SOREs. 
While -those rewrite rules were claimed in Bex et al. [2006] to possess the analog of -Proposition 19 (namely that they always produce a SORE equivalent to the -input SOA, provided that such a SORE exists), this claim is false, as we will -detail next. Readers unfamiliar with Bex et al. [2006] may freely skip this -section without endangering comprehension of the rest of the article. -To illustrate why the preceding claim is false, the rewrite rules of Bex et al. -[2006] are given in Figure 10, where G∗ refers to the ε-closure of G, defined as -follows. -Definition 25. Let G = (V, E) be a generalized SOA. The ε-closure G∗ of G -is the graph (V, E∗ ) where E∗ contains: -—all edges of E; -—all edges (r, r) with r = s+ or r = s+ ?; -—all edges (r, s) for which there is a path from r to s in G that passes only -through intermediate nodes t with ε ∈ L(t). -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:27 - -Fig. 10. Set of rewrite rules introduced in the conference version of this article [Bex et al. 2006]. - -Figure 11 shows a sequence of rewrite steps using these rules starting from -the SOA recognizing (a + b)+ ? or, equivalently, (a? . b?)+ . Note that the second -rewrite step, which introduces b?, causes the automaton to become disconnected: because a? ∈ PredG∗ (b) and sink ∈ SuccG∗ (b) − {b} it deletes (a?, sink)— -the only edge linking src to sink. As such, the accepted language changes from -L((a + b)+ ?) to ∅. This clearly illustrates that the OPTIONAL r? rule in Figure 10 -is unsound. For that reason, we have moved in this article to the new rewrite -rules in Figures 4–6. -It is peculiar, however, that we have extensively used the rewrite rules of -Figures 10 together with the repair rules in Figure 13 in a prototype implementation but have never encountered a situation where: -ACM Transactions on Database Systems, Vol. 35, No. 
2, Article 11, Publication date: April 2010. - - 11:28 - -• - -G. J. Bex et al. - -Fig. 11. A problematic sequence of rewrite steps using the rules in Figure 10. The input SOA -accepts the same language as (a+b)+ ?, or, equivalently (a? . b?)+ . Note that the automaton resulting -from the second rewrite step is disconnected and hence accepts the empty language. Rewriting -is therefore not sound. - -Fig. 12. A successful sequence of rewrite steps using the rules in Figure 10. The input SOA accepts -the same language as (a + b)+ ?, or, equivalently (a? . b?)+ . - -—we obtained a SORE r that failed to accept at least all words in the input -SOA G; or -—we obtained a SORE r that accepted a strict superset of L(G) when G was -equivalent to a SORE. -We suspect that this behavior is due to the strict order in which we apply the -rewrite rules in our implementation: first CONCATENATION, then DISJUNCTION, -then SELF-LOOP, and finally OPTIONAL. To illustrate, Figure 12 shows a successful -rewriting of the SOA accepting (a + b)+ ? under this order. -The inference algorithm of Bex et al. [2006], which we shall call RWR0 in this -article, is shown in Algorithm 6. It is based on the rewrite rules in Figure 10 -and the repair rules in Figure 13. The experiments in Section 7 indicate that -RWR0 has no benefits over RWR and RWR2 . Moreover, as we do not have a formal -soundness and completeness proof showing that rewriting always produces a -SORE equivalent to the input SOA (provided that such a SORE exists) under -this order, it does not make much sense to consider RWR0 for the class of SOREs. -In strong contrast, on the class of k-occurrence regular expressions (k > 1), RWR0 -can make a difference over RWR and RWR2 [Bex et al.]. So even without formal -guarantees, RWR0 still has its merits. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:29 - -Algorithm 6. 
RWR0 -Input: a SOA G -Output: a SORE r -1: if sink is not reachable from src in G then -2: -return ∅ -3: else if E(G) = {(src, sink)} then -4: -return ε -5: else -6: -initialize done to false -7: -while not done do -8: -if there a rewrite rule in Figure 10 is applicable then -9: -rewrite G, giving precedence to CONCATENATION, then DISJUNCTION, then SELFLOOP, then OPTIONAL -10: -else if a repair rule in Figure 13 is applicable then -11: -repair G, giving precedence to ENABLE-DISJUNCTION, then ENABLE-OPTIONAL-1, -then ENABLE-OPTIONAL-2 -12: -else -13: -set done to true -14: -end if -15: -end while -16: -if G is final then -17: -return the corresponding regular expression r -18: -else -19: -return ∅ -20: -end if -21: end if - -6. INFERRING CHARES: CRX -In this section, we present the algorithm CRX for the inference of chain regular -expressions (CHAREs). -Definition 26 (CHAREs ). The class of chain regular expressions consists of -those SOREs of the form f1 · · · fn where every fi is a chain factor—an expression -of the form (a1 + · · · + ak), (a1 + · · · + ak)?, (a1 + · · · + ak)+ , or, (a1 + · · · + ak)+ ? with -k ≥ 1 and every ai is an alphabet symbol. -For instance, the expression a(b+c)+ ?d+ (e + f )? is a CHARE, while (ab+c)+ ? -and (a+ ? + b?)+ ? are not. -Since each CHARE is a concatenation of alphabet-disjoint chain factors, -every occurrence of an alphabet symbol in a word must be generated by the -same chain factor in the target CHARE. The positional relationships between -occurrences of alphabet symbols in a given sample then allow us to deduce -which chain factors are present in the target CHARE, and how they are ordered. -Example 27. Consider the sample S = {u, v, w} with u = abd, v = bcdee, -and w = cade. Clearly a occurs before b in u, b occurs before c in v, and c occurs -before a in w. In the target CHARE, therefore, a, b, and c must belong to the -same chain factor which can only be (a + b + c)+ or (a + b + c)+ ?. 
Since one of -{a, b, c} is present in every word of S, we choose (a + b + c)+ . Similarly, d and -e form chain factors by themselves. Whereas d occurs once in every word in S, -e can occur zero, one, or more times. Therefore, d is represented by the chain -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:30 - -• - -G. J. Bex et al. - -Fig. 13. Repair rules accompanying the rewrite rules in Figure 10. These rules are a correction -of the rules presented in Bex et al. [2006]. Repairs are tried in the order shown. In particular, -ENABLE-OPTIONAL-2 is only applied if none of the other rules is applicable. - -factor d, while e is represented by the chain factor e+ ?. Since a, b, c always occur -before d, which in turn always occurs before the e’s, the derived CHARE is then -(a + b + c)+ de+ ?. -So, in brief, CRX computes chain factors, orders them, and uses that order to -generate a CHARE. Of course, the order of the chain factors is not necessarily -linear. In that case, a linear order can be constructed by making the factors -optional. Some care has to be taken, however, to generate factors that are -disjunctions without repetitions. -Definition 28. Let S be a sample. We denote by → S the partial preorder on - such that a → S b if, and only if, a immediately precedes b in some w ∈ S. -(I.e., ab is a 2-gram of S.) We say that a occurs before b in S if a →∗S b, where -→∗S is the reflexive and transitive closure of → S. -For instance, Figure 14 illustrates → S when S = {abccde, cccad, bf egg, -bf ehi}. -Definition 29. Define a ≈ S b if a occurs before b in S and b occurs before a. -That is, a ≈ S b if a →∗S b and b →∗S a. -Clearly, ≈ S is an equivalence relation. Let  S denote the set of equivalence classes of ≈ S. In what follows, we denote such equivalence classes by, for -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. 
- - Inference of Concise Regular Expressions and DTDs - -• - -11:31 - -Fig. 14. The partial preorder → S for S = {abccde, cccad, bf egg, bf ehi}. - -Fig. 15. The Hasse diagram HS of the sample S = {abccde, cccad, bf egg, bf ehi}. The corresponding -partial preorder from which HS is derived is shown in Figure 14. - -example, [a1 , . . . , an]. As usual, an equivalence class of cardinality 1 is called a -singleton. -Definition 30. The Hasse diagram of S, denoted HS, is the graph over  S -in which there is an edge from equivalence class [a1 , . . . , an] to class [b1 , . . . , bm] -if: (1) [a1 , . . . , an] and [b1 , . . . , bm] are distinct and (2) there exists 1 ≤ i ≤ n and -1 ≤ j ≤ m such that ai → S b j . -For instance, the Hasse diagram of the sample S = {abccde, cccad, bf egg, -bf ehi} is shown in Figure 15. The operation of CRX is then shown in Algorithm 7 -and illustrated in the following example. -Example 31. Consider again the sample S = {abccde, cccad, bf egg, bf ehi} -and its corresponding Hasse diagram in Figure 15. Since Pred HS ([d]) = -Pred HS ([ f ]) and Succ HS ([d]) = Succ HS ([ f ]), line 3 applies to [d] and [ f ]. Although -Pred HS ([g]) = Pred HS ([h]), step 2 cannot be applied as Succ HS ([g]) = Succ HS ([h]). -Similarly [g] and [i] share successors, that is, ∅, but have different predecessors. -Hence, after the while loop in line 2 we obtain: - -A possible topological sort is [a, b, c], [d, f ], [e], [g], [h], [i]. Since at least one of -a, b, and c occurs once or more in every string of W, r([a, b, c]) = (a + b + c)+ is -the first factor; the second factor is (d + f ) since either d or f occurs exactly -once; the factor derived from [e] is e? since W contains a string without e -and similarly for those from [h] and [i]. Finally, g occurs multiple times in a -single string. Hence the simple regular expression derived by the algorithm is -(a + b + c)+ · (d + f ) · e? · g+ ? · h? · i? which completes step 6. 
-Note that the order of the chain factors in the CHARE depends on the -topological sort. -THEOREM 32. -L(S). - -Given a sample S, CRX computes a CHARE r such that S ⊆ - -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:32 - -• - -G. J. Bex et al. - -Algorithm 7. CRX -Input: a sample S -Output: a CHARE r such that S ⊆ L(r) -1: Compute the set  S of equivalence classes of ≈ S -2: while a maximal set of singleton nodes γ1 , . . . , γ such that Pred HS (γ1 ) = · · · = -Pred HS (γ ) and Succ HS (γ1 ) = · · · = Succ HS (γ ) exists do -3: -Replace γ1 , . . . , γ by γ := ∪j=1 γ j , and redirect all incoming and outgoing edges of -the γi to γ in HS -4: end while -5: Compute a topological sort γ1 , . . . , γk of the nodes -6: for all i ∈ {1, . . . , k} (γi = [a1 , . . . , an]) do -7: -if every w ∈ S contains exactly one occurrence of a symbol in {a1 , . . . , an} then -8: -r(γi ) := (a1 + · · · + an) -9: -else if every w ∈ S contains at most one occurrence of a symbol in {a1 , . . . , an} -then -10: -r(γi ) := (a1 + · · · + an)? -11: -else if every w ∈ S contains at least one of a1 , . . . , an and there is a word that -contains at least two occurrences of symbols then -12: -r(γi ) := (a1 + · · · + an)+ -13: -else -14: -r(γi ) := (a1 + · · · + an)+ ? -15: -end if -16: -return r(γ1 ) . r(γ2 ) . · · · . r(γk) -17: end for - -PROOF. The theorem follows almost immediately from the construction. -Clearly, CRX always outputs a CHARE. Moreover, observe that after step 5 -the computed topological sort is consistent with the order of the symbols in the -words in S. More precisely, there can not exist symbols a and b, such that a ∈ γi , -b ∈ γ j , i < j, and b →∗S a. Subsequently, for each γi a chain factor is chosen -in such a manner that it is consistent with all words w ∈ S. As these factors -are ordered consistently with the order of the symbols in S, this implies that -S ⊆ L(r). 
-Furthermore, on the class of CHAREs, CRX is complete. -THEOREM 33. -L(CRX(S)). - -For each CHARE r there is a sample S such that L(r) = - -PROOF. Denote by Sym(r) the set of alphabet symbols occurring in r. We also -abuse notation and, for a sample S, write Sym(S) to denote the set of alphabet -symbols occurring in S. Let r = f1 · · · fk be a CHARE, with each fi a chain -factor. We construct the sample S such that the CRX(S) is syntactically equal to -r, up to commutativity of +. The theorem then follows. -Thereto, for every 1 ≤ i ≤ k, let wi be a word in L( fi ). We construct S by -subsequently adding words to it. First, for all 1 ≤ i ≤ k − 1, a ∈ Sym( fi ), -b ∈ Sym( fi+1 ), we add w1 · · · wi−1 abwi+2 · · · wk to S. Further, for all 1 ≤ i ≤ k, -we add words to S, depending on the form of fi . Specifically, if fi is of the -form: -—(a1 + · · · + an), we add w1 · · · wi−1 a1 wi+1 · · · wk; -—(a1 + · · · + an)?, we add w1 · · · wi−1 wi+1 · · · wk, and w1 · · · wi−1 a1 wi+1 · · · wk; -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:33 - -—(a1 + · · · + an)+ , we add w1 · · · wi−1 a1 a1 wi+1 · · · wk; -—(a1 + · · · + an)+ ?, we add w1 · · · wi−1 wi+1 · · · wk, and w1 · · · wi−1 a1 a1 wi+1 · · · wk. -We now argue that given S, CRX indeed derives an expression syntactically -equal to r. First observe that already before step 3, CRX computes k nodes γ1 to -γk, which are linearly ordered, such that for each 1 ≤ i ≤ k, γi contains exactly -the alphabet symbols contained in fi . Then, due to the number of occurrences -of each symbol of the different chain factors, the algorithm will associate to -each γi exactly the factor fi , and hence CRX(S) is syntactically equivalent to r, -up to commutativity of +. -From Theorems 32 and 33 it readily follows that we have the next corollary. -COROLLARY 34. - -CRX learns the class of CHAREs from positive data. 
- -The experiments in Section 7.3 show that the number of words in S needed -in practice is very small. Actually, the prime feature that makes CRX much -more robust than RWR for very small datasets is its strong generalization ability. Indeed, consider an expression of the form (a1 + · · · + an)+ ?. While REWRITE -requires all n2 2-grams of the form ai a j for i, j ∈ {1, . . . , n} to be present, RWR -requires around (n2 − n) 2-grams. For CRX, however, the set {ε, a1 a2 , a2 a3 , . . . , -an−1 an, ana1 } of size O(n) will suffice. This point is illustrated in practice -by example3 and example4 in Table II where n has a value of 41 and 56, -respectively. Experiments illustrate that only 400  1682 and 500  3136 -2-grams are needed by CRX to learn example3 and example4, respectively. -The following theorem shows that CRX is optimal within the class of CHAREs -when the partial order  S is in fact a linear order. -THEOREM 35. For every sample S, if  S is a linear order then for every -CHARE r such that S ⊆ L(r) and L(r) ⊆ L(CRX(S)), we have r = CRX(S), that is, r -is syntactically equal to CRX(S) up to commutativity of +. -PROOF. Assume that CRX(S) = f1 · · · fk and r = g1 · · · gl . Clearly, -Sym(CRX(S)) = Sym(r) = Sym(S). We first argue that k = l. Thereto, assume -for the purpose of contradiction that k < l. Then, there is a chain factor f in -CRX(S) with a, b ∈ Sym( f ) and two chain factors g and g in r with a ∈ Sym(g) -and b ∈ Sym(g ). We distinguish two cases. -(1) If f is of the form (a1 + · · · + an) or (a1 + · · · + an)?, then L(r) ⊆ L(CRX(S)). -(2) If f is of the form (a1 + · · · + an)+ ? or (a1 + · · · + an)+ , by construction and -since  S is linearly ordered, there are words u1 , u2 ∈ S such that a →∗u1 b -and b →∗u2 a. However, since a and b are in different chain factors of r, -/ L(r) or u2 ∈ -/ L(r), and hence S ⊆ L(r). -either u1 ∈ -Conversely, assume k > l. 
Then, there are chain factors f, f in CRX(S) with -a ∈ Sym( f ) and b ∈ Sym( f ), and a chain factor g in r with a, b ∈ Sym(g). We -again distinguish two cases. -(1) If g is of the form (a1 + · · · + an)+ ? or (a1 + · · · + an)+ , then L(r) ⊆ L(CRX(S)). -(2) If g is of the form (a1 +· · ·+an) or (a1 +· · ·+an)?, by construction and since  S -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:34 - -• - -G. J. Bex et al. - -is linearly ordered, there are words u1 , . . . , um ∈ S, and symbols c1 , . . . , cm−1 -such that a →∗u1 c1 , cm →∗um b, and ci →ui+1 ci+1 , for all 1 ≤ i ≤ m − 1. -/ L(r) must -However, due to the form of g, for at least one of these ui , ui ∈ -hold and hence S ⊆ L(r). -Using the same kind of argument it can be shown that Sym( fi ) = Sym(gi ), -for all 1 ≤ i ≤ k. Further, since L(r) ⊆ L(CRX(S)), for every 1 ≤ i ≤ k, we -have L(gi ) ⊆ L( fi ). Since the different chain factors can only take a restricted -numbers of forms, it now suffices to show that L(gi ) = L( fi ), for all i, to show that -they are also syntactically equivalent. Hence, towards a contradiction, assume -L(gi )  L( fi ) for some 1 ≤ i ≤ k. This can only be the case if: (1) gi = (a1 +· · ·+an) -and fi = (a1 + · · · + an); (2) gi = (a1 + · · · + an)+ ? and fi = (a1 + · · · + an)+ ; or -(3) gi = (a1 + · · · an)? and fi is one of the three other forms. However, in each of -these cases, given the construction of the algorithm, one can find a word w ∈ S -such that w ∈ -/ L(r). Hence, for all i, L( fi ) = L(gi ), and thus r = CRX(S). -Note that this property does not hold when  S is not linear. For instance, on -S = {abc, ade, abe} CRX yields a·b?·d?·c?·e? whereas the CHARE a·(b+d)·(c +e) -is a better approximation of the target language. -CRX can be efficiently executed on very large datasets by only maintaining -HS and the multiplicities of occurrences of -symbols in words in S (needed for -lines 6–13). 
From this representation, lines 2–5 can be executed. Hence, it is -not necessary that the entire sample resides in main memory. The complexity -of the algorithm is O(m + n3 ), where m is the size of the sample and n the -number of alphabet symbols. -7. EXPERIMENTAL EVALUATION -In this section we validate our approach by means of experimental analysis. -Specifically, we assess the quality of the expressions returned by our algorithms -on real-world corpora and DTDs, and compare it with the quality of expressions -returned by XTRACT [Garofalakis et al. 2003] and Trang [Clark]. Next, we compare the quality of RWR0 (the algorithm found in the conference version of this -article), RWR, and RWR2 . Subsequently, we investigate the performance of the algorithms on incomplete and noisy data. Finally, we discuss their running time -performance. We abuse notation and simply write RWR for the application of -2T-INF followed by RWR, similarly for RWR0 and RWR2 . All experiments were performed using a prototype implementation of our algorithms in Java executed -on a 2.5 Ghz Pentium 4 machine with 1GB of RAM. -7.1 Real-World Examples -The number of publicly available XML corpora is rather limited. We employed -the XML Data repository maintained by Miklau [2002] as a testbed. Unfortunately, most of the corpora listed there are either very small, lack a DTD, -or contain a DTD with only trivial regular expressions. Nevertheless, two of -the listed corpora are interesting. Specifically, we compared XTRACT, RWR, and -CRX on the Protein Sequence Database (683Mb in size) and the Mondial corpus -[Miklau 2002], a database of information on various countries (1Mb in size). -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:35 - -Table I. 
Results of RWR, CRX and XTRACT on DTDs and Sample Data from -the Protein Description Database and the Mondial Corpora -Element -Original DTD -Sample -Result of CRX/ RWR -size -Result of XTRACT -ProteinE. -a1 a2 a3 a4 + ?a5 + ?a6 + ?a7 + ?a8 + ?a9 ?a10 ?a11 + ?a12 a13 -2458 -a1 a2 a3 a4 + a5 + ?a6 + ?a7 + ?a8 + ?a9 ?a10 ?a11 + ?a12 a13 -843 -an expression of 185 tokens -organism -a1 a2 ?a3 a4 ?a5 + ? -9 -a1 a2 ?a3 a4 ?a5 + ? -9 -a1 ((a2 a3 a4 ?+a3 a4 )a5 ?+a3 a5 + ?) -reference -a1 a2 + ?a3 + ?a4 + ? -45 -a1 a2 + ?a3 + ?a4 + ? -45 -a1 (a2 + ?(a4 + ?+a3 + ?)+a2 a3 + ?a4 a4 +a3 + ?a4 + ?) -refinfo -a1 a2 a3 ?a4 ?a5 a6 ?(a7 +a8 )?a9 ? -10 -a1 a2 (a3 +a4 )?a5 a6 ?a7 ?a9 ?a8 ? -10 -a1 a2 ((a3 a5 a6 a7 ?+a4 a5 )a9 ?+a5 (a7 +a8 )?+a4 a5 a8 ) -authors -a1 + +(a2 a3 ?) -54 -a1 + ?a2 ?a3 ? / -a1 + +(a2 a3 ) -54 -a1 + ?+a2 a3 -accinfo -a1 a2 + ?a3 + ?a4 ?a5 ?a6 ?a7 + ? -124 -a1 a2 + ?a3 + a4 ?a5 ?a6 ?a7 + ? -124 -an expression of 97 tokens -genetics -a1 + ?a2 ?a3 ?a4 ?a5 ?a6 ?a7 ?a8 ?a9 ?a10 ?a11 + ?a12 + ? -219 -a1 + ?a2 ?a3 ?a4 ?a5 ?a6 ?a7 ?a8 ?a9 ?a10 ?a12 + ? -219 -an expression of 329 tokens -function -a1 ?a2 + ?a3 + ? -26 -a1 ?a2 + ?a3 + ? -26 -(a1 (a2 ?a2 ?a3 + ?+a2 + ?(a3 a3 )+ ?+a2 a2 a2 a3 )+a2 (a2 a3 + ?+a3 + ?)) -city -a1 a2 + ?a3 + ? -9 -a1 a2 + ?a3 + ? -9 -a1 (a2 + ?a3 a3 ?+a2 (a3 + ?+a2 ))? -The left column gives element names, sample size for CRX/ RWR, and sample size for -XTRACT, respectively. The right column lists original DTD, inferred DTD by CRX/ RWR, -and the result of XTRACT, in that order. - -Since no real-world data could be obtained for SOREs that are not CHAREs, -we generated our own XML data for a number of real-world DTDs considered -in Bex et al. [2004] containing a number of sophisticated regular expressions -outside the class of CHAREs. -Real-world data. In this section, we only discuss RWR as RWR0 and RWR2 give -precisely the same results. 
Table I lists all nontrivial element definitions2 in -the aforementioned DTDs together with the results derived by the inference -algorithms RWR, CRX, and XTRACT. It is interesting to note that only the regular -expression for authors is not a CHARE. Moreover, no elements are repeated -in any of the definitions. This should not come as a surprise given the observations discussed in the Introduction on the content models occurring in practice. -The regular expression derived by the XTRACT algorithm is shown whenever -it fitted the table, otherwise the number of tokens it consists of is listed. For -better readability the actual output of XTRACT has been simplified by replacing -expressions such as (ai + ε) by ai ?. -2 It should be noted that the examples from the Mondial corpus are not valid according to their -DTD, so for the city element only valid elements were used as training examples. - -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:36 - -• - -G. J. Bex et al. - -It can be verified that all regular expressions in Table I are learned quite -satisfactory by RWR and CRX with respect to the examples extracted from the -XML corpus. The numbers in the first column refer to the size of the sample. -RWR and CRX always produce the same result except for authors where CRX -cannot derive the target expression as it is not a CHARE. We note that no -sample was representative of its target expression. As such, RWR always had to -apply repair rules. The expressions in the table indicate that the result of these -repairs are satisfactory. For a few expressions, for instance, ProteinE(ntry), -refinfo, and genetics, the expressions produced by CRX and RWR are more -strict than the corresponding one in the DTD. This is due to the data present -in the sample. For instance, for genetics, no a11 element occurs in the sample -so it obviously cannot be part of the derived expression. 
The element refinfo -illustrates that a3 and a4 are mutually exclusive in the sample and that a8 is -never followed by a9 . Inspecting the original DTD illustrates the underlying -semantics. -authors, citation, volume?, month?, year, -pages?, (title | description)?, xrefs? -Indeed, volume is used in the context of a journal, while month is used for a -conference publication. Apart from the authors element, XTRACT either produces -a suboptimal expression or no expression at all. For instance, XTRACT crashes on -the ProteinE(ntry) sample due to excessive memory consumption (more than -1GB of RAM). Reducing the size of the sample to approximately 800 unique -words yields a complex expression of 185 tokens. -Real-world regular expressions. Table II lists the results of the algorithms on -a number of more sophisticated regular expressions extracted from real-world -DTDs discussed in Bex et al. [2004]. Since no real-world data was available -for those DTDs, we have randomly generated samples using ToXgene [Barbosa -et al. 2002], taking care that all relevant examples were present to ensure -the target expression could be learned. Again, we list the sample size in the -first column. As some of these numbers might seem artificially large, we note -that, for instance, the SOA corresponding to example3 already contains 1897 -edges. Hence, a random dataset of 5741 words is not unreasonably large. Note -that only the first three expressions in Table II are SOREs; none of them -is a CHARE. The table shows clearly that CRX yields fairly good and concise -super-approximations to the original expressions. In some cases, the results -produced by RWR are more precise. For XTRACT, the size of the sample had to be -limited to 300–500 in order to avoid a crash. As can be seen from the table, -XTRACT performed excellently on the first example, but failed to generate an -expression that fitted the table in all other cases on all the sample sets we -tried. -Trang.
We ran Trang [Clark] on the XML data discussed in this section. -In all but one case, Trang produced exactly the same output as CRX, with a -notable exception: for example1 Trang’s output depends on the order in which -the examples are presented, yielding either a1 + ?a2 ?a3 + ? or a1 + + (a2 ?a3 + ). The -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:37 - -Table II. Results of RWR, CRX and XTRACT on -Nonsimple Real-World DTDs and Generated Data -Original DTD -Element -Result of CRX -Sample -Result of RWR -size -Result of XTRACT -example1 -a1 + + (a2 ?a3 + ) -48 -a1 + ?a2 ?a3 + ? -48 -a1 + + (a2 ?a3 + ) -48 -a1 + ? + (a2 ?a3 + ?) -example2 -(a1 a2 ?a3 ?)?a4 ?(a5 + · · · + a18 )+ ? -2210 -a1 ?a2 ?a3 ?a4 ?(a5 + · · · + a18 )+ ? -2210 -(a1 a2 ?a3 ?)?a4 ?(a5 + · · · + a18 )+ ? -300 -an expression of 252 tokens -example3 -a1 ?(a2 a3 ?)?(a4 + · · · + a44 )+ ?a45 + -5741 -a1 ?a2 ?a3 ?(a4 + · · · + a44 )+ ?a45 + -5741 -a1 ?(a2 a3 ?)?(a4 + · · · + a44 )+ ?a45 + -400 -an expression of 142 tokens -example4 a1 ?a2 a3 ?a4 ?(a5 + + ((a6 + · · · + a61 )+ a5 + ?)) -10000 -a1 ?a2 a3 ?a4 ?(a6 + · · · + a61 )+ ?a5 + ? -10000 -a1 ?a2 a3 ?a4 ?(a6 + · · · + a61 )+ ?a5 + ? -500 -an expression of 185 tokens -+ -example5 -a1 (a2 + a3 )+ ?(a4 (a2 + a3 + a5 )+ ?) ? -+ -1281 -a1 (a2 + a3 + a4 + a5 ) ? -+ -1281 -a1 ((a2 + a3 + a4 )+ a5 + ?) ? -500 -an expression of 85 tokens -The left column gives element names, sample size for CRX, -RWR and XTRACT, respectively. The right column lists original -DTD, inferred DTD by CRX, by RWR and the result of XTRACT, -in that order. - -former is the same output as CRX, the latter is the intended RE that cannot -be derived by CRX as it is outside the class of CHAREs. This inconsistency in -Trang’s output casts some doubt on its correctness and underscores the need -for a formal model as the cornerstone of an implementation. 
Indeed, there is no -article or manual available describing the machinery underlying Trang. A look -at the Java-code indicates that Trang is related to, but different from, CRX: it -uses 2T-INF to construct an automaton, eliminates cycles by merging all nodes -in the same strongly connected component, and then transforms the obtained -DAG into a regular expression. However, no target class of REs for which Trang -is complete, as is the case for CRX, is specified. As Trang is similar to CRX, it is -outperformed by RWR and RWR2 . -7.2 RWR versus RWR2 -We tested the results and performance of RWR versus RWR2 for various values -of the rank cut-off parameter . The SOAs used in this test were randomly -generated with 5 and 10 alphabet symbols. The results are summarized in -Table III(a). We computed the average language size of the SOAs, which is the -target size. It should be noted that since no SORE corresponds to these SOAs, -the target size can never be attained since the regular expression resulting -from RWR or RWR2 will necessarily be a generalization of the SOA’s language. -It is immediately clear from Table III(a) that results of RWR2 are on average -better than those for RWR, and that they improve with increasing values of . -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:38 - -• - -G. J. Bex et al. -Table III. -(a) -|| = 5 || = 10 -target size 0.52 -0.67 -0 - -RWR - -RWR - -0.88 -0.80 - -0.98 -0.96 - -0.76 -0.73 -0.725 -0.722 -0.721 -0.720 - -0.95 -0.92 -0.916 -0.911 -0.908 -N/A - -2 - -RWR - -1 -2 -3 -4 -5 -∞ - -(b) -RWR || = 5 || = 10 - -2 - -1 -2 -3 -4 -5 -∞ - -28.8% -7.6% -3.2% -1.3% -0.7% -24.6% - -46.3% -7.3% -1.2% -0.0% -0.0% -N/A - -(a) Average language size for RWR and RWR2 for various values of -.  = ∞ denotes an exhaustive exploration of all possible repairs. -(b) Percentage of target expressions for which RWR outperforms RWR2 . 
- -For expressions of alphabet size 5, we were able to consider all possible repairs, -resulting in the entry for ℓ = ∞ in Table III(a). This represents the smallest -language that includes the SOA’s language and that can be expressed by a -SORE. -Of course, the results in Table III(a) are averaged over 1000 randomly chosen -SOAs. A more detailed analysis reveals that for a considerable number of SOAs, -RWR actually outperforms RWR2 for ℓ = 1. Table III(b) shows the number of -times RWR outperforms RWR2 for various values of ℓ. The probability that RWR -outperforms RWR2 drops rapidly for increasing values of ℓ, especially for larger -alphabet sizes. The last line in Table III(b) lists the probability that RWR derives -the optimal result, that is, that the smallest language representable by a SORE -is obtained for expressions of alphabet size 5. -Although the RWR2 algorithm clearly outperforms RWR in terms of the language size of the derived expression, there is a compelling argument in the -latter’s favor. In terms of running time, RWR outperforms RWR2 by a few orders of magnitude as is discussed in Section 7.5. -7.3 Incomplete Data -Unfortunately, in a real-world setting an available sample may simply contain -too little information to learn the target regular expression. To formalize this, -we introduce the notion of coverage. -Definition 36. A sample S covers a deterministic automaton A if for every -edge (s, t) in A there is a word w ∈ S whose unique accepting run in A traverses (s, t). Such a word w is called a witness for (s, t). A sample S covers a -deterministic regular expression r if it covers the automaton obtained from r -using the Glushkov construction for translating regular expressions into automata [Brüggeman-Klein 1993]. -If a sample S does not contain a witness for an edge, it may seem as if -the target expression cannot be learned, even if it is a SORE since the SOA -derived from the data has an edge missing.
However, the repair rules introduce -extra edges, so this part of the algorithm may actually alleviate the problem of -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:39 - -Table IV. Percentage of -Successfully Derived Expressions -at Various Values of Sample -Coverage for CRX, RWR0 , RWR and -2 -1 - -RWR - -coverage CRX RWR0 RWR RWR21 -25.0 -85% 56% 12% 73% -35.0 -87% 48% 32% 73% -45.0 -96% 60% 57% 74% -55.0 -87% 58% 63% 57% -65.0 -82% 48% 58% 59% -75.0 -80% 51% 51% 63% -85.0 -63% 48% 47% 53% -92.5 -57% 48% 47% 61% -97.5 -85% 74% 64% 73% -100.0 -100% 100% 100% 100% - -incomplete data. This is indeed confirmed experimentally. It turns out that even -with a substantial fraction of missing witnesses, the target regular expression -can be learned with an astonishing degree of success. To quantify the missing -information, we introduce the following definition: -Definition 37. The coverage of a sample with respect to a target expression -r is the ratio of the number of edges of the SOA derived from the sample and -the SOA representing the target expression r. -The tests were done on 100 real-world regular expressions of alphabet sizes -up to 10, for 10 independently selected samples of varying coverage. The results are presented in Table IV. The straightforward CRX clearly outperforms all -other algorithms, although this result should be approached with some caution: -to give CRX a fair chance, the target expressions for this algorithm were limited -to CHAREs, while the other algorithms were tested on general SOREs as well. -Note that approximately 90% of real-world expressions are in fact CHAREs, -hence its superior performance is not only due to simpler target expressions. -The robustness of RWR21 is quite remarkable since it tends to derive more specific -regular expressions than RWR0 and RWR. 
One would expect the generalization -ability to decrease for algorithms that yield more specific results. This expectation is borne out when one compares RWR0 and RWR; however, RWR21 ’s greedy -application of the repair rules seems to pay off in the context of incomplete data -as well. -7.4 Noise -As already noted in the Introduction, real-world samples (such as XHTML) -need not be valid with respect to their known schema. Errors crop up due to -all sorts of circumstances. This underscores the need for a robust inference -algorithm that can handle some noise in the input sample. -Noise can come in several forms. To generate a noisy subsample, we modify -the target expression either by replacing a symbol by a different one from the -target’s expression, or by replacing it by a symbol that is not in the alphabet of -the target expression. We then use the modified target expression to generate -a complete sample. We define the noise level as follows. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:40 - -• - -G. J. Bex et al. - -Definition 38. Given a target expression r, the noise level of a sample S is -the ratio |S − L(r)|/|S|. -Here we propose an approach to filter the sample S based on the probability -of its words being generated by a probabilistic automaton, as we already used -in previous work [Bex et al. 2008]. This probabilistic automaton has one state -for each alphabet symbol, and the transition probabilities are computed using -the Baum-Welch algorithm [Rabiner 1989]. Given the probabilistic automaton, -it is straightforward to compute the probability for each w ∈ S, so that one can -rank the sample’s words. One expects words that contain noise, that is, that -would be rejected by the target regular expression, to have low probability if -their number is not excessively large compared to the sample’s size.
-To filter the sample, hoping to exclude those words that contain noise, we -compute the mean μ and standard deviation σ of the sample’s probabilities. A -string w ∈ S with probability P(w) is excluded if P(w) < μ − ασ . The factor α -is a parameter of the algorithm. The filtered sample S is now used to derive -a regular expression. It is of course possible that in the generation of S some -words needed to derive the target expression were removed. Hence there is no -guarantee that the derived regular expression will be an overapproximation of -the target expression. -Since it was shown in previous sections that RWR21 has the best overall performance, we focus solely on this algorithm in this section. In order to investigate -how robust RWR21 is with respect to noise we applied the algorithm to samples S -with increasing noise levels with a range of values for the cut-off α. We compute -the precision and the recall for each individual expression and use the average -values over many expressions to compute the F-value for a given noise level -and cut-off so that the optimal cut-off point can be determined. -To define precision and recall, consider the sample S = Svalid ∪ Sinvalid , where -Svalid ⊆ S contains the words in S accepted by the target expression and Sinvalid -contains the words in S not accepted by the target expression. A true positive is -a word in Svalid that is accepted by the derived expression, while a false negative -is a word in Svalid that is rejected by the derived expression. Similarly, a false -positive is a word in Sinvalid that is accepted by the derived expression, while a -true negative is a word in Sinvalid that is rejected by the derived expression. We -denote by St.p. the set of true positives, by St.n. the set of true negatives, by Sf .p. -the set of false positives, and by Sf .n. the set of false negatives. -Definition 39. The precision p, recall r, and F-value of a derived regular -expression on a sample S are given by -$$p = \frac{|S_{t.p.}|}{|S_{t.p.}| + |S_{f.p.}|}, \qquad r = \frac{|S_{t.p.}|}{|S_{t.p.}| + |S_{f.n.}|}, \qquad F = \frac{2pr}{p + r}.$$
- -Furthermore, we are interested in the fraction of derived regular expressions -that is equivalent to the target expression. -We average over 580 SOREs obtained from a corpus of real-world DTDs. -The results are shown in Figure 16(a). From the F-value we can conclude -that a cut-off value α F ≈ 0.7 yields the best balance between precision and -recall. Figure 16(b) shows the fraction of derived regular expressions that is -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:41 - -Fig. 16. (a) F-value as a function of the cut-off value α for noise levels of 0.01 (squares), 0.02 -(circles), and 0.05 (triangles). (b) Fraction of derived expressions equivalent to the target expression -as a function of the cut-off value α for noise levels of 0.01 (squares), 0.02 (circles), and 0.05 -(triangles). - -equivalent to the target expression. For noise levels increasing from 0.01 to -0.05, the F-value as well as the percentage of derived expressions equivalent -to the target expression gradually decreases, as is to be expected. It should be -noted that recall r < 1 implies that the language represented by the derived -regular expression is not a superset of the target’s language. For the cut-off α F , -and a noise level of 0.01, approximately 16% of the derived regular expressions -allow false negatives, while the value for a noise level of 0.05 is 15%. The fact -that the derived expression is not a super-approximation may or may not be -acceptable, depending on the application. -Another interesting observation is that the number of derived expressions -that is equivalent to the target expression increases beyond the cut-off value -α F ; see Figure 16(b). For a noise level of 0.01, this trend continues up to -cut-off values of αequiv.
≈ 0.3 where it reaches a maximum of approximately -53%. However, at this value 20% of the derived regular expressions are not -super-approximations to their target expressions. For α < αequiv. , the F-value -decreases rapidly. For higher noise levels, the optimal cut-off value αequiv. is -smaller, but since it is very unlikely that one knows the noise level, it is hard -to take advantage of this fact by tuning αequiv. to a specific noise level. The -overall best result will be obtained for αequiv. ≈ 0 for noise levels not exceeding -0.05. -It should be noted that for a noise level of 0.01 at αequiv. , out the 53% of derived -regular expression that are equivalent to the target expression, about 7% is -not covered by the sample. The latter illustrates once more the generalization -ability of the algorithms RWR2 as was discussed in Section 7.3. -7.5 Performance -As mentioned previously, the one advantage RWR has over RWR2 is that the -former’s running time is much lower than the latter’s. This is illustrated in -Table V(a) for 1000 target expressions of alphabet size 10. It also shows the -relative running time for RWR0 , illustrating that RWR outperforms both RWR0 and -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:42 - -• - -G. J. Bex et al. -Table V. -(a) -relative running time -0 -RWR -6 · 102 -2 - -RWR - -1 -2 -3 -4 -5 - -2 · 102 -2 · 103 -1 · 104 -4 · 104 -1 · 105 - -(b) -|| time (ms) -5 -2 -10 -5 -15 -15 -20 -33 -50 -616 -100 -7562 - -(a) Relative running times of RWR2 versus RWR for various -values of . (b) Average running times in milliseconds for RWR -as a function of alphabet size. - -2 -2 -RWR for any value of . However, it is interesting to note that RWR1 outperforms -0 -RWR by a factor of 3, and derives more specific regular expressions, again -illustrating the superiority of the new algorithms over RWR0 . 
- -The performance of RWR is excellent: on average it takes only 5 ms to derive -an expression of alphabet size 10. Table V(b) shows actual running times as a -function of the target expressions’ alphabet size, averaged over 1000 random -expressions of that alphabet size. -With respect to the performance in terms of the number of examples, we -showed in the conference version of this article that RWR0 ’s performance was adequate to -deal with large datasets. Example4 with 61 symbols in Table II is derived from -10000 example words in 7 seconds while CRX only needs 3.2 seconds. More -typical expressions of about 10 symbols derived from a few hundred examples -take approximately a second. These figures include the time to initialize a -Java Virtual Machine while the tests are done on a 2.5 GHz P4 with 1GB -of RAM. Given that RWR and RWR21 outperform RWR0 and the time required to -start the virtual machine and parse the data is independent of the algorithm, -our new algorithms are adequate as well. For instance, RWR derived a DTD -for PubMed from 10000 articles with a total size of over 1.2GB in 264 seconds -(again including the time needed for Java initialization and parsing of the XML -data). Trang slightly outperforms CRX thanks to very efficient XML parsing. We -did not make a detailed comparison with XTRACT for the reason that XTRACT -cannot handle samples with more than 1000 words. -8. EXTENSIONS -Incremental computation. Especially in the setting of sparse data when over -time more XML data gets generated, for instance, by answers to queries or -results of calls to Web services, it is desirable to update an already generated -schema based on the newly arrived XML data only. Such an approach is possible -for both RWR and CRX: as both algorithms make use of an internal representation -(automata or partial orders), we only need to update that representation.
So, for -every element name we store the corresponding internal graph representation, -which is only quadratic in the number of different element names, and we can -forget about the XML data that generated it. Actually, for CRX, to assign the -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:43 - -qualifiers ?, + and ∗, we also need to remember for each element name how -it occurs (always exactly once, always more than once, . . . ), but this is only a -constant amount of information. -Numerical predicates. An immediate drawback of SOREs is that they cannot count. For instance, they cannot express aabb+ specifying that a string -should start with two a’s followed by any number of b’s larger than 1. XML -Schema even uses dedicated attributes for expressing the desired number of -repetitions. - - - - - -In the same way, REs can be extended by numerical predicates: when r is -an RE and i is a natural number then r ≥i and r =i are also REs. They are -semantically equivalent to r i r ∗ and r i , respectively, where r i = r · r · · · · · r (i -times). The preceding expression can then be expressed as a=2 b≥2 . To both RWR -and CRX a post-processing step can be added that rewrites + and ∗ to numerical -values based on exact occurrences of element names in the XML data. -Generation of XSDs. While the inference of DTDs essentially reduces to the -inference of regular expressions from sets of sample words (as illustrated in -Section 1.1), the inference of XSDs is much more complex. -Indeed, first and foremost, the content model of an element can only depend -on the element’s name in a DTD. XML Schema, in contrast, has a typing -mechanism that allows the content model of an element to depend not only on -its name, but also on the context in which it is used. We refer the interested -reader to Martens et al. 
[2006, 2007] for an in-depth discussion on the XML -Schema typing mechanism and the extra expressive power that it provides with -respect to DTDs. It is important to note, however, that the study of Martens -et al. [2006] also shows that 85% of XSDs in practice does not use this additional -power, and are hence structurally equivalent to a DTD. Obviously, inferring -such XSDs is merely a matter of using the correct syntax. How to extend -schema inference to deal with real XSDs that do use the additional power of -the XML Schema typing system is studied in a companion article [Bex et al. -2007]. -Second, DTDs have essentially only one atomic data type to describe the -textual data found in XML documents: #PCDATA. XML Schema, in contrast, has -atomic data types for numbers, strings, dates, etc. The algorithms described -here can easily be extended with heuristics to recognize these atomic data -types, such as the ones described by Hegewald et al. [2006]. -Inference of k-OREs. As the vast majority of expressions used in practical -schemas are SOREs, we focused in this article on the inference of SOREs. In -a companion article [Bex et al. 2008] we study the derivation of k-OREs, for -small values of k, thus covering virtually all expressions occurring in practice. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:44 - -• - -G. J. Bex et al. - -9. CONCLUSION -We introduced novel algorithms for the inference of concise regular expressions -from positive data. For the inference of SOREs, RWR2 was shown to yield the best -experimental results. It is also quite robust when presented with incomplete -and noisy data. The quality of inferred expressions on real-world and synthetic -datasets outperforms those returned by XTRACT where CRX is similar to Trang. -CRX’ generalization ability makes it highly qualified in dealing with very small -datasets. 
Further, RWR, RWR2 , and CRX always infer succinct expressions by definition which can easily be interpreted by humans. Of independent interest, we -introduced a new algorithm to transform automata into short, readable regular -expressions. -ELECTRONIC APPENDIX -The electronic appendix for this article can be accessed in the ACM Digital -Library. -ACKNOWLEDGMENTS - -We thank the authors of Garofalakis et al. [2003] for making available -XTRACT’s source code, as well as Wouter Gelade for comments on a previous draft of this article. -REFERENCES -ABITEBOUL, S., BUNEMAN, P., AND SUCIU, D. 1999. Data on the Web. Morgan Kaufmann Publishers. -AHONEN, H. 1996. Generating grammars for structured documents using grammatical inference methods. Ph.D. thesis, Report A-1996-4. Department of Computer Science, University of -Helsinki. -ANGLUIN, D. AND SMITH, C. H. 1983. Inductive inference: Theory and methods. ACM Comput. -Surv. 15, 3, 237–269. -BARBOSA, D., MENDELZON, A. O., KEENLEYSIDE, J., AND LYONS, K. A. 2002. ToXgene: An extensible -template-based data generator for XML. In Proceedings of the 5th International Workshop on the -Web and Databases (WebDB 2002). 49–54. -BARBOSA, D., MIGNET, L., AND VELTRI, P. 2006. Studying the XML web: Gathering statistics from -an XML sample. World Wide Web 9, 2, 187–212. -BENEDIKT, M., FAN, W., AND GEERTS, F. 2008. XPath satisfiability in the presence of DTDs. J. -ACM 55, 2, 1–79. -BERNSTEIN, P. A. 2003. Applying model management to classical meta data problems. In Online -Proceedings of the 1st Biennal Conference on Innovative Data Systems Research (CIDR’03). -BEX, G. J., GELADE, W., NEVEN, F., AND VANSUMMEREN, S. Learning deterministic regular expressions -for the inference of schemas from XML data. http://arxiv.org/abs/1004.2372. -BEX, G. J., GELADE, W., NEVEN, F., AND VANSUMMEREN, S. 2008. Learning deterministic regular -expressions for the inference of schemas from XML data. 
In Proceedings of the 17th International -Conference on World Wide Web (WWW’08). 825–834. -BEX, G. J., NEVEN, F., AND DEN BUSSCHE, J. V. 2004. DTDs versus XML Schema: A practical study. -In Proceedings of the International Workshop on Web and Databases (WebDB). S. Amer-Yahia and -L. Gravano, Eds. 79–84. -BEX, G. J., NEVEN, F., SCHWENTICK, T., AND TUYLS, K. 2006. Inference of concise DTDs from XML -data. In Proceedings of the International Conference on Very Large Data Bases (VLDB). U. Dayal, K.-Y. -Whang, D. B. Lomet, G. Alonso, G. M. Lohman, M. L. Kersten, S. K. Cha, and Y.-K. Kim, Eds. -ACM, 115–126. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:45 - -BEX, G. J., NEVEN, F., AND VANSUMMEREN, S. 2007. Inferring XML schema definitions from XML -data. In Proceedings of the 33rd International Conference on Very Large Data Bases (VLDB’07). -998–1009. -BRĀZMA, A. 1993. Efficient identification of regular expressions from representative examples. In -Proceedings of the 6th Annual Conference on Computational Learning Theory (COLT’93). ACM -Press, 236–242. -BRÜGGEMANN-KLEIN, A. 1993. Regular expressions into finite automata. Theor. Comput. Sci. 120, 2, -197–213. -BRÜGGEMANN-KLEIN, A. AND WOOD, D. 1998. One-Unambiguous regular languages. Inform. Comput. 140, 2, 229–253. -BUNEMAN, P., DAVIDSON, S. B., FERNANDEZ, M. F., AND SUCIU, D. 1997. Adding structure to unstructured data. In Proceedings of the International Conference on Database Theory (ICDT’97). -Lecture Notes in Computer Science, vol. 1186. Springer, 336–350. -CARON, P. AND ZIADI, D. 2000. Characterization of Glushkov automata. Theor. Comput. Sci. 233, 1– -2, 75–90. -Castor. The Castor project. www.castor.org. -CHIDLOVSKII, B. 2001. Schema extraction from XML: A grammatical inference approach. In -Proceedings of the 8th International Workshop on Knowledge Representation meets Databases -(KRDB’01).
CEUR Workshop Proceedings, vol. 45. -CLARK, J. Trang: Multi-Format schema converter based on RELAX NG. -www.thaiopensource.com/relaxng/trang.html. -COVER, R. 2003. The Cover Pages. xml.coverpages.org. -DELGADO, M. AND MORAIS, J. 2004. Approximation to the smallest regular expression for a given -regular language. In Proceedings of the 9th International Conference on Implementation and -Application of Automata. Lecture Notes in Computer Science, vol. 3317. Springer, 312–314. -DEUTSCH, A., FERNANDEZ, M. F., AND SUCIU, D. 1999. Storing semistructured data with STORED. -In Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM -Press, 431–442. -EHRENFEUCHT, A. AND ZEIGER, P. 1976. Complexity measures for regular expressions. J. Comput. -Syst. Sci. 12, 134–146. -FERNANDEZ, M. F. AND SUCIU, D. 1998. Optimizing regular path expressions using graph schemas. -In Proceedings of the 14th International Conference on Data Engineering (ICDE’98). 14– -23. -FERNAU, H. 2004. Extracting minimum length document type definitions is NP-hard. In Proceedings of the 7th International Colloquium on Grammatical Inference: Algorithms and Applications. -Lecture Notes in Artificial Intelligence, vol. 3264. Springer, 277–278. -FERNAU, H. 2009. Algorithms for learning regular expressions from positive data. Inform. Comput. 207, 4, 521–541. -FLORESCU, D. 2005. Managing semi-structured data. ACM Queue 3, 8, 18–24. -GARCÍA, P. AND VIDAL, E. 1990. Inference of k-testable languages in the strict sense and application -to syntactic pattern recognition. IEEE Trans. Patt. Anal. Mach. Intell. 12, 9, 920–925. -GAROFALAKIS, M., GIONIS, A., RASTOGI, R., SESHADRI, S., AND SHIM, K. 2003. XTRACT: Learning -document type descriptors from XML document collections. Data Mining Knowl. Discov. 7, 23– -56. -GELADE, W. AND NEVEN, F. 2008. Succinctness of the complement and intersection of regular -expressions.
In Proceedings of the 25th Annual Symposium on Theoretical Aspects of Computer -Science (STACS’08). Dagstuhl Seminar Proceedings, vol. 08001. 325–336. -GOLD, E. 1967. Language identification in the limit. Inform. Control 10, 5, 447–474. -GOLDMAN, R. AND WIDOM, J. 1997. DataGuides: Enabling query formulation and optimization in -semistructured databases. In Proceedings of the 23rd International Conference on Very Large -Data Bases (VLDB’97). 436–445. -GRUBER, H. AND HOLZER, M. 2008. Finite automata, digraph connectivity, and regular expression size. In Proceedings of the 35th International Colloquium on Automata, Languages and -Programming. Lecture Notes in Computer Science, vol. 5126. Springer, 39–50. -HAN, Y.-S. AND WOOD, D. 2007. Obtaining shorter regular expressions from finite-state automata. -Theor. Comput. Sci. 370, 1–3, 110–120. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - 11:46 - -• - -G. J. Bex et al. - -HEGEWALD, J., NAUMANN, F., AND WEIS, M. 2006. XStruct: Efficient schema extraction from multiple and large XML documents. In Proceedings of the 22nd International Conference on Data -Engineering Workshops (ICDEW’06). IEEE Computer Society, 81–97. -HINKELMAN, S. 2005. Business integration—Information conformance statements (BI-ICS). Tech. -rep., IBM DeveloperWorks. -HOPCROFT, J. AND ULLMAN, J. 1979. Introduction to Automata Theory, Languages and computation. -Addison-Wesley. -HUET, G. 1980. Confluent reductions: Abstract properties and applications to term rewriting -systems. J. ACM 27, 4, 797–821. -KOCH, C., SCHERZINGER, S., SCHWEIKARDT, N., AND STEGMAIER, B. 2004. Schema-Based scheduling of -event processors and buffer minimization for queries on structured data streams. In Proceedings -of the 30th International Conference on Very Large Data Bases (VLDB’04). 228–239. -MANOLESCU, I., FLORESCU, D., AND KOSSMANN, D. 2001. Answering XML queries on heterogeneous data sources. 
In Proceedings of the 27th International Conference on Very Large Data Bases -(VLDB’01). 241–250. -MARTENS, W., NEVEN, F., AND SCHWENTICK, T. 2007. Simple off the shelf abstractions for XML -schema. SIGMOD Rec. 36, 3, 15–22. -MARTENS, W., NEVEN, F., SCHWENTICK, T., AND BEX, G. J. 2006. Expressiveness and complexity of -XML schema. ACM Trans. Data. Syst. 31, 3. -MCHUGH, J., ABITEBOUL, S., GOLDMAN, R., QUASS, D., AND WIDOM, J. 1997. Lore: A database management system for semistructured data. SIGMOD Rec. 26, 3, 54–66. -MELNIK, S. 2004. Generic model management: Concepts and algorithms. Ph.D. thesis, University -of Leipzig. -MIGNET, L., BARBOSA, D., AND VELTRI, P. 2003. The XML web: A first study. In Proceedings of the -12th International World Wide Web Conference. 500–510. -MIKLAU, G. 2002. XMLData repository. www.cs.washington.edu/research/xmldatasets. -MIN, J.-K., AHN, J.-Y., AND CHUNG, C.-W. 2003. Efficient extraction of schemas for XML documents. -Inform. Process. Lett. 85, 1, 7–12. -NESTOROV, S., ABITEBOUL, S., AND MOTWANI, R. 1998. Extracting schema from semistructured data. -In Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM -Press, 295–306. -NESTOROV, S., ULLMAN, J. D., WIENER, J. L., AND CHAWATHE, S. S. 1997. Representative objects: Concise representations of semistructured, hierarchical data. In Proceedings of the 13th International -Conference on Data Engineering. IEEE Computer Society, 79–90. -NEVEN, F. AND SCHWENTICK, T. 2006. On the complexity of XPath containment in the presence of -disjunction, DTDs, and variables. Logical Methods Comput. Sci. 2, 3. -NGU, A. H. H., ROCCO, D., CRITCHLOW, T., AND BUTTLER, D. 2005. Automatic discovery and inferencing of complex bioinformatics web interfaces. World Wide Web 8, 4, 463–493. -OAKS, P. AND TER HOFSTEDE, A. H. M. 2007. Guided interaction: A mechanism to enable ad hoc -service interaction. Inform. Syst. Frontiers 9, 1, 29–51. -OHLEBUSCH, E. 2001.
Implementing conditional term rewriting by graph rewriting. Theor. Comput. Sci. 262, 1, 311–331. -OPEN WEB APPLICATION SECURITY PROJECT CONSORTIUM. 2004. The top ten most critical web application security vulnerabilities—2004 update. www.owasp.org. -PITT, L. 1989. Inductive inference, DFAs, and computational complexity. In Proceedings of the -International Workshop on Analogical and Inductive Inference (AII’89). Springer-Verlag, 18– -44. -RABINER, L. 1989. A tutorial on hidden Markov models and selected applications in speech -recognition. Proc. IEEE 77, 2, 257–286. -RAHM, E. AND BERNSTEIN, P. A. 2001. A survey of approaches to automatic schema matching. -VLDB J. 10, 4, 334–350. -SAHUGUET, A. 2000. Everything you ever wanted to know about DTDs, but were afraid to ask -(extended abstract). In Proceedings of the 3rd International Workshop on The World Wide Web -and Databases, (WebDB’00), Selected Papers. 171–183. -SAKAKIBARA, Y. 1997. Recent advances of grammatical inference. Theor. Comput. Sci. 185, 1, -15–45. -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - Inference of Concise Regular Expressions and DTDs - -• - -11:47 - -SANKEY, J. AND WONG, R. K. 2001. Structural inference for semistructured data. In Proceedings of -the International Conference on Information and Knowledge Management. ACM Press, 159–166. -Sun. Sun JAXB. java.sun.com/webservices/jaxb. -THOMPSON, H. S., BEECH, D., MALONEY, M., AND MENDELSOHN, N. 2004. XML Schema part 1: Structures 2nd Ed. World Wide Web Consortium, Recommendation REC-xmlschema-1-20041028. -W3C. 2002. XHTML 1.0 The Extensible HyperText Markup Language, 2nd Ed. W3C. -WANG, G., LIU, M., YU, J. X., SUN, B., YU, G., LV, J., AND LU, H. 2003. Effective schema-based XML -query optimization techniques. In Proceedings of the 7th International Database Engineering -and Applications Symposium. 230–235. 
-Received January 2009; revised July 2009; accepted November 2009 - -ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010. - - \ No newline at end of file diff --git a/tests/test_bex.py b/tests/test_bex.py index ad62471..a00eabe 100644 --- a/tests/test_bex.py +++ b/tests/test_bex.py @@ -1,8 +1,5 @@ """Tests for BEX paper algorithm implementations.""" -import sys -sys.path.insert(0, '/home/tobi/Desktop/kesai/ProjectManagement/companyweb') - from bex.soa import SOA from bex.twotinf import build_soa from bex.rwr0 import rwr0 @@ -273,7 +270,7 @@ def run_all(): # ── Integration tests with real Ansible task data ── -def test_integration_quartz_deploy(): +def test_integration_linear_sequence(): """Simple linear sequence — all tasks always in same order.""" seqs = [ ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'], @@ -283,11 +280,11 @@ def test_integration_quartz_deploy(): result = crx.infer(seqs) assert result is not None assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for']) - print(f" PASS quartz_deploy: {result}") + print(f" PASS linear_sequence: {result}") -def test_integration_validate_system(): - """Optional shell tasks.""" +def test_integration_optional_tasks(): + """Optional tasks — some sequences have more of the same.""" seqs = [ ['shell', 'debug', 'shell', 'debug'], ['shell', 'debug', 'shell', 'debug', 'shell', 'debug'], @@ -297,11 +294,11 @@ def test_integration_validate_system(): result = crx.infer(seqs) assert result is not None assert 'shell' in result and 'debug' in result - print(f" PASS validate_system: {result}") + print(f" PASS optional_tasks: {result}") -def test_integration_docker_detect_branch(): - """Branching: docker compose v2 check or v1 fallback.""" +def test_integration_branching_paths(): + """Branching: one path or an alternative.""" seqs = [ ['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'], 
['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'], @@ -310,11 +307,11 @@ def test_integration_docker_detect_branch(): result = crx.infer(seqs) assert result is not None assert 'file' in result and 'template' in result and 'shell' in result - print(f" PASS docker_detect: {result}") + print(f" PASS branching_paths: {result}") -def test_integration_firewall_gating(): - """Conditional firewall rule sequence (gated).""" +def test_integration_conditional_tasks(): + """Tasks that sometimes appear, sometimes not.""" seqs = [ ['assert', 'file', 'template', 'shell', 'wait_for'], ['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'], @@ -324,7 +321,7 @@ def test_integration_firewall_gating(): result = crx.infer(seqs) assert result is not None assert 'assert' in result and 'file' in result - print(f" PASS firewall_gating: {result}") + print(f" PASS conditional_tasks: {result}") def test_integration_idregex_linear(): @@ -361,8 +358,8 @@ def test_integration_ikoa_linear(): print(f" PASS ikoa_linear: {expr}") -def test_integration_backup_restic(): - """Sequence with loop (systemd enable).""" +def test_integration_looping_tasks(): + """Sequence with loop (repeated tasks).""" seqs = [ ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'], ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'], @@ -370,7 +367,7 @@ def test_integration_backup_restic(): crx = CRX() result = crx.infer(seqs) assert result is not None - print(f" PASS backup_restic: {result}") + print(f" PASS looping_tasks: {result}") def run_all():