From 6bf7a681ce114672fde815dab2a8703809fb1a73 Mon Sep 17 00:00:00 2001
From: tobjend <tobend85@gmail.com>
Date: Wed, 1 Jul 2026 11:28:42 +0200
Subject: [PATCH] purge make_charts.py, examples/, full-text papers, blog_post
 (moved to ~/Desktop/kesai/); translate German CLI to English

---
 bex/cli.py                 |   50 +-
 blog_post.md               |  263 ----
 examples/role_grammar.py   |  111 --
 examples/yaml_to_seq.py    |   81 --
 make_charts.py             |   71 -
 papers/README.md           |    6 +
 papers/paper_arxiv2010.txt | 2210 --------------------------------
 papers/paper_tods2010.txt  | 2492 ------------------------------------
 tests/test_bex.py          |   31 +-
 9 files changed, 45 insertions(+), 5270 deletions(-)
 delete mode 100644 blog_post.md
 delete mode 100644 examples/role_grammar.py
 delete mode 100644 examples/yaml_to_seq.py
 delete mode 100644 make_charts.py
 create mode 100644 papers/README.md
 delete mode 100644 papers/paper_arxiv2010.txt
 delete mode 100644 papers/paper_tods2010.txt

diff --git a/bex/cli.py b/bex/cli.py
index f69d530..7d60f67 100644
--- a/bex/cli.py
+++ b/bex/cli.py
@@ -19,7 +19,7 @@ from .ilocal import iLocal, extract_contexts_from_file, reduce_contexts
 
 
 def find_yaml_files(directory):
-    """Findet alle YAML-Dateien in einem Verzeichnis (rekursiv)."""
+    """Find all YAML files in a directory (recursive)."""
     patterns = ['**/*.yml', '**/*.yaml']
     files = []
     for pattern in patterns:
@@ -32,37 +32,37 @@ def main():
         description='bex — BEX-based YAML Grammar Inference',
     )
     parser.add_argument('--dir', type=str, default='roles/',
-                        help='Verzeichnis mit YAML-Dateien (default: roles/)')
+                        help='Directory with YAML files (default: roles/)')
     parser.add_argument('--k-max', type=int, default=5,
-                        help='Max k für k-ORE-Inferenz (default: 5)')
+                        help='Max k for k-ORE inference (default: 5)')
     parser.add_argument('--context', type=str, default=None,
-                        help='Auf spezifischen Container-Key beschränken (z.B. tasks)')
+                        help='Restrict to specific container key (e.g. tasks)')
     parser.add_argument('--output', type=str, default=None,
-                        help='Output-Datei für Template (default: stdout)')
+                        help='Output file for template (default: stdout)')
     parser.add_argument('--ilocal', action='store_true',
-                        help='iLocal-Kontextanalyse durchführen')
+                        help='Run iLocal context analysis')
     parser.add_argument('--crx', action='store_true',
-                        help='CRX (direct CHARE inference) verwenden')
+                        help='Use CRX (direct CHARE inference)')
     parser.add_argument('--verbose', '-v', action='store_true',
-                        help='Ausführliche Ausgabe')
+                        help='Verbose output')
     parser.add_argument('--stats', action='store_true',
-                        help='Zeige Token-Statistiken')
+                        help='Show token statistics')
 
     args = parser.parse_args()
 
     if not os.path.isdir(args.dir):
-        print(f"Fehler: Verzeichnis '{args.dir}' nicht gefunden.", file=sys.stderr)
+        print(f"Error: directory '{args.dir}' not found.", file=sys.stderr)
         sys.exit(1)
 
     yaml_files = find_yaml_files(args.dir)
     if not yaml_files:
-        print(f"Keine YAML-Dateien in '{args.dir}' gefunden.", file=sys.stderr)
+        print(f"No YAML files found in '{args.dir}'.", file=sys.stderr)
         sys.exit(1)
 
-    print(f"Gefundene YAML-Dateien: {len(yaml_files)}", file=sys.stderr)
+    print(f"Found YAML files: {len(yaml_files)}", file=sys.stderr)
 
     if args.ilocal:
-        print("\n=== iLocal: Kontext-Extraktion ===", file=sys.stderr)
+        print("\n=== iLocal: Context Extraction ===", file=sys.stderr)
         all_contexts = {}
         for f in yaml_files:
             contexts = extract_contexts_from_file(f)
@@ -72,11 +72,11 @@ def main():
                 all_contexts[ctx].extend(seqs)
 
         reduced = reduce_contexts(all_contexts)
-        print(f"  Kontexte gefunden: {len(reduced)}", file=sys.stderr)
+        print(f"  Contexts found: {len(reduced)}", file=sys.stderr)
         for ctx, seqs in sorted(reduced.items()):
             lengths = [len(s) for s in seqs]
-            print(f"    {ctx}: {len(seqs)} Sequenzen, "
-                  f"Längen {min(lengths)}-{max(lengths)}, "
+            print(f"    {ctx}: {len(seqs)} sequences, "
+                  f"lengths {min(lengths)}-{max(lengths)}, "
                   f"unique_seqs={len(set(tuple(s) for s in seqs))}",
                   file=sys.stderr)
 
@@ -94,30 +94,30 @@ def main():
                     print(f"  {os.path.relpath(f)}: {seq}", file=sys.stderr)
         except Exception as e:
             if args.verbose:
-                print(f"  Fehler in {f}: {e}", file=sys.stderr)
+                print(f"  Error in {f}: {e}", file=sys.stderr)
 
     if not all_sequences:
-        print("Keine Sequenzen extrahiert.", file=sys.stderr)
+        print("No sequences extracted.", file=sys.stderr)
         sys.exit(1)
 
-    print(f"  Sequenzen extrahiert: {len(all_sequences)}", file=sys.stderr)
+    print(f"  Sequences extracted: {len(all_sequences)}", file=sys.stderr)
     lengths = [len(s) for s in all_sequences]
-    print(f"  Längen: min={min(lengths)}, max={max(lengths)}, "
+    print(f"  Lengths: min={min(lengths)}, max={max(lengths)}, "
           f"avg={sum(lengths)/len(lengths):.1f}", file=sys.stderr)
 
     if args.stats:
         stats = tokenizer.get_statistics()
-        print("\n=== Token-Statistiken ===", file=sys.stderr)
+        print("\n=== Token Statistics ===", file=sys.stderr)
         for token, count in list(stats.items())[:30]:
             print(f"  {token}: {count}", file=sys.stderr)
 
-    print("\n=== k-ORE Inferenz ===", file=sys.stderr)
+    print("\n=== k-ORE Inference ===", file=sys.stderr)
     kore = kOREInference(k_max=args.k_max)
 
     if args.crx:
         result = kore.infer_with_crx(all_sequences)
         _, expr, method = result
-        print(f"  Methode: {method}", file=sys.stderr)
+        print(f"  Method: {method}", file=sys.stderr)
     else:
         result = kore.infer(all_sequences)
         if result:
@@ -127,7 +127,7 @@ def main():
             expr = "∅"
             print("  Kein Ergebnis", file=sys.stderr)
 
-    print(f"  Inferierter Ausdruck: {expr}", file=sys.stderr)
+    print(f"  Inferred expression: {expr}", file=sys.stderr)
 
     print("\n=== One-Shot Template ===", file=sys.stderr)
     print(file=sys.stderr)
@@ -136,7 +136,7 @@ def main():
     if args.output:
         with open(args.output, 'w') as f:
             f.write(template)
-        print(f"Template geschrieben nach: {args.output}", file=sys.stderr)
+        print(f"Template written to: {args.output}", file=sys.stderr)
     else:
         print(template)
 
diff --git a/blog_post.md b/blog_post.md
deleted file mode 100644
index 954e266..0000000
--- a/blog_post.md
+++ /dev/null
@@ -1,263 +0,0 @@
-# Dervish: Discovering Unwritten Conventions with Grammar Inference
-
-<p align="left"><img src="dervish-logo.png" alt="Dervish" width="180"></p>
-
-**How we turned 36 Ansible roles into a 200-character grammar — and why
-it matters for LLM agents.**
-
-## The problem
-
-Every codebase has unwritten conventions. Your team's Docker Compose
-files always put `image` before `ports` before `volumes`. Your Ansible
-deploy roles always start with `assert`, then `file`, then `template`.
-Your CI pipelines always run `lint` before `test` before `deploy`.
-
-Nobody writes these down. They're emergent — copied from role to role,
-file to file, until they become a tacit standard.
-
-When an LLM agent needs to generate new content that follows these
-conventions, you have two options:
-
-1. **Stuff every existing file into context** — 36 deploy roles = 15,000
-   tokens. You'll hit the context window on your third example.
-2. **Give it one or two examples and hope** — the LLM will guess the
-   pattern, and it will often guess wrong.
-
-Neither is good. The first is wasteful. The second is unreliable.
-
-What you really want is the **compiled convention** — the minimal
-description of what all 36 roles share, expressed in ~200 tokens. An
-LLM can follow a rule in 200 tokens far more reliably than it can
-infer a pattern from 36 examples.
-
-This is grammar inference.
-
-## The approach
-
-Given a set of example sequences over some alphabet (e.g., Ansible
-module names, Docker Compose keys, CI job names), learn a regular
-expression that describes the general pattern.
-
-We implemented two algorithms from Bex et al., a pair of papers from
-TODS 2010 and arXiv 2010:
-
-- **CRX** (TODS 2010 §6): A single-pass algorithm that builds a
-  predecessor relation over symbols, computes equivalence classes,
-  and emits a Chain Regular Expression (CHARE) that matches ALL
-  input sequences. Fast, deterministic, captures the full vocabulary.
-
-- **iDRegEx** (arXiv 2010): A probabilistic algorithm using k-testable
-  Observation Automata (k-OA) trained with Baum-Welch EM. It finds
-  only the *minimal common core* — the symbols that appear in every
-  example. Robust against noise, but fails (returns ∅) when the
-  examples are too diverse.
-
-Both run in the **ensemble**: CRX produces a permissive grammar (full
-vocabulary, many optional parts), iDRegEx produces a strict grammar
-(minimal core). A Minimum Description Length (MDL) score picks the
-winner: the grammar that compresses the data best.
-
-## The algorithms, briefly
-
-### CRX — Chain Regular Expression inference
-
-CRX (Algorithm 7, TODS 2010) works in four steps:
-
-1. **Build the immediate-predecessor relation.** For every adjacent
-   pair (x, y) across all sequences, record that x precedes y. If
-   symbol `assert` always appears before `file`, record
-   `assert → file`.
-
-2. **Compute equivalence classes.** Take the reflexive-transitive
-   closure of the predecessor relation. The strongly connected
-   components are *equivalence classes* — groups of symbols that can
-   appear in the same position. If `copy` and `template` both follow
-   `file` and precede `command`, they're in the same class.
-
-3. **Merge singleton classes.** A class with one symbol that shares
-   the same predecessor/successor sets as another singleton class
-   gets merged. This handles symbols that always appear in the
-   same structural position.
-
-4. **Topological sort.** The equivalence classes are sorted by their
-   position in the Hasse diagram of the predecessor relation. Each
-   class becomes a factor in the output, annotated with a quantifier:
-   - `+` (one or more) if the class forms a cycle
-   - `+?` (zero or more) if the class appears variably
-   - `?` (optional) if the class can be absent
-   - (exact) if the class always appears exactly once
-
-The result is a CHARE: a sequence of factors where each factor is a
-disjunction of equivalent symbols with a quantifier.
-
-### iDRegEx — k-optimal regular expression inference
-
-iDRegEx (Algorithm 4, arXiv 2010) uses a probabilistic automaton:
-
-1. **Build a complete k-OA.** A k-testable Observation Automaton
-   records all k-grams (subsequences of length k) from the input
-   sequences. The automaton's states represent (k-1)-grams.
-
-2. **Train with Baum-Welch.** EM iterations assign probabilities to
-   transitions, learning which paths through the automaton are most
-   likely given the data.
-
-3. **Disambiguate.** Remove nondeterministic transitions — for any
-   state and symbol, keep only the most probable next state.
-
-4. **Prune.** Remove low-probability edges and unreachable states,
-   leaving only the most likely paths.
-
-5. **Extract with rwr².** The REWRITE-SQUARED algorithm (rwr²,
-   Algorithm 3) collapses the pruned automaton into a k-optimal
-   regular expression — the minimal common core.
-
-### MDL scoring — picking the right level of specificity
-
-The Minimum Description Length principle (Rissanen 1978) says: the
-best grammar is the one that minimizes the sum of its own size and
-the cost of encoding the data using it.
-
-```
-MDL = model_cost + data_cost
-```
-
-**model_cost** = the number of alphabet symbol occurrences in the
-grammar. A grammar with 5 unique symbols used once each has
-model_cost = 5.
-
-**data_cost** = Σ log₂(|L(r)|) across all sequences, where |L(r)| is
-the number of strings of length len(s) that the grammar accepts.
-A grammar like `(a+b+c+...+z)+` accepts 19 possible symbols at each
-position, so for a sequence of length 120, the data cost is
-120 × log₂(19) ≈ 510 bits. A grammar like `a.b.c.d.e` accepts only
-1 string of length 5, so data cost is 0.
-
-The ensemble picks the grammar with the lowest total MDL. This
-automatically balances specificity against coverage: a grammar that
-matches only 1 sequence but does so perfectly (low data cost) can
-beat a grammar that matches all sequences but is extremely permissive
-(high data cost).
-
-## The results
-
-### Ansible deploy roles — 36 roles from companyweb
-
-Your own deploy roles cover everything from AdGuard Home to
-Woodpecker CI. They have NO schema — each is a free-form script.
-
-```
-Grammar: docker_volume+?.group?.docker_container?.user?.apt?.npm?.
-         (assert+...+command+copy+file+template+set_fact+...+wait_for)+?.
-         (cron+firewalld)?
-Match:   36/36
-MDL:     2186.28
-```
-
-Bottleneck analysis: optional docker setup (volume, group, container,
-user, apt, npm), then a large disjunction of ~25 task modules (one or
-more), then optional cron/firewalld at the end. This captures the
-convention precisely.
-
-**Compression: 36 roles (15,000 tokens) → 200 tokens (75×)**
-
-### Geerlingguy Galaxy roles — 15 popular roles
-
-Jeff Geerling's roles are the most popular on Ansible Galaxy. He has
-never documented their structural pattern. Yet every one of the 15
-follows the same arc:
-
-```
-Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+.
-         include+?.(npm+pip)+?.lineinfile?
-Match:   15/15
-MDL:     596.64
-```
-
-Check prerequisites, OS-specific variables, install packages,
-configure with templates, start services, optionally run sub-tasks,
-install npm/pip packages, and optionally tweak config lines.
-
-**This is the first explicit description of the geerlingguy role
-module ordering convention.** It took 15 roles and a grammar inference
-algorithm to write it down.
-
-**Compression: 15 roles (5,000 tokens) → 60 tokens (83×)**
-
-### Ensemble dynamics
-
-The ensemble (CRX + iDRegEx + MDL) selects different winners
-depending on the data:
-
-| Dataset | Winner | Why |
-|---------|--------|-----|
-| Ansible galaxy (15 roles) | CRX | iDRegEx returned ∅ (too diverse) |
-| Helm prom-stack (6 configs) | **iDRegEx** | Finds minimal core across all configs |
-| Terraform modules (8) | CRX | iDRegEx returned ∅ (no common core across domains) |
-| Terraform modules (8) | CRX | Every resource type optional across domains |
-| GitHub Actions Go lint (6) | CRX | Tight pattern, all match |
-
-iDRegEx wins when the data has a clear common core. CRX wins when
-there's no single shared subsequence (the roles share the *vocabulary*
-but not the *order*).
-
-## The MCP
-
-The engine is exposed as an MCP server:
-
-```python
-from bex.mcp_server import infer_best_grammar
-
-# Full coverage
-output = infer_best_grammar(
-    sequences=role_sequences,
-    prefer="crx",
-)
-# Returns:
-#   Best: CRX (MDL 288)
-#   Grammar: fail?.(include_vars+set_fact+package+file+template+service+...)+
-#            .include+?.(npm+pip)+?.lineinfile?
-
-# Ensemble — let MDL pick
-output = infer_best_grammar(sequences=role_sequences)
-```
-
-An agent workflow:
-
-1. Agent needs to write an Ansible role
-2. Finds 15 existing geerlingguy roles, extracts their task module sequences
-3. Calls `infer_best_grammar(sequences=..., prefer='crx')`
-4. Gets back the grammar in ~60 tokens
-5. Generates a new role that follows the structural pattern
-
-Without the MCP: 15 role files in context (5,000 tokens), or guesswork.
-With the MCP: one grammar rule (~60 tokens), known to match 15/15 roles.
-
-## What it means
-
-Grammar inference turns **examples** into **rules**. The rule is a
-compressed description of the structural convention — and for
-schema-less content like the geerlingguy role module ordering, this is
-the *first time* the convention has been written down at all.
-
-For LLM agents, this changes the trade-off between context and
-accuracy. Instead of flooding the context window with examples, the
-agent can call the MCP, get the rule in ~60 tokens, and follow it.
-The rule is more reliable than guessing from examples, and it costs
-less than the first example would have.
-
-The algorithm doesn't need to understand what a deploy role does. It
-doesn't know that `file` creates directories and `template` renders
-Jinja2. It only needs to see 36 sequences of module names and find
-the pattern they all share. The structural convention is in the data
-— you just have to extract it.
-
-## References
-
-- Bex, G. J., Gelade, W., Neven, F., & Vansummeren, S. (2010).
-  [*Learning Deterministic Regular Expressions for the Web.*](https://doi.org/10.1145/1806907.1806911) TODS 2010.
-- Bex, G. J., Gelade, W., Martens, W., & Neven, F. (2010).
-  [*Simplifying XML Schema: Single-Type Approximations of Regular
-  Expressions.*](https://arxiv.org/abs/1004.2372) arXiv:1004.2372.
-- Rissanen, J. (1978). *Modeling by shortest data description.*
-  Automatica 14(5).
diff --git a/examples/role_grammar.py b/examples/role_grammar.py
deleted file mode 100644
index 79c2fe8..0000000
--- a/examples/role_grammar.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""Extract Ansible role task module sequences and learn per-group grammars."""
-
-from pathlib import Path
-import yaml
-from collections import defaultdict
-
-from .crx import CRX
-from .expr import strip_k
-
-
-IGNORE_MODULES = frozenset({'name', 'tags', 'when', 'register', 'no_log',
-                            'changed_when', 'failed_when', 'ignore_errors',
-                            'run_once', 'delegate_to', 'loop', 'loop_control',
-                            'until', 'retries', 'delay', 'poll', 'async',
-                            'become', 'become_user', 'become_flags',
-                            'check_mode', 'diff', 'environment',
-                            'vars', 'notify', 'args',
-                            'block', 'rescue', 'always', 'include_tasks'})
-
-
-def extract_module_name(task):
-    """Extract the Ansible module name from a task dict.
-
-    The module is the key that is NOT a known non-module key.
-    Returns 'skip' for non-task entries like block/rescue/always.
-    """
-    if not isinstance(task, dict):
-        return None
-    # Check for block/rescue/always — these contain nested tasks
-    for key in ('block', 'rescue', 'always'):
-        if key in task:
-            nested = task[key]
-            if isinstance(nested, list):
-                return [extract_module_name(t) for t in nested]
-            return None
-    # Find the module key (not name, not meta-keys)
-    for key, value in task.items():
-        if key in ('name',):
-            continue
-        if key in IGNORE_MODULES:
-            continue
-        if isinstance(value, (dict, list, str, bool, int, float)):
-            # It's the module name (venv or fqcn)
-            return strip_k(key)
-    return None
-
-
-def flatten_nested(seq):
-    """Flatten nested lists into a single list."""
-    result = []
-    for item in seq:
-        if isinstance(item, list):
-            result.extend(flatten_nested(item))
-        elif item is not None and item != 'skip':
-            result.append(item)
-    return result
-
-
-def get_role_category(role_name):
-    """Extract category from role name like deploy_foo → deploy."""
-    parts = role_name.split('_')
-    if len(parts) >= 2:
-        return parts[0]
-    return 'other'
-
-
-def load_role_module_sequence(role_dir):
-    """Load a role's task file and extract the module sequence."""
-    task_file = role_dir / 'tasks' / 'main.yml'
-    if not task_file.exists():
-        return None, None
-    with open(task_file) as f:
-        data = yaml.safe_load(f)
-    if not isinstance(data, list):
-        return None, None
-
-    modules = []
-    for task in data:
-        result = extract_module_name(task)
-        if isinstance(result, list):
-            modules.extend(flatten_nested(result))
-        elif result is not None:
-            modules.append(result)
-
-    return role_dir.name, modules
-
-
-def collect_all_role_sequences(roles_dir='roles'):
-    """Collect module sequences from all roles, grouped by category."""
-    by_category = defaultdict(list)
-    all_roles = []
-    for role_dir in sorted(Path(roles_dir).glob('*/tasks/main.yml')):
-        role_name = role_dir.parent.parent.name
-        name, seq = load_role_module_sequence(role_dir.parent.parent)
-        if seq:
-            cat = get_role_category(role_name)
-            by_category[cat].append((role_name, seq))
-            all_roles.append((role_name, seq))
-    return all_roles, by_category
-
-
-def learn_grammar(sequences):
-    """Run CRX on a list of sequences."""
-    if len(sequences) < 2:
-        seqs = [sequences[0]] if sequences else []
-    else:
-        seqs = sequences
-    if not seqs:
-        return 'ε'
-    crx = CRX()
-    return crx.infer(seqs)
diff --git a/examples/yaml_to_seq.py b/examples/yaml_to_seq.py
deleted file mode 100644
index f8937b0..0000000
--- a/examples/yaml_to_seq.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""Convert YAML files to key-path sequences for BEX grammar inference."""
-
-from pathlib import Path
-import yaml
-
-
-def yaml_to_keypath_sequence(data, prefix=""):
-    """Convert parsed YAML data to a sequence of key paths (DFS traversal).
-
-    Each leaf (scalar) emits its full key path as a symbol.
-    Lists use a generic `[]` marker (no indices).
-    Values are NOT included — only key paths.
-    """
-    seq = []
-    if isinstance(data, dict):
-        for key, value in data.items():
-            path = f"{prefix}.{key}" if prefix else key
-            if isinstance(value, (dict, list)):
-                seq.extend(yaml_to_keypath_sequence(value, path))
-            else:
-                seq.append(path)
-    elif isinstance(data, list):
-        for item in data:
-            list_prefix = f"{prefix}[]" if prefix else "[]"
-            if isinstance(item, (dict, list)):
-                seq.extend(yaml_to_keypath_sequence(item, list_prefix))
-            else:
-                seq.append(list_prefix)
-    return seq
-
-
-def yaml_file_to_sequence(filepath):
-    """Load a YAML file and convert to a key-path sequence."""
-    with open(filepath) as f:
-        data = yaml.safe_load(f)
-    if data is None:
-        return []
-    return yaml_to_keypath_sequence(data)
-
-
-def is_vault_file(filepath):
-    """Check if a file is an Ansible vault file (encrypted)."""
-    try:
-        with open(filepath) as f:
-            first = f.read(100)
-            return '$ANSIBLE_VAULT' in first or first.startswith('!vault |')
-    except Exception:
-        return False
-
-
-def collect_all_sequences(root_dir=".", include_vault=False):
-    """Collect key-path sequences from all YAML files.
-
-    Returns:
-        list of (filepath, sequence) tuples.
-    """
-    results = []
-    for path in sorted(Path(root_dir).rglob("*.yml")):
-        parts = path.parts
-        if any(d in parts for d in ('node_modules', '.venv', '__pycache__', '.git')):
-            continue
-        skippable = ('vault.yml' in path.name or 'vault' in path.name)
-        if not include_vault and (skippable or is_vault_file(path)):
-            continue
-        try:
-            seq = yaml_file_to_sequence(path)
-            if seq:
-                results.append((path, seq))
-        except Exception as e:
-            print(f"  SKIP {path}: {e}")
-    return results
-
-
-def sequences_to_crx(result_list):
-    """Run CRX on collected sequences."""
-    from .crx import CRX
-    sequences = [seq for _, seq in result_list]
-    if not sequences:
-        return 'ε'
-    crx = CRX()
-    return crx.infer(sequences)
diff --git a/make_charts.py b/make_charts.py
deleted file mode 100644
index 1553311..0000000
--- a/make_charts.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-
-plt.xkcd(scale=0.7, length=60, randomness=2)
-
-FIG_W = 8
-FIG_H = 5
-
-# ── Chart 1: Context cost vs examples ──
-fig1, ax1 = plt.subplots(figsize=(FIG_W, FIG_H))
-
-N = [1, 5, 15, 36]
-raw = [100, 500, 1500, 3600]  # ~100 tokens/example
-dervish = [40, 60, 60, 200]   # grammar grows only when diversity grows
-
-x = np.arange(len(N))
-w = 0.35
-
-bars1 = ax1.bar(x - w/2, raw, w, label='Raw examples', color='#e74c3c', alpha=0.85)
-bars2 = ax1.bar(x + w/2, dervish, w, label='Dervish grammar', color='#3498db', alpha=0.85)
-
-ax1.set_xticks(x)
-ax1.set_xticklabels([f'{n} examples' for n in N])
-ax1.set_ylabel('Tokens needed in context')
-ax1.set_title('Context cost: raw examples vs Dervish grammar')
-ax1.legend(frameon=False)
-
-for bar in bars1:
-    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 80,
-             f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=9)
-for bar in bars2:
-    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 80,
-             f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=9)
-
-ax1.set_ylim(0, 4500)
-fig1.tight_layout()
-fig1.savefig('chart_context_cost.png', dpi=200)
-plt.close(fig1)
-
-# ── Chart 2: Tokens — Without vs With Dervish (per dataset) ──
-fig2, ax2 = plt.subplots(figsize=(FIG_W, FIG_H))
-
-datasets = ['Ansible Galaxy\n(15 roles)', 'Helm\n(6 configs)', 'Go lint\n(6 jobs)']
-without = [5000, 3000, 900]
-with_derv = [60, 40, 30]
-ratios = [f'{int(w/d)}×' for w, d in zip(without, with_derv)]
-
-x2 = np.arange(len(datasets))
-w2 = 0.3
-
-bw = ax2.bar(x2 - w2/2, without, w2, label='Without Dervish', color='#e74c3c', alpha=0.85)
-bd = ax2.bar(x2 + w2/2, with_derv, w2, label='With Dervish', color='#3498db', alpha=0.85)
-
-ax2.set_xticks(x2)
-ax2.set_xticklabels(datasets)
-ax2.set_ylabel('Tokens')
-ax2.set_title('Token savings per dataset')
-ax2.legend(frameon=False)
-ax2.set_yscale('log')
-ax2.set_ylim(5, 30000)
-
-# Label compression ratios
-for i, (r, wbar, dbar) in enumerate(zip(ratios, bw, bd)):
-    ax2.text(x2[i], without[i] * 1.3, r, ha='center', va='bottom', fontsize=11, fontweight='bold',
-             bbox=dict(boxstyle='round,pad=0.2', facecolor='white', edgecolor='gray', alpha=0.8))
-
-fig2.tight_layout()
-fig2.savefig('chart_token_savings.png', dpi=200)
-plt.close(fig2)
-
-print("Charts saved: chart_context_cost.png, chart_token_savings.png")
diff --git a/papers/README.md b/papers/README.md
new file mode 100644
index 0000000..69eb14d
--- /dev/null
+++ b/papers/README.md
@@ -0,0 +1,6 @@
+# Papers
+
+The Dervish algorithms are based on two papers by Bex et al.:
+
+- **CRX** — [*Learning Deterministic Regular Expressions for the Web*](https://doi.org/10.1145/1806907.1806911) (TODS 2010)
+- **iDRegEx** — [*Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data*](https://arxiv.org/abs/1004.2372) (arXiv:1004.2372)
diff --git a/papers/paper_arxiv2010.txt b/papers/paper_arxiv2010.txt
deleted file mode 100644
index 7e8e0af..0000000
--- a/papers/paper_arxiv2010.txt
+++ /dev/null
@@ -1,2210 +0,0 @@
-arXiv:1004.2372v1 [cs.DB] 14 Apr 2010
-
-Learning Deterministic Regular Expressions for the
-Inference of Schemas from XML Data
-GEERT JAN BEX, WOUTER GELADE, FRANK NEVEN
-Hasselt University and Transnational University of Limburg
-and
-STIJN VANSUMMEREN
-Université Libre de Bruxelles
-
-Inferring an appropriate DTD or XML Schema Definition (XSD) for a given collection of XML
-documents essentially reduces to learning deterministic regular expressions from sets of positive
-example words. Unfortunately, there is no algorithm capable of learning the complete class of
-deterministic regular expressions from positive examples only, as we will show. The regular expressions occurring in practical DTDs and XSDs, however, are such that every alphabet symbol
-occurs only a small number of times. As such, in practice it suffices to learn the subclass of
-deterministic regular expressions in which each alphabet symbol occurs at most k times, for some
-small k. We refer to such expressions as k-occurrence regular expressions (k-OREs for short).
-Motivated by this observation, we provide a probabilistic algorithm that learns k-OREs for increasing values of k, and selects the deterministic one that best describes the sample based on a
-Minimum Description Length argument. The effectiveness of the method is empirically validated
-both on real world and synthetic data. Furthermore, the method is shown to be conservative over
-the simpler classes of expressions considered in previous work.
-Categories and Subject Descriptors: F.4.3 [Mathematical Logic and Formal Languages]:
-Formal Languages; I.2.6 [Artificial Intelligence]: Learning; I.7.2 [Document and Text Processing]: Document Preparation
-General Terms: Algorithms, Languages, Theory
-Additional Key Words and Phrases: regular expressions, schema inference, XML
-
-1.
-
-INTRODUCTION
-
-Recent studies stipulate that schemas accompanying collections of XML documents
-are sparse and erroneous in practice. Indeed, Barbosa et al. [2005] and Mignet et al.
-[2003] have shown that approximately half of the XML documents available on the
-web do not refer to a schema. In addition, Bex et al. [2004] and Martens et al.
-[2006] have noted that about two-thirds of XML Schema Definitions (XSDs) gathered from schema repositories and from the web at large are not valid with respect
-to the W3C XML Schema specification [Thompson et al. 2001], rendering them
-A preliminary version of this article appeared in the 17th International World Wide Web Conference (WWW 2008).
-Permission to make digital/hard copy of all or part of this material without fee for personal
-or classroom use provided that the copies are not made or distributed for profit or commercial
-advantage, the ACM copyright/server notice, the title of the publication, and its date appear, and
-notice is given that copying is by permission of the ACM, Inc. To copy otherwise, to republish,
-to post on servers, or to redistribute to lists requires prior specific permission and/or a fee.
-c 2024 ACM 0000-0000/2024/0000-0001 $5.00
-ACM Journal Name, Vol. V, No. N, November 2024, Pages 1–31.
-
-2
-
-·
-
-Geert Jan Bex et al.
-<!ELEMENT store (order∗ , stock)>
-<!ELEMENT order (customer, item+ )>
-<!ELEMENT customer (first, last, email∗ )>
-<!ELEMENT item (id, price + (qty, (supplier + item+ )))>
-<!ELEMENT stock (item∗ )>
-<!ELEMENT supplier (first, last, email∗ )>
-Fig. 1.
-
-An example DTD.
-
-essentially useless for immedidate application. A similar observation was made by
-Sahuguet [2000] concerning Document Type Definitions (DTDs). Nevertheless, the
-presence of a schema strongly facilitates optimization of XML processing (cf., e.g.,
-[Benedikt et al. 2005; Che et al. 2006; Du et al. 2004; Freire et al. 2002; Koch et al.
-2004; Manolescu et al. 2001; Neven and Schwentick 2006]) and various software
-development tools such as Castor [cas ] and SUN’s JAXB [jax ] rely on schemas
-as well to perform object-relational mappings for persistence. Additionally, the
-existence of schemas is imperative when integrating (meta) data through schema
-matching [Rahm and Bernstein 2001] and in the area of generic model management [Bernstein 2003].
-Based on the above described benefits of schemas and their unavailability in
-practice, it is essential to devise algorithms that can infer a DTD or XSD for a
-given collection of XML documents when none, or no syntactically correct one, is
-present. This is also acknowledged by Florescu [2005] who emphasizes that in the
-context of data integration
-“We need to extract good-quality schemas automatically from existing
-data and perform incremental maintenance of the generated schemas.”
-As illustrated in Figure 1, a DTD is essentially a mapping d from element names
-to regular expressions over element names. An XML document is valid with respect
-to the DTD if for every occurrence of an element name e in the document, the
-word formed by its children belongs to the language of the corresponding regular
-expression d(e). For instance, the DTD in Figure 1 requires each store element
-to have zero or more order children, which must be followed by a stock element.
-Likewise, each order must have a customer child, which must be followed by one
-or more item elements.
-To infer a DTD from a corpus of XML documents C it hence suffices to look,
-for each element name e that occurs in a document in C, at the set of element
-name words that occur below e in C, and to infer from this set the corresponding
-regular expression d(e). As such, the inference of DTDs reduces to the inference
-of regular expressions from sets of positive example words. To illustrate, from the
-words id price, id qty supplier, and id qty item item appearing under <item>
-elements in a sample XML corpus, we could derive the rule
-item → (id, price + (qty, (supplier + item+ ))).
-Although XSDs are more expressive than DTDs, and although XSD inference is
-therefore more involved than DTD inference, derivation of regular expressions remains one of the main building blocks on which XSD inference algorithms are built.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-In fact, apart from also inferring atomic data types, systems like Trang [Clark ] and
-XStruct [Hegewald et al. 2006] simply infer DTDs in XSD syntax. The more recent
-iXSD algorithm [Bex et al. 2007] does infer true XSD schemas by first deriving a
-regular expression for every context in which an element name appears, where the
-context is determined by the path from the root to that element, and subsequently
-reduces the number of contexts by merging similar ones.
-So, the effectiveness of DTD or XSD schema inference algorithms is strongly
-determined by the accuracy of the employed regular expression inference method.
-The present article presents a method to reliably learn regular expressions that
-are far more complex than the classes of expressions previously considered in the
-literature.
-1.1
-
-Problem setting
-
-In particular, let Σ be a fixed set of alphabet symbols (also called element names),
-and let Σ∗ be the set of all words over Σ.
-Definition 1.1 (Regular Expressions). Regular expressions are derived by the following grammar.
-r, s ::= ∅ | ε | a | r . s | r + s | r? | r+
-Here, parentheses may be added to avoid ambiguity; ε denotes the empty word;
-a ranges over symbols in Σ; r . s denotes concatenation; r + s denotes disjunction;
-r+ denotes one-or-more repetitions; and r? denotes the optional regular expression.
-That is, the language L(r) accepted by regular expression r is given by:
-L(∅) = ∅
-L(a) = {a}
-L(r + s) = L(r) ∪ L(s)
-
-L(ε) = {ε}
-L(r . s) = {vw | v ∈ L(r), w ∈ L(s)}
-L(r+ ) = {v1 . . . vn | n ≥ 1 and v1 , . . . , vn ∈ L(r)}
-
-L(r?) = L(r) ∪ {ε}.
-Note that the Kleene star operator (denoting zero or more repititions as in r∗ ) is
-not allowed by the above syntax. This is not a restriction, since r∗ can always be
-represented as (r+ )? or (r?)+ . Conversely, the latter can always be rewritten into
-the former for presentation to the user.
-The class of all regular expressions is actually too large for our purposes, as both
-DTDs and XSDs require the regular expressions occurring in them to be deterministic (also sometimes called one-unambiguous [Brüggemann-Klein and Wood
-1998]). Intuitively, a regular expression is deterministic if, without looking ahead
-in the input word, it allows to match each symbol of that word uniquely against a
-position in the expression when processing the input in one pass from left to right.
-For instance, (a + b)∗ a is not deterministic as already the first symbol in the word
-aaa could be matched by either the first or the second a in the expression. Without
-lookahead, it is impossible to know which one to choose. The equivalent expression
-b∗ a(b∗ a)∗ , on the other hand, is deterministic.
-Definition 1.2. Formally, let r stand for the regular expression obtained from r
-by replacing the ith occurrence of alphabet symbol a in r by a(i) , for every i and
-+
-+
-a. For example, for r = b+ a(ba+ )? we have r = b(1) a(1) (b(2) a(2) )?. A regular
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-3
-
-4
-
-·
-
-Geert Jan Bex et al.
-
-expression r is deterministic if there are no words wa(i) v and wa(j) v 0 in L(r) such
-that i 6= j.
-Equivalently, an expression is deterministic if the Glushkov construction [BrüggemanKlein 1993] translates it into a deterministic finite automaton rather than a nondeterministic one [Brüggemann-Klein and Wood 1998]. Not every non-deterministic
-regular expression is equivalent to a deterministic one [Brüggemann-Klein and
-Wood 1998]. Thus, semantically, the class of deterministic regular expressions
-forms a strict subclass of the class of all regular expressions.
-For the purpose of inferring DTDs and XSDs from XML data, we are hence in
-search of an algorithm that, given enough sample words of a target deterministic
-regular expression r, returns a deterministic expression r0 equivalent to r. In the
-framework of learning in the limit [Gold 1967], such an algorithm is said to learn
-the deterministic regular expressions from positive data.
-Definition 1.3. Define a sample to be a finite subset of Σ∗ and let R be a subclass
-of the regular expressions. An algorithm M mapping samples to expressions in R
-learns R in the limit from positive data if (1) S ⊆ L(M (S)) for every sample S and
-(2) to every r ∈ R we can associate a so-called characteristic sample Sr ⊆ L(r) such
-that, for each sample S with Sr ⊆ S ⊆ L(r), M (S) is equivalent to r.
-Intuitively, the first condition says that M must be sound ; the second that M
-must be complete, given enough data. A class of regular expressions R is learnable
-in the limit from positive data if an algorithm exists that learns R. For the class of
-all regular expressions, it was shown by Gold that no such algorithm exists [Gold
-1967]. We extend this result to the class of deterministic expressions:
-Theorem 1.4. The class of deterministic regular expressions is not learnable in
-the limit from positive data.
-Proof. It was shown by Gold [1967, Theorem I.8], that any class of regular
-expressions that contains all non-empty finite languages as well as at least one
-infinite language is not learnable in the limit from positive data. Since deterministic
-regular expressions like a∗ define an infinite language, it suffices to show that every
-non-empty finite language is definable by a deterministic expression. Hereto, let
-S be a finite, non-empty set of words. Now consider the prefix tree T for S. For
-example, if S = {a, aab, abc, aac}, we have the following prefix tree:
-a
-a
-b c
-
-b
-c
-
-Nodes for which the path from the root to that node forms a word in S are marked
-by double circles. In particular, all leaf nodes are marked.
-By viewing the internal nodes in T with two or more children as disjunctions;
-internal nodes in T with one child as conjunctions; and adding a question mark for
-every marked internal node in T , it is straightforward to transform T into a regular
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-expression. For example, with S and T as above we get r = a .(b . c + a .(b + c))?.
-Clearly, L(r) = S. Moreover, since no node in T has two edges with the same label,
-r must be deterministic.
-Theorem 1.4 immediately excludes the possibility for an algorithm to infer the
-full class of DTDs or XSDs. In practice, however, regular expressions occurring
-in DTDs and XSDs are concise rather than arbitrarily complex. Indeed, a study
-of 819 DTDs and XSDs gathered from the Cover Pages [Cover 2003] (including
-many high-quality XML standards) as well as from the web at large, reveals that
-regular expressions occurring in practical schemas are such that every alphabet
-symbol occurs only a small number of times [Martens et al. 2006]. In practice,
-therefore, it suffices to learn the subclass of deterministic regular expressions in
-which each alphabet symbol occurs at most k times, for some small k. We refer to
-such expressions as k-occurrence regular expressions.
-Definition 1.5. A regular expression is k-occurrence if every alphabet symbol
-occurs at most k times in it.
-For example, the expressions customer . order+ and (school + institute)+ are
-both 1-occurrence, while id .(qty+id) is 2-occurrence (as id occurs twice). Observe
-that if r is k-occurrence, then it is also l-occurrence for every l ≥ k. To simplify
-notation in what follows, we abbreviate ‘k-occurrence regular expression’ by k-ORE
-and also refer to the 1-OREs as ‘single occurrence regular expressions’ or SOREs.
-1.2
-
-Outline and Contributions
-
-Actually, the above mentioned examination shows that in the majority of the cases
-k = 1. Motivated by that observation, we have studied and suggested practical
-learning algorithms for the class of deterministic SOREs in a companion article [Bex
-et al. 2006]. These algorithms, however, can only output SOREs even when the
-target regular expression is not. In that case they always return an approximation
-of the target expressions. It is therefore desirable to also have learning algorithms
-for the class of deterministic k-OREs with k ≥ 2. Furthermore, since the exact
-k-value for the target expression, although small, is unknown in a schema inference
-setting, we also require an algorithm capable of determining the best value of k
-automatically.
-We begin our study of this problem in Section 3 by showing that, for each fixed k,
-the class of deterministic k-OREs is learnable in the limit from positive examples
-only. We also argue, however, that this theoretical algorithm is unlikely to work
-well in practice as it does not provide a method to automatically determine the
-best value of k and needs samples whose size can be exponential in the size of the
-alphabet to successfully learn some target expressions.
-In view of these observations, we provide in Section 4 the practical algorithm
-iDRegEx. Given a sample of words S, iDRegEx derives corresponding deterministic k-OREs for increasing values of k and selects from these candidate expressions
-the expression that describes S best. To determine the “best” expression we propose two measures: (1) a Language Size measure and (2) a Minimum Description
-Length measure based on the work of Adriaans and Vitányi [2006]. The main technical contribution lies in the subroutine used to derive the actual k-OREs for S.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-5
-
-6
-
-·
-
-Geert Jan Bex et al.
-
-Indeed, while for the special case where k = 1 one can derive a k-ORE by first
-learning an automaton A for S using the inference algorithm of Garcia and Vidal
-[1990], and by subsequently translating A into a 1-ORE (as shown in [Bex et al.
-2006]), this approach does not work when k ≥ 2. In particular, the algorithm of
-Garcia and Vidal only works when learning languages that are “n-testable” for
-some fixed natural number n [Garcia and Vidal 1990]. Although every language
-definable by a 1-ORE is 2-testable [Bex et al. 2006], there are languages definable
-by a 2-ORE, for instance a∗ ba∗ , that are not n-testable for any n. We therefore
-use a probabilistic method based on Hidden Markov Models to learn an automaton
-for S, which is subsequently translated into a k-ORE.
-The effectiveness of iDRegEx is empirically validated in Section 5 both on real
-world and synthetic data. We compare the results of iDRegEx with those of
-the algorithm presented in previous work [Bex et al. 2008], to which we refer as
-iDRegEx(rwr0 ).
-2.
-
-RELATED WORK
-
-Semi-structured data. In the context of semi-structured data, the inference of
-schemas as defined in [Buneman et al. 1997; Quass et al. 1996] has been extensively studied [Goldman and Widom 1997; Nestorov et al. 1998]. No methods were
-provided to translate the inferred types to regular expressions, however.
-DTD and XSD inference. In the context of DTD inference, Bex et al. [2006]
-gave in earlier work two inference algorithms: one for learning 1-OREs and one for
-learning the subclass of 1-OREs known as chain regular expressions. The latter
-class can also be learned using Trang [Clark ], state of the art software written
-by James Clark that is primarily intended as a translator between the schema
-languages DTD, Relax NG [Clark and Murata 2001], and XSD, but also infers a
-schema for a set of XML documents. In contrast, our goal in this article is to infer
-the more general class of deterministic expressions. xtract [Garofalakis et al.
-2003] is another regular expression learning system with similar goals. We note
-that xtract also uses the Minimum Description Length principle to choose the
-best expression from a set of candidates.
-Other relevant DTD inference research is [Sankey and Wong 2001] and [Chidlovskii
-2001] that learn finite automata but do not consider the translation to deterministic
-regular expressions. Also, in [Young-Lai and Tompa 2000] a method is proposed to
-infer DTDs through stochastic grammars where right-hand sides of rules are represented by probabilistic automata. No method is provided to transform these into
-regular expressions. Although Ahonen [1996] proposes such a translation, the effectiveness of her algorithm is only illustrated by a single case study of a dictionary
-example; no experimental study is provided.
-Also relevant are the XSD inference systems [Bex et al. 2007; Clark ; Hegewald
-et al. 2006] that, as already mentioned, rely on the same methods for learning
-regular expressions as DTD inference.
-Regular expression inference. Most of the learning of regular languages from
-positive examples in the computational learning community is directed towards inference of automata as opposed to inference of regular expressions [Angluin and
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-Smith 1983; Pitt 1989; Sakakibara 1997]. However, these approaches learn strict
-subclasses of the regular languages which are incomparable to the subclasses considered here. Some approaches to inference of regular expressions for restricted cases
-have been considered. For instance, [Brāzma 1993] showed that regular expressions
-without union can be approximately learned in polynomial time from a set of examples satisfying some criteria. [Fernau 2005] provided a learning algorithm for
-regular expressions that are finite unions of pairwise left-aligned union-free regular
-expressions. The development is purely theoretical, no experimental validation has
-been performed.
-HMM learning. Although there has been work on Hidden Markov Model structure induction [Rabiner 1989; Freitag and McCallum 2000], the requirement in our
-setting that the resulting automaton is deterministic is, to the best of our knowledge, unique.
-3.
-
-BASIC RESULTS
-
-In this section we establish that, in contrast to the class of all deterministic expressions, the subclass of deterministic k-OREs can theoretically be learned in the limit
-from positive data, for each fixed k. We also argue, however, that this theoretical
-algorithm is unlikely to work well in practice.
-Let Σ(r) denote the set of alphabet symbols that occur in a regular expression
-r, and let Σ(S) be similarly defined for a sample S. Define the length of a regular expression r as the length of it string representation, including operators and
-parenthesis. For example, the length of (a . b)+ ? + c is 9.
-Theorem 3.1. For every k there exists an algorithm M that learns the class of
-deterministic k-OREs from positive data. Furthermore, on input S, M runs in
-time polynomial in the size of S, yet exponential in k and |Σ(S)|.
-Proof. The algorithm M is based on the following observations. First observe
-that every deterministic k-ORE r over a finite alphabet A ⊆ Σ can be simplified
-into an equivalent deterministic k-ORE r0 of length at most 10k|A| by rewriting r
-according to the following system of rewrite rules until no more rule is applicable:
-((s)) → (s)
-s?? → s?
-s + ε → s?
-s.ε → s
-ε? → ε
-s+∅ → s
-s.∅ → ∅
-∅? → ∅
-
-s?+ → s+ ?
-s++ → s+
-ε + s → s?
-ε.s → s
-ε+ → ε
-∅+s → s
-∅.s → ∅
-∅+ → ∅
-
-(The first rewrite rule removes redundant parenthesis in r.) Indeed, since each
-rewrite rule clearly preserves determinism and language equivalence, r0 must be a
-deterministic expression equivalent to r. Moreover, since none of the rewrite rules
-duplicates a subexpression and since r is a k-ORE, so is r0 . Now note that, since
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-7
-
-8
-
-·
-
-Geert Jan Bex et al.
-
-no rewrite rule applies to it, r0 is either ∅, ε, or generated by the following grammar
-t ::= a | a? | a+ | a+ ? | (a) | (a)? | (a)+ | (a)+ ?
-| t1 . t2 | (t1 . t2 ) | (t1 . t2 )? | (t1 . t2 )+ | (t1 . t2 )+ ?
-| t1 + t2 | (t1 + t2 ) | (t1 + t2 )? | (t1 + t2 )+ | (t1 + t2 )+ ?
-It is not difficult to verify by structural induction that any expression t produced
-by this grammar has length
-X
-|t| ≤ −4 + 10
-rep(t, a),
-a∈Σ(t)
-
-where rep(t, a) denotes the number of times alphabet symbol a occurs in t. For
-instance, rep(b .(b + c), a) = 0 and rep(b .(b + c), b) = 2. Since rep(r0 , a) ≤ k for
-every a ∈ Σ(r0 ), it readily follows that |r0 | ≤ 10k|A| − 4 ≤ 10k|A|.
-Then observe that all possible regular expressions over A of length at most 10k|A|
-can be enumerated in time exponential in k|A|. Since checking whether a regular expression is deterministic is decidable in polynomial time [Brüggemann-Klein
-and Wood 1998]; and since equivalence of deterministic expressions is decidable in
-polynomial time [Brüggemann-Klein and Wood 1998], it follows by the above observations that for each k and each finite alphabet A ⊆ Σ it is possible to compute
-in time exponential in k|A| a finite set RA of pairwise non-equivalent deterministic
-k-OREs over A such that
-—every r ∈ RA is of size at most 10k|A|; and
-—for every deterministic k-ORE r over A there exists an equivalent expression
-r0 ∈ RA .
-(Note that since RA is computable in time exponential in k|A|, it has at most an
-exponential number of elements in k|A|.) Now fix, for each finite A ⊆ Σ an arbitrary
-order ≺ on RA , subject to the provision that r ≺ s only if L(s) − L(r) 6= ∅. Such
-an order always exists since RA does not contain equivalent expressions.
-Then let M be the algorithm that, upon sample S, computes RΣ(S) and outputs
-the first (according to ≺) expression r ∈ RΣ(S) for which S ⊆ L(r). Since RΣ(S) can
-be computed in time exponential in k|Σ(S)|; since there are at most an exponential
-number of expressions in RΣ(S) ; since each expression r ∈ RΣ(S) has size at most
-10k|Σ(S)|; and since checking membership in L(r) of a single word w ∈ S can be
-done in time polynomial in the size of w and r, it follows that M runs in time
-polynomial in S and exponential in k|Σ(S)|.
-Furthermore, we claim that M learns the class of deterministic k-OREs. Clearly,
-S ⊆ L(M (S)) by definition. Hence, it remains to show completeness, i.e., that we
-can associate to each deterministic k-ORE r a sample Sr ⊆ L(r) such that, for each
-sample S with Sr ⊆ S ⊆ L(r), M (S) is equivalent to r. Note that, by definition of
-RΣ(r) , there exists a deterministic k-ORE r0 ∈ RΣ(r) equivalent to r. Initialize Sr
-to an arbitrary finite subset of L(r) = L(r0 ) such that each alphabet symbol of r
-occurs at least once in S, i.e., Σ(Sr ) = Σ(r). Let r1 ≺ · · · ≺ rn be all predecessors of
-r0 in RΣ(r) according to ≺. By definition of ≺, there exists a word wi ∈ L(r)−L(ri )
-for every 1 ≤ i ≤ n. Add all of these words to Sr . Then clearly, for every sample S
-with Sr ⊆ S ⊆ L(r) we have Σ(S) = Σ(r) and S 6⊆ L(ri ) for every 1 ≤ i ≤ n. Since
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-M (S) is the first expression in RΣ(r) with S ⊆ L(r), we hence have M (S) = r0 ≡ r,
-as desired.
-While Theorem 3.1 shows that the class of deterministic k-OREs is better suited
-for learning from positive data than the complete class of deterministic expressions,
-it does not provide a useful practical algorithm, for the following reasons.
-(1) First and foremost, M runs in time exponential in the size of the alphabet Σ(S),
-which may be problematic for the inference of schema’s with many element
-names.
-(2) Second, while Theorem 3.1 shows that the class of deterministic k-OREs is
-learnable in the limit for each fixed k, the schema inference setting is such that
-we do not know k a priori. If we overestimate k then M (S) risks being an underapproximation of the target expression r, especially when S is incomplete.
-To illustrate, consider the 1-ORE target expression r = a+ b+ and sample
-S = {ab, abbb, aabb}. If we overestimate k to, say, 2 instead of 1, then M is free
-to output aa?b+ as a sound answer. On the other hand, if we underestimate k
-then M (S) risks being an over-approximation of r. Consider, for instance, the
-2-ORE target expression r = aa?b+ and the same sample S = {ab, abbb, aabb}.
-If we underestimate k to be 1 instead of 2, then M can only output 1-OREs,
-and needs to output at least a+ b+ in order to be sound. In summary: we need
-a method to determine the most suitable value of k.
-(3) Third, the notion of learning in the limit is a very liberal one: correct expressions need only be derived when sufficient data is provided, i.e., when the input
-sample is a superset of the characteristic sample for the target expression r.
-The following theorem shows that there are reasonably simple expressions r
-such that characteristic sample Sr of any sound and complete learning algorithm is at least exponential in the size of r. As such, it is unlikely for any
-sound and complete learning algorithm to behave well on real-world samples,
-which are typically incomplete and hence unlikely to contain all words of the
-characteristic sample.
-Theorem 3.2. Let A = {a1 , . . . , an } ⊆ Σ consist of n distinct element names.
-Let r1 = (a1 a2 + a3 + · · · + an )+ , and let r2 = (a2 + · · · + an )+ a1 (a2 + · · · + an )+ .
-For any algorithm that learns the class of deterministic (2n
-Pn+ 3)-OREs and any
-sample S that is characteristic for r1 or r2 we have |S| ≥ i=1 (n − 2)i .
-Proof. First consider r1 = (a1 a2 + a3 + · · · + an )+ . Observe that there exist
-an exponential number of deterministic (2n + 3)-OREs that differ from r1 in only
-a single word. Indeed, let B = A − {a1 , a2 } and let W consist of all non-empty
-words w over B of length at most n. Define, for every word w = b1 . . . bm ∈ W the
-deterministic (2n + 3)-ORE rw such that L(rw ) = L(r1 ) − {w} as follows. First,
-i
-that accepts all words in
-define, for every 1 ≤ i ≤ m the deterministic 2-ORE rw
-L(r1 ) that do not start with bi :
-i
-rw
-:= (a1 a2 + (B − {bi })) .(a1 a2 + a3 + · · · + an )∗
-
-Clearly, v ∈ L(r1 ) − {w} if, and only if, v ∈ L(r1 ) and there is some 0 ≤ i ≤ m
-such that v agrees with w on the first i letters, but differs in the (i + 1)-th letter.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-9
-
-10
-
-·
-
-Geert Jan Bex et al.
-
-Hence, it suffices to take
-1
-2
-3
-m
-rw := rw
-+ b1 (ε + rw
-+ b2 (ε + rw
-+ b3 (· · · + bm−1 (ε + rw
-+ bm . r1 ) . . . )))
-
-Now assume that algorithm M learns the class of deterministic (2n + 3)-OREs and
-suppose that Sr1 is characteristic for r1 . In particular, Sr1 ⊆ L(r1 ). By definition,
-M (S) is equivalent to r for every sample S with Sr1 ⊆ S ⊆ L(r1 ). We claim that
-in order for M to have this property, W must be a subset
-of Sr . Then, since W
-Pn
-contains all words over B of length at most n, |Sr1 | ≥ i=1 (n−2)i , as desired. The
-intuitive argument why W must be a subset of Sr is that if there exists w in W −Sr ,
-then M cannot distinguish between r1 and rw . Indeed, suppose for the purpose
-of contradiction that there is some w ∈ W with w 6∈ Sr1 . Then Sr1 is a subset of
-L(rw ). Indeed, Sr1 = Sr1 − {w} ⊆ L(r1 ) − {w} = L(rw ). Furthermore, since M
-learns the class of deterministic (2n + 3)-OREs, there must be some characteristic
-sample Srw for rw . Now, consider the sample Sr1 ∪ Srw . It is included in both
-L(r1 ) and L(rw ) and is a superset of both Sr1 and Srw . But then, by definition of
-characteristic samples, M (Sr1 ∪ Srw ) must be equivalent to both r1 and rw . This
-is absurd, however, since L(r1 ) 6= L(rw ) by construction.
-A similar argument shows that the P
-characteristic sample Sr2 of r2 = (a2 + · · · +
-n
-an )+ a1 (a2 + · · · + an )+ also requires i=1 (n − 2)i elements. In this case, we take
-B = A − {a1 } and we take W to be the set of all non-empty words over B of
-length at most n. For each w = b1 . . . bm ∈ W , we construct the deterministic
-(2n + 3)-ORE rw such that L(rw ) accepts all words in L(r) that do not end with
-i
-be the 2-ORE that accepts all words in B +
-a1 w, as follows. Let, for 1 ≤ i ≤ m, rw
-that do not start with bi :
-i
-rw
-:= (B − {bi }) . B ∗
-
-Then it suffices to take
-i
-2
-m
-rw := B + a1 (rw
-+ b1 (ε + rw
-+ b3 (· · · + bm−1 (ε + rw
-+ bm B + ) . . . ))).
-
-A similar argument as for r1 then shows that the characteristic sample Sr2 of r2
-needs to contain, for
-w ∈ W , at least one word of the form va1 w with v ∈ B + .
-Peach
-n
-Therefore, |Sr2 | ≥ i=1 (n − 2)i , as desired.
-4.
-
-THE LEARNING ALGORITHM
-
-In view of the observations made in Section 3, we present in this section a practical
-learning algorithm that (1) works well on incomplete data and (2) automatically
-determines the best value of k (see Section 5 for an experimental evaluation). Specifically, given a sample S, the algorithm derives deterministic k-OREs for increasing
-values of k and selects from these candidate expressions the k-ORE that describes
-S best. To determine the “best” expression we propose two measures: (1) a Language Size measure and (2) a Minimum Description Length measure based on the
-work of Adriaans and Vitányi [2006].
-Our algorithm does not derive deterministic k-OREs for S directly, but uses, for
-each fixed k, a probabilistic method to first learn an automaton for S, which is subsequently translated into a k-ORE. The following section (Section 4.1) explains how
-the probabilistic method that learns an automaton from S works. Section 4.2 explains how the learned automaton is translated into a k-ORE. Finally, Section 4.3,
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-introduces the whole algorithm, together with the two measures to determine the
-best candidate expression.
-4.1
-
-Probabilistically Learning a Deterministic Automaton
-
-In particular, the algorithm first learns a deterministic k-occurrence automaton
-(deterministic k-OA) for S. This is a specific kind of finite state automaton in
-which each alphabet symbol can occur at most k times. Figure 2(a) gives an
-example. Note that in contrast to the classical definition of an automaton, no
-edges are labeled: all incoming edges in a state s are assumed to be labeled by the
-label of s. In other words, the 2-OA of Figure 2(a) accepts the same language as
-aa?b+ .
-Definition 4.1 (k-OA). An automaton is a node-labeled graph G = (V, E, lab)
-where
-—V is a finite set of nodes (also called states) with a distinguished source src ∈ V
-and sink sink ∈ V ;
-—the edge relation E is such that src has only outgoing edges; sink has only
-incoming edges; and every state v ∈ V − {src, sink } is reachable by a walk from
-src to sink ;
-—lab : V − {src, sink } → Σ is the labeling function.
-In this context, an accepting run for a word a1 . . . an is a walk src s1 . . . sn sink
-from src to sink in G such that ai = lab(si ) for 1 ≤ i ≤ n. As usual, we denote
-by L(G) the set of all words for which an accepting run exists. An automaton is
-k-occurrence (a k-OA) if there are at most k states labeled by the same alphabet
-symbol. If G uses only labels in A ⊆ Σ then G is an automaton over A.
-In what follows, we write Succ(s) for the set {t | (s, t) ∈ E} of all direct successors
-of state s in G, and Pred(s) for the set {t | (t, s) ∈ E} of all direct predecessors
-of s in G. Furthermore, we write Succ(s, a) and Pred(s, a) for the set of states in
-Succ(s) and Pred(s), respectively, that are labeled by a. As usual, an automaton G
-is deterministic if Succ(s, a) contains at most one state, for every s ∈ V and a ∈ Σ.
-For convenience, we will also refer to the 1-OAs as “single occurrence automata”
-or SOAs for short.
-We learn a deterministic k-OA for a sample S as follows. First, recall from
-Section 3 that Σ(S) is the set of alphabet symbols occurring in words in S. We view
-S as the result of a stochastic process that generates words from Σ∗ by performing
-random walks on the complete k-OA Ck over Σ(S).
-Definition 4.2. Define the complete k-OA Ck over Σ(S) to be the k-OA G =
-(V, E, lab) over Σ(S) in which each a ∈ Σ(S) labels exactly k states such that
-—there is an edge from src to sink ;
-—src is connected to exactly one state labeled by a, for every a ∈ Σ(S); and
-—every state s ∈ V − {src, sink } has an outgoing edge to every other state except
-src.
-To illustrate, the complete 2-OA over {a, b} is shown in Figure 2(b). Clearly,
-L(Ck ) = Σ(S)∗ .
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-11
-
-12
-
-·
-
-Geert Jan Bex et al.
-
-a
-
-a
-
-b
-(a) An example 2-OA. It accepts
-the same language as aa?b+
-Fig. 2.
-
-a
-
-a
-
-b
-
-b
-
-(b) The complete
-{a, b}.
-
-2-OA
-
-over
-
-Two 2-OAs.
-
-The stochastic process that generates words from Σ∗ by performing random walks
-on Ck operates as follows. First, the process picks, among all states in Succ(src),
-a state s1 with probability α(src, s1 ) and emits lab(s1 ). Then it picks, among
-all states in Succ(s1 ) a state s2 with probability α(s1 , s2 ) and emits lab(s2 ). The
-process continues moving to new states and emitting their labels until the final state
-is reached (which does not emit a symbol). Of course, α must be a true probability
-distribution, i.e.,
-\[
-  \alpha(s, t) \ge 0
-  \quad\text{and}\quad
-  \sum_{t \in Succ(s)} \alpha(s, t) = 1 \tag{1}
-\]
-
-for all states s ≠ sink and all states t. The probability of generating a particular
-accepting run $\vec{s}$ = src s1 s2 . . . sn sink given the process P = (Ck , α) in this setting
-is
-\[ P[\vec{s} \mid \mathcal{P}] = \alpha(src, s_1) \cdot \alpha(s_1, s_2) \cdot \alpha(s_2, s_3) \cdots \alpha(s_n, sink), \]
-and the probability of generating the word w = a1 . . . an is
-\[
-  P[w \mid \mathcal{P}] \;=\;
-  \sum_{\text{all accepting runs } \vec{s} \text{ of } w \text{ in } C_k}
-  P[\vec{s} \mid \mathcal{P}].
-\]
-
-Assuming independence, the probability of obtaining all words in the sample S is
-then
-\[
-  P[S \mid \mathcal{P}] \;=\;
-  \prod_{w \in S} P[w \mid \mathcal{P}].
-\]
-
-Clearly, the process that best explains the observation of S is the one in which the
-probabilities α are such that they maximize P [S | P].
-To learn a deterministic k-OA for S we therefore first try to infer from S the
-probability distribution α that maximizes P [S | P], and use this distribution to
-determine the topology of the desired deterministic k-OA. In particular, we remove
-from Ck the non-deterministic edges with the lowest probability as these are the
-least likely to contribute to the generation of S, and are therefore the least likely
-to be necessary for the acceptance of S.
-The problem of inferring α from S is well-studied in Machine Learning, where
-our stochastic process P corresponds to a particular kind of Hidden Markov Model
-sometimes referred to as a Partially Observable Markov Model (POMM for short).
-(For the readers familiar with Hidden Markov Models we note that the initial
-state distribution π usually considered in Hidden Markov Models is absorbed in
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-Algorithm 1 iKoa
-Require: a sample S, a value for k
-Ensure: a deterministic k-OA G with S ⊆ L(G)
-1: P ← init(k, S)
-2: P ← BaumWelsh(P, S)
-3: G ← Disambiguate(P, S)
-4: G ← Prune(G, S)
-5: return G
-Algorithm 2 Disambiguate
-Require: a POMM P = (G, α) and sample S
-Ensure: a deterministic k-OA
-1: Initialize queue Q to {s ∈ Succ(src) | α(src, s) > 0}
-2: Initialize set of marked states D ← ∅
-3: while Q is non-empty do
-4:
-s ← first(Q)
-5:
-while some a ∈ Σ has |Succ(s, a)| > 1 do
-6:
-pick t ∈ Succ(s, a) with
-α(s, t) = max{α(s, t′) | t′ ∈ Succ(s, a)}
-7:
-set α(s, t) ← Σ {α(s, t′) | t′ ∈ Succ(s, a)},
-i.e., the sum of α(s, t′) over all t′ ∈ Succ(s, a), so that α remains a
-probability distribution once the other a-labeled edges are deleted
-8:
-for all t′ in Succ(s, a) \ {t} do
-9:
-delete edge (s, t′) from G
-10:
-set α(s, t′) ← 0
-11:
-P ← BaumWelsh(P, S)
-12:
-if S ⊈ L(G) then Fail
-13:
-add s to marked states D and pop s from Q
-14:
-enqueue all states in Succ(s) \ D to Q
-15: return G
-the state transition distribution α(src, ·) in our context.) Inference of α is generally
-accomplished by the well-known Baum-Welsh algorithm [Rabiner 1989] that adjusts
-initial values for α until a (possibly local) maximum is reached.
-We use Baum-Welsh in our learning algorithm iKoa shown in Algorithm 1, which
-operates as follows. In line 1, iKoa initializes the stochastic process P to the tuple
-(Ck , α) where
-—Ck is the complete k-OA over Σ(S);
-—α(src, sink ) is the fraction of empty words in S;
-—α(src, s) is the fraction of words in S that start with lab(s), for every s ∈
-Succ(src); and
-—α(s, t) is chosen randomly for s ≠ src, subject to the constraints in equation (1).
-It is important to emphasize that, since we are trying to model a stochastic process,
-multiple occurrences of the same word in S are important. A sample should therefore not be considered as a set in Algorithm 1, but as a bag. Line 2 then optimizes
-the initial values of α using the Baum-Welsh algorithm.
-With these probabilities in hand Disambiguate, shown in Algorithm 2, determines the topology of the desired deterministic k-OA for S. In a breadth-first
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-13
-
-14
-
-·
-
-Geert Jan Bex et al.
-
-manner, it picks for each state s and each symbol a the state t ∈ Succ(s, a) with
-the highest probability and deletes all other edges to states labeled by a. Line 7
-merely ensures that α continues to be a probability distribution after this removal
-and line 11 adjusts α to the new topology. Line 12 is a sanity check that ensures
-that we have not removed edges necessary to accept all words in S; Disambiguate
-reports failure otherwise. The result of a successful run of Disambiguate is a
-deterministic k-OA which nevertheless may have edges (s, t) for which there is no
-witness in S (i.e., a word in S whose unique accepting run traverses (s, t)). The
-function Prune in line 4 of iKoa removes all such edges. It also removes all states
-s ∈ Succ(src) without a witness in S. Figure 3 illustrates a hypothetical run of
-iKoa.
-It should be noted that BaumWelsh, which iteratively refines α until a (possibly local) maximum is reached, is computationally quite expensive. For that
-reason, our implementation only executes a fixed number of refinement iterations
-of BaumWelsh in Line 11. Rather surprisingly, this cut-off actually improves the
-precision of iDRegEx, as our experiments in Section 5 show, where it is discussed
-in more detail.
-4.2
-
-Translating k-OAs into k-OREs
-
-Once we have learned a deterministic k-OA for a given sample S using iKoa
-it remains to translate this k-OA into a deterministic k-ORE. An obvious approach in this respect would be to use the classical state elimination algorithm
-(cf., e.g., [Hopcroft and Ullman 2007]). Unfortunately, as already hinted upon by
-Fernau [2004; 2005] and as we illustrate below, it is very difficult to get concise
-regular expressions from an automaton representation. For instance, the classical
-state elimination algorithm applied to the SOA in Figure 4 yields the expression:1
-(aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c +
-aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗
-(b + aa∗ b))∗ (aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d)))(aa∗ d +
-(c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + aa∗ c)(c +
-aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))∗
-
-which is non-deterministic and differs quite a bit from the equivalent deterministic
-SORE
-((b?(a + c))+ d)+ e.
-Actually, results by Ehrenfeucht and Zeiger [1976]; Gelade and Neven [2008]; and
-Gruber and Holzer [2008] show that it is impossible in general to generate concise
-regular expressions from automata: there are k-OAs (even for k = 1) for which the
-number of occurrences of alphabet symbols in the smallest equivalent expression is
-exponential in the size of the automaton. For such automata, an equivalent k-ORE
-hence does not exist.
-It is then natural to ask whether there is an algorithm that translates a given
-k-OA into an equivalent k-ORE when such a k-ORE exists, and returns a k-ORE
-super approximation of the input k-OA otherwise. Clearly, the above example
-shows that the classical state elimination algorithm does not suffice for this purpose.
-1 Transformation computed by JFLAP: www.jflap.org.
-
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-α
-src
-a1
-a2
-b1
-b2
-
-a1
-
-a2
-
-a1
-
-a2
-
-b1
-
-b2
-
-b1
-
-b2
-
-a1
-1
-0.2
-0.4
-0.1
-0.1
-
-a2
-\
-0.3
-0.1
-0.3
-0.1
-
-b1
-0
-0.3
-0.2
-0.3
-0.2
-
-b2
-\
-0.1
-0.1
-0.2
-0.5
-
-sink
-0
-0.1
-0.2
-0.1
-0.1
-
-α
-src
-a1
-a2
-b1
-b2
-
-(a) Process P returned by init with random values for α.
-
-α
-src
-a1
-a2
-b1
-b2
-
-a1
-1
-0
-0.01
-0.01
-0.01
-
-a1
-1
-0.2
-0.01
-0.01
-0.01
-
-a2
-\
-0.3
-0.01
-0.01
-0.01
-
-b1
-0
-0.3
-0.6
-0.5
-0.33
-
-(b) Process P after
-BaumWelsh.
-
-first
-
-a1
-
-a2
-
-a1
-
-a2
-
-b1
-
-b2
-
-b1
-
-b2
-
-a2
-\
-0.5
-0.01
-0.01
-0.01
-
-b1
-0
-0.49
-0.6
-0.5
-0.33
-
-b2
-\
-0
-0.37
-0.28
-0.5
-
-sink
-0
-0.01
-0.01
-0.2
-0.15
-
-α
-src
-a1
-a2
-b1
-b2
-
-(c) Process P after first disambiguation step
-(for a1 ). Edges to a1 and b2 are removed.
-
-a1
-1
-0
-0.01
-0.02
-0.01
-
-a2
-\
-0.5
-0.01
-0
-0.01
-
-b1
-0
-0.49
-0.6
-0.78
-0.38
-
-a
-
-a
-
-b
-
-b
-
-b
-
-returned
-
-sink
-0
-0.01
-0.01
-0.2
-0.15
-
-training
-
-b2
-\
-0
-0.37
-0
-0.4
-
-by
-
-sink
-0
-0.01
-0.01
-0.2
-0.2
-
-(d) Process P after second disambiguation step
-(for b1 ). Edges to a2 and b2 are removed.
-
-a
-
-(e) Automaton
-A
-Disambiguate.
-
-b2
-\
-0.19
-0.37
-0.28
-0.5
-
-·
-
-a
-
-(f) Automaton A returned by Prune. It
-accepts the same language as aa?b+ .
-
-by
-
-Fig. 3. Example run of iKoa for k = 2 with target language aa?b+ . For the process
-P in (c)-(f), the α values are listed in table-form. To distinguish different states
-with the same label, we have indexed the labels.
-
-b
-
-a
-
-d
-
-c
-
-e
-
-Fig. 4. A SOA on which the classical state elimination algorithm returns a complicated expression.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-15
-
-16
-
-·
-
-Geert Jan Bex et al.
-a(1)
-
-a(2)
-
-b(1)
-
-Fig. 5.
-
-An example marking
-
-For that reason, we have proposed in a companion article [Bex et al. ] a family
-of algorithms {rwr, $rwr^2_1$, $rwr^2_2$, $rwr^2_3$, . . . } that translate SOAs into SOREs and
-have exactly these properties:
-Theorem 4.3 ([Bex et al. ]). Let G be a SOA and let T be any of the algorithms in the family {rwr, $rwr^2_1$, $rwr^2_2$, $rwr^2_3$, . . . }. If G is equivalent to a SORE
-r, then T (G) returns a SORE equivalent to r. Otherwise, T (G) returns a SORE
-that is a super approximation of G, L(G) ⊆ L(T (G)).
-(Note that SOAs and SOREs are always deterministic by definition.)
-These algorithms, in short, apply an inverse Glushkov translation. Starting from
-a k-OA where each state is labeled by a symbol, they iteratively rewrite subautomata into equivalent regular expressions. In the end only one state remains and
-the regular expression labeling this state is the output.
-In this section, we show how the above algorithms can be used to translate k-OAs
-into k-OREs. For simplicity of exposition, we will focus our discussion on rwr21 as
-it is the concrete translation algorithm used in our experiments in Section 5, but
-the same arguments apply to the other algorithms in the family.
-Definition 4.4. First, let Σ(k) denote the alphabet that consists of k copies of
-the symbols in Σ, where the first copy of a ∈ Σ is denoted by a(1) , the second by
-a(2) , and so on:
-Σ(k) := {a(i) | a ∈ Σ, 1 ≤ i ≤ k}.
-Let strip be the function mapping copies to their original symbol, i.e., strip(a(i) ) =
-a. We extend strip pointwise to words, languages, and regular expressions over
-Σ(k) .
-For example,
-$strip(\{a^{(1)} a^{(2)} b^{(1)},\; a^{(2)} a^{(2)} c^{(2)}\}) = \{aab,\, aac\}$ and
-$strip(a^{(1)} \cdot a^{(2)}? \cdot (b^{(1)})^{+}) = a \cdot a? \cdot b^{+}$.
-To see how we can use rwr21 , which translates SOAs into SOREs, to translate
-a k-OA into a k-ORE, observe that we can always transform a k-OA G over Σ
-into a SOA H over Σ(k) by processing the nodes of G in an arbitrary order and
-replacing the ith occurrence of label a ∈ Σ by a(i) . To illustrate, the SOA over Σ(2)
-obtained in this way from the 2-OA in Figure 2(a) is shown in Figure 5. Clearly,
-L(G) = strip(L(H)).
-Definition 4.5. We call a SOA H over Σ(k) obtained from a k-OA G in the above
-manner a marking of G.
-Note that, by Theorem 4.3, running rwr21 on H yields a SORE r over Σ(k)
-with L(H) ⊆ L(r). For instance, with H as in Figure 5, rwr21 (H) returns r =
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-Algorithm 3 rwr2
-Require: a k-OA G
-Ensure: a k-ORE r with L(G) ⊆ L(r)
-1: compute a marking H of G.
-2: return strip(rwr21 (H))
-
-$a^{(1)} \cdot a^{(2)}? \cdot (b^{(1)})^{+}$. By subsequently stripping r, we always obtain a
-k-ORE over Σ.
-Moreover, L(G) = strip(L(H)) ⊆ strip(L(r)) = L(strip(r)), so the k-ORE strip(r)
-is always a super approximation of G. Algorithm 3, called rwr2 , summarizes the
-translation. By our discussion, rwr2 is clearly sound:
-Proposition 4.6. rwr2 (G) is a (possibly non-deterministic) k-ORE with L(G) ⊆
-L(rwr2 (G)), for every k-OA G.
-Note, however, that even when G is deterministic and equivalent to a deterministic k-ORE r, rwr2 (G) need not be deterministic, nor equivalent to r. For instance,
-consider the 2-OA G:
-b
-
-a
-
-c
-
-b
-
-Clearly, G is equivalent to the deterministic 2-ORE bc?a(ba)+ ?. Now suppose for
-the purpose of illustration that rwr2 constructs the following marking H of G. (It
-does not matter which marking rwr2 constructs, they all result in the same final
-expression.)
-b(1)
-
-a(1)
-
-c(1)
-
-b(2)
-
-Since H is not equivalent to a SORE over Σ(k) , rwr21 (H) need not be equivalent
-to L(H). In fact, rwr21 (H) returns ((b(1) c(1) ?a(1) )?b(2) ?)+ , which yields the nondeterministic ((bc?a)?b?)+ after stripping. Nevertheless, G is equivalent to the
-deterministic 2-ORE bc?a(ba)+ ?.
-So although rwr2 is always guaranteed to return a k-ORE, it does not provide
-the same strong guarantees that rwr21 provides (Theorem 4.3). The following theorem shows, however, that if we can obtain G by applying the Glushkov construction
-on r [Brüggeman-Klein 1993], rwr2 (G) is always equivalent to r. Moreover, if r
-is deterministic, then so is rwr2 (G). So in this sense, rwr2 applies an inverse
-Glushkov construction to r. Formally, the Glushkov construction is defined as
-follows.
-Definition 4.7. Let r be a k-ORE. Recall from Definition 1.2 that $\bar{r}$ is the regular
-expression obtained from r by replacing the ith occurrence of alphabet symbol a
-by a(i) , for every a ∈ Σ and every 1 ≤ i ≤ n. Let pos($\bar{r}$) denote the symbols in Σ(k)
-that actually appear in $\bar{r}$. Moreover, let the sets first($\bar{r}$), last($\bar{r}$), and follow ($\bar{r}$, a(i) )
-be defined as shown in Figure 6. A k-OA G is a Glushkov translation of r if there
-exists a one-to-one onto mapping ρ : (V (G) − {src, sink }) → pos($\bar{r}$) such that
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-17
-
-18
-
-·
-
-Geert Jan Bex et al.
-\[
-\begin{array}{rcl}
-first(\emptyset) &=& \emptyset \\
-first(\varepsilon) &=& \emptyset \\
-first(a^{(i)}) &=& \{a^{(i)}\} \\
-first(r?) &=& first(r) \\
-first(r^+) &=& first(r) \\
-first(r + s) &=& first(r) \cup first(s) \\
-first(r \cdot s) &=&
-  \begin{cases}
-    first(r) & \text{if } \varepsilon \notin L(r), \\
-    first(r) \cup first(s) & \text{otherwise.}
-  \end{cases} \\[1ex]
-last(\emptyset) &=& \emptyset \\
-last(\varepsilon) &=& \emptyset \\
-last(a^{(i)}) &=& \{a^{(i)}\} \\
-last(r?) &=& last(r) \\
-last(r^+) &=& last(r) \\
-last(r + s) &=& last(r) \cup last(s) \\
-last(r \cdot s) &=&
-  \begin{cases}
-    last(s) & \text{if } \varepsilon \notin L(s), \\
-    last(r) \cup last(s) & \text{otherwise.}
-  \end{cases} \\[1ex]
-follow(a^{(i)}, a^{(i)}) &=& \emptyset \\
-follow(r?, a^{(i)}) &=& follow(r, a^{(i)}) \\
-follow(r^+, a^{(i)}) &=&
-  \begin{cases}
-    follow(r, a^{(i)}) & \text{if } a^{(i)} \notin last(r), \\
-    follow(r, a^{(i)}) \cup first(r) & \text{otherwise.}
-  \end{cases} \\
-follow(r + s, a^{(i)}) &=&
-  \begin{cases}
-    follow(r, a^{(i)}) & \text{if } a^{(i)} \in pos(r), \\
-    follow(s, a^{(i)}) & \text{otherwise.}
-  \end{cases} \\
-follow(r \cdot s, a^{(i)}) &=&
-  \begin{cases}
-    follow(r, a^{(i)}) & \text{if } a^{(i)} \in pos(r),\ a^{(i)} \notin last(r), \\
-    follow(r, a^{(i)}) \cup first(s) & \text{if } a^{(i)} \in pos(r),\ a^{(i)} \in last(r), \\
-    follow(s, a^{(i)}) & \text{otherwise.}
-  \end{cases}
-\end{array}
-\]
-
-Fig. 6. Definition of first(r), last(r), and follow (r, a(i) ), for a(i) ∈ pos(r).
-
-(1) v ∈ Succ(src) ⇔ ρ(v) ∈ first(r);
-(2) v ∈ Pred(sink ) ⇔ ρ(v) ∈ last(r);
-(3) v ∈ Succ(w) ⇔ ρ(v) ∈ follow (r, ρ(w)); and
-(4) strip(ρ(v)) = lab(v),
-for all v, w ∈ V (G) − {src, sink }.
-Theorem 4.8. If k-OA G is a Glushkov representation of a target k-ORE
-r, then rwr2 (G) is equivalent to r. Moreover, if r is deterministic, then so is
-rwr2 (G).
-Proof. Since rwr2 (G) = strip(rwr21 (H)) for an arbitrarily chosen marking
-H of G, it suffices to prove that strip(rwr21 (H)) is equivalent to r and that
-strip(rwr21 (H)) is deterministic whenever r is deterministic, for every marking H
-of G. Hereto, let H be an arbitrary but fixed marking of G. In particular, G and H
-have the same set of nodes V and edges E, but differ in their labeling function. Let
-lab G be the labeling function of G and let lab H be the labeling function of H. Clearly,
-lab G (v) = strip(lab H (v)) for every v ∈ V − {src, sink }. Since G is a Glushkov
-translation of r, there is a one-to-one, onto mapping ρ : (V − {src, sink }) → pos(r)
-satisfying properties (1)-(4) in Definition 4.7. Now let σ : pos(r) → Σ(k) be the
-function that maps a(i) ∈ pos(r) to lab H (ρ−1 (a(i) )). Since lab H assigns a distinct
-label to each state, σ is one-to-one and onto the subset of Σ(k) symbols used as
-labels in H. Moreover, by property (4) and the fact that lab G (v) = strip(lab H (v))
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-we have,
-strip(a(i) ) = lab G (ρ−1 (a(i) )) = strip(lab H (ρ−1 (a(i) ))) = strip(σ(a(i) ))
-
-(?)
-
-for each a(i) ∈ pos(r). In other words, σ preserves (stripped) labels. Now let σ(r)
-be the SORE obtained from r by replacing each a(i) ∈ pos(r) by σ(a(i) ). Since σ is
-one-to-one and r is a SORE, so is σ(r). Moreover, we claim that L(H) = L(σ(r)).
-Indeed, it is readily verified by induction on r that a word a1 (i1 ) . . . an (in ) ∈ L(r)
-if, and only if, (i) a1 (i1 ) ∈ first(r); (ii) ap+1 (ip+1 ) ∈ follow (r, ap (ip ) ) for every
-1 ≤ p < n; and (iii) an (in ) ∈ last(r). By properties (1)-(4) of Definition 4.7 we
-hence obtain:
-σ(a1 (i1 ) ) . . . σ(an (in ) ) ∈ L(σ(r))
-⇔ a1 (i1 ) . . . an (in ) ∈ L(r)
-⇔ src, ρ−1 (a1 (i1 ) ), . . . , ρ−1 (an (in ) ), sink is a walk in G
-⇔ src, ρ−1 (a1 (i1 ) ), . . . , ρ−1 (an (in ) ), sink is a walk in H
-⇔ lab H (ρ−1 (a1 (i1 ) )) . . . , lab H (ρ−1 (an (in ) )) ∈ L(H)
-⇔ σ(a1 (i1 ) ) . . . σ(an (in ) ) ∈ L(H)
-Therefore, L(H) = L(σ(r)).
-Hence, we have established that H is a SOA over Σ(k) equivalent to the SORE
-σ(r) over Σ(k) . By Theorem 4.3, rwr21 (H) is hence equivalent to σ(r). Therefore,
-strip(rwr21 (H)) is equivalent to strip(σ(r)), which by (?) above, is equivalent to
-strip(r) = r, as desired.
-Finally, to see that strip(rwr21 (H)) is deterministic if r is deterministic, let
-s := strip(rwr21 (H)) and suppose for the purpose of contradiction that s is not
-deterministic. Then there exist $w a^{(i)} v_1$ and $w a^{(j)} v_2$ in $L(\bar{s})$ with $i \neq j$. It is
-not hard to see that this can happen only if there exist $w' a^{(i')} v_1'$ and $w' a^{(j')} v_2'$
-in $L(rwr^2_1(H))$ with $i' \neq j'$. Since $L(rwr^2_1(H)) = L(\sigma(\bar{r}))$ we know that hence
-$\sigma^{-1}(w' a^{(i')} v_1') \in L(\bar{r})$ and $\sigma^{-1}(w' a^{(j')} v_2') \in L(\bar{r})$. Let
-$w'' a^{(i'')} v_1'' = \sigma^{-1}(w' a^{(i')} v_1')$ and $w'' a^{(j'')} v_2'' = \sigma^{-1}(w' a^{(j')} v_2')$.
-Since σ is one-to-one and $i' \neq j'$, also $i'' \neq j''$.
-Therefore, r is not deterministic, which yields the desired contradiction.
-4.3
-
-The whole Algorithm
-
-Our deterministic regular expression inference algorithm iDRegEx combines iKoa
-and rwr2 as shown in Algorithm 4. For increasing values of k until a maximum
-kmax is reached, it first learns a deterministic k-OA G from the given sample S,
-and subsequently translates that k-OA into a k-ORE using rwr2 . If the resulting
-k-ORE is deterministic then it is added to the set C of deterministic candidate
-expressions for S, otherwise it is discarded. From this set of candidate expressions,
-iDRegEx returns the “best” regular expression best(C), which is determined according to one of the measures introduced below. Since it is well-known that,
-depending on the initial value of α, BaumWelsh (and therefore iKoa) may converge to a local maximum that is not necessarily global, we apply iKoa a number
-of times N with independently chosen random seed values for α to increase the
-probability of correctly learning the target regular expression from S.
-The observant reader may wonder whether we are always guaranteed to derive
-at least one deterministic expression such that best(C) is defined. Indeed, Theorem 4.8 tells us that if we manage to learn from sample S a k-OA which is the
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-19
-
-20
-
-·
-
-Geert Jan Bex et al.
-
-Algorithm 4 iDRegEx
-Require: a sample S
-Ensure: a k-ORE r
-1: initialize candidate set C ← ∅
-2: for k = 1 to kmax do
-3:
-for n = 1 to N do
-4:
-G ← iKoa(S, k)
-5:
-if rwr2 (G) is deterministic then
-6:
-add rwr2 (G) to C
-7: return best(C)
-Glushkov representation of the target expression r, then rwr2 will always return
-a deterministic k-ORE equivalent to r. When k > 1, there can be several k-OAs
-representing the same language and we could therefore learn a non-Glushkov one.
-In that case, rwr2 always returns a k-ORE which is a super approximation of the
-target expression. Although that approximation can be non-deterministic, since we
-derive k-OREs for increasing values of k and since for k = 1 the result of rwr2 is
-always deterministic (as every SORE is deterministic), we always infer at least one
-deterministic regular expression. In fact, in our experiments on 100 synthetic regular expressions, we derived for 96 of them a deterministic expression with k > 1,
-and only for 4 expressions had to resort to a 1-ORE approximation.
-4.3.1 A Language Size Measure for Determining the Best Candidate. Intuitively,
-we want to select from C the simplest deterministic expression that “best” describes
-S. Since each candidate expression in C accepts all words in S by construction, one
-way to interpret “the best” is to select the expression that accepts the least number
-of words (thereby adding the least number of words to S). Since an expression defines an infinite language in general, it is of course impossible to take all words into
-account. We therefore only consider the words up to a length n, where n = 2m + 1
-with m the length of the candidate expression, excluding regular expression operators, ∅, and ε. For instance, if the candidate expression is a .(a + c+ )?, then m = 3
-and n = 7. Formally, for a language L, let |L≤n | denote the number of words in L
-of length at most n. Then the best candidate in C is the one with the least value of
-| L(r)≤n |. If there are multiple such candidates, we pick the shortest one (breaking
-ties arbitrarily). It turns out that | L(r)≤n | can be computed quite efficiently; see
-[Bex et al. ] for details.
-4.3.2 A Minimum Description Length Measure for Determining the Best Candidate. An alternative measure to determine the best candidate is given by Adriaans
-and Vitányi [2006], who compare the size of S with the size of the language of a
-candidate r. Specifically, Adriaans and Vitányi define the data encoding cost of r
-to be:
-\[
-  datacost(r, S) := \sum_{i=0}^{n} \left( 2 \cdot \log_2 i
-    + \log_2 \binom{|L^{=i}(r)|}{|S^{=i}|} \right),
-\]
-where n = 2m + 1 as before; |S =i | is the number of words in S that have length i;
-and | L=i (r)| is the number of words in L(r) that have exactly length i. Although
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-the above formula is numerically difficult to compute, there is an easier estimation
-procedure; see [Adriaans and Vitányi 2006] for details.
-In this case, the model encoding cost is simply taken to be its length, thereby
-preferring shorter expressions over longer ones. The best regular expression in the
-candidate set C is then the one that minimizes both model and data encoding cost
-(breaking ties arbitrarily).
-We already mentioned that xtract [Garofalakis et al. 2003] also utilizes the
-Minimum Description Length principle. However, their measure for data encoding
-cost depends on the concrete structure of the regular expressions while ours only
-depends on the language defined by them and is independent of the representation.
-Therefore, in our setting, when two equivalent expressions are derived, the one with
-the smallest model cost, that is, the simplest one, will always be taken.
-5.
-
-EXPERIMENTS
-
-In this section we validate our approach by means of an experimental analysis.
-Throughout the section, we say that a target k-ORE r is successfully derived when
-a k-ORE s with L(r) = L(s) is generated. The success rate of our experiments
-then is the percentage of successfully derived target regular expressions.
-Our previous work [Bex et al. 2008] on this topic was based on a version of the
-rwr0 algorithm [Bex et al. 2006], we refer to this algorithm as iDRegEx(rwr0 ).
-Unfortunately, as detailed in [Bex et al. 2008], it is not known whether rwr0 is
-complete on the class of all single occurrence regular expressions. Nevertheless, the
-experiments in [Bex et al. 2008] which are revisited below show a good and reliable
-performance. However, to obtain a theoretically complete algorithm, cf. Theorem 4.8, we use the algorithm rwr2 which is sound and complete on single occurrence regular expressions. In the remainder we focus on iDRegEx, but compare
-with the results for iDRegEx(rwr0 ).
-As mentioned in Section 4.3.1, another new aspect of the results presented here is
-the use of language size as an alternative measure over Minimum Description Length
-(MDL) to compare candidates. The iDRegEx(rwr0 ) algorithm is only considered
-with the MDL criterion. We note that for alphabet size 5, the success rate of
-iDRegEx with the MDL criterion was only 21 %, while that of the language size
-criterion is 98 %. The corpus used in this experiment is described in Section 5.3.
-Therefore in the remainder of this section we only consider iDRegEx with the
-language size criterion.
-For all the experiments described below we take kmax = 4 and N = 10 in Algorithm 4.
-5.1
-
-Running times
-
-All experiments were performed using a prototype implementation of iDRegEx
-and iDRegEx(rwr0 ) written in Java executed on Pentium M 2.0 GHz class machines equipped with 1GB RAM. For the BaumWelsh subroutine we have gratefully used Jean-Marc François’ Jahmm library [François 2006], which is a faithful
-implementation of the algorithms described in Rabiner’s Hidden Markov Model tutorial [Rabiner 1989]. Since Jahmm strives for clarity rather than performance and
-since only limited precautions are taken against underflows, our prototype should
-be seen as a proof of concept rather than a polished product. In particular, under-
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-21
-
-22
-
-·
-
-Geert Jan Bex et al.
-
-flows currently limit us to target regular expressions whose total number of symbol
-occurrences is at most 40. Here, the total number of symbol occurrences occ(r) of
-a regular expression r is its length excluding the regular expression operators and
-parenthesis. To illustrate, the total number of symbol occurrences in aa?b+ is 3.
-Furthermore, the lack of optimization in Jahmm leads to average running times
-ranging from 4 minutes for target expressions r with |Σ(r)| = 5 and occ(r) = 6 to
-9 hours for target expressions with |Σ(r)| = 15 and occ(r) = 30. Running times for
-iDRegEx and iDRegEx(rwr0 ) are similar.
-As already mentioned in Section 4.3, one of the bottlenecks of iDRegEx is the application of BaumWelsh in Line 11 of Disambiguate (Algorithm 2). BaumWelsh
-is an iterative procedure that is typically run until convergence, i.e., until the
-computed probability distribution no longer change significantly. To improve the
-running time, we only apply a fixed number ` of iteration steps when calling
-BaumWelsh in Line 11 of Disambiguate. Experiments show that the running
-time performance scales linear with ` as one expects, but, perhaps surprisingly, the
-success rate improves as well for an optimal value of `. This optimal value for `
-depends on the alphabet size. These improved results can be explained as follows:
-applying BaumWelsh in each disambiguation step until it converges guarantees
-that the probability distribution for that step will have reached a local optimum.
-However, we know that the search space for the algorithm contains many local optima, and that BaumWelsh is a local optimization algorithm, i.e., it will converge
-to one of the local optima it can reach from its starting point by hill climbing. The
-disambiguation procedure proceeds state by state, so fine tuning the probability
-distribution for a disambiguation step may transform the search space so that certain local optima for the next iteration can no longer be reached by a local search
-algorithm such as BaumWelsh. Table I shows the performance of the algorithm
-for various number of BaumWelsh iterations ` for expressions of alphabet size 5,
-10 and 15. These expressions are those described in Section 5.3. In this Table,
-` = ∞ denotes the case where BaumWelsh is ran until convergence after each
-disambiguation step. The Table illustrates that the success rate is actually higher
-for small values of ℓ. The running time performance gains increase rapidly with
-the expressions’ alphabet size: for |Σ| = 5, we gain a factor of 3.5 (ℓ = 2), for
-|Σ| = 10, it is already a factor of 10 (ℓ = 3) and for |Σ| = 15, we gain a factor
-of 25 (ℓ = 3). This brings the running time for the largest expressions we tested
-down to 22 minutes, in contrast with 9 hours mentioned for iDRegEx(rwr0 ) and
-iDRegEx. The algorithm with the optimal number of BaumWelsh steps in the
-disambiguation process will be referred to as iDRegExfixed . In particular for small
-alphabet sizes (|Σ| ≤ 7) we use ℓ = 2, for large alphabet size ℓ = 3 (|Σ| > 7). We
-note that the alphabet size can easily be determined from the sample.
-We should also note that experience with Hidden Markov Model learning in bioinformatics [Finn et al. 2006] suggests that both the running time and the maximum
-number of symbol occurrences that can be handled can be significantly improved
-by moving to an industrial-strength BaumWelsh implementation. Our focus for
-the rest of the section will therefore be on the precision of iDRegEx.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-ℓ
-1
-2
-3
-4
-∞
-
-rate |Σ| = 5
-95 %
-100 %
-95 %
-95 %
-98 %
-
-rate |Σ| = 10
-80 %
-75 %
-84 %
-77 %
-75 %
-
-·
-
-rate |Σ| = 15
-40 %
-50 %
-60 %
-50 %
-50 %
-
-Table I. Success rate for a limited number of BaumWelsh iterations in the disambiguation procedure; ℓ = ∞ corresponds to iDRegEx, while ℓ = 1, . . . , 4 correspond to iDRegExfixed .
-
-5.2
-
-Real-world target expressions and real-world samples
-
-We want to test how iDRegEx performs on real-world data. Since the number
-of publicly available XML corpora with valid schemas is rather limited, we have
-used as target expressions the 49 content models occurring in the XSD for XML
-Schema Definitions [Thompson et al. 2001] and have drawn multiset samples for
-these expressions from a large corpus of real-world XSDs harvested from the Cover
-Pages [Cover 2003]. In other words, the goal of our first experiment is to derive, from
-a corpus of XSD definitions, the regular expression content models in the schema
-for XML Schema Definitions2 . As it turns out, the XSD regular expressions are all
-single occurrence regular expressions.
-The iDRegEx(rwr0 ) algorithm infers all these expressions correctly, showing
-that it is conservative with respect to k since, as mentioned above, the algorithm
-considers k values ranging from 1 to 4. In this setting, iDRegEx does not perform
-as well, deriving only 73 % of the regular expressions correctly. We note that for
-each expression that was not derived exactly, an expression was always obtained
-that describes the input sample and, in addition, is more specific than the target
-expression. iDRegEx therefore seems to favor more specific regular expressions,
-based on the available examples.
-5.3
-
-Synthetic target expressions
-
-Although the successful inference of the real-world expressions in Section 5.2 suggests that iDRegEx is applicable in real-world scenarios, we further test its behavior on a sizable and diverse set of regular expressions. Due to the lack of real-world
-data, we have developed a synthetic regular expression generator that is parameterized for flexibility.
-Synthetic expression generation. In particular, the occurrence of the regular
-expression operators concatenation, disjunction (+), zero-or-one (?), zero-or-more
-(∗ ), and one-or-more (+ ) in the generated expressions is determined by a user-defined probability distribution. We found that typical values yielding realistic
-expressions are 1/10 for the unary operators and 7/20 for others. The alphabet
-can be specified, as well as the number of times that each individual symbol should
-occur. The maximum of these numbers determines the value k of the generated
-k-ORE.
-To ensure the validity of our experiments, we want to generate a wide range of
-different expressions. To this end, we measure how much the language of a generated
-2 This corpus was also used in [Bex et al. 2007] for XSD inference.
-
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-23
-
-24
-
-·
-
-Geert Jan Bex et al.
-
-((debab) + c)∗ a
-((((c + b)b) + a)ca) + e + d
-(((ea)∗ db) + b + a + c)+
-((b+ + c + e + d)aab)+
-((((eabh) + d + j + c + b)+ f ) + a + g + i)?
-((((aa) + e)+ + c)b) + b + d
-((((d + a)∗ eabcb) + c)a)?
-((((ac) + b + d)eab) + c)∗
-(((((bab) + c)+ + e)?a) + d)+
-((((ecb)+ a) + b)+ + d + a)?
-((bagbf eid) + c + a + j + h)∗
-((gdab) + a + i + c + j + e + f )+ hb
-((h∗ cdf a) + j + e + g + b + i)∗ ab
-((g + b + e + f + i + d)∗ aba) + h + j + c
-((((h + b + c + j + f )+ + e)?aaidb) + g)?
-
-Fig. 7.
-
-(((((dbe)∗ cf ) + j)hac) + b + i)∗ gad
-(((((ihaaj) + d)+ + g)b) + e + b + f + c)+
-(((ecgecd) + b + d + a + j + f )∗ ihaba)∗
-(l + c + d + m + n)∗ aojahbegcbf idke
-(((c + b)ab) + d + i + a)+ + j + g + f + e + h
-(((a?clf habgd) + b + n + o)iedjcem)∗ k
-((a + k + f + c + m + e)+ bdieclbonjgda)∗ h
-(((k?jghadf celif cjbhom)+
-b + g + a + e + i + n)+ + d)?
-(((aedoadenhdbci) + h + k + m + j + g + b)∗
-f ccgelbif ja)
-((a+ + f + d + o + g + n + h + c + b + j + i + e)
-keacdlbm)
-(((k + f + o + a + j)?edhldf hngicjmab)?cie)∗ bg
-((((a?d)+ ba) + h + g + e + c)+ + j + i + b)?f
-
-A snapshot of the 100 generated expressions.
-
-expression overlaps with Σ∗ . The larger the overlap, the greater its language size
-as defined in Section 4.3.1.
-To ensure that the generated expressions do not impede readability by containing
-redundant subexpressions (as in e.g., (a+ )+ ), the final step of our generator is to
-syntactically simplify the generated expressions using the following straightforward
-equivalences:
-r∗ → r+ ?
-r?? → r?
-(r+ )+ → r+
-(r?)+ → r+ ?
-(r1 · r2 ) · r3 → r1 · (r2 · r3 )
-r1 · (r2 · r3 ) → r1 · r2 · r3
-(r1 ? · r2 ?)? → r1 ? · r2 ?
-(r1 + r2 ) + r3 → r1 + (r2 + r3 )
-r1 + (r2 + r3 ) → r1 + r2 + r3
-(r1 + r2+ )+ → (r1 + r2 )+
-(r1+ + r2+ ) → (r1 + r2 )+
-r1 + r2 ? → (r1 + r2 )?
-Of course, the resulting expression is rejected if it is non-deterministic.
-To obtain a diverse target set, we synthesized expressions with alphabet size 5
-(45 expressions), 10 (45 expressions), and 15 (10 expressions) with a variety of
-symbol occurrences (k = 1, 2, 3). For each of the alphabet sizes, the expressions
-were selected to cover language size ranging from 0 to 1. All in all, this yielded a
-set of 100 deterministic target expressions. A snapshot is given in Figure 7.
-Synthetic sample generation. For each of those 100 target expressions, we
-generated synthetic samples by transforming the target expressions into stochastic
-processes that perform random walks on the automata representing the expressions
-(cf. Section 4). The probability distributions of these processes are derived from the
-structure of the originating expression. In particular, each operand in a disjunction
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-p
-
-r1 · · · rn
-
-p
-
-1
-
-r1
-
-1
-
-···
-
-1
-
-rn
-
-·
-
-1
-
-r1
-p/n
-p
-
-r1 + · · · + rn
-
-1
-
-1
-.
-.
-.
-1
-
-p/n
-rn
-p/2
-p
-r?
-
-1
-
-r
-p/2
-
-1
-
-2/3
-p
-
-Fig. 8.
-
-r+
-
-1
-p
-
-r
-1/3
-
-From a regular expression to a probabilistic automaton.
-
-is equally likely and the probability to have zero or one occurrences for the zero-or-one operator ? is 1/2 for each option. The probability to have n repetitions in
-a one-or-more or zero-or-more operator (∗ and + ) is determined by the probability
-that we choose to continue looping (2/3) or choose to leave the loop (1/3). The
-latter values are based on observations of real-world corpora. Figure 8 illustrates
-how we construct the desired stochastic process from a regular expression r: starting
-from the following initial graph,
-1
-
-r
-
-1
-
-we continue applying the rewrite rules shown until each internal node is an individual alphabet symbol.
-Experiments on covering samples. Our first experiment is designed to test
-how iDRegEx performs on samples that are at least large enough to cover the
-target regular expression, in the following sense.
-Definition 5.1. A sample S covers a deterministic automaton G if for every edge
-(s, t) in G there is a word w ∈ S whose unique accepting run in G traverses (s, t).
-Such a word w is called a witness for (s, t). A sample S covers a deterministic
-regular expression r if it covers the automaton obtained from r using the Glushkov
-construction for translating regular expressions into automata as defined in Definition 4.7.
-Intuitively, if a sample does not cover a target regular expression r then there
-will be parts of r that cannot be learned from S. In this sense, covering samples
-are the minimal samples necessary to learn r. Note that such samples are far from
-“complete” or “characteristic” in the sense of the theoretical framework of learning
-in the limit, as some characteristic samples are bound to be of size exponential in
-the size of r by Theorem 3.2, while samples of size at most quadratic in r suffice
-to cover r. Indeed, the Glushkov construction always yields an automaton whose
-number of states is bounded by the size of r. Therefore, this automaton can have
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-25
-
-26
-
-·
-
-Geert Jan Bex et al.
-
-at most |r|² edges, and hence |r|² witness words suffice to cover r.
-Table II shows how iDRegEx performs on covering samples, broken up by alphabet size of the target expressions. The size of the sample used is depicted as well.
-The table demonstrates a remarkable precision. Out of a total of 100 expressions,
-82 are derived exactly for iDRegEx. Although iDRegEx(rwr0 ) outperforms
-iDRegEx with a success rate of 87 %, overall iDRegExfixed performs best with
-89 %. The performance decreases with the alphabet size of the target expressions:
-this is to be expected since the inference task’s complexity increases. It should
-be emphasized that even if iDRegExfixed does not derive the target expression
-exactly, it always yields an over-approximation, i.e., its language is a superset of
-the target language.
-Table III shows an alternative view on the results. It shows the success rate as a
-function of the target expression’s language size, grouped in intervals. In particular,
-it demonstrates that the method works well for all language sizes.
-A final perspective is offered in Table IV which shows the success rate as a function
-of the average states per symbol κ for an expression. The latter quantity is defined
-as the length of the regular expression excluding operators, divided by the alphabet size. For instance, for the expression a(a + b)+ cab, κ = 6/3 since its length
-excluding operators is 6 and |Σ| = 3. It is clear that the learning task is harder
-for increasing values of κ. To verify the latter, a few extra expressions with large κ
-values were added to the target expressions. For the algorithm iDRegExfixed the
-success rate is quite high for target expressions with a large value of κ. Conversely,
-iDRegEx(rwr0 ) yields better results for κ < 1.6, while its success rate drops to
-around 50 % for larger values of κ. This illustrates that neither iDRegEx(rwr0 )
-nor iDRegExfixed outperforms the other in all situations.
-|Σ|
-5
-10
-15
-total
-
-#regex
-45
-45
-10
-100
-
-iDRegEx(rwr0 )
-86 %
-93 %
-70 %
-87 %
-
-iDRegEx
-97 %
-75 %
-50 %
-82 %
-
-iDRegExfixed
-100 %
-84 %
-60 %
-89 %
-
-|S|
-300
-1000
-1500
-
-Table II. Success rate on the target regular expressions and the sample size used per alphabet size
-for the various algorithms.
-
-Density(r)
-[0.0, 0.2[
-[0.2, 0.4[
-[0.4, 0.6[
-[0.6, 0.8[
-[0.8, 1.0]
-Table III.
-
-#regex
-24
-22
-20
-22
-12
-
-iDRegEx(rwr0 )
-100 %
-82 %
-90 %
-95 %
-83 %
-
-iDRegEx
-87 %
-91 %
-75 %
-72 %
-78 %
-
-iDRegExfixed
-96 %
-91 %
-85 %
-83 %
-78 %
-
-Success rate on the target regular expressions, grouped by language size.
-
-It is also interesting to note that iDRegEx successfully derived the regular expression r1 = (a1 a2 + a3 + · · · + an )+ of Theorem 3.2 for n = 8, n = 10, and n = 12
-from covering samples of size 500, 800, and 1100, respectively. This is quite surprising considering that the characteristic samples for these expressions were proven to
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-κ
-[1.2, 1.4[
-[1.4, 1.6[
-[1.6, 1.8[
-[1.8, 2.0[
-[2.0, 2.5[
-[2.5, 3.0]
-
-#regex
-29
-37
-24
-11
-12
-18
-
-iDRegEx(rwr0 )
-96 %
-100 %
-91 %
-54 %
-41 %
-66 %
-
-iDRegEx
-72 %
-89 %
-92 %
-91 %
-50 %
-71 %
-
-·
-
-iDRegExfixed
-83 %
-89 %
-100 %
-100 %
-50 %
-78 %
-
-Table IV. Success rate on the target regular expressions, grouped by κ, the average number of
-states per symbol.
-
-be of size at least (n − 2)!, i.e., 720, 40320, and 3628800 respectively. The regular
-expression r2 = (Σ \ a1 )+ a1 (Σ \ a1 )+ , in contrast, was not derivable by iDRegEx
-from small samples.
-Experiments on partially covering samples. Unfortunately, samples to learn
-regular expressions from are often smaller than one would prefer. In an extreme, but
-not uncommon case, the sample does not even entirely cover the target expression.
-In this section we therefore test how iDRegEx performs on such samples.
-Definition 5.2. The coverage of a target regular expression r by a sample S is
-defined as the fraction of transitions in the corresponding Glushkov automaton for
-r that have at least one witness in S.
-Note that to successfully learn r from a partially covering sample, iDRegEx
-needs to “guess” the edges for which there is no witness in S. This guessing capability is built into iDRegEx(rwr0 ) and iDRegEx in the form of repair rules [Bex
-et al. 2006; Bex et al. 2008]. Our experiments show that for target expressions
-with alphabet size |Σ| = 10, this is highly effective for iDRegEx(rwr0 ): even at a
-coverage of 70%, half the target expressions can still be learned correctly as Table V
-shows. The algorithm iDRegEx is performing very poorly in this setting, being
-only successful occasionally for coverages close to 100 %. iDRegExfixed performs
-better, although not as well as iDRegEx(rwr0 ). This again illustrates that both
-algorithms have their merits.
-coverage
-1.0
-0.9
-0.8
-0.7
-0.6
-
-iDRegEx(rwr0 )
-100 %
-64 %
-60 %
-52 %
-0%
-
-iDRegEx
-80 %
-20 %
-0%
-0%
-0%
-
-iDRegExfixed
-80 %
-60 %
-40 %
-0%
-0%
-
-Table V. Success rate for 25 target expressions for |Σ| = 10 for samples that provide partial
-coverage of the target expressions.
-
-We also experimented with target expressions with alphabet size |Σ| = 5. In this
-case, the results were not very promising for iDRegEx(rwr0 ), but as Table VI
-illustrates, iDRegEx and iDRegExfixed perform better, on par with the target
-expressions for |Σ| = 10 in the case of iDRegExfixed . This is interesting since
-the absolute amount of information missing for smaller regular expressions is larger
-than in the case of larger expressions.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-27
-
-28
-
-·
-
-Geert Jan Bex et al.
-coverage
-1.0
-0.9
-0.8
-0.7
-0.6
-0.5
-
-Table VI.
-
-6.
-
-iDRegEx(rwr0 )
-100 %
-25 %
-16 %
-8%
-8%
-0%
-
-iDRegEx
-100 %
-75 %
-75 %
-25 %
-25 %
-8%
-
-iDRegExfixed
-100 %
-66 %
-41 %
-33 %
-17 %
-17 %
-
-Success rate for 12 target expressions for |Σ| = 5 with partially covering samples.
-
-CONCLUSIONS
-
-We presented the algorithm iDRegEx for inferring a deterministic regular expression from a sample of words. Motivated by regular expressions occurring in practice,
-we use a novel measure based on the number k of occurrences of the same alphabet
-symbol and derive expressions for increasing values of k. We demonstrated the
-remarkable effectiveness of iDRegEx on a large corpus of real-world and synthetic
-regular expressions of different densities.
-Our experiments show that iDRegEx(rwr0 ) performs better than iDRegEx
-for target expressions with a κ < 1.6 and vice versa for larger values of κ. For
-partially covering samples, iDRegEx(rwr0 ) is more robust than iDRegEx. As κ
-values and sample coverage are not known in advance, it makes sense to run both
-algorithms and select the smallest expression or the one with the smallest language
-size, depending on the application at hand.
-Some questions need further attention. First, in our experiments, iDRegEx
-always derived the correct expression or a super-approximation of the target expression. It remains to investigate for which kind of input samples this behavior
-can be formally proved. Second, it would also be interesting to characterize precisely which classes of expressions can be learned with our method. Although the
-parameter κ explains this to some extent, we probably need more fine-grained
-measures. A last and obvious goal for future work is to speed up the inference of
-the probabilistic automaton which forms the bottleneck of the proposed algorithm.
-A possibility is to use an industrial strength implementation of the Baum-Welsh
-algorithm as in [Finn et al. 2006] rather than a straightforward one or to explore
-different methods for learning probabilistic automata.
-Although iDRegEx can be directly plugged into the XSD inference engine iXSD
-of [Bex et al. 2007], it would be interesting to investigate how to extend these
-techniques to the more robust class of Relax NG schemas [Clark and Murata 2001].
-REFERENCES
-Castor. www.castor.org.
-SUN Microsystems JAXB. java.sun.com/webservices/jaxb.
-Adriaans, P. and Vitányi, P. 2006. The Power and Perils of MDL.
-Ahonen, H. 1996. Generating Grammars for structured documents using grammatical inference
-methods. Report A-1996-4, Department of Computer Science, University of Finland.
-Angluin, D. and Smith, C. H. 1983. Inductive Inference: Theory and Methods. ACM Computing
-Surveys 15, 3, 237–269.
-Barbosa, D., Mignet, L., and Veltri, P. 2005. Studying the XML Web: gathering statistics
-from an XML sample. World Wide Web 8, 4, 413–438.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-Benedikt, M., Fan, W., and Geerts, F. 2005. XPath satisfiability in the presence of DTDs. In
-Proceedings of the Twenty-fourth ACM SIGACT-SIGMOD-SIGART Symposium on Principles
-of Database Systems. 25–36.
-Bernstein, P. A. 2003. Applying Model Management to Classical Meta Data Problems. In First
-Biennial Conference on Innovative Data Systems Research.
-Bex, G., Neven, F., Schwentick, T., and Vansummeren, S. Inference of Concise Regular
-Expressions and DTDs. ACM TODS . To Appear.
-Bex, G. J., Gelade, W., Neven, F., and Vansummeren, S. 2008. Learning deterministic regular
-expressions for the inference of schemas from XML data. In WWW. Beijing, China, 825–834.
-Accepted for WWW 2008.
-Bex, G. J., Neven, F., Schwentick, T., and Tuyls, K. 2006. Inference of concise DTDs from
-XML data. In Proceedings of the 32nd International Conference on Very Large Data Bases.
-115–126.
-Bex, G. J., Neven, F., Schwentick, T., and Vansummeren, S. 2008. Inference of Concise
-Regular Expressions and DTDs. submitted to VLDB Journal.
-Bex, G. J., Neven, F., and Van den Bussche, J. 2004. DTDs versus XML Schema: a practical
-study. In Proceedings of the 7th International Workshop on the Web and Databases. 79–84.
-Bex, G. J., Neven, F., and Vansummeren, S. 2007. Inferring XML Schema Definitions from
-XML data. In Proceedings of the 33rd International Conference on Very Large Databases.
-998–1009.
-Brāzma, A. 1993. Efficient identification of regular expressions from representative examples.
-In Proceedings of the 6th Annual ACM Conference on Computational Learning Theory. ACM
-Press, 236–242.
-Brüggemann-Klein, A. 1993. Regular expressions into finite automata. Theoretical Computer
-Science 120, 2, 197–213.
-Brüggemann-Klein, A. and Wood, D. 1998. One-unambiguous regular languages. Information
-and computation 140, 2, 229–253.
-Buneman, P., Davidson, S. B., Fernandez, M. F., and Suciu, D. 1997. Adding structure to
-unstructured data. In Database Theory - ICDT ’97, 6th International Conference, F. N. Afrati
-and P. G. Kolaitis, Eds. Lecture Notes in Computer Science, vol. 1186. Springer, 336–350.
-Che, D., Aberer, K., and Özsu, M. T. 2006. Query optimization in XML structured-document
-databases. VLDB Journal 15, 3, 263–289.
-Chidlovskii, B. 2001. Schema extraction from XML: a grammatical inference approach. In
-Proceedings of the 8th International Workshop on Knowledge Representation meets Databases.
-Clark, J. Trang: Multi-format schema converter based on RELAX NG. http://www.
-thaiopensource.com/relaxng/trang.html.
-Clark, J. and Murata, M. 2001. RELAX NG Specification. OASIS.
-Cover, R. 2003. The Cover Pages. http://xml.coverpages.org/.
-Du, F., Amer-Yahia, S., and Freire, J. 2004. ShreX: Managing XML Documents in Relational
-Databases. In Proceedings of the 30th International Conference on Very Large Data Bases.
-1297–1300.
-Ehrenfeucht, A. and Zeiger, P. 1976. Complexity measures for regular expressions. Journal
-of computer and system sciences 12, 134–146.
-Fernau, H. 2004. Extracting minimum length Document Type Definitions is NP-hard. In ICGI.
-277–278.
-Fernau, H. 2005. Algorithms for Learning Regular Expressions. In Algorithmic Learning Theory,
-16th International Conference. 297–311.
-Finn, R., Mistry, J., Schuster-Böckler, B., Griffiths-Jones, S., et al. 2006. Pfam: clans,
-web tools and services. Nucleic Acids Research 34, D247–D251.
-Florescu, D. 2005. Managing semi-structured data. ACM Queue 3, 8 (October).
-François, J.-M. 2006. Jahmm. http://www.run.montefiore.ulg.ac.be/~francois/software/
-jahmm/.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-29
-
-30
-
-·
-
-Geert Jan Bex et al.
-
-Freire, J., Haritsa, J. R., Ramanath, M., Roy, P., and Siméon, J. 2002. StatiX: making XML
-count. In SIGMOD Conference. 181–191.
-Freitag, D. and McCallum, A. 2000. Information Extraction with HMM Structures Learned
-by Stochastic Optimization. In AAAI/IAAI. AAAI Press / The MIT Press, 584–589.
-Garcia, P. and Vidal, E. 1990. Inference of k-testable languages in the strict sense and application to syntactic pattern recognition. IEEE Transactions on Pattern Analysis and Machine
-Intelligence 12, 9 (September), 920–925.
-Garofalakis, M., Gionis, A., Rastogi, R., Seshadri, S., and Shim, K. 2003. XTRACT: learning document type descriptors from XML document collections. Data mining and knowledge
-discovery 7, 23–56.
-Gelade, W. and Neven, F. 2008. Succinctness of the Complement and Intersection of Regular
-Expressions. In STACS. 325–336.
-Gold, E. 1967. Language identification in the limit. Information and Control 10, 5 (May),
-447–474.
-Goldman, R. and Widom, J. 1997. DataGuides: Enabling Query Formulation and Optimization
-in Semistructured Databases. In Proceedings of 23rd International Conference on Very Large
-Data Bases. 436–445.
-Gruber, H. and Holzer, M. 2008. Finite Automata, Digraph Connectivity, and Regular Expression Size. In ICALP (2). 39–50.
-Hegewald, J., Naumann, F., and Weis, M. 2006. XStruct: efficient schema extraction from
-multiple and large XML documents. In ICDE Workshops. 81.
-Hopcroft, J. and Ullman, J. 2007. Introduction to automata theory, languages and computation. Addison-Wesley, Reading, MA.
-Koch, C., Scherzinger, S., Schweikardt, N., and Stegmaier, B. 2004. Schema-based scheduling of event processors and buffer minimization for queries on structured data streams. In
-Proceedings of the 30th International Conference on Very Large Data Bases. 228–239.
-Manolescu, I., Florescu, D., and Kossmann, D. 2001. Answering XML Queries on Heterogeneous Data Sources. In Proceedings of 27th International Conference on Very Large Data
-Bases. 241–250.
-Martens, W., Neven, F., Schwentick, T., and Bex, G. J. 2006. Expressiveness and Complexity
-of XML Schema. ACM Transactions on Database Systems 31, 3, 770–813.
-Mignet, L., Barbosa, D., and Veltri, P. 2003. The XML web: a first study. In Proceedings of
-the 12th International World Wide Web Conference. Budapest, Hungary, 500–510.
-Nestorov, S., Abiteboul, S., and Motwani, R. 1998. Extracting Schema from Semistructured
-Data. In International Conference on Management of Data. ACM Press, 295–306.
-Neven, F. and Schwentick, T. 2006. On the complexity of XPath containment in the presence
-of disjunction, DTDs, and variables. Logical Methods in Computer Science 2, 3.
-Pitt, L. 1989. Inductive Inference, DFAs, and Computational Complexity. In Proceedings of
-the International Workshop on Analogical and Inductive Inference, K. P. Jantke, Ed. Lecture
-Notes in Computer Science, vol. 397. Springer-Verlag, 18–44.
-Quass, D., Widom, J., Goldman, R., et al. 1996. LORE: a Lightweight Object REpository for
-semistructured data. In Proceedings of the 1996 ACM SIGMOD International Conference on
-Management of Data. 549.
-Rabiner, L. 1989. A tutorial on Hidden Markov Models and selected applications in speech
-recognition. Proc. IEEE 77, 2, 257–286.
-Rahm, E. and Bernstein, P. A. 2001. A survey of approaches to automatic schema matching.
-VLDB Journal 10, 4, 334–350.
-Sahuguet, A. 2000. Everything You Ever Wanted to Know About DTDs, But Were Afraid to Ask
-(Extended Abstract). In The World Wide Web and Databases, 3rd International Workshop,
-D. Suciu and G. Vossen, Eds. Lecture Notes in Computer Science, vol. 1997. Springer, 171–183.
-Sakakibara, Y. 1997. Recent advances of grammatical inference. Theoretical Computer Science 185, 1, 15–45.
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-Learning Deterministic Regular Expressions for the Inference of Schemas from XML Data
-
-·
-
-Sankey, J. and Wong, R. K. 2001. Structural inference for semistructured data. In Proceedings
-of the 10th international conference on Information and knowledge management. ACM Press,
-159–166.
-Thompson, H., Beech, D., Maloney, M., and Mendelsohn, N. 2001. XML Schema part 1:
-structures. W3C.
-Young-Lai, M. and Tompa, F. W. 2000. Stochastic Grammatical Inference of Text Database
-Structure. Machine Learning 40, 2, 111–137.
-
-Received Month Year; revised Month Year; accepted Month Year
-
-ACM Journal Name, Vol. V, No. N, November 2024.
-
-31
-
-
\ No newline at end of file
diff --git a/papers/paper_tods2010.txt b/papers/paper_tods2010.txt
deleted file mode 100644
index 7822b57..0000000
--- a/papers/paper_tods2010.txt
+++ /dev/null
@@ -1,2492 +0,0 @@
-Inference of Concise Regular Expressions
-and DTDs
-GEERT JAN BEX and FRANK NEVEN
-Hasselt University and Transnational University of Limburg
-THOMAS SCHWENTICK
-Dortmund University
-and
-STIJN VANSUMMEREN
-Université Libre de Bruxelles
-
-We consider the problem of inferring a concise Document Type Definition (DTD) for a given set
-of XML-documents, a problem that basically reduces to learning concise regular expressions from
-positive example strings. We identify two classes of concise regular expressions—the single occurrence regular expressions (SOREs) and the chain regular expressions (CHAREs)—that capture the
-far majority of expressions used in practical DTDs. For the inference of SOREs we present several
-algorithms that first infer an automaton for a given set of example strings and then translate that
-automaton to a corresponding SORE, possibly repairing the automaton when no equivalent SORE
-can be found. In the process, we introduce a novel automaton to regular expression rewrite technique which is of independent interest. When only a very small amount of XML data is available,
-however (for instance when the data is generated by Web service requests or by answers to queries),
-these algorithms produce regular expressions that are too specific. Therefore, we introduce a novel
-learning algorithm CRX that directly infers CHAREs (which form a subclass of SOREs) without
-going through an automaton representation. We show that CRX performs very well within its target
-class on very small datasets.
-
-This research was done while S. Vansummeren was a Postdoctoral Fellow of the Research
-Foundation-Flanders (FWO) at Hasselt University.
-This work was funded by FWO-G.0821.09N and the Future and Emerging Technologies (FET)
-programme within the Seventh Framework Programme for Research of the European Commission,
-under the FET-Open grant agreement FOX, number FP7-ICT-233599.
-Authors’ addresses: G. J. Bex and F. Neven, Database and Theoretical Computer Science Research Group, Hasselt University and Transnational University of Limburg, Agoralaan, gebouw D,
-B-3590 Diepenbeek Belgium; email: {geertjan.bex, frank.neven}@uhasselt.be; T. Schwentick, TU
-Dortmund, Fakultät für Informatik, Otto-Hahn-Str. 16, Raum 214, 44227 Dortmund, Germany.
-email: thomas.schwentick@udo.edu; S. Vansummeren, Research Laboratory for Web and Information Technologies (WIT), Université Libre de Bruxelles, 50 Av. F. Roosevelt, CP 165/15 B-1050
-Brussels, Belgium; email: stijn.vansummeren@ulb.ac.be.
-Permission to make digital or hard copies of part or all of this work for personal or classroom use
-is granted without fee provided that copies are not made or distributed for profit or commercial
-advantage and that copies show this notice on the first page or initial screen of a display along
-with the full citation. Copyrights for components of this work owned by others than ACM must be
-honored. Abstracting with credit is permitted. To copy otherwise, to republish, to post on servers,
-to redistribute to lists, or to use any component of this work in other works requires prior specific
-permission and/or a fee. Permissions may be requested from Publications Dept., ACM, Inc., 2 Penn
-Plaza, Suite 701, New York, NY 10121-0701 USA, fax +1 (212) 869-0481, or permissions@acm.org.
-© 2010 ACM
-0362-5915/2010/04-ART11 $10.00
-DOI 10.1145/1735886.1735890 http://doi.acm.org/10.1145/1735886.1735890
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11
-
-11:2
-
-•
-
-G. J. Bex et al.
-
-Categories and Subject Descriptors: F.4.3 [Mathematical Logic and Formal Languages]:
-Formal Languages; H.2.1 [Database Management]: Logical Design; I.2.6 [Artificial Intelligence]: Learning; I.7.2 [Document and Text Processing]: Document Preparation
-General Terms: Algorithms, Languages, Theory
-Additional Key Words and Phrases: Regular expressions, schema inference, XML
-ACM Reference Format:
-Bex, G. J., Neven, F., Schwentick, T., and Vansummeren, S. 2010. Inference of concise regular
-expressions and DTDs. ACM Trans. Datab. Syst. 35, 2, Article 11 (April 2010), 47 pages.
-DOI = 10.1145/1735886.1735890 http://doi.acm.org/10.1145/1735886.1735890
-
-1. INTRODUCTION
-The eXtensible Markup Language (XML) serves as the lingua franca for data
-exchange on the Internet [Abiteboul et al. 1999]. Because XML documents
-in general can be of any form, most communities and applications impose
-structural constraints on the documents that are to be exchanged or processed.
-These constraints can be formally specified in a schema, which is written in a
-schema language such as the Document Type Definitions (DTDs) or the XML
-Schema Definitions (XSDs) [Thompson et al. 2004].
-The advantages offered by the presence of a fully specified schema are
-numerous. First and foremost, a schema allows automatic validation of the
-input document structure, which not only facilitates automatic processing but
-also ensures soundness of the input. Unvalidated input data from Web requests
-is considered as the number one vulnerability for Web applications [Open Web
-Application Security Project Consortium 2004]. The presence of a schema also
-allows for automation and optimization of search, integration, and processing
-of XML data (refer to, e.g., Benedikt et al. [2008], Deutsch et al. [1999], Koch
-et al. [2004], Manolescu et al. [2001], Neven and Schwentick [2006], Wang
-et al. [2003]). Moreover, various software development tools such as Castor
-[Castor] and SUN’s JAXB [Sun] rely on schemas to perform object-relational
-mappings for persistence. Furthermore, the existence of schemas is imperative
-when integrating (meta) data through schema matching [Rahm and Bernstein
-2001] and in the area of generic model management [Bernstein 2003; Melnik
-2004]. A final advantage of a schema is that it assigns meaning to the data.
-That is, it provides a user with a concrete semantics of the document and
-aids in the specification of meaningful queries over XML data. Although the
-examples mentioned here just scrape the surface of current applications,
-they already underscore the importance of schemas accompanying XML
-data.
-Unfortunately, in spite of the aforementioned advantages, the presence of
-a schema is not mandatory and many XML documents are not accompanied
-by one. For instance, in a recent study Mignet et al. [2003] and Barbosa et al.
-[2006] have shown that approximately half of the XML documents available
-on the Web do not refer to a schema. In another study Bex et al. [2004] and
-Martens et al. [2006] have noted that about two-thirds of XSDs gathered from
-schema repositories and from the Web are not valid with respect to the W3C
-XML Schema specification [Thompson et al. 2004], rendering them essentially
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:3
-
-useless for immediate application. A similar observation was made by
-Sahuguet [2000] concerning DTDs.
-Based on the lack of schemas in practice, it is essential to devise algorithms
-that can infer a schema for a given collection of XML documents when none, or
-no syntactically correct one, is present. This is also acknowledged by Florescu
-[2005] who emphasizes that in the context of data integration:
-“We need to extract good-quality schemas automatically from existing data and perform incremental maintenance of the generated
-schemas.”
-In this article, we describe two novel schema inference algorithms outperforming existing systems in accuracy, conciseness, and speed.
-It should be noted that even when a schema is already available, there
-are situations where inference can be useful. One such situation is schema
-cleaning: sometimes a schema is too general with respect to the XML data
-that it is supposed to describe. In that case, it can be advantageous to infer a new schema based solely on the data at hand. This situation is nicely
-illustrated by the following real-world example taken from the Protein Sequence Database DTD [Miklau 2002], which gives the following definition for
-the refinfo-element.
-authors, citation, volume?, month?, year,
-pages?, (title | description)?, xrefs?
-An analysis of the available XML corpus (683MB of data) with our inference
-algorithms yields the following more precise expression for the refinfo-element.
-authors, citation, (volume | month), year,
-pages?, (title | description)?, xrefs?
-Note that the latter is more strict than the former, as it emphasizes that volume
-and month do not occur together: either one specifies a month of publication for
-a given journal article, or the volume that it has appeared in, but not both.
-As this example illustrates, schema inference algorithms can hence be used to
-better understand the semantics of a given XML dataset, making it possible to
-adapt an existing schema when necessary. In general, schema inference can be
-used to restrict schemas to a relevant subset of data needed by the application
-at hand, thereby facilitating difficult tasks like schema matching and data
-integration. Indeed, as argued by Hinkelman [2005], industry-level standards
-are too loosely defined in general, which can result in XML schemas where
-many business structures are formally specified as being optional.
-The second situation where schema inference is useful even though a schema
-already exists is in the presence of noisy XML data. In such a situation, part or
-all of the data that needs to be processed is rejected by the existing schema. For
-instance, we have harvested and investigated a corpus of XHTML documents
-from the Web and found that an astonishing 89% of 2092 documents was not
-valid with respect to the XHTML Transitional specification [W3C 2002]. In this
-case, the inference of a new schema based on the corpus and its comparison
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:4
-
-•
-
-G. J. Bex et al.
-
-Fig. 1. An example DTD.
-
-with the XHTML Transitional specification provides a uniform view of the kind
-of errors made. Further, given that one often has no choice but to deal with such
-noisy data, one may infer a new schema from a subset of the corpus (deleting
-documents that make unacceptable errors) and work with that schema rather
-than with the official specification to retain at least a minimal validation.
-1.1 Problem Setting
-Based on the previous observations, it is hence essential to devise algorithms
-that can automatically infer a DTD or XSD from a given corpus of XML
-documents.
-As illustrated in Figure 1, a DTD is essentially a mapping d from element
-names to regular expressions over element names. An XML document is valid
-with respect to d if for every occurrence of an element name e in the document,
-the word formed by its children belongs to the language of the corresponding
-regular expression d(e). For instance, the DTD in Figure 1 requires each store
-element to have zero or more order children, which must be followed by a
-stock element. Likewise, each order must have a customer child, which must
-be followed by one or more item elements.
-To infer a DTD from a corpus of XML documents C it hence suffices to look,
-for each element name e that occurs in a document in C, at the set of element
-name words that occur below e in C, and to infer from this set the corresponding
-regular expression d(e). As such, the inference of DTDs reduces to the inference of regular expressions from sets of positive example words. To illustrate,
-from the words id price, id qty supplier, and id qty item item appearing under <item> elements in a sample XML corpus, we could derive the following
-rule.
-item → (id, price | (qty, (supplier | item+ )))
-While the inference of XSDs is more complicated than the inference of DTDs,
-recent characterizations [Martens et al. 2006] show that the structural core of
-XML schema (that is, the sets of trees that are definable by XSDs) correspond
-to DTDs extended with vertical regular expressions. Therefore, one cannot
-hope to successfully infer XSDs without good algorithms for inferring regular
-expressions. As such, we focus in this article on the inference of regular expressions (and therefore, by the preceding reduction, on the inference of DTDs).
-The inference of XSDs, building on the algorithms presented here, is treated in
-a companion article [Bex et al. 2007].
-In particular, let Σ be a fixed set of alphabet symbols (also called element
-names), and let Σ∗ be the set of all words over Σ.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:5
-
-Definition 1 (Regular Expressions). In this article, we are interested in
-learning regular expressions r, s of the form
-r, s ::= ∅ | ε | a | r . s | r + s | r? | r + ,
-where parentheses may be added to avoid ambiguity. Here, ε denotes the empty
-word; a ranges over symbols in Σ; r . s denotes concatenation; r + s denotes
-disjunction; r + denotes one-or-more repetitions; and r? denotes the optional
-regular expression. That is, the language L(r) accepted by regular expression
-r is given by
-L(∅) = ∅
-L(ε) = {ε}
-L(a) = {a}
-L(r . s) = {vw | v ∈ L(r), w ∈ L(s)}
-L(r + s) = L(r) ∪ L(s)
-L(r + ) = {v1 . . . vn | n ≥ 1 and v1 , . . . , vn ∈ L(r)}
-L(r?) = L(r) ∪ {ε}.
-For convenience, we sometimes omit the concatenation symbol, simply writing rs for r.s. Note that the Kleene star operator (denoting zero or more repetitions as in r ∗ ) is not allowed by the preceding syntax. This is not a restriction,
-since r ∗ can always be represented as (r + )? or (r?)+ . Conversely, the latter can
-always be rewritten into the former for presentation to the user. Also note that
-the previous syntax uses r + s, to denote disjunction rather than the vertical
-bar notation r | s used by DTDs. The former notation should not be confused
-with the one-or-more repetition operator r + , where the plus symbol is used in
-the exponent.
-The class of all regular expressions is actually too large for our purposes,
-as both DTDs and XSDs require the regular expressions occurring in them to
-be deterministic (also sometimes called one-unambiguous [Brüggemann-Klein
-and Wood 1998]). Intuitively, a regular expression is deterministic if, without
-looking ahead in the input word, it allows to match each symbol of that word
-uniquely against a position in the expression when processing the input in
-one pass from left to right. For instance, (a + b)∗ a is not deterministic as already the first symbol in the word aaa could be matched by either the first or
-the second a in the expression. Without lookahead, it is impossible to know
-which one to choose. The equivalent expression b∗ a(b∗ a)∗ , on the other hand, is
-deterministic.
-Definition 2. Let $\overline{r}$ stand for the regular expression obtained from r by
-replacing the ith occurrence of alphabet symbol a in r by $a_{(i)}$, for every i and
-a. For example, for $r = b^{+} a (b a^{+})?$ we have
-$\overline{r} = b_{(1)}^{+} a_{(1)} (b_{(2)} a_{(2)}^{+})?$.
-A regular expression r is deterministic if there are
-no words $w a_{(i)} v$ and $w a_{(j)} v'$ in $L(\overline{r})$
-such that $i \neq j$.
-Equivalently, an expression is deterministic if the so-called Glushkov construction [Brüggeman-Klein 1993] translates it into a deterministic finite
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:6
-
-•
-
-G. J. Bex et al.
-
-automaton rather than a nondeterministic one [Brüggemann-Klein and Wood
-1998]. Not every nondeterministic regular expression is equivalent to a deterministic one [Brüggemann-Klein and Wood 1998]. Thus, semantically, the class
-of deterministic regular expressions forms a strict subclass of the class of all
-regular expressions.
-Learning in the limit. For the purpose of inferring DTDs from XML data,
-we are hence in search of an algorithm that, given enough sample words of a
-target deterministic regular expression r, returns a deterministic expression r′
-equivalent to r. In the framework of learning in the limit [Gold 1967], such an
-algorithm is said to learn the deterministic regular expressions from positive
-data.
-Definition 3. Define a sample to be a finite subset of Σ∗ and let R be
-a subclass of the regular expressions. An algorithm M mapping samples to
-expressions in R is said to learn R from positive data if: (1) S ⊆ L(M(S)) for
-every sample S and (2) to every r ∈ R we can associate a so-called characteristic
-sample Sr ⊆ L(r) such that, for each sample S with Sr ⊆ S ⊆ L(r), M(S) is
-equivalent to r.
-Intuitively, the first condition says that M must be sound; the second that
-M must be complete, given enough data. A class of regular expressions R is
-learnable in the limit from positive data if an algorithm exists that learns R.
-For the class of all regular expressions, it was shown by Gold [1967] that no
-such algorithm exists. The same holds for the class of deterministic regular
-expressions, as shown in our companion article [Bex et al. 2008].
-PROPOSITION 4 (BEX ET AL. 2008). The class of deterministic regular expressions is not learnable in the limit from positive data.
-Proposition 4 immediately excludes the possibility for an algorithm to infer
-the full class of DTDs. In practice, however, regular expressions occurring in
-DTDs and XSDs are concise rather than arbitrarily complex. Indeed, a study
-of 819 DTDs and XSDs gathered from the Cover Pages [Cover 2003] (including
-many high-quality XML standards) as well as from the Web at large, revealed
-that regular expressions occurring in practical schemas are such that every
-alphabet symbol occurs at most k times, with k small. Actually, in 98% of the
-cases k = 1.
-Definition 5. A regular expression is k-occurrence if every alphabet symbol
-occurs at most k times in it.
-For example, the expressions customer . order+ and (school + institute)+
-are both 1-occurrence, while id .(qty + id) is 2-occurrence (as id occurs twice).
-Observe that if r is k-occurrence, then it is also l-occurrence for every l ≥ k.
-To simplify notation, we often abbreviate “k-occurrence regular expression” by
-k-ORE and also refer to the 1-OREs as “single occurrence regular expressions”
-or SOREs.
-Note that, since every alphabet symbol can occur at most once in a SORE,
-every SORE is necessarily deterministic. Indeed, we have the following strict
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:7
-
-inclusion hierarchy among the various classes of regular expressions just
-discussed.
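-\[
-\begin{array}{ccccc}
-\text{SOREs} & \subset & 2\text{-OREs} \subset 3\text{-OREs} \subset \cdots & \subset & k\text{-OREs} \\
-\cap & & & & \cap \\
-\text{deterministic regex} & & \subset & & \text{all regex}
-\end{array}
-\]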
-(For k ≥ 2, the classes of k-OREs and deterministic regular expressions are
-incomparable.) Given their importance in practical schemas, we focus in this
-article on the inference of SOREs. The inference of deterministic k-OREs for
-k > 1 is treated in a companion article [Bex et al. 2008].
-1.2 Outline and Contributions
-In particular, we show in Section 3 that the class of SOREs can be efficiently
-learned in the limit from positive data by first constructing an automaton
-representation of the target SORE using techniques of Garcı́a and Vidal [1990],
-and by subsequently transforming this automaton into an equivalent SORE (if
-such a SORE exists) using a novel polynomial-time algorithm called REWRITE.
-For the general class of regular expressions the resulting expression can be of
-exponential size, as we explain in more detail in Section 3. In Section 4, we
-improve REWRITE to deal with real-world, and therefore incomplete, samples. In
-contrast to REWRITE, which fails when its input automaton is not equivalent to
-a SORE, the resulting improvement, called RWR, repairs the input automaton
-until it becomes equivalent to a SORE. We also develop an extension of RWR,
-called RWR2 , which improves the precision of RWR at the cost of increased running
-time.
-For the settings where extremely little XML data is available to infer a
-schema from (for instance, when the data is returned as answers to queries or
-Web service requests [Ngu et al. 2005; Oaks and ter Hofstede 2007]), we
-introduce in Section 6 the algorithm CRX. CRX successfully learns the class
-of CHAREs, a strict subclass of the SOREs that nevertheless holds great
-practical importance. Indeed, the same investigation as before reveals that
-more than 90% of the regular expressions occurring in practical schemas are
-CHAREs [Martens et al. 2006].
-We experimentally validate RWR, RWR2 , and CRX in Section 7 on both small and
-large samples drawn from real-world target DTDs whose regular expressions
-fall both within the class of SOREs/CHAREs and outside of those classes. In
-all settings, our algorithms outperform existing systems in accuracy, conciseness, and speed. Further, we assess the strong generalization ability of CRX by
-establishing on average the minimal number of sample words needed to derive
-optimal regular expressions. In Section 8 we discuss how to extend RWR and
-CRX to incrementally compute the inferred regular expressions when new data
-arrive, how to address noise, and how to deal with numerical predicates. We
-begin in the next section with a discussion of related work, and conclude in
-Section 9.
-It is important to note that this article differs from its conference version [Bex
-et al. 2006] in the following way.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:8
-
-•
-
-G. J. Bex et al.
-
-—First and foremost, it corrects the results of Bex et al. [2006] by providing
-a completely new algorithm for converting automata into equivalent SOREs
-(provided such a SORE exists), and gives a full correctness proof (Section 3).
-In contrast to what is claimed in Bex et al. [2006], the conversion algorithm
-of Bex et al. [2006] does not always yield an equivalent SORE, as discussed
-in Section 5.
-—It introduces new heuristics (based on a language size criterion) for dealing
-with real-world, and therefore incomplete datasets (Section 4).
-—It adds new experiments that measure: (1) the impact of noise and (2) the
-accuracy of our algorithms under various levels of missing data.
-2. RELATED WORK
-Schema inference. Schemas for semistructured data have been defined in
-Buneman et al. [1997], Fernandez and Suciu [1998], and McHugh et al.
-[1997] and their inference has been addressed in Goldman and Widom [1997],
-and Nestorov et al. [1997, 1998]. The methods in Nestorov et al. [1997] and
-Goldman and Widom [1997] focus on the derivation of a graph summary
-structure (called full representative object or dataguide) for a semistructured
-database. This data structure contains all paths in the database. Approximations of this structure are considered by restricting to paths of a certain length.
-The latter then basically reduces to the derivation of an automaton from a set
-of bounded length strings. Naively restricting the algorithms to trees rather
-than graphs is inappropriate since no order is considered between the children
-of a node so that DTD-like schemas cannot be derived. However, even the use
-of more sophisticated encodings of the XML documents using edges between
-siblings would be to no avail since no algorithms are given to translate the
-obtained automata to regular expressions. In Nestorov et al. [1998], a schema
-is a typing by means of a datalog program. Again, no algorithms are given
-to transform datalog types into regular expressions. So, these approaches
-can therefore not be used to derive DTDs, not even when the semistructured
-database is tree-shaped.
-DTD inference. In the context of DTD inference, Sankey and Wong [2001]
-propose several approaches to generate probabilistic string automata to represent regular expressions. To transform these into actual regular expressions,
-and hence to obtain DTDs, the authors refer to the methods of Ahonen [1996].
-The latter provides a method to translate one-unambiguous nonprobabilistic
-string automata to regular expressions, as given by Brüggemann-Klein and
-Wood [1998], followed by a post-processing simplification step. Apart from several case analyses based on a dictionary example, no systematic study of the
-effectiveness of the approach is provided. In particular, in contrast to our results, no target class is given for which the set of transformations is complete.
-There are only a few papers describing systems for direct DTD inference
-[Garofalakis et al. 2003; Min et al. 2003; Chidlovskii 2001]. Only one of them is
-available for testing: XTRACT [Garofalakis et al. 2003]. In Section 7, we make a
-detailed comparison with our proposal. In contrast to our approach, the XTRACT
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:9
-
-system generates for every separate string a regular expression while representing repeated subparts by introducing Kleene-*. In a second step, the system
-factorizes common subexpressions of these candidate regular expressions using algorithms from the logic optimization literature. Finally, in the third step,
-XTRACT applies the Minimum Description Length (MDL) principle to find the
-best RE among the candidates. Although the approach has been shown to work
-on real-world DTDs in Garofalakis et al. [2003] the XML data complying to
-these DTDs was generated. We report in Section 7 that XTRACT has two kinds of
-shortcomings on real-world XML data: (1) it generates large, long-winded, and
-difficult to interpret regular expressions; and (2) it cannot handle large datasets (over 1000 strings). The latter is due to the NP-hard submodule in the
-third step of the XTRACT algorithm [Fernau 2004]. The former problem seems
-to be more fundamental. The final step results in expressions consisting of
-disjunctions of regular expressions while in practice the large majority of regular expressions are concatenations of disjunctions [Martens et al. 2006]. As a
-result, larger datasets result in larger regular expressions.
-In Min et al. [2003] an adaptation of the XTRACT approach to a restricted
-class of regular expressions which form a subclass of SOREs is described. Although the system, according to the experiments conducted in Min et al. [2003],
-outperforms XTRACT in accuracy and efficiency, it seems that the two fundamental shortcomings described earlier remain. It would thus be surprising if the
-system performed much better than XTRACT on real-world data. Similarly to
-Ahonen [1996], the approach of Chidlovskii [2001] relies on the translation of
-Glushkov automata to regular expressions which, in general, can lead to an
-exponential size increase.
-Trang [Clark ] is state-of-the-art software written by James Clark intended
-as a schema translator for the schema languages DTDs, Relax NG, and XML
-Schema. In addition, Trang allows to infer a schema for a given set of XML
-documents. We discuss Trang further in Section 7.1.
-Language inference. Learning of regular languages from positive examples in
-the computational learning community is mostly directed towards inference of
-automata as opposed to inference of regular expressions [Angluin and Smith
-1983; Pitt 1989; Sakakibara 1997]. As noted by Fernau [2004] and argued
-in the previous section, first using learning algorithms for deterministic automata and then transforming these into regular expressions in general leads
-to unmanageable and long-winded regular expressions. Some approaches to
-inference of regular expressions for restricted cases have been considered. For
-instance, Brāzma [1993] showed that regular expressions without union can
-be approximately learned in polynomial time from a set of examples satisfying
-some criteria. Fernau [2009] provided a learning algorithm for finite unions
-of pairwise left-aligned union-free regular expressions. These expressions are
-different from the expressions we consider here: they are not included in the
-class of SOREs and do not contain all CHAREs. The development is purely
-theoretical, no experimental validation has been performed.
-Automata to RE translation. Although heuristics for automata to RE translations [Delgado and Morais 2004; Han and Wood 2007] have been proposed,
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:10
-
-•
-
-G. J. Bex et al.
-
-Fig. 2. (a) The SOA accepting the same language as the SORE a . b .(c + d+ ). (b) The SOA generated
-by 2T-INF for the sample S = {bacacdacde, cbacdbacde, abccaadcde}.
-
-all of them are optimizations of the classical state elimination algorithm. In
-particular, they investigate the best order to eliminate states when going from
-automata to regular expressions. So, they focus on the class of all automata
-for which, as explained in Section 3, an exponential increase in size cannot be
-avoided in general. Further, the methods remain theoretical as no experimental
-analysis has been performed. Caron and Ziadi [2000] devise an algorithm deciding whether an automaton is Glushkov. If so, the automaton can be rewritten
-into a short equivalent regular expression. Their method works in a top-down
-fashion, that is, it derives the top nodes of the parse tree corresponding to
-the regular expression first, and subsequently proceeds downward in the tree.
-Consequently, the method first derives the largest subexpressions of the expression, making it harder to devise heuristics in the presence of missing data.
-In contrast, our approach is bottom-up, that is, starting from the leaf nodes of
-the parse tree, composing them into the smallest subexpressions.
-3. A COMPLETE ALGORITHM FOR INFERRING SORES
-Our goal in this section is to infer a SORE s equivalent to a target SORE r
-given only a finite sample S ⊆ L(r). To this end, we first learn from S a Single
-Occurrence Automaton (SOA for short). A SOA is a specific kind of deterministic
-finite state automaton in which all states, except for the initial and final state,
-are element names. Figure 2(a) gives an example. Note that in contrast to the
-classical definition of automata, no edges are labeled: all incoming edges in a
-state a are assumed to be labeled by a. As such, a word a1 , . . . , an is accepted if
-there is an edge from the initial state to a1 , an edge from a1 to a2 ,. . . , and an
-edge from an to the final state. Thus, the SOA in Figure 2(a) accepts the same
-language as a . b .(c + d+ ).
-Definition 6 (SOA). Let src and sink be two special symbols, distinct from
-the element names, that will serve as the initial and final state, respectively. A
-single occurrence automaton is a finite directed graph G = (V, E) such that:
-(1) {src, sink} ⊆ V and all nodes in V − {src, sink} are element names; and
-(2) src has only outgoing edges; sink has only incoming edges; and every v ∈
-V − {src, sink} is visited during a walk from src to sink.
-Note that V − {src, sink} can be empty. We write L(G) for the set of all words
-accepted by G; V(G) for the set of G’s vertices, and E(G) for G’s edge relation.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:11
-
-Algorithm 1. 2T-INF
-Input: a finite set of sample strings S
-Output: a SOA G such that S ⊆ L(G)
-1: Let V be the set of states consisting of all element names occurring in S plus the
-initial state src and final state sink
-2: Initialize E := ∅
-3: for each string a1 . . . an in S do
-4:
-add the edges (src, a1 ), (a1 , a2 ), . . . , (an, sink) to E
-5: end for
-6: return G = (V, E)
-
-3.1 Learning an Automaton
-Given a sample S, we can learn an automaton G that accepts all words in S by
-means of the algorithm 2T-INF shown in Algorithm 1. Its behavior is illustrated
-in Figure 2(a) on the sample S = {abc, abdd} and in Figure 2(b) on the sample
-S = {bacacdacde, cbacdbacde, abccaadcde}. 2T-INF was introduced by Garcı́a and
-Vidal [1990], who also proved the following proposition.
-PROPOSITION 7 ([GARCÍA AND VIDAL 1990]). 2T-INF is sound, that is, S ⊆
-L(2T-INF(S)) for each sample S. Moreover, 2T-INF is minimal, that is, for each SOA
-G with S ⊆ L(G), 2T-INF(S) is a subgraph of G and hence L(2T-INF(S)) ⊆ L(G).
-It turns out that 2T-INF is also complete for building a SOA representation of
-a target SORE r, provided that its input sample is representative with regard
-to r.
-Definition 8 (Representative Sample). A word v of length 2 is said to be a
-2-gram of a set of words W if it occurs as a subword in some w ∈ W. A sample
-S is representative of a SORE r if S ⊆ L(r) and the following statements hold:
-(1) for every a ∈ Σ starting a word in L(r) there is a word in S that starts with
-a;
-(2) for every a ∈ Σ ending a word in L(r) there is a word in S that ends with a;
-(3) every 2-gram of L(r) is a 2-gram of S.
-If S is not representative of r, then we say that S does not cover r.
-For instance, the sample {a, b, c} is representative for a + b + c but {a, c}
-is not since it lacks a word starting with b. Furthermore, the sample
-{bacacdacde, cbacdbacde, abccaadcde} is representative for ((b?(a + c)+ )d)+ e but
-{bacacdacde, cbacdbacde} is not since it does not contain the 2-gram ab.
-PROPOSITION 9. If S is a representative sample of SORE r then L(2T-INF(S)) =
-L(r).
-
-PROOF. It is not hard to see that every SORE r can be transformed into an
-equivalent SOA Gr : we take as nodes of Gr all element names occurring in r
-plus the initial state src and the final state sink; for each alphabet symbol a that
-starts a word in L(r) we add the edge (src, a) to Gr ; for each alphabet symbol a
-that ends a word in L(r) we add an edge (a, sink) to Gr , and for each alphabet
-symbol b that follows an alphabet symbol a in a word in L(r) we add the edge
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:12
-
-•
-
-G. J. Bex et al.
-
-Fig. 3. A SOA not equivalent to any SORE. It accepts the same language as a(ba)+ .
-
-(a, b) to Gr . Now reason as follows. Clearly, S ⊆ L(r) = L(Gr ). Hence, 2T-INF(S)
-is a subgraph of Gr by Proposition 7. Since S is a representative sample of r,
-however, every edge of Gr must also be in 2T-INF(S). As such, 2T-INF(S) = Gr and
-hence L(2T-INF(S)) = L(Gr ).
-3.2 From SOA to SORE
-Proposition 9 shows that it is possible to learn a SOA representation of a target
-SORE r, provided that we are given enough data. To transform this SOA into
-a regular expression, an obvious approach would be to use known techniques
-such as the classical state elimination algorithm (refer to, e.g., Hopcroft and
-Ullman [1979]). Unfortunately, as already hinted upon by Fernau [2004, 2009]
-and as we illustrate shortly, it is very difficult to get concise regular expressions
-from an automaton representation. For instance, the classical state elimination
-algorithm applied to the SOA generated by 2T-INF in Figure 2(b) yields the
-expression:1
-(aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c +
-aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗
-(b + aa∗ b))∗ (aa∗ d + (c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d)))(aa∗ d +
-(c + aa∗ c)(c + aa∗ c)∗ (d + aa∗ d) + (b + aa∗ b + (c + aa∗ c)(c +
-aa∗ c)∗ (b + aa∗ b))(aa∗ b + (c + aa∗ c)(c + aa∗ c)∗ (b + aa∗ b))∗
-
-which differs quite a bit from the equivalent SORE
-((b?(a + c))+ d)+ e
-
-(‡).
-
-Actually, results by Ehrenfeucht and Zeiger [1976], Gelade and Neven [2008],
-and Gruber and Holzer [2008] show that it is impossible in general to generate
-concise regular expressions from automata: there are automata, even SOAs as
-generated by 2T-INF, for which the number of occurrences of alphabet symbols in
-the smallest equivalent expression is exponential in the size of the automaton.
-For such automata, a concise regular expression representation hence does not
-exist.
-These results imply that there are SOAs G for which an equivalent SORE
-does not exist (Figure 3 gives a simple example). Note, however, that when
-such a SORE r does exist, its size is always linearly bounded by the number of
-states of G. Indeed, since every alphabet symbol can occur at most once in r, the
-size of r is linearly bounded by the alphabet symbols that it mentions. Since G
-and r are equivalent, these symbols are exactly the states of G (minus src and
-sink). Hence, the SOREs constitute a well-behaved and concisely representable
-subset of the regular languages. It is therefore natural to investigate how to
-1 Transformation computed by JFLAP: www.jflap.org.
-
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:13
-
-transform a given SOA into an equivalent SORE when such a SORE exists.
-Clearly, the previous example illustrates that the classical state elimination
-algorithm does not suffice for this purpose.
-For that reason, we introduce in this section a novel graph-rewriting approach for transforming SOAs into SOREs. While our approach is related to the
-classical state-elimination algorithm for transforming an arbitrary automaton
-into a regular expression, we do not eliminate states by introducing additional
-edges (thereby duplicating subexpressions) but instead replace sets of states
-by single states (taking care to avoid duplication). In addition, there are two
-rewriting steps that only remove edges.
-Just as the classical algorithm, it is necessary for the definition of the graph
-rewrite rules to define a generalization of SOAs in which internal states are
-allowed to be labeled by SOREs (as opposed to element names from Σ). This generalization is defined as follows. Call two regular expressions r and s alphabet-disjoint if r and s have no alphabet symbol in common. For example, (a+b)? and
-c+ are alphabet-disjoint, whereas (a + b) and b?c+ are not. Call an expression
-r proper if it accepts at least one nonempty word (i.e., it is not equivalent to ∅,
-nor to ε).
-Definition 10. A generalized Single Occurrence Automaton (generalized
-SOA for short) is a finite graph G = (V, E) such that:
-(1) {src, sink} ⊆ V and all vertices in V − {src, sink} are pairwise alphabet-disjoint proper SOREs; and
-(2) the edge relation E is such that src has only outgoing edges; sink has only
-incoming edges; and every v ∈ V is visited by a walk from src to sink.
-A word w ∈ Σ∗ is accepted by G if there is a walk src r1 . . . rm sink in G and a
-division of w into subwords w = w1 . . . wm such that wi ∈ L(ri ), for 1 ≤ i ≤ m.
-Again, we write L(G) for the set of all words accepted by G.
-Figure 7 shows some examples. Clearly, every SOA is also a generalized
-SOA. In what follows, we write PredG (s) for the set of all direct predecessors of
-a SORE s in G, and SuccG (s) for the set of all direct successors of s in G.
-PredG (s) := {r | (r, s) ∈ E(G)},
-SuccG (s) := {t | (s, t) ∈ E(G)}.
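-Furthermore, we write $\mathrm{Pred}^-_G(s)$ for $\mathrm{Pred}_G(s) - \{s\}$ and similarly
-$\mathrm{Succ}^-_G(s)$ for $\mathrm{Succ}_G(s) - \{s\}$. Finally, we write
-$$\mathrm{Pred}^+_G(s) := \begin{cases} \mathrm{Pred}_G(s) \cup \{s\} & \text{if } s = s'^{+} \text{ for some } s' \\ \mathrm{Pred}_G(s) & \text{otherwise} \end{cases}$$
-$$\mathrm{Succ}^+_G(s) := \begin{cases} \mathrm{Succ}_G(s) \cup \{s\} & \text{if } s = s'^{+} \text{ for some } s' \\ \mathrm{Succ}_G(s) & \text{otherwise.} \end{cases}$$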
-
-Rewrite rules. Our system of rewrite rules consists of the seven rules shown
-in Figures 4–6: one rule to introduce disjunction (r + s), four rules to introduce
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:14
-
-•
-
-G. J. Bex et al.
-
-Fig. 4. Rewrite rules part 1. In the illustrations, P is the set PredG (r) − {r, s}. S is the set SuccG (s) − {r, s}.
-The gray loops on r and s indicate that $r \in \mathrm{Succ}^+_G(r)$ and $s \in \mathrm{Succ}^+_G(s)$, respectively.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:15
-
-Fig. 5. Rewrite rules part 2. In the illustrations, P is the set PredG (r) − {r, s}. S is the set SuccG (s) − {r, s}.
-The gray loops on r and s indicate that $r \in \mathrm{Succ}^+_G(r)$ and $s \in \mathrm{Succ}^+_G(s)$, respectively.
-
-concatenation (r . s, r? . s, r . s?, and r? . s?), one rule to introduce iteration (r + ),
-and one rule to introduce optionals (r?). At the basis of the first five rules lies
-the contraction of two states r and s into a single new state t, which is defined
-as follows.
-Definition 11 (State Contraction). Let G be a generalized SOA; let r and s
-be states in G; and let t be a state not in G. The contraction of r and s into t is
-the generalized SOA G[r, s ⇒ t] obtained from G as follows:
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:16
-
-•
-
-G. J. Bex et al.
-
-Fig. 6. Rewrite rules part 3. In the illustrations, P is the set PredG (r) − {r, s}. S is the set SuccG (s) −
-{r, s}. Note in particular that the rule OPTIONAL r? can only be applied when G contains only one
-node besides src and sink.
-
-(1) Add t as a new state to G;
-(2) make every v ∈ PredG (r) − {r, s} a predecessor of t;
-(3) make every w ∈ SuccG (s) − {r, s} a successor of t;
-(4) add a loop t → t if r ∈ SuccG (s); and
-(5) remove r, s and all of their incoming and outgoing edges.
-Note that state contraction is not symmetric.
-To illustrate, the contraction G[a, c ⇒ a + c] of the generalized SOA G in
-Figure 7(a) is shown in Figure 7(b). Similarly, the contraction G[b, a + c ⇒
-b? .(a + c)] of the generalized SOA G in Figure 7(b) is shown in Figure 7(c). Note
-that if r = s, then G[r, s ⇒ t] is simply a substitution of r by the new state t.
-To simplify notation, we simply write G[r ⇒ t] for such contractions in what
-follows.
-In addition to contraction, the rewrite rules also use the following
-operation.
-Definition 12. If G is a generalized SOA and r, s are states in G, then we
-write G \ (r, s) to denote the generalized SOA obtained from G by removing the
-edge from r to s, if present.
-In what follows, we write G → H to indicate that G rewrites to H in a single
-step according to the rewrite rules in Figures 4–6, and G →∗ H to indicate that
-G rewrites to H in zero or more steps.
-The following proposition shows that the rewrite rules are sound.
-PROPOSITION 13. If G is a generalized SOA and G → H then H is also a
-generalized SOA and L(G) = L(H).
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:17
-
-PROOF. First observe that, since all states in a generalized SOA are pairwise
-alphabet-disjoint proper SOREs, the new states r + s; r . s; r? . s; r . s?; r? . s?; r + ;
-and r? introduced by the rewrite rules in Figures 4–6 must themselves be proper
-SOREs alphabet-disjoint with the remaining states. As such, all states in H
-are pairwise alphabet-disjoint proper SOREs. To show that H is a generalized
-SOA, it hence remains to show that every state in H participates in a walk
-from src to sink. Hereto, we distinguish the following three cases.
-—H = G[r, s ⇒ t] for some t. Then, since G is a generalized SOA, r and s
-participate in a walk from src to sink. In particular, there is a walk from src
-to r in G, and a walk from s to sink. Then, by definition of state contraction,
-there is a walk from src to t and from t to sink in H, that is, t participates in
-a walk from src to sink in H.
-—H = G[r ⇒ r + ] \ (r + , r + ). Then, by definition of state contraction and since
-r participates in a walk from src to sink in G, r + must participate in a
-walk from src to sink in G[r ⇒ r + ]. This walk can always be transformed
-into a walk from src to sink in H by removing the edge (r + , r + ) should it
-occur.
-—H = G[r ⇒ r?] \ (src, sink). Then, by definition of state contraction and since
-r participates in a walk from src to sink in G, r? must participate in a walk
-from src to sink in G[r ⇒ r?]. Since the edge (src, sink) cannot occur in this
-walk (recall that src has no incoming edges and sink has no outgoing edges),
-r? also participates in a walk from src to sink in H.
-To see that L(G) = L(H) we reason by a case analysis on the rewrite rule used
-to transform G into H. For economy of space, we only illustrate this reasoning
-for DISJUNCTION r + s; the other cases are similar.
-So, suppose that G was rewritten into H by DISJUNCTION r + s, that is, H =
-G[r, s ⇒ r+s]. Then r and s have the same (extended) predecessor and successor
-set. From this, it follows that the following statements are equivalent.
-(1) s ∈ SuccG (r);
-(2) r ∈ SuccG (s);
-(3) $s \in \mathrm{Succ}^+_G(s)$;
-(4) $r \in \mathrm{Succ}^+_G(r)$.
-For instance, s ∈ SuccG (r) ⇔ r ∈ SuccG (s) since:
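-$$\begin{aligned}
-s \in \mathrm{Succ}_G(r) &\Leftrightarrow s \in \mathrm{Succ}_G(r) \cup \{r\} && \text{since } r \neq s \\
-&\Leftrightarrow s \in \mathrm{Succ}^+_G(r) && \text{by definition of } \mathrm{Succ}^+_G(r) \\
-&\Leftrightarrow s \in \mathrm{Succ}^+_G(s) && \text{since } \mathrm{Succ}^+_G(r) = \mathrm{Succ}^+_G(s) \\
-&\Leftrightarrow s \in \mathrm{Pred}^+_G(s) && \text{by definition of } \mathrm{Succ}^+_G(s) \text{ and } \mathrm{Pred}^+_G(s) \\
-&\Leftrightarrow s \in \mathrm{Pred}^+_G(r) && \text{since } \mathrm{Pred}^+_G(r) = \mathrm{Pred}^+_G(s) \\
-&\Leftrightarrow s \in \mathrm{Pred}_G(r) \cup \{r\} && \text{by definition of } \mathrm{Pred}^+_G(r) \\
-&\Leftrightarrow s \in \mathrm{Pred}_G(r) && \text{since } r \neq s \\
-&\Leftrightarrow r \in \mathrm{Succ}_G(s) && \text{by definition of } \mathrm{Pred}_G(r) \text{ and } \mathrm{Succ}_G(s)
-\end{aligned}$$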
-
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:18
-
-•
-
-G. J. Bex et al.
-
-The other equivalences can be similarly obtained. From these equivalences,
-it follows that G must take one of the two forms illustrated for rewrite rule
-DISJUNCTION r + s in Figure 4. In both cases, the corresponding H is also shown.
-Now suppose that w = w1 . . . wm ∈ Σ∗ is recognized by the walk src, t1 , . . . ,
-tm, sink in G with wi ∈ L(ti ) for 1 ≤ i ≤ m. Let the sequence src, t′1 , . . . , t′m, sink
-be obtained from src, t1 , . . . , tm, sink by replacing every occurrence of r and s by
-r + s. By inspection of the illustrations for rule DISJUNCTION r + s in Figure 4 it
-is not difficult to see that src, t′1 , . . . , t′m, sink is a walk in H. Moreover, wi ∈ L(t′i )
-by construction for 1 ≤ i ≤ m. Therefore, w ∈ L(H) and hence L(G) ⊆ L(H).
-Conversely, suppose that w = w1 . . . wm ∈ Σ∗ is recognized by src, t1 , . . . , tm, sink
-in H with wi ∈ L(ti ) for 1 ≤ i ≤ m. Determine t′i as follows:
-$$t'_i = \begin{cases} t_i & \text{if } t_i \neq r + s \\ r & \text{if } t_i = r + s \text{ and } w_i \in L(r) \\ s & \text{if } t_i = r + s \text{ and } w_i \in L(s) \end{cases}$$
-By inspection of the illustrations for rule DISJUNCTION r + s in Figure 4 it is
-not difficult to see that src, t′1 , . . . , t′m, sink is a walk in G. Moreover, wi ∈ L(t′i )
-for 1 ≤ i ≤ m. Therefore w ∈ L(G) and hence L(H) ⊆ L(G). As such, L(G) =
-L(H).
-Since each rewrite rule either contracts two states into a single state or
-removes an edge from G, the size of H is always smaller than G. Therefore, we
-have the next proposition.
-PROPOSITION 14. The system of rewrite rules in Figures 4–6 is terminating:
-there is no infinite sequence of rewrite steps G → H → I → . . .
-Our algorithm REWRITE, shown in Algorithm 2, then operates as follows. First,
-it checks whether the input SOA G corresponds to the empty language (∅) or
-the empty word (ε) in lines 1–5. If so, it returns the corresponding regular
-expression. Otherwise, it rewrites G until no further rules apply. It then checks
-whether the resulting generalized SOA is final.
-Definition 15. A generalized SOA G is final if E(G) = {(src, r), (r, sink)}
-with r distinct from src and sink. In other words, G is final if it is a chain
-consisting of the source, an arbitrary regular expression, and the sink.
-If the resulting generalized SOA is indeed final, then clearly L(G) = L(r),
-and r is returned as result. If the resulting generalized SOA is not final, then
-G is not equivalent to a SORE (as we formally show further on), and REWRITE
-fails. To illustrate, Figure 7 shows an example run of REWRITE on the example
-SOA from Figure 2(b).
-THEOREM 16. On input SOA G, REWRITE fails if and only if G is not equivalent
-to a SORE. Otherwise, REWRITE returns a SORE equivalent to G. Moreover,
-REWRITE operates in time O(n^5) where n is the number of states in G.
-Note that the complexity O(n^5) is reasonable since when we apply REWRITE to
-the result of 2T-INF on a sample S, n corresponds to the (typically small) number
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:19
-
-Algorithm 2. REWRITE
-Input: a SOA G
-Output: a SORE r such that L(r) = L(G)
-1: if sink is not reachable from src in G then
-2:
-return ∅
-3: else if E(G) = {(src, sink)} then
-4:
-return ε
-5: else
-6:
-while a rewrite rule from Figures 4–6 can be applied do
-7:
-perform the rewrite rule on G
-8:
-end while
-9:
-if G is final then
-10:
-return the corresponding regular expression
-11:
-else
-12:
-fail
-13:
-end if
-14: end if
-
-of distinct element names occurring in S, not the total number or total length
-of words in S.
-The remainder of this section is devoted to the proof of Theorem 16, which
-is divided into three steps. First, we show that REWRITE is sound.
-PROPOSITION 17. If REWRITE(G) does not fail then it returns a SORE equivalent to G, for any SOA G.
-PROOF.
-
-We distinguish three cases.
-
-(1) If sink is not reachable from src then REWRITE(G) = ∅ (clearly a SORE) and
-L(G) = ∅ = L(∅), as desired.
-(2) If E(G) = {(src, sink)} then REWRITE(G) = ε (again clearly a SORE), and
-L(G) = {ε} = L(ε), as desired.
-(3) Otherwise, G is rewritten into a final generalized SOA H with E(H) =
-{(src, t), (t, sink)} (t distinct from src and sink) and REWRITE(G) = t. In
-particular, t is a SORE. By Proposition 13, L(G) = L(H) and thus, since
-E(H) = {(src, t), (t, sink)}, L(G) = L(H) = L(t) = L(REWRITE(G)), as desired.
-Next, we show that REWRITE has the claimed complexity.
-PROPOSITION 18. REWRITE operates in time O(n5 ), where n is the number of
-states of its input G.
-PROOF. We assume that checking whether there is an edge from state r
-to state s can be done in constant time (for instance, using an adjacency matrix representation). To see that REWRITE runs in time O(n5 ) under this assumption, let us check that lines 1–4, lines 6–7, and lines 8–10 all run in
-O(n5 ).
-(Lines 1–4). Since G has at most n^2 edges, checking whether sink is reachable
-from src can be done in time O(n^2) using depth-first search. Moreover, checking
-whether E(G) = {(src, sink)} can also be done in time O(n^2).
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:20
-
-•
-
-G. J. Bex et al.
-
-Fig. 7. An execution of REWRITE on the example automaton in Figure 2(b). Step (1) applies DISJUNCTION r + s with r = a and s = c. Step (2) applies CONCATENATION r? . s with r = b and s = a + c. Step
-(3) applies ITERATION r + with r = b? .(a+ c). Step (4) applies CONCATENATION r . s with r = (b? .(a+ c))+
-and s = d. Step (5) applies ITERATION r + with r = (b? .(a + c))+ . d. One more application of
-CONCATENATION r . s with r = ((b? .(a + c))+ . d)+ and s = e (not shown) leads to the resulting expression
-((b? .(a + c))+ . d)+ . e.
-
-(Lines 6–7). Suppose that $\vec{G} = G_1, G_2, \ldots, G_k$ is the sequence of generalized
-SOAs produced by lines 6–7 when rewriting G = G1 until no further rewrite
-rule applies. Since rewrite rules never introduce new states without also removing a state, every Gi has at most n states. Now reason as follows.
-—The rule for optionals can be applied at most once in $\vec{G}$ since the automaton
-that it returns is always final, and since no rewrite rule applies to a final
-generalized SOA. Checking the preconditions of the rule for optionals can be
-done in time O(n^2), and its action can be performed in time O(n). As such, the
-total time spent in $\vec{G}$ on applying the rewrite rule for optionals is bounded
-by O(n^2).
-—Since the rewrite rules for disjunction and concatenation contract two states
-into a single one, these rewrite rules can be applied at most n times in $\vec{G}$.
-Since all of their preconditions can be checked in time O(n^4) (by iterating
-over all pairs of states r and s in the current automaton Gi and comparing
-Pred(r), Pred(s), Succ(r), and Succ(s) as desired) and since state contraction
-can be done in time O(n), the total time spent in $\vec{G}$ on the rewrite rules for
-disjunction and concatenation is bounded by O(n × n^4) = O(n^5).
-—Since the rule for iteration removes the loop of the state to which it is applied,
-and since each generalized SOA contains at most n loops, there can be at most
-n consecutive applications of this rule before another rewrite rule is applied.
-By the preceding remarks, there are at most n applications of the other
-rewrite rules, so the rewrite rule for iteration can be applied at most n^2 times
-in $\vec{G}$. Since its precondition can be checked in constant time, and since its
-action can be done in time O(n), the total time spent in $\vec{G}$ on the rewrite rule
-for iteration is bounded by O(n^2 × n) = O(n^3).
-(Lines 8–11). Finally, checking whether a generalized SOA is final and extracting the corresponding regular expression can be done in time O(n^2).
-In summary, lines 1–4 run in time O(n^2), lines 6–7 run in time O(n^5), and
-lines 8–11 run in time O(n^2), yielding a total running time of O(n^5).
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:21
-
-Finally, we show that REWRITE(G) fails if and only if G is not equivalent
-to a SORE, or equivalently, that REWRITE(G) does not fail if, and only if, G is
-equivalent to a SORE. This is actually the most involved part of the proof of
-Theorem 16. Proposition 17 already shows that if REWRITE(G) does not fail, then
-G is equivalent to a SORE. Hence, we remain to show the next proposition.
-PROPOSITION 19. If SOA G is equivalent to a SORE, then REWRITE(G) does
-not fail.
-
-Essentially, we prove this proposition in two steps. Call a generalized SOA
-proper if L(G) ≠ ∅ and L(G) ≠ {ε}.
-(1) We first show that for any proper SOA G equivalent to a SORE there exists
-a sequence of rewrite steps that ends in a final automaton (Corollary 46).
-(2) In addition, we show that if proper G can be rewritten into a final automaton
-by a particular sequence of rewrite steps, then any sequence of rewrite steps
-on G ends in a final automaton (Corollary 54).
-As such, REWRITE(G) cannot fail when G is equivalent to a SORE: either G is
-not proper, in which case lines 1–4 of Algorithm 2 return a valid expression, or
-G is proper and will hence be rewritten into a final automaton, in which case
-line 9 returns a valid expression. The details may be found in Appendix A.
-3.3 Discussion
-It should be noted that while the result of REWRITE is always a SORE, this
-SORE need not be easy to read (depending on the order of rewriting). For
-instance, it is possible for REWRITE to generate an expression r .(s? . t?)?. Clearly,
-the optional around (s? . t?) is redundant. Removing it leads to the simpler
-r .(s? . t?). For presentation to the user, it is therefore advisable to postprocess
-the result of REWRITE (and its variations in Section 4) using a regular expression
-simplification algorithm.
-4. DEALING WITH MISSING DATA
-The results of Section 3 suggest the following method to infer a SORE from a
-given sample S.
-(1) First, use 2T-INF to learn from S an automaton representation G of the
-target SORE r.
-(2) Next, convert G into a SORE using REWRITE.
-If S is a representative sample of r then G is equivalent to r by Proposition 9.
-Therefore, REWRITE(G) does not fail by Theorem 16, and hence REWRITE(G) is
-equivalent to r.
-Unfortunately, real-world samples are rarely representative. For instance,
-for target r = (a1 +· · ·+an)+ and increasing values of n, it is increasingly unlikely
-that a sample bears witness to each of the n^2 2-grams needed to represent r.
-On such nonrepresentative samples, 2T-INF will construct an automaton for
-which L(G) is a strict subset of L(r). In particular, this automaton need not be
-equivalent to a SORE, and REWRITE(G) can fail. Figure 8 shows an example.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:22
-
-•
-
-G. J. Bex et al.
-
-Fig. 8. The SOA generated by 2T-INF for the nonrepresentative sample S = {bacacdacde,
-abccaadcde}. The only rewrite rules that can be applied are ITERATION a+ and ITERATION c+ , after which REWRITE gets stuck in a nonfinal automaton and fails.
-
-Fig. 9. Repair rules.
-
-For that reason, we present in this section two modifications of REWRITE
-that “repair” G when rewriting gets stuck in a nonfinal automaton. The first
-modification, RWR, picks a single repair when rewriting gets stuck, independent
-of how the repair affects G. The second modification, RWR2 , in contrast, considers
-multiple repair strategies and selects the one that extends G in a minimal way.
-The repair rules used by both algorithms are shown in Figure 9. After a repair
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:23
-
-Algorithm 3. RWR
-Input: a SOA G
-Output: a SORE r such that L(G) ⊆ L(r) if G is not equivalent to a SORE, and L(G) =
-L(r) otherwise.
-1: if sink is not reachable from src in G then
-2:
-return ∅
-3: else if E(G) = {(src, sink)} then
-4:
-return ε
-5: else
-6:
-while G is not final do
-7:
-if a rewrite rule from Figures 4–6 can be applied then
-8:
-apply the rewrite rule on G
-9:
-else
-10:
-apply a repair rule from Figure 9
-11:
-end if
-12:
-end while
-13:
-return the corresponding regular expression r
-14: end if
-
-rule is applied, the automaton necessarily satisfies the precondition of the
-corresponding rewrite rule. Now note the following.
-PROPOSITION 20. Let G be a proper generalized SOA. If G is not final and no
-rewrite rule applies to G, then at least one of the repair rules in Figure 9 applies
-to G.
-PROOF. Since G is proper, it recognizes at least one nonempty word. Clearly,
-this can only happen when src has a successor r distinct from sink. We distinguish two cases.
-—Either r has a successor s distinct from src, sink, and r. Clearly, REPAIR r? . s?
-is then applicable to G.
-—If r does not have such a successor s, then we claim that src has another
-successor t, distinct from src, sink, and r. Indeed, suppose for the purpose
-of contradiction that no such successor exists. Then, since every state in G
-participates in a walk from src to sink, either E(G) = {(src, r), (r, sink)}, or
-E(G) = {(src, r), (r, r), (r, sink)}. In the first case G is final, in the second we
-can rewrite G using ITERATION r + —a contradiction in both cases. As such,
-the claimed t exists. Then, since src ∈ PredG (r) ∩ PredG (t), REPAIR r + t is
-applicable to G.
-As such, we can always apply a repair rule if rewriting gets stuck in a
-nonfinal automaton, after which rewriting can continue.
-4.1 A Greedy Approach: RWR
-An outline of RWR (short for REWRITE with REPAIRS) is shown in Algorithm 3. Like
-REWRITE, it first checks whether its input G is equivalent to ∅ or ε. Otherwise,
-G is rewritten using the rewrite rules in Figures 4–6 until a final automaton is
-reached, arbitrarily selecting a repair rule when rewriting gets stuck. (In our
-implementation we prefer repairs that make small extensions to the language
-of the automaton over repairs that make larger extensions. In particular, we
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:24
-
-•
-
-G. J. Bex et al.
-
-first check whether there are r and s for which REPAIR r . s? can be applied. Then
-we check whether there are r and s for which REPAIR r? . s can be applied. Next,
-we check for REPAIR r + s and finally for REPAIR r? . s?.)
-Since the repair rules add edges to G, thereby increasing L(G), we may
-conclude the following theorem.
-THEOREM 21. For a SOA G, RWR always produces a SORE r with L(G) ⊆
-L(r). Moreover, if G is equivalent to a SORE, then L(G) = L(r).
-(The second statement follows by Theorem 16.) Combined with Proposition 9,
-we hence obtain the next corollary.
-COROLLARY 22. Let M be the composition of 2T-INF with RWR, that is, M(S) :=
-RWR(2T-INF(S)). Then M learns the class of SOREs from positive data.
-
-4.2 Exploring the Search Space: RWR2
-When rewriting gets stuck, RWR arbitrarily selects a repair rule (perhaps based
-on some ordering of the rules as in our implementation), and discards the others. It should be clear, however, that when different repair rules are applicable,
-one rule may have a smaller impact on the language of the automaton than
-another. For that reason we present in this section a different modification
-of REWRITE that, in contrast to RWR, tries the “best” ℓ repair rules when there
-are several candidates. Here, the “best” repair rules are those that add the
-least number of words to the language. Since an automaton defines an infinite
-language in general, it is of course impossible to take all added words into
-account. We therefore only consider the words up to a length n, where n is twice
-the number of alphabet symbols in the automaton. Formally, for a language L,
-let |L≤n| denote the number of words in L of length at most n. Moreover, say
-that generalized SOA H is a repair of generalized SOA G if H is obtained by
-applying a repair rule on G. Then the repairs of the current automaton G are
-ordered according to increasing values of | L(H)≤n|, and the best (i.e., first) ℓ
-among them are further investigated.
-The resulting algorithm, called RWR2 (an abbreviation of REWRITE with ℓ
-best RANKED REPAIRS) is shown in Algorithm 4. Like REWRITE, it first checks
-whether its input G is equivalent to ∅ or ε. Otherwise, RWR2 uses RWR2 -AUX to
-Algorithm 4. RWR2
-Input: SOA G
-Output: a SORE r such that L(G) ⊆ L(r) if G is not equivalent to a SORE, and L(G) =
-L(r) otherwise.
-1: if sink is not reachable from src in G then
-2:
-return ∅
-3: else if E(G) = {(src, sink)} then
-4:
-return ε
-5: else
-6:
-initialize the final automaton Hopt to recognize Σ(G)∗
-7:
-return the SORE corresponding to the final automaton computed by
-RWR2 -AUX(G, Hopt )
-8: end if
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:25
-
-Algorithm 5. RWR2 -AUX
-Input: generalized SOAs G and Hopt
-Output: final generalized SOA I such that L(G) ⊆ L(I) if G is not equivalent to a
-SORE, and L(G) = L(I) otherwise.
-1: while a rewrite rule from Figures 4–6 can be applied to G do
-2:
-perform the rewrite rule on G
-3: end while
-4: if G is final then
-5:
-return G
-6: else
-7:
-compute the set R of all possible repairs H of G
-8:
-sort R in increasing order by | L(H)≤n|
-9:
-for each of the min(ℓ, |R|) best repairs H do
-10:
-if | L(H)≤n| < | L(Hopt )≤n| then
-11:
-recursively compute H′ := RWR2 -AUX(H, Hopt )
-12:
-set Hopt := H′ if | L(H′)≤n| < | L(Hopt )≤n|
-13:
-end if
-14:
-end for
-15:
-return Hopt
-16: end if
-
-recursively rewrite and repair G until a final automaton is reached. During
-this recursion, Hopt is the best final generalized SOA found so far. Initially, on
-line 6 of RWR2 , Hopt is set to the final generalized SOA that accepts all words
-over alphabet symbols mentioned in G. RWR2 -AUX then rewrites G in lines 1–2
-until no more rewrite rule is applicable. If the resulting G is final then it is
-returned. Otherwise, RWR2 -AUX computes in line 6 all possible repairs H of G
-and orders them according to increasing values of | L(H)≤n|. The algorithm then
-recursively calls itself on the ℓ best ranked repairs in lines 8–10. The test in
-line 10 is an optimization: if the current repair is already worse than the best
-final generalized SOA Hopt computed so far in terms of language size, then
-further rewriting and repairing cannot yield a final generalized SOA that is
-better than Hopt . Lines 11 and 12 update Hopt when appropriate. Finally, Hopt
-is returned.
-Given its definition, it is clear that RWR2 results in regular expressions with
-a smaller language size for increasing values of ℓ, of course at the cost of
-increased computation time. In the experiments (Section 7.2) the trade-off between precision and computation time of RWR and RWR2 , for increasing values
-of ℓ, is investigated in more detail.
-4.3 Efficiently Computing the Language Size
-During its execution, RWR2 repeatedly needs to compute the language size of
-the possible repairs. This computation can actually be done quite efficiently
-for SOAs, as we show next. Of course, in general RWR2 needs to compute the
-language size also for generalized SOAs, not just ordinary SOAs. Our implementation first expands such generalized SOAs into an equivalent SOA using
-the Glushkov construction (similar to the ideas of the proof of Proposition 45
-in the online appendix that can be accessed in the ACM Digital Library), and
-then invokes the language size computation procedure explained next.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:26
-
-•
-
-G. J. Bex et al.
-
-Let |L=m| denote the number of words in L of length exactly m. Let G be a
-SOA; and assume that V(G) − {src, sink} = {a1 , . . . , an}. Then consider the n × n
-matrix D where for i, j ∈ {1, . . . , n}
-
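-$$D[i, j] = \begin{cases} 1 & \text{if } (a_i, a_j) \in E; \text{ and,} \\ 0 & \text{otherwise.} \end{cases}$$
-
-In addition, define the 1 × n and n × 1 matrices I and F, respectively, as follows:
-for i, j ∈ {1, . . . , n}
-
-$$I[1, j] = \begin{cases} 1 & \text{if } (src, a_j) \in E; \text{ and,} \\ 0 & \text{otherwise;} \end{cases}$$
-
-and
-
-$$F[i, 1] = \begin{cases} 1 & \text{if } (a_i, sink) \in E; \text{ and,} \\ 0 & \text{otherwise.} \end{cases}$$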
-
-The following lemma is straightforward to prove by induction on n using
-the fact that each walk from src to sink in G uniquely determines an accepted
-word. Let $D^m$ denote the m-times multiplication of D, with $D^0$ the unit matrix.
-LEMMA 23. Let m > 0 and let G be a SOA. Then $|L(G)^{=m}| = I \cdot D^{m-1} \cdot F$.
-
-Since for m = 0, we simply have $|L(G)^{=m}| = 1$ if (src, sink) ∈ E, and
-$|L(G)^{=m}| = 0$ otherwise, and since $|L(G)^{\le n}| = \sum_{m=0}^{n} |L(G)^{=m}|$, we can determine
-$|L(G)^{\le n}|$ by iteratively computing the matrices $D^1$ to $D^m$, and applying
-Lemma 23. This immediately gives the following corollary.
-COROLLARY 24. For each n > 0 and SOA G, $|L(G)^{\le n}|$ can be computed in
-time $O(n|G|^3)$.
-
-5. CORRECTION
-In the conference version of this article [Bex et al. 2006] we proposed a different set of rewrite and repair rules for transforming SOAs into SOREs. While
-those rewrite rules were claimed in Bex et al. [2006] to possess the analog of
-Proposition 19 (namely that they always produce a SORE equivalent to the
-input SOA, provided that such a SORE exists), this claim is false, as we will
-detail next. Readers unfamiliar with Bex et al. [2006] may freely skip this
-section without endangering comprehension of the rest of the article.
-To illustrate why the preceding claim is false, the rewrite rules of Bex et al.
-[2006] are given in Figure 10, where G∗ refers to the ε-closure of G, defined as
-follows.
-Definition 25. Let G = (V, E) be a generalized SOA. The ε-closure G∗ of G
-is the graph (V, E∗ ) where E∗ contains:
-—all edges of E;
-—all edges (r, r) with r = s+ or r = s+ ?;
-—all edges (r, s) for which there is a path from r to s in G that passes only
-through intermediate nodes t with ε ∈ L(t).
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:27
-
-Fig. 10. Set of rewrite rules introduced in the conference version of this article [Bex et al. 2006].
-
-Figure 11 shows a sequence of rewrite steps using these rules starting from
-the SOA recognizing (a + b)+ ? or, equivalently, (a? . b?)+ . Note that the second
-rewrite step, which introduces b?, causes the automaton to become disconnected: because a? ∈ PredG∗ (b) and sink ∈ SuccG∗ (b) − {b} it deletes (a?, sink)—
-the only edge linking src to sink. As such, the accepted language changes from
-L((a + b)+ ?) to ∅. This clearly illustrates that the OPTIONAL r? rule in Figure 10
-is unsound. For that reason, we have moved in this article to the new rewrite
-rules in Figures 4–6.
-It is peculiar, however, that we have extensively used the rewrite rules of
-Figure 10 together with the repair rules in Figure 13 in a prototype implementation but have never encountered a situation where:
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:28
-
-•
-
-G. J. Bex et al.
-
-Fig. 11. A problematic sequence of rewrite steps using the rules in Figure 10. The input SOA
-accepts the same language as (a+b)+ ?, or, equivalently (a? . b?)+ . Note that the automaton resulting
-from the second rewrite step is disconnected and hence accepts the empty language. Rewriting
-is therefore not sound.
-
-Fig. 12. A successful sequence of rewrite steps using the rules in Figure 10. The input SOA accepts
-the same language as (a + b)+ ?, or, equivalently (a? . b?)+ .
-
-—we obtained a SORE r that failed to accept at least all words in the input
-SOA G; or
-—we obtained a SORE r that accepted a strict superset of L(G) when G was
-equivalent to a SORE.
-We suspect that this behavior is due to the strict order in which we apply the
-rewrite rules in our implementation: first CONCATENATION, then DISJUNCTION,
-then SELF-LOOP, and finally OPTIONAL. To illustrate, Figure 12 shows a successful
-rewriting of the SOA accepting (a + b)+ ? under this order.
-The inference algorithm of Bex et al. [2006], which we shall call RWR0 in this
-article, is shown in Algorithm 6. It is based on the rewrite rules in Figure 10
-and the repair rules in Figure 13. The experiments in Section 7 indicate that
-RWR0 has no benefits over RWR and RWR2. Moreover, as we do not have a formal
-soundness and completeness proof showing that rewriting always produces a
-SORE equivalent to the input SOA (provided that such a SORE exists) under
-this order, it does not make much sense to consider RWR0 for the class of SOREs.
-In strong contrast, on the class of k-occurrence regular expressions (k > 1), RWR0
-can make a difference over RWR and RWR2 [Bex et al.]. So even without formal
-guarantees, RWR0 still has its merits.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:29
-
-Algorithm 6. RWR0
-Input: a SOA G
-Output: a SORE r
-1: if sink is not reachable from src in G then
-2:
-return ∅
-3: else if E(G) = {(src, sink)} then
-4:
-return ε
-5: else
-6:
-initialize done to false
-7:
-while not done do
-8:
-if a rewrite rule in Figure 10 is applicable then
-9:
-rewrite G, giving precedence to CONCATENATION, then DISJUNCTION, then SELF-LOOP, then OPTIONAL
-10:
-else if a repair rule in Figure 13 is applicable then
-11:
-repair G, giving precedence to ENABLE-DISJUNCTION, then ENABLE-OPTIONAL-1,
-then ENABLE-OPTIONAL-2
-12:
-else
-13:
-set done to true
-14:
-end if
-15:
-end while
-16:
-if G is final then
-17:
-return the corresponding regular expression r
-18:
-else
-19:
-return ∅
-20:
-end if
-21: end if
-
-6. INFERRING CHARES: CRX
-In this section, we present the algorithm CRX for the inference of chain regular
-expressions (CHAREs).
-Definition 26 (CHAREs ). The class of chain regular expressions consists of
-those SOREs of the form f1 · · · fn where every fi is a chain factor—an expression
-of the form (a1 + · · · + ak), (a1 + · · · + ak)?, (a1 + · · · + ak)+ , or, (a1 + · · · + ak)+ ? with
-k ≥ 1 and every ai is an alphabet symbol.
-For instance, the expression a(b+c)+ ?d+ (e + f )? is a CHARE, while (ab+c)+ ?
-and (a+ ? + b?)+ ? are not.
-Since each CHARE is a concatenation of alphabet-disjoint chain factors,
-every occurrence of an alphabet symbol in a word must be generated by the
-same chain factor in the target CHARE. The positional relationships between
-occurrences of alphabet symbols in a given sample then allow us to deduce
-which chain factors are present in the target CHARE, and how they are ordered.
-Example 27. Consider the sample S = {u, v, w} with u = abd, v = bcdee,
-and w = cade. Clearly a occurs before b in u, b occurs before c in v, and c occurs
-before a in w. In the target CHARE, therefore, a, b, and c must belong to the
-same chain factor which can only be (a + b + c)+ or (a + b + c)+ ?. Since one of
-{a, b, c} is present in every word of S, we choose (a + b + c)+ . Similarly, d and
-e form chain factors by themselves. Whereas d occurs once in every word in S,
-e can occur zero, one, or more times. Therefore, d is represented by the chain
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:30
-
-•
-
-G. J. Bex et al.
-
-Fig. 13. Repair rules accompanying the rewrite rules in Figure 10. These rules are a correction
-of the rules presented in Bex et al. [2006]. Repairs are tried in the order shown. In particular,
-ENABLE-OPTIONAL-2 is only applied if none of the other rules is applicable.
-
-factor d, while e is represented by the chain factor e+ ?. Since a, b, c always occur
-before d, which in turn always occurs before the e’s, the derived CHARE is then
-(a + b + c)+ de+ ?.
-So, in brief, CRX computes chain factors, orders them, and uses that order to
-generate a CHARE. Of course, the order of the chain factors is not necessarily
-linear. In that case, a linear order can be constructed by making the factors
-optional. Some care has to be taken, however, to generate factors that are
-disjunctions without repetitions.
-Definition 28. Let S be a sample. We denote by → S the partial preorder on
-Σ such that a → S b if, and only if, a immediately precedes b in some w ∈ S.
-(I.e., ab is a 2-gram of S.) We say that a occurs before b in S if a →∗S b, where
-→∗S is the reflexive and transitive closure of → S.
-For instance, Figure 14 illustrates → S when S = {abccde, cccad, bf egg,
-bf ehi}.
-Definition 29. Define a ≈ S b if a occurs before b in S and b occurs before a.
-That is, a ≈ S b if a →∗S b and b →∗S a.
-Clearly, ≈ S is an equivalence relation. Let ΓS denote the set of equivalence classes of ≈ S. In what follows, we denote such equivalence classes by, for
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:31
-
-Fig. 14. The partial preorder → S for S = {abccde, cccad, bf egg, bf ehi}.
-
-Fig. 15. The Hasse diagram HS of the sample S = {abccde, cccad, bf egg, bf ehi}. The corresponding
-partial preorder from which HS is derived is shown in Figure 14.
-
-example, [a1 , . . . , an]. As usual, an equivalence class of cardinality 1 is called a
-singleton.
-Definition 30. The Hasse diagram of S, denoted HS, is the graph over ΓS
-in which there is an edge from equivalence class [a1 , . . . , an] to class [b1 , . . . , bm]
-if: (1) [a1 , . . . , an] and [b1 , . . . , bm] are distinct and (2) there exists 1 ≤ i ≤ n and
-1 ≤ j ≤ m such that ai → S b j .
-For instance, the Hasse diagram of the sample S = {abccde, cccad, bf egg,
-bf ehi} is shown in Figure 15. The operation of CRX is then shown in Algorithm 7
-and illustrated in the following example.
-Example 31. Consider again the sample S = {abccde, cccad, bf egg, bf ehi}
-and its corresponding Hasse diagram in Figure 15. Since Pred HS ([d]) =
-Pred HS ([ f ]) and Succ HS ([d]) = Succ HS ([ f ]), line 3 applies to [d] and [ f ]. Although
-Pred HS ([g]) = Pred HS ([h]), step 2 cannot be applied as Succ HS ([g]) ≠ Succ HS ([h]).
-Similarly [g] and [i] share successors, that is, ∅, but have different predecessors.
-Hence, after the while loop in line 2 we obtain:
-
-A possible topological sort is [a, b, c], [d, f ], [e], [g], [h], [i]. Since at least one of
-a, b, and c occurs once or more in every string of W, r([a, b, c]) = (a + b + c)+ is
-the first factor; the second factor is (d + f ) since either d or f occurs exactly
-once; the factor derived from [e] is e? since W contains a string without e
-and similarly for those from [h] and [i]. Finally, g occurs multiple times in a
-single string. Hence the simple regular expression derived by the algorithm is
-(a + b + c)+ · (d + f ) · e? · g+ ? · h? · i? which completes step 6.
-Note that the order of the chain factors in the CHARE depends on the
-topological sort.
-THEOREM 32. Given a sample S, CRX computes a CHARE r such that S ⊆ L(r).
-
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:32
-
-•
-
-G. J. Bex et al.
-
-Algorithm 7. CRX
-Input: a sample S
-Output: a CHARE r such that S ⊆ L(r)
-1: Compute the set ΓS of equivalence classes of ≈ S
-2: while a maximal set of singleton nodes γ1 , . . . , γℓ such that Pred HS (γ1 ) = · · · =
-Pred HS (γℓ ) and Succ HS (γ1 ) = · · · = Succ HS (γℓ ) exists do
-3:
-Replace γ1 , . . . , γℓ by γ := γ1 ∪ · · · ∪ γℓ , and redirect all incoming and outgoing edges of
-the γi to γ in HS
-4: end while
-5: Compute a topological sort γ1 , . . . , γk of the nodes
-6: for all i ∈ {1, . . . , k} (γi = [a1 , . . . , an]) do
-7:
-if every w ∈ S contains exactly one occurrence of a symbol in {a1 , . . . , an} then
-8:
-r(γi ) := (a1 + · · · + an)
-9:
-else if every w ∈ S contains at most one occurrence of a symbol in {a1 , . . . , an}
-then
-10:
-r(γi ) := (a1 + · · · + an)?
-11:
-else if every w ∈ S contains at least one of a1 , . . . , an and there is a word that
-contains at least two occurrences of symbols then
-12:
-r(γi ) := (a1 + · · · + an)+
-13:
-else
-14:
-r(γi ) := (a1 + · · · + an)+ ?
-15:
-end if
-16:
-return r(γ1 ) . r(γ2 ) . · · · . r(γk)
-17: end for
-
-PROOF. The theorem follows almost immediately from the construction.
-Clearly, CRX always outputs a CHARE. Moreover, observe that after step 5
-the computed topological sort is consistent with the order of the symbols in the
-words in S. More precisely, there can not exist symbols a and b, such that a ∈ γi ,
-b ∈ γ j , i < j, and b →∗S a. Subsequently, for each γi a chain factor is chosen
-in such a manner that it is consistent with all words w ∈ S. As these factors
-are ordered consistently with the order of the symbols in S, this implies that
-S ⊆ L(r).
-Furthermore, on the class of CHAREs, CRX is complete.
-THEOREM 33. For each CHARE r there is a sample S such that L(r) = L(CRX(S)).
-
-PROOF. Denote by Sym(r) the set of alphabet symbols occurring in r. We also
-abuse notation and, for a sample S, write Sym(S) to denote the set of alphabet
-symbols occurring in S. Let r = f1 · · · fk be a CHARE, with each fi a chain
-factor. We construct the sample S such that the CRX(S) is syntactically equal to
-r, up to commutativity of +. The theorem then follows.
-Thereto, for every 1 ≤ i ≤ k, let wi be a word in L( fi ). We construct S by
-subsequently adding words to it. First, for all 1 ≤ i ≤ k − 1, a ∈ Sym( fi ),
-b ∈ Sym( fi+1 ), we add w1 · · · wi−1 abwi+2 · · · wk to S. Further, for all 1 ≤ i ≤ k,
-we add words to S, depending on the form of fi . Specifically, if fi is of the
-form:
-—(a1 + · · · + an), we add w1 · · · wi−1 a1 wi+1 · · · wk;
-—(a1 + · · · + an)?, we add w1 · · · wi−1 wi+1 · · · wk, and w1 · · · wi−1 a1 wi+1 · · · wk;
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:33
-
-—(a1 + · · · + an)+ , we add w1 · · · wi−1 a1 a1 wi+1 · · · wk;
-—(a1 + · · · + an)+ ?, we add w1 · · · wi−1 wi+1 · · · wk, and w1 · · · wi−1 a1 a1 wi+1 · · · wk.
-We now argue that given S, CRX indeed derives an expression syntactically
-equal to r. First observe that already before step 3, CRX computes k nodes γ1 to
-γk, which are linearly ordered, such that for each 1 ≤ i ≤ k, γi contains exactly
-the alphabet symbols contained in fi . Then, due to the number of occurrences
-of each symbol of the different chain factors, the algorithm will associate to
-each γi exactly the factor fi , and hence CRX(S) is syntactically equivalent to r,
-up to commutativity of +.
-From Theorems 32 and 33 it readily follows that we have the next corollary.
-COROLLARY 34.
-
-CRX learns the class of CHAREs from positive data.
-
-The experiments in Section 7.3 show that the number of words in S needed
-in practice is very small. Actually, the prime feature that makes CRX much
-more robust than RWR for very small datasets is its strong generalization ability. Indeed, consider an expression of the form (a1 + · · · + an)+ ?. While REWRITE
-requires all n² 2-grams of the form ai a j for i, j ∈ {1, . . . , n} to be present, RWR
-requires around (n² − n) 2-grams. For CRX, however, the set {ε, a1 a2 , a2 a3 , . . . ,
-an−1 an, ana1 } of size O(n) will suffice. This point is illustrated in practice
-by example3 and example4 in Table II where n has a value of 41 and 56,
-respectively. Experiments illustrate that only 400 ≪ 1682 and 500 ≪ 3136
-2-grams are needed by CRX to learn example3 and example4, respectively.
-The following theorem shows that CRX is optimal within the class of CHAREs
-when the partial order  S is in fact a linear order.
-THEOREM 35. For every sample S, if  S is a linear order then for every
-CHARE r such that S ⊆ L(r) and L(r) ⊆ L(CRX(S)), we have r = CRX(S), that is, r
-is syntactically equal to CRX(S) up to commutativity of +.
-PROOF. Assume that CRX(S) = f1 · · · fk and r = g1 · · · gl . Clearly,
-Sym(CRX(S)) = Sym(r) = Sym(S). We first argue that k = l. Thereto, assume
-for the purpose of contradiction that k < l. Then, there is a chain factor f in
-CRX(S) with a, b ∈ Sym( f ) and two chain factors g and g′ in r with a ∈ Sym(g)
-and b ∈ Sym(g′ ). We distinguish two cases.
-(1) If f is of the form (a1 + · · · + an) or (a1 + · · · + an)?, then L(r) ⊄ L(CRX(S)).
-(2) If f is of the form (a1 + · · · + an)+ ? or (a1 + · · · + an)+ , by construction and
-since  S is linearly ordered, there are words u1 , u2 ∈ S such that a →∗u1 b
-and b →∗u2 a. However, since a and b are in different chain factors of r,
-either u1 ∉ L(r) or u2 ∉ L(r), and hence S ⊄ L(r).
-Conversely, assume k > l. Then, there are chain factors f, f′ in CRX(S) with
-a ∈ Sym( f ) and b ∈ Sym( f′ ), and a chain factor g in r with a, b ∈ Sym(g). We
-again distinguish two cases.
-(1) If g is of the form (a1 + · · · + an)+ ? or (a1 + · · · + an)+ , then L(r) ⊄ L(CRX(S)).
-(2) If g is of the form (a1 +· · ·+an) or (a1 +· · ·+an)?, by construction and since  S
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:34
-
-•
-
-G. J. Bex et al.
-
-is linearly ordered, there are words u1 , . . . , um ∈ S, and symbols c1 , . . . , cm−1
-such that a →∗u1 c1 , cm →∗um b, and ci →ui+1 ci+1 , for all 1 ≤ i ≤ m − 1.
-However, due to the form of g, for at least one of these ui , ui ∉ L(r) must
-hold and hence S ⊄ L(r).
-Using the same kind of argument it can be shown that Sym( fi ) = Sym(gi ),
-for all 1 ≤ i ≤ k. Further, since L(r) ⊆ L(CRX(S)), for every 1 ≤ i ≤ k, we
-have L(gi ) ⊆ L( fi ). Since the different chain factors can only take a restricted
-numbers of forms, it now suffices to show that L(gi ) = L( fi ), for all i, to show that
-they are also syntactically equivalent. Hence, towards a contradiction, assume
-L(gi ) ⊊ L( fi ) for some 1 ≤ i ≤ k. This can only be the case if: (1) gi = (a1 +· · ·+an)
-and fi = (a1 + · · · + an); (2) gi = (a1 + · · · + an)+ ? and fi = (a1 + · · · + an)+ ; or
-(3) gi = (a1 + · · · an)? and fi is one of the three other forms. However, in each of
-these cases, given the construction of the algorithm, one can find a word w ∈ S
-such that w ∉ L(r). Hence, for all i, L( fi ) = L(gi ), and thus r = CRX(S).
-Note that this property does not hold when  S is not linear. For instance, on
-S = {abc, ade, abe} CRX yields a·b?·d?·c?·e? whereas the CHARE a·(b+d)·(c +e)
-is a better approximation of the target language.
-CRX can be efficiently executed on very large datasets by only maintaining
-HS and the multiplicities of occurrences of Σ-symbols in words in S (needed for
-lines 6–13). From this representation, lines 2–5 can be executed. Hence, it is
-not necessary that the entire sample resides in main memory. The complexity
-of the algorithm is O(m + n3 ), where m is the size of the sample and n the
-number of alphabet symbols.
-7. EXPERIMENTAL EVALUATION
-In this section we validate our approach by means of experimental analysis.
-Specifically, we assess the quality of the expressions returned by our algorithms
-on real-world corpora and DTDs, and compare it with the quality of expressions
-returned by XTRACT [Garofalakis et al. 2003] and Trang [Clark]. Next, we compare the quality of RWR0 (the algorithm found in the conference version of this
-article), RWR, and RWR2 . Subsequently, we investigate the performance of the algorithms on incomplete and noisy data. Finally, we discuss their running time
-performance. We abuse notation and simply write RWR for the application of
-2T-INF followed by RWR, similarly for RWR0 and RWR2 . All experiments were performed using a prototype implementation of our algorithms in Java executed
-on a 2.5 GHz Pentium 4 machine with 1GB of RAM.
-7.1 Real-World Examples
-The number of publicly available XML corpora is rather limited. We employed
-the XML Data repository maintained by Miklau [2002] as a testbed. Unfortunately, most of the corpora listed there are either very small, lack a DTD,
-or contain a DTD with only trivial regular expressions. Nevertheless, two of
-the listed corpora are interesting. Specifically, we compared XTRACT, RWR, and
-CRX on the Protein Sequence Database (683Mb in size) and the Mondial corpus
-[Miklau 2002], a database of information on various countries (1Mb in size).
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:35
-
-Table I. Results of RWR, CRX and XTRACT on DTDs and Sample Data from
-the Protein Description Database and the Mondial Corpora
-Element
-Original DTD
-Sample
-Result of CRX/ RWR
-size
-Result of XTRACT
-ProteinE.
-a1 a2 a3 a4 + ?a5 + ?a6 + ?a7 + ?a8 + ?a9 ?a10 ?a11 + ?a12 a13
-2458
-a1 a2 a3 a4 + a5 + ?a6 + ?a7 + ?a8 + ?a9 ?a10 ?a11 + ?a12 a13
-843
-an expression of 185 tokens
-organism
-a1 a2 ?a3 a4 ?a5 + ?
-9
-a1 a2 ?a3 a4 ?a5 + ?
-9
-a1 ((a2 a3 a4 ?+a3 a4 )a5 ?+a3 a5 + ?)
-reference
-a1 a2 + ?a3 + ?a4 + ?
-45
-a1 a2 + ?a3 + ?a4 + ?
-45
-a1 (a2 + ?(a4 + ?+a3 + ?)+a2 a3 + ?a4 a4 +a3 + ?a4 + ?)
-refinfo
-a1 a2 a3 ?a4 ?a5 a6 ?(a7 +a8 )?a9 ?
-10
-a1 a2 (a3 +a4 )?a5 a6 ?a7 ?a9 ?a8 ?
-10
-a1 a2 ((a3 a5 a6 a7 ?+a4 a5 )a9 ?+a5 (a7 +a8 )?+a4 a5 a8 )
-authors
-a1 + +(a2 a3 ?)
-54
-a1 + ?a2 ?a3 ? /
-a1 + +(a2 a3 )
-54
-a1 + ?+a2 a3
-accinfo
-a1 a2 + ?a3 + ?a4 ?a5 ?a6 ?a7 + ?
-124
-a1 a2 + ?a3 + a4 ?a5 ?a6 ?a7 + ?
-124
-an expression of 97 tokens
-genetics
-a1 + ?a2 ?a3 ?a4 ?a5 ?a6 ?a7 ?a8 ?a9 ?a10 ?a11 + ?a12 + ?
-219
-a1 + ?a2 ?a3 ?a4 ?a5 ?a6 ?a7 ?a8 ?a9 ?a10 ?a12 + ?
-219
-an expression of 329 tokens
-function
-a1 ?a2 + ?a3 + ?
-26
-a1 ?a2 + ?a3 + ?
-26
-(a1 (a2 ?a2 ?a3 + ?+a2 + ?(a3 a3 )+ ?+a2 a2 a2 a3 )+a2 (a2 a3 + ?+a3 + ?))
-city
-a1 a2 + ?a3 + ?
-9
-a1 a2 + ?a3 + ?
-9
-a1 (a2 + ?a3 a3 ?+a2 (a3 + ?+a2 ))?
-The left column gives element names, sample size for CRX/ RWR, and sample size for
-XTRACT, respectively. The right column lists original DTD, inferred DTD by CRX/ RWR,
-and the result of XTRACT, in that order.
-
-Since no real-world data could be obtained for SOREs that are not CHAREs,
-we generated our own XML data for a number of real-world DTDs considered
-in Bex et al. [2004] containing a number of sophisticated regular expressions
-outside the class of CHAREs.
-Real-world data. In this section, we only discuss RWR as RWR0 and RWR2 give
-precisely the same results. Table I lists all nontrivial element definitions2 in
-the aforementioned DTDs together with the results derived by the inference
-algorithms RWR, CRX, and XTRACT. It is interesting to note that only the regular
-expression for authors is not a CHARE. Moreover, no elements are repeated
-in any of the definitions. This should not come as a surprise given the observations discussed in the Introduction on the content models occurring in practice.
-The regular expression derived by the XTRACT algorithm is shown whenever
-it fitted the table, otherwise the number of tokens it consists of is listed. For
-better readability the actual output of XTRACT has been simplified by replacing
-expressions such as (ai + ε) by ai ?.
-2 It should be noted that the examples from the Mondial corpus are not valid according to their
-DTD, so for the city element only valid elements were used as training examples.
-
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:36
-
-•
-
-G. J. Bex et al.
-
-It can be verified that all regular expressions in Table I are learned quite
-satisfactorily by RWR and CRX with respect to the examples extracted from the
-XML corpus. The numbers in the first column refer to the size of the sample.
-RWR and CRX always produce the same result except for authors where CRX
-cannot derive the target expression as it is not a CHARE. We note that no
-sample was representative of its target expression. As such, RWR always had to
-apply repair rules. The expressions in the table indicate that the results of these
-repairs are satisfactory. For a few expressions, for instance, ProteinE(ntry),
-refinfo, and genetics, the expressions produced by CRX and RWR are more
-strict than the corresponding one in the DTD. This is due to the data present
-in the sample. For instance, for genetics, no a11 element occurs in the sample
-so it obviously cannot be part of the derived expression. The element refinfo
-illustrates that a3 and a4 are mutually exclusive in the sample and that a8 is
-never followed by a9 . Inspecting the original DTD illustrates the underlying
-semantics.
-authors, citation, volume?, month?, year,
-pages?, (title | description)?, xrefs?
-Indeed, volume is used in the context of a journal, while month is used for a
-conference publication. Apart from the authors element XTRACT either produces
-a suboptimal expression or no expression at all. For instance, XTRACT crashes on
-the ProteinE(ntry) sample due to excessive memory consumption (more than
-1GB of RAM). Reducing the size of the sample to approximately 800 unique
-words yields a complex expression of 185 tokens.
-Real-world regular expressions. Table II lists the results of the algorithms on
-a number of more sophisticated regular expressions extracted from real-world
-DTDs discussed in Bex et al. [2004]. Since no real-world data was available
-for those DTDs, we have randomly generated samples using ToXgene [Barbosa
-et al. 2002], taking care that all relevant examples were present to ensure
-the target expression could be learned. Again, we list the sample size in the
-first column. As some of these numbers might seem artificially large, we note
-that, for instance, the SOA corresponding to example3 already contains 1897
-edges. Hence, a random dataset of 5741 words is not unreasonably large. Note
-that only the first three expressions in Table II are SOREs, none of them
-is a CHARE. The table shows clearly that CRX yields fairly good and concise
-super-approximations to the original expressions. In some cases, the results
-produced by RWR are more precise. For XTRACT, the size of the sample had to be
-limited to 300–500 in order to avoid a crash. As can be seen from the table,
-XTRACT performed excellently on the first example, but failed to generate an
-expression that fitted the table in all other cases on all the sample sets we
-tried.
-Trang. We ran Trang [Clark] on the XML data discussed in this section.
-In all but one case, Trang produced exactly the same output as CRX, with a
-notable exception: for example1 Trang’s output depends on the order in which
-the examples are presented, yielding either a1 + ?a2 ?a3 + ? or a1 + + (a2 ?a3 + ). The
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:37
-
-Table II. Results of RWR, CRX and XTRACT on
-Nonsimple Real-World DTDs and Generated Data
-Original DTD
-Element
-Result of CRX
-Sample
-Result of RWR
-size
-Result of XTRACT
-example1
-a1 + + (a2 ?a3 + )
-48
-a1 + ?a2 ?a3 + ?
-48
-a1 + + (a2 ?a3 + )
-48
-a1 + ? + (a2 ?a3 + ?)
-example2
-(a1 a2 ?a3 ?)?a4 ?(a5 + · · · + a18 )+ ?
-2210
-a1 ?a2 ?a3 ?a4 ?(a5 + · · · + a18 )+ ?
-2210
-(a1 a2 ?a3 ?)?a4 ?(a5 + · · · + a18 )+ ?
-300
-an expression of 252 tokens
-example3
-a1 ?(a2 a3 ?)?(a4 + · · · + a44 )+ ?a45 +
-5741
-a1 ?a2 ?a3 ?(a4 + · · · + a44 )+ ?a45 +
-5741
-a1 ?(a2 a3 ?)?(a4 + · · · + a44 )+ ?a45 +
-400
-an expression of 142 tokens
-example4 a1 ?a2 a3 ?a4 ?(a5 + + ((a6 + · · · + a61 )+ a5 + ?))
-10000
-a1 ?a2 a3 ?a4 ?(a6 + · · · + a61 )+ ?a5 + ?
-10000
-a1 ?a2 a3 ?a4 ?(a6 + · · · + a61 )+ ?a5 + ?
-500
-an expression of 185 tokens
-+
-example5
-a1 (a2 + a3 )+ ?(a4 (a2 + a3 + a5 )+ ?) ?
-+
-1281
-a1 (a2 + a3 + a4 + a5 ) ?
-+
-1281
-a1 ((a2 + a3 + a4 )+ a5 + ?) ?
-500
-an expression of 85 tokens
-The left column gives element names, sample size for CRX,
-RWR and XTRACT, respectively. The right column lists original
-DTD, inferred DTD by CRX, by RWR and the result of XTRACT,
-in that order.
-
-former is the same output as CRX, the latter is the intended RE that cannot
-be derived by CRX as it is outside the class of CHAREs. This inconsistency in
-Trang’s output casts some doubt on its correctness and underscores the need
-for a formal model as the cornerstone of an implementation. Indeed, there is no
-article or manual available describing the machinery underlying Trang. A look
-at the Java-code indicates that Trang is related to, but different from, CRX: it
-uses 2T-INF to construct an automaton, eliminates cycles by merging all nodes
-in the same strongly connected component, and then transforms the obtained
-DAG into a regular expression. However, no target class of REs for which Trang
-is complete, as is the case for CRX, is specified. As Trang is similar to CRX, it is
-outperformed by RWR and RWR2 .
-7.2 RWR versus RWR2
-We tested the results and performance of RWR versus RWR2 for various values
-of the rank cut-off parameter ℓ. The SOAs used in this test were randomly
-generated with 5 and 10 alphabet symbols. The results are summarized in
-Table III(a). We computed the average language size of the SOAs, which is the
-target size. It should be noted that since no SORE corresponds to these SOAs,
-the target size can never be attained since the regular expression resulting
-from RWR or RWR2 will necessarily be a generalization of the SOA’s language.
-It is immediately clear from Table III(a) that results of RWR2 are on average
-better than those for RWR, and that they improve with increasing values of ℓ.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:38
-
-•
-
-G. J. Bex et al.
-Table III.
-(a)
-                 |Σ| = 5    |Σ| = 10
-target size      0.52       0.67
-RWR0             0.88       0.98
-RWR              0.80       0.96
-RWR2   ℓ = 1     0.76       0.95
-       ℓ = 2     0.73       0.92
-       ℓ = 3     0.725      0.916
-       ℓ = 4     0.722      0.911
-       ℓ = 5     0.721      0.908
-       ℓ = ∞     0.720      N/A
-
-(b)
-RWR2             |Σ| = 5    |Σ| = 10
-       ℓ = 1     28.8%      46.3%
-       ℓ = 2     7.6%       7.3%
-       ℓ = 3     3.2%       1.2%
-       ℓ = 4     1.3%       0.0%
-       ℓ = 5     0.7%       0.0%
-       ℓ = ∞     24.6%      N/A
-
-(a) Average language size for RWR and RWR2 for various values of
-ℓ. ℓ = ∞ denotes an exhaustive exploration of all possible repairs.
-(b) Percentage of target expressions for which RWR outperforms RWR2 .
-
-For expressions of alphabet size 5, we were able to consider all possible repairs,
-resulting in the entry for  = ∞ in Table III(a). This represents the smallest
-language that includes the SOA’s language and that can be expressed by a
-SORE.
-Of course, the results in Table III(a) are averaged over 1000 randomly chosen
-SOAs. A more detailed analysis reveals that for a considerable number of SOAs,
-2
-RWR actually outperforms RWR for  = 1. Table III(a) shows the number of
-2
-times RWR outperforms RWR for various values of . The probability that RWR
-outperforms RWR2 drops rapidly for increasing values of , especially for larger
-alphabet sizes. The last line in Table III(b) lists the probability that RWR derives
-the optimal result, that is, that the smallest language representable by a SORE
-is obtained for expressions of alphabet size 5.
-Although the RWR2 algorithm clearly outperforms RWR in terms of the language size of the derived expression, there is a compelling argument in the
-latter’s favor. In terms of running time, RWR outperforms RWR2 with a few orders of magnitude as is discussed in Section 7.5.
-7.3 Incomplete Data
-Unfortunately, in a real-world setting an available sample may simply contain
-too little information to learn the target regular expression. To formalize this,
-we introduce the notion of coverage.
-Definition 36. A sample S covers a deterministic automaton A if for every
-edge (s, t) in A there is a word w ∈ S whose unique accepting run in A traverses (s, t). Such a word w is called a witness for (s, t). A sample S covers a
-deterministic regular expression r if it covers the automaton obtained from S
-using the Glushkov construction for translating regular expressions into automata [Brüggeman-Klein 1993].
-If a sample S does not contain a witness for an edge, it may seem as if
-the target expression cannot be learned, even if it is a SORE since the SOA
-derived from the data has an edge missing. However, the repair rules introduce
-extra edges, so this part of the algorithm may actually alleviate the problem of
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:39
-
-Table IV. Percentage of
-Successfully Derived Expressions
-at Various Values of Sample
-Coverage for CRX, RWR0 , RWR and
-2
-1
-
-RWR
-
-coverage CRX RWR0 RWR RWR21
-25.0
-85% 56% 12% 73%
-35.0
-87% 48% 32% 73%
-45.0
-96% 60% 57% 74%
-55.0
-87% 58% 63% 57%
-65.0
-82% 48% 58% 59%
-75.0
-80% 51% 51% 63%
-85.0
-63% 48% 47% 53%
-92.5
-57% 48% 47% 61%
-97.5
-85% 74% 64% 73%
-100.0
-100% 100% 100% 100%
-
-incomplete data. This is indeed confirmed experimentally. It turns out that even
-with a substantial fraction of missing witnesses, the target regular expression
-can be learned with an astonishing degree of success. To quantify the missing
-information, we introduce the following definition:
-Definition 37. The coverage of a sample with respect to a target expression
-r is the ratio of the number of edges of the SOA derived from the sample and
-the SOA representing the target expression r.
-The tests were done on 100 real-world regular expressions of alphabet sizes
-up to 10, for 10 independently selected samples of varying coverage. The results are presented in Table IV. The straightforward CRX clearly outperforms all
-other algorithms, although this result should be approached with some caution:
-to give CRX a fair chance, the target expressions for this algorithm were limited
-to CHAREs, while the other algorithms were tested on general SOREs as well.
-Note that approximately 90% of real-world expressions are in fact CHAREs,
-hence its superior performance is not only due to simpler target expressions.
-The robustness of RWR21 is quite remarkable since it tends to derive more specific
-regular expressions than RWR0 and RWR. One would expect the generalization
-ability to decrease for algorithms that yield more specific results. This expectation is borne out when one compares RWR0 and RWR, however, RWR21 ’s greedy
-application of the repair rules seems to pay off in the context of incomplete data
-as well.
-7.4 Noise
-As already noted in the Introduction, real-world samples (such as XHTML)
-need not be valid with respect to its known schema. Errors crop up due to
-all sorts of circumstances. This underscores the need for a robust inference
-algorithm that can handle some noise in the input sample.
-Noise can come in several forms. To generate a noisy subsample, we modify
-the target expression either by replacing a symbol by a different one from the
-target’s expression, or by replacing it by a symbol that is not in the alphabet of
-the target expression. We than use the modified target expression to generate
-a complete sample. We define the noise level as follows.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:40
-
-•
-
-G. J. Bex et al.
-
-Definition 38. Given a target expression r, the noise level of a sample S is
-the ratio |S− L(r)|/|S|.
-Here we propose an approach to filter the sample S based on the probability
-of its words being generated by a probabilistic automaton, as we already used
-in previous work [Bex et al. 2008]. This probabilistic automaton has one state
-for each alphabet symbol, and the transition probabilities are computed using
-the Baum-Welsh algorithm [Rabiner 1989]. Given the probabilistic automaton,
-it is straightforward to compute the probability for each w ∈ S, so that one can
-rank the sample’s words. One expects words that contain noise, that is, that
-would be rejected by the target regular expression, to have low probability if
-their number is not excessively large compared to the sample’s size.
-To filter the sample, hoping to exclude those words that contain noise, we
-compute the mean μ and standard deviation σ of the sample’s probabilities. A
-string w ∈ S with probability P(w) is excluded if P(w) < μ − ασ . The factor α
-is a parameter of the algorithm. The filtered sample S is now used to derive
-a regular expression. It is of course possible that in the generation of S some
-words needed to derive the target expression were removed. Hence there is no
-guarantee that the derived regular expression will be an overapproximation of
-the target expression.
-Since it was shown in previous sections that RWR21 has the best overall performance, we focus solely on this algorithm in this section. In order to investigate
-how robust RWR21 is with respect to noise we applied the algorithm to samples S
-with increasing noise levels with a range of values for the cut-off α. We compute
-the precision and the recall for each individual expression and use the average
-values over many expressions to compute the F-value for a given noise level
-and cut-off so that the optimal cut-off point can be determined.
-To define precision and recall, consider the sample S = Svalid ∪ Sinvalid , where
-Svalid ⊆ S contains the words in S accepted by the target expression and Sinvalid
-contains the words in S not accepted by the target expression. A true positive is
-a word in Svalid that is accepted by the derived expression, while a false negative
-is a word in Svalid that is rejected by the derived expression. Similarly, a false
-positive is a word in Sinvalid that is accepted by the derived expression, while a
-true negative is a word in Sinvalid that is rejected by the derived expression. We
-denote by St.p. the set of true positives, by St.n. the set of true negatives, by Sf .p.
-the set of false positives, and by Sf .n. the set of false negatives.
-Definition 39. The precision p, recall r, and F-value of a derived regular
-expression on a sample S are given by
-p=
-
-|St.p. |
-,
-(|St.p. | + |Sf .p. |)
-
-r=
-
-|St.p. |
-,
-(|St.p. | + |Sf .n. |)
-
-F=
-
-2 pr
-.
-p+r
-
-Furthermore, we are interested in the fraction of derived regular expressions
-that is equivalent to the target expression.
-We average over 580 SOREs obtained from a corpus of real-world DTDs.
-The results are shown in Figure 16(a). From the F-value we can conclude
-that a cut-off value α F ≈ 0.7 yields the best balance between precision and
-recall. Figure 16(b) shows the fraction of derived regular expressions that is
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:41
-
-Fig. 16. (a) F-value as a function of the cut-off value α for noise levels of 0.01 (squares), 0.02
-(circles), and 0.05 (triangles). (b) Fraction of derived expressions equivalent to the target expression
-as a function of the cut-off value α for noise levels of 0.01 (squares), 0.02 (circles), and 0.05
-(triangles).
-
-equivalent to the target expression. For noise levels increasing from 0.01 to
-0.05, the F-value as well as the percentage of derived expressions equivalent
-to the target expression gradually decreases, as is to be expected. It should be
-noted that recall r < 1 implies that the language represented by the derived
-regular expression is not a superset of the target’s language. For the cut-off α F ,
-and a noise level of 0.01, approximately 16% of the derived regular expressions
-allow false negatives, while the value for a noise level of 0.05 is 15%. The fact
-that the derived expression is not a super-approximation may or may not be
-acceptable, depending on the application.
-Another interesting observation is that the number of derived expressions
-that is equivalent to the target expression increases beyond the cut-off value
-α F ; see Figure 16(b). For a noise level of 0.01, this trend continues up to
-cut-off values of αequiv. ≈ 0.3 where it reaches a maximum of approximately
-53%. However, at this value 20% of the derived regular expressions are not
-super-approximations to their target expressions. For α < αequiv. , the F-value
-decreases rapidly. For higher noise levels, the optimal cut-off value αequiv. is
-smaller, but since it is very unlikely that one knows the noise level, it is hard
-to take advantage of this fact by tuning αequiv. to a specific noise level. The
-overall best result will be obtained for αequiv. ≈ 0 for noise levels not exceeding
-0.05.
-It should be noted that for a noise level of 0.01 at αequiv. , out the 53% of derived
-regular expression that are equivalent to the target expression, about 7% is
-not covered by the sample. The latter illustrates once more the generalization
-ability of the algorithms RWR2 as was discussed in Section 7.3.
-7.5 Performance
-As mentioned previously, the one advantage RWR has over RWR2 is that the
-former’s running time is much lower than the latter’s. This is illustrated in
-Table V(a) for 1000 target expressions of alphabet size 10. It also shows the
-relative running time for RWR0 , illustrating that RWR outperforms both RWR0 and
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:42
-
-•
-
-G. J. Bex et al.
-Table V.
-(a)
-relative running time
-0
-RWR
-6 · 102
-2
-
-RWR
-
-1
-2
-3
-4
-5
-
-2 · 102
-2 · 103
-1 · 104
-4 · 104
-1 · 105
-
-(b)
-|| time (ms)
-5
-2
-10
-5
-15
-15
-20
-33
-50
-616
-100
-7562
-
-(a) Relative running times of RWR2 versus RWR for various
-values of . (b) Average running times in milliseconds for RWR
-as a function of alphabet size.
-
-2
-2
-RWR for any value of . However, it is interesting to note that RWR1 outperforms
-0
-RWR by a factor of 3, and derives more specific regular expressions, again
-illustrating the superiority of the new algorithms over RWR0 .
-
-The performance of RWR is excellent: on average it takes only ms to derive
-an expression of alphabet size 10. Table V(b) shows actual running times as a
-function of the target expressions’ alphabet size, averaged over 1000 random
-expressions of that alphabet size.
-With respect to the performance in terms of the number of examples, we
-showed in the conference version of this article that RWR0 ’s was adequate to
-deal with large datasets. Example4 with 61 symbols in Table II is derived from
-10000 example words in 7 seconds while CRX only needs 3.2 seconds. More
-typical expressions of about 10 symbols derived from a few hundred examples
-take approximately a second. These figures include the time to initialize a
-Java Virtual Machine while the tests are done on a 2.5 GHz P4 with 1GB
-of RAM. Given that RWR and RWR21 outperform RWR0 and the time required to
-start the virtual machine and parse the data is independent of the algorithm,
-our new algorithms are adequate as well. For instance, RWR derived a DTD
-for PubMed from 10000 articles with a total size of over 1.2GB in 264 seconds
-(again including the time needed for Java initialization and parsing of the XML
-data). Trang slightly outperforms CRX thanks to very efficient XML parsing. We
-did not make a detailed comparison with XTRACT for the reason that XTRACT
-cannot handle samples with more than 1000 words.
-8. EXTENSIONS
-Incremental computation. Especially in the setting of sparse data when over
-time more XML data gets generated, for instance, by answers to queries or
-results of calls to Web services, it is desirable to update an already generated
-schema based on the newly arrived XML data only. Such an approach is possible
-for both RWR and CRX: as both algorithms make use of an internal representation
-(automata or partial orders), we only need to update that representation. So, for
-every element name we store the corresponding internal graph representation,
-which is only quadratic in the number of different element names, and we can
-forget about the XML data that generated it. Actually, for CRX, to assign the
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:43
-
-qualifiers ?, + and ∗, we also need to remember for each element name how
-it occurs (always exactly once, always more than once, . . . ), but this is only a
-constant amount of information.
-Numerical predicates. An immediate drawback of SOREs is that they cannot count. For instance, they cannot express aabb+ specifying that a string
-should start with two a’s followed by any number of b’s larger than 1. XML
-Schema even uses dedicated attributes for expressing the desired number of
-repetitions.
-<xs:sequence>
-<xs:element name="a" minOccurs=2 maxOccurs=2/>
-<xs:element name="b" minOccurs=2 maxOccurs="unbounded"/>
-</xs:sequence>
-
-In the same way, REs can be extended by numerical predicates: when r is
-an RE and i is a natural number then r ≥i and r =i are also REs. They are
-semantically equivalent to r i r ∗ and r i , respectively, where r i = r · r · · · · · r (i
-times). The preceding expression can then be expressed as a=2 b≥2 . To both RWR
-and CRX a post-processing step can be added that rewrites + and ∗ to numerical
-values based on exact occurrences of element names in the XML data.
-Generation of XSDs. While the inference of DTDs essentially reduces to the
-inference of regular expressions from sets of sample words (as illustrated in
-Section 1.1), the inference of XSDs is much more complex.
-Indeed, first and foremost, the content model of an element can only depend
-on the element’s name in a DTD. XML Schema, in contrast, has a typing
-mechanism that allows the content model of an element to depend not only on
-its name, but also on the context in which it is used. We refer the interested
-reader to Martens et al. [2006, 2007] for an in-depth discussion on the XML
-Schema typing mechanism and the extra expressive power that it provides with
-respect to DTDs. It is important to note, however, that the study of Martens
-et al. [2006] also shows that 85% of XSDs in practice does not use this additional
-power, and are hence structurally equivalent to a DTD. Obviously, inferring
-such XSDs is merely a matter of using the correct syntax. How to extend
-schema inference to deal with real XSDs that do use the additional power of
-the XML Schema typing system is studied in a companion article [Bex et al.
-2007].
-Second, DTDs have essentially only one atomic data type to describe the
-textual data found in XML documents: #PCDATA. XML Schema, in contrast, has
-atomic data types for numbers, strings, dates, etc. The algorithms described
-here can easily be extended with heuristics to recognize these atomic data
-types, such as the ones described by Hegewald et al. [2006].
-Inference of k-OREs. As the vast majority of expressions used in practical
-schemas are SOREs, we focused in this article on the inference of SOREs. In
-a companion article [Bex et al. 2008] we study the derivation of k-OREs, for
-small values of k, thus covering virtually all expressions occurring in practice.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:44
-
-•
-
-G. J. Bex et al.
-
-9. CONCLUSION
-We introduced novel algorithms for the inference of concise regular expressions
-from positive data. For the inference of SOREs, RWR2 was shown to yield the best
-experimental results. It is also quite robust when presented with incomplete
-and noisy data. The quality of inferred expressions on real-world and synthetic
-datasets outperforms those returned by XTRACT where CRX is similar to Trang.
-CRX’ generalization ability makes it highly qualified in dealing with very small
-datasets. Further, RWR, RWR2 , and CRX always infer succinct expressions by definition which can easily be interpreted by humans. Of independent interest, we
-introduced a new algorithm to transform automata into short, readable regular
-expressions.
-ELECTRONIC APPENDIX
-The electronic appendix for this article can be accessed in the ACM Digital
-Library.
-ACKNOWLEDGMENTS
-
-We thank the authors of Garofalakis et al. [2003] for making available
-XTRACT’s source code, as well as Wouter Gelade for comments on a previous draft of this article.
-REFERENCES
-ABITEBOUL, S., BUNEMAN, P., AND SUCIU, D. 1999. Data on the Web. Morgan Kaufmann Publishers.
-AHONEN, H. 1996. Generating grammars for structured documents using grammatical inference methods. Ph.D. thesis, Report A-1996-4. Department of Computer Science, University of
-Helsinki.
-ANGLUIN, D. AND SMITH, C. H. 1983. Inductive inference: Theory and methods. ACM Comput.
-Surv. 15, 3, 237–269.
-BARBOSA, D., MENDELZON, A. O., KEENLEYSIDE, J., AND LYONS, K. A. 2002. ToXgene: An extensible
-template-based data generator for XML. In Proceedings of the 5th International Workshop on the
-Web and Databases (WebDB 2002). 49–54.
-BARBOSA, D., MIGNET, L., AND VELTRI, P. 2006. Studying the XML web: Gathering statistics from
-an XML sample. World Wide Web 9, 2, 187–212.
-BENEDIKT, M., FAN, W., AND GEERTS, F. 2008. XPath satisfiability in the presence of DTDs. J.
-ACM 55, 2, 1–79.
-BERNSTEIN, P. A. 2003. Applying model management to classical meta data problems. In Online
-Proceedings of the 1st Biennal Conference on Innovative Data Systems Research (CIDR’03).
-BEX, G. J., GELADE, W., NEVEN, F., AND VANSUMMEREN, S. Learning deterministic regular expressions
-for the inference of schemas from XML data. http://arxiv.org/abs/1004.2372.
-BEX, G. J., GELADE, W., NEVEN, F., AND VANSUMMEREN, S. 2008. Learning deterministic regular
-expressions for the inference of schemas from XML data. In Proceeding of the 17th International
-Conference on World Wide Web (WWW’08). 825–834.
-BEX, G. J., NEVEN, F., AND DEN BUSSCHE, J. V. 2004. DTDs versus XML Schema: A practical study.
-In Proceedings of the International Workshop on Web and Database (WebDB). S. Amer-Yahia and
-L. Gravano, Eds. 79–84.
-BEX, G. J., NEVEN, F., SCHWENTICK, T., AND TUYLS, K. 2006. Inference of concise DTDs from XML
-data. In Proceedings of the International Conference on Database Theory (VLDB). U. Dayal, K.-Y.
-Whang, D. B. Lomet, G. Alonso, G. M. Lohman, M. L. Kersten, S. K. Cha, and Y.-K. Kim, Eds.
-ACM, 115–126.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:45
-
-BEX, G. J., NEVEN, F., AND VANSUMMEREN, S. 2007. Inferring XML schema definitions from XML
-data. In Proceedings of the 33rd International Conference on Very Large Data Bases (VLDB’07).
-998–1009.
-BRĀZMA, A. 1993. Efficient identification of regular expressions from representative examples. In
-Proceedings of the 6th Annual Conference on Computational Learning Theory (COLT’93). ACM
-Press, 236–242.
-BRÜGGEMAN-KLEIN, A. 1993. Regular expressions into finite automata. Theor. Comput. Sci. 120, 2,
-197–213.
-BRÜGGEMANN-KLEIN, A. AND WOOD, D. 1998. One-Unambiguous regular languages. Inform. Comput. 140, 2, 229–253.
-BUNEMAN, P., DAVIDSON, S. B., FERNANDEZ, M. F., AND SUCIU, D. 1997. Adding structure to unstructured data. In Proceedings of the International Conference on Database Theory (ICDT’97).
-Lecture Notes in Computer Science, vol. 1186. Springer, 336–350.
-CARON, P. AND ZIADI, D. 2000. Characterization of Glushkov automata. Theor. Comput. Sci. 233, 1–
-2, 75–90.
-Castor. The Castor project. www.castor.org.
-CHIDLOVSKII, B. 2001. Schema extraction from XML: A grammatical inference approach. In
-Proceedings of the 8th International Workshop on Knowledge Representation meets Databases
-(KRDB’01). CEUR Workshop Proceedings, vol. 45.
-CLARK,
-J.
-Trang:
-Multi-Format
-schema
-converter
-based
-on
-RELAX
-NG.
-www.thaiopensource.com/relaxng/trang.html.
-COVER, R. 2003. The Cover Pages. xml.coverpages.org.
-DELGADO, M. AND MORAIS, J. 2004. Approximation to the smallest regular expression for a given
-regular language. In Proceedings of the, 9th International Conference on Implementation and
-Application of Automata. Lecture Notes in Computer Science, vol. 3317. Springer, 312–314.
-DEUTSCH, A., FERNANDEZ, M. F., AND SUCIU, D. 1999. Storing semistructured data with STORED.
-In Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM
-Press, 431–442.
-EHRENFEUCHT, A. AND ZEIGER, P. 1976. Complexity measures for regular expressions. J. Comput.
-Syst. Sci. 12, 134–146.
-FERNANDEZ, M. F. AND SUCIU, D. 1998. Optimizing regular path expressions using graph schemas.
-In Proceedings of the 14th International Conference on Data Engineering (ICDE’98). 14–
-23.
-FERNAU, H. 2004. Extracting minimum length document type definitions is NP-hard. In Proceedings of the 7th International Colloquium on Grammatical Inference: Algorithms and Applications.
-Lecture Notes in Artificial Intelligence, vol. 3264. Springer, 277–278.
-FERNAU, H. 2009. Algorithms for learning regular expressions from positive data. Inform. Comput. 207, 4, 521–541.
-FLORESCU, D. 2005. Managing semi-structured data. ACMQueue 3, 8, 18–24.
-GARCÍA, P. AND VIDAL, E. 1990. Inference of k-testable languages in the strict sense and application
-to syntactic pattern recognition. IEEE Trans. Patt. Anal. Mach. Intell. 12, 9, 920–925.
-GAROFALAKIS, M., GIONIS, A., RASTOGI, R., SESHADRI, S., AND SHIM, K. 2003. XTRACT: Learning
-document type descriptors from XML document collections. Data Mining Knowl. Discov. 7, 23–
-56.
-GELADE, W. AND NEVEN, F. 2008. Succinctness of the complement and intersection of regular
-expressions. In Proceedings of the 25th Annual Symposium on Theoretical Aspects of Computer
-Science (STACS’08). Dagstuhl Seminar Proceedings, vol. 08001. 325–336.
-GOLD, E. 1967. Language identification in the limit. Inform. Control 10, 5, 447–474.
-GOLDMAN, R. AND WIDOM, J. 1997. DataGuides: Enabling query formulation and optimization in
-semistructured databases. In Proceedings of the 23rd International Conference on Very Large
-Data Bases (VLDB’97). 436–445.
-GRUBER, H. AND HOLZER, M. 2008. Finite automata, digraph connectivity, and regular expression size. In Proceedings of the 35th International Colloquium on Automata, Languages and
-Programming. Lecture Notes in Computer Science, vol. 5126. Springer, 39–50.
-HAN, Y.-S. AND WOOD, D. 2007. Obtaining shorter regular expressions from finite-state automata.
-Theor. Comput. Sci. 370, 1–3, 110–120.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-11:46
-
-•
-
-G. J. Bex et al.
-
-HEGEWALD, J., NAUMANN, F., AND WEIS, M. 2006. XStruct: Efficient schema extraction from multiple and large XML documents. In Proceedings of the 22nd International Conference on Data
-Engineering Workshops (ICDEW’06). IEEE Computer Society, 81–97.
-HINKELMAN, S. 2005. Business integration—Information conformance statements (BI-ICS). Tech.
-rep., IBM DeveloperWorks.
-HOPCROFT, J. AND ULLMAN, J. 1979. Introduction to Automata Theory, Languages and computation.
-Addison-Wesley.
-HUET, G. 1980. Confluent reductions: Abstract properties and applications to term rewriting
-systems. J. ACM 27, 4, 797–821.
-KOCH, C., SCHERZINGER, S., SCHWEIKARDT, N., AND STEGMAIER, B. 2004. Schema-Based scheduling of
-event processors and buffer minimization for queries on structured data streams. In Proceedings
-of the 30th International Conference on Very Large Data Bases (VLDB’04). 228–239.
-MANOLESCU, I., FLORESCU, D., AND KOSSMANN, D. 2001. Answering XML queries on heterogeneous data sources. In Proceedings of 27th International Conference on Very Large Data Bases
-(VLDB’01). 241–250.
-MARTENS, W., NEVEN, F., AND SCHWENTICK, T. 2007. Simple off the shelf abstractions for XML
-schema. SIGMOD Rec. 36, 3, 15–22.
-MARTENS, W., NEVEN, F., SCHWENTICK, T., AND BEX, G. J. 2006. Expressiveness and complexity of
-XML schema. ACM Trans. Data. Syst. 31, 3.
-MCHUGH, J., ABITEBOUL, S., GOLDMAN, R., QUASS, D., AND WIDOM, J. 1997. Lore: A database management system for semistructured data. SIGMOD Rec. 26, 3, 54–66.
-MELNIK, S. 2004. Generic model management: Concepts and algorithms. Ph.D. thesis, University
-of Leipzig.
-MIGNET, L., BARBOSA, D., AND VELTRI, P. 2003. The XML web: A first study. In Proceedings of the
-12th International World Wide Web Conference. 500–510.
-MIKLAU, G. 2002. XMLData repository. www.cs.washington.edu/research/xmldatasets.
-MIN, J.-K., AHN, J.-Y., AND CHUNG, C.-W. 2003. Efficient extraction of schemas for XML documents.
-Inform. Process. Lett. 85, 1, 7–12.
-NESTOROV, S., ABITEBOUL, S., AND MOTWANI, R. 1998. Extracting schema from semistructured data.
-In Proceedings of the ACM SIGMOD International Conference on Management of Data. ACM
-Press, 295–306.
-NESTOROV, S., ULLMAN, J. D., WIENER, J. L., AND CHAWATHE, S. S. 1997. Representative objects: Concise representations of semistructured, hierarchial data. In Proceedings of the 13th International
-Conference on Data Engineering. IEEE Computer Society, 79–90.
-NEVEN, F. AND SCHWENTICK, T. 2006. On the complexity of XPath containment in the presence of
-disjunction, DTDs, and variables. Logical Methods Comput. Sci. 2, 3.
-NGU, A. H. H., ROCCO, D., CRITCHLOW, T., AND BUTTLER, D. 2005. Automatic discovery and inferencing of complex bioinformatics web interfaces. World Wide Web 8, 4, 463–493.
-OAKS, P. AND TER HOFSTEDE, A. H. M. 2007. Guided interaction: A mechanism to enable ad hoc
-service interaction. Inform. Syst. Frontiers 9, 1, 29–51.
-OHLEBUSCH, E. 2001. Implementing conditional term rewriting by graph rewriting. Theor. Comput. Sci. 262, 1, 311–331.
-OPEN WEB APPLICATION SECURITY PROJECT CONSORTIUM. 2004. The top ten most critical web application security vulnerabilities—2004 update. www.owasp.org.
-PITT, L. 1989. Inductive inference, DFAs, and computational complexity. In Proceedings of the
-International Workshop on Analogical and Inductive Inference (AII’89). Springer-Verlag, 18–
-44.
-RABINER, L. 1989. A tutorial on hidden Markov models and selected applications in speech
-recognition. Proc. IEEE 77, 2, 257–286.
-RAHM, E. AND BERNSTEIN, P. A. 2001. A survey of approaches to automatic schema matching.
-VLDB J. 10, 4, 334–350.
-SAHUGUET, A. 2000. Everything you ever wanted to know about DTDs, but were afraid to ask
-(extended abstract). In Proceedings of the 3rd International Workshop on The World Wide Web
-and Databases, (WebDB’00), Selected Papers. 171–183.
-SAKAKIBARA, Y. 1997. Recent advances of grammatical inference. Theor. Comput. Sci. 185, 1,
-15–45.
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-Inference of Concise Regular Expressions and DTDs
-
-•
-
-11:47
-
-SANKEY, J. AND WONG, R. K. 2001. Structural inference for semistructured data. In Proceedings of
-the International Conference on Information and Knowledge Management. ACM Press, 159–166.
-Sun. Sun JAXB. java.sun.com/webservices/jaxb.
-THOMPSON, H. S., BEECH, D., MALONEY, M., AND MENDELSOHN, N. 2004. XML Schema part 1: Structures 2nd Ed. World Wide Web Consortium, Recommendation REC-xmlschema-1-20041028.
-W3C. 2002. XHTML 1.0 The Extensible HyperText Markup Language, 2nd Ed. W3C.
-WANG, G., LIU, M., YU, J. X., SUN, B., YU, G., LV, J., AND LU, H. 2003. Effective schema-based XML
-query optimization techniques. In Proceedings of the 7th International Database Engineering
-and Applications Symposium. 230–235.
-Received January 2009; revised July 2009; accepted November 2009
-
-ACM Transactions on Database Systems, Vol. 35, No. 2, Article 11, Publication date: April 2010.
-
-
\ No newline at end of file
diff --git a/tests/test_bex.py b/tests/test_bex.py
index ad62471..a00eabe 100644
--- a/tests/test_bex.py
+++ b/tests/test_bex.py
@@ -1,8 +1,5 @@
 """Tests for BEX paper algorithm implementations."""
 
-import sys
-sys.path.insert(0, '/home/tobi/Desktop/kesai/ProjectManagement/companyweb')
-
 from bex.soa import SOA
 from bex.twotinf import build_soa
 from bex.rwr0 import rwr0
@@ -273,7 +270,7 @@ def run_all():
 
 # ── Integration tests with real Ansible task data ──
 
-def test_integration_quartz_deploy():
+def test_integration_linear_sequence():
     """Simple linear sequence — all tasks always in same order."""
     seqs = [
         ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'],
@@ -283,11 +280,11 @@ def test_integration_quartz_deploy():
     result = crx.infer(seqs)
     assert result is not None
     assert all(t in result for t in ['file', 'template', 'docker_image', 'command', 'set_fact', 'shell', 'wait_for'])
-    print(f"  PASS quartz_deploy: {result}")
+    print(f"  PASS linear_sequence: {result}")
 
 
-def test_integration_validate_system():
-    """Optional shell tasks."""
+def test_integration_optional_tasks():
+    """Optional repetitions — the shell/debug pair recurs a varying number of times."""
     seqs = [
         ['shell', 'debug', 'shell', 'debug'],
         ['shell', 'debug', 'shell', 'debug', 'shell', 'debug'],
@@ -297,11 +294,11 @@ def test_integration_validate_system():
     result = crx.infer(seqs)
     assert result is not None
     assert 'shell' in result and 'debug' in result
-    print(f"  PASS validate_system: {result}")
+    print(f"  PASS optional_tasks: {result}")
 
 
-def test_integration_docker_detect_branch():
-    """Branching: docker compose v2 check or v1 fallback."""
+def test_integration_branching_paths():
+    """Branching: sequences differ at one step (command_v2 vs. command_v1), then continue identically."""
     seqs = [
         ['file', 'template', 'command_v2', 'set_fact', 'shell', 'wait_for'],
         ['file', 'template', 'command_v1', 'set_fact', 'shell', 'wait_for'],
@@ -310,11 +307,11 @@ def test_integration_docker_detect_branch():
     result = crx.infer(seqs)
     assert result is not None
     assert 'file' in result and 'template' in result and 'shell' in result
-    print(f"  PASS docker_detect: {result}")
+    print(f"  PASS branching_paths: {result}")
 
 
-def test_integration_firewall_gating():
-    """Conditional firewall rule sequence (gated)."""
+def test_integration_conditional_tasks():
+    """Conditional tasks: command_fw steps are present in some sequences and absent in others."""
     seqs = [
         ['assert', 'file', 'template', 'shell', 'wait_for'],
         ['assert', 'file', 'template', 'command_fw', 'command_fw', 'shell', 'wait_for'],
@@ -324,7 +321,7 @@ def test_integration_firewall_gating():
     result = crx.infer(seqs)
     assert result is not None
     assert 'assert' in result and 'file' in result
-    print(f"  PASS firewall_gating: {result}")
+    print(f"  PASS conditional_tasks: {result}")
 
 
 def test_integration_idregex_linear():
@@ -361,8 +358,8 @@ def test_integration_ikoa_linear():
     print(f"  PASS ikoa_linear: {expr}")
 
 
-def test_integration_backup_restic():
-    """Sequence with loop (systemd enable)."""
+def test_integration_looping_tasks():
+    """Sequence with loop (repeated tasks)."""
     seqs = [
         ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd', 'systemd', 'systemd'],
         ['package', 'assert', 'file', 'template', 'template', 'template', 'template', 'template', 'template', 'systemd'],
@@ -370,7 +367,7 @@ def test_integration_backup_restic():
     crx = CRX()
     result = crx.infer(seqs)
     assert result is not None
-    print(f"  PASS backup_restic: {result}")
+    print(f"  PASS looping_tasks: {result}")
 
 
 def run_all():