chore: ignore examples/

2026-07-01 15:40:02 +02:00 · 2026-07-01 15:40:02 +02:00 · a70024397f
commit a70024397f
parent 929a50c95d
2 changed files with 1 additions and 239 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ venv/
 *.egg-info/
 dist/
 build/
+examples/
--- a/examples/readme_analysis.py
+++ b/examples/readme_analysis.py
@@ -1,239 +0,0 @@
-"""
-README Structure Analysis — infer the conventional heading structure of
-top GitHub repositories using Dervish grammar inference.
-"""
-
-import re
-import sys
-import time
-import json
-import requests
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from bex.ensemble import infer_ensemble, _matches
-
-# ── Synonym normalization map ──
-NORMALIZE = {
-    'description': 'description',
-    'overview': 'description',
-    'about': 'description',
-    'introduction': 'description',
-    'getting started': 'getting-started',
-    'quick start': 'getting-started',
-    'quickstart': 'getting-started',
-    'installation': 'installation',
-    'install': 'installation',
-    'setup': 'installation',
-    'usage': 'usage',
-    'how to use': 'usage',
-    'examples': 'usage',
-    'example': 'usage',
-    'api': 'api',
-    'api reference': 'api',
-    'api documentation': 'api',
-    'documentation': 'api',
-    'features': 'features',
-    'configuration': 'configuration',
-    'config': 'configuration',
-    'contributing': 'contributing',
-    'development': 'contributing',
-    'building': 'contributing',
-    'build': 'contributing',
-    'license': 'license',
-    'changelog': 'changelog',
-    'faq': 'faq',
-    'frequently asked questions': 'faq',
-    'support': 'support',
-    'screenshots': 'screenshots',
-    'demo': 'screenshots',
-    'tests': 'testing',
-    'testing': 'testing',
-    'badges': 'badges',
-    'acknowledgments': 'acknowledgments',
-    'acknowledgements': 'acknowledgments',
-    'credits': 'acknowledgments',
-    'roadmap': 'roadmap',
-    'related projects': 'related',
-    'see also': 'related',
-}
-
-def normalize_heading(text):
-    """Normalize a heading to a canonical name, or return the raw slug."""
-    t = text.strip().lower()
-    t = re.sub(r'[^a-z0-9 ]', '', t)
-    t = re.sub(r'\s+', ' ', t).strip()
-    return NORMALIZE.get(t, t)
-
-def fetch_top_repos(n=100, min_stars=5000):
-    """Fetch top N repos by stars from GitHub search API."""
-    repos = []
-    page = 1
-    headers = {'Accept': 'application/vnd.github.v3+json'}
-    per_page = min(n, 100)
-
-    while len(repos) < n:
-        url = (
-            f'https://api.github.com/search/repositories'
-            f'?q=stars:>{min_stars}&sort=stars&order=desc'
-            f'&per_page={per_page}&page={page}'
-        )
-        resp = requests.get(url, headers=headers)
-        if resp.status_code == 403:
-            print("  Rate limited. Sleeping 60s...")
-            time.sleep(60)
-            continue
-        if resp.status_code != 200:
-            print(f"  API error {resp.status_code}: {resp.text[:200]}")
-            break
-        data = resp.json()
-        items = data.get('items', [])
-        if not items:
-            break
-        for r in items:
-            repos.append({
-                'full_name': r['full_name'],
-                'stars': r['stargazers_count'],
-                'default_branch': r.get('default_branch', 'main'),
-                'description': r.get('description', ''),
-                'language': r.get('language', ''),
-            })
-        print(f"  Page {page}: got {len(items)} repos (total {len(repos)})")
-        page += 1
-        # Small delay to avoid secondary rate limits
-        time.sleep(0.5)
-        if len(repos) >= n:
-            break
-
-    return repos[:n]
-
-def fetch_readme(repo):
-    """Fetch README content from a GitHub repo. Tries main, master, and common variants."""
-    branches = [repo['default_branch'], 'main', 'master']
-    attempted = set()
-
-    for branch in branches:
-        if branch in attempted:
-            continue
-        attempted.add(branch)
-        for path in ['README.md', 'readme.md', 'README.markdown', 'README.rst']:
-            url = f'https://raw.githubusercontent.com/{repo["full_name"]}/{branch}/{path}'
-            try:
-                resp = requests.get(url, timeout=10)
-                if resp.status_code == 200:
-                    return resp.text, path
-            except:
-                pass
-    return None, None
-
-def extract_headings(text):
-    """Extract heading sequence from markdown text.
-    Returns list of (level, text) tuples, e.g. [(1, "Title"), (2, "Installation"), ...]
-    """
-    headings = []
-    for line in text.splitlines():
-        m = re.match(r'^(#{1,6})\s+(.+)$', line.strip())
-        if m:
-            level = len(m.group(1))
-            text = m.group(2).strip()
-            # Remove trailing `#` characters (common in some markdowns)
-            text = re.sub(r'\s+#+\s*$', '', text).strip()
-            headings.append((level, text))
-    return headings
-
-def compress_headings(headings):
-    """Convert heading sequence to our symbol vocabulary.
-    H1 becomes just the section key; H2+ include their parent context.
-    """
-    # For simplicity: treat all headings as symbols, normalized.
-    # H1 = title (always present, strip it)
-    # Return list of normalized H2+ heading texts
-    seq = []
-    seen_h1 = False
-    for level, text in headings:
-        if level == 1 and not seen_h1:
-            seen_h1 = True
-            continue  # skip the title
-        norm = normalize_heading(text)
-        if norm:
-            seq.append(norm)
-    return seq
-
-def main():
-    print("=" * 60)
-    print("README Structure Analysis")
-    print("=" * 60)
-
-    # Step 1: Fetch top repos
-    print("\n[1] Fetching top repos from GitHub...")
-    repos = fetch_top_repos(n=100)
-    print(f"  Got {len(repos)} repos")
-
-    # Step 2: Fetch READMEs
-    print("\n[2] Fetching READMEs...")
-    sequences = []
-    failed = 0
-    for i, repo in enumerate(repos, 1):
-        raw_text, path = fetch_readme(repo)
-        if raw_text is None:
-            failed += 1
-            continue
-        headings = extract_headings(raw_text)
-        seq = compress_headings(headings)
-        if len(seq) >= 3:  # need at least a few sections
-            sequences.append(seq)
-        if i % 20 == 0:
-            print(f"  {i}/{len(repos)}: {len(sequences)} valid, {failed} failed")
-
-    print(f"  Total: {len(sequences)} valid sequences, {failed} failed")
-
-    # Step 3: Collect vocabulary stats
-    print("\n[3] Vocabulary statistics...")
-    all_symbols = set()
-    symbol_counts = {}
-    for seq in sequences:
-        for s in seq:
-            all_symbols.add(s)
-            symbol_counts[s] = symbol_counts.get(s, 0) + 1
-
-    print(f"  Unique symbols: {len(all_symbols)}")
-    print(f"  Top symbols:")
-    for sym, cnt in sorted(symbol_counts.items(), key=lambda x: -x[1])[:25]:
-        pct = cnt / len(sequences) * 100
-        print(f"    {sym:30s}  {cnt:4d} ({pct:5.1f}%)")
-
-    # Step 4: Run Dervish
-    print("\n[4] Running Dervish grammar inference...")
-    result = infer_ensemble(sequences)
-
-    print(f"\n  Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})")
-    print(f"  Grammar: {result['best']['grammar']}")
-    if len(result['all']) > 1:
-        for r in result['all']:
-            m = sum(1 for s in sequences if _matches(r['grammar'], s))
-            print(f"    {r['algorithm']:10s}  MDL={r['mdl_score']:>8.2f}  match={m}/{len(sequences)}")
-    print(f"\n  Why: {result['why']}")
-
-    # Step 5: Print example sequences
-    print("\n[5] Sample sequences:")
-    for seq in sequences[:10]:
-        print(f"  {' → '.join(seq[:10])}" + (" → ..." if len(seq) > 10 else ""))
-    print(f"  ... ({len(sequences)} total)")
-
-    # Save results
-    out = {
-        'num_repos': len(sequences),
-        'failed': failed,
-        'unique_symbols': len(all_symbols),
-        'top_symbols': {s: symbol_counts[s] for s in sorted(symbol_counts, key=lambda x: -symbol_counts[x])[:30]},
-        'grammar': result['best']['grammar'],
-        'algorithm': result['best']['algorithm'],
-        'mdl': result['best']['mdl_score'],
-    }
-    path = Path(__file__).resolve().parent.parent / 'readme_analysis.json'
-    with open(path, 'w') as f:
-        json.dump(out, f, indent=2)
-    print(f"\nResults saved to {path}")
-
-if __name__ == '__main__':
-    main()