# grammar-inference-engine/examples/readme_analysis.py

"""
README Structure Analysis — infer the conventional heading structure of
top GitHub repositories using Dervish grammar inference.
"""

import re
import sys
import time
import json
import requests
from pathlib import Path

# Put the directory one level above this file's directory at the front of
# sys.path before importing from `bex`.
# NOTE(review): presumably that directory is where the `bex` package lives,
# so the example can run without installing it — confirm against the repo layout.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from bex.ensemble import infer_ensemble, _matches

# ── Synonym normalization map ──
# Each entry pairs a canonical section key with the cleaned heading spellings
# (lower-case, punctuation removed) that collapse onto it.
_SYNONYM_GROUPS = (
    ('description', ('description', 'overview', 'about', 'introduction')),
    ('getting-started', ('getting started', 'quick start', 'quickstart')),
    ('installation', ('installation', 'install', 'setup')),
    ('usage', ('usage', 'how to use', 'examples', 'example')),
    ('api', ('api', 'api reference', 'api documentation', 'documentation')),
    ('features', ('features',)),
    ('configuration', ('configuration', 'config')),
    ('contributing', ('contributing', 'development', 'building', 'build')),
    ('license', ('license',)),
    ('changelog', ('changelog',)),
    ('faq', ('faq', 'frequently asked questions')),
    ('support', ('support',)),
    ('screenshots', ('screenshots', 'demo')),
    ('testing', ('tests', 'testing')),
    ('badges', ('badges',)),
    ('acknowledgments', ('acknowledgments', 'acknowledgements', 'credits')),
    ('roadmap', ('roadmap',)),
    ('related', ('related projects', 'see also')),
)

# Flat lookup table: cleaned heading text -> canonical section key.
NORMALIZE = {
    spelling: canonical
    for canonical, spellings in _SYNONYM_GROUPS
    for spelling in spellings
}

def normalize_heading(text):
    """Map a heading onto its canonical section key.

    The heading is stripped and lower-cased, every character other than
    ``a-z``, ``0-9`` and the space is removed, and runs of spaces are
    collapsed to one. If the cleaned text is a key of ``NORMALIZE`` the
    mapped value is returned; otherwise the cleaned text itself is returned
    (which may be the empty string).
    """
    lowered = text.strip().lower()
    kept = re.sub(r'[^a-z0-9 ]', '', lowered)
    # Only letters, digits and spaces remain, so split/join collapses the
    # space runs and trims both ends in one step.
    cleaned = ' '.join(kept.split())
    if cleaned in NORMALIZE:
        return NORMALIZE[cleaned]
    return cleaned

def fetch_top_repos(n=100, min_stars=5000):
    """Fetch the top ``n`` repositories by star count from the GitHub search API.

    Pages through ``/search/repositories`` (sorted by stars, descending)
    until ``n`` repositories have been collected, the API returns an empty
    page, a non-retryable error status is received, or the rate-limit retry
    budget is exhausted.

    Args:
        n: Maximum number of repositories to return.
        min_stars: Only repositories with more than this many stars are
            queried (``stars:>min_stars``).

    Returns:
        A list of at most ``n`` dicts with the keys ``full_name``, ``stars``,
        ``default_branch``, ``description`` and ``language``. ``description``
        and ``language`` are ``''`` when the API reports them as null/missing.

    Raises:
        requests.RequestException: If a request fails at the network level
            or exceeds the timeout.
    """
    repos = []
    if n <= 0:
        return repos

    search_url = 'https://api.github.com/search/repositories'
    headers = {'Accept': 'application/vnd.github.v3+json'}
    per_page = min(n, 100)
    # A 403 can also mean "forbidden" rather than "rate limited"; cap the
    # number of consecutive waits so a persistent 403 cannot loop forever.
    max_rate_limit_retries = 5
    rate_limit_retries = 0
    page = 1

    while len(repos) < n:
        # Let requests build and URL-encode the query string.
        params = {
            'q': f'stars:>{min_stars}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page,
        }
        # Without a timeout a stalled connection would block indefinitely.
        resp = requests.get(search_url, headers=headers, params=params, timeout=30)
        if resp.status_code in (403, 429):
            rate_limit_retries += 1
            if rate_limit_retries > max_rate_limit_retries:
                print(f"  Still rate limited after {max_rate_limit_retries} retries; giving up.")
                break
            print("  Rate limited. Sleeping 60s...")
            time.sleep(60)
            continue
        if resp.status_code != 200:
            print(f"  API error {resp.status_code}: {resp.text[:200]}")
            break
        rate_limit_retries = 0

        items = resp.json().get('items', [])
        if not items:
            break
        for r in items:
            repos.append({
                'full_name': r['full_name'],
                'stars': r['stargazers_count'],
                'default_branch': r.get('default_branch', 'main'),
                # The API sends explicit nulls for these, so a .get() default
                # alone would still yield None.
                'description': r.get('description') or '',
                'language': r.get('language') or '',
            })
        print(f"  Page {page}: got {len(items)} repos (total {len(repos)})")
        page += 1
        # Small delay to avoid secondary rate limits; pointless after the
        # final page.
        if len(repos) < n:
            time.sleep(0.5)

    return repos[:n]

def fetch_readme(repo):
    """Download a repository's README from raw.githubusercontent.com.

    Candidate branches are the repo's ``default_branch``, then ``main`` and
    ``master`` (duplicates tried only once). For each branch the file names
    ``README.md``, ``readme.md``, ``README.markdown`` and ``README.rst`` are
    tried in that order; the first HTTP 200 response wins.

    Args:
        repo: Mapping with at least the keys ``full_name`` and
            ``default_branch``.

    Returns:
        ``(text, path)`` for the first candidate that returned HTTP 200, or
        ``(None, None)`` if none did.
    """
    # dict.fromkeys drops duplicate branch names while keeping priority order.
    branches = dict.fromkeys([repo['default_branch'], 'main', 'master'])
    filenames = ('README.md', 'readme.md', 'README.markdown', 'README.rst')

    for branch in branches:
        for path in filenames:
            url = f'https://raw.githubusercontent.com/{repo["full_name"]}/{branch}/{path}'
            try:
                resp = requests.get(url, timeout=10)
            except requests.RequestException:
                # Best effort: a network failure on one candidate just means
                # trying the next. Only request errors are swallowed, so
                # Ctrl-C and programming errors still surface.
                continue
            if resp.status_code == 200:
                return resp.text, path
    return None, None

# ATX heading: one to six ``#`` characters, whitespace, then the heading text.
_ATX_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
# Optional closing run of ``#`` characters, e.g. ``## Title ##``.
_CLOSING_HASHES_RE = re.compile(r'\s+#+\s*$')
# Marker that opens or closes a fenced code block: three or more ` or ~.
_CODE_FENCE_RE = re.compile(r'^(`{3,}|~{3,})')


def extract_headings(text):
    """Extract the sequence of ATX (``#``-style) headings from markdown text.

    Lines inside fenced code blocks (delimited by three or more backticks or
    tildes) are ignored, so ``# comment`` lines in shell or Python snippets
    are not reported as headings. Leading and trailing whitespace on each
    line is ignored, and a trailing run of ``#`` characters is removed from
    the heading text.

    Args:
        text: Markdown source as a single string.

    Returns:
        A list of ``(level, text)`` tuples in document order, e.g.
        ``[(1, "Title"), (2, "Installation"), ...]``.
    """
    headings = []
    open_fence = None  # marker string of the fence we are inside, if any
    for line in text.splitlines():
        stripped = line.strip()
        fence_match = _CODE_FENCE_RE.match(stripped)

        if open_fence is not None:
            # Inside a code block: only a matching closing fence matters.
            if fence_match:
                marker = fence_match.group(1)
                trailing = stripped[fence_match.end():].strip()
                # A closing fence uses the same character, is at least as
                # long as the opener, and carries no info string.
                if (marker[0] == open_fence[0]
                        and len(marker) >= len(open_fence)
                        and not trailing):
                    open_fence = None
            continue

        if fence_match:
            marker = fence_match.group(1)
            info = stripped[fence_match.end():]
            # A backtick fence's info string may not contain backticks;
            # otherwise the line is inline code, not a fence.
            if not (marker[0] == '`' and '`' in info):
                open_fence = marker
                continue

        heading_match = _ATX_HEADING_RE.match(stripped)
        if heading_match is None:
            continue
        level = len(heading_match.group(1))
        title = heading_match.group(2).strip()
        # Remove trailing `#` characters (common in some markdowns)
        title = _CLOSING_HASHES_RE.sub('', title).strip()
        headings.append((level, title))
    return headings

def compress_headings(headings):
    """Turn a heading sequence into a flat list of normalized symbols.

    The first level-1 heading is treated as the document title and dropped.
    Every other heading, whatever its level, is passed through
    ``normalize_heading``; results that are empty are discarded. Heading
    levels are not encoded in the output.

    Args:
        headings: Iterable of ``(level, text)`` pairs.

    Returns:
        List of normalized heading symbols in document order.
    """
    entries = list(headings)
    # Position of the title (first H1), or None when there is no H1 at all.
    title_index = next(
        (idx for idx, (level, _) in enumerate(entries) if level == 1),
        None,
    )
    normalized = (
        normalize_heading(text)
        for idx, (_, text) in enumerate(entries)
        if idx != title_index
    )
    return [symbol for symbol in normalized if symbol]

def main():
    """Run the README structure analysis end to end.

    Fetches the top repositories, downloads each README and converts it to a
    sequence of normalized heading symbols (sequences shorter than three
    symbols are dropped), prints vocabulary statistics, runs
    ``infer_ensemble`` on the sequences and reports the result, prints a few
    sample sequences, and writes a JSON summary to ``readme_analysis.json``
    in the directory one level above this file's directory.
    """
    rule = "=" * 60
    print(rule)
    print("README Structure Analysis")
    print(rule)

    # Step 1: repository list
    print("\n[1] Fetching top repos from GitHub...")
    repos = fetch_top_repos(n=100)
    repo_total = len(repos)
    print(f"  Got {repo_total} repos")

    # Step 2: README -> symbol sequence
    print("\n[2] Fetching READMEs...")
    sequences = []
    failed = 0
    for position, repo in enumerate(repos, start=1):
        readme_text, _readme_name = fetch_readme(repo)
        if readme_text is None:
            failed += 1
            # Note: a failed fetch also skips the progress line below.
            continue
        symbols = compress_headings(extract_headings(readme_text))
        if len(symbols) >= 3:  # need at least a few sections
            sequences.append(symbols)
        if position % 20 == 0:
            print(f"  {position}/{repo_total}: {len(sequences)} valid, {failed} failed")

    print(f"  Total: {len(sequences)} valid sequences, {failed} failed")

    # Step 3: vocabulary statistics
    print("\n[3] Vocabulary statistics...")
    symbol_counts = {}
    for symbols in sequences:
        for symbol in symbols:
            symbol_counts[symbol] = symbol_counts.get(symbol, 0) + 1
    all_symbols = set(symbol_counts)
    # Most frequent first; the stable sort keeps first-seen order among ties.
    ranked = sorted(symbol_counts.items(), key=lambda item: item[1], reverse=True)

    print(f"  Unique symbols: {len(all_symbols)}")
    print("  Top symbols:")
    for symbol, count in ranked[:25]:
        share = count / len(sequences) * 100
        print(f"    {symbol:30s}  {count:4d} ({share:5.1f}%)")

    # Step 4: grammar inference
    print("\n[4] Running Dervish grammar inference...")
    result = infer_ensemble(sequences)
    best = result['best']

    print(f"\n  Best: {best['algorithm']} (MDL {best['mdl_score']})")
    print(f"  Grammar: {best['grammar']}")
    candidates = result['all']
    if len(candidates) > 1:
        for candidate in candidates:
            matched = 0
            for symbols in sequences:
                if _matches(candidate['grammar'], symbols):
                    matched += 1
            print(f"    {candidate['algorithm']:10s}  MDL={candidate['mdl_score']:>8.2f}  match={matched}/{len(sequences)}")
    print(f"\n  Why: {result['why']}")

    # Step 5: a few example sequences, truncated to ten symbols each
    print("\n[5] Sample sequences:")
    for symbols in sequences[:10]:
        preview = ' → '.join(symbols[:10])
        suffix = " → ..." if len(symbols) > 10 else ""
        print(f"  {preview}" + suffix)
    print(f"  ... ({len(sequences)} total)")

    # Persist a JSON summary of the run
    out = {
        'num_repos': len(sequences),
        'failed': failed,
        'unique_symbols': len(all_symbols),
        'top_symbols': dict(ranked[:30]),
        'grammar': best['grammar'],
        'algorithm': best['algorithm'],
        'mdl': best['mdl_score'],
    }
    out_path = Path(__file__).resolve().parent.parent / 'readme_analysis.json'
    with out_path.open('w') as fh:
        json.dump(out, fh, indent=2)
    print(f"\nResults saved to {out_path}")

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()