""" README Structure Analysis — infer the conventional heading structure of top GitHub repositories using Dervish grammar inference. """ import re import sys import time import json import requests from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from bex.ensemble import infer_ensemble, _matches # ── Synonym normalization map ── NORMALIZE = { 'description': 'description', 'overview': 'description', 'about': 'description', 'introduction': 'description', 'getting started': 'getting-started', 'quick start': 'getting-started', 'quickstart': 'getting-started', 'installation': 'installation', 'install': 'installation', 'setup': 'installation', 'usage': 'usage', 'how to use': 'usage', 'examples': 'usage', 'example': 'usage', 'api': 'api', 'api reference': 'api', 'api documentation': 'api', 'documentation': 'api', 'features': 'features', 'configuration': 'configuration', 'config': 'configuration', 'contributing': 'contributing', 'development': 'contributing', 'building': 'contributing', 'build': 'contributing', 'license': 'license', 'changelog': 'changelog', 'faq': 'faq', 'frequently asked questions': 'faq', 'support': 'support', 'screenshots': 'screenshots', 'demo': 'screenshots', 'tests': 'testing', 'testing': 'testing', 'badges': 'badges', 'acknowledgments': 'acknowledgments', 'acknowledgements': 'acknowledgments', 'credits': 'acknowledgments', 'roadmap': 'roadmap', 'related projects': 'related', 'see also': 'related', } def normalize_heading(text): """Normalize a heading to a canonical name, or return the raw slug.""" t = text.strip().lower() t = re.sub(r'[^a-z0-9 ]', '', t) t = re.sub(r'\s+', ' ', t).strip() return NORMALIZE.get(t, t) def fetch_top_repos(n=100, min_stars=5000): """Fetch top N repos by stars from GitHub search API.""" repos = [] page = 1 headers = {'Accept': 'application/vnd.github.v3+json'} per_page = min(n, 100) while len(repos) < n: url = ( f'https://api.github.com/search/repositories' f'?q=stars:>{min_stars}&sort=stars&order=desc' f'&per_page={per_page}&page={page}' ) resp = requests.get(url, headers=headers) if resp.status_code == 403: print(" Rate limited. Sleeping 60s...") time.sleep(60) continue if resp.status_code != 200: print(f" API error {resp.status_code}: {resp.text[:200]}") break data = resp.json() items = data.get('items', []) if not items: break for r in items: repos.append({ 'full_name': r['full_name'], 'stars': r['stargazers_count'], 'default_branch': r.get('default_branch', 'main'), 'description': r.get('description', ''), 'language': r.get('language', ''), }) print(f" Page {page}: got {len(items)} repos (total {len(repos)})") page += 1 # Small delay to avoid secondary rate limits time.sleep(0.5) if len(repos) >= n: break return repos[:n] def fetch_readme(repo): """Fetch README content from a GitHub repo. Tries main, master, and common variants.""" branches = [repo['default_branch'], 'main', 'master'] attempted = set() for branch in branches: if branch in attempted: continue attempted.add(branch) for path in ['README.md', 'readme.md', 'README.markdown', 'README.rst']: url = f'https://raw.githubusercontent.com/{repo["full_name"]}/{branch}/{path}' try: resp = requests.get(url, timeout=10) if resp.status_code == 200: return resp.text, path except: pass return None, None def extract_headings(text): """Extract heading sequence from markdown text. Returns list of (level, text) tuples, e.g. [(1, "Title"), (2, "Installation"), ...] """ headings = [] for line in text.splitlines(): m = re.match(r'^(#{1,6})\s+(.+)$', line.strip()) if m: level = len(m.group(1)) text = m.group(2).strip() # Remove trailing `#` characters (common in some markdowns) text = re.sub(r'\s+#+\s*$', '', text).strip() headings.append((level, text)) return headings def compress_headings(headings): """Convert heading sequence to our symbol vocabulary. H1 becomes just the section key; H2+ include their parent context. """ # For simplicity: treat all headings as symbols, normalized. # H1 = title (always present, strip it) # Return list of normalized H2+ heading texts seq = [] seen_h1 = False for level, text in headings: if level == 1 and not seen_h1: seen_h1 = True continue # skip the title norm = normalize_heading(text) if norm: seq.append(norm) return seq def main(): print("=" * 60) print("README Structure Analysis") print("=" * 60) # Step 1: Fetch top repos print("\n[1] Fetching top repos from GitHub...") repos = fetch_top_repos(n=100) print(f" Got {len(repos)} repos") # Step 2: Fetch READMEs print("\n[2] Fetching READMEs...") sequences = [] failed = 0 for i, repo in enumerate(repos, 1): raw_text, path = fetch_readme(repo) if raw_text is None: failed += 1 continue headings = extract_headings(raw_text) seq = compress_headings(headings) if len(seq) >= 3: # need at least a few sections sequences.append(seq) if i % 20 == 0: print(f" {i}/{len(repos)}: {len(sequences)} valid, {failed} failed") print(f" Total: {len(sequences)} valid sequences, {failed} failed") # Step 3: Collect vocabulary stats print("\n[3] Vocabulary statistics...") all_symbols = set() symbol_counts = {} for seq in sequences: for s in seq: all_symbols.add(s) symbol_counts[s] = symbol_counts.get(s, 0) + 1 print(f" Unique symbols: {len(all_symbols)}") print(f" Top symbols:") for sym, cnt in sorted(symbol_counts.items(), key=lambda x: -x[1])[:25]: pct = cnt / len(sequences) * 100 print(f" {sym:30s} {cnt:4d} ({pct:5.1f}%)") # Step 4: Run Dervish print("\n[4] Running Dervish grammar inference...") result = infer_ensemble(sequences) print(f"\n Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})") print(f" Grammar: {result['best']['grammar']}") if len(result['all']) > 1: for r in result['all']: m = sum(1 for s in sequences if _matches(r['grammar'], s)) print(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}") print(f"\n Why: {result['why']}") # Step 5: Print example sequences print("\n[5] Sample sequences:") for seq in sequences[:10]: print(f" {' → '.join(seq[:10])}" + (" → ..." if len(seq) > 10 else "")) print(f" ... ({len(sequences)} total)") # Save results out = { 'num_repos': len(sequences), 'failed': failed, 'unique_symbols': len(all_symbols), 'top_symbols': {s: symbol_counts[s] for s in sorted(symbol_counts, key=lambda x: -symbol_counts[x])[:30]}, 'grammar': result['best']['grammar'], 'algorithm': result['best']['algorithm'], 'mdl': result['best']['mdl_score'], } path = Path(__file__).resolve().parent.parent / 'readme_analysis.json' with open(path, 'w') as f: json.dump(out, f, indent=2) print(f"\nResults saved to {path}") if __name__ == '__main__': main()