From a70024397f7f6fe5041511a49409b8316cb3108b Mon Sep 17 00:00:00 2001 From: tobjend Date: Wed, 1 Jul 2026 15:40:02 +0200 Subject: [PATCH] chore: ignore examples/ --- .gitignore | 1 + examples/readme_analysis.py | 239 ------------------------------------ 2 files changed, 1 insertion(+), 239 deletions(-) delete mode 100644 examples/readme_analysis.py diff --git a/.gitignore b/.gitignore index c2f4095..d362a70 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv/ *.egg-info/ dist/ build/ +examples/ diff --git a/examples/readme_analysis.py b/examples/readme_analysis.py deleted file mode 100644 index 2a6c7c8..0000000 --- a/examples/readme_analysis.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -README Structure Analysis — infer the conventional heading structure of -top GitHub repositories using Dervish grammar inference. -""" - -import re -import sys -import time -import json -import requests -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) -from bex.ensemble import infer_ensemble, _matches - -# ── Synonym normalization map ── -NORMALIZE = { - 'description': 'description', - 'overview': 'description', - 'about': 'description', - 'introduction': 'description', - 'getting started': 'getting-started', - 'quick start': 'getting-started', - 'quickstart': 'getting-started', - 'installation': 'installation', - 'install': 'installation', - 'setup': 'installation', - 'usage': 'usage', - 'how to use': 'usage', - 'examples': 'usage', - 'example': 'usage', - 'api': 'api', - 'api reference': 'api', - 'api documentation': 'api', - 'documentation': 'api', - 'features': 'features', - 'configuration': 'configuration', - 'config': 'configuration', - 'contributing': 'contributing', - 'development': 'contributing', - 'building': 'contributing', - 'build': 'contributing', - 'license': 'license', - 'changelog': 'changelog', - 'faq': 'faq', - 'frequently asked questions': 'faq', - 'support': 'support', - 'screenshots': 'screenshots', - 'demo': 'screenshots', - 'tests': 'testing', - 'testing': 'testing', - 'badges': 'badges', - 'acknowledgments': 'acknowledgments', - 'acknowledgements': 'acknowledgments', - 'credits': 'acknowledgments', - 'roadmap': 'roadmap', - 'related projects': 'related', - 'see also': 'related', -} - -def normalize_heading(text): - """Normalize a heading to a canonical name, or return the raw slug.""" - t = text.strip().lower() - t = re.sub(r'[^a-z0-9 ]', '', t) - t = re.sub(r'\s+', ' ', t).strip() - return NORMALIZE.get(t, t) - -def fetch_top_repos(n=100, min_stars=5000): - """Fetch top N repos by stars from GitHub search API.""" - repos = [] - page = 1 - headers = {'Accept': 'application/vnd.github.v3+json'} - per_page = min(n, 100) - - while len(repos) < n: - url = ( - f'https://api.github.com/search/repositories' - f'?q=stars:>{min_stars}&sort=stars&order=desc' - f'&per_page={per_page}&page={page}' - ) - resp = requests.get(url, headers=headers) - if resp.status_code == 403: - print(" Rate limited. Sleeping 60s...") - time.sleep(60) - continue - if resp.status_code != 200: - print(f" API error {resp.status_code}: {resp.text[:200]}") - break - data = resp.json() - items = data.get('items', []) - if not items: - break - for r in items: - repos.append({ - 'full_name': r['full_name'], - 'stars': r['stargazers_count'], - 'default_branch': r.get('default_branch', 'main'), - 'description': r.get('description', ''), - 'language': r.get('language', ''), - }) - print(f" Page {page}: got {len(items)} repos (total {len(repos)})") - page += 1 - # Small delay to avoid secondary rate limits - time.sleep(0.5) - if len(repos) >= n: - break - - return repos[:n] - -def fetch_readme(repo): - """Fetch README content from a GitHub repo. Tries main, master, and common variants.""" - branches = [repo['default_branch'], 'main', 'master'] - attempted = set() - - for branch in branches: - if branch in attempted: - continue - attempted.add(branch) - for path in ['README.md', 'readme.md', 'README.markdown', 'README.rst']: - url = f'https://raw.githubusercontent.com/{repo["full_name"]}/{branch}/{path}' - try: - resp = requests.get(url, timeout=10) - if resp.status_code == 200: - return resp.text, path - except: - pass - return None, None - -def extract_headings(text): - """Extract heading sequence from markdown text. - Returns list of (level, text) tuples, e.g. [(1, "Title"), (2, "Installation"), ...] - """ - headings = [] - for line in text.splitlines(): - m = re.match(r'^(#{1,6})\s+(.+)$', line.strip()) - if m: - level = len(m.group(1)) - text = m.group(2).strip() - # Remove trailing `#` characters (common in some markdowns) - text = re.sub(r'\s+#+\s*$', '', text).strip() - headings.append((level, text)) - return headings - -def compress_headings(headings): - """Convert heading sequence to our symbol vocabulary. - H1 becomes just the section key; H2+ include their parent context. - """ - # For simplicity: treat all headings as symbols, normalized. - # H1 = title (always present, strip it) - # Return list of normalized H2+ heading texts - seq = [] - seen_h1 = False - for level, text in headings: - if level == 1 and not seen_h1: - seen_h1 = True - continue # skip the title - norm = normalize_heading(text) - if norm: - seq.append(norm) - return seq - -def main(): - print("=" * 60) - print("README Structure Analysis") - print("=" * 60) - - # Step 1: Fetch top repos - print("\n[1] Fetching top repos from GitHub...") - repos = fetch_top_repos(n=100) - print(f" Got {len(repos)} repos") - - # Step 2: Fetch READMEs - print("\n[2] Fetching READMEs...") - sequences = [] - failed = 0 - for i, repo in enumerate(repos, 1): - raw_text, path = fetch_readme(repo) - if raw_text is None: - failed += 1 - continue - headings = extract_headings(raw_text) - seq = compress_headings(headings) - if len(seq) >= 3: # need at least a few sections - sequences.append(seq) - if i % 20 == 0: - print(f" {i}/{len(repos)}: {len(sequences)} valid, {failed} failed") - - print(f" Total: {len(sequences)} valid sequences, {failed} failed") - - # Step 3: Collect vocabulary stats - print("\n[3] Vocabulary statistics...") - all_symbols = set() - symbol_counts = {} - for seq in sequences: - for s in seq: - all_symbols.add(s) - symbol_counts[s] = symbol_counts.get(s, 0) + 1 - - print(f" Unique symbols: {len(all_symbols)}") - print(f" Top symbols:") - for sym, cnt in sorted(symbol_counts.items(), key=lambda x: -x[1])[:25]: - pct = cnt / len(sequences) * 100 - print(f" {sym:30s} {cnt:4d} ({pct:5.1f}%)") - - # Step 4: Run Dervish - print("\n[4] Running Dervish grammar inference...") - result = infer_ensemble(sequences) - - print(f"\n Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})") - print(f" Grammar: {result['best']['grammar']}") - if len(result['all']) > 1: - for r in result['all']: - m = sum(1 for s in sequences if _matches(r['grammar'], s)) - print(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}") - print(f"\n Why: {result['why']}") - - # Step 5: Print example sequences - print("\n[5] Sample sequences:") - for seq in sequences[:10]: - print(f" {' → '.join(seq[:10])}" + (" → ..." if len(seq) > 10 else "")) - print(f" ... ({len(sequences)} total)") - - # Save results - out = { - 'num_repos': len(sequences), - 'failed': failed, - 'unique_symbols': len(all_symbols), - 'top_symbols': {s: symbol_counts[s] for s in sorted(symbol_counts, key=lambda x: -symbol_counts[x])[:30]}, - 'grammar': result['best']['grammar'], - 'algorithm': result['best']['algorithm'], - 'mdl': result['best']['mdl_score'], - } - path = Path(__file__).resolve().parent.parent / 'readme_analysis.json' - with open(path, 'w') as f: - json.dump(out, f, indent=2) - print(f"\nResults saved to {path}") - -if __name__ == '__main__': - main()