chore: ignore examples/
All checks were successful
ci/woodpecker/pr/woodpecker Pipeline was successful
ci/woodpecker/push/woodpecker Pipeline was successful

This commit is contained in:
tobjend 2026-07-01 15:40:02 +02:00
parent 929a50c95d
commit a70024397f
2 changed files with 1 additions and 239 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@ venv/
*.egg-info/ *.egg-info/
dist/ dist/
build/ build/
examples/

View file

@ -1,239 +0,0 @@
"""
README Structure Analysis — infer the conventional heading structure of
top GitHub repositories using Dervish grammar inference.
"""
import re
import sys
import time
import json
import requests
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from bex.ensemble import infer_ensemble, _matches
# Heading synonyms, grouped by the canonical section name they collapse to.
# Group order and synonym order determine the insertion order of NORMALIZE.
_CANONICAL_SYNONYMS = (
    ('description', ('description', 'overview', 'about', 'introduction')),
    ('getting-started', ('getting started', 'quick start', 'quickstart')),
    ('installation', ('installation', 'install', 'setup')),
    ('usage', ('usage', 'how to use', 'examples', 'example')),
    ('api', ('api', 'api reference', 'api documentation', 'documentation')),
    ('features', ('features',)),
    ('configuration', ('configuration', 'config')),
    ('contributing', ('contributing', 'development', 'building', 'build')),
    ('license', ('license',)),
    ('changelog', ('changelog',)),
    ('faq', ('faq', 'frequently asked questions')),
    ('support', ('support',)),
    ('screenshots', ('screenshots', 'demo')),
    ('testing', ('tests', 'testing')),
    ('badges', ('badges',)),
    ('acknowledgments', ('acknowledgments', 'acknowledgements', 'credits')),
    ('roadmap', ('roadmap',)),
    ('related', ('related projects', 'see also')),
)

# Flat lookup table: cleaned, lower-cased heading text -> canonical name.
NORMALIZE = {
    synonym: canonical
    for canonical, synonyms in _CANONICAL_SYNONYMS
    for synonym in synonyms
}
def normalize_heading(text):
    """Map heading text onto its canonical section name.

    The text is trimmed and lower-cased, every character other than
    ``a-z``, ``0-9`` and space is removed, and runs of whitespace are
    collapsed to a single space. The cleaned string is looked up in
    ``NORMALIZE``; when there is no entry the cleaned string itself is
    returned (which may be empty).
    """
    lowered = text.strip().lower()
    alnum_only = re.sub(r'[^a-z0-9 ]', '', lowered)
    slug = re.sub(r'\s+', ' ', alnum_only).strip()
    if slug in NORMALIZE:
        return NORMALIZE[slug]
    return slug
def fetch_top_repos(n=100, min_stars=5000, timeout=10, max_rate_limit_retries=5):
    """Fetch the top ``n`` repositories by star count from the GitHub search API.

    Args:
        n: Maximum number of repositories to return.
        min_stars: Only repositories with more stars than this are searched.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the script.
        max_rate_limit_retries: Number of consecutive rate-limit responses
            to wait out (60 s each) before giving up.

    Returns:
        A list of at most ``n`` dicts with the keys ``full_name``, ``stars``,
        ``default_branch``, ``description`` and ``language``. The list may be
        shorter than ``n`` when the API runs out of results, returns an
        error, or keeps rate limiting the client.
    """
    repos = []
    page = 1
    rate_limit_hits = 0
    headers = {'Accept': 'application/vnd.github.v3+json'}
    per_page = min(n, 100)
    while len(repos) < n:
        params = {
            'q': f'stars:>{min_stars}',
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page,
        }
        try:
            resp = requests.get(
                'https://api.github.com/search/repositories',
                params=params,
                headers=headers,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # Treat transport failures like an API error: keep what we have.
            print(f" Request failed: {exc}")
            break
        if resp.status_code in (403, 429):
            # Bounded retry: an unconditional `continue` would loop forever
            # if the 403 is permanent rather than a temporary rate limit.
            rate_limit_hits += 1
            if rate_limit_hits > max_rate_limit_retries:
                print(" Still rate limited, giving up.")
                break
            print(" Rate limited. Sleeping 60s...")
            time.sleep(60)
            continue
        if resp.status_code != 200:
            print(f" API error {resp.status_code}: {resp.text[:200]}")
            break
        rate_limit_hits = 0
        items = resp.json().get('items', [])
        if not items:
            break
        for r in items:
            repos.append({
                'full_name': r['full_name'],
                'stars': r['stargazers_count'],
                # The API sends explicit nulls for unset fields, which
                # dict.get() defaults do not cover; `or` handles both cases.
                'default_branch': r.get('default_branch') or 'main',
                'description': r.get('description') or '',
                'language': r.get('language') or '',
            })
        print(f" Page {page}: got {len(items)} repos (total {len(repos)})")
        page += 1
        if len(repos) < n:
            # Small delay to avoid secondary rate limits; skipped once done.
            time.sleep(0.5)
    return repos[:n]
def fetch_readme(repo):
    """Fetch the README of a GitHub repository from raw.githubusercontent.com.

    The repository's default branch is tried first, then ``main`` and
    ``master``; on each branch several common README file names are tried
    in order.

    Args:
        repo: Dict with at least the keys ``full_name`` and
            ``default_branch``.

    Returns:
        ``(text, path)`` for the first candidate answering with HTTP 200,
        or ``(None, None)`` when none could be fetched.
    """
    # dict.fromkeys removes duplicate branch names while keeping their order.
    branches = dict.fromkeys([repo['default_branch'], 'main', 'master'])
    for branch in branches:
        for path in ['README.md', 'readme.md', 'README.markdown', 'README.rst']:
            url = f'https://raw.githubusercontent.com/{repo["full_name"]}/{branch}/{path}'
            try:
                resp = requests.get(url, timeout=10)
            except requests.RequestException:
                # Best effort: a network failure on one candidate just moves
                # on to the next. Unlike a bare `except`, this lets Ctrl-C
                # and programming errors propagate.
                continue
            if resp.status_code == 200:
                return resp.text, path
    return None, None
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
_FENCE_RE = re.compile(r'^(`{3,}|~{3,})')


def extract_headings(text):
    """Extract the sequence of ATX (``#``-style) headings from markdown text.

    Lines inside fenced code blocks (delimited by three or more backticks or
    tildes) are ignored, so shell comments such as ``# install deps`` in a
    code sample are not mistaken for headings. An unterminated fence hides
    everything up to the end of the text.

    Args:
        text: Markdown source.

    Returns:
        List of ``(level, title)`` tuples in document order, e.g.
        ``[(1, "Title"), (2, "Installation"), ...]``. Closing ``#`` runs
        are removed from titles.
    """
    headings = []
    open_fence = None  # marker string of the fence we are inside, if any
    for raw_line in text.splitlines():
        line = raw_line.strip()

        if open_fence is not None:
            # A fence closes on a line made only of the same marker
            # character, at least as long as the opening run.
            if len(line) >= len(open_fence) and set(line) == {open_fence[0]}:
                open_fence = None
            continue

        fence = _FENCE_RE.match(line)
        if fence:
            marker = fence.group(1)
            # "```code```" on one line is inline code, not a fence opener.
            if not (marker[0] == '`' and '`' in line[len(marker):]):
                open_fence = marker
                continue

        m = _HEADING_RE.match(line)
        if m:
            level = len(m.group(1))
            title = m.group(2).strip()
            # Remove trailing `#` characters (closing sequence, "## Title ##").
            title = re.sub(r'\s+#+\s*$', '', title).strip()
            headings.append((level, title))
    return headings
def compress_headings(headings):
    """Turn a heading sequence into a sequence of normalized section symbols.

    The first level-1 heading is treated as the document title and dropped.
    Every other heading, whatever its level (including any later level-1
    headings), is passed through ``normalize_heading``; results that come
    back empty are discarded.

    Args:
        headings: Iterable of ``(level, text)`` tuples.

    Returns:
        List of normalized heading symbols in document order.
    """
    remaining = list(headings)

    # Drop only the first H1, which is the title.
    for index, (level, _text) in enumerate(remaining):
        if level == 1:
            del remaining[index]
            break

    normalized = (normalize_heading(text) for _level, text in remaining)
    return [symbol for symbol in normalized if symbol]
def main():
    """Run the README structure analysis end to end.

    Steps: fetch the top repositories, download each README, reduce it to a
    sequence of normalized heading symbols, print vocabulary statistics, run
    ``infer_ensemble`` over the sequences, print the result, and write a JSON
    summary to ``readme_analysis.json`` one directory above this script.
    """
    print("=" * 60)
    print("README Structure Analysis")
    print("=" * 60)

    # Step 1: Fetch top repos
    print("\n[1] Fetching top repos from GitHub...")
    repos = fetch_top_repos(n=100)
    print(f" Got {len(repos)} repos")

    # Step 2: Fetch READMEs
    print("\n[2] Fetching READMEs...")
    sequences = []
    failed = 0
    for i, repo in enumerate(repos, 1):
        raw_text, _readme_path = fetch_readme(repo)
        if raw_text is None:
            failed += 1
            continue
        headings = extract_headings(raw_text)
        seq = compress_headings(headings)
        if len(seq) >= 3:  # need at least a few sections
            sequences.append(seq)
        if i % 20 == 0:
            print(f" {i}/{len(repos)}: {len(sequences)} valid, {failed} failed")
    print(f" Total: {len(sequences)} valid sequences, {failed} failed")

    # Step 3: Collect vocabulary stats
    print("\n[3] Vocabulary statistics...")
    symbol_counts = {}
    for seq in sequences:
        for s in seq:
            symbol_counts[s] = symbol_counts.get(s, 0) + 1
    # Stable sort: symbols with equal counts keep first-seen order.
    ranked = sorted(symbol_counts.items(), key=lambda item: -item[1])
    print(f" Unique symbols: {len(symbol_counts)}")
    print(" Top symbols:")
    for sym, cnt in ranked[:25]:
        # NOTE: cnt is the total number of occurrences, so this percentage
        # can exceed 100 when a heading repeats within one README.
        pct = cnt / len(sequences) * 100
        print(f" {sym:30s} {cnt:4d} ({pct:5.1f}%)")

    # Step 4: Run Dervish
    print("\n[4] Running Dervish grammar inference...")
    result = infer_ensemble(sequences)
    print(f"\n Best: {result['best']['algorithm']} (MDL {result['best']['mdl_score']})")
    print(f" Grammar: {result['best']['grammar']}")
    if len(result['all']) > 1:
        for r in result['all']:
            m = sum(1 for s in sequences if _matches(r['grammar'], s))
            print(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}")
    print(f"\n Why: {result['why']}")

    # Step 5: Print example sequences
    print("\n[5] Sample sequences:")
    for seq in sequences[:10]:
        # Join with the same arrow used by the " → ..." continuation marker;
        # an empty separator ran the symbol names together unreadably.
        print(f" {' → '.join(seq[:10])}" + (" → ..." if len(seq) > 10 else ""))
    print(f" ... ({len(sequences)} total)")

    # Save results
    out = {
        'num_repos': len(sequences),
        'failed': failed,
        'unique_symbols': len(symbol_counts),
        'top_symbols': dict(ranked[:30]),
        'grammar': result['best']['grammar'],
        'algorithm': result['best']['algorithm'],
        'mdl': result['best']['mdl_score'],
    }
    out_path = Path(__file__).resolve().parent.parent / 'readme_analysis.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(out, f, indent=2)
    print(f"\nResults saved to {out_path}")


if __name__ == '__main__':
    main()