chore: ignore examples/
This commit is contained in:
parent
929a50c95d
commit
a70024397f
2 changed files with 1 addition and 239 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@@ -6,3 +6,4 @@ venv/
|
||||||
*.egg-info/
|
*.egg-info/
|
||||||
dist/
|
dist/
|
||||||
build/
|
build/
|
||||||
|
examples/
|
||||||
|
|
|
||||||
|
|
@@ -1,239 +0,0 @@
|
||||||
"""
|
|
||||||
README Structure Analysis — infer the conventional heading structure of
|
|
||||||
top GitHub repositories using Dervish grammar inference.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import requests
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
||||||
from bex.ensemble import infer_ensemble, _matches
|
|
||||||
|
|
||||||
# ── Synonym normalization map ──
# Heading spellings are grouped under the canonical section name they
# collapse onto; the flat lookup table NORMALIZE is derived from the groups.
_SYNONYM_GROUPS = (
    ('description', ('description', 'overview', 'about', 'introduction')),
    ('getting-started', ('getting started', 'quick start', 'quickstart')),
    ('installation', ('installation', 'install', 'setup')),
    ('usage', ('usage', 'how to use', 'examples', 'example')),
    ('api', ('api', 'api reference', 'api documentation', 'documentation')),
    ('features', ('features',)),
    ('configuration', ('configuration', 'config')),
    ('contributing', ('contributing', 'development', 'building', 'build')),
    ('license', ('license',)),
    ('changelog', ('changelog',)),
    ('faq', ('faq', 'frequently asked questions')),
    ('support', ('support',)),
    ('screenshots', ('screenshots', 'demo')),
    ('testing', ('tests', 'testing')),
    ('badges', ('badges',)),
    ('acknowledgments', ('acknowledgments', 'acknowledgements', 'credits')),
    ('roadmap', ('roadmap',)),
    ('related', ('related projects', 'see also')),
)

# Maps a cleaned, lower-case heading to its canonical section name.
NORMALIZE = {
    variant: canonical
    for canonical, variants in _SYNONYM_GROUPS
    for variant in variants
}
|
|
||||||
|
|
||||||
def normalize_heading(text):
    """Normalize a heading to a canonical name, or return the raw slug.

    The heading is lower-cased, word separators (any whitespace, ``-`` and
    ``_``) are turned into single spaces, and every remaining character
    outside ``a-z0-9`` is dropped. The result is looked up in ``NORMALIZE``;
    headings without an entry are returned as the cleaned slug, which may be
    the empty string when nothing survives the clean-up.
    """
    t = text.strip().lower()
    # Convert separators to spaces BEFORE stripping punctuation; otherwise
    # "Getting-Started" or a tab-separated heading is glued into one word
    # and never matches the synonym table.
    t = re.sub(r'[\s_-]+', ' ', t)
    t = re.sub(r'[^a-z0-9 ]', '', t)
    # Dropping punctuation can leave runs of spaces ("build & test").
    t = re.sub(r' +', ' ', t).strip()
    return NORMALIZE.get(t, t)
|
|
||||||
|
|
||||||
def fetch_top_repos(n=100, min_stars=5000, *, timeout=10,
                    max_rate_limit_retries=5):
    """Fetch top N repos by stars from GitHub search API.

    Args:
        n: Maximum number of repositories to return.
        min_stars: Only repositories with more stars than this are queried.
        timeout: Per-request timeout in seconds.
        max_rate_limit_retries: How many times a rate-limit response
            (HTTP 403 or 429) is waited out before giving up.

    Returns:
        A list of at most ``n`` dicts with the keys ``full_name``, ``stars``,
        ``default_branch``, ``description`` and ``language``. The list may be
        shorter (or empty) when the API stops returning results, reports an
        error, or the network request fails.
    """
    repos = []
    if n <= 0:
        return repos

    headers = {'Accept': 'application/vnd.github.v3+json'}
    per_page = min(n, 100)
    page = 1
    rate_limit_hits = 0

    while len(repos) < n:
        url = (
            f'https://api.github.com/search/repositories'
            f'?q=stars:>{min_stars}&sort=stars&order=desc'
            f'&per_page={per_page}&page={page}'
        )
        try:
            # Without a timeout a stalled connection would hang the script.
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f" Request failed: {exc}")
            break

        if resp.status_code in (403, 429):
            # Bound the retries: a 403 that is not a rate limit would
            # otherwise keep this loop sleeping forever.
            rate_limit_hits += 1
            if rate_limit_hits > max_rate_limit_retries:
                print(" Still rate limited; giving up.")
                break
            print(" Rate limited. Sleeping 60s...")
            time.sleep(60)
            continue
        if resp.status_code != 200:
            print(f" API error {resp.status_code}: {resp.text[:200]}")
            break

        items = resp.json().get('items', [])
        if not items:
            break
        for r in items:
            repos.append({
                'full_name': r['full_name'],
                'stars': r['stargazers_count'],
                # `or` also covers keys that are present but null in the JSON.
                'default_branch': r.get('default_branch') or 'main',
                'description': r.get('description') or '',
                'language': r.get('language') or '',
            })
        print(f" Page {page}: got {len(items)} repos (total {len(repos)})")
        page += 1
        if len(repos) >= n:
            break
        # Small delay to avoid secondary rate limits
        time.sleep(0.5)

    return repos[:n]
|
|
||||||
|
|
||||||
def fetch_readme(repo):
    """Fetch README content for a repo from raw.githubusercontent.com.

    Tries the repo's ``default_branch`` first, then ``main`` and ``master``
    (each branch only once), and for every branch the file names
    ``README.md``, ``readme.md``, ``README.markdown`` and ``README.rst``.

    Args:
        repo: Dict with at least the keys ``full_name`` and ``default_branch``.

    Returns:
        ``(text, path)`` for the first URL answering with HTTP 200, or
        ``(None, None)`` when no candidate could be fetched.
    """
    candidates = ('README.md', 'readme.md', 'README.markdown', 'README.rst')
    # dict.fromkeys drops duplicate branch names while keeping their order.
    branches = dict.fromkeys([repo['default_branch'], 'main', 'master'])

    for branch in branches:
        for path in candidates:
            url = f'https://raw.githubusercontent.com/{repo["full_name"]}/{branch}/{path}'
            try:
                resp = requests.get(url, timeout=10)
            except requests.RequestException:
                # Best effort: a network failure on one candidate just moves
                # on to the next. Ctrl-C and real bugs are no longer swallowed.
                continue
            if resp.status_code == 200:
                return resp.text, path
    return None, None
|
|
||||||
|
|
||||||
_ATX_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
_TRAILING_HASHES_RE = re.compile(r'\s+#+\s*$')
_FENCE_RE = re.compile(r'^(`{3,}|~{3,})')


def extract_headings(text):
    """Extract heading sequence from markdown text.

    Only ATX headings (``#`` to ``######`` followed by whitespace) are
    recognised. Lines inside fenced code blocks (``` or ~~~) are ignored so
    that shell comments such as ``# install deps`` are not mistaken for
    headings.

    Returns list of (level, text) tuples, e.g. [(1, "Title"), (2, "Installation"), ...]
    """
    headings = []
    open_fence = None  # fence marker of the code block we are inside, if any
    for raw_line in text.splitlines():
        line = raw_line.strip()
        marker = _FENCE_RE.match(line)

        if open_fence is not None:
            # A closing fence is a bare run of the same character that is at
            # least as long as the opening one.
            if (marker and line == marker.group(1)
                    and line[0] == open_fence[0]
                    and len(line) >= len(open_fence)):
                open_fence = None
            continue

        if marker:
            opener = marker.group(1)
            info = line[len(opener):]
            # A backtick fence may not have backticks in its info string;
            # such a line is inline code, not the start of a block.
            if opener[0] == '~' or '`' not in info:
                open_fence = opener
                continue

        m = _ATX_HEADING_RE.match(line)
        if m:
            level = len(m.group(1))
            # Remove trailing `#` characters (common in some markdowns)
            title = _TRAILING_HASHES_RE.sub('', m.group(2).strip()).strip()
            headings.append((level, title))
    return headings
|
|
||||||
|
|
||||||
def compress_headings(headings):
    """Convert heading sequence to our symbol vocabulary.

    The first level-1 heading is treated as the document title and left out.
    Every other heading, whatever its level, is passed through
    ``normalize_heading``; headings that normalize to an empty string are
    dropped. Returns the resulting list of symbols in document order.
    """
    entries = list(headings)
    # Position of the title: the first H1, if there is one.
    title_pos = next(
        (pos for pos, (depth, _) in enumerate(entries) if depth == 1),
        None,
    )
    symbols = (
        normalize_heading(label)
        for pos, (_, label) in enumerate(entries)
        if pos != title_pos
    )
    return [symbol for symbol in symbols if symbol]
|
|
||||||
|
|
||||||
def main():
    """Run the README structure analysis end to end.

    Fetches the top repositories, downloads their READMEs, turns each README
    into a sequence of normalized heading symbols, prints vocabulary
    statistics, runs ``infer_ensemble`` over the sequences and writes a
    summary to ``readme_analysis.json`` in the parent of this script's
    directory. Stops early when no README yields a usable sequence.
    """
    from collections import Counter

    print("=" * 60)
    print("README Structure Analysis")
    print("=" * 60)

    # Step 1: Fetch top repos
    print("\n[1] Fetching top repos from GitHub...")
    repos = fetch_top_repos(n=100)
    print(f" Got {len(repos)} repos")

    # Step 2: Fetch READMEs
    print("\n[2] Fetching READMEs...")
    sequences = []
    failed = 0
    for i, repo in enumerate(repos, 1):
        raw_text, _readme_path = fetch_readme(repo)
        if raw_text is None:
            failed += 1
        else:
            seq = compress_headings(extract_headings(raw_text))
            if len(seq) >= 3:  # need at least a few sections
                sequences.append(seq)
        # Report progress even when this particular repo failed.
        if i % 20 == 0:
            print(f" {i}/{len(repos)}: {len(sequences)} valid, {failed} failed")

    print(f" Total: {len(sequences)} valid sequences, {failed} failed")

    if not sequences:
        # Grammar inference over an empty corpus is meaningless; stop here
        # instead of handing infer_ensemble nothing to work with.
        print(" No usable README sequences; nothing to analyse.")
        return

    # Step 3: Collect vocabulary stats
    print("\n[3] Vocabulary statistics...")
    # Counts every occurrence, so a symbol repeated inside one README is
    # counted more than once.
    symbol_counts = Counter(s for seq in sequences for s in seq)

    print(f" Unique symbols: {len(symbol_counts)}")
    print(" Top symbols:")
    for sym, cnt in symbol_counts.most_common(25):
        pct = cnt / len(sequences) * 100
        print(f" {sym:30s} {cnt:4d} ({pct:5.1f}%)")

    # Step 4: Run Dervish
    print("\n[4] Running Dervish grammar inference...")
    result = infer_ensemble(sequences)
    best = result['best']

    print(f"\n Best: {best['algorithm']} (MDL {best['mdl_score']})")
    print(f" Grammar: {best['grammar']}")
    if len(result['all']) > 1:
        for r in result['all']:
            m = sum(1 for s in sequences if _matches(r['grammar'], s))
            print(f" {r['algorithm']:10s} MDL={r['mdl_score']:>8.2f} match={m}/{len(sequences)}")
    print(f"\n Why: {result['why']}")

    # Step 5: Print example sequences
    print("\n[5] Sample sequences:")
    for seq in sequences[:10]:
        print(f" {' → '.join(seq[:10])}" + (" → ..." if len(seq) > 10 else ""))
    print(f" ... ({len(sequences)} total)")

    # Save results
    out = {
        'num_repos': len(sequences),
        'failed': failed,
        'unique_symbols': len(symbol_counts),
        'top_symbols': dict(symbol_counts.most_common(30)),
        'grammar': best['grammar'],
        'algorithm': best['algorithm'],
        'mdl': best['mdl_score'],
    }
    out_path = Path(__file__).resolve().parent.parent / 'readme_analysis.json'
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(out, f, indent=2)
    print(f"\nResults saved to {out_path}")


if __name__ == '__main__':
    main()
|
|
||||||
Loading…
Add table
Reference in a new issue