# llm-library/api/extract_data.py

#!/usr/bin/env python3
"""Build papers.json by regex-extracting paper entries from llm_library.html"""
import re, json, os

# The project root is the parent of the directory that contains this script;
# the HTML source is looked up directly under it.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
html_path = os.path.join(ROOT, 'llm_library.html')

# Decode explicitly instead of relying on the platform's locale encoding, so
# non-ASCII titles/author names are read the same way on every machine.
# NOTE(review): assumes llm_library.html is saved as UTF-8 — confirm.
with open(html_path, 'r', encoding='utf-8') as f:
    html = f.read()

# Paper entries are pulled straight out of the HTML text with one regex.
# Expected entry shape (whitespace/newlines between fields are allowed):
#   { title:"...", authors:"...", year:NNNN, venue:"...",
#     [arxiv:"...",] [pdf:"...",] tags:[...] }
#
# Capture groups: 1=title, 2=authors, 3=year, 4=venue, 5=arxiv, 6=pdf,
# 7=raw text between the tag brackets.
#
# - arxiv and pdf are each optional, so an entry may carry neither, either,
#   or both (in that order); a missing field leaves its group as None.
# - The tag list is matched as a run of double-quoted strings and other
#   non-']' characters. That lets a ']' appear inside a quoted tag while
#   keeping the match from running past this entry's closing ']' into the
#   entries that follow.
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*)?'
    r'(?:pdf:\s*"([^"]*)",\s*)?'
    r'tags:\s*\[((?:"[^"]*"|[^\]"])*)\]\s*\}'
)

def _nearest_before(matches, pos):
    """Return the last match in *matches* that ends at or before *pos*.

    *matches* must be in document order (as produced by ``finditer``).
    Returns None when no match ends at or before *pos*.
    """
    for candidate in reversed(matches):
        if candidate.end() <= pos:
            return candidate
    return None


# Locate every module/area header once, over the whole document. Each paper
# is then attributed to the *nearest* header that precedes it. Searching a
# fixed-size look-behind window with re.search would instead return the
# earliest header inside the window, and nothing at all when the header lies
# further back than the window reaches.
module_headers = list(
    re.finditer(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"', html)
)
area_headers = list(
    re.finditer(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"', html)
)
# Broader module pattern; only scanned if some paper has no strict module
# header before it.
loose_module_headers = None

papers = []
for m in paper_re.finditer(html):
    title = m.group(1)
    authors = m.group(2)
    year = int(m.group(3))
    venue = m.group(4)
    # An absent (or empty) arxiv/pdf field is stored as None.
    arxiv = m.group(5) or None
    pdf = m.group(6) or None
    # Group 7 is the raw text between the tag brackets; pull out each
    # double-quoted tag.
    tags = re.findall(r'"([^"]*)"', m.group(7))

    pos = m.start()

    # Module: nearest strict header, else nearest match of the broader pattern.
    mod_match = _nearest_before(module_headers, pos)
    if mod_match is None:
        if loose_module_headers is None:
            loose_module_headers = list(
                re.finditer(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"', html)
            )
        mod_match = _nearest_before(loose_module_headers, pos)
    if mod_match is not None:
        mod_id = mod_match.group(1)
        mod_name = mod_match.group(2)
    else:
        mod_id = 'unknown'
        mod_name = 'Unknown'

    # Area: nearest preceding `id:"..." ... name:"..."` pair.
    area_match = _nearest_before(area_headers, pos)
    if area_match is not None:
        area_id = area_match.group(1)
        area_name = area_match.group(2)
    else:
        area_id = 'unknown'
        area_name = 'Unknown'

    papers.append({
        'module': mod_id,
        'module_name': mod_name,
        'area': area_id,
        'area_name': area_name,
        'title': title,
        'authors': authors,
        'year': year,
        'venue': venue,
        'arxiv': arxiv,
        'pdf': pdf,
        'tags': tags,
    })

print(f'Extracted {len(papers)} papers')

# The papers stay a flat list for now; regrouping them into
# module → area → section (mainline/branches/forward) is deferred until the
# flat extraction has been verified.

# Module metadata, keyed by module id. Each record carries the module's
# name, icon and description as found in the HTML; 'color' repeats the
# module id and 'areas' starts out empty.
module_meta_re = re.compile(
    r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\""
)
modules = {
    hit.group(1): {
        'name': hit.group(2),
        'icon': hit.group(3),
        'desc': hit.group(4),
        'color': hit.group(1),
        'areas': [],
    }
    for hit in module_meta_re.finditer(html)
}

# Report what was found, one module per line.
print(f'Found {len(modules)} modules')
for mod_id, mod in modules.items():
    print(f'  {mod_id}: {mod["name"]}')

# Write the flat paper list to ../data/papers.json relative to this script's
# directory, creating the data directory if it does not exist yet.
output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False emits non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly; a non-UTF-8 locale default (e.g. cp1252) could
# otherwise raise UnicodeEncodeError on such characters.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(papers, f, ensure_ascii=False, indent=2)

print(f'Saved {len(papers)} papers (flat) to {output_path}')