# llm-library/api/parse_papers.py

#!/usr/bin/env python3
"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
import re, json, os

HTML = '/app/working/workspaces/default/llm_library.html'
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

# Text markers that delimit the PAPER_DATA object literal inside the HTML.
_DATA_START = 'const PAPER_DATA = {'
_DATA_END = 'APP STATE'

# Module-level keys that are copied verbatim from the JS literal.
_MODULE_META_KEYS = ('name', 'icon', 'desc', 'color')

# Patterns are compiled once because they are applied to every line.
_MODULE_RE = re.compile(r'^\w+:\s*\{')
_META_RE = re.compile(r'(\w+):\s*"([^"]*)"')
_AREA_ID_RE = re.compile(r'id:\s*"(\w+)"')
_AREA_NAME_RE = re.compile(r'name:\s*"([^"]+)"')
_SECTION_RE = re.compile(r'(mainline|branches|forward):\s*\[')
_TITLE_RE = re.compile(r'title:\s*"([^"]+)"')
_AUTHORS_RE = re.compile(r'authors:\s*"([^"]*?)"')
_YEAR_RE = re.compile(r'year:\s*(\d+)')
_VENUE_RE = re.compile(r'venue:\s*"([^"]*?)"')
_ARXIV_RE = re.compile(r'arxiv:\s*"(\S+?)"')
_PDF_RE = re.compile(r'pdf:\s*"(https:[^"]+)"')
_TAGS_RE = re.compile(r'tags:\s*\[(.*?)\]', re.DOTALL)
_TAG_ITEM_RE = re.compile(r'"([^"]*)"')


def extract_paper_block(html):
    """Return the text between the PAPER_DATA opening brace and 'APP STATE'.

    The slice starts immediately after the start marker (computed from the
    marker's real length, not a hard-coded offset) so the indentation of the
    first module line is preserved.  The end marker is searched for only
    *after* the start marker, so an earlier occurrence cannot produce an
    empty or inverted slice.

    Raises:
        ValueError: if either marker cannot be found.
    """
    start = html.find(_DATA_START)
    if start == -1:
        raise ValueError(f'start marker {_DATA_START!r} not found')
    body_start = start + len(_DATA_START)
    end = html.find(_DATA_END, body_start)
    if end == -1:
        raise ValueError(f'end marker {_DATA_END!r} not found after PAPER_DATA')
    return html[body_start:end]


def _parse_paper_entry(stripped):
    """Build a paper dict from one '{ title: ... }' line, or return None.

    'title', 'year' and 'tags' are required; 'authors' and 'venue' default
    to ''.  'arxiv' and 'pdf' keys are added only when present.  Values are
    read up to the first double quote, so escaped quotes inside a value are
    not supported.
    """
    title = _TITLE_RE.search(stripped)
    year = _YEAR_RE.search(stripped)
    tags = _TAGS_RE.search(stripped)
    if not (title and year and tags):
        return None

    authors = _AUTHORS_RE.search(stripped)
    venue = _VENUE_RE.search(stripped)
    entry = {
        'title': title.group(1),
        'authors': authors.group(1) if authors else '',
        'year': int(year.group(1)),
        'venue': venue.group(1) if venue else '',
        'tags': _TAG_ITEM_RE.findall(tags.group(1)),
    }
    arxiv = _ARXIV_RE.search(stripped)
    if arxiv:
        entry['arxiv'] = arxiv.group(1)
    pdf = _PDF_RE.search(stripped)
    if pdf:
        entry['pdf'] = pdf.group(1)
    return entry


def _count_papers(module):
    """Return the number of papers across all areas/sections of a module."""
    return sum(
        len(area.get('mainline', []))
        + len(area.get('branches', []))
        + len(area.get('forward', []))
        for area in module.get('areas', [])
    )


def parse_paper_data(block):
    """Parse the PAPER_DATA literal body into a dict of modules keyed by id.

    The parser is line- and indentation-based:
      * indent 2, ``key: {``     -> start of a module
      * indent 4, ``key: "..."`` -> module metadata (name/icon/desc/color)
      * indent 8, ``id: "..."``  -> start of a new area in the module
      * indent 8, ``name: "..."``-> name of the current area
      * indent 8, ``mainline: [`` / ``branches: [`` / ``forward: [``
                                 -> section that following papers go into
      * any line starting with ``{ title:`` and containing ``tags:``
                                 -> a paper appended to the current section

    Modules that never received a non-empty 'name' are dropped.
    """
    modules = {}
    current_mod = None
    current_area = None
    current_section = 'mainline'

    def store(mod):
        # Only modules with a name are kept (matches the data's convention
        # that every real module declares one).
        if mod and mod.get('name'):
            modules[mod['id']] = mod

    for line in block.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        indent = len(line) - len(line.lstrip())

        if indent == 2 and _MODULE_RE.match(stripped):
            store(current_mod)
            mod_id = stripped.split(':')[0]
            # 'color' defaults to the module id unless overridden below.
            current_mod = {
                'id': mod_id, 'name': '', 'icon': '', 'desc': '',
                'color': mod_id, 'areas': [],
            }
            current_area = None
            current_section = 'mainline'
            continue

        if current_mod is None:
            # Ignore anything that precedes the first module header.
            continue

        if indent == 4:
            meta = _META_RE.match(stripped)
            if meta and meta.group(1) in _MODULE_META_KEYS:
                current_mod[meta.group(1)] = meta.group(2)
        elif indent == 8:
            area_id = _AREA_ID_RE.match(stripped)
            if area_id:
                current_area = {
                    'id': area_id.group(1), 'name': '',
                    'mainline': [], 'branches': [], 'forward': [],
                }
                current_mod['areas'].append(current_area)
                current_section = 'mainline'
                continue
            area_name = _AREA_NAME_RE.match(stripped)
            if area_name and current_area is not None:
                current_area['name'] = area_name.group(1)
                continue
            section = _SECTION_RE.match(stripped)
            if section:
                current_section = section.group(1)

        # Paper lines are recognised by content, not by indent.
        if (current_area is not None
                and stripped.startswith('{ title:')
                and 'tags:' in stripped):
            entry = _parse_paper_entry(stripped)
            if entry is not None:
                current_area[current_section].append(entry)

    store(current_mod)
    return modules


def main():
    """Read the HTML file, parse PAPER_DATA, print a summary, write JSON."""
    # Explicit UTF-8: the output is written with ensure_ascii=False, so the
    # platform default encoding must not be relied upon.
    with open(HTML, encoding='utf-8') as f:
        html = f.read()

    modules = parse_paper_data(extract_paper_block(html))

    total = sum(_count_papers(m) for m in modules.values())
    print(f'Parsed: {len(modules)} modules, {total} papers')
    for mid, m in sorted(modules.items()):
        pc = _count_papers(m)
        print(f'  {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers')

    os.makedirs(os.path.dirname(JSON), exist_ok=True)
    with open(JSON, 'w', encoding='utf-8') as f:
        json.dump(modules, f, ensure_ascii=False, indent=2)
    print(f'\nSaved to {JSON}')


if __name__ == '__main__':
    main()