#!/usr/bin/env python3
"""Build papers.json by regex-extracting paper entries from llm_library.html"""

import bisect
import json
import os
import re

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# One paper literal, e.g.
#   { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }
# The link field is optional and is either `arxiv:"..."` or `pdf:"..."`.
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
    r'tags:\s*\[(.*?)\]\s*\}',
    re.DOTALL
)

# Module header at the start of a line: `<id>: { name:"..."`.
_MODULE_RE = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
# Broader module header pattern, used only when no strict header precedes a paper.
_MODULE_FALLBACK_RE = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
# Area header: `id:"<id>" ... name:"..."`.
_AREA_RE = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')
# Module metadata: `<id>: { name:"..." ... icon:"..." ... desc:"..."`.
_MODULE_META_RE = re.compile(
    r'(\w+):\s*\{\s*name:\s*"([^"]+)"[^}]*icon:\s*"([^"]*)"[^}]*desc:\s*"([^"]*)"'
)

_UNKNOWN = ('unknown', 'Unknown')


def _collect_headers(pattern, html: str) -> tuple:
    """Index every match of *pattern* in *html*.

    Returns ``(ends, labels)``: ``ends`` holds the end offset of each match
    in ascending order and ``labels`` the matching ``(id, name)`` pairs.
    """
    ends = []
    labels = []
    for match in pattern.finditer(html):
        ends.append(match.end())
        labels.append((match.group(1), match.group(2)))
    return ends, labels


def _nearest_before(headers: tuple, pos: int):
    """Return the ``(id, name)`` of the last header ending at or before *pos*.

    Returns ``None`` when no header precedes *pos*.
    """
    ends, labels = headers
    idx = bisect.bisect_right(ends, pos) - 1
    return labels[idx] if idx >= 0 else None


def extract_papers(html: str) -> list:
    """Extract every paper entry from *html* as a flat list of dicts.

    Each dict carries the keys ``module``, ``module_name``, ``area``,
    ``area_name``, ``title``, ``authors``, ``year`` (int), ``venue``,
    ``arxiv``, ``pdf`` (``None`` when absent or empty) and ``tags`` (list
    of str).  A paper is attributed to the *nearest* module and area header
    that precedes it in the document; when none exists the id is
    ``'unknown'`` and the name ``'Unknown'``.
    """
    # Headers are indexed once over the whole document so that each paper
    # resolves to the closest preceding header, however far back it is.
    module_headers = _collect_headers(_MODULE_RE, html)
    fallback_headers = _collect_headers(_MODULE_FALLBACK_RE, html)
    area_headers = _collect_headers(_AREA_RE, html)

    papers = []
    for m in paper_re.finditer(html):
        pos = m.start()
        mod_id, mod_name = (
            _nearest_before(module_headers, pos)
            or _nearest_before(fallback_headers, pos)
            or _UNKNOWN
        )
        area_id, area_name = _nearest_before(area_headers, pos) or _UNKNOWN

        papers.append({
            'module': mod_id,
            'module_name': mod_name,
            'area': area_id,
            'area_name': area_name,
            'title': m.group(1),
            'authors': m.group(2),
            'year': int(m.group(3)),
            'venue': m.group(4),
            # An empty link string is treated the same as a missing one.
            'arxiv': m.group(5) or None,
            'pdf': m.group(6) or None,
            'tags': re.findall(r'"([^"]*)"', m.group(7)),
        })
    return papers


def extract_modules(html: str) -> dict:
    """Extract module metadata from *html*, keyed by module id.

    Each value holds ``name``, ``icon``, ``desc``, ``color`` (set to the
    module id) and an empty ``areas`` list that is not populated yet.
    """
    modules = {}
    for m in _MODULE_META_RE.finditer(html):
        mod_id = m.group(1)
        modules[mod_id] = {
            'name': m.group(2),
            'icon': m.group(3),
            'desc': m.group(4),
            'color': mod_id,
            'areas': [],
        }
    return modules


def main() -> None:
    """Read llm_library.html, report what was found, and write data/papers.json."""
    html_path = os.path.join(ROOT, 'llm_library.html')
    # Explicit UTF-8 so the result does not depend on the platform locale.
    # NOTE(review): assumes the HTML file is UTF-8 encoded -- confirm.
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read()

    papers = extract_papers(html)
    print(f'Extracted {len(papers)} papers')

    # Module metadata is only reported for now; the nested
    # module -> area -> section structure is not rebuilt yet.
    modules = extract_modules(html)
    print(f'Found {len(modules)} modules')
    for mod_id, mod in modules.items():
        print(f' {mod_id}: {mod["name"]}')

    # Save the flat list for verification.
    output_path = os.path.join(ROOT, 'data', 'papers.json')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 rather than with the locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f'Saved {len(papers)} papers (flat) to {output_path}')


if __name__ == '__main__':
    main()