#!/usr/bin/env python3
"""Parse llm_library.html PAPER_DATA block → nested papers.json

The HTML file embeds a JavaScript object literal (``const PAPER_DATA = {...}``).
This script does not evaluate the JavaScript; it walks the literal line by
line and relies on its indentation and on one-paper-per-line formatting to
rebuild a ``{module_id: module}`` mapping, which is then written as JSON.
"""
import json
import os
import re

HTML = '/app/working/workspaces/default/llm_library.html'
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

# Text delimiting the PAPER_DATA literal inside the HTML file.
START_MARKER = 'const PAPER_DATA = {'
END_MARKER = 'APP STATE'

# Per-area paper lists, in the order they are counted.
SECTIONS = ('mainline', 'branches', 'forward')

# Keys copied verbatim from a module's indent-4 metadata lines.
MODULE_META_KEYS = ('name', 'icon', 'desc', 'color')

# Compiled once: every pattern below is applied to each line of the block.
MODULE_START_RE = re.compile(r'^\w+:\s*\{')
MODULE_META_RE = re.compile(r'(\w+):\s*"([^"]*)"')
AREA_ID_RE = re.compile(r'id:\s*"(\w+)"')
AREA_NAME_RE = re.compile(r'name:\s*"([^"]+)"')
SECTION_RE = re.compile(r'(mainline|branches|forward):\s*\[')
TITLE_RE = re.compile(r'title:\s*"([^"]+)"')
AUTHORS_RE = re.compile(r'authors:\s*"([^"]*?)"')
YEAR_RE = re.compile(r'year:\s*(\d+)')
VENUE_RE = re.compile(r'venue:\s*"([^"]*?)"')
ARXIV_RE = re.compile(r'arxiv:\s*"(\S+?)"')
PDF_RE = re.compile(r'pdf:\s*"(https:[^"]+)"')
TAGS_RE = re.compile(r'tags:\s*\[(.*?)\]')
QUOTED_RE = re.compile(r'"([^"]*)"')


def extract_block(html):
    """Return the text between the PAPER_DATA opening brace and END_MARKER.

    Args:
        html: Full contents of the HTML file.

    Returns:
        The slice of ``html`` starting right after ``START_MARKER`` and ending
        right before the first ``END_MARKER`` that follows it.

    Raises:
        ValueError: If either marker cannot be found.
    """
    start = html.find(START_MARKER)
    if start == -1:
        raise ValueError(f'start marker {START_MARKER!r} not found')
    # Slice from the exact end of the marker so that the indentation of the
    # first data line is left intact for the indent-based parser.
    start += len(START_MARKER)
    # Search only after the start marker; an earlier occurrence of the end
    # marker would otherwise produce an empty block.
    end = html.find(END_MARKER, start)
    if end == -1:
        raise ValueError(f'end marker {END_MARKER!r} not found after PAPER_DATA')
    return html[start:end]


def parse_paper(stripped):
    """Parse a single-line paper object literal into a dict.

    Args:
        stripped: One line of the block with surrounding whitespace removed.

    Returns:
        A dict with ``title``, ``authors``, ``year``, ``venue`` and ``tags``
        (plus ``arxiv`` / ``pdf`` when present), or ``None`` when the line
        lacks a title, a numeric year or a tags list.
    """
    title = TITLE_RE.search(stripped)
    year = YEAR_RE.search(stripped)
    tags_m = TAGS_RE.search(stripped)
    if not (title and year and tags_m):
        return None

    authors = AUTHORS_RE.search(stripped)
    venue = VENUE_RE.search(stripped)
    arxiv = ARXIV_RE.search(stripped)
    pdf = PDF_RE.search(stripped)

    entry = {
        'title': title.group(1),
        'authors': authors.group(1) if authors else '',
        'year': int(year.group(1)),
        'venue': venue.group(1) if venue else '',
        'tags': QUOTED_RE.findall(tags_m.group(1)),
    }
    # Optional links are only emitted when the literal actually carries them.
    if arxiv:
        entry['arxiv'] = arxiv.group(1)
    if pdf:
        entry['pdf'] = pdf.group(1)
    return entry


def parse_modules(block):
    """Rebuild the module → area → paper hierarchy from the literal's text.

    The parser is driven by leading-space counts: a ``key: {`` line at indent 2
    opens a module, quoted ``key: "value"`` lines at indent 4 fill module
    metadata, and ``id:`` / ``name:`` lines at indent 8 open and name an area.
    ``mainline: [`` / ``branches: [`` / ``forward: [`` select the list that
    subsequent paper lines are appended to.

    Args:
        block: Text of the PAPER_DATA literal (see ``extract_block``).

    Returns:
        Dict mapping module id to its module dict. Modules that never received
        a non-empty ``name`` are omitted.
    """
    modules = {}
    current_mod = None
    current_area = None
    current_section = 'mainline'

    for line in block.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        indent = len(line) - len(line.lstrip())

        # Module start: "key: {" at indent 2.
        if indent == 2 and MODULE_START_RE.match(stripped):
            if current_mod and current_mod.get('name'):
                modules[current_mod['id']] = current_mod
            mid = stripped.split(':')[0]
            # 'color' defaults to the module id until a metadata line overrides it.
            current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '',
                           'color': mid, 'areas': []}
            current_area = None
            current_section = 'mainline'
            continue

        if not current_mod:
            continue

        # Module metadata at indent 4.
        if indent == 4:
            meta = MODULE_META_RE.match(stripped)
            if meta and meta.group(1) in MODULE_META_KEYS:
                current_mod[meta.group(1)] = meta.group(2)

        # Area header at indent 8.
        if indent == 8:
            m_id = AREA_ID_RE.match(stripped)
            if m_id:
                current_area = {'id': m_id.group(1), 'name': '',
                                'mainline': [], 'branches': [], 'forward': []}
                current_mod['areas'].append(current_area)
                current_section = 'mainline'
                continue
            m_name = AREA_NAME_RE.match(stripped)
            if m_name and current_area:
                current_area['name'] = m_name.group(1)
                continue

        # Section marker: switches which list receives the following papers.
        m_section = SECTION_RE.match(stripped)
        if m_section:
            current_section = m_section.group(1)

        # Paper entry: a one-line object literal starting with its title.
        if (stripped.startswith('{ title:') and 'tags:' in stripped
                and current_area):
            entry = parse_paper(stripped)
            if entry is not None:
                current_area[current_section].append(entry)

    # Save last module.
    if current_mod and current_mod.get('name'):
        modules[current_mod['id']] = current_mod
    return modules


def count_papers(module):
    """Return the number of papers across all areas and sections of a module."""
    return sum(
        len(area.get(section, []))
        for area in module.get('areas', [])
        for section in SECTIONS
    )


def main(html_path=HTML, json_path=JSON):
    """Read the HTML file, parse PAPER_DATA, print a summary and write JSON.

    Args:
        html_path: Path of the HTML file to read.
        json_path: Path of the JSON file to write; its directory is created
            if missing.
    """
    # Explicit UTF-8: the output keeps non-ASCII characters unescaped
    # (ensure_ascii=False), so the locale default must not decide the encoding.
    with open(html_path, encoding='utf-8') as f:
        html = f.read()

    modules = parse_modules(extract_block(html))

    total = sum(count_papers(m) for m in modules.values())
    print(f'Parsed: {len(modules)} modules, {total} papers')
    for mid, m in sorted(modules.items()):
        pc = count_papers(m)
        print(f' {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers')

    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(modules, f, ensure_ascii=False, indent=2)
    print(f'\nSaved to {json_path}')


if __name__ == '__main__':
    main()