- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理 - 180 篇论文数据 (data/papers.json): 9 模块、32 子领域 - 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读 - 底部状态栏: arXiv/HF 连通性检测 - PDF 加载: arXiv 优先(5s超时) → HK 本地兜底 - Docker 化部署 (Dockerfile + start.sh + nginx.conf) - arXiv + HF 批量下载器 (api/downloader.py)
108 lines
3.9 KiB
Python
108 lines
3.9 KiB
Python
#!/usr/bin/env python3
"""Parse llm_library.html PAPER_DATA block → nested papers.json

The script reads the HTML file at ``HTML``, cuts out the text between the
``const PAPER_DATA = {`` marker and the next ``APP STATE`` marker, walks that
text line by line using leading-space counts as the structural cue, and writes
the resulting ``{module_id: module}`` mapping to ``JSON``.

Expected layout of the PAPER_DATA text (as implied by the indent checks):

* indent 2 -- ``<module_id>: {`` starts a module
* indent 4 -- ``name`` / ``icon`` / ``desc`` / ``color`` string fields
* indent 8 -- area ``id`` / ``name`` and the ``mainline`` / ``branches`` /
  ``forward`` list openers
* any indent -- one paper per line, starting with ``{ title:`` and
  containing ``tags:``
"""
import json
import os
import re

HTML = '/app/working/workspaces/default/llm_library.html'
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

# Markers that delimit the PAPER_DATA literal inside the HTML source.
_START_MARKER = 'const PAPER_DATA = {'
_END_MARKER = 'APP STATE'

# Module-level string fields that may be overridden at indent 4.
_MODULE_META_KEYS = ('name', 'icon', 'desc', 'color')
# Paper lists carried by every area.
_SECTIONS = ('mainline', 'branches', 'forward')

# Structural patterns, applied with .match() to the stripped line.
_MODULE_START_RE = re.compile(r'^\w+:\s*\{')
_MODULE_META_RE = re.compile(r'(\w+):\s*"([^"]*)"')
_AREA_ID_RE = re.compile(r'id:\s*"(\w+)"')
_AREA_NAME_RE = re.compile(r'name:\s*"([^"]+)"')
_SECTION_RE = re.compile(r'(mainline|branches|forward):\s*\[')

# Paper-field patterns, applied with .search() anywhere in the paper line.
_TITLE_RE = re.compile(r'title:\s*"([^"]+)"')
_AUTHORS_RE = re.compile(r'authors:\s*"([^"]*?)"')
_YEAR_RE = re.compile(r'year:\s*(\d+)')
_VENUE_RE = re.compile(r'venue:\s*"([^"]*?)"')
_ARXIV_RE = re.compile(r'arxiv:\s*"(\S+?)"')
_PDF_RE = re.compile(r'pdf:\s*"(https:[^"]+)"')
_TAGS_RE = re.compile(r'tags:\s*\[(.*?)\]', re.DOTALL)
_QUOTED_RE = re.compile(r'"([^"]*)"')


def extract_paper_block(html):
    """Return the text between the PAPER_DATA opener and the end marker.

    Args:
        html: Full text of the HTML page.

    Returns:
        Everything after ``const PAPER_DATA = {`` up to (not including) the
        first ``APP STATE`` that follows it.

    Raises:
        ValueError: If either marker is missing.
    """
    try:
        start = html.index(_START_MARKER)
    except ValueError:
        raise ValueError(f'start marker {_START_MARKER!r} not found') from None

    # Slice right after the marker itself. A hard-coded offset larger than
    # the marker can swallow leading spaces of the first data line, which
    # breaks the indent-based module detection.
    body_start = start + len(_START_MARKER)

    try:
        # Search only after the opener so an earlier mention of the end
        # marker cannot produce an empty slice.
        end = html.index(_END_MARKER, body_start)
    except ValueError:
        raise ValueError(
            f'end marker {_END_MARKER!r} not found after {_START_MARKER!r}'
        ) from None

    return html[body_start:end]


def _parse_paper_line(stripped):
    """Build a paper entry from one stripped ``{ title: ... }`` line.

    Returns:
        A dict with ``title``, ``authors``, ``year``, ``venue`` and ``tags``
        (plus ``arxiv`` / ``pdf`` when present on the line), or ``None`` when
        the line lacks a title, a year or a tags list.
    """
    title = _TITLE_RE.search(stripped)
    year = _YEAR_RE.search(stripped)
    tags = _TAGS_RE.search(stripped)
    if not (title and year and tags):
        return None

    authors = _AUTHORS_RE.search(stripped)
    venue = _VENUE_RE.search(stripped)
    entry = {
        'title': title.group(1),
        'authors': authors.group(1) if authors else '',
        'year': int(year.group(1)),
        'venue': venue.group(1) if venue else '',
        'tags': _QUOTED_RE.findall(tags.group(1)),
    }

    # Optional links are added only when found, so absent ones leave no key.
    arxiv = _ARXIV_RE.search(stripped)
    if arxiv:
        entry['arxiv'] = arxiv.group(1)
    pdf = _PDF_RE.search(stripped)
    if pdf:
        entry['pdf'] = pdf.group(1)
    return entry


def parse_paper_block(block):
    """Parse the PAPER_DATA text into a ``{module_id: module}`` mapping.

    Args:
        block: Text as returned by :func:`extract_paper_block`.

    Returns:
        Dict keyed by module id. Each module has ``id``, ``name``, ``icon``,
        ``desc``, ``color`` (defaults to the id) and ``areas``; each area has
        ``id``, ``name`` and the three paper lists ``mainline``, ``branches``
        and ``forward``. Modules that never received a non-empty ``name`` are
        dropped.
    """
    modules = {}
    module = None
    area = None
    section = 'mainline'

    def commit(finished):
        # Only modules that picked up a name are kept.
        if finished and finished.get('name'):
            modules[finished['id']] = finished

    for line in block.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        indent = len(line) - len(line.lstrip())

        # Module start: "<id>: {" at indent 2.
        if indent == 2 and _MODULE_START_RE.match(stripped):
            commit(module)
            module_id = stripped.split(':')[0]
            module = {
                'id': module_id,
                'name': '',
                'icon': '',
                'desc': '',
                'color': module_id,
                'areas': [],
            }
            area = None
            section = 'mainline'
            continue

        # Ignore everything that precedes the first module.
        if module is None:
            continue

        # Module metadata at indent 4.
        if indent == 4:
            meta = _MODULE_META_RE.match(stripped)
            if meta and meta.group(1) in _MODULE_META_KEYS:
                module[meta.group(1)] = meta.group(2)

        # Area header and section openers at indent 8.
        # NOTE(review): the section checks are assumed to belong to this
        # indent-8 branch together with id/name -- confirm against the
        # original file's indentation.
        if indent == 8:
            area_id = _AREA_ID_RE.match(stripped)
            if area_id:
                area = {'id': area_id.group(1), 'name': ''}
                area.update((key, []) for key in _SECTIONS)
                module['areas'].append(area)
                section = 'mainline'
                continue

            area_name = _AREA_NAME_RE.match(stripped)
            if area_name and area is not None:
                area['name'] = area_name.group(1)
                continue

            opener = _SECTION_RE.match(stripped)
            if opener:
                section = opener.group(1)

        # Paper entry: a single line inside the current area's active list.
        if (
            area is not None
            and stripped.startswith('{ title:')
            and 'tags:' in stripped
        ):
            entry = _parse_paper_line(stripped)
            if entry is not None:
                area[section].append(entry)

    # The last module has no following module start to trigger its commit.
    commit(module)
    return modules


def count_papers(module):
    """Return the number of papers across all areas and sections of a module."""
    return sum(
        len(area.get(key, []))
        for area in module.get('areas', [])
        for key in _SECTIONS
    )


def main():
    """Read ``HTML``, parse it, print a summary and write ``JSON``."""
    # Explicit UTF-8: the data is written with ensure_ascii=False, so relying
    # on the locale's default encoding can fail on non-ASCII text.
    with open(HTML, encoding='utf-8') as f:
        html = f.read()

    modules = parse_paper_block(extract_paper_block(html))

    total = sum(count_papers(m) for m in modules.values())
    print(f'Parsed: {len(modules)} modules, {total} papers')
    for mid, m in sorted(modules.items()):
        print(f' {mid}: {m["name"]} — {len(m["areas"])} areas, {count_papers(m)} papers')

    os.makedirs(os.path.dirname(JSON), exist_ok=True)
    with open(JSON, 'w', encoding='utf-8') as f:
        json.dump(modules, f, ensure_ascii=False, indent=2)
    print(f'\nSaved to {JSON}')


if __name__ == '__main__':
    main()