feat: LLM 论文图书馆 — 初始提交

- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
This commit is contained in:
2026-06-02 10:25:14 +00:00
commit f0ff62e082
21 changed files with 4042 additions and 0 deletions

107
api/parse_papers.py Normal file
View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
import re, json, os
# Input HTML page that embeds the PAPER_DATA JavaScript literal.
HTML = '/app/working/workspaces/default/llm_library.html'
# Output path of the nested JSON produced by this script.
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

# Text markers that delimit the PAPER_DATA literal inside the HTML.
_DATA_START = 'const PAPER_DATA = {'
_DATA_END = 'APP STATE'
# Offset from the start of _DATA_START to the first parsed character.
# NOTE(review): the marker above is 20 characters long, so this skips two
# further characters after the opening brace - confirm against the HTML.
_DATA_OFFSET = 22

# Paper lists carried by every area, in output order.
_SECTIONS = ('mainline', 'branches', 'forward')
# Module-level string fields copied verbatim from the literal.
_MODULE_FIELDS = ('name', 'icon', 'desc', 'color')

# Line-level patterns, compiled once instead of on every input line.
_MODULE_START_RE = re.compile(r'^\w+:\s*\{')
_MODULE_FIELD_RE = re.compile(r'(\w+):\s*"([^"]*)"')
_AREA_ID_RE = re.compile(r'id:\s*"(\w+)"')
_AREA_NAME_RE = re.compile(r'name:\s*"([^"]+)"')
_SECTION_RE = re.compile(r'(mainline|branches|forward):\s*\[')

# Field patterns applied to a single-line paper entry.
_TITLE_RE = re.compile(r'title:\s*"([^"]+)"')
_AUTHORS_RE = re.compile(r'authors:\s*"([^"]*?)"')
_YEAR_RE = re.compile(r'year:\s*(\d+)')
_VENUE_RE = re.compile(r'venue:\s*"([^"]*?)"')
_ARXIV_RE = re.compile(r'arxiv:\s*"(\S+?)"')
_PDF_RE = re.compile(r'pdf:\s*"(https:[^"]+)"')
_TAGS_RE = re.compile(r'tags:\s*\[(.*?)\]', re.DOTALL)


def extract_block(html: str) -> str:
    """Return the text between the PAPER_DATA marker and the end marker.

    The end marker is searched for *after* the start marker, so an earlier
    occurrence of the end marker cannot produce an empty slice.

    Raises:
        ValueError: if either marker is missing.
    """
    start = html.index(_DATA_START)
    end = html.index(_DATA_END, start)
    return html[start + _DATA_OFFSET:end]


def _parse_paper(stripped: str):
    """Parse one single-line paper entry; return a dict or None.

    An entry is only accepted when title, year and tags are all present.
    The 'arxiv' and 'pdf' keys are added only when found.
    """
    title = _TITLE_RE.search(stripped)
    year = _YEAR_RE.search(stripped)
    tags_m = _TAGS_RE.search(stripped)
    if not (title and year and tags_m):
        return None

    authors = _AUTHORS_RE.search(stripped)
    venue = _VENUE_RE.search(stripped)
    entry = {
        'title': title.group(1),
        'authors': authors.group(1) if authors else '',
        'year': int(year.group(1)),
        'venue': venue.group(1) if venue else '',
        'tags': re.findall(r'"([^"]*)"', tags_m.group(1)),
    }
    arxiv = _ARXIV_RE.search(stripped)
    if arxiv and arxiv.group(1):
        entry['arxiv'] = arxiv.group(1)
    pdf = _PDF_RE.search(stripped)
    if pdf and pdf.group(1):
        entry['pdf'] = pdf.group(1)
    return entry


def _store_module(modules: dict, module) -> None:
    """Register a finished module; modules without a name are discarded."""
    if module and module.get('name'):
        modules[module['id']] = module


def parse_paper_data(block: str) -> dict:
    """Parse the PAPER_DATA text into ``{module_id: module_dict}``.

    The parser is line- and indent-based: a module opens at indent 2,
    module string fields sit at indent 4, and area ``id``/``name`` sit at
    indent 8. Section markers (``mainline: [`` etc.) switch the list that
    subsequent paper entries are appended to. Paper entries are lines that
    start with ``{ title:`` and contain ``tags:``.
    """
    modules = {}
    current_mod = None
    current_area = None
    current_section = 'mainline'

    for line in block.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        indent = len(line) - len(line.lstrip())

        # Module start, e.g. "arch: {" at indent 2.
        if indent == 2 and _MODULE_START_RE.match(stripped):
            _store_module(modules, current_mod)
            mid = stripped.split(':')[0]
            # 'color' falls back to the module id unless overridden below.
            current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '',
                           'color': mid, 'areas': []}
            current_area = None
            current_section = 'mainline'
            continue

        # Ignore everything that precedes the first module.
        if not current_mod:
            continue

        # Module metadata at indent 4.
        if indent == 4:
            field = _MODULE_FIELD_RE.match(stripped)
            if field and field.group(1) in _MODULE_FIELDS:
                current_mod[field.group(1)] = field.group(2)

        # Area header at indent 8.
        if indent == 8:
            area_id = _AREA_ID_RE.match(stripped)
            if area_id:
                current_area = {'id': area_id.group(1), 'name': '',
                                'mainline': [], 'branches': [], 'forward': []}
                current_mod['areas'].append(current_area)
                current_section = 'mainline'
                continue
            area_name = _AREA_NAME_RE.match(stripped)
            if area_name and current_area:
                current_area['name'] = area_name.group(1)
                continue

        # Section marker: selects the target list for following papers.
        section = _SECTION_RE.match(stripped)
        if section:
            current_section = section.group(1)

        # Paper entry (must belong to an already opened area).
        if (current_area is not None
                and stripped.startswith('{ title:')
                and 'tags:' in stripped):
            entry = _parse_paper(stripped)
            if entry is not None:
                current_area[current_section].append(entry)

    # Save the last module.
    _store_module(modules, current_mod)
    return modules


def _count_papers(module: dict) -> int:
    """Return the number of papers across all areas and sections."""
    return sum(
        len(area.get(section, []))
        for area in module.get('areas', [])
        for section in _SECTIONS
    )


def main() -> None:
    """Read HTML, parse PAPER_DATA, print a summary and write JSON."""
    # Explicit UTF-8: the page and the output contain non-ASCII text, and
    # the locale default encoding is not guaranteed to be UTF-8.
    with open(HTML, encoding='utf-8') as f:
        html = f.read()

    modules = parse_paper_data(extract_block(html))

    total = sum(_count_papers(m) for m in modules.values())
    print(f'Parsed: {len(modules)} modules, {total} papers')
    for mid, m in sorted(modules.items()):
        pc = _count_papers(m)
        print(f' {mid}: {m["name"]} - {len(m["areas"])} areas, {pc} papers')

    os.makedirs(os.path.dirname(JSON), exist_ok=True)
    with open(JSON, 'w', encoding='utf-8') as f:
        json.dump(modules, f, ensure_ascii=False, indent=2)
    print(f'\nSaved to {JSON}')


if __name__ == '__main__':
    main()