feat: LLM 论文图书馆 — 初始提交
- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
This commit is contained in:
107
api/parse_papers.py
Normal file
107
api/parse_papers.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
|
||||
import re, json, os
|
||||
|
||||
HTML = '/app/working/workspaces/default/llm_library.html'
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

# Text markers that delimit the PAPER_DATA object literal inside the page.
_DATA_START = 'const PAPER_DATA = {'
_DATA_END = 'APP STATE'

# Module-level string fields copied verbatim from the source literal.
_MODULE_FIELDS = ('name', 'icon', 'desc', 'color')
# Paper lists carried by every area, in output order.
_SECTIONS = ('mainline', 'branches', 'forward')

_MODULE_START_RE = re.compile(r'^\w+:\s*\{')
_MODULE_FIELD_RE = re.compile(r'(\w+):\s*"([^"]*)"')
_AREA_ID_RE = re.compile(r'id:\s*"(\w+)"')
_AREA_NAME_RE = re.compile(r'name:\s*"([^"]+)"')
_SECTION_RE = re.compile(r'(mainline|branches|forward):\s*\[')

_TITLE_RE = re.compile(r'title:\s*"([^"]+)"')
_AUTHORS_RE = re.compile(r'authors:\s*"([^"]*?)"')
_YEAR_RE = re.compile(r'year:\s*(\d+)')
_VENUE_RE = re.compile(r'venue:\s*"([^"]*?)"')
_ARXIV_RE = re.compile(r'arxiv:\s*"(\S+?)"')
_PDF_RE = re.compile(r'pdf:\s*"(https:[^"]+)"')
_TAGS_RE = re.compile(r'tags:\s*\[(.*?)\]', re.DOTALL)


def _count_papers(area):
    """Return the number of papers across all sections of one area dict."""
    return sum(len(area.get(section, [])) for section in _SECTIONS)


def _parse_paper(stripped):
    """Turn one single-line ``{ title: ... }`` literal into a paper dict.

    Returns None when the line lacks a title, a year, or a tags list.
    The ``arxiv`` and ``pdf`` keys are only present when a non-empty value
    was found on the line.
    """
    title = _TITLE_RE.search(stripped)
    year = _YEAR_RE.search(stripped)
    tags_m = _TAGS_RE.search(stripped)
    if not (title and year and tags_m):
        return None

    authors = _AUTHORS_RE.search(stripped)
    venue = _VENUE_RE.search(stripped)
    entry = {
        'title': title.group(1),
        'authors': authors.group(1) if authors else '',
        'year': int(year.group(1)),
        'venue': venue.group(1) if venue else '',
        'tags': re.findall(r'"([^"]*)"', tags_m.group(1)),
    }
    arxiv = _ARXIV_RE.search(stripped)
    if arxiv and arxiv.group(1):
        entry['arxiv'] = arxiv.group(1)
    pdf = _PDF_RE.search(stripped)
    if pdf and pdf.group(1):
        entry['pdf'] = pdf.group(1)
    return entry


def parse_paper_data(html):
    """Extract the PAPER_DATA literal from *html* as a nested dict.

    The parser is line- and indentation-based: module headers are expected
    at indent 2, module metadata at indent 4, area ``id``/``name`` and the
    section headers at indent 8, and each paper on a single line starting
    with ``{ title:``.

    Returns a dict mapping module id to
    ``{'id', 'name', 'icon', 'desc', 'color', 'areas'}``; modules that never
    received a ``name`` are dropped.

    Raises ValueError (from ``str.index``) when either marker is missing.
    """
    start = html.index(_DATA_START)
    # Look for the end marker only after the data block begins, so an
    # earlier occurrence of the same text cannot produce an empty slice.
    end = html.index(_DATA_END, start)
    # Slice right after the marker itself; a hard-coded offset longer than
    # the marker would eat into the first module line's indentation.
    block = html[start + len(_DATA_START):end]

    modules = {}
    current_mod = None
    current_area = None
    current_section = 'mainline'

    for line in block.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        indent = len(line) - len(line.lstrip())

        # Module start: "  arch: {" at indent 2
        if indent == 2 and _MODULE_START_RE.match(stripped):
            if current_mod and current_mod.get('name'):
                modules[current_mod['id']] = current_mod
            mid = stripped.split(':')[0]
            # 'color' defaults to the module id until a color line overrides it.
            current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '',
                           'color': mid, 'areas': []}
            current_area = None
            current_section = 'mainline'
            continue

        if not current_mod:
            continue

        # Module metadata at indent 4
        if indent == 4:
            m = _MODULE_FIELD_RE.match(stripped)
            if m and m.group(1) in _MODULE_FIELDS:
                current_mod[m.group(1)] = m.group(2)

        # Area header at indent 8
        if indent == 8:
            m_id = _AREA_ID_RE.match(stripped)
            if m_id:
                current_area = {'id': m_id.group(1), 'name': '',
                                'mainline': [], 'branches': [], 'forward': []}
                current_mod['areas'].append(current_area)
                current_section = 'mainline'
                continue
            m_name = _AREA_NAME_RE.match(stripped)
            if m_name and current_area:
                current_area['name'] = m_name.group(1)
                continue
            m_section = _SECTION_RE.match(stripped)
            if m_section:
                current_section = m_section.group(1)

        # Paper entry
        if stripped.startswith('{ title:') and 'tags:' in stripped and current_area:
            entry = _parse_paper(stripped)
            if entry is not None:
                current_area[current_section].append(entry)

    # Save last module
    if current_mod and current_mod.get('name'):
        modules[current_mod['id']] = current_mod

    return modules


def main():
    """Read HTML, parse it, print a per-module summary and write JSON."""
    # Explicit UTF-8: the data is non-ASCII and must not depend on the
    # platform's default encoding.
    with open(HTML, encoding='utf-8') as f:
        html = f.read()

    modules = parse_paper_data(html)

    # Count
    total = sum(
        _count_papers(a)
        for m in modules.values() for a in m.get('areas', [])
    )

    print(f'Parsed: {len(modules)} modules, {total} papers')
    for mid, m in sorted(modules.items()):
        pc = sum(_count_papers(a) for a in m['areas'])
        print(f' {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers')

    os.makedirs(os.path.dirname(JSON), exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file is
    # written as UTF-8 explicitly.
    with open(JSON, 'w', encoding='utf-8') as f:
        json.dump(modules, f, ensure_ascii=False, indent=2)
    print(f'\nSaved to {JSON}')


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user