- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
104 lines
3.2 KiB
Python
104 lines
3.2 KiB
Python
#!/usr/bin/env python3
"""Build papers.json by regex-extracting paper entries from llm_library.html.

The HTML page embeds a human-readable JavaScript object literal whose
entries look like::

    { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }

Every such entry is extracted with a regular expression and annotated with
the module and area headers that most closely precede it in the text. The
result is written as a flat JSON list; rebuilding the nested
module -> area -> section (mainline/branches/forward) structure is left for
a later step, once the flat list has been verified.

Run this file as a script to perform the build. Importing it only defines
the helpers and performs no file I/O.
"""

import json
import os
import re

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
html_path = os.path.join(ROOT, 'llm_library.html')
# Built from ROOT so that input and output are anchored to the same directory.
output_path = os.path.join(ROOT, 'data', 'papers.json')

# How many characters before a paper entry are inspected for its headers.
CONTEXT_WINDOW = 3000

# Pattern: { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }
# The arxiv/pdf field is optional; at most one of groups 5 and 6 is set.
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
    r'tags:\s*\[(.*?)\]\s*\}',
    re.DOTALL
)

# Module header at the start of a line, e.g. ` arch: { name:"..."`.
_MODULE_RE = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
# Broader fallback used only when no line-anchored module header is found.
_MODULE_FALLBACK_RE = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
# Area header: an `id:"..."` followed by a `name:"..."` inside the same braces.
_AREA_RE = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')
# Module header together with its icon and description.
_MODULE_META_RE = re.compile(
    r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\""
)
_TAG_RE = re.compile(r'"([^"]*)"')


def _last_match(pattern, text):
    """Return the right-most non-overlapping match of *pattern* in *text*.

    Returns None when the pattern does not match at all.
    """
    found = None
    for found in pattern.finditer(text):
        pass
    return found


def extract_papers(html, context_window=CONTEXT_WINDOW):
    """Extract every paper entry from *html* as a flat list of dicts.

    Args:
        html: Full text of the page containing the paper entries.
        context_window: Number of characters before each entry that are
            searched for the enclosing module and area headers. Pass None
            to search everything that precedes the entry.

    Returns:
        A list of dicts with the keys ``module``, ``module_name``, ``area``,
        ``area_name``, ``title``, ``authors``, ``year``, ``venue``,
        ``arxiv``, ``pdf`` and ``tags``, in document order. ``arxiv`` and
        ``pdf`` are None when absent or empty; module/area fall back to
        ``'unknown'`` / ``'Unknown'`` when no header is found.
    """
    papers = []
    for entry in paper_re.finditer(html):
        pos = entry.start()
        # NOTE(review): with the default window, a module header that lies
        # more than `context_window` characters before the entry is not
        # seen and the paper is reported under 'unknown' -- confirm whether
        # the real data needs a larger (or unbounded) window.
        start = 0 if context_window is None else max(0, pos - context_window)
        before = html[start:pos]

        # The header that applies is the one closest to the entry, i.e. the
        # LAST match in the preceding text, not the first.
        mod_match = _last_match(_MODULE_RE, before)
        if mod_match is None:
            mod_match = _last_match(_MODULE_FALLBACK_RE, before)
        if mod_match is not None:
            mod_id, mod_name = mod_match.group(1), mod_match.group(2)
        else:
            mod_id, mod_name = 'unknown', 'Unknown'

        area_match = _last_match(_AREA_RE, before)
        if area_match is not None:
            area_id, area_name = area_match.group(1), area_match.group(2)
        else:
            area_id, area_name = 'unknown', 'Unknown'

        papers.append({
            'module': mod_id,
            'module_name': mod_name,
            'area': area_id,
            'area_name': area_name,
            'title': entry.group(1),
            'authors': entry.group(2),
            'year': int(entry.group(3)),
            'venue': entry.group(4),
            'arxiv': entry.group(5) or None,
            'pdf': entry.group(6) or None,
            'tags': _TAG_RE.findall(entry.group(7)),
        })
    return papers


def extract_modules(html):
    """Extract module metadata from *html*.

    Returns:
        A dict mapping each module id to a dict with the keys ``name``,
        ``icon``, ``desc``, ``color`` (set to the module id) and ``areas``
        (an empty list), in document order.
    """
    modules = {}
    for header in _MODULE_META_RE.finditer(html):
        mod_id = header.group(1)
        modules[mod_id] = {
            'name': header.group(2),
            'icon': header.group(3),
            'desc': header.group(4),
            'color': mod_id,
            'areas': []
        }
    return modules


def main():
    """Read the HTML page, extract the papers and write the flat JSON list."""
    # Explicit UTF-8: the data contains non-ASCII text and must not depend
    # on the platform's default locale encoding.
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read()

    papers = extract_papers(html)
    print(f'Extracted {len(papers)} papers')

    # Module metadata is only reported for verification; it is not saved yet.
    modules = extract_modules(html)
    print(f'Found {len(modules)} modules')
    for mod_id, mod in modules.items():
        print(f' {mod_id}: {mod["name"]}')

    # Save the flat list for now.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)

    print(f'Saved {len(papers)} papers (flat) to {output_path}')


if __name__ == '__main__':
    main()