feat: LLM 论文图书馆 — 初始提交
- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
This commit is contained in:
103
api/extract_data.py
Normal file
103
api/extract_data.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build papers.json by regex-extracting paper entries from llm_library.html"""
|
||||
import bisect
import json
import os
import re
|
||||
|
||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
html_path = os.path.join(ROOT, 'llm_library.html')

# One paper entry, e.g.
#   { title:"...", authors:"...", year:2017, venue:"...", arxiv:"...", tags:[...] }
# The arxiv/pdf field is optional and mutually exclusive in this pattern.
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
    r'tags:\s*\[(.*?)\]\s*\}',
    re.DOTALL
)

# Module header at the start of a line: `<id>: { name: "<display name>"`.
module_header_re = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
# Looser module header, used only when no strict header precedes a paper.
module_fallback_re = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
# Area header: `id: "<id>" ... name: "<display name>"` inside one object.
area_header_re = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')
# Module metadata: `<id>: { name: "...", ... icon: "...", ... desc: "..."`.
module_meta_re = re.compile(
    r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\""
)

UNKNOWN = ('unknown', 'Unknown')


def index_headers(pattern, text):
    """Index every match of ``pattern`` in ``text`` for nearest-before lookups.

    Returns a pair ``(ends, pairs)``: ``ends[i]`` is the end offset of the
    i-th match and ``pairs[i]`` is its ``(group(1), group(2))`` tuple.
    ``finditer`` yields non-overlapping matches in document order, so
    ``ends`` is sorted and can be searched with ``bisect``.
    """
    ends = []
    pairs = []
    for match in pattern.finditer(text):
        ends.append(match.end())
        pairs.append((match.group(1), match.group(2)))
    return ends, pairs


def nearest_before(index, pos, default=None):
    """Return the pair of the last indexed header ending at or before ``pos``.

    ``index`` is a value produced by ``index_headers``. ``default`` is
    returned when no header ends at or before ``pos``.
    """
    ends, pairs = index
    count = bisect.bisect_right(ends, pos)
    return pairs[count - 1] if count else default


def extract_papers(html):
    """Extract a flat list of paper dicts from the library HTML text.

    Each paper is tagged with the module and area whose header is the
    closest one *preceding* the paper in the text. Papers with no preceding
    header get ``'unknown'`` / ``'Unknown'``.

    Args:
        html: Full text of the HTML file containing the paper entries.

    Returns:
        A list of dicts with keys ``module``, ``module_name``, ``area``,
        ``area_name``, ``title``, ``authors``, ``year``, ``venue``,
        ``arxiv``, ``pdf`` and ``tags``, in document order.
    """
    # Index headers once over the whole document instead of re-running the
    # regexes on a fixed-size window for every paper.
    module_index = index_headers(module_header_re, html)
    fallback_index = index_headers(module_fallback_re, html)
    area_index = index_headers(area_header_re, html)

    papers = []
    for m in paper_re.finditer(html):
        pos = m.start()

        module = (
            nearest_before(module_index, pos)
            or nearest_before(fallback_index, pos)
            or UNKNOWN
        )
        area = nearest_before(area_index, pos, UNKNOWN)

        papers.append({
            'module': module[0],
            'module_name': module[1],
            'area': area[0],
            'area_name': area[1],
            'title': m.group(1),
            'authors': m.group(2),
            'year': int(m.group(3)),
            'venue': m.group(4),
            # Empty or missing values are normalised to None.
            'arxiv': m.group(5) or None,
            'pdf': m.group(6) or None,
            'tags': re.findall(r'"([^"]*)"', m.group(7)),
        })
    return papers


def extract_modules(html):
    """Extract module metadata keyed by module id.

    Returns:
        A dict mapping module id to a dict with ``name``, ``icon``,
        ``desc``, ``color`` (set to the module id) and an empty ``areas``
        list.
    """
    modules = {}
    for m in module_meta_re.finditer(html):
        mod_id = m.group(1)
        modules[mod_id] = {
            'name': m.group(2),
            'icon': m.group(3),
            'desc': m.group(4),
            'color': mod_id,
            'areas': []
        }
    return modules


def main():
    """Read the library HTML, report what was found, and write papers.json."""
    # Explicit UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read()

    papers = extract_papers(html)
    print(f'Extracted {len(papers)} papers')

    # Module metadata is only reported for verification; the output file is
    # the flat paper list. The nested module -> area structure is not built
    # here.
    modules = extract_modules(html)
    print(f'Found {len(modules)} modules')
    for mod_id, mod in modules.items():
        print(f' {mod_id}: {mod["name"]}')

    output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened with an encoding that can represent them.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)

    print(f'Saved {len(papers)} papers (flat) to {output_path}')


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user