Files
llm-library/api/extract_data.py
LaoWang f0ff62e082 feat: LLM 论文图书馆 — 初始提交
- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
2026-06-02 10:25:14 +00:00

104 lines
3.2 KiB
Python

#!/usr/bin/env python3
"""Build papers.json by regex-extracting paper entries from llm_library.html"""
import re, json, os
# Project root is the parent of the directory that contains this script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
html_path = os.path.join(ROOT, 'llm_library.html')
# Decode explicitly instead of relying on the platform's locale encoding,
# which may not be able to decode non-ASCII text in the page.
# NOTE(review): assumes llm_library.html is stored as UTF-8 — confirm.
with open(html_path, 'r', encoding='utf-8') as f:
    html = f.read()
# Paper entries are pulled straight out of the page source with one regex
# rather than by parsing the surrounding module structure.
#
# Expected entry shape (whitespace and newlines between fields are allowed):
#   { title:"...", authors:"...", year:2017, venue:"...",
#     arxiv:"...", pdf:"...", tags:["...", ...] }
#
# Capture groups: 1 title, 2 authors, 3 year, 4 venue, 5 arxiv, 6 pdf,
# 7 raw contents of the tags list.
# `arxiv` and `pdf` are each optional; an entry may carry either, both
# (arxiv first), or neither. A group that is absent is None.
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*)?'
    r'(?:pdf:\s*"([^"]*)",\s*)?'
    r'tags:\s*\[(.*?)\]\s*\}',
    re.DOTALL,  # let the lazy tags capture span a list wrapped over several lines
)
# Header patterns used to attribute each paper to its enclosing module/area.
# Group 1 is the identifier, group 2 the display name.
_MODULE_RE = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
# Broader module pattern, consulted only when the strict one finds nothing.
_MODULE_FALLBACK_RE = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
_AREA_RE = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')


def _index_headers(pattern, text):
    """Return ``(end_offset, id, name)`` for every match of *pattern*, in file order."""
    return [(hit.end(), hit.group(1), hit.group(2)) for hit in pattern.finditer(text)]


def _nearest_before(headers, pos):
    """Return ``(id, name)`` of the last header ending at or before *pos*, else None."""
    for end, ident, name in reversed(headers):
        if end <= pos:
            return ident, name
    return None


# Index every header once. The enclosing module/area of a paper is the
# *closest* header that precedes it, so the lookup must pick the last
# preceding match (a plain re.search would return the earliest one), and
# it must not be limited to a fixed-size window before the paper.
module_headers = _index_headers(_MODULE_RE, html)
module_fallback_headers = _index_headers(_MODULE_FALLBACK_RE, html)
area_headers = _index_headers(_AREA_RE, html)

papers = []
for m in paper_re.finditer(html):
    pos = m.start()

    mod_id, mod_name = (
        _nearest_before(module_headers, pos)
        or _nearest_before(module_fallback_headers, pos)
        or ('unknown', 'Unknown')
    )
    area_id, area_name = _nearest_before(area_headers, pos) or ('unknown', 'Unknown')

    papers.append({
        'module': mod_id,
        'module_name': mod_name,
        'area': area_id,
        'area_name': area_name,
        'title': m.group(1),
        'authors': m.group(2),
        'year': int(m.group(3)),
        'venue': m.group(4),
        # Missing or empty link fields are normalised to None.
        'arxiv': m.group(5) or None,
        'pdf': m.group(6) or None,
        # The tags group holds the raw list body; pull out each quoted item.
        'tags': re.findall(r'"([^"]*)"', m.group(7)),
    })
print(f'Extracted {len(papers)} papers')
# The nested module → area → section (mainline/branches/forward) layout is
# not rebuilt here: `papers` stays a flat list so the extraction can be
# verified first.

# Collect per-module metadata keyed by module id. A module header is matched
# as `<id>: { name:"...", ... icon:"...", ... desc:"..."` with no closing
# brace in between.
# The pattern is a raw string: \w, \s and \{ are not valid escape sequences
# in a normal string literal and trigger a SyntaxWarning on recent Pythons.
modules = {}
for m in re.finditer(
    r'(\w+):\s*\{\s*name:\s*"([^"]+)"[^}]*icon:\s*"([^"]*)"[^}]*desc:\s*"([^"]*)"',
    html,
):
    mod_id = m.group(1)
    modules[mod_id] = {
        'name': m.group(2),
        'icon': m.group(3),
        'desc': m.group(4),
        'color': mod_id,  # the module id is stored as the color value as well
        'areas': [],      # left empty here; nothing in this script fills it
    }
print(f'Found {len(modules)} modules')
for mod_id, mod in modules.items():
    print(f' {mod_id}: {mod["name"]}')
# Write the flat paper list to <script dir>/../data/papers.json, creating
# the data directory if it does not exist yet.
output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the locale default may be unable to encode them.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(papers, f, ensure_ascii=False, indent=2)
print(f'Saved {len(papers)} papers (flat) to {output_path}')