feat: LLM 论文图书馆 — 初始提交

- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
2026-06-02 10:25:14 +00:00
commit f0ff62e082
21 changed files with 4042 additions and 0 deletions
--- a/api/extract_data.py
+++ b/api/extract_data.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+"""Build papers.json by regex-extracting paper entries from llm_library.html"""
+import bisect
+import json
+import os
+import re
+
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+html_path = os.path.join(ROOT, 'llm_library.html')
+output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json')
+
+# One paper literal inside PAPER_DATA:
+#   { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }
+# The link field is `arxiv`, `pdf`, or absent; an entry with both is not matched.
+paper_re = re.compile(
+    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
+    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
+    r'tags:\s*\[(.*?)\]\s*\}',
+    re.DOTALL
+)
+
+# Module header on its own line, e.g. `  arch: { name: "..."`.
+module_re = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
+# Looser module header, consulted only when no strict header precedes a paper.
+module_fallback_re = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
+# Area header: an `id` followed by a `name` inside the same object literal.
+area_re = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')
+# Module metadata: `key: { name: "...", ... icon: "...", ... desc: "..."`.
+module_meta_re = re.compile(
+    r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\""
+)
+
+UNKNOWN = ('unknown', 'Unknown')
+
+
+def _index_headers(pattern, text):
+    """Return parallel lists (end offsets, (id, name) labels) of all matches."""
+    ends, labels = [], []
+    for m in pattern.finditer(text):
+        ends.append(m.end())
+        labels.append((m.group(1), m.group(2)))
+    return ends, labels
+
+
+def _nearest_before(index, pos):
+    """Return the label of the last header ending at or before pos, else None."""
+    ends, labels = index
+    i = bisect.bisect_right(ends, pos) - 1
+    return labels[i] if i >= 0 else None
+
+
+def extract_papers(html):
+    """Return a flat list of paper dicts found in *html*.
+
+    Each paper is attributed to the closest module header and the closest
+    area header that appear before it in the document; when none precedes
+    it, the id/name pair falls back to 'unknown'/'Unknown'.
+    """
+    strict_modules = _index_headers(module_re, html)
+    loose_modules = _index_headers(module_fallback_re, html)
+    areas = _index_headers(area_re, html)
+
+    papers = []
+    for m in paper_re.finditer(html):
+        pos = m.start()
+        mod_id, mod_name = (
+            _nearest_before(strict_modules, pos)
+            or _nearest_before(loose_modules, pos)
+            or UNKNOWN
+        )
+        area_id, area_name = _nearest_before(areas, pos) or UNKNOWN
+        papers.append({
+            'module': mod_id,
+            'module_name': mod_name,
+            'area': area_id,
+            'area_name': area_name,
+            'title': m.group(1),
+            'authors': m.group(2),
+            'year': int(m.group(3)),
+            'venue': m.group(4),
+            # Empty or missing link fields are stored as null.
+            'arxiv': m.group(5) or None,
+            'pdf': m.group(6) or None,
+            'tags': re.findall(r'"([^"]*)"', m.group(7)),
+        })
+    return papers
+
+
+def extract_modules(html):
+    """Return {module_id: metadata} for each header carrying name, icon, desc."""
+    modules = {}
+    for m in module_meta_re.finditer(html):
+        mod_id = m.group(1)
+        modules[mod_id] = {
+            'name': m.group(2),
+            'icon': m.group(3),
+            'desc': m.group(4),
+            'color': mod_id,
+            'areas': []
+        }
+    return modules
+
+
+def main():
+    """Extract papers from llm_library.html and write them to data/papers.json."""
+    # The page and the JSON hold non-ASCII text; do not rely on the locale.
+    with open(html_path, 'r', encoding='utf-8') as f:
+        html = f.read()
+
+    papers = extract_papers(html)
+    print(f'Extracted {len(papers)} papers')
+
+    # Module metadata is only reported for verification; it is not saved.
+    modules = extract_modules(html)
+    print(f'Found {len(modules)} modules')
+    for mod_id, mod in modules.items():
+        print(f'  {mod_id}: {mod["name"]}')
+
+    # Save the flat list; nesting by module and area is not done here.
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(papers, f, ensure_ascii=False, indent=2)
+
+    print(f'Saved {len(papers)} papers (flat) to {output_path}')
+
+
+if __name__ == '__main__':
+    main()