feat: LLM 论文图书馆 — 初始提交

- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
2026-06-02 10:25:14 +00:00
commit f0ff62e082
21 changed files with 4042 additions and 0 deletions
--- a/api/parse_papers.py
+++ b/api/parse_papers.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
+import re, json, os
+
+# Input HTML page and output JSON file (absolute paths).
+HTML = '/app/working/workspaces/default/llm_library.html'
+JSON = '/app/working/workspaces/default/llm-library/data/papers.json'
+
+# Text that opens the embedded data object, and text that appears after it.
+START_MARKER = 'const PAPER_DATA = {'
+END_MARKER = 'APP STATE'
+
+# Decode explicitly instead of relying on the host locale's default encoding.
+# NOTE(review): assumes the page is stored as UTF-8 — confirm.
+with open(HTML, encoding='utf-8') as f:
+    html = f.read()
+
+# str.index raises ValueError when a marker is absent, which stops the script
+# instead of parsing an arbitrary slice.
+s = html.index(START_MARKER)
+# Look for the end marker only after the start marker so an earlier occurrence
+# of the same text cannot produce an empty or reversed slice.
+e = html.index(END_MARKER, s)
+# Start right after the opening brace. Deriving the offset from the marker
+# keeps the leading whitespace of the first data line intact; the parser
+# below depends on exact indentation widths.
+block = html[s + len(START_MARKER):e]
+
+# Parser state: finished modules keyed by id, plus the module, area and
+# section that incoming lines are currently attributed to.
+modules = {}
+module = None
+area = None
+section = 'mainline'
+
+# Patterns applied to every line, compiled once up front.
+MODULE_OPEN = re.compile(r'^\w+:\s*\{')
+MODULE_FIELD = re.compile(r'(\w+):\s*"([^"]*)"')
+AREA_ID = re.compile(r'id:\s*"(\w+)"')
+AREA_NAME = re.compile(r'name:\s*"([^"]+)"')
+SECTION_OPENERS = (
+    ('mainline', re.compile(r'mainline:\s*\[')),
+    ('branches', re.compile(r'branches:\s*\[')),
+    ('forward', re.compile(r'forward:\s*\[')),
+)
+
+
+def keep_if_named(mod):
+    """Store *mod* in ``modules`` under its id unless it is unset or unnamed."""
+    if mod and mod.get('name'):
+        modules[mod['id']] = mod
+
+
+for raw in block.split('\n'):
+    text = raw.strip()
+    if not text:
+        continue
+    # Structure is recognised purely by the width of the leading whitespace.
+    depth = len(raw) - len(raw.lstrip())
+
+    # A "key: {" line at depth 2 opens a new module and closes the previous one.
+    if depth == 2 and MODULE_OPEN.match(text):
+        keep_if_named(module)
+        key = text.split(':')[0]
+        # The colour defaults to the module id until a "color" field overrides it.
+        module = {'id': key, 'name': '', 'icon': '', 'desc': '', 'color': key, 'areas': []}
+        area = None
+        section = 'mainline'
+        continue
+
+    # Lines seen before the first module are ignored.
+    if not module:
+        continue
+
+    if depth == 4:
+        # Module metadata: only the four known string fields are copied.
+        field = MODULE_FIELD.match(text)
+        if field and field.group(1) in ('name', 'icon', 'desc', 'color'):
+            module[field.group(1)] = field.group(2)
+    elif depth == 8:
+        # Area header lines: id, name, or the opener of one of the three lists.
+        area_id = AREA_ID.match(text)
+        if area_id:
+            area = {'id': area_id.group(1), 'name': '', 'mainline': [], 'branches': [], 'forward': []}
+            module['areas'].append(area)
+            section = 'mainline'
+            continue
+        area_name = AREA_NAME.match(text)
+        if area_name and area:
+            area['name'] = area_name.group(1)
+            continue
+        for label, opener in SECTION_OPENERS:
+            if opener.match(text):
+                section = label
+                break
+
+    # A paper is a single-line "{ title: ... tags: [...] }" record inside an area.
+    if not (text.startswith('{ title:') and 'tags:' in text and area):
+        continue
+
+    title = re.search(r'title:\s*"([^"]+)"', text)
+    year = re.search(r'year:\s*(\d+)', text)
+    tag_list = re.search(r'tags:\s*\[(.*?)\]', text, re.DOTALL)
+    # Title, year and tags are mandatory; records missing any of them are dropped.
+    if not (title and year and tag_list):
+        continue
+
+    authors = re.search(r'authors:\s*"([^"]*?)"', text)
+    venue = re.search(r'venue:\s*"([^"]*?)"', text)
+    entry = {
+        'title': title.group(1),
+        'authors': authors.group(1) if authors else '',
+        'year': int(year.group(1)),
+        'venue': venue.group(1) if venue else '',
+        'tags': re.findall(r'"([^"]*)"', tag_list.group(1))
+    }
+    # Optional links are added only when present, arxiv before pdf.
+    optional_links = (
+        ('arxiv', re.search(r'arxiv:\s*"(\S+?)"', text)),
+        ('pdf', re.search(r'pdf:\s*"(https:[^"]+)"', text)),
+    )
+    for link_key, hit in optional_links:
+        if hit and hit.group(1):
+            entry[link_key] = hit.group(1)
+    area[section].append(entry)
+
+# The loop only stores a module when the next one starts, so flush the last one.
+keep_if_named(module)
+
+def count_papers(areas):
+    """Return the number of paper entries across all three sections of *areas*."""
+    return sum(
+        len(a.get('mainline', [])) + len(a.get('branches', [])) + len(a.get('forward', []))
+        for a in areas
+    )
+
+
+# Summary on stdout: overall totals first, then one line per module sorted by id.
+total = sum(count_papers(m.get('areas', [])) for m in modules.values())
+
+print(f'Parsed: {len(modules)} modules, {total} papers')
+for mid, m in sorted(modules.items()):
+    pc = count_papers(m['areas'])
+    print(f'  {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers')
+
+# Create the output directory if it does not exist yet.
+os.makedirs(os.path.dirname(JSON), exist_ok=True)
+# ensure_ascii=False writes non-ASCII characters verbatim, so open the file
+# with an explicit encoding that can represent them on any host locale.
+with open(JSON, 'w', encoding='utf-8') as f:
+    json.dump(modules, f, ensure_ascii=False, indent=2)
+print(f'\nSaved to {JSON}')