From f0ff62e082b16f124574fc7417d2b26af0d11fcb Mon Sep 17 00:00:00 2001 From: LaoWang <257199637@qq.com> Date: Tue, 2 Jun 2026 10:25:14 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20LLM=20=E8=AE=BA=E6=96=87=E5=9B=BE?= =?UTF-8?q?=E4=B9=A6=E9=A6=86=20=E2=80=94=20=E5=88=9D=E5=A7=8B=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理 - 180 篇论文数据 (data/papers.json): 9 模块、32 子领域 - 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读 - 底部状态栏: arXiv/HF 连通性检测 - PDF 加载: arXiv 优先(5s超时) → HK 本地兜底 - Docker 化部署 (Dockerfile + start.sh + nginx.conf) - arXiv + HF 批量下载器 (api/downloader.py) --- .env.example | 17 + .gitignore | 9 + Dockerfile | 16 + README.md | 123 +++ api/__init__.py | 1 + api/backfill.py | 57 ++ api/check_trans.py | 13 + api/downloader.py | 198 ++++ api/extract_data.py | 103 ++ api/parse_papers.py | 107 +++ api/server.py | 484 ++++++++++ data/papers.json | 2175 +++++++++++++++++++++++++++++++++++++++++++ nginx.conf | 55 ++ proxy_conf.txt | 10 + pyproject.toml | 17 + requirements.txt | 8 + start.sh | 43 + static/app.js | 252 +++++ static/index.html | 34 + static/pdf.min.js | 7 + static/style.css | 313 +++++++ 21 files changed, 4042 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 api/__init__.py create mode 100644 api/backfill.py create mode 100644 api/check_trans.py create mode 100644 api/downloader.py create mode 100644 api/extract_data.py create mode 100644 api/parse_papers.py create mode 100644 api/server.py create mode 100644 data/papers.json create mode 100644 nginx.conf create mode 100644 proxy_conf.txt create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100755 start.sh create mode 100644 static/app.js create mode 100644 static/index.html create mode 100644 static/pdf.min.js create mode 100644 static/style.css diff --git a/.env.example 
b/.env.example new file mode 100644 index 0000000..8079f44 --- /dev/null +++ b/.env.example @@ -0,0 +1,17 @@ +# LLM 论文图书馆 — 环境配置 +# 复制此文件为 .env.local 并填入实际值 + +# API 管理密钥 (用于 POST/PUT/DELETE 鉴权) +API_KEY=change-me-to-a-strong-random-string + +# 服务端口 +PORT=8000 + +# PDF 存储路径 +PAPER_DIR=papers + +# 日志级别 +LOG_LEVEL=info + +# CORS 允许的域名 (逗号分隔) +CORS_ORIGINS=* diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a024d2c --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +__pycache__/ +*.pyc +papers/arxiv/*.pdf +papers/hf/*.pdf +.env +.env.local +*.egg-info/ +.venv/ +papers/download.log diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..98566b6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11-slim + +RUN apt-get update && apt-get install -y --no-install-recommends poppler-utils && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY . . + +VOLUME ["/app/papers", "/app/data"] +EXPOSE 8000 +ENV PORT=8000 +ENV LOG_LEVEL=info + +CMD ["sh", "-c", "python3 -m uvicorn api.server:app --host 0.0.0.0 --port ${PORT} --log-level ${LOG_LEVEL}"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..bb979c2 --- /dev/null +++ b/README.md @@ -0,0 +1,123 @@ +# LLM 论文图书馆 + +大模型全链路技术论文知识库 — 从架构设计到 Agent 应用,覆盖 9 大模块、30+ 子领域、180+ 篇关键论文。 + +**在线访问:** https://your-domain.com +**API 文档:** https://your-domain.com/docs (FastAPI Swagger) + +## 项目结构 + +``` +llm-library/ +├── api/ +│ ├── server.py # FastAPI 服务 (REST API + PDF 代理) +│ ├── downloader.py # PDF 批量下载器 +│ ├── parse_papers.py # 从 HTML 提取论文数据 +│ └── extract_data.py # 备用提取脚本 +├── data/ +│ └── papers.json # 论文元数据 (单一数据源) +├── papers/ +│ ├── arxiv/ # arXiv PDF 缓存 +│ └── hf/ # HuggingFace PDF 缓存 +├── static/ # 前端 (index.html + CSS + JS) +├── start.sh # 一键启动 +├── requirements.txt +└── pyproject.toml +``` + +## 快速启动 + +```bash +# 1. 
配置 API Key +echo "API_KEY=$(python3 -c 'import secrets; print(secrets.token_urlsafe(32))')" > .env + +# 2. 安装依赖 +pip install -r requirements.txt + +# 3. 启动服务 +bash start.sh +# 或 +python3 -m uvicorn api.server:app --host 0.0.0.0 --port 8000 +``` + +服务启动后访问 `http://localhost:8000` 即可使用。 + +## API 接口 + +| 方法 | 路径 | 说明 | 鉴权 | +|------|------|------|------| +| GET | `/api/stats` | 图书馆统计 | 无 | +| GET | `/api/modules` | 列出所有模块 | 无 | +| GET | `/api/modules/{id}` | 获取模块详情 (含论文) | 无 | +| GET | `/api/papers?q=xxx` | 搜索论文 | 无 | +| POST | `/api/papers` | 添加论文 | Bearer Token | +| PUT | `/api/papers` | 更新论文 | Bearer Token | +| DELETE | `/api/papers` | 删除论文 | Bearer Token | +| GET | `/papers/arxiv/{id}.pdf` | 本地 PDF 代理 | 无 | + +### 管理接口示例 + +```bash +# 添加一篇论文 +curl -X POST http://localhost:8000/api/papers \ + -H "Authorization: Bearer $API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "module_id": "arch", + "area_id": "attention", + "section": "mainline", + "title": "Paper Title Here", + "authors": "Author et al.", + "year": 2026, + "venue": "arXiv", + "arxiv": "2601.01234", + "tags": ["前沿"] + }' +``` + +## PDF 下载 + +```bash +# 下载所有论文 PDF 到本地 (增量) +python3 api/downloader.py + +# 只下载前 5 篇测试 +python3 api/downloader.py --limit 5 + +# 强制重新下载 +python3 api/downloader.py --no-incremental +``` + +## 部署 (Nginx 反向代理) + +```nginx +server { + listen 80; + server_name your-domain.com; + + location / { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + } + + # 静态文件直接由 Nginx 服务 (可选, 提升性能) + location /style.css { alias /path/to/llm-library/static/style.css; } + location /app.js { alias /path/to/llm-library/static/app.js; } +} +``` + +## 数据维护 + +论文数据存储在 `data/papers.json`,也可通过 API 管理。 + +**标签系统:** +- 🏁 **起点** — 该子领域的奠基论文 +- 🔴 **关键节点** — 改变技术方向的里程碑论文 +- 🟢 **前沿** — 当前 SOTA,已被主流模型采纳 +- 🟣 **前瞻** — 有潜力的想法,尚未被主流采纳 (如 Engram, Titans) +- 🟠 **支线** — 有影响力的替代技术路线 + +## 许可证 + +MIT diff --git a/api/__init__.py b/api/__init__.py new file 
#!/usr/bin/env python3
"""Backfill all uncached PDFs, skipping dead arXiv IDs."""
import json
import subprocess
import sys
import time
from pathlib import Path

PAPERS_JSON = Path(__file__).resolve().parent.parent / "data" / "papers.json"
ARXIV_DIR = Path(__file__).resolve().parent.parent / "papers" / "arxiv"
# Kept for backward compatibility; nothing in this script writes to it.
LOG_FILE = Path("/app/papers/backfill.log")

# Section keys under which an area stores its paper lists.
_SECTIONS = ("mainline", "branches", "forward")
# Downloads at or below this size are rejected as failures.
_MIN_PDF_BYTES = 5000


def _is_valid_pdf(path):
    """Return True when *path* is larger than the minimum size and starts with the PDF magic bytes."""
    try:
        if path.stat().st_size <= _MIN_PDF_BYTES:
            return False
        with open(path, "rb") as fh:
            return fh.read(4) == b"%PDF"
    except OSError:
        return False


def main():
    """Download every arXiv PDF listed in papers.json that is not yet in ARXIV_DIR.

    Each file is fetched with ``wget`` into a ``.part`` file and only renamed
    to ``<id>.pdf`` after it has been validated, so an interrupted or bogus
    download (for example an HTML error page) is never mistaken for a cached
    paper on the next run. Progress and a final summary are printed to stdout.
    """
    # papers.json contains non-ASCII text; do not depend on the locale encoding.
    with open(PAPERS_JSON, encoding="utf-8") as f:
        data = json.load(f)

    # Collect all arxiv IDs
    arxiv_ids = {
        p["arxiv"]
        for mod in data.values()
        for area in mod.get("areas", [])
        for section in _SECTIONS
        for p in area.get(section, [])
        if p.get("arxiv")
    }

    # wget cannot create the target directory itself.
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
    missing = sorted(arxiv_ids - cached)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")

    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for aid in missing:
        url = f"https://arxiv.org/pdf/{aid}.pdf"
        dest = ARXIV_DIR / f"{aid}.pdf"
        # "*.pdf" does not match this name, so a leftover is never counted as cached.
        tmp = ARXIV_DIR / f"{aid}.pdf.part"
        try:
            r = subprocess.run(
                ["wget", "-q", "-T", "15", "-O", str(tmp), url],
                timeout=20,
            )
            if r.returncode == 0 and _is_valid_pdf(tmp):
                tmp.replace(dest)
                ok += 1
                print(f" OK {aid} ({dest.stat().st_size//1024} KB)")
            else:
                size = tmp.stat().st_size if tmp.exists() else 0
                fail += 1
                print(f" FAIL {aid} (rc={r.returncode}, sz={size})")
        except (subprocess.SubprocessError, OSError) as e:
            # Covers the 20 s timeout and a missing wget binary.
            fail += 1
            print(f" ERR {aid} {e}")
        finally:
            tmp.unlink(missing_ok=True)
        time.sleep(0.8)  # Be nice to arXiv

    print(f"\nDone: {ok} ok, {fail} failed")


if __name__ == "__main__":
    main()
def collect_urls() -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
    """Collect every PDF referenced by papers.json that should be cached locally.

    The file is read as UTF-8 explicitly because it contains non-ASCII text
    and the locale default encoding is not guaranteed to be UTF-8.

    Returns:
        arxiv_list: ``[(arxiv_id, title), ...]``, one entry per distinct
            arXiv id, in file order.
        hf_list: ``[(url, filename), ...]``, one entry per distinct ``pdf``
            URL, in file order. ``filename`` is the last path segment of the
            URL with a trailing ``.pdf`` extension removed.
    """
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_seen = set()
    hf_seen = set()
    arxiv_list = []
    hf_list = []

    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for p in area.get(section, []):
                    arxiv_id = p.get("arxiv")
                    if arxiv_id and arxiv_id not in arxiv_seen:
                        arxiv_seen.add(arxiv_id)
                        arxiv_list.append((arxiv_id, p.get("title", "")))
                    pdf_url = p.get("pdf")
                    if pdf_url and pdf_url not in hf_seen:
                        hf_seen.add(pdf_url)
                        # Drop only the extension; ".pdf" elsewhere in the
                        # name is part of the name and must survive.
                        name = pdf_url.split("/")[-1].removesuffix(".pdf")
                        hf_list.append((pdf_url, name))

    return arxiv_list, hf_list
def run(incremental: bool = True, limit: int = 0, delay: float = 1.0):
    """Download every PDF referenced by papers.json into the local cache.

    Args:
        incremental: When True, files already in the cache are counted as
            successes and skipped. When False, cached files are fetched
            again; the old copy is restored if the new download fails.
        limit: 0 processes everything; N > 0 processes only the first N
            arXiv entries and the first N HF entries.
        delay: Seconds to sleep after each download attempt.
    """
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    HF_DIR.mkdir(parents=True, exist_ok=True)

    def arxiv_cache_path(arxiv_id: str) -> Path:
        return ARXIV_DIR / f"{arxiv_id}.pdf"

    def hf_cache_path(name: str) -> Path:
        # Must mirror the sanitising in download_hf(); otherwise a name
        # containing ".." or "/" is never recognised as cached.
        safe_name = name.replace("..", "").replace("/", "_")
        return HF_DIR / f"{safe_name}.pdf"

    arxiv_list, hf_list = collect_urls()
    total = len(arxiv_list) + len(hf_list)
    log.info(f"Found {len(arxiv_list)} arXiv + {len(hf_list)} HF = {total} PDFs to download")
    log.info(f"Incremental: {incremental}, Delay: {delay}s")

    if not incremental:
        log.warning("Non-incremental mode: will re-download existing files")

    # Count existing
    arxiv_existing = sum(1 for aid, _ in arxiv_list if arxiv_cache_path(aid).exists())
    hf_existing = sum(1 for _, name in hf_list if hf_cache_path(name).exists())
    log.info(f"Already cached: {arxiv_existing} arXiv + {hf_existing} HF")

    ok, fail = 0, 0
    total_size = 0.0

    def process(entries, desc, target_of, fetch):
        """Fetch each entry, honouring ``incremental``/``limit`` and updating the counters."""
        nonlocal ok, fail, total_size
        if limit > 0:
            entries = entries[:limit]
        for entry in tqdm(entries, desc=desc):
            target = target_of(entry)
            backup = None
            if target.exists():
                if incremental:
                    ok += 1
                    continue
                # download_arxiv()/download_hf() return early when the file
                # exists, so move it aside to force a real re-download.
                backup = target.with_name(target.name + ".bak")
                target.replace(backup)
            success = False
            try:
                success = fetch(*entry)
            finally:
                if backup is not None:
                    if target.exists():
                        backup.unlink(missing_ok=True)
                    else:
                        # Re-download failed: keep the previous copy.
                        backup.replace(target)
            if success:
                ok += 1
                if target.exists():
                    total_size += target.stat().st_size
            else:
                fail += 1
            time.sleep(delay)

    with httpx.Client(
        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
        timeout=30,
        follow_redirects=True,
    ) as client:
        # Download arXiv
        process(
            arxiv_list,
            "arXiv",
            lambda entry: arxiv_cache_path(entry[0]),
            lambda arxiv_id, title: download_arxiv(client, arxiv_id, title),
        )
        # Download HF
        process(
            hf_list,
            "HF ",
            lambda entry: hf_cache_path(entry[1]),
            lambda url, name: download_hf(client, url, name),
        )

    log.info(f"Done: {ok} OK, {fail} failed, {total_size/1024/1024:.1f} MB total")
} +paper_re = re.compile( + r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*' + r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)' + r'tags:\s*\[(.*?)\]\s*\}', + re.DOTALL +) + +papers = [] +for m in paper_re.finditer(html): + title = m.group(1) + authors = m.group(2) + year = int(m.group(3)) + venue = m.group(4) + arxiv = m.group(5) or None + pdf = m.group(6) or None + tags_str = m.group(7) + tags = re.findall(r'"([^"]*)"', tags_str) + + # Find which module/area this paper belongs to + pos = m.start() + # Search backwards for module and area context + before = html[max(0,pos-3000):pos] + + # Find module id + mod_match = re.search(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"', before) + if not mod_match: + # Try broader pattern + mod_match = re.search(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"', before) + if mod_match: + mod_id = mod_match.group(1) + mod_name = mod_match.group(2) + else: + mod_id = 'unknown' + mod_name = 'Unknown' + + # Find area id + area_match = re.search(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"', before) + if area_match: + area_id = area_match.group(1) + area_name = area_match.group(2) + else: + area_id = 'unknown' + area_name = 'Unknown' + + papers.append({ + 'module': mod_id, + 'module_name': mod_name, + 'area': area_id, + 'area_name': area_name, + 'title': title, + 'authors': authors, + 'year': year, + 'venue': venue, + 'arxiv': arxiv, + 'pdf': pdf, + 'tags': tags, + }) + +print(f'Extracted {len(papers)} papers') + +# Group by module → area → section (mainline/branches/forward) +# For now, just save as flat list for verification +# We'll reconstruct the proper nested structure after verifying + +# Also extract module metadata +modules = {} +for m in re.finditer(r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\"", html): + mod_id = m.group(1) + modules[mod_id] = { + 'name': m.group(2), + 'icon': m.group(3), + 'desc': m.group(4), + 'color': mod_id, + 'areas': [] + } + 
+print(f'Found {len(modules)} modules') +for mod_id, mod in modules.items(): + print(f' {mod_id}: {mod["name"]}') + +# Save the flat list for now +output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json') +os.makedirs(os.path.dirname(output_path), exist_ok=True) +with open(output_path, 'w') as f: + json.dump(papers, f, ensure_ascii=False, indent=2) + +print(f'Saved {len(papers)} papers (flat) to {output_path}') diff --git a/api/parse_papers.py b/api/parse_papers.py new file mode 100644 index 0000000..87015df --- /dev/null +++ b/api/parse_papers.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +"""Parse llm_library.html PAPER_DATA block → nested papers.json""" +import re, json, os + +HTML = '/app/working/workspaces/default/llm_library.html' +JSON = '/app/working/workspaces/default/llm-library/data/papers.json' + +with open(HTML) as f: + html = f.read() + +s = html.index('const PAPER_DATA = {') +e = html.index('APP STATE') +block = html[s+22:e] + +modules = {} +current_mod = None +current_area = None +current_section = 'mainline' + +for line in block.split('\n'): + stripped = line.strip() + if not stripped: + continue + indent = len(line) - len(line.lstrip()) + + # Module start: " arch: {" at indent 2 + if indent == 2 and re.match(r'^\w+:\s*\{', stripped): + mid = stripped.split(':')[0] + if current_mod and current_mod.get('name'): + modules[current_mod['id']] = current_mod + current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '', 'color': mid, 'areas': []} + current_area = None + current_section = 'mainline' + continue + + if not current_mod: + continue + + # Module metadata at indent 4 + if indent == 4: + m = re.match(r'(\w+):\s*"([^"]*)"', stripped) + if m and m.group(1) in ('name', 'icon', 'desc', 'color'): + current_mod[m.group(1)] = m.group(2) + + # Area header at indent 8 + if indent == 8: + m_id = re.match(r'id:\s*"(\w+)"', stripped) + if m_id: + current_area = {'id': m_id.group(1), 'name': '', 'mainline': [], 'branches': [], 
'forward': []} + current_mod['areas'].append(current_area) + current_section = 'mainline' + continue + m_name = re.match(r'name:\s*"([^"]+)"', stripped) + if m_name and current_area: + current_area['name'] = m_name.group(1) + continue + if re.match(r'mainline:\s*\[', stripped): + current_section = 'mainline' + elif re.match(r'branches:\s*\[', stripped): + current_section = 'branches' + elif re.match(r'forward:\s*\[', stripped): + current_section = 'forward' + + # Paper entry + if stripped.startswith('{ title:') and 'tags:' in stripped and current_area: + title = re.search(r'title:\s*"([^"]+)"', stripped) + authors = re.search(r'authors:\s*"([^"]*?)"', stripped) + year = re.search(r'year:\s*(\d+)', stripped) + venue = re.search(r'venue:\s*"([^"]*?)"', stripped) + arxiv = re.search(r'arxiv:\s*"(\S+?)"', stripped) + pdf = re.search(r'pdf:\s*"(https:[^"]+)"', stripped) + tags_m = re.search(r'tags:\s*\[(.*?)\]', stripped, re.DOTALL) + + if title and year and tags_m: + tags = re.findall(r'"([^"]*)"', tags_m.group(1)) + entry = { + 'title': title.group(1), + 'authors': authors.group(1) if authors else '', + 'year': int(year.group(1)), + 'venue': venue.group(1) if venue else '', + 'tags': tags + } + if arxiv and arxiv.group(1): + entry['arxiv'] = arxiv.group(1) + if pdf and pdf.group(1): + entry['pdf'] = pdf.group(1) + current_area[current_section].append(entry) + +# Save last module +if current_mod and current_mod.get('name'): + modules[current_mod['id']] = current_mod + +# Count +total = sum( + len(a.get('mainline',[])) + len(a.get('branches',[])) + len(a.get('forward',[])) + for m in modules.values() for a in m.get('areas',[]) +) + +print(f'Parsed: {len(modules)} modules, {total} papers') +for mid, m in sorted(modules.items()): + pc = sum(len(a.get('mainline',[]))+len(a.get('branches',[]))+len(a.get('forward',[])) for a in m['areas']) + print(f' {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers') + +os.makedirs(os.path.dirname(JSON), exist_ok=True) +with 
def verify_api_key(request: Request):
    """Reject the request unless it carries the configured API key.

    Used as a dependency by the write routes (POST/PUT/DELETE). The token is
    read from an ``Authorization: Bearer <token>`` header; when that header
    is absent or uses another scheme, the ``api_key`` query parameter is used
    instead.

    Returns:
        True when the token matches ``API_KEY``.

    Raises:
        HTTPException: 401 when the token is missing or does not match.
    """
    auth = request.headers.get("Authorization", "")
    if auth.startswith("Bearer "):
        token = auth[len("Bearer "):]
    else:
        token = request.query_params.get("api_key", "")
    # Constant-time comparison so response timing does not leak how much of
    # the key matched. Compare bytes: compare_digest() rejects non-ASCII str.
    if not token or not secrets.compare_digest(
        token.encode("utf-8"), API_KEY.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid or missing API key")
    return True
def find_paper(data, module_id, area_id, title):
    """Locate a paper by exact title inside one area of one module.

    Sections are scanned in the order mainline, branches, forward, and the
    first match wins. Areas without an ``id`` and papers without a ``title``
    are skipped instead of raising ``KeyError``, so one incomplete entry in
    the data file does not break every update or delete.

    Args:
        data: The loaded papers.json mapping (module id -> module dict).
        module_id: Key of the module to search.
        area_id: ``id`` of the area inside that module.
        title: Exact title of the paper.

    Returns:
        ``(module, area, section, index)`` for the first match, or
        ``(None, None, None, None)`` when nothing matches.
    """
    not_found = (None, None, None, None)
    mod = data.get(module_id)
    if not mod:
        return not_found
    for area in mod.get("areas", []):
        if area.get("id") != area_id:
            continue
        for section in ("mainline", "branches", "forward"):
            for i, p in enumerate(area.get(section, [])):
                if p.get("title") == title:
                    return mod, area, section, i
    return not_found
@app.post("/api/papers", dependencies=[Depends(verify_api_key)])
def add_paper(paper: PaperCreate):
    """Append a new paper to one section of an existing module area.

    Papers are addressed by title within an area (see ``find_paper``), so a
    second paper with the same title could never be updated or deleted on
    its own; such a request is rejected instead of being stored.

    Returns:
        ``{"ok": True, "title": <title>}`` after the data file is rewritten.

    Raises:
        HTTPException: 404 when the module or area does not exist, 400 when
            ``section`` is not mainline/branches/forward, 409 when the area
            already contains a paper with this title.
    """
    sections = ("mainline", "branches", "forward")

    data = load_data()
    mod = data.get(paper.module_id)
    if not mod:
        raise HTTPException(status_code=404, detail="Module not found")

    # .get() mirrors the read routes: a module without "areas" is a 404, not a 500.
    area = next((a for a in mod.get("areas", []) if a.get("id") == paper.area_id), None)
    if not area:
        raise HTTPException(status_code=404, detail="Area not found")

    section = paper.section
    if section not in sections:
        raise HTTPException(status_code=400, detail="section must be mainline/branches/forward")

    if any(p.get("title") == paper.title for s in sections for p in area.get(s, [])):
        raise HTTPException(status_code=409, detail="Paper with this title already exists in this area")

    entry = {
        "title": paper.title,
        "authors": paper.authors,
        "year": paper.year,
        "venue": paper.venue,
        "tags": paper.tags,
    }
    # Optional links are stored only when provided.
    if paper.arxiv:
        entry["arxiv"] = paper.arxiv
    if paper.pdf:
        entry["pdf"] = paper.pdf

    area.setdefault(section, []).append(entry)
    save_data(data)
    log.info(f"Added paper: {paper.title}")
    return {"ok": True, "title": paper.title}
+ if update.section and update.section != section: + if update.section not in ("mainline", "branches", "forward"): + raise HTTPException(status_code=400, detail="Invalid section") + area[section].pop(idx) + area.setdefault(update.section, []).append(paper) + + save_data(data) + log.info(f"Updated paper: {title}") + return {"ok": True, "title": title} + +@app.delete("/api/papers") +def delete_paper( + module_id: str, + area_id: str, + title: str, + _=Depends(verify_api_key), +): + """删除一篇论文""" + data = load_data() + mod, area, section, idx = find_paper(data, module_id, area_id, title) + if mod is None: + raise HTTPException(status_code=404, detail="Paper not found") + + area[section].pop(idx) + save_data(data) + log.info(f"Deleted paper: {title}") + return {"ok": True, "title": title} + +# ─── Routes: PDF proxy ────────────────────────────────── +@app.get("/papers/arxiv/{arxiv_id}.pdf") +@app.get("/papers/arxiv/{arxiv_id}") +def serve_arxiv_pdf(arxiv_id: str): + """从本地缓存提供 arXiv PDF(无 .pdf 后缀路由防 IDM 拦截)""" + pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf" + if not pdf_path.exists(): + raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}") + return FileResponse( + pdf_path, media_type="application/pdf", + headers={"Cache-Control": "public, max-age=86400"}, + ) + +@app.get("/papers/hf/{filename}.pdf") +@app.get("/papers/hf/{filename}") +def serve_hf_pdf(filename: str): + """从本地缓存提供 HuggingFace PDF(无 .pdf 后缀路由防 IDM 拦截)""" + safe_name = filename.replace("..", "").replace("/", "_").removesuffix(".pdf") + pdf_path = PAPERS_DIR / "hf" / f"{safe_name}.pdf" + if not pdf_path.exists(): + raise HTTPException(status_code=404, detail=f"PDF not in local cache: {filename}") + return FileResponse( + pdf_path, media_type="application/pdf", + headers={"Cache-Control": "public, max-age=86400"}, + ) + +# ─── Routes: Translation ─────────────────────────────── +TRANSLATE_CACHE = ROOT / "data" / "translations" +TRANSLATE_CACHE.mkdir(parents=True, 
def split_text_with_pages(page_texts: list[dict], max_len: int = 400) -> list[dict]:
    """Split per-page text into paragraph-sized chunks, keeping the page number.

    Each page is split into paragraphs on blank lines. A paragraph of at
    most ``max_len`` characters becomes one chunk. A longer paragraph is
    split into sentences (at a space following ``.``, ``?`` or ``!``), which
    are packed greedily into chunks of at most ``max_len`` characters. A
    single sentence longer than ``max_len`` is emitted as its own oversized
    chunk.

    Sentences are split with a regular expression instead of a temporary
    ``|`` delimiter, so literal ``|`` characters in the text are kept.

    Args:
        page_texts: ``[{"page": int, "text": str}, ...]``.
        max_len: Target maximum chunk length in characters.

    Returns:
        ``[{"page": int, "text": str}, ...]`` in reading order.
    """
    import re

    sentence_break = re.compile(r"(?<=[.?!]) ")

    chunks = []
    for pt in page_texts:
        page = pt["page"]
        for para in (raw.strip() for raw in pt["text"].split("\n\n")):
            if not para:
                continue
            if len(para) <= max_len:
                chunks.append({"page": page, "text": para})
                continue
            current = ""
            for s in sentence_break.split(para):
                s = s.strip()
                if not s:
                    continue
                # +1 accounts for the joining space.
                if len(current) + len(s) + 1 <= max_len:
                    current = (current + " " + s).strip()
                else:
                    if current:
                        chunks.append({"page": page, "text": current})
                    current = s
            if current:
                chunks.append({"page": page, "text": current})
    return chunks
@app.post("/api/download/{arxiv_id}")
def download_single_pdf(arxiv_id: str):
    """Download a single arXiv PDF on demand and cache it locally.

    The requested paper is fetched directly from arxiv.org so that exactly
    ``arxiv_id`` is downloaded; the batch downloader script is not invoked
    here because this endpoint must target one specific id.

    Args:
        arxiv_id: New-style arXiv identifier taken from the URL path,
            e.g. ``2501.12948`` or ``2501.12948v2``.

    Returns:
        A dict with ``ok`` (bool), ``arxiv_id`` and ``status``, where status
        is one of ``"cached"``, ``"downloaded"``, ``"failed"`` or
        ``"timeout"``.

    Raises:
        HTTPException: 400 when ``arxiv_id`` is not a well-formed identifier.
    """
    import http.client
    import re
    import urllib.request

    # arxiv_id is untrusted URL input that ends up in a filesystem path and
    # a remote URL, so only accept the plain "YYMM.NNNNN[vN]" form.
    if not re.fullmatch(r"\d{4}\.\d{4,5}(v\d+)?", arxiv_id):
        raise HTTPException(status_code=400, detail=f"Invalid arXiv id: {arxiv_id}")

    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if pdf_path.exists():
        return {"ok": True, "arxiv_id": arxiv_id, "status": "cached"}

    request = urllib.request.Request(
        f"https://arxiv.org/pdf/{arxiv_id}",
        headers={"User-Agent": "llm-library/0.1 (on-demand PDF fetch)"},
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as resp:
            payload = resp.read()
    except (OSError, http.client.HTTPException) as exc:
        # A connect timeout arrives wrapped in URLError.reason, a read
        # timeout is raised directly; report both as "timeout".
        reason = getattr(exc, "reason", None)
        timed_out = isinstance(exc, TimeoutError) or isinstance(reason, TimeoutError)
        log.warning(f"Download failed for {arxiv_id}: {exc}")
        return {
            "ok": False,
            "arxiv_id": arxiv_id,
            "status": "timeout" if timed_out else "failed",
        }

    # An HTML error or captcha page must not be cached as if it were a paper.
    if not payload.startswith(b"%PDF"):
        log.warning(f"Download failed for {arxiv_id}: response is not a PDF")
        return {"ok": False, "arxiv_id": arxiv_id, "status": "failed"}

    # Write to a sibling temp file and rename, so a partially written file
    # is never visible under the final cached name.
    partial_path = pdf_path.parent / f"{arxiv_id}.pdf.part"
    try:
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path.write_bytes(payload)
        partial_path.replace(pdf_path)
    except OSError as exc:
        log.warning(f"Could not store PDF for {arxiv_id}: {exc}")
        return {"ok": False, "arxiv_id": arxiv_id, "status": "failed"}

    return {"ok": True, "arxiv_id": arxiv_id, "status": "downloaded"}
port=8000, reload=True) + +if __name__ == "__main__": + main() diff --git a/data/papers.json b/data/papers.json new file mode 100644 index 0000000..cfa801b --- /dev/null +++ b/data/papers.json @@ -0,0 +1,2175 @@ +{ + "arch": { + "id": "arch", + "name": "模型架构设计", + "icon": "🏗️", + "desc": "Transformer 变体、注意力机制、MoE、位置编码、SSM 等", + "color": "arch", + "areas": [ + { + "id": "transformer", + "name": "Transformer 核心架构", + "mainline": [ + { + "title": "Attention Is All You Need", + "authors": "Vaswani et al.", + "year": 2017, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "1706.03762" + }, + { + "title": "GPT-2 / GPT-3: Language Models are Few-Shot Learners", + "authors": "Radford et al. / Brown et al.", + "year": 2020, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2005.14165" + }, + { + "title": "LLaMA: Open and Efficient Foundation Language Models", + "authors": "Touvron et al.", + "year": 2023, + "venue": "Meta AI", + "tags": [ + "关键节点" + ], + "arxiv": "2302.13971" + }, + { + "title": "Llama 3 / Llama 4 (Scout/Maverick/Behemoth, MoE, 10M context)", + "authors": "Meta AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2601.11659" + }, + { + "title": "DeepSeek-V3 / V3.2: MoE + MLA + MTP + Sparse Attention", + "authors": "DeepSeek-AI", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2412.19437" + }, + { + "title": "DeepSeek-V4: 1.6T MoE + CSA+HCA Hybrid Attention + 1M Context", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "DeepSeek", + "tags": [ + "前沿" + ], + "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" + }, + { + "title": "Kimi K2: Open Agentic Intelligence (1T MoE, 128K ctx, Muon optimizer)", + "authors": "Moonshot AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2507.20534" + }, + { + "title": "Kimi K2.5: Visual Agentic Intelligence (native multimodal, Agent Swarm)", + "authors": "Moonshot AI", + "year": 2026, 
+ "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2602.02276" + } + ], + "branches": [ + { + "title": "T5: Exploring the Limits of Transfer Learning", + "authors": "Raffel et al.", + "year": 2020, + "venue": "JMLR", + "tags": [ + "支线" + ], + "arxiv": "1910.10683" + }, + { + "title": "PaLM / PaLM-2: Pathways Language Model", + "authors": "Chowdhery et al.", + "year": 2022, + "venue": "JMLR", + "tags": [ + "支线" + ], + "arxiv": "2204.02311" + }, + { + "title": "Gemma 2 / Gemma 3 系列", + "authors": "Google DeepMind", + "year": 2024, + "venue": "Google", + "tags": [ + "支线" + ], + "arxiv": "2408.00118" + } + ], + "forward": [] + }, + { + "id": "attention", + "name": "注意力机制演进", + "mainline": [ + { + "title": "Multi-Head Attention (MHA)", + "authors": "Vaswani et al.", + "year": 2017, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "1706.03762" + }, + { + "title": "MQA: Fast Transformer Decoding — One Write-Head Is All You Need", + "authors": "Shazeer", + "year": 2019, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "1911.02150" + }, + { + "title": "GQA: Training Generalized Multi-Query Transformer Models", + "authors": "Ainslie et al.", + "year": 2023, + "venue": "EMNLP", + "tags": [ + "关键节点" + ], + "arxiv": "2305.13245" + }, + { + "title": "MLA: Multi-head Latent Attention (DeepSeek-V2/V3)", + "authors": "DeepSeek-AI", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2405.04434" + }, + { + "title": "DeepSeek-V4: CSA+HCA Hybrid Attention (KV Cache → 10% of V3.2)", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "DeepSeek", + "tags": [ + "前沿" + ], + "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" + }, + { + "title": "FlashAttention-3 / Sparse Attention 工程化", + "authors": "Dao et al. 
/ DeepSeek", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2407.08608" + }, + { + "title": "TEST", + "authors": "Test", + "year": 2026, + "venue": "Test", + "tags": [ + "test" + ] + } + ], + "branches": [ + { + "title": "FlashAttention-1 / FlashAttention-2: IO-Aware Exact Attention", + "authors": "Dao et al.", + "year": 2022, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "2205.14135" + }, + { + "title": "Differential Transformer (DIFF Transformer)", + "authors": "Ye et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2410.05258" + } + ], + "forward": [ + { + "title": "Engram: Conditional Memory via Scalable Lookup (N-gram O(1), 分离记忆与推理)", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "arXiv", + "tags": [ + "前瞻" + ], + "arxiv": "2601.07372" + } + ] + }, + { + "id": "moe", + "name": "MoE 混合专家", + "mainline": [ + { + "title": "Sparsely-Gated MoE Layer: Outrageously Large Neural Networks", + "authors": "Shazeer et al.", + "year": 2017, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "1701.06538" + }, + { + "title": "Switch Transformers: Scaling to Trillion Parameter Models", + "authors": "Fedus et al.", + "year": 2022, + "venue": "JMLR", + "tags": [ + "关键节点" + ], + "arxiv": "2101.03961" + }, + { + "title": "Mixtral of Experts (8x7B, 开放 MoE)", + "authors": "Jiang et al.", + "year": 2024, + "venue": "Mistral AI", + "tags": [ + "关键节点" + ], + "arxiv": "2401.04088" + }, + { + "title": "DeepSeekMoE: Fine-Grained Expert Segmentation + Shared Experts", + "authors": "DeepSeek-AI", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2401.06066" + }, + { + "title": "DeepSeek-V4: 1.6T参数 MoE + 领域专家独立训练后合并 + On-Policy Distillation", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "DeepSeek", + "tags": [ + "前沿" + ], + "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" + } + ], + "branches": [ + { + "title": "GLaM: Efficient Scaling with 
Mixture-of-Experts", + "authors": "Du et al.", + "year": 2022, + "venue": "ICML", + "tags": [ + "支线" + ], + "arxiv": "2112.06905" + }, + { + "title": "DeepSeek-V3 Multi-Token Prediction (MTP)", + "authors": "DeepSeek-AI", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2412.19437" + }, + { + "title": "Muon: Muon is Scalable for LLM Training (Kimi 核心优化器)", + "authors": "Jordan et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2502.16982" + } + ], + "forward": [] + }, + { + "id": "posenc", + "name": "位置编码", + "mainline": [ + { + "title": "Sinusoidal Positional Encoding", + "authors": "Vaswani et al.", + "year": 2017, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "1706.03762" + }, + { + "title": "RoPE: Rotary Position Embedding", + "authors": "Su et al.", + "year": 2021, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2104.09864" + }, + { + "title": "ALiBi: Train Short, Test Long", + "authors": "Press et al.", + "year": 2022, + "venue": "ICLR", + "tags": [ + "关键节点" + ], + "arxiv": "2108.12409" + }, + { + "title": "YaRN: Efficient Context Window Extension of LLMs", + "authors": "Peng et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2309.00071" + }, + { + "title": "NoPE + 10M Token Context (Llama 4 Scout)", + "authors": "Kazemnejad et al. / Meta", + "year": 2025, + "venue": "ICML / Meta", + "tags": [ + "前沿" + ], + "arxiv": "2305.19466" + } + ], + "branches": [ + { + "title": "ReRoPE / SelfExtend: Training-Free Length Extension", + "authors": "Su et al. 
/ Jin et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2401.01325" + } + ], + "forward": [] + }, + { + "id": "norm", + "name": "归一化与激活函数", + "mainline": [ + { + "title": "Layer Normalization", + "authors": "Ba et al.", + "year": 2016, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "1607.06450" + }, + { + "title": "RMSNorm: Root Mean Square Layer Normalization", + "authors": "Zhang & Sennrich", + "year": 2019, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "1910.07467" + }, + { + "title": "DeepNorm: DeepNet — Scaling Transformers to 1,000 Layers", + "authors": "Wang et al.", + "year": 2022, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2203.00555" + }, + { + "title": "SwiGLU / GLU Variants (成为行业标配)", + "authors": "Shazeer et al.", + "year": 2020, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2002.05202" + }, + { + "title": "mHC: Manifold-Constrained Hyper-Connections (DeepSeek-V4 基础)", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2512.24880" + } + ], + "branches": [], + "forward": [] + }, + { + "id": "ssm", + "name": "新兴架构 (SSM / Linear Attention)", + "mainline": [ + { + "title": "S4: Efficiently Modeling Long Sequences with Structured State Spaces", + "authors": "Gu et al.", + "year": 2022, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "2111.00396" + }, + { + "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces", + "authors": "Gu & Dao", + "year": 2023, + "venue": "COLM", + "tags": [ + "关键节点" + ], + "arxiv": "2312.00752" + }, + { + "title": "RWKV: Reinventing RNNs for the Transformer Era", + "authors": "Peng et al.", + "year": 2023, + "venue": "EMNLP", + "tags": [ + "关键节点" + ], + "arxiv": "2305.13048" + }, + { + "title": "Mamba-2: Transformers are SSMs", + "authors": "Dao & Gu", + "year": 2024, + "venue": "ICML", + "tags": [ + "关键节点" + ], + "arxiv": "2405.21060" + }, + { + "title": "Kimi Linear (K2.5 混合架构: 
Transformer + Linear Attention)", + "authors": "Moonshot AI", + "year": 2026, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2602.02276" + }, + { + "title": "Titans: Learning to Memorize at Test Time (Neural Memory)", + "authors": "Behrouz et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2501.00663" + } + ], + "branches": [ + { + "title": "RetNet: Retentive Network — A Successor to Transformer", + "authors": "Sun et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2307.08621" + }, + { + "title": "Hymba: Hybrid Mamba-Transformer", + "authors": "NVIDIA / Meta", + "year": 2025, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2411.13676" + } + ], + "forward": [ + { + "title": "Titans: Learning to Memorize at Test Time (神经记忆, 超越 Transformer)", + "authors": "Behrouz et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前瞻" + ], + "arxiv": "2501.00663" + }, + { + "title": "MatMul-free Language Modeling (消除矩阵乘法, 类脑计算)", + "authors": "Zhu et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前瞻" + ], + "arxiv": "2406.02528" + } + ] + } + ] + }, + "multi": { + "id": "multi", + "name": "多模态", + "icon": "🖼️", + "desc": "视觉-语言、音频、视频、Omni 统一模态", + "color": "multi", + "areas": [ + { + "id": "vision", + "name": "视觉-语言模型", + "mainline": [ + { + "title": "CLIP: Learning Transferable Visual Models (对比学习奠基)", + "authors": "Radford et al.", + "year": 2021, + "venue": "ICML", + "tags": [ + "起点" + ], + "arxiv": "2103.00020" + }, + { + "title": "BLIP-2: Bootstrapping Language-Image Pre-training (Q-Former)", + "authors": "Li et al.", + "year": 2023, + "venue": "ICML", + "tags": [ + "关键节点" + ], + "arxiv": "2301.12597" + }, + { + "title": "LLaVA / LLaVA-1.5: Visual Instruction Tuning", + "authors": "Liu et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2304.08485" + }, + { + "title": "GPT-4o System Card: 原生多模态 (文本/图像/音频端到端)", + "authors": "OpenAI", + "year": 2024, + 
"venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2410.21276" + }, + { + "title": "Qwen3-VL / InternVL3 / Kimi K2.5 Visual: 开源 SOTA 视觉-语言", + "authors": "Alibaba / Shanghai AI Lab / Moonshot", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2602.02276" + } + ], + "branches": [ + { + "title": "SigLIP: Sigmoid Loss for Language Image Pre-training", + "authors": "Zhai et al.", + "year": 2023, + "venue": "ICCV", + "tags": [ + "支线" + ], + "arxiv": "2303.15343" + }, + { + "title": "Flamingo: Visual Language Model for Few-Shot Learning", + "authors": "Alayrac et al.", + "year": 2022, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "2204.14198" + } + ], + "forward": [] + }, + { + "id": "omni", + "name": "Omni 统一多模态", + "mainline": [ + { + "title": "Gemini 1.0 Technical Report: 原生多模态 (文本/图像/音频/视频)", + "authors": "Google DeepMind", + "year": 2023, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "2312.11805" + }, + { + "title": "GPT-4o System Card: 端到端 omni 多模态", + "authors": "OpenAI", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2410.21276" + }, + { + "title": "Chameleon: Mixed-Modal Early-Fusion Foundation Models", + "authors": "Team Chameleon (Meta)", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2405.09818" + }, + { + "title": "Qwen3.5-Omni Technical Report: 全模态 (文本/图像/音频/视频/语音)", + "authors": "Alibaba Qwen", + "year": 2026, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2604.15804" + } + ], + "branches": [ + { + "title": "EMU3: Next-Token Prediction is All You Need (Meta 统一多模态)", + "authors": "Wang et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2409.18869" + } + ], + "forward": [] + }, + { + "id": "audio", + "name": "音频/语音模型", + "mainline": [ + { + "title": "Whisper / Whisper-large-v3: Robust Speech Recognition", + "authors": "Radford et al.", + "year": 2023, + "venue": "ICML", + "tags": [ + "起点" + ], + "arxiv": "2212.04356" + 
}, + { + "title": "Qwen-Audio / Qwen2-Audio: 通用音频理解", + "authors": "Alibaba", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2311.07919" + }, + { + "title": "CosyVoice 2: Scalable Streaming Speech Synthesis", + "authors": "Du et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2412.10117" + } + ], + "branches": [ + { + "title": "Moshi: Real-time Speech Dialogue (Kyutai)", + "authors": "Défossez et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2410.00037" + } + ], + "forward": [] + }, + { + "id": "video", + "name": "视频理解", + "mainline": [ + { + "title": "Video-LLaMA / VideoChat: 视频-语言对话", + "authors": "Zhang et al.", + "year": 2023, + "venue": "EMNLP", + "tags": [ + "起点" + ], + "arxiv": "2306.02858" + }, + { + "title": "Gemini 1.5 Pro: 1M token 长上下文视频理解", + "authors": "Google DeepMind", + "year": 2024, + "venue": "Google", + "tags": [ + "关键节点" + ], + "arxiv": "2403.05530" + }, + { + "title": "LLaVA-OneVision / Qwen3-VL 视频能力", + "authors": "Liu et al. 
/ Qwen", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2408.03326" + } + ], + "branches": [], + "forward": [] + } + ] + }, + "data": { + "id": "data", + "name": "数据工程", + "icon": "📊", + "desc": "采集清洗、数据配比、合成数据、质量筛选", + "color": "data", + "areas": [ + { + "id": "dedup", + "name": "数据清洗与去重", + "mainline": [ + { + "title": "Deduplicating Training Data Makes Language Models Better", + "authors": "Lee et al.", + "year": 2022, + "venue": "ACL", + "tags": [ + "起点" + ], + "arxiv": "2107.06499" + }, + { + "title": "The Pile: An 800GB Dataset of Diverse Text", + "authors": "Gao et al.", + "year": 2020, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2101.00027" + }, + { + "title": "CCNet / RefinedWeb / FineWeb: 大规模高质量 Web 数据", + "authors": "Penedo et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2306.01116" + }, + { + "title": "DCLM: DataComp-LM — 数据筛选标准化基准", + "authors": "Li et al.", + "year": 2024, + "venue": "NeurIPS", + "tags": [ + "前沿" + ], + "arxiv": "2406.11794" + }, + { + "title": "Dolma / OLMo / datatrove: 全开放数据处理管线", + "authors": "Soldaini et al.", + "year": 2024, + "venue": "ACL", + "tags": [ + "前沿" + ], + "arxiv": "2402.00159" + } + ], + "branches": [ + { + "title": "C4 / mC4: Colossal Clean Crawled Corpus (T5)", + "authors": "Raffel et al.", + "year": 2020, + "venue": "JMLR", + "tags": [ + "支线" + ], + "arxiv": "1910.10683" + } + ], + "forward": [] + }, + { + "id": "mixing", + "name": "数据配比与课程学习", + "mainline": [ + { + "title": "DoReMi: Optimizing Data Mixtures Speeds Up LM Pretraining", + "authors": "Xie et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "2305.10429" + }, + { + "title": "DOGE: Domain-General Reweighting for LLM Pretraining", + "authors": "Fan et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2405.13063" + } + ], + "branches": [ + { + "title": "Scaling Data-Constrained Language Models (多轮重复训练)", + "authors": 
"Muennighoff et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "2305.16264" + } + ], + "forward": [] + }, + { + "id": "synthesis", + "name": "合成数据生成", + "mainline": [ + { + "title": "Self-Instruct: Aligning LM with Self-Generated Instructions", + "authors": "Wang et al.", + "year": 2023, + "venue": "ACL", + "tags": [ + "起点" + ], + "arxiv": "2212.10560" + }, + { + "title": "Evol-Instruct (WizardLM) / Orca: 渐进式指令演化", + "authors": "Xu et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2304.12244" + }, + { + "title": "Magpie: Alignment Data Synthesis from Scratch", + "authors": "Xu et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2406.06859" + }, + { + "title": "Phi-4: 合成数据驱动的推理训练 + 代码合成", + "authors": "Microsoft Research", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2412.08905" + }, + { + "title": "DeepSeek-R1 冷启动数据合成 (长推理链)", + "authors": "DeepSeek-AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2501.12948" + } + ], + "branches": [ + { + "title": "Alpaca: A Strong, Replicable Instruction-Following Model (Stanford CRFM)", + "authors": "Taori et al.", + "year": 2023, + "venue": "Stanford", + "tags": [ + "支线" + ], + "arxiv": "2303.08774" + } + ], + "forward": [] + } + ] + }, + "pretrain": { + "id": "pretrain", + "name": "预训练", + "icon": "🔥", + "desc": "训练目标、分布式并行、训练稳定性、Scaling Law、长上下文", + "color": "pretrain", + "areas": [ + { + "id": "scaling", + "name": "Scaling Law 与计算优化", + "mainline": [ + { + "title": "Scaling Laws for Neural Language Models (Kaplan)", + "authors": "Kaplan et al.", + "year": 2020, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "2001.08361" + }, + { + "title": "Chinchilla: Training Compute-Optimal Large Language Models", + "authors": "Hoffmann et al.", + "year": 2022, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2203.15556" + }, + { + "title": "Scaling Data-Constrained 
Language Models (重复训练 scaling)", + "authors": "Muennighoff et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2305.16264" + }, + { + "title": "Qwen3 / DeepSeek-V3 实践: 超 Chinchilla ~15×~60× tokens", + "authors": "Alibaba / DeepSeek", + "year": 2025, + "venue": "Industry", + "tags": [ + "前沿" + ], + "arxiv": "2503.20630" + }, + { + "title": "Inference-Aware Scaling Laws (部署成本纳入 scaling)", + "authors": "Sardana & Frankle", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2401.00448" + } + ], + "branches": [], + "forward": [] + }, + { + "id": "distributed", + "name": "分布式训练系统", + "mainline": [ + { + "title": "Megatron-LM: Training Multi-Billion Parameter Models (TP/PP)", + "authors": "Shoeybi et al.", + "year": 2020, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "1909.08053" + }, + { + "title": "ZeRO / ZeRO++ / ZeRO-3: Memory Optimizations (DeepSpeed)", + "authors": "Rajbhandari et al.", + "year": 2020, + "venue": "SC", + "tags": [ + "关键节点" + ], + "arxiv": "1910.02054" + }, + { + "title": "3D 并行 + 序列并行 + 专家并行 (DeepSeek-V3 部署)", + "authors": "DeepSeek-AI", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2412.19437" + } + ], + "branches": [ + { + "title": "GPipe / PipeDream: Pipeline Parallelism 基础", + "authors": "Huang et al.", + "year": 2019, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "1811.06965" + }, + { + "title": "Ring Attention / Striped Attention: 长序列分布式训练", + "authors": "Liu et al. 
/ Brandon et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2310.01889" + } + ], + "forward": [] + }, + { + "id": "stability", + "name": "训练稳定性与精度", + "mainline": [ + { + "title": "Mixed Precision Training (FP16 → 标准)", + "authors": "Micikevicius et al.", + "year": 2018, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "1710.03740" + }, + { + "title": "FP8 Training for LLMs (H100/B200 原生支持)", + "authors": "Micikevicius et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2310.18313" + } + ], + "branches": [ + { + "title": "GLM / BLOOM: 千卡训练稳定性工程经验", + "authors": "Zeng et al. / BigScience", + "year": 2023, + "venue": "KDD", + "tags": [ + "支线" + ], + "arxiv": "2210.02414" + } + ], + "forward": [] + } + ] + }, + "post": { + "id": "post", + "name": "后训练", + "icon": "🎯", + "desc": "SFT、RLHF/DPO/GRPO、推理增强、安全对齐", + "color": "post", + "areas": [ + { + "id": "sft", + "name": "SFT 监督微调", + "mainline": [ + { + "title": "Finetuned Language Models Are Zero-Shot Learners (FLAN)", + "authors": "Chung et al.", + "year": 2022, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "2109.01652" + }, + { + "title": "LIMA: Less Is More for Alignment (1,000 高质量样本)", + "authors": "Zhou et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2305.11206" + }, + { + "title": "Tulu 3 / Orca 3: 系统化后训练管线", + "authors": "AllenAI / MSR", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2411.15124" + } + ], + "branches": [ + { + "title": "LoRA: Low-Rank Adaptation of LLMs", + "authors": "Hu et al.", + "year": 2022, + "venue": "ICLR", + "tags": [ + "支线" + ], + "arxiv": "2106.09685" + }, + { + "title": "QLoRA: Efficient Finetuning of Quantized LLMs", + "authors": "Dettmers et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "2305.14314" + } + ], + "forward": [] + }, + { + "id": "alignment", + "name": "偏好对齐 (RLHF → DPO → GRPO)", + "mainline": [ + { + 
"title": "InstructGPT: Training Language Models to Follow Instructions (RLHF + PPO)", + "authors": "Ouyang et al.", + "year": 2022, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "2203.02155" + }, + { + "title": "Constitutional AI: Harmlessness from AI Feedback", + "authors": "Bai et al.", + "year": 2022, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2212.08073" + }, + { + "title": "DPO: Direct Preference Optimization", + "authors": "Rafailov et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2305.18290" + }, + { + "title": "DeepSeek-R1 / GRPO: 纯 RL 驱动推理涌现 (无人工标注)", + "authors": "DeepSeek-AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2501.12948" + }, + { + "title": "SimPO / ORPO / KTO: 参考模型不可知的对齐方法", + "authors": "Meng et al.", + "year": 2024, + "venue": "ICML", + "tags": [ + "前沿" + ], + "arxiv": "2405.14734" + } + ], + "branches": [ + { + "title": "RLAIF: Constitutional AI — RL from AI Feedback", + "authors": "Bai et al.", + "year": 2022, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2212.08073" + } + ], + "forward": [] + }, + { + "id": "reasoning", + "name": "推理增强 (Reasoning)", + "mainline": [ + { + "title": "Chain-of-Thought Prompting Elicits Reasoning in LLMs", + "authors": "Wei et al.", + "year": 2022, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "2201.11903" + }, + { + "title": "Self-Consistency Improves Chain of Thought Reasoning", + "authors": "Wang et al.", + "year": 2023, + "venue": "ICLR", + "tags": [ + "关键节点" + ], + "arxiv": "2203.11171" + }, + { + "title": "STaR: Self-Taught Reasoner / ReST (自学习推理链)", + "authors": "Zelikman et al.", + "year": 2022, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2203.14465" + }, + { + "title": "OpenAI o1 System Card / o3 System Card (推理时 Scaling)", + "authors": "OpenAI", + "year": 2024, + "venue": "OpenAI", + "tags": [ + "前沿" + ], + "arxiv": "2412.16720" + }, + { + "title": "DeepSeek-R1: 开源推理模型 (RL + 
Cold-Start SFT)", + "authors": "DeepSeek-AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2501.12948" + }, + { + "title": "Kimi K2-Thinking (扩展推理, 256K上下文)", + "authors": "Moonshot AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2507.20534" + } + ], + "branches": [ + { + "title": "Tree of Thoughts: Deliberate Problem Solving with LLMs", + "authors": "Yao et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "2305.10601" + }, + { + "title": "PRM/ORM: Let's Verify Step by Step (过程/结果奖励模型)", + "authors": "Lightman et al.", + "year": 2024, + "venue": "ICLR", + "tags": [ + "支线" + ], + "arxiv": "2305.20050" + } + ], + "forward": [ + { + "title": "Quiet-STaR: Language Models Can Teach Themselves to Think Before Speaking", + "authors": "Zelikman et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前瞻" + ], + "arxiv": "2403.09629" + } + ] + } + ] + }, + "compress": { + "id": "compress", + "name": "模型压缩", + "icon": "📦", + "desc": "量化、剪枝、蒸馏、KV Cache 压缩", + "color": "compress", + "areas": [ + { + "id": "quant", + "name": "量化 (Quantization)", + "mainline": [ + { + "title": "GPTQ: Accurate Post-Training Quantization for GPT", + "authors": "Frantar et al.", + "year": 2023, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "2210.17323" + }, + { + "title": "AWQ: Activation-aware Weight Quantization", + "authors": "Lin et al.", + "year": 2024, + "venue": "MLSys", + "tags": [ + "关键节点" + ], + "arxiv": "2306.00978" + }, + { + "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization", + "authors": "Xiao et al.", + "year": 2024, + "venue": "ICML", + "tags": [ + "关键节点" + ], + "arxiv": "2211.10438" + }, + { + "title": "QuaRot / SpinQuant: Outlier-Free 4-bit Quantization", + "authors": "Ashkboos et al.", + "year": 2025, + "venue": "ICLR", + "tags": [ + "前沿" + ], + "arxiv": "2404.00456" + }, + { + "title": "DeepSeek-V4 FP4 QAT: 4-bit 量化感知训练 (99.7% recall)", + "authors": 
"DeepSeek-AI", + "year": 2026, + "venue": "DeepSeek", + "tags": [ + "前沿" + ], + "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" + } + ], + "branches": [ + { + "title": "FP8 / FP4 推理 (NVIDIA B200 原生支持)", + "authors": "Micikevicius et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2310.18313" + } + ], + "forward": [ + { + "title": "BitNet b1.58: 1.58-bit LLM (三值量化 {−1,0,1})", + "authors": "Wang et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前瞻" + ], + "arxiv": "2402.17764" + } + ] + }, + { + "id": "prune", + "name": "剪枝 (Pruning)", + "mainline": [ + { + "title": "SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot", + "authors": "Frantar & Alistarh", + "year": 2023, + "venue": "ICML", + "tags": [ + "起点" + ], + "arxiv": "2301.00774" + }, + { + "title": "Wanda: A Simple and Effective Pruning Approach for LLMs", + "authors": "Sun et al.", + "year": 2024, + "venue": "ICLR", + "tags": [ + "关键节点" + ], + "arxiv": "2306.11695" + }, + { + "title": "SliceGPT: Compress LLMs by Deleting Rows/Columns", + "authors": "Ashkboos et al.", + "year": 2024, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2401.15024" + }, + { + "title": "LLM-Pruner / ShortGPT: 结构化剪枝 + 层剪枝", + "authors": "Ma et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2305.11627" + } + ], + "branches": [], + "forward": [] + }, + { + "id": "distill", + "name": "知识蒸馏", + "mainline": [ + { + "title": "Distilling the Knowledge in a Neural Network", + "authors": "Hinton et al.", + "year": 2015, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "1503.02531" + }, + { + "title": "Orca: Progressive Learning from Complex Explanation Traces", + "authors": "Mukherjee et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2306.02707" + }, + { + "title": "DeepSeek-R1 蒸馏 (R1 → Qwen/Llama 小模型)", + "authors": "DeepSeek-AI", + "year": 2025, + "venue": 
"arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2501.12948" + }, + { + "title": "DeepSeek-V4 On-Policy Distillation: 10 Teacher Models → 1 Student", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "DeepSeek", + "tags": [ + "前沿" + ], + "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" + } + ], + "branches": [], + "forward": [] + }, + { + "id": "kvcache", + "name": "KV Cache 压缩", + "mainline": [ + { + "title": "StreamingLLM: Efficient Streaming Language Models with Attention Sinks", + "authors": "Xiao et al.", + "year": 2024, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "2309.17453" + }, + { + "title": "H2O: Heavy-Hitter Oracle for Efficient KV Cache Eviction", + "authors": "Zhang et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2306.14048" + }, + { + "title": "FastGen / KIVI / CacheGen: 自适应 KV 压缩与量化", + "authors": "Ge et al.", + "year": 2024, + "venue": "ICLR / SIGCOMM", + "tags": [ + "关键节点" + ], + "arxiv": "2310.07240" + }, + { + "title": "DeepSeek-V4 CSA+HCA: KV Cache 降至 V3.2 的 10%, FLOPs 降至 27%", + "authors": "DeepSeek-AI", + "year": 2026, + "venue": "DeepSeek", + "tags": [ + "前沿" + ], + "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" + }, + { + "title": "GQA/MQA: 从架构层面减少 KV 头 (Llama3/DeepSeek 标配)", + "authors": "Ainslie et al.", + "year": 2023, + "venue": "EMNLP", + "tags": [ + "前沿" + ], + "arxiv": "2305.13245" + } + ], + "branches": [], + "forward": [] + } + ] + }, + "deploy": { + "id": "deploy", + "name": "部署推理", + "icon": "🚀", + "desc": "推理引擎、加速技术、服务化、边缘部署、成本优化", + "color": "deploy", + "areas": [ + { + "id": "engine", + "name": "推理引擎与框架", + "mainline": [ + { + "title": "vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention", + "authors": "Kwon et al.", + "year": 2023, + "venue": "SOSP", + "tags": [ + "起点" + ], + "arxiv": "2309.06180" + }, + { + "title": "SGLang: Efficient LLM Programming and Serving", + "authors": "Zheng et 
al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2312.07104" + } + ], + "branches": [ + { + "title": "MII / DeepSpeed-FastGen: Dynamic SplitFuse", + "authors": "Microsoft", + "year": 2024, + "venue": "Industry", + "tags": [ + "支线" + ], + "arxiv": "2401.08671" + } + ], + "forward": [] + }, + { + "id": "accel", + "name": "推理加速技术", + "mainline": [ + { + "title": "Speculative Decoding: Fast Inference from Transformers", + "authors": "Leviathan et al. / Chen et al.", + "year": 2023, + "venue": "ICML", + "tags": [ + "起点" + ], + "arxiv": "2211.17192" + }, + { + "title": "Medusa: Simple LLM Inference Acceleration with Multiple Heads", + "authors": "Cai et al.", + "year": 2024, + "venue": "ICML", + "tags": [ + "关键节点" + ], + "arxiv": "2401.10774" + }, + { + "title": "EAGLE / EAGLE-2: 推测解码框架 (无需 draft model)", + "authors": "Li et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2401.15077" + }, + { + "title": "Continuous Batching + Prefill-Decode Disaggregation", + "authors": "Patel et al. / Yu et al.", + "year": 2024, + "venue": "OSDI", + "tags": [ + "前沿" + ], + "arxiv": "2401.08671" + }, + { + "title": "DeepSeek-V3 Multi-Token Prediction (MTP) + V4 Flash 推理", + "authors": "DeepSeek-AI", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2412.19437" + } + ], + "branches": [ + { + "title": "FlashDecoding / FlashInfer: GPU 算子级加速", + "authors": "Dao et al. 
/ Ye et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2310.12049" + }, + { + "title": "KV Cache Offloading (GPU ↔ CPU 动态迁移)", + "authors": "Sheng et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2303.06865" + } + ], + "forward": [] + } + ] + }, + "agent": { + "id": "agent", + "name": "Agent & 应用", + "icon": "🤖", + "desc": "Tool Use、RAG、Multi-Agent、MCP/A2A、Computer Use", + "color": "agent", + "areas": [ + { + "id": "react", + "name": "Agent 核心 (ReAct / Tool Use / Computer Use)", + "mainline": [ + { + "title": "ReAct: Synergizing Reasoning and Acting in Language Models", + "authors": "Yao et al.", + "year": 2023, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "2210.03629" + }, + { + "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", + "authors": "Schick et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2302.04761" + }, + { + "title": "SWE-Agent: Agent-Computer Interfaces for Software Engineering", + "authors": "Yang et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2405.15793" + }, + { + "title": "SEAgent: Self-Evolving Computer Use Agent (GRPO 自主学习)", + "authors": "Sun et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2508.04700" + } + ], + "branches": [ + { + "title": "Voyager: Open-Ended Embodied Agent with LLMs (Minecraft)", + "authors": "Wang et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "支线" + ], + "arxiv": "2305.16291" + }, + { + "title": "Claude Opus 4 & Sonnet 4 System Card (Agent 安全评估)", + "authors": "Anthropic", + "year": 2025, + "venue": "Anthropic", + "tags": [ + "支线" + ], + "pdf": "https://www-cdn.anthropic.com/6be99a52cb68eb70eb9572b4cafad13df32ed995.pdf" + } + ], + "forward": [] + }, + { + "id": "rag", + "name": "RAG 检索增强生成", + "mainline": [ + { + "title": "RAG: Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", + "authors": "Lewis 
et al.", + "year": 2020, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "2005.11401" + }, + { + "title": "Self-RAG: Learning to Retrieve, Generate, and Critique", + "authors": "Asai et al.", + "year": 2024, + "venue": "ICLR", + "tags": [ + "关键节点" + ], + "arxiv": "2310.11511" + }, + { + "title": "GraphRAG: From Local to Global — Graph-based RAG", + "authors": "Edge et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2404.16130" + }, + { + "title": "Agentic RAG / Corrective RAG / Adaptive RAG", + "authors": "Yan et al. / Asai et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2401.15884" + } + ], + "branches": [ + { + "title": "HyDE: Precise Zero-Shot Dense Retrieval without Relevant Labels", + "authors": "Gao et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2212.10496" + }, + { + "title": "ColBERT / ColPali: Late-Interaction / Visual RAG", + "authors": "Khattab et al.", + "year": 2022, + "venue": "SIGIR", + "tags": [ + "支线" + ], + "arxiv": "2112.01488" + } + ], + "forward": [] + }, + { + "id": "multiagent", + "name": "Multi-Agent & 协议", + "mainline": [ + { + "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", + "authors": "Wu et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "2308.08155" + }, + { + "title": "MetaGPT / ChatDev / CrewAI: 多 Agent 协作框架", + "authors": "Hong et al. 
/ Qian et al.", + "year": 2024, + "venue": "ACL", + "tags": [ + "关键节点" + ], + "arxiv": "2308.00352" + }, + { + "title": "Kimi K2.5 Agent Swarm: 100 子 Agent / 1,500 工具调用", + "authors": "Moonshot AI", + "year": 2026, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2602.02276" + } + ], + "branches": [ + { + "title": "Gorilla: Large Language Model Connected with Massive APIs", + "authors": "Patil et al.", + "year": 2023, + "venue": "ICML", + "tags": [ + "支线" + ], + "arxiv": "2305.15334" + } + ], + "forward": [] + } + ] + }, + "eval": { + "id": "eval", + "name": "评估体系", + "icon": "📏", + "desc": "通用、推理、编码、Agent、安全评估", + "color": "eval", + "areas": [ + { + "id": "general", + "name": "通用能力评估", + "mainline": [ + { + "title": "MMLU: Measuring Massive Multitask Language Understanding", + "authors": "Hendrycks et al.", + "year": 2021, + "venue": "ICLR", + "tags": [ + "起点" + ], + "arxiv": "2009.03300" + }, + { + "title": "BIG-bench: Beyond the Imitation Game (204 tasks)", + "authors": "Srivastava et al.", + "year": 2023, + "venue": "TMLR", + "tags": [ + "关键节点" + ], + "arxiv": "2206.04615" + }, + { + "title": "MMLU-Pro: A More Robust and Challenging Benchmark", + "authors": "Wang et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2406.01574" + }, + { + "title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark", + "authors": "White et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2406.19314" + }, + { + "title": "Humanity's Last Exam (HLE): 人类知识极限测试", + "authors": "Phan et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2501.14249" + } + ], + "branches": [ + { + "title": "C-Eval / CMMLU: 中文评估", + "authors": "Huang et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "支线" + ], + "arxiv": "2305.08322" + } + ], + "forward": [] + }, + { + "id": "reasoning", + "name": "推理 & 数学评估", + "mainline": [ + { + "title": "GSM8K / MATH: 数学推理基础", + "authors": "Cobbe et al. 
/ Hendrycks et al.", + "year": 2021, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "2103.03874" + }, + { + "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", + "authors": "Rein et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2311.12022" + }, + { + "title": "BBH: Challenging BIG-Bench Tasks (BIG-Bench Hard)", + "authors": "Suzgun et al.", + "year": 2023, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2210.09261" + }, + { + "title": "HLE (Humanity's Last Exam): 3000问题极限测试", + "authors": "Phan et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2501.14249" + } + ], + "branches": [], + "forward": [] + }, + { + "id": "code", + "name": "编码评估", + "mainline": [ + { + "title": "HumanEval / MBPP: 函数级代码生成", + "authors": "Chen et al. / Austin et al.", + "year": 2021, + "venue": "NeurIPS", + "tags": [ + "起点" + ], + "arxiv": "2107.03374" + }, + { + "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", + "authors": "Jimenez et al.", + "year": 2024, + "venue": "ICLR", + "tags": [ + "关键节点" + ], + "arxiv": "2310.06770" + }, + { + "title": "LiveCodeBench: Holistic and Contamination-Free Coding", + "authors": "Jain et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2408.07935" + }, + { + "title": "BigCodeBench: Benchmarking Code Generation with Diverse Tasks", + "authors": "Zhuo et al.", + "year": 2025, + "venue": "arXiv", + "tags": [ + "前沿" + ], + "arxiv": "2406.15877" + } + ], + "branches": [], + "forward": [] + }, + { + "id": "agent_eval", + "name": "Agent 评估", + "mainline": [ + { + "title": "BFCL: Berkeley Function Calling Leaderboard", + "authors": "Yan et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "起点" + ], + "arxiv": "2402.16053" + }, + { + "title": "τ-bench: Agent Tool Use & Task Completion", + "authors": "Yao et al.", + "year": 2024, + "venue": "arXiv", + "tags": [ + "关键节点" + ], + "arxiv": "2406.12045" + }, + { 
+ "title": "WebArena / VisualWebArena: 真实 Web 任务 Agent 评估", + "authors": "Zhou et al.", + "year": 2024, + "venue": "ICLR", + "tags": [ + "关键节点" + ], + "arxiv": "2307.13854" + }, + { + "title": "GAIA: A Benchmark for General AI Assistants", + "authors": "Mialon et al.", + "year": 2023, + "venue": "NeurIPS", + "tags": [ + "关键节点" + ], + "arxiv": "2311.12983" + } + ], + "branches": [], + "forward": [] + } + ] + } +} \ No newline at end of file diff --git a/nginx.conf b/nginx.conf new file mode 100644 index 0000000..7ec7960 --- /dev/null +++ b/nginx.conf @@ -0,0 +1,55 @@ +# LLM Paper Library — Nginx reverse-proxy configuration +# Deploy path: /etc/nginx/sites-available/llm-library + +# Request rate limiting (abuse protection). limit_req_zone is only valid in the http context, so it must sit outside server {}. +limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s; + +server { + listen 80; + server_name your-domain.com; + + # Security headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; + add_header Referrer-Policy "no-referrer" always; + + # API proxy + location /api/ { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + limit_req zone=api burst=20 nodelay; + } + + # PDF proxy — force inline disposition so IDM does not pop up a download + location /papers/ { + proxy_pass http://127.0.0.1:8741; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_read_timeout 60s; + # Add an inline header to keep IDM from intercepting. NOTE(review): add_header at this level stops inheritance of the server-level headers, so X-Frame-Options / Referrer-Policy are not sent for /papers/ — confirm that is intended. + add_header Content-Disposition "inline" always; + add_header X-Content-Type-Options "nosniff" always; + } + + # Static files are served directly by Nginx (better performance) + location /style.css { + alias /opt/llm-library/static/style.css; + expires 1d; + } + location /app.js { + alias /opt/llm-library/static/app.js; + expires 1d; + } + location /favicon.ico { + return 204; + } + + # Home page + location / { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + } +} diff --git a/proxy_conf.txt b/proxy_conf.txt new file mode 100644 index
0000000..622984d --- /dev/null +++ b/proxy_conf.txt @@ -0,0 +1,10 @@ +location / { + proxy_pass http://127.0.0.1:8741; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + # 强制 inline 阻止 IDM 弹下载 + add_header Content-Disposition "inline" always; + add_header X-Content-Type-Options "nosniff" always; +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..900ce4a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[project] +name = "llm-library" +version = "0.1.0" +description = "LLM 论文图书馆 — 可维护的大模型论文知识库" +requires-python = ">=3.10" +dependencies = [ + "fastapi>=0.115", + "uvicorn[standard]>=0.34", + "httpx>=0.28", + "pydantic>=2.10", + "python-multipart>=0.0.19", + "aiofiles>=24.0", + "tqdm>=4.66", +] + +[project.scripts] +llm-lib = "api.server:main" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d71dac4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +fastapi>=0.115 +uvicorn[standard]>=0.34 +httpx>=0.28 +pydantic>=2.10 +python-multipart>=0.0.19 +aiofiles>=24.0 +tqdm>=4.66 +PyMuPDF>=1.24 diff --git a/start.sh b/start.sh new file mode 100755 index 0000000..9baaf0c --- /dev/null +++ b/start.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# LLM Paper Library — startup script +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +cd "$SCRIPT_DIR" + +# Load environment variables from .env +if [ -f .env ]; then + export $(grep -v '^#' .env | xargs) +fi + +# Generate an API key if none is set; append (never truncate) so existing .env settings survive, and persist it under the name checked here so restarts reuse it +if [ -z "$LLM_LIB_API_KEY" ]; then + export LLM_LIB_API_KEY=$(python3 -c "import secrets; print(secrets.token_urlsafe(32))") + printf '\nAPI_KEY=%s\nLLM_LIB_API_KEY=%s\n' "$LLM_LIB_API_KEY" "$LLM_LIB_API_KEY" >> .env + echo "⚠️ 自动生成 API Key: $LLM_LIB_API_KEY" +fi + +echo "═══ LLM 论文图书馆 ═══" +echo " API Key: ${LLM_LIB_API_KEY:0:8}..." +echo " Port: ${PORT:-8000}" +echo " PDF Dir: papers/" +echo + +# First run: install dependencies +if ! python3 -c "import fastapi" 2>/dev/null; then + echo "📦 安装依赖..."
+ pip install -r requirements.txt -q +fi + +# 如果 papers.json 不存在,从 HTML 重新提取 +if [ ! -f data/papers.json ]; then + echo "📊 提取论文数据..." + python3 api/extract_data.py || echo "⚠️ extract_data.py 失败,请手动运行" +fi + +# 启动服务 +echo "🚀 启动服务..." +exec python3 -m uvicorn api.server:app \ + --host 0.0.0.0 \ + --port ${PORT:-8000} \ + --log-level ${LOG_LEVEL:-info} diff --git a/static/app.js b/static/app.js new file mode 100644 index 0000000..62483d8 --- /dev/null +++ b/static/app.js @@ -0,0 +1,252 @@ +/** + * LLM 论文图书馆 — 前端 JS + * 页面加载检测 arXiv/HF 连通性 → 底部状态条 + * 点击论文:arXiv 连通? → iframe 直连 arXiv → 5s 超时 → HK 兜底 + * IDM 拦就拦,不额外对抗 + */ + +const API = '/api'; + +let modules = {}; +let moduleData = {}; +let pdfTimeout = null; +let networkStatus = {}; + +const $ = (sel) => document.querySelector(sel); +const $$ = (sel) => document.querySelectorAll(sel); +const TAG_CLASS = { '起点':'tag-start','关键节点':'tag-milestone','前沿':'tag-frontier','前瞻':'tag-forward','支线':'tag-branch' }; + +// ══════════════════ INIT ═══════════════════════════════ +async function init() { + buildStatusBar(); + try { + const resp = await fetch(`${API}/modules`); + const mods = await resp.json(); + for (const m of mods) modules[m.id] = m; + renderCards(mods); + attachGlowTracking(); + } catch (e) { console.error(e); } + checkSources(); + document.addEventListener('keydown', e => { + if (e.key === 'Escape') { + if ($('#pdfOverlay')?.classList.contains('open')) closePdf(); + else if ($('#overlay').classList.contains('open')) closeModal(); + } + }); +} + +// ══════════════════ STATUS BAR ════════════════════════ +function buildStatusBar() { + const bar = document.createElement('div'); + bar.id = 'statusBar'; bar.className = 'status-bar'; + bar.innerHTML = `连通性检测 + arXiv — + HuggingFace —`; + document.body.appendChild(bar); +} + +function setStatus(id, ok, ms, aborted) { + networkStatus[id.replace('status-','')] = ok; + const el = document.getElementById(id); if (!el) return; + el.querySelector('.status-dot').className = 
'status-dot ' + (ok ? 'status-ok' : 'status-fail'); + const msEl = document.getElementById('ms-'+id.replace('status-','')); + if (msEl) { + if (aborted) msEl.textContent = '超时'; + else if (ok) msEl.textContent = ms+'ms'; + else msEl.textContent = '—'; + } +} + +async function checkSource(name, url, statusId) { + const start = performance.now(); + let aborted = false; + const probeUrl = url + '?_=' + Date.now(); + try { + const ctrl = new AbortController(); + setTimeout(() => { aborted = true; ctrl.abort(); }, 4000); + await fetch(probeUrl, { mode: 'no-cors', signal: ctrl.signal, cache: 'no-store' }); + const ms = Math.round(performance.now() - start); + setStatus(statusId, true, ms, false); + } catch { + const ms = Math.round(performance.now() - start); + setStatus(statusId, false, ms, aborted); + } +} + +function checkSources() { + checkSource('arxiv', 'https://arxiv.org/favicon.ico', 'status-arxiv'); + checkSource('hf', 'https://huggingface.co/favicon.ico', 'status-hf'); +} + +// ══════════════════ GLOW ══════════════════════════════ +function attachGlowTracking() { + $$('.card').forEach(card => { + card.addEventListener('mousemove', e => { + const r = card.getBoundingClientRect(); + card.style.setProperty('--mx', (e.clientX-r.left)/r.width*100+'%'); + card.style.setProperty('--my', (e.clientY-r.top)/r.height*100+'%'); + }); + }); +} + +// ══════════════════ CARDS → MODAL → PAPERS ═══════════ +function renderCards(mods) { + $('#moduleGrid').innerHTML = mods.map(m => + `
暂无论文数据
'; +} + +function renderPaper(p) { + const pdfUrl = getPdfLink(p); + const tags = (p.tags||[]).map(t=>`${t}`).join(' '); + const links = []; + if (pdfUrl) links.push(``); + else if (p.arxiv) links.push(`📋 arXiv`); + return `