feat: LLM 论文图书馆 — 初始提交

- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先 (5s 超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
2026-06-02 10:25:14 +00:00
commit f0ff62e082
21 changed files with 4042 additions and 0 deletions
@@ -0,0 +1 @@
+# LLM 论文图书馆 — API 模块
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Backfill all uncached PDFs, skipping dead arXiv IDs"""
+import json, subprocess, sys, time
+from pathlib import Path
+
+PAPERS_JSON = Path(__file__).resolve().parent.parent / "data" / "papers.json"
+ARXIV_DIR   = Path(__file__).resolve().parent.parent / "papers" / "arxiv"
+LOG_FILE    = Path("/app/papers/backfill.log")  # NOTE(review): not referenced anywhere in this script
+
+# Downloads at or below this size are treated as failures (error page, not a paper).
+_MIN_PDF_BYTES = 5000
+
+
+def main():
+    """Download every arXiv PDF referenced by papers.json that is not cached yet.
+
+    Each file is fetched with ``wget`` into a ``*.pdf.part`` sibling and only
+    renamed to ``<id>.pdf`` after it passes the size check, so an interrupted
+    run never leaves a truncated file that a later run would count as cached.
+    Progress and a final ok/failed summary are printed to stdout.
+    """
+    # papers.json holds non-ASCII text; do not depend on the locale encoding.
+    with open(PAPERS_JSON, encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Collect all arxiv IDs
+    arxiv_ids = set()
+    for mod in data.values():
+        for area in mod.get("areas", []):
+            for section in ("mainline", "branches", "forward"):
+                for p in area.get(section, []):
+                    aid = p.get("arxiv")
+                    if aid:
+                        arxiv_ids.add(aid)
+
+    # wget cannot create the target directory itself.
+    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
+
+    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
+    # Sorted so repeated runs walk the backlog in the same order.
+    missing = sorted(aid for aid in arxiv_ids if aid not in cached)
+    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")
+
+    if not missing:
+        print("All caught up!")
+        return
+
+    ok, fail = 0, 0
+    for aid in missing:
+        url = f"https://arxiv.org/pdf/{aid}.pdf"
+        dest = ARXIV_DIR / f"{aid}.pdf"
+        part = dest.with_name(dest.name + ".part")
+        try:
+            r = subprocess.run(
+                ["wget", "-q", "-T", "15", "-O", str(part), url],
+                timeout=20
+            )
+            # Measure before any cleanup so the FAIL line reports the real size.
+            size = part.stat().st_size if part.exists() else 0
+            if r.returncode == 0 and size > _MIN_PDF_BYTES:
+                part.replace(dest)
+                ok += 1
+                print(f"  OK  {aid}  ({size//1024} KB)")
+            else:
+                part.unlink(missing_ok=True)
+                fail += 1
+                print(f"  FAIL {aid}  (rc={r.returncode}, sz={size})")
+        except (subprocess.SubprocessError, OSError) as e:
+            # Covers the 20 s timeout, a missing wget binary and filesystem errors.
+            part.unlink(missing_ok=True)
+            fail += 1
+            print(f"  ERR {aid}  {e}")
+        time.sleep(0.8)  # Be nice to arXiv
+
+    print(f"\nDone: {ok} ok, {fail} failed")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,13 @@
+import urllib.request, json
+
+# Manual smoke test for the translation endpoint of a locally running server.
+# It prints the page number and the start of the English/Chinese text for the
+# first paragraph, then the page number and Chinese text of paragraph index 5.
+# The response is closed as soon as the body has been read.
+with urllib.request.urlopen("http://127.0.0.1:8000/api/translate/2005.14165") as r:
+    data = json.loads(r.read())
+p = data["paragraphs"][0]
+page1 = p["page"]
+en1 = p["en"][:60]
+zh1 = p["zh"][:80]
+print(f"page={page1}, en={en1}")
+print(f"zh={zh1}")
+p5 = data["paragraphs"][5]
+page5 = p5["page"]
+zh5 = p5["zh"][:80]
+print(f"para[5] page={page5}, zh={zh5}")
@@ -0,0 +1,198 @@
+"""
+LLM 论文图书馆 — PDF 下载器
+从 arXiv 和 HuggingFace 下载论文 PDF 到本地缓存
+"""
+
+import os
+import json
+import time
+import logging
+from pathlib import Path
+
+import httpx
+from tqdm import tqdm
+
+# Repository root (this module lives one directory below it) and the locations
+# derived from it.
+ROOT = Path(__file__).resolve().parent.parent
+DATA_FILE = ROOT / "data" / "papers.json"
+ARXIV_DIR = ROOT / "papers" / "arxiv"
+HF_DIR = ROOT / "papers" / "hf"
+LOG_FILE = ROOT / "papers" / "download.log"
+
+# The file handler below opens LOG_FILE as soon as this module is imported,
+# which happens before run() creates the papers/ tree, so the directory has
+# to exist already or the import fails with FileNotFoundError.
+LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+log = logging.getLogger("downloader")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s %(message)s",
+    handlers=[
+        # Log lines include paper titles and an em dash; write them as UTF-8
+        # regardless of the locale.
+        logging.FileHandler(LOG_FILE, encoding="utf-8"),
+        logging.StreamHandler(),
+    ],
+)
+
+
+def collect_urls() -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
+    """Collect every PDF that papers.json refers to, without duplicates.
+
+    Walks all modules, their areas and the three paper sections. An entry
+    contributes to the arXiv list when it has an ``arxiv`` value and to the
+    HF list when it has a ``pdf`` value; each id/URL is reported once, in
+    first-seen order.
+
+    Returns:
+        arxiv_list: [(arxiv_id, title), ...]
+        hf_list: [(url, filename), ...] where filename is the last URL path
+            segment with ".pdf" removed
+    """
+    # The data file holds non-ASCII text; do not depend on the locale encoding.
+    with open(DATA_FILE, encoding="utf-8") as f:
+        data = json.load(f)
+
+    arxiv_seen = set()
+    hf_seen = set()
+    arxiv_list = []
+    hf_list = []
+
+    for mod in data.values():
+        for area in mod.get("areas", []):
+            for section in ("mainline", "branches", "forward"):
+                for p in area.get(section, []):
+                    arxiv_id = p.get("arxiv")
+                    if arxiv_id and arxiv_id not in arxiv_seen:
+                        arxiv_seen.add(arxiv_id)
+                        arxiv_list.append((arxiv_id, p.get("title", "")))
+                    pdf_url = p.get("pdf")
+                    if pdf_url and pdf_url not in hf_seen:
+                        hf_seen.add(pdf_url)
+                        # Derive the cache filename from the URL. The scheme is
+                        # kept exactly as before so already cached files match.
+                        name = pdf_url.split("/")[-1].replace(".pdf", "")
+                        hf_list.append((pdf_url, name))
+
+    return arxiv_list, hf_list
+
+
+def _write_atomic(path: Path, payload: bytes) -> None:
+    """Write *payload* to *path* through a sibling ``.part`` file and a rename."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = path.with_name(path.name + ".part")
+    tmp.write_bytes(payload)
+    tmp.replace(path)
+
+
+def _safe_hf_name(filename: str) -> str:
+    """Return the sanitised stem under which an HF PDF is cached."""
+    return filename.replace("..", "").replace("/", "_")
+
+
+def download_arxiv(client: httpx.Client, arxiv_id: str, title: str,
+                   overwrite: bool = False) -> bool:
+    """Download a single arXiv PDF into ARXIV_DIR.
+
+    Args:
+        client: HTTP client used for the request.
+        arxiv_id: arXiv identifier; also the cache filename stem.
+        title: paper title, only used in log messages.
+        overwrite: when False (default) an existing cached file is kept and
+            reported as success; when True the file is fetched again.
+
+    Returns:
+        True if the PDF is available locally afterwards, False on any failure.
+    """
+    pdf_path = ARXIV_DIR / f"{arxiv_id}.pdf"
+    if pdf_path.exists() and not overwrite:
+        log.debug(f"Skip (exists): {arxiv_id}")
+        return True
+
+    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+    try:
+        resp = client.get(url, follow_redirects=True, timeout=30)
+        resp.raise_for_status()
+
+        # Verify it's actually a PDF (arxiv returns HTML for missing papers)
+        content_type = resp.headers.get("content-type", "")
+        if "pdf" not in content_type and not resp.content.startswith(b"%PDF"):
+            log.warning(f"Not a PDF: {arxiv_id} — {title[:60]}")
+            return False
+
+        # Atomic write: a crash mid-write must not leave a truncated file that
+        # the next incremental run would treat as cached.
+        _write_atomic(pdf_path, resp.content)
+        size_kb = len(resp.content) / 1024
+        log.info(f"OK: {arxiv_id} ({size_kb:.0f} KB) — {title[:60]}")
+        return True
+    except httpx.HTTPError as e:
+        log.error(f"HTTP error {arxiv_id}: {e}")
+        return False
+    except Exception as e:
+        # Best-effort batch job: one bad paper must not abort the whole run.
+        log.error(f"Error {arxiv_id}: {e}")
+        return False
+
+
+def download_hf(client: httpx.Client, url: str, filename: str,
+                overwrite: bool = False) -> bool:
+    """Download a single HuggingFace-hosted PDF into HF_DIR.
+
+    Args:
+        client: HTTP client used for the request.
+        url: full URL of the PDF.
+        filename: cache filename stem; it is sanitised before use.
+        overwrite: when False (default) an existing cached file is kept and
+            reported as success; when True the file is fetched again.
+
+    Returns:
+        True if the PDF is available locally afterwards, False on any failure.
+    """
+    safe_name = _safe_hf_name(filename)
+    pdf_path = HF_DIR / f"{safe_name}.pdf"
+    if pdf_path.exists() and not overwrite:
+        log.debug(f"Skip (exists): {safe_name}")
+        return True
+
+    try:
+        resp = client.get(url, follow_redirects=True, timeout=60)
+        resp.raise_for_status()
+
+        if not resp.content.startswith(b"%PDF"):
+            log.warning(f"Not a PDF: {safe_name}")
+            return False
+
+        _write_atomic(pdf_path, resp.content)
+        size_kb = len(resp.content) / 1024
+        log.info(f"OK (HF): {safe_name} ({size_kb:.0f} KB)")
+        return True
+    except httpx.HTTPError as e:
+        log.error(f"HTTP error {safe_name}: {e}")
+        return False
+    except Exception as e:
+        # Best-effort batch job: one bad paper must not abort the whole run.
+        log.error(f"Error {safe_name}: {e}")
+        return False
+
+
+def run(incremental: bool = True, limit: int = 0, delay: float = 1.0):
+    """Download all PDFs listed in papers.json.
+
+    Args:
+        incremental: True skips files that are already cached; False fetches
+            every file again and replaces the cached copy.
+        limit: 0 means everything; N keeps only the first N entries of the
+            arXiv list and the first N entries of the HF list (cached entries
+            count towards N).
+        delay: seconds to sleep after each download attempt.
+    """
+    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
+    HF_DIR.mkdir(parents=True, exist_ok=True)
+
+    arxiv_list, hf_list = collect_urls()
+    total = len(arxiv_list) + len(hf_list)
+    log.info(f"Found {len(arxiv_list)} arXiv + {len(hf_list)} HF = {total} PDFs to download")
+    log.info(f"Incremental: {incremental}, Delay: {delay}s")
+
+    if not incremental:
+        log.warning("Non-incremental mode: will re-download existing files")
+
+    # Count existing. HF files are looked up under the same sanitised name
+    # that download_hf() writes them to.
+    arxiv_existing = sum(1 for aid, _ in arxiv_list if (ARXIV_DIR / f"{aid}.pdf").exists())
+    hf_existing = sum(
+        1 for _, name in hf_list if (HF_DIR / f"{_safe_hf_name(name)}.pdf").exists()
+    )
+    log.info(f"Already cached: {arxiv_existing} arXiv + {hf_existing} HF")
+
+    if limit > 0:
+        arxiv_list = arxiv_list[:limit]
+        hf_list = hf_list[:limit]
+
+    ok, fail = 0, 0
+    total_size = 0.0
+    # Without this flag the helpers return early on existing files and a
+    # non-incremental run would download nothing.
+    overwrite = not incremental
+
+    with httpx.Client(
+        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
+        timeout=30,
+        follow_redirects=True,
+    ) as client:
+
+        # Download arXiv
+        for arxiv_id, title in tqdm(arxiv_list, desc="arXiv"):
+            pdf_path = ARXIV_DIR / f"{arxiv_id}.pdf"
+            if incremental and pdf_path.exists():
+                ok += 1
+                continue
+            if download_arxiv(client, arxiv_id, title, overwrite=overwrite):
+                ok += 1
+                if pdf_path.exists():
+                    total_size += pdf_path.stat().st_size
+            else:
+                fail += 1
+            time.sleep(delay)
+
+        # Download HF
+        for url, name in tqdm(hf_list, desc="HF   "):
+            pdf_path = HF_DIR / f"{_safe_hf_name(name)}.pdf"
+            if incremental and pdf_path.exists():
+                ok += 1
+                continue
+            if download_hf(client, url, name, overwrite=overwrite):
+                ok += 1
+                if pdf_path.exists():
+                    total_size += pdf_path.stat().st_size
+            else:
+                fail += 1
+            time.sleep(delay)
+
+    log.info(f"Done: {ok} OK, {fail} failed, {total_size/1024/1024:.1f} MB total")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: map the three flags onto run()'s arguments.
+    import argparse
+
+    cli = argparse.ArgumentParser(description="下载论文 PDF 到本地缓存")
+    cli.add_argument(
+        "--no-incremental",
+        action="store_true",
+        help="重新下载所有 (默认跳过已有)",
+    )
+    cli.add_argument(
+        "--limit",
+        type=int,
+        default=0,
+        help="限制下载数量 (0=全部)",
+    )
+    cli.add_argument(
+        "--delay",
+        type=float,
+        default=1.0,
+        help="请求间延迟 (秒)",
+    )
+    opts = cli.parse_args()
+    run(
+        incremental=not opts.no_incremental,
+        limit=opts.limit,
+        delay=opts.delay,
+    )
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""Build papers.json by regex-extracting paper entries from llm_library.html"""
+import re, json, os
+
+ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+html_path = os.path.join(ROOT, 'llm_library.html')
+
+# The page and the JSON written below contain non-ASCII text, so both files
+# are opened as UTF-8 explicitly instead of with the locale encoding.
+with open(html_path, 'r', encoding='utf-8') as f:
+    html = f.read()
+
+# Extract all paper entries with one regex.
+# Pattern: { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }
+# Either arxiv:"..." or pdf:"..." (or neither) may appear before tags.
+paper_re = re.compile(
+    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
+    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
+    r'tags:\s*\[(.*?)\]\s*\}',
+    re.DOTALL
+)
+
+
+def _last_match(pattern, text):
+    """Return the final match of *pattern* in *text*, or None if there is none."""
+    found = None
+    for found in re.finditer(pattern, text):
+        pass
+    return found
+
+
+papers = []
+for m in paper_re.finditer(html):
+    title = m.group(1)
+    authors = m.group(2)
+    year = int(m.group(3))
+    venue = m.group(4)
+    arxiv = m.group(5) or None
+    pdf = m.group(6) or None
+    tags_str = m.group(7)
+    tags = re.findall(r'"([^"]*)"', tags_str)
+
+    # Find which module/area this paper belongs to by looking at the text
+    # that precedes it (at most 3000 characters).
+    pos = m.start()
+    before = html[max(0,pos-3000):pos]
+
+    # The enclosing module/area is the header closest to the paper, i.e. the
+    # LAST match in the window. re.search would return the earliest one and
+    # attribute the paper to a preceding module/area whenever the window
+    # spans more than one header.
+    mod_match = _last_match(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"', before)
+    if not mod_match:
+        # Try broader pattern
+        mod_match = _last_match(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"', before)
+    if mod_match:
+        mod_id = mod_match.group(1)
+        mod_name = mod_match.group(2)
+    else:
+        mod_id = 'unknown'
+        mod_name = 'Unknown'
+
+    # Find area id
+    area_match = _last_match(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"', before)
+    if area_match:
+        area_id = area_match.group(1)
+        area_name = area_match.group(2)
+    else:
+        area_id = 'unknown'
+        area_name = 'Unknown'
+
+    papers.append({
+        'module': mod_id,
+        'module_name': mod_name,
+        'area': area_id,
+        'area_name': area_name,
+        'title': title,
+        'authors': authors,
+        'year': year,
+        'venue': venue,
+        'arxiv': arxiv,
+        'pdf': pdf,
+        'tags': tags,
+    })
+
+print(f'Extracted {len(papers)} papers')
+
+# The papers are saved as a flat list for verification; grouping by
+# module → area → section (mainline/branches/forward) is not done here.
+
+# Also extract module metadata (only printed below, not written to disk)
+modules = {}
+for m in re.finditer(r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\"", html):
+    mod_id = m.group(1)
+    modules[mod_id] = {
+        'name': m.group(2),
+        'icon': m.group(3),
+        'desc': m.group(4),
+        'color': mod_id,
+        'areas': []
+    }
+
+print(f'Found {len(modules)} modules')
+for mod_id, mod in modules.items():
+    print(f'  {mod_id}: {mod["name"]}')
+
+# Save the flat list.
+# NOTE(review): this overwrites data/papers.json with a LIST; confirm that no
+# consumer expecting a dict keyed by module id reads the file afterwards.
+output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json')
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+with open(output_path, 'w', encoding='utf-8') as f:
+    json.dump(papers, f, ensure_ascii=False, indent=2)
+
+print(f'Saved {len(papers)} papers (flat) to {output_path}')
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
+import re, json, os
+
+HTML = '/app/working/workspaces/default/llm_library.html'
+JSON = '/app/working/workspaces/default/llm-library/data/papers.json'
+
+# The page and the JSON written below contain non-ASCII text, so both files
+# are opened as UTF-8 explicitly instead of with the locale encoding.
+with open(HTML, encoding='utf-8') as f:
+    html = f.read()
+
+# Cut out the body of the PAPER_DATA object literal: everything after its
+# opening brace up to the next 'APP STATE' marker. The start offset is taken
+# from the marker's real length so the first data line keeps its full
+# indentation, and the end marker is only searched for after the start.
+_MARKER = 'const PAPER_DATA = {'
+s = html.index(_MARKER)
+e = html.index('APP STATE', s)
+block = html[s + len(_MARKER):e]
+
+modules = {}
+current_mod = None
+current_area = None
+current_section = 'mainline'
+
+# The parser is line- and indentation-driven: module headers sit at indent 2,
+# module metadata at indent 4, area headers and section openers at indent 8.
+for line in block.split('\n'):
+    stripped = line.strip()
+    if not stripped:
+        continue
+    indent = len(line) - len(line.lstrip())
+
+    # Module start: "  arch: {" at indent 2
+    if indent == 2 and re.match(r'^\w+:\s*\{', stripped):
+        mid = stripped.split(':')[0]
+        # Store the module that just ended; modules that never got a name are dropped.
+        if current_mod and current_mod.get('name'):
+            modules[current_mod['id']] = current_mod
+        current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '', 'color': mid, 'areas': []}
+        current_area = None
+        current_section = 'mainline'
+        continue
+
+    if not current_mod:
+        continue
+
+    # Module metadata at indent 4
+    if indent == 4:
+        m = re.match(r'(\w+):\s*"([^"]*)"', stripped)
+        if m and m.group(1) in ('name', 'icon', 'desc', 'color'):
+            current_mod[m.group(1)] = m.group(2)
+
+    # Area header at indent 8
+    if indent == 8:
+        m_id = re.match(r'id:\s*"(\w+)"', stripped)
+        if m_id:
+            current_area = {'id': m_id.group(1), 'name': '', 'mainline': [], 'branches': [], 'forward': []}
+            current_mod['areas'].append(current_area)
+            current_section = 'mainline'
+            continue
+        m_name = re.match(r'name:\s*"([^"]+)"', stripped)
+        if m_name and current_area:
+            current_area['name'] = m_name.group(1)
+            continue
+        if re.match(r'mainline:\s*\[', stripped):
+            current_section = 'mainline'
+        elif re.match(r'branches:\s*\[', stripped):
+            current_section = 'branches'
+        elif re.match(r'forward:\s*\[', stripped):
+            current_section = 'forward'
+
+    # Paper entry: a single line holding the whole "{ title: ..., tags: [...] }" object
+    if stripped.startswith('{ title:') and 'tags:' in stripped and current_area:
+        title = re.search(r'title:\s*"([^"]+)"', stripped)
+        authors = re.search(r'authors:\s*"([^"]*?)"', stripped)
+        year = re.search(r'year:\s*(\d+)', stripped)
+        venue = re.search(r'venue:\s*"([^"]*?)"', stripped)
+        arxiv = re.search(r'arxiv:\s*"(\S+?)"', stripped)
+        pdf = re.search(r'pdf:\s*"(https:[^"]+)"', stripped)
+        tags_m = re.search(r'tags:\s*\[(.*?)\]', stripped, re.DOTALL)
+
+        if title and year and tags_m:
+            tags = re.findall(r'"([^"]*)"', tags_m.group(1))
+            entry = {
+                'title': title.group(1),
+                'authors': authors.group(1) if authors else '',
+                'year': int(year.group(1)),
+                'venue': venue.group(1) if venue else '',
+                'tags': tags
+            }
+            # arxiv/pdf keys are only present when the source line has them
+            if arxiv and arxiv.group(1):
+                entry['arxiv'] = arxiv.group(1)
+            if pdf and pdf.group(1):
+                entry['pdf'] = pdf.group(1)
+            current_area[current_section].append(entry)
+
+# Save last module
+if current_mod and current_mod.get('name'):
+    modules[current_mod['id']] = current_mod
+
+# Count
+total = sum(
+    len(a.get('mainline',[])) + len(a.get('branches',[])) + len(a.get('forward',[]))
+    for m in modules.values() for a in m.get('areas',[])
+)
+
+print(f'Parsed: {len(modules)} modules, {total} papers')
+for mid, m in sorted(modules.items()):
+    pc = sum(len(a.get('mainline',[]))+len(a.get('branches',[]))+len(a.get('forward',[])) for a in m['areas'])
+    print(f'  {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers')
+
+os.makedirs(os.path.dirname(JSON), exist_ok=True)
+with open(JSON, 'w', encoding='utf-8') as f:
+    json.dump(modules, f, ensure_ascii=False, indent=2)
+print(f'\nSaved to {JSON}')
@@ -0,0 +1,484 @@
+"""
+LLM 论文图书馆 — FastAPI 后端
+提供 REST API 进行论文查询、管理、PDF 代理服务
+"""
+
+import json
+import os
+import hashlib
+import secrets
+import logging
+from pathlib import Path
+from typing import Optional
+
+from fastapi import FastAPI, HTTPException, Query, Depends, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+
+# ─── Config ────────────────────────────────────────────
+# Repository root (this module lives one directory below it) and the paths
+# derived from it.
+ROOT = Path(__file__).resolve().parent.parent
+DATA_FILE = ROOT / "data" / "papers.json"
+PAPERS_DIR = ROOT / "papers"
+# Shared secret checked by verify_api_key(); taken from the environment.
+API_KEY = os.environ.get("LLM_LIB_API_KEY", "change-me")
+
+log = logging.getLogger("llm-library")
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+# Running with the built-in fallback key means anyone who knows the default
+# can call the write endpoints, so make that visible in the log.
+if API_KEY == "change-me":
+    log.warning("LLM_LIB_API_KEY is not set; write endpoints accept the default key")
+
+# ─── App ───────────────────────────────────────────────
+app = FastAPI(
+    title="LLM 论文图书馆",
+    description="大模型论文知识库 API — 查询、搜索、管理论文",
+    version="0.1.0",
+)
+
+# CORS is fully open: any origin, method and header is allowed.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# ─── Auth ──────────────────────────────────────────────
+def verify_api_key(request: Request):
+    """Authenticate a request against API_KEY.
+
+    The token is read from an ``Authorization: Bearer <token>`` header; when
+    that header is absent or has another scheme, the ``api_key`` query
+    parameter is used instead.
+
+    Returns:
+        True when the token matches.
+
+    Raises:
+        HTTPException: 401 when the token is missing or does not match.
+    """
+    auth = request.headers.get("Authorization", "")
+    if auth.startswith("Bearer "):
+        token = auth[7:]
+    else:
+        token = request.query_params.get("api_key", "")
+    # Constant-time comparison so response timing does not leak how much of
+    # the key matched. Compared as bytes because compare_digest() rejects
+    # non-ASCII str arguments.
+    if not token or not secrets.compare_digest(
+        token.encode("utf-8"), API_KEY.encode("utf-8")
+    ):
+        raise HTTPException(status_code=401, detail="Invalid or missing API key")
+    return True
+
+# ─── Data loading ──────────────────────────────────────
+def load_data():
+    """Load and return the parsed contents of DATA_FILE.
+
+    Returns an empty dict when the file does not exist.
+    """
+    try:
+        # Explicit UTF-8: the file is written with ensure_ascii=False and so
+        # contains raw non-ASCII characters.
+        with open(DATA_FILE, 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        return {}
+
+def save_data(data):
+    """Serialise *data* to DATA_FILE as indented UTF-8 JSON.
+
+    The JSON is written to a temporary sibling file and then moved over
+    DATA_FILE, so a crash or serialisation error part-way through cannot
+    leave the data file truncated.
+    """
+    tmp_path = DATA_FILE.with_name(DATA_FILE.name + ".tmp")
+    with open(tmp_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+    os.replace(tmp_path, DATA_FILE)
+
+# ─── Paper CRUD helpers ────────────────────────────────
+def find_paper(data, module_id, area_id, title):
+    """Locate a paper by exact title inside one area of one module.
+
+    Returns a ``(module, area, section_name, index)`` tuple for the first
+    hit, scanning the sections in the order mainline, branches, forward.
+    Returns four ``None`` values when the module is missing or nothing
+    matches.
+    """
+    not_found = (None, None, None, None)
+    module = data.get(module_id)
+    if not module:
+        return not_found
+    for candidate in module.get("areas", []):
+        if candidate["id"] != area_id:
+            continue
+        for section_name in ("mainline", "branches", "forward"):
+            entries = candidate.get(section_name, [])
+            for position, entry in enumerate(entries):
+                if entry["title"] == title:
+                    return module, candidate, section_name, position
+    return not_found
+
+# ─── Routes: Query ─────────────────────────────────────
+@app.get("/api/stats")
+def get_stats():
+    """Return library-wide counts: modules, areas, papers and papers per section."""
+    library = load_data()
+    section_names = ("mainline", "branches", "forward")
+    per_section = dict.fromkeys(section_names, 0)
+    area_total = 0
+    for module in library.values():
+        module_areas = module.get("areas", [])
+        area_total += len(module_areas)
+        for area in module_areas:
+            for name in section_names:
+                per_section[name] += len(area.get(name, []))
+    return {
+        "modules": len(library),
+        "areas": area_total,
+        # Every paper lives in exactly one section list, so the total is the sum.
+        "papers": sum(per_section.values()),
+        "sections": per_section,
+        "data_file": str(DATA_FILE),
+    }
+
+@app.get("/api/modules")
+def list_modules():
+    """List every module with its metadata and area/paper counts (no paper details)."""
+
+    def summarize(module_id, module):
+        # Build the summary row for one module.
+        module_areas = module.get("areas", [])
+        paper_total = 0
+        for area in module_areas:
+            paper_total += (
+                len(area.get("mainline", []))
+                + len(area.get("branches", []))
+                + len(area.get("forward", []))
+            )
+        return {
+            "id": module_id,
+            "name": module["name"],
+            "icon": module["icon"],
+            "desc": module["desc"],
+            "area_count": len(module_areas),
+            "paper_count": paper_total,
+        }
+
+    return [summarize(mid, module) for mid, module in load_data().items()]
+
+@app.get("/api/modules/{module_id}")
+def get_module(module_id: str):
+    """Return one module with all of its areas and papers, or 404 if unknown."""
+    module = load_data().get(module_id)
+    if module:
+        return module
+    raise HTTPException(status_code=404, detail=f"Module '{module_id}' not found")
+
+@app.get("/api/papers")
+def search_papers(
+    q: str = Query(default="", description="搜索关键词: 标题/作者"),
+    module: Optional[str] = Query(default=None),
+    tag: Optional[str] = Query(default=None, description="起点/关键节点/前沿/前瞻/支线"),
+    limit: int = Query(default=50, ge=1, le=200),
+):
+    """Search papers by keyword, module id and/or tag.
+
+    The keyword is matched case-insensitively as a substring of the paper's
+    title followed directly by its authors. Hits are returned in file order,
+    each paper's own fields merged with its module/area/section context, and
+    the scan stops once ``limit`` hits have been collected.
+    """
+    needle = q.lower()
+
+    def iter_hits():
+        # Lazily yield matching papers in module → area → section → paper order.
+        for mid, mod in load_data().items():
+            if module and mid != module:
+                continue
+            for area in mod.get("areas", []):
+                for section in ("mainline", "branches", "forward"):
+                    for paper in area.get(section, []):
+                        if tag and tag not in paper.get("tags", []):
+                            continue
+                        haystack = paper.get("title", "") + paper.get("authors", "")
+                        if needle and needle not in haystack.lower():
+                            continue
+                        yield {
+                            "module_id": mid,
+                            "module_name": mod["name"],
+                            "area_id": area["id"],
+                            "area_name": area["name"],
+                            "section": section,
+                            **paper,
+                        }
+
+    hits = []
+    for hit in iter_hits():
+        hits.append(hit)
+        if len(hits) >= limit:
+            break
+    return hits
+
+# ─── Routes: Management (写操作, 需 API Key) ────────────
+# Request body for POST /api/papers: where the paper goes (module, area,
+# section) plus the paper's own fields.
+class PaperCreate(BaseModel):
+    # Target location of the new entry.
+    module_id: str
+    area_id: str
+    section: str = "mainline"  # mainline / branches / forward
+    # Paper fields; title and year are required, the rest have defaults.
+    title: str
+    authors: str = ""
+    year: int
+    venue: str = ""
+    # Optional references; add_paper() only stores the ones that are set.
+    arxiv: Optional[str] = None
+    pdf: Optional[str] = None
+    tags: list[str] = []
+
+# Request body for PUT /api/papers. Every field is optional; update_paper()
+# leaves a field untouched when its value here is None.
+class PaperUpdate(BaseModel):
+    authors: Optional[str] = None
+    year: Optional[int] = None
+    venue: Optional[str] = None
+    arxiv: Optional[str] = None
+    pdf: Optional[str] = None
+    tags: Optional[list[str]] = None
+    section: Optional[str] = None  # move to different section
+
+@app.post("/api/papers", dependencies=[Depends(verify_api_key)])
+def add_paper(paper: PaperCreate):
+    """Append a new paper to one section of an existing area.
+
+    Raises:
+        HTTPException: 404 when the module or the area does not exist,
+            400 when the section name is not mainline/branches/forward.
+    """
+    data = load_data()
+    mod = data.get(paper.module_id)
+    if not mod:
+        raise HTTPException(status_code=404, detail="Module not found")
+
+    # .get(): a module without an "areas" key is treated as having no areas,
+    # as in the read routes, instead of failing with a KeyError.
+    area = next((a for a in mod.get("areas", []) if a["id"] == paper.area_id), None)
+    if not area:
+        raise HTTPException(status_code=404, detail="Area not found")
+
+    section = paper.section
+    if section not in ("mainline", "branches", "forward"):
+        raise HTTPException(status_code=400, detail="section must be mainline/branches/forward")
+
+    entry = {
+        "title": paper.title,
+        "authors": paper.authors,
+        "year": paper.year,
+        "venue": paper.venue,
+        "tags": paper.tags,
+    }
+    # The optional references are stored only when they were provided.
+    if paper.arxiv:
+        entry["arxiv"] = paper.arxiv
+    if paper.pdf:
+        entry["pdf"] = paper.pdf
+
+    area.setdefault(section, []).append(entry)
+    save_data(data)
+    log.info(f"Added paper: {paper.title}")
+    return {"ok": True, "title": paper.title}
+
+@app.put("/api/papers")
+def update_paper(
+    module_id: str,
+    area_id: str,
+    title: str,
+    update: PaperUpdate,
+    _=Depends(verify_api_key),
+):
+    """Update fields of an existing paper and optionally move it to another section.
+
+    The paper is identified by module id, area id and exact title. Only
+    fields that are not None in the request body are changed.
+
+    Raises:
+        HTTPException: 404 when the paper is not found, 400 when the
+            requested target section is not mainline/branches/forward.
+    """
+    library = load_data()
+    mod, area, section, idx = find_paper(library, module_id, area_id, title)
+    if mod is None:
+        raise HTTPException(status_code=404, detail="Paper not found")
+
+    entry = area[section][idx]
+    editable = ("authors", "year", "venue", "arxiv", "pdf", "tags")
+    requested = {name: getattr(update, name) for name in editable}
+    entry.update({name: value for name, value in requested.items() if value is not None})
+
+    # Move to a different section when one is requested and it differs.
+    target = update.section
+    if target and target != section:
+        if target not in ("mainline", "branches", "forward"):
+            raise HTTPException(status_code=400, detail="Invalid section")
+        area[section].pop(idx)
+        area.setdefault(target, []).append(entry)
+
+    save_data(library)
+    log.info(f"Updated paper: {title}")
+    return {"ok": True, "title": title}
+
+@app.delete("/api/papers")
+def delete_paper(
+    module_id: str,
+    area_id: str,
+    title: str,
+    _=Depends(verify_api_key),
+):
+    """Remove a paper, identified by module id, area id and exact title.
+
+    Raises:
+        HTTPException: 404 when no such paper exists.
+    """
+    library = load_data()
+    located = find_paper(library, module_id, area_id, title)
+    if located[0] is None:
+        raise HTTPException(status_code=404, detail="Paper not found")
+
+    _module, area, section, position = located
+    del area[section][position]
+    save_data(library)
+    log.info(f"Deleted paper: {title}")
+    return {"ok": True, "title": title}
+
+# ─── Routes: PDF proxy ──────────────────────────────────
+@app.get("/papers/arxiv/{arxiv_id}.pdf")
+@app.get("/papers/arxiv/{arxiv_id}")
+def serve_arxiv_pdf(arxiv_id: str):
+    """Serve an arXiv PDF from the local cache.
+
+    Reachable with and without a ``.pdf`` suffix; the suffix-less form exists
+    so that download managers (IDM) do not intercept the request.
+
+    Raises:
+        HTTPException: 404 when the id is not a plain filename or the PDF is
+            not in the cache.
+    """
+    # Decorators apply bottom-up, so the suffix-less route is registered
+    # first and also captures "<id>.pdf"; strip the suffix here, otherwise
+    # the lookup would be for "<id>.pdf.pdf".
+    arxiv_id = arxiv_id.removesuffix(".pdf")
+
+    # The id becomes part of a filesystem path: refuse anything that could
+    # point outside the cache directory.
+    if "/" in arxiv_id or "\\" in arxiv_id or ".." in arxiv_id:
+        raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}")
+
+    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    if not pdf_path.is_file():
+        raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}")
+    return FileResponse(
+        pdf_path, media_type="application/pdf",
+        headers={"Cache-Control": "public, max-age=86400"},
+    )
+
+@app.get("/papers/hf/{filename}.pdf")
+@app.get("/papers/hf/{filename}")
+def serve_hf_pdf(filename: str):
+    """Serve a HuggingFace PDF from the local cache.
+
+    Reachable with and without a ``.pdf`` suffix; the suffix-less form exists
+    so that download managers (IDM) do not intercept the request. The name
+    is sanitised before it is used as a path component.
+    """
+    stem = filename.replace("..", "").replace("/", "_").removesuffix(".pdf")
+    cached_pdf = PAPERS_DIR / "hf" / (stem + ".pdf")
+    if cached_pdf.exists():
+        return FileResponse(
+            cached_pdf,
+            media_type="application/pdf",
+            headers={"Cache-Control": "public, max-age=86400"},
+        )
+    raise HTTPException(status_code=404, detail=f"PDF not in local cache: {filename}")
+
+# ─── Routes: Translation ───────────────────────────────
+# Directory holding one cached translation result (<arxiv_id>.json) per paper.
+TRANSLATE_CACHE = ROOT / "data" / "translations"
+# Created when this module is imported, so the routes can assume it exists.
+TRANSLATE_CACHE.mkdir(parents=True, exist_ok=True)
+
+def extract_pdf_text_with_pages(pdf_path: Path, max_chars: int = 12000) -> list[dict]:
+    """Extract the text of a PDF page by page using ``pdftotext`` (Poppler).
+
+    Args:
+        pdf_path: PDF to read; its stem is taken to be the arXiv id when the
+            arXiv stamp is stripped.
+        max_chars: extraction stops after the page on which the running
+            character total reaches this value.
+
+    Returns:
+        ``[{"page": <1-based page number>, "text": <stripped page text>}, ...]``
+        with empty pages left out.
+
+    Raises:
+        HTTPException: 500 when pdftotext cannot be run, times out, exits
+            non-zero, or yields no text.
+    """
+    import re
+    import subprocess
+
+    try:
+        result = subprocess.run(
+            ["pdftotext", "-layout", "-q", str(pdf_path), "-"],
+            # Decode as UTF-8 explicitly instead of with the locale encoding;
+            # undecodable bytes are replaced rather than raising.
+            capture_output=True, text=True, encoding="utf-8", errors="replace",
+            timeout=30,
+        )
+    except (OSError, subprocess.TimeoutExpired) as exc:
+        # Missing binary or a hung conversion: report it like any other
+        # extraction failure instead of letting the raw exception escape.
+        log.error(f"pdftotext could not be run: {exc}")
+        raise HTTPException(status_code=500, detail="PDF text extraction failed") from exc
+
+    if result.returncode != 0:
+        log.error(f"pdftotext failed: {result.stderr}")
+        raise HTTPException(status_code=500, detail="PDF text extraction failed")
+
+    text = result.stdout
+
+    # Remove the arXiv stamp: everything from "arXiv:<id>" up to the next
+    # blank line. The file-name form ("arXiv:<id>.pdf") is tried as well.
+    stem = pdf_path.stem  # e.g. "1706.03762"
+    text = re.sub(r'arXiv:' + re.escape(stem) + r'.*?\n\n', '', text, flags=re.DOTALL)
+    text = re.sub(r'arXiv:' + re.escape(f"{stem}.pdf") + r'.*?\n\n', '', text, flags=re.DOTALL)
+
+    # Split by form-feed (page break)
+    result_pages = []
+    total = 0
+    for number, page_text in enumerate(text.split('\f'), start=1):
+        stripped = page_text.strip()
+        if not stripped:
+            continue
+        result_pages.append({"page": number, "text": stripped})
+        total += len(stripped)
+        if total >= max_chars:
+            break
+
+    if not result_pages:
+        raise HTTPException(status_code=500, detail="No text extracted from PDF")
+
+    return result_pages
+
+
+def split_text_with_pages(page_texts: list[dict], max_len: int = 400) -> list[dict]:
+    """Split per-page text into paragraph-sized chunks, keeping the page number.
+
+    Pages are split on blank lines. A paragraph of at most ``max_len``
+    characters becomes one chunk; a longer one is split after ". ", "? " or
+    "! " and the sentences are packed greedily into chunks of at most
+    ``max_len`` characters. A single sentence longer than ``max_len`` is
+    emitted as one oversized chunk.
+
+    Args:
+        page_texts: ``[{"page": int, "text": str}, ...]``.
+        max_len: target maximum chunk length in characters.
+
+    Returns:
+        ``[{"page": int, "text": str}, ...]`` in reading order.
+    """
+    import re
+
+    # Split at the space that follows sentence-ending punctuation. A regex
+    # avoids using "|" as an in-band delimiter, which silently dropped (and
+    # split on) literal pipe characters in the text.
+    sentence_break = re.compile(r"(?<=[.?!]) ")
+
+    chunks = []
+    for pt in page_texts:
+        page = pt["page"]
+        for raw in pt["text"].split("\n\n"):
+            para = raw.strip()
+            if not para:
+                continue
+            if len(para) <= max_len:
+                chunks.append({"page": page, "text": para})
+                continue
+            current = ""
+            for sentence in sentence_break.split(para):
+                sentence = sentence.strip()
+                if not sentence:
+                    continue
+                # +1 accounts for the joining space
+                if len(current) + len(sentence) + 1 <= max_len:
+                    current = (current + " " + sentence).strip()
+                else:
+                    if current:
+                        chunks.append({"page": page, "text": current})
+                    current = sentence
+            if current:
+                chunks.append({"page": page, "text": current})
+    return chunks
+
+
+def translate_text(text: str, source: str = "en", target: str = "zh") -> str:
+    """Translate *text* with the free MyMemory HTTP API.
+
+    This is best-effort: on a network error, an unparsable reply, a
+    non-200 ``responseStatus`` or a reply without a translated string, the
+    original *text* is returned unchanged.
+
+    Args:
+        text: text to translate.
+        source: source language code.
+        target: target language code.
+
+    Returns:
+        The translated text, or *text* itself when translation failed.
+    """
+    import urllib.request
+    import urllib.parse
+
+    url = "https://api.mymemory.translated.net/get"
+    params = urllib.parse.urlencode({
+        "q": text,
+        "langpair": f"{source}|{target}",
+        "mt": "1",  # Force machine translation, not memory
+        "de": "me@llm-library.local",
+    })
+    full_url = f"{url}?{params}"
+
+    try:
+        with urllib.request.urlopen(full_url, timeout=15) as resp:
+            data = json.loads(resp.read())
+    except Exception as e:
+        # Deliberately broad: any transport or decoding problem falls back
+        # to the untranslated text.
+        log.warning(f"Translation API error: {e}")
+        return text
+
+    # Validate the reply shape before indexing into it, so an unexpected
+    # payload cannot raise or hand back a non-string.
+    if not isinstance(data, dict) or data.get("responseStatus") != 200:
+        return text
+    payload = data.get("responseData")
+    if not isinstance(payload, dict):
+        return text
+    translated = payload.get("translatedText")
+    return translated if isinstance(translated, str) else text
+
+
+@app.get("/api/translate/{arxiv_id}")
+def translate_paper(arxiv_id: str):
+    """Translate the body text of a cached arXiv PDF, paragraph by paragraph.
+
+    The result is cached as JSON under TRANSLATE_CACHE and served from there
+    on later calls. Each paragraph carries its page number, the extracted
+    text (``en``) and the translation (``zh``).
+
+    Raises:
+        HTTPException: 404 when the PDF is not in the local cache; errors
+            from text extraction propagate unchanged.
+    """
+    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    if not pdf_path.exists():
+        raise HTTPException(status_code=404, detail=f"PDF not cached: {arxiv_id}")
+
+    cache_file = TRANSLATE_CACHE / f"{arxiv_id}.json"
+    if cache_file.exists():
+        try:
+            # The cache is written with ensure_ascii=False, so read it as UTF-8.
+            with open(cache_file, encoding="utf-8") as f:
+                return json.load(f)
+        except ValueError:
+            # Undecodable or truncated cache file: rebuild it instead of
+            # failing on every later request.
+            log.warning(f"Ignoring unreadable translation cache for {arxiv_id}")
+
+    # Extract text with page numbers
+    log.info(f"Extracting text from {arxiv_id}")
+    page_texts = extract_pdf_text_with_pages(pdf_path)
+    chunks = split_text_with_pages(page_texts)
+    log.info(f"Translating {len(chunks)} paragraphs for {arxiv_id}")
+
+    translated = []
+    for i, chunk in enumerate(chunks):
+        if i % 10 == 0:
+            log.info(f"  [{arxiv_id}] translating paragraph {i+1}/{len(chunks)}")
+        zh = translate_text(chunk["text"])
+        translated.append({
+            "page": chunk["page"],
+            "en": chunk["text"],
+            "zh": zh,
+        })
+
+    result = {"arxiv_id": arxiv_id, "paragraphs": translated, "count": len(translated)}
+
+    # Write to a temporary sibling and rename, so an interrupted write never
+    # leaves a half-written cache file behind.
+    tmp_file = cache_file.with_name(cache_file.name + ".tmp")
+    with open(tmp_file, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False)
+    os.replace(tmp_file, cache_file)
+
+    return result
+
+
+@app.get("/api/translate/{arxiv_id}/status")
+def translate_status(arxiv_id: str):
+    """Report whether a translation cache file and the source PDF exist for an id."""
+    translation_file = TRANSLATE_CACHE / f"{arxiv_id}.json"
+    source_pdf = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    status = {"arxiv_id": arxiv_id}
+    status["cached"] = translation_file.exists()
+    status["pdf_exists"] = source_pdf.exists()
+    return status
+
+# ─── Routes: PDF download on-demand ────────────────────
+@app.post("/api/download/{arxiv_id}")
+def download_single_pdf(arxiv_id: str):
+    """Download one arXiv PDF into the local cache on demand.
+
+    The requested paper is fetched directly from arXiv. Running the batch
+    downloader with ``--limit 1`` only ever tried the first paper listed in
+    papers.json, whatever id was asked for.
+
+    Returns:
+        ``{"ok": bool, "arxiv_id": str, "status": str}`` where status is one
+        of "cached", "downloaded", "failed" or "timeout".
+    """
+    import http.client
+    import re
+    import socket
+    import urllib.error
+    import urllib.request
+
+    def outcome(ok, status):
+        # Build the response body shared by every exit path.
+        return {"ok": ok, "arxiv_id": arxiv_id, "status": status}
+
+    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    if pdf_path.exists():
+        return outcome(True, "cached")
+
+    # The id ends up in a URL and in a filesystem path; accept only plain
+    # identifier characters and no "..".
+    if ".." in arxiv_id or not re.fullmatch(r"[0-9A-Za-z][0-9A-Za-z.\-]*", arxiv_id):
+        return outcome(False, "failed")
+
+    request = urllib.request.Request(
+        f"https://arxiv.org/pdf/{arxiv_id}.pdf",
+        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=60) as resp:
+            payload = resp.read()
+    except socket.timeout:
+        return outcome(False, "timeout")
+    except urllib.error.URLError as exc:
+        # A timeout while connecting arrives wrapped in URLError.
+        if isinstance(exc.reason, socket.timeout):
+            return outcome(False, "timeout")
+        log.warning(f"Download failed for {arxiv_id}: {exc}")
+        return outcome(False, "failed")
+    except (OSError, http.client.HTTPException) as exc:
+        log.warning(f"Download failed for {arxiv_id}: {exc}")
+        return outcome(False, "failed")
+
+    # arXiv answers with an HTML page for unknown ids; only keep real PDFs.
+    if not payload.startswith(b"%PDF"):
+        log.warning(f"Not a PDF: {arxiv_id}")
+        return outcome(False, "failed")
+
+    try:
+        pdf_path.parent.mkdir(parents=True, exist_ok=True)
+        # Write to a sibling file and rename, so a partial file is never
+        # mistaken for a cached PDF.
+        tmp_path = pdf_path.with_name(pdf_path.name + ".part")
+        tmp_path.write_bytes(payload)
+        tmp_path.replace(pdf_path)
+    except OSError as exc:
+        log.warning(f"Could not store {arxiv_id}: {exc}")
+        return outcome(False, "failed")
+
+    return outcome(True, "downloaded")
+
+# ─── Health ─────────────────────────────────────────────
+@app.get("/api/health")
+def health():
+    """Liveness probe: return a fixed status and the API version."""
+    payload = {"status": "ok", "version": "0.1.0"}
+    return payload
+
+# ─── Mount static frontend (at /) ──────────────────────
+# Static files mounted after API routes to avoid conflicts
+static_dir = ROOT / "static"
+# Mount the frontend only when the directory exists and is non-empty;
+# html=True makes StaticFiles serve index.html for directory requests.
+if static_dir.exists() and any(static_dir.iterdir()):
+    app.mount("/", StaticFiles(directory=str(static_dir), html=True), name="static")
+
+# ─── Main ───────────────────────────────────────────────
+# Start the API with uvicorn on all interfaces, port 8000, auto-reload on.
+# NOTE(review): reload=True and the "api.server:app" import string look like
+# a development setup started from the repository root — confirm before
+# using this entry point in the Docker deployment.
+def main():
+    import uvicorn
+    uvicorn.run("api.server:app", host="0.0.0.0", port=8000, reload=True)
+
+if __name__ == "__main__":
+    main()