feat: LLM 论文图书馆 — 初始提交

- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
This commit is contained in:
2026-06-02 10:25:14 +00:00
commit f0ff62e082
21 changed files with 4042 additions and 0 deletions

1
api/__init__.py Normal file
View File

@@ -0,0 +1 @@
# LLM Paper Library — API package

57
api/backfill.py Normal file
View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""Backfill all uncached PDFs, skipping dead arXiv IDs"""
import json, subprocess, sys, time
from pathlib import Path
# Paper catalogue read by main(): <parent of this file's directory>/data/papers.json
PAPERS_JSON = Path(__file__).resolve().parent.parent / "data" / "papers.json"
# Directory scanned for existing "<arxiv id>.pdf" files and used as the download target
ARXIV_DIR = Path(__file__).resolve().parent.parent / "papers" / "arxiv"
# NOTE(review): not referenced by the visible code of this script — confirm before removing
LOG_FILE = Path("/app/papers/backfill.log")
def main():
    """Download every arXiv PDF referenced in papers.json that is not cached yet.

    Collects the distinct ``arxiv`` IDs from the module → area → section
    structure of ``PAPERS_JSON``, compares them with the ``*.pdf`` files already
    present in ``ARXIV_DIR`` and fetches each missing one with ``wget``.
    A download counts as successful only when wget exits with 0 and the file is
    larger than 5000 bytes; anything else is deleted so a later run retries it.
    """
    with open(PAPERS_JSON, encoding="utf-8") as f:
        data = json.load(f)

    # Collect all arxiv IDs
    arxiv_ids = {
        p["arxiv"]
        for mod in data.values()
        for area in mod.get("areas", [])
        for section in ("mainline", "branches", "forward")
        for p in area.get(section, [])
        if p.get("arxiv")
    }

    # wget -O cannot create the target directory itself
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
    # Sorted so that repeated runs process the IDs in the same order
    missing = sorted(arxiv_ids - cached)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")
    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for aid in missing:
        url = f"https://arxiv.org/pdf/{aid}.pdf"
        dest = ARXIV_DIR / f"{aid}.pdf"
        try:
            r = subprocess.run(
                ["wget", "-q", "-T", "15", "-O", str(dest), url],
                timeout=20,
            )
            # Read the size before any cleanup so the failure message reports it
            size = dest.stat().st_size if dest.exists() else 0
            if r.returncode == 0 and size > 5000:
                ok += 1
                print(f" OK {aid} ({size//1024} KB)")
            else:
                dest.unlink(missing_ok=True)
                fail += 1
                print(f" FAIL {aid} (rc={r.returncode}, sz={size})")
        except (subprocess.TimeoutExpired, OSError) as e:
            # Timeout, missing wget binary or filesystem error: drop any partial file
            dest.unlink(missing_ok=True)
            fail += 1
            print(f" ERR {aid} {e}")
        time.sleep(0.8)  # Be nice to arXiv
    print(f"\nDone: {ok} ok, {fail} failed")


if __name__ == "__main__":
    main()

13
api/check_trans.py Normal file
View File

@@ -0,0 +1,13 @@
import urllib.request, json
def main(arxiv_id="2005.14165", base_url="http://127.0.0.1:8000"):
    """Smoke-test the translation endpoint of a locally running server.

    Requests ``/api/translate/<arxiv_id>`` and prints a short preview of the
    first paragraph and, when the response has that many, of paragraph index 5.
    """
    url = f"{base_url}/api/translate/{arxiv_id}"
    # Generous timeout: an uncached paper is translated inside the request
    with urllib.request.urlopen(url, timeout=300) as r:
        data = json.loads(r.read())

    paragraphs = data["paragraphs"]
    if not paragraphs:
        print("no paragraphs returned")
        return

    p = paragraphs[0]
    page1 = p["page"]
    en1 = p["en"][:60]
    zh1 = p["zh"][:80]
    print(f"page={page1}, en={en1}")
    print(f"zh={zh1}")

    # Short papers may have fewer than six paragraphs
    if len(paragraphs) > 5:
        p5 = paragraphs[5]
        page5 = p5["page"]
        zh5 = p5["zh"][:80]
        print(f"para[5] page={page5}, zh={zh5}")


if __name__ == "__main__":
    main()

198
api/downloader.py Normal file
View File

@@ -0,0 +1,198 @@
"""
LLM 论文图书馆 — PDF 下载器
从 arXiv 和 HuggingFace 下载论文 PDF 到本地缓存
"""
import os
import json
import time
import logging
from pathlib import Path
import httpx
from tqdm import tqdm
# Project root: the parent of the directory containing this file
ROOT = Path(__file__).resolve().parent.parent
DATA_FILE = ROOT / "data" / "papers.json"
ARXIV_DIR = ROOT / "papers" / "arxiv"
HF_DIR = ROOT / "papers" / "hf"
LOG_FILE = ROOT / "papers" / "download.log"

# logging.FileHandler opens its file immediately, so the directory must exist
# before basicConfig runs; run() only creates the cache directories later.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

log = logging.getLogger("downloader")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        # Explicit encoding: log lines contain paper titles and "—" separators
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
def collect_urls(data_file: "Path | None" = None) -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
    """Collect every PDF that needs downloading from the paper catalogue.

    Args:
        data_file: catalogue to read; defaults to ``DATA_FILE``.

    Returns:
        arxiv_list: [(arxiv_id, title), ...] — one entry per distinct arXiv ID,
            keeping the title of its first occurrence.
        hf_list: [(url, filename), ...] — one entry per distinct ``pdf`` URL;
            the filename is the last URL segment with ".pdf" removed.
    """
    source = DATA_FILE if data_file is None else data_file
    # The catalogue is written as UTF-8 JSON with non-ASCII text, so do not
    # depend on the locale's default encoding
    with open(source, encoding="utf-8") as f:
        data = json.load(f)

    # dicts keep insertion order, which gives first-seen de-duplication
    arxiv_titles = {}
    hf_names = {}
    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for p in area.get(section, []):
                    arxiv_id = p.get("arxiv")
                    if arxiv_id and arxiv_id not in arxiv_titles:
                        arxiv_titles[arxiv_id] = p.get("title", "")
                    pdf_url = p.get("pdf")
                    if pdf_url and pdf_url not in hf_names:
                        # Derive a filename from the URL
                        hf_names[pdf_url] = pdf_url.split("/")[-1].replace(".pdf", "")
    return list(arxiv_titles.items()), list(hf_names.items())
def download_arxiv(client: httpx.Client, arxiv_id: str, title: str) -> bool:
    """Download a single arXiv PDF into ``ARXIV_DIR``.

    Args:
        client: HTTP client used for the request.
        arxiv_id: arXiv identifier; the file is stored as ``<arxiv_id>.pdf``.
        title: paper title, used only in log messages.

    Returns:
        True when the file already exists or was downloaded, False otherwise.
        Errors are logged, never raised.
    """
    pdf_path = ARXIV_DIR / f"{arxiv_id}.pdf"
    if pdf_path.exists():
        log.debug(f"Skip (exists): {arxiv_id}")
        return True

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Written first and renamed afterwards, so an interrupted write can never
    # leave a truncated "<id>.pdf" that later runs would treat as cached
    part_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        resp = client.get(url, follow_redirects=True, timeout=30)
        resp.raise_for_status()
        # Verify it's actually a PDF (arxiv returns HTML for missing papers)
        content_type = resp.headers.get("content-type", "")
        if "pdf" not in content_type and not resp.content.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {arxiv_id} — {title[:60]}")
            return False
        part_path.write_bytes(resp.content)
        part_path.replace(pdf_path)
        size_kb = len(resp.content) / 1024
        log.info(f"OK: {arxiv_id} ({size_kb:.0f} KB) — {title[:60]}")
        return True
    except httpx.HTTPError as e:
        log.error(f"HTTP error {arxiv_id}: {e}")
        return False
    except Exception as e:
        # Best-effort batch job: log and report failure instead of aborting
        part_path.unlink(missing_ok=True)
        log.error(f"Error {arxiv_id}: {e}")
        return False
def download_hf(client: httpx.Client, url: str, filename: str) -> bool:
    """Download a single PDF from a direct URL into ``HF_DIR``.

    Args:
        client: HTTP client used for the request.
        url: full URL of the PDF.
        filename: base name for the cached file; ".." is removed and "/" is
            replaced by "_" before ".pdf" is appended.

    Returns:
        True when the file already exists or was downloaded, False otherwise.
        Errors are logged, never raised.
    """
    safe_name = filename.replace("..", "").replace("/", "_")
    pdf_path = HF_DIR / f"{safe_name}.pdf"
    if pdf_path.exists():
        log.debug(f"Skip (exists): {safe_name}")
        return True

    # Written first and renamed afterwards, so an interrupted write can never
    # leave a truncated file that later runs would treat as cached
    part_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        resp = client.get(url, follow_redirects=True, timeout=60)
        resp.raise_for_status()
        # Only the "%PDF" magic bytes are trusted here, not the content type
        if not resp.content.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {safe_name}")
            return False
        part_path.write_bytes(resp.content)
        part_path.replace(pdf_path)
        size_kb = len(resp.content) / 1024
        log.info(f"OK (HF): {safe_name} ({size_kb:.0f} KB)")
        return True
    except httpx.HTTPError as e:
        log.error(f"HTTP error {safe_name}: {e}")
        return False
    except Exception as e:
        # Best-effort batch job: log and report failure instead of aborting
        part_path.unlink(missing_ok=True)
        log.error(f"Error {safe_name}: {e}")
        return False
def _hf_cache_path(name: str) -> Path:
    """Return the cache path download_hf() writes for *name* (same sanitising rule)."""
    safe_name = name.replace("..", "").replace("/", "_")
    return HF_DIR / f"{safe_name}.pdf"


def _fetch_one(path: Path, incremental: bool, download, args) -> bool:
    """Call ``download(*args)`` for *path*, forcing a real fetch when not incremental.

    The per-file download functions return early when the target exists, so in
    non-incremental mode the cached copy is moved aside first and put back if
    the fresh download does not succeed.
    """
    backup = None
    if not incremental and path.exists():
        backup = path.with_name(path.name + ".bak")
        path.replace(backup)
    success = False
    try:
        success = download(*args)
    finally:
        if backup is not None:
            if success and path.exists():
                backup.unlink(missing_ok=True)
            else:
                backup.replace(path)
    return success


def _download_batch(jobs, desc: str, incremental: bool, delay: float):
    """Process ``(path, download, args)`` jobs; return ``(ok, fail, bytes_downloaded)``."""
    ok, fail, size = 0, 0, 0
    for path, download, args in tqdm(jobs, desc=desc):
        if incremental and path.exists():
            ok += 1
            continue
        if _fetch_one(path, incremental, download, args):
            ok += 1
            if path.exists():
                size += path.stat().st_size
        else:
            fail += 1
        time.sleep(delay)
    return ok, fail, size


def run(incremental: bool = True, limit: int = 0, delay: float = 1.0):
    """Download all PDFs listed in the catalogue.

    Args:
        incremental: True = skip files that are already cached;
            False = download them again (the old copy is kept if that fails).
        limit: 0 = everything; N = only the first N entries of the arXiv list
            and the first N entries of the HF list.
        delay: seconds to sleep after each download attempt.
    """
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    HF_DIR.mkdir(parents=True, exist_ok=True)
    arxiv_list, hf_list = collect_urls()
    total = len(arxiv_list) + len(hf_list)
    log.info(f"Found {len(arxiv_list)} arXiv + {len(hf_list)} HF = {total} PDFs to download")
    log.info(f"Incremental: {incremental}, Delay: {delay}s")
    if not incremental:
        log.warning("Non-incremental mode: will re-download existing files")

    # Count existing
    arxiv_existing = sum(1 for aid, _ in arxiv_list if (ARXIV_DIR / f"{aid}.pdf").exists())
    hf_existing = sum(1 for _, name in hf_list if _hf_cache_path(name).exists())
    log.info(f"Already cached: {arxiv_existing} arXiv + {hf_existing} HF")

    if limit > 0:
        arxiv_list = arxiv_list[:limit]
        hf_list = hf_list[:limit]

    with httpx.Client(
        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
        timeout=30,
        follow_redirects=True,
    ) as client:
        arxiv_jobs = [
            (ARXIV_DIR / f"{arxiv_id}.pdf", download_arxiv, (client, arxiv_id, title))
            for arxiv_id, title in arxiv_list
        ]
        hf_jobs = [
            (_hf_cache_path(name), download_hf, (client, url, name))
            for url, name in hf_list
        ]
        arxiv_ok, arxiv_fail, arxiv_size = _download_batch(arxiv_jobs, "arXiv", incremental, delay)
        hf_ok, hf_fail, hf_size = _download_batch(hf_jobs, "HF ", incremental, delay)

    ok = arxiv_ok + hf_ok
    fail = arxiv_fail + hf_fail
    total_size = float(arxiv_size + hf_size)
    log.info(f"Done: {ok} OK, {fail} failed, {total_size/1024/1024:.1f} MB total")
if __name__ == "__main__":
    import argparse

    # Command-line front end: each flag maps onto one keyword argument of run()
    cli = argparse.ArgumentParser(description="下载论文 PDF 到本地缓存")
    for flag, options in (
        ("--no-incremental", {"action": "store_true", "help": "重新下载所有 (默认跳过已有)"}),
        ("--limit", {"type": int, "default": 0, "help": "限制下载数量 (0=全部)"}),
        ("--delay", {"type": float, "default": 1.0, "help": "请求间延迟 (秒)"}),
    ):
        cli.add_argument(flag, **options)
    opts = cli.parse_args()
    run(incremental=not opts.no_incremental, limit=opts.limit, delay=opts.delay)

103
api/extract_data.py Normal file
View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""Build papers.json by regex-extracting paper entries from llm_library.html"""
import re, json, os
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
html_path = os.path.join(ROOT, 'llm_library.html')
output_path = os.path.join(ROOT, 'data', 'papers.json')

# Pattern: { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }
# (arxiv may be replaced by pdf:"..." or be absent)
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
    r'tags:\s*\[(.*?)\]\s*\}',
    re.DOTALL
)
# Module header at the start of a line, e.g. `  arch: { name: "..."`
_MODULE_RE = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
# Broader module pattern, used only when the strict one finds nothing
_MODULE_FALLBACK_RE = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
# Area header, e.g. `id: "attn", name: "..."`
_AREA_RE = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')
# Module metadata: id, name, icon and desc
_MODULE_META_RE = re.compile(
    r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\""
)


def _headers(pattern, html):
    """Return [(start offset, id, name), ...] for every match of *pattern*, in file order."""
    return [(m.start(), m.group(1), m.group(2)) for m in pattern.finditer(html)]


def _nearest_before(headers, pos):
    """Return (id, name) of the last header that starts before *pos*, or None."""
    nearest = None
    for start, ident, name in headers:
        if start >= pos:
            break
        nearest = (ident, name)
    return nearest


def extract_papers(html):
    """Extract every paper entry from *html* as a flat list of dicts.

    Each paper is attributed to the nearest module header and the nearest area
    header that precede it in the text; 'unknown' / 'Unknown' is used when no
    header precedes the entry.
    """
    module_headers = _headers(_MODULE_RE, html)
    fallback_headers = _headers(_MODULE_FALLBACK_RE, html)
    area_headers = _headers(_AREA_RE, html)

    papers = []
    for m in paper_re.finditer(html):
        pos = m.start()
        module = (
            _nearest_before(module_headers, pos)
            or _nearest_before(fallback_headers, pos)
            or ('unknown', 'Unknown')
        )
        area = _nearest_before(area_headers, pos) or ('unknown', 'Unknown')
        papers.append({
            'module': module[0],
            'module_name': module[1],
            'area': area[0],
            'area_name': area[1],
            'title': m.group(1),
            'authors': m.group(2),
            'year': int(m.group(3)),
            'venue': m.group(4),
            'arxiv': m.group(5) or None,
            'pdf': m.group(6) or None,
            'tags': re.findall(r'"([^"]*)"', m.group(7)),
        })
    return papers


def extract_modules(html):
    """Extract module metadata (name, icon, desc) keyed by module id."""
    modules = {}
    for m in _MODULE_META_RE.finditer(html):
        mod_id = m.group(1)
        modules[mod_id] = {
            'name': m.group(2),
            'icon': m.group(3),
            'desc': m.group(4),
            'color': mod_id,
            'areas': []
        }
    return modules


def main():
    """Read llm_library.html, report what was found and save the flat paper list."""
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read()

    papers = extract_papers(html)
    print(f'Extracted {len(papers)} papers')

    # Module metadata is only reported; the nested structure is not rebuilt here
    modules = extract_modules(html)
    print(f'Found {len(modules)} modules')
    for mod_id, mod in modules.items():
        print(f' {mod_id}: {mod["name"]}')

    # Save the flat list for now
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f'Saved {len(papers)} papers (flat) to {output_path}')


if __name__ == '__main__':
    main()

107
api/parse_papers.py Normal file
View File

@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
import re, json, os
HTML = '/app/working/workspaces/default/llm_library.html'
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

_DATA_START = 'const PAPER_DATA = {'
_DATA_END = 'APP STATE'
_SECTIONS = ('mainline', 'branches', 'forward')
_SECTION_RE = re.compile(r'(mainline|branches|forward):\s*\[')


def extract_block(html):
    """Return the text between the PAPER_DATA opening brace and the APP STATE marker.

    Raises:
        ValueError: when either marker is missing.
    """
    s = html.index(_DATA_START)
    # Search for the end marker only after the start marker
    e = html.index(_DATA_END, s)
    # Skip exactly the marker, so the indentation of the first line survives
    return html[s + len(_DATA_START):e]


def _parse_entry(stripped):
    """Turn one `{ title:... tags:[...] }` line into a paper dict, or None if incomplete."""
    title = re.search(r'title:\s*"([^"]+)"', stripped)
    year = re.search(r'year:\s*(\d+)', stripped)
    tags_m = re.search(r'tags:\s*\[(.*?)\]', stripped, re.DOTALL)
    if not (title and year and tags_m):
        return None
    authors = re.search(r'authors:\s*"([^"]*?)"', stripped)
    venue = re.search(r'venue:\s*"([^"]*?)"', stripped)
    arxiv = re.search(r'arxiv:\s*"(\S+?)"', stripped)
    pdf = re.search(r'pdf:\s*"(https:[^"]+)"', stripped)
    entry = {
        'title': title.group(1),
        'authors': authors.group(1) if authors else '',
        'year': int(year.group(1)),
        'venue': venue.group(1) if venue else '',
        'tags': re.findall(r'"([^"]*)"', tags_m.group(1)),
    }
    if arxiv:
        entry['arxiv'] = arxiv.group(1)
    if pdf:
        entry['pdf'] = pdf.group(1)
    return entry


def parse_block(block):
    """Parse the PAPER_DATA body line by line into ``{module_id: module}``.

    The parser keys on indentation: module headers at indent 2, module metadata
    at indent 4, area ``id``/``name`` at indent 8.  Section headers and paper
    entries are recognised at any indent.  Modules without a name are dropped.
    """
    modules = {}
    current_mod = None
    current_area = None
    current_section = 'mainline'

    def commit(mod):
        if mod and mod.get('name'):
            modules[mod['id']] = mod

    for line in block.split('\n'):
        stripped = line.strip()
        if not stripped:
            continue
        indent = len(line) - len(line.lstrip())

        # Module start: "  arch: {" at indent 2
        if indent == 2 and re.match(r'^\w+:\s*\{', stripped):
            commit(current_mod)
            mid = stripped.split(':')[0]
            current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '', 'color': mid, 'areas': []}
            current_area = None
            current_section = 'mainline'
            continue
        if not current_mod:
            continue

        # Module metadata at indent 4
        if indent == 4:
            m = re.match(r'(\w+):\s*"([^"]*)"', stripped)
            if m and m.group(1) in ('name', 'icon', 'desc', 'color'):
                current_mod[m.group(1)] = m.group(2)

        # Area header at indent 8
        if indent == 8:
            m_id = re.match(r'id:\s*"(\w+)"', stripped)
            if m_id:
                current_area = {'id': m_id.group(1), 'name': '', 'mainline': [], 'branches': [], 'forward': []}
                current_mod['areas'].append(current_area)
                current_section = 'mainline'
                continue
            m_name = re.match(r'name:\s*"([^"]+)"', stripped)
            if m_name and current_area:
                current_area['name'] = m_name.group(1)
                continue

        m_section = _SECTION_RE.match(stripped)
        if m_section:
            current_section = m_section.group(1)

        # Paper entry
        if stripped.startswith('{ title:') and 'tags:' in stripped and current_area:
            entry = _parse_entry(stripped)
            if entry:
                current_area[current_section].append(entry)

    # Save last module
    commit(current_mod)
    return modules


def parse_paper_data(html):
    """Extract the PAPER_DATA block from *html* and parse it into nested modules."""
    return parse_block(extract_block(html))


def _paper_count(module):
    """Count the papers of one module across all areas and sections."""
    return sum(len(a.get(s, [])) for a in module.get('areas', []) for s in _SECTIONS)


def main(html_path=HTML, json_path=JSON):
    """Parse *html_path* and write the nested catalogue to *json_path*."""
    with open(html_path, encoding='utf-8') as f:
        html = f.read()
    modules = parse_paper_data(html)

    total = sum(_paper_count(m) for m in modules.values())
    print(f'Parsed: {len(modules)} modules, {total} papers')
    for mid, m in sorted(modules.items()):
        print(f' {mid}: {m["name"]} — {len(m["areas"])} areas, {_paper_count(m)} papers')

    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(modules, f, ensure_ascii=False, indent=2)
    print(f'\nSaved to {json_path}')


if __name__ == '__main__':
    import sys

    # Optional overrides: parse_papers.py [HTML_PATH [JSON_PATH]]
    main(*sys.argv[1:3])

484
api/server.py Normal file
View File

@@ -0,0 +1,484 @@
"""
LLM 论文图书馆 — FastAPI 后端
提供 REST API 进行论文查询、管理、PDF 代理服务
"""
import json
import os
import hashlib
import secrets
import logging
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, HTTPException, Query, Depends, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
# ─── Config ────────────────────────────────────────────
# Project root: the parent of the directory containing this file
ROOT = Path(__file__).resolve().parent.parent
DATA_FILE = ROOT / "data" / "papers.json"
PAPERS_DIR = ROOT / "papers"
# Key checked by verify_api_key(); falls back to a placeholder when the
# LLM_LIB_API_KEY environment variable is not set
_DEFAULT_API_KEY = "change-me"
API_KEY = os.environ.get("LLM_LIB_API_KEY", _DEFAULT_API_KEY)
log = logging.getLogger("llm-library")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
if API_KEY == _DEFAULT_API_KEY:
    # Make the insecure fallback visible instead of accepting it silently
    log.warning("LLM_LIB_API_KEY is not set: write endpoints accept the default key")
# ─── App ───────────────────────────────────────────────
app = FastAPI(
    title="LLM 论文图书馆",
    description="大模型论文知识库 API — 查询、搜索、管理论文",
    version="0.1.0",
)
# CORS is wide open: any origin, method and header is allowed
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ─── Auth ──────────────────────────────────────────────
def verify_api_key(request: Request):
    """Simple API-key check, used as a dependency by the write endpoints.

    The token is taken from an ``Authorization: Bearer <token>`` header or,
    when that header is absent, from the ``api_key`` query parameter.

    Returns:
        True when the token matches ``API_KEY``.

    Raises:
        HTTPException: 401 when the token is missing or does not match.
    """
    auth = request.headers.get("Authorization", "")
    if auth.startswith("Bearer "):
        token = auth[7:]
    else:
        token = request.query_params.get("api_key", "")
    # compare_digest avoids leaking, through timing, how much of the key matched.
    # Compared as bytes because the str form only accepts ASCII.
    if not token or not secrets.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid or missing API key")
    return True
# ─── Data loading ──────────────────────────────────────
def load_data(path: Optional[Path] = None):
    """Load the paper catalogue.

    Args:
        path: JSON file to read; defaults to ``DATA_FILE``.

    Returns:
        The parsed JSON content, or ``{}`` when the file does not exist.
    """
    source = DATA_FILE if path is None else path
    try:
        # The catalogue is saved as UTF-8 with non-ASCII text; do not depend on
        # the locale's default encoding
        with open(source, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
def save_data(data, path: Optional[Path] = None):
    """Write the paper catalogue as indented UTF-8 JSON.

    The content is written to a sibling ``.tmp`` file and then moved over the
    target, so a crash in the middle of a write cannot leave a truncated file.

    Args:
        data: JSON-serialisable catalogue.
        path: file to write; defaults to ``DATA_FILE``.
    """
    target = Path(DATA_FILE if path is None else path)
    tmp = target.with_name(target.name + ".tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp, target)
    finally:
        # No-op after a successful replace; removes the leftover on failure
        tmp.unlink(missing_ok=True)
# ─── Paper CRUD helpers ────────────────────────────────
def find_paper(data, module_id, area_id, title):
    """Locate a paper by exact title inside one area of one module.

    Sections are searched in the order mainline, branches, forward and the
    first match wins.  Areas without an ``id`` and entries without a ``title``
    are skipped instead of raising ``KeyError``.

    Returns:
        ``(module, area, section_name, index)`` for the match, or four
        ``None`` values when the module, the area or the paper is not found.
    """
    mod = data.get(module_id)
    if not mod:
        return None, None, None, None
    for area in mod.get("areas", []):
        if area.get("id") != area_id:
            continue
        for section in ("mainline", "branches", "forward"):
            for i, p in enumerate(area.get(section, [])):
                if p.get("title") == title:
                    return mod, area, section, i
    return None, None, None, None
# ─── Routes: Query ─────────────────────────────────────
@app.get("/api/stats")
def get_stats():
    """Return library statistics: module, area and paper counts plus a per-section breakdown."""
    library = load_data()
    per_section = dict.fromkeys(("mainline", "branches", "forward"), 0)
    area_total = 0
    for module in library.values():
        module_areas = module.get("areas", [])
        area_total += len(module_areas)
        for area in module_areas:
            for key in per_section:
                per_section[key] += len(area.get(key, []))
    return {
        "modules": len(library),
        "areas": area_total,
        # Every paper sits in exactly one section list, so the total is their sum
        "papers": sum(per_section.values()),
        "sections": per_section,
        "data_file": str(DATA_FILE),
    }
@app.get("/api/modules")
def list_modules():
    """List every module with its metadata and counts, without the papers themselves."""

    def paper_total(module):
        # Papers of one module, summed over all areas and all three sections
        return sum(
            len(area.get(section, []))
            for area in module.get("areas", [])
            for section in ("mainline", "branches", "forward")
        )

    summaries = []
    for module_id, module in load_data().items():
        summaries.append({
            "id": module_id,
            "name": module["name"],
            "icon": module["icon"],
            "desc": module["desc"],
            "area_count": len(module.get("areas", [])),
            "paper_count": paper_total(module),
        })
    return summaries
@app.get("/api/modules/{module_id}")
def get_module(module_id: str):
    """Return the complete data of one module; 404 when it is missing or empty."""
    found = load_data().get(module_id)
    if found:
        return found
    raise HTTPException(status_code=404, detail=f"Module '{module_id}' not found")
def _iter_matching_papers(data, q, module, tag):
    """Yield one result row per paper that passes the module, tag and keyword filters."""
    for mid, mod in data.items():
        if module and mid != module:
            continue
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for p in area.get(section, []):
                    # Filter by tag
                    if tag and tag not in p.get("tags", []):
                        continue
                    # Filter by query; the newline keeps a keyword from matching
                    # across the end of the title and the start of the authors
                    if q:
                        haystack = f'{p.get("title", "")}\n{p.get("authors", "")}'.lower()
                        if q not in haystack:
                            continue
                    yield {
                        "module_id": mid,
                        "module_name": mod["name"],
                        "area_id": area["id"],
                        "area_name": area["name"],
                        "section": section,
                        **p,
                    }


@app.get("/api/papers")
def search_papers(
    q: str = Query(default="", description="搜索关键词: 标题/作者"),
    module: Optional[str] = Query(default=None),
    tag: Optional[str] = Query(default=None, description="起点/关键节点/前沿/前瞻/支线"),
    limit: int = Query(default=50, ge=1, le=200),
):
    """Search papers by keyword (title/authors, case-insensitive), module and tag.

    Returns at most ``limit`` rows in catalogue order; each row is the paper
    entry plus its module id/name, area id/name and section.
    """
    from itertools import islice

    matches = _iter_matching_papers(load_data(), q.lower(), module, tag)
    # islice stops the scan as soon as enough rows were produced
    return list(islice(matches, limit))
# ─── Routes: Management (写操作, 需 API Key) ────────────
class PaperCreate(BaseModel):
    """Request body of POST /api/papers: a new paper and the place to file it."""
    # Key of the target module in the catalogue
    module_id: str
    # "id" of the target area inside that module
    area_id: str
    section: str = "mainline"  # mainline / branches / forward
    title: str
    authors: str = ""
    year: int
    venue: str = ""
    # Stored on the entry only when non-empty
    arxiv: Optional[str] = None
    # Stored on the entry only when non-empty
    pdf: Optional[str] = None
    tags: list[str] = []
class PaperUpdate(BaseModel):
    """Request body of PUT /api/papers: fields left as None are not changed."""
    authors: Optional[str] = None
    year: Optional[int] = None
    venue: Optional[str] = None
    arxiv: Optional[str] = None
    pdf: Optional[str] = None
    tags: Optional[list[str]] = None
    section: Optional[str] = None  # move to different section
@app.post("/api/papers", dependencies=[Depends(verify_api_key)])
def add_paper(paper: PaperCreate):
    """Add a new paper to one section of an existing area.

    Raises:
        HTTPException: 404 when the module or area does not exist, 400 when
            the section name is invalid, 409 when the area already contains a
            paper with the same title.
    """
    data = load_data()
    mod = data.get(paper.module_id)
    if not mod:
        raise HTTPException(status_code=404, detail="Module not found")
    area = next((a for a in mod.get("areas", []) if a.get("id") == paper.area_id), None)
    if not area:
        raise HTTPException(status_code=404, detail="Area not found")
    section = paper.section
    if section not in ("mainline", "branches", "forward"):
        raise HTTPException(status_code=400, detail="section must be mainline/branches/forward")
    # Update and delete address papers by title within an area, so a second
    # entry with the same title could never be reached
    if any(
        existing.get("title") == paper.title
        for name in ("mainline", "branches", "forward")
        for existing in area.get(name, [])
    ):
        raise HTTPException(status_code=409, detail="Paper with this title already exists in the area")
    entry = {
        "title": paper.title,
        "authors": paper.authors,
        "year": paper.year,
        "venue": paper.venue,
        "tags": paper.tags,
    }
    if paper.arxiv:
        entry["arxiv"] = paper.arxiv
    if paper.pdf:
        entry["pdf"] = paper.pdf
    area.setdefault(section, []).append(entry)
    save_data(data)
    log.info(f"Added paper: {paper.title}")
    return {"ok": True, "title": paper.title}
@app.put("/api/papers")
def update_paper(
    module_id: str,
    area_id: str,
    title: str,
    update: PaperUpdate,
    _=Depends(verify_api_key),
):
    """Update the fields of one paper and optionally move it to another section.

    Raises:
        HTTPException: 404 when the paper is not found, 400 when the requested
            target section is invalid.
    """
    data = load_data()
    mod, area, section, idx = find_paper(data, module_id, area_id, title)
    if mod is None:
        raise HTTPException(status_code=404, detail="Paper not found")

    paper = area[section][idx]
    requested = {
        name: getattr(update, name)
        for name in ("authors", "year", "venue", "arxiv", "pdf", "tags")
    }
    # None means "leave this field as it is"
    paper.update({name: value for name, value in requested.items() if value is not None})

    # Move to different section?
    target = update.section
    if target and target != section:
        if target not in ("mainline", "branches", "forward"):
            raise HTTPException(status_code=400, detail="Invalid section")
        area.setdefault(target, []).append(area[section].pop(idx))

    save_data(data)
    log.info(f"Updated paper: {title}")
    return {"ok": True, "title": title}
@app.delete("/api/papers")
def delete_paper(
    module_id: str,
    area_id: str,
    title: str,
    _=Depends(verify_api_key),
):
    """Delete one paper, addressed by module, area and exact title; 404 when not found."""
    library = load_data()
    located = find_paper(library, module_id, area_id, title)
    if located[0] is None:
        raise HTTPException(status_code=404, detail="Paper not found")
    _module, area, section, idx = located
    del area[section][idx]
    save_data(library)
    log.info(f"Deleted paper: {title}")
    return {"ok": True, "title": title}
# ─── Routes: PDF proxy ──────────────────────────────────
@app.get("/papers/arxiv/{arxiv_id}.pdf")
@app.get("/papers/arxiv/{arxiv_id}")
def serve_arxiv_pdf(arxiv_id: str):
    """Serve an arXiv PDF from the local cache.

    Both URL forms are accepted; the suffix-less route exists so that download
    managers (IDM) do not intercept the request.

    Raises:
        HTTPException: 404 when the PDF is not in the cache.
    """
    # Decorators apply bottom-up, so the suffix-less route is registered first
    # and its parameter also matches "<id>.pdf"; strip the suffix so both URL
    # forms resolve to the same file.  Sanitised like serve_hf_pdf.
    safe_id = arxiv_id.replace("..", "").replace("/", "_").removesuffix(".pdf")
    pdf_path = PAPERS_DIR / "arxiv" / f"{safe_id}.pdf"
    if not pdf_path.is_file():
        raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}")
    return FileResponse(
        pdf_path, media_type="application/pdf",
        headers={"Cache-Control": "public, max-age=86400"},
    )
@app.get("/papers/hf/{filename}.pdf")
@app.get("/papers/hf/{filename}")
def serve_hf_pdf(filename: str):
    """Serve a HuggingFace PDF from the local cache (the suffix-less route avoids IDM interception)."""
    # Same clean-up steps in the same order: drop "..", flatten "/", strip ".pdf"
    cleaned = filename.replace("..", "")
    cleaned = cleaned.replace("/", "_")
    cleaned = cleaned.removesuffix(".pdf")
    target = PAPERS_DIR / "hf" / (cleaned + ".pdf")
    if target.exists():
        return FileResponse(
            target,
            media_type="application/pdf",
            headers={"Cache-Control": "public, max-age=86400"},
        )
    raise HTTPException(status_code=404, detail=f"PDF not in local cache: {filename}")
# ─── Routes: Translation ───────────────────────────────
# Directory holding one cached translation file "<arxiv id>.json" per paper
TRANSLATE_CACHE = ROOT / "data" / "translations"
# Created when this module is imported, so handlers can write into it directly
TRANSLATE_CACHE.mkdir(parents=True, exist_ok=True)
def extract_pdf_text_with_pages(pdf_path: Path, max_chars: int = 12000) -> list[dict]:
    """Extract per-page text from a PDF with the ``pdftotext`` command (Poppler).

    Args:
        pdf_path: PDF file to read; its stem is assumed to be the arXiv ID
            when removing the arXiv stamp.
        max_chars: stop after the page on which the accumulated text length
            reaches this value (that page is still included in full).

    Returns:
        ``[{"page": <1-based page number>, "text": <stripped text>}, ...]``,
        empty pages omitted.

    Raises:
        HTTPException: 500 when pdftotext cannot be run, times out, exits with
            an error, or yields no text.
    """
    import re
    import subprocess

    try:
        result = subprocess.run(
            ["pdftotext", "-layout", "-q", str(pdf_path), "-"],
            capture_output=True, text=True, timeout=30,
            # Decode independently of the locale; assumes pdftotext's default
            # UTF-8 output — undecodable bytes are replaced, not fatal
            encoding="utf-8", errors="replace",
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # Binary missing or not executable, or the 30 s limit was exceeded
        log.error(f"pdftotext could not be run: {exc}")
        raise HTTPException(status_code=500, detail="PDF text extraction failed") from exc
    if result.returncode != 0:
        log.error(f"pdftotext failed: {result.stderr}")
        raise HTTPException(status_code=500, detail="PDF text extraction failed")

    # Remove the arXiv stamp: everything from "arXiv:<id>" up to the next blank line
    stamp_re = r'arXiv:' + re.escape(pdf_path.stem) + r'.*?\n\n'
    text = re.sub(stamp_re, '', result.stdout, flags=re.DOTALL)

    # Split by form-feed (page break)
    result_pages = []
    total = 0
    for number, page_text in enumerate(text.split('\f'), start=1):
        pt = page_text.strip()
        if not pt:
            continue
        result_pages.append({"page": number, "text": pt})
        total += len(pt)
        if total >= max_chars:
            break
    if not result_pages:
        raise HTTPException(status_code=500, detail="No text extracted from PDF")
    return result_pages
def _wrap_oversized(sentence: str, limit: int) -> list[str]:
    """Break one over-long sentence on whitespace; words longer than *limit* are sliced."""
    pieces = []
    current = ""
    for word in sentence.split():
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def split_text_with_pages(page_texts: list[dict], max_len: int = 400) -> list[dict]:
    """Split per-page text into paragraph chunks of at most ``max_len`` characters.

    Paragraphs are separated by blank lines.  A paragraph longer than
    ``max_len`` is split after ". ", "? " or "! " and the sentences are packed
    greedily; a single sentence that is still too long is wrapped on
    whitespace.  The text itself is never altered, so literal "|" characters
    survive.

    Args:
        page_texts: ``[{"page": int, "text": str}, ...]``.
        max_len: maximum chunk length in characters (values below 1 act as 1
            when splitting).

    Returns:
        ``[{"page": int, "text": str}, ...]`` in reading order.
    """
    import re

    limit = max(max_len, 1)
    chunks = []
    for pt in page_texts:
        page = pt["page"]
        for para in (raw.strip() for raw in pt["text"].split("\n\n")):
            if not para:
                continue
            if len(para) <= max_len:
                chunks.append({"page": page, "text": para})
                continue
            current = ""
            # Split at the space that follows sentence-ending punctuation
            for s in re.split(r"(?<=[.?!]) ", para):
                s = s.strip()
                if not s:
                    continue
                if len(s) > limit:
                    if current:
                        chunks.append({"page": page, "text": current})
                        current = ""
                    chunks.extend({"page": page, "text": piece} for piece in _wrap_oversized(s, limit))
                elif len(current) + len(s) + 1 <= limit:
                    current = (current + " " + s).strip()
                else:
                    if current:
                        chunks.append({"page": page, "text": current})
                    current = s
            if current:
                chunks.append({"page": page, "text": current})
    return chunks
def translate_text(text: str, source: str = "en", target: str = "zh") -> str:
    """Translate *text* with the free MyMemory HTTP API.

    This is best-effort: on any network, decoding or response-format problem
    the original *text* is returned unchanged and a warning is logged.

    Args:
        text: text to translate.
        source: source language code.
        target: target language code.

    Returns:
        The translated text, or *text* itself when no translation was obtained.
    """
    import urllib.request
    import urllib.parse

    # Nothing to translate: skip the request
    if not text.strip():
        return text

    url = "https://api.mymemory.translated.net/get"
    params = urllib.parse.urlencode({
        "q": text,
        "langpair": f"{source}|{target}",
        "mt": "1",  # Force machine translation, not memory
        "de": "me@llm-library.local",
    })
    full_url = f"{url}?{params}"
    try:
        with urllib.request.urlopen(full_url, timeout=15) as resp:
            data = json.loads(resp.read())
    except Exception as e:
        log.warning(f"Translation API error: {e}")
        return text

    if not isinstance(data, dict):
        return text
    payload = data.get("responseData")
    # NOTE(review): the status is compared as text so that both 200 and "200"
    # are accepted — confirm against the API's actual response type
    if str(data.get("responseStatus")) == "200" and isinstance(payload, dict):
        translated = payload.get("translatedText")
        if isinstance(translated, str) and translated:
            return translated
    return text
@app.get("/api/translate/{arxiv_id}")
def translate_paper(arxiv_id: str):
    """Translate the body of a locally cached PDF, paragraph by paragraph.

    A previously stored result is returned as is.  Otherwise the text is
    extracted, split into chunks and translated, and the result is stored for
    later requests unless not a single chunk came back translated.

    Returns:
        ``{"arxiv_id", "paragraphs": [{"page", "en", "zh"}, ...], "count"}``.

    Raises:
        HTTPException: 404 when the PDF is not cached; 500 from text extraction.
    """
    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail=f"PDF not cached: {arxiv_id}")

    cache_file = TRANSLATE_CACHE / f"{arxiv_id}.json"
    if cache_file.exists():
        try:
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError) as exc:
            # Unreadable or corrupt cache: rebuild it instead of failing
            log.warning(f"Ignoring unreadable translation cache for {arxiv_id}: {exc}")

    # Extract text with page numbers
    log.info(f"Extracting text from {arxiv_id}")
    page_texts = extract_pdf_text_with_pages(pdf_path)
    chunks = split_text_with_pages(page_texts)
    log.info(f"Translating {len(chunks)} paragraphs for {arxiv_id}")

    translated = []
    for i, chunk in enumerate(chunks):
        if i % 10 == 0:
            log.info(f" [{arxiv_id}] translating paragraph {i+1}/{len(chunks)}")
        zh = translate_text(chunk["text"])
        translated.append({
            "page": chunk["page"],
            "en": chunk["text"],
            "zh": zh,
        })
    result = {"arxiv_id": arxiv_id, "paragraphs": translated, "count": len(translated)}

    # translate_text() returns its input on failure, so a result in which every
    # paragraph is unchanged is most likely a failed run: do not make it permanent
    if any(p["zh"] != p["en"] for p in translated):
        tmp_file = cache_file.with_name(cache_file.name + ".tmp")
        try:
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False)
            # Atomic move: readers never see a half-written cache file
            os.replace(tmp_file, cache_file)
        finally:
            tmp_file.unlink(missing_ok=True)
    else:
        log.warning(f"No paragraph of {arxiv_id} was translated; result not cached")
    return result
@app.get("/api/translate/{arxiv_id}/status")
def translate_status(arxiv_id: str):
    """Report whether a cached translation and a cached PDF exist for the given ID."""
    translation_file = TRANSLATE_CACHE / f"{arxiv_id}.json"
    pdf_file = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    status = {"arxiv_id": arxiv_id}
    status["cached"] = translation_file.exists()
    status["pdf_exists"] = pdf_file.exists()
    return status
# ─── Routes: PDF download on-demand ────────────────────
@app.post("/api/download/{arxiv_id}")
def download_single_pdf(arxiv_id: str):
    """Download one arXiv PDF on demand into the local cache.

    Only IDs that appear in the catalogue are fetched, and the requested paper
    itself is downloaded rather than whatever the batch downloader picks first.

    Returns:
        ``{"ok": bool, "arxiv_id": str, "status": str}`` where status is one of
        "cached", "downloaded", "failed" or "timeout".
    """
    import socket
    import urllib.request

    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if pdf_path.exists():
        return {"ok": True, "arxiv_id": arxiv_id, "status": "cached"}

    # NOTE(review): this endpoint needs no API key, so it only fetches papers
    # that are listed in the catalogue
    in_library = any(
        p.get("arxiv") == arxiv_id
        for mod in load_data().values()
        for area in mod.get("areas", [])
        for section in ("mainline", "branches", "forward")
        for p in area.get(section, [])
    )
    if not in_library:
        return {"ok": False, "arxiv_id": arxiv_id, "status": "failed"}

    request = urllib.request.Request(
        f"https://arxiv.org/pdf/{arxiv_id}.pdf",
        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
    )
    part_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            body = resp.read()
        # arXiv answers with an HTML page for papers it cannot serve
        if not body.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {arxiv_id}")
            return {"ok": False, "arxiv_id": arxiv_id, "status": "failed"}
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        # Write then rename, so a partial file is never served as cached
        part_path.write_bytes(body)
        part_path.replace(pdf_path)
    except OSError as exc:
        # URLError, HTTPError and socket timeouts are all OSError subclasses
        part_path.unlink(missing_ok=True)
        timeouts = (socket.timeout, TimeoutError)
        timed_out = isinstance(exc, timeouts) or isinstance(getattr(exc, "reason", None), timeouts)
        log.warning(f"Download failed for {arxiv_id}: {exc}")
        return {"ok": False, "arxiv_id": arxiv_id, "status": "timeout" if timed_out else "failed"}
    return {"ok": True, "arxiv_id": arxiv_id, "status": "downloaded"}
# ─── Health ─────────────────────────────────────────────
@app.get("/api/health")
def health():
    """Liveness probe: return a fixed status together with the API version."""
    payload = {"status": "ok"}
    payload["version"] = "0.1.0"
    return payload
# ─── Mount static frontend (at /) ──────────────────────
# Static files mounted after API routes to avoid conflicts
# Front-end files are expected in <ROOT>/static
static_dir = ROOT / "static"
# Mount at "/" only when the directory exists and contains at least one entry;
# html=True is passed so the mount serves HTML pages (see the StaticFiles documentation)
if static_dir.exists() and any(static_dir.iterdir()):
    app.mount("/", StaticFiles(directory=str(static_dir), html=True), name="static")
# ─── Main ───────────────────────────────────────────────
def main():
    """Start the server with uvicorn on all interfaces, port 8000, with auto-reload enabled."""
    from uvicorn import run as serve

    serve("api.server:app", host="0.0.0.0", port=8000, reload=True)


if __name__ == "__main__":
    main()