Files
llm-library/api/backfill.py
LaoWang f0ff62e082 feat: LLM 论文图书馆 — 初始提交
- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先(5s超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
2026-06-02 10:25:14 +00:00

58 lines
1.9 KiB
Python

#!/usr/bin/env python3
"""Backfill all uncached PDFs, skipping dead arXiv IDs"""
import json, subprocess, sys, time
from pathlib import Path
# Directory two levels above this file; both data locations below hang off it.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent

# JSON catalogue that lists every paper (read by main()).
PAPERS_JSON = _PROJECT_ROOT / "data" / "papers.json"

# Local cache directory holding one "<arxiv id>.pdf" file per downloaded paper.
ARXIV_DIR = _PROJECT_ROOT / "papers" / "arxiv"

# Absolute log path. NOTE(review): unlike the two paths above this is not
# derived from the file location, and main() does not write to it -- confirm
# whether another component uses it.
LOG_FILE = Path("/app/papers/backfill.log")
def _collect_arxiv_ids(data):
    """Return the set of non-empty "arxiv" values found in the catalogue.

    Walks every value of *data* (a module), each entry of its "areas" list,
    and the "mainline", "branches" and "forward" paper lists of each area.
    Missing keys are treated as empty lists; papers without an "arxiv"
    value are ignored.
    """
    found = set()
    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for paper in area.get(section, []):
                    aid = paper.get("arxiv")
                    if aid:
                        found.add(aid)
    return found


def _download_pdf(aid, dest):
    """Download one arXiv PDF to *dest* with wget.

    Returns a ``(success, message)`` pair, where *message* is the status
    line to print. A download only counts as successful when wget exits
    with 0 and the resulting file is larger than 5000 bytes; otherwise any
    file left at *dest* is removed so it is not mistaken for a cached PDF.
    """
    url = f"https://arxiv.org/pdf/{aid}.pdf"
    try:
        # "-T 15" is wget's own network timeout; timeout=20 additionally
        # bounds the whole wget process, so a download that takes longer
        # than that raises TimeoutExpired and is reported as an error.
        r = subprocess.run(
            ["wget", "-q", "-T", "15", "-O", str(dest), url],
            timeout=20,
        )
        # Read the size BEFORE any cleanup so the failure message can
        # report what was actually received.
        size = dest.stat().st_size if dest.exists() else 0
    except (subprocess.SubprocessError, OSError) as e:
        # Covers the timeout above, a missing wget binary and file-system
        # errors; the run continues with the next ID.
        dest.unlink(missing_ok=True)
        return False, f" ERR {aid} {e}"

    if r.returncode == 0 and size > 5000:
        return True, f" OK {aid} ({size // 1024} KB)"

    dest.unlink(missing_ok=True)
    return False, f" FAIL {aid} (rc={r.returncode}, sz={size})"


def main():
    """Download every catalogued arXiv PDF that is not cached locally yet.

    Reads PAPERS_JSON, gathers all arXiv IDs it references, compares them
    with the "*.pdf" files already present in ARXIV_DIR (by file stem) and
    fetches the missing ones one at a time. Progress and a final summary
    are printed to stdout. A failed download is counted and reported but
    never aborts the run.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(PAPERS_JSON, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_ids = _collect_arxiv_ids(data)

    # Make sure the cache directory exists; otherwise wget cannot create
    # its output file and every download would fail on a fresh checkout.
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)

    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
    # Sorted so the download order (and therefore the log) is reproducible.
    missing = sorted((aid for aid in arxiv_ids if aid not in cached), key=str)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")
    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for aid in missing:
        success, message = _download_pdf(aid, ARXIV_DIR / f"{aid}.pdf")
        if success:
            ok += 1
        else:
            fail += 1
        print(message)
        time.sleep(0.8)  # Be nice to arXiv
    print(f"\nDone: {ok} ok, {fail} failed")
# Script entry point: run the backfill only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()