- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理 - 180 篇论文数据 (data/papers.json): 9 模块、32 子领域 - 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读 - 底部状态栏: arXiv/HF 连通性检测 - PDF 加载: arXiv 优先(5s超时) → HK 本地兜底 - Docker 化部署 (Dockerfile + start.sh + nginx.conf) - arXiv + HF 批量下载器 (api/downloader.py)
58 lines
1.9 KiB
Python
58 lines
1.9 KiB
Python
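#!/usr/bin/env python3
"""Backfill every arXiv PDF listed in data/papers.json that is not cached yet.

IDs whose download fails (e.g. dead arXiv IDs) are reported, left uncached,
and the run continues with the next ID.
"""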
|
|
import json, subprocess, sys, time
|
|
from pathlib import Path
|
|
|
|
# Directory two levels above this script; the data and cache paths hang off it.
_REPO_ROOT = Path(__file__).resolve().parent.parent

# JSON catalogue that lists every paper (and its optional arXiv ID).
PAPERS_JSON = _REPO_ROOT / "data" / "papers.json"
# Local cache directory holding one "<arxiv id>.pdf" file per downloaded paper.
ARXIV_DIR = _REPO_ROOT / "papers" / "arxiv"
# NOTE(review): absolute path, unlike the two above, and not referenced by the
# visible part of this script -- confirm whether it is still needed.
LOG_FILE = Path("/app/papers/backfill.log")
|
|
|
|
def _collect_arxiv_ids(data):
    """Return the set of non-empty ``arxiv`` values found in *data*.

    *data* is the parsed papers.json: a mapping whose values may carry an
    ``areas`` list; each area may carry paper lists under ``mainline``,
    ``branches`` and ``forward``. Papers without an ``arxiv`` value are
    ignored.
    """
    found = set()
    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for paper in area.get(section, []):
                    aid = paper.get("arxiv")
                    if aid:
                        found.add(aid)
    return found


def _download_pdf(aid, dest):
    """Download one arXiv PDF to *dest* with wget.

    Returns ``(ok, returncode, size)`` where *size* is the number of bytes
    wget left on disk (0 if no file was written). A download only counts as
    successful when wget exits with 0 and the file is larger than 5000 bytes;
    otherwise the file is removed so the ID is retried on the next run.

    Raises:
        subprocess.TimeoutExpired: wget ran longer than 20 seconds.
        OSError: wget could not be started or the file could not be accessed.
    """
    url = f"https://arxiv.org/pdf/{aid}.pdf"
    result = subprocess.run(
        ["wget", "-q", "-T", "15", "-O", str(dest), url],
        timeout=20,
    )
    # Measure before any cleanup so callers can report the real size.
    size = dest.stat().st_size if dest.exists() else 0
    ok = result.returncode == 0 and size > 5000
    if not ok:
        dest.unlink(missing_ok=True)
    return ok, result.returncode, size


def main():
    """Download every arXiv PDF referenced in PAPERS_JSON but absent from ARXIV_DIR.

    Prints a summary of total/cached/missing IDs, then one line per download
    attempt (OK / FAIL / ERR) and a final tally. Failed or errored downloads
    leave no file behind. Sleeps 0.8 s after each attempt to rate-limit
    requests to arXiv.
    """
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(PAPERS_JSON, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_ids = _collect_arxiv_ids(data)

    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
    # Sorted so the download order (and the log output) is reproducible.
    missing = sorted((aid for aid in arxiv_ids if aid not in cached), key=str)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")

    if not missing:
        print("All caught up!")
        return

    # wget -O cannot create parent directories; make sure the cache exists.
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)

    ok, fail = 0, 0
    for aid in missing:
        dest = ARXIV_DIR / f"{aid}.pdf"
        try:
            success, returncode, size = _download_pdf(aid, dest)
        except (subprocess.SubprocessError, OSError) as e:
            # Timeout or failure to launch wget: drop any partial file.
            dest.unlink(missing_ok=True)
            fail += 1
            print(f" ERR {aid} {e}")
        else:
            if success:
                ok += 1
                print(f" OK {aid} ({size//1024} KB)")
            else:
                fail += 1
                print(f" FAIL {aid} (rc={returncode}, sz={size})")
        time.sleep(0.8)  # Be nice to arXiv

    print(f"\nDone: {ok} ok, {fail} failed")
|
|
|
|
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|