feat: LLM 论文图书馆 — 初始提交
- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先 (5s 超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
This commit is contained in:
57
api/backfill.py
Normal file
57
api/backfill.py
Normal file
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Backfill all uncached PDFs, skipping dead arXiv IDs"""
|
||||
import json, subprocess, sys, time
|
||||
from pathlib import Path
|
||||
|
||||
# Project root: two directory levels above this file (api/backfill.py -> root).
_PROJECT_ROOT = Path(__file__).resolve().parent.parent

# JSON catalogue that main() scans for arXiv IDs.
PAPERS_JSON = _PROJECT_ROOT / "data" / "papers.json"
# Directory holding the locally cached PDFs, one "<arxiv id>.pdf" per paper.
ARXIV_DIR = _PROJECT_ROOT / "papers" / "arxiv"
# Absolute log path; nothing in the visible part of this file reads or writes it.
LOG_FILE = Path("/app/papers/backfill.log")
|
||||
|
||||
# A download at or below this many bytes is treated as an error page or a
# truncated file rather than a real PDF.
_MIN_PDF_BYTES = 5000


def _collect_arxiv_ids(data):
    """Return the set of truthy ``arxiv`` values found in the papers catalogue."""
    found = set()
    for module in data.values():
        for area in module.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for paper in area.get(section, []):
                    aid = paper.get("arxiv")
                    if aid:
                        found.add(aid)
    return found


def _fetch_pdf(aid, dest):
    """Download one arXiv PDF to *dest* and return ``(ok, returncode, size)``.

    wget writes to a sibling ``.part`` file that is renamed onto *dest* only
    after the exit code and size checks pass. An interrupted or failed run
    therefore never leaves a ``*.pdf`` behind that a later run would count
    as cached. ``size`` is measured before any cleanup.

    Raises:
        subprocess.TimeoutExpired: wget ran longer than the 20 s overall cap.
        OSError: wget could not be started or the file could not be moved.
    """
    part = dest.with_name(dest.name + ".part")
    try:
        # "-T 15" is wget's network timeout; timeout=20 caps the whole run,
        # so it also bounds the total transfer time of a single PDF.
        proc = subprocess.run(
            ["wget", "-q", "-T", "15", "-O", str(part),
             f"https://arxiv.org/pdf/{aid}.pdf"],
            timeout=20,
        )
        size = part.stat().st_size if part.exists() else 0
        ok = proc.returncode == 0 and size > _MIN_PDF_BYTES
        if ok:
            part.replace(dest)
        return ok, proc.returncode, size
    finally:
        # Runs on every exit path, including KeyboardInterrupt; a no-op once
        # the file has been renamed onto dest.
        part.unlink(missing_ok=True)


def main(papers_json=None, arxiv_dir=None):
    """Download every arXiv PDF referenced by the catalogue but not yet cached.

    Reads the papers JSON, gathers the ``arxiv`` IDs from the ``mainline``,
    ``branches`` and ``forward`` lists of every area, compares them with the
    ``*.pdf`` stems already present in the cache directory, and fetches the
    missing ones one at a time with wget. Progress and a final summary are
    printed to stdout. A failed download is reported and skipped.

    Args:
        papers_json: Path of the catalogue file; defaults to ``PAPERS_JSON``.
        arxiv_dir: Cache directory for the PDFs; defaults to ``ARXIV_DIR``.
            It is created if it does not exist.

    Returns:
        None.
    """
    papers_json = PAPERS_JSON if papers_json is None else Path(papers_json)
    arxiv_dir = ARXIV_DIR if arxiv_dir is None else Path(arxiv_dir)

    # Explicit UTF-8: the locale default would break on non-ASCII metadata.
    with open(papers_json, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_ids = _collect_arxiv_ids(data)

    # wget cannot create the target directory itself.
    arxiv_dir.mkdir(parents=True, exist_ok=True)
    cached = {p.stem for p in arxiv_dir.glob("*.pdf")}
    # Sorted so repeated runs process the backlog in the same order.
    missing = sorted((aid for aid in arxiv_ids if aid not in cached), key=str)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")

    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for aid in missing:
        dest = arxiv_dir / f"{aid}.pdf"
        try:
            success, rc, size = _fetch_pdf(aid, dest)
        except (subprocess.SubprocessError, OSError) as e:
            fail += 1
            print(f" ERR {aid} {e}")
        else:
            if success:
                ok += 1
                print(f" OK {aid} ({size // 1024} KB)")
            else:
                fail += 1
                # size was taken before cleanup, so this is the real byte count.
                print(f" FAIL {aid} (rc={rc}, sz={size})")
        time.sleep(0.8)  # Be nice to arXiv

    print(f"\nDone: {ok} ok, {fail} failed")
|
||||
|
||||
# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user