#!/usr/bin/env python3
"""Backfill all uncached PDFs, skipping dead arXiv IDs.

Reads the papers JSON file, collects every ``arxiv`` ID listed under the
``mainline`` / ``branches`` / ``forward`` sections of each area, and uses
``wget`` to download any PDF that is not already present in the cache
directory. A download that errors, times out, or yields a file of at most
``MIN_PDF_BYTES`` bytes is discarded and counted as a failure; the run then
continues with the next ID.
"""
import json
import subprocess
import sys
import time
from pathlib import Path

PAPERS_JSON = Path(__file__).resolve().parent.parent / "data" / "papers.json"
ARXIV_DIR = Path(__file__).resolve().parent.parent / "papers" / "arxiv"
# NOTE(review): LOG_FILE is not referenced anywhere in this script; kept in
# case something else imports it.
LOG_FILE = Path("/app/papers/backfill.log")

# Sections of an area entry that may hold paper records.
SECTIONS = ("mainline", "branches", "forward")
# A download at or below this size is treated as a failed fetch.
MIN_PDF_BYTES = 5000
# wget's own network timeout (seconds) and the hard limit on the subprocess.
WGET_TIMEOUT_S = 15
PROCESS_TIMEOUT_S = 20
# Pause between requests, to be nice to arXiv.
REQUEST_DELAY_S = 0.8


def _collect_arxiv_ids(data):
    """Return the set of truthy ``arxiv`` values found in *data*."""
    arxiv_ids = set()
    for mod in data.values():
        for area in mod.get("areas", []):
            for section in SECTIONS:
                for paper in area.get(section, []):
                    aid = paper.get("arxiv")
                    if aid:
                        arxiv_ids.add(aid)
    return arxiv_ids


def _download(aid, dest):
    """Fetch the PDF for *aid* into *dest*; return ``(returncode, size)``.

    ``size`` is the byte size of *dest* after wget exits, or 0 if the file
    does not exist. Raises whatever ``subprocess.run`` raises (for example
    ``subprocess.TimeoutExpired`` or ``FileNotFoundError`` when wget is not
    installed).
    """
    url = f"https://arxiv.org/pdf/{aid}.pdf"
    result = subprocess.run(
        ["wget", "-q", "-T", str(WGET_TIMEOUT_S), "-O", str(dest), url],
        timeout=PROCESS_TIMEOUT_S,
    )
    size = dest.stat().st_size if dest.exists() else 0
    return result.returncode, size


def main(papers_json=PAPERS_JSON, arxiv_dir=ARXIV_DIR):
    """Download every referenced arXiv PDF that is not cached yet.

    Args:
        papers_json: Path of the JSON file describing the papers.
        arxiv_dir: Directory holding the cached ``<arxiv id>.pdf`` files;
            created if it does not exist.

    Prints a summary line, one status line per attempted download, and the
    final ok/failed counts. Returns ``None``.
    """
    with open(papers_json, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_ids = _collect_arxiv_ids(data)

    # wget cannot write into a directory that does not exist, so make sure
    # the cache directory is there before scanning or downloading.
    arxiv_dir = Path(arxiv_dir)
    arxiv_dir.mkdir(parents=True, exist_ok=True)

    cached = {p.stem for p in arxiv_dir.glob("*.pdf")}
    # Sorted so that the download order (and the log) is reproducible.
    missing = sorted((aid for aid in arxiv_ids if aid not in cached), key=str)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")

    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for aid in missing:
        dest = arxiv_dir / f"{aid}.pdf"
        try:
            rc, size = _download(aid, dest)
        except Exception as e:
            # Best-effort batch: one broken download must not stop the run.
            dest.unlink(missing_ok=True)
            fail += 1
            print(f" ERR {aid} {e}")
        else:
            if rc == 0 and size > MIN_PDF_BYTES:
                ok += 1
                print(f" OK {aid} ({size//1024} KB)")
            else:
                # Size was captured before the delete so the message shows
                # what was actually received.
                dest.unlink(missing_ok=True)
                fail += 1
                print(f" FAIL {aid} (rc={rc}, sz={size})")
        time.sleep(REQUEST_DELAY_S)  # Be nice to arXiv

    print(f"\nDone: {ok} ok, {fail} failed")


if __name__ == "__main__":
    main()