llm-library/api/backfill.py

#!/usr/bin/env python3
"""Backfill all uncached PDFs, skipping dead arXiv IDs"""
import json, subprocess, sys, time
from pathlib import Path

# Directory two levels above this file; the repo-relative paths below hang off it.
_BASE_DIR = Path(__file__).resolve().parent.parent

# JSON catalogue that main() loads to discover arXiv IDs.
PAPERS_JSON = _BASE_DIR / "data" / "papers.json"
# Directory main() scans for existing "*.pdf" files and downloads new ones into.
ARXIV_DIR = _BASE_DIR / "papers" / "arxiv"
# Absolute log path (not derived from _BASE_DIR).
LOG_FILE = Path("/app/papers/backfill.log")

def _collect_arxiv_ids(data):
    """Return the set of truthy ``arxiv`` values listed anywhere in *data*.

    Walks every top-level value's ``areas`` list and, within each area, the
    ``mainline``, ``branches`` and ``forward`` paper lists.  Missing keys are
    treated as empty lists; papers without an ``arxiv`` value are ignored.
    """
    found = set()
    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for paper in area.get(section, []):
                    aid = paper.get("arxiv")
                    if aid:
                        found.add(aid)
    return found


def _fetch_pdf(url, dest):
    """Download *url* with wget and install it at *dest* if it looks valid.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed onto *dest* when wget exits with 0 and the file is larger than
    5000 bytes.  Because the final ``.pdf`` name appears only after a
    successful download, an interrupted run can never leave a truncated
    file that a later run would mistake for a cached paper.

    Returns:
        ``(saved, returncode, size)`` where *saved* tells whether *dest* was
        written, *returncode* is wget's exit status and *size* is the number
        of bytes wget produced (0 if it produced no file).

    Raises:
        OSError: if wget cannot be started or the file cannot be inspected.
        subprocess.TimeoutExpired: if wget runs longer than 20 seconds.
    """
    tmp = dest.with_name(dest.name + ".part")
    try:
        r = subprocess.run(
            ["wget", "-q", "-T", "15", "-O", str(tmp), url],
            timeout=20,
        )
        # Measure before any cleanup so callers can report the real size.
        size = tmp.stat().st_size if tmp.exists() else 0
        saved = r.returncode == 0 and size > 5000
        if saved:
            tmp.replace(dest)
        return saved, r.returncode, size
    finally:
        # Runs on every exit path (including KeyboardInterrupt); after a
        # successful rename the temp file is already gone and this is a no-op.
        tmp.unlink(missing_ok=True)


def main():
    """Download every arXiv PDF referenced in PAPERS_JSON but absent from ARXIV_DIR.

    Loads the JSON catalogue, gathers all referenced arXiv IDs, compares them
    with the stems of the ``*.pdf`` files already in ARXIV_DIR and fetches the
    missing ones from ``https://arxiv.org/pdf/<id>.pdf``.  A per-paper
    OK / FAIL / ERR line and a final summary are printed to stdout.  A failed
    download never aborts the run; it is counted and the loop continues.
    """
    with open(PAPERS_JSON, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_ids = _collect_arxiv_ids(data)

    # wget does not create the output directory, so make sure it exists;
    # otherwise every download on a fresh checkout would fail.
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)

    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
    # Sorted for a reproducible download order; key=str keeps the sort safe
    # even if the catalogue mixes ID types.
    missing = sorted((aid for aid in arxiv_ids if aid not in cached), key=str)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")

    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for i, aid in enumerate(missing):
        if i:
            time.sleep(0.8)  # Be nice to arXiv: pause between requests only
        url = f"https://arxiv.org/pdf/{aid}.pdf"
        dest = ARXIV_DIR / f"{aid}.pdf"
        try:
            saved, rc, size = _fetch_pdf(url, dest)
        except Exception as e:
            # Deliberately broad: one bad ID (timeout, missing wget, I/O
            # error, ...) must not stop the rest of the backfill.
            fail += 1
            print(f"  ERR {aid}  {e}")
            continue
        if saved:
            ok += 1
            print(f"  OK  {aid}  ({size//1024} KB)")
        else:
            fail += 1
            print(f"  FAIL {aid}  (rc={rc}, sz={size})")

    print(f"\nDone: {ok} ok, {fail} failed")

# Run the backfill only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()