# llm-library/api/downloader.py

"""
LLM 论文图书馆 — PDF 下载器
从 arXiv 和 HuggingFace 下载论文 PDF 到本地缓存
"""

import os
import json
import time
import logging
from pathlib import Path

import httpx
from tqdm import tqdm

# Project root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent
# Paper catalogue read by collect_urls().
DATA_FILE = ROOT / "data" / "papers.json"
# Local cache directories, one per download source.
ARXIV_DIR = ROOT / "papers" / "arxiv"
HF_DIR = ROOT / "papers" / "hf"
LOG_FILE = ROOT / "papers" / "download.log"

# FileHandler opens the log file as soon as it is constructed, so the
# directory has to exist before logging is configured; otherwise importing
# this module fails with FileNotFoundError when "papers/" is absent.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

log = logging.getLogger("downloader")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        # Explicit UTF-8: log lines include paper titles and an em dash, which
        # a locale-dependent default encoding may be unable to represent.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)


def collect_urls() -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
    """Collect every PDF that should be downloaded, as listed in papers.json.

    Walks the ``areas`` of every top-level entry and, inside each area, the
    ``mainline``, ``branches`` and ``forward`` paper lists. Papers are
    de-duplicated by arXiv id and by PDF URL respectively; first-seen order
    is preserved.

    Returns:
        A pair ``(arxiv_list, hf_list)``. ``arxiv_list`` holds
        ``(arxiv_id, title)`` tuples. ``hf_list`` holds ``(url, filename)``
        tuples, where ``filename`` is the last path segment of the URL with
        ``.pdf`` removed.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    arxiv_seen: set[str] = set()
    hf_seen: set[str] = set()
    arxiv_list: list[tuple[str, str]] = []
    hf_list: list[tuple[str, str]] = []

    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for p in area.get(section, []):
                    arxiv_id = p.get("arxiv")
                    if arxiv_id and arxiv_id not in arxiv_seen:
                        arxiv_seen.add(arxiv_id)
                        arxiv_list.append((arxiv_id, p.get("title", "")))

                    pdf_url = p.get("pdf")
                    if pdf_url and pdf_url not in hf_seen:
                        hf_seen.add(pdf_url)
                        # Name the file after the URL path only: a "?query"
                        # or "#fragment" must not leak into the filename, and
                        # a trailing slash must not produce an empty name.
                        url_path = pdf_url.partition("#")[0].partition("?")[0]
                        last_segment = url_path.rstrip("/").split("/")[-1]
                        name = last_segment.replace(".pdf", "")
                        hf_list.append((pdf_url, name))

    return arxiv_list, hf_list


def download_arxiv(client: httpx.Client, arxiv_id: str, title: str) -> bool:
    """Download a single arXiv PDF into ARXIV_DIR.

    Args:
        client: HTTP client used to issue the request.
        arxiv_id: arXiv identifier; used in the URL and as the cached
            file's name.
        title: Paper title, used only in log messages (first 60 characters).

    Returns:
        True if the PDF is in the cache after the call (it was already there
        or was downloaded now); False if the request failed, the response
        did not look like a PDF, or the file could not be written.
    """
    pdf_path = ARXIV_DIR / f"{arxiv_id}.pdf"
    if pdf_path.exists():
        log.debug(f"Skip (exists): {arxiv_id}")
        return True

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # The existence of pdf_path is what marks a paper as cached, so the body
    # is written to a sibling ".part" file and renamed only once complete;
    # an interrupted write then never leaves a truncated "cached" PDF.
    part_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        resp = client.get(url, follow_redirects=True, timeout=30)
        resp.raise_for_status()

        # Verify it's actually a PDF (arxiv returns HTML for missing papers)
        content_type = resp.headers.get("content-type", "")
        if "pdf" not in content_type and not resp.content.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {arxiv_id} — {title[:60]}")
            return False

        # An id containing "/" maps to a nested path; make sure the target
        # directory exists before writing.
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        part_path.write_bytes(resp.content)
        part_path.replace(pdf_path)
        size_kb = len(resp.content) / 1024
        log.info(f"OK: {arxiv_id} ({size_kb:.0f} KB) — {title[:60]}")
        return True
    except httpx.HTTPError as e:
        log.error(f"HTTP error {arxiv_id}: {e}")
        return False
    except Exception as e:
        log.error(f"Error {arxiv_id}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            part_path.unlink(missing_ok=True)
        except OSError:
            pass
        return False


def download_hf(client: httpx.Client, url: str, filename: str) -> bool:
    """Download a single HuggingFace PDF into HF_DIR.

    Args:
        client: HTTP client used to issue the request.
        url: Address of the PDF.
        filename: Base name (without ``.pdf``) for the cached file; ``..``
            is removed and ``/`` is replaced by ``_`` before use.

    Returns:
        True if the PDF is in the cache after the call (it was already there
        or was downloaded now); False if the request failed, the body did
        not start with the ``%PDF`` signature, or the file could not be
        written.
    """
    safe_name = filename.replace("..", "").replace("/", "_")
    pdf_path = HF_DIR / f"{safe_name}.pdf"
    if pdf_path.exists():
        log.debug(f"Skip (exists): {safe_name}")
        return True

    # The existence of pdf_path is what marks a paper as cached, so the body
    # is written to a sibling ".part" file and renamed only once complete;
    # an interrupted write then never leaves a truncated "cached" PDF.
    part_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        resp = client.get(url, follow_redirects=True, timeout=60)
        resp.raise_for_status()

        if not resp.content.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {safe_name}")
            return False

        # Do not rely on the caller having created the cache directory.
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        part_path.write_bytes(resp.content)
        part_path.replace(pdf_path)
        size_kb = len(resp.content) / 1024
        log.info(f"OK (HF): {safe_name} ({size_kb:.0f} KB)")
        return True
    except httpx.HTTPError as e:
        log.error(f"HTTP error {safe_name}: {e}")
        return False
    except Exception as e:
        log.error(f"Error {safe_name}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            part_path.unlink(missing_ok=True)
        except OSError:
            pass
        return False


def _hf_cache_path(name: str) -> Path:
    """Return the cache file that download_hf() uses for *name*."""
    # Same sanitization as download_hf(), so both agree on the file location.
    safe_name = name.replace("..", "").replace("/", "_")
    return HF_DIR / f"{safe_name}.pdf"


def _download_batch(items, desc, cache_path_of, fetch, incremental, delay):
    """Download one list of papers and tally the outcome.

    Args:
        items: Entries to process, in order.
        desc: Label for the tqdm progress bar.
        cache_path_of: Maps an entry to the file its download is cached in.
        fetch: Downloads one entry and returns True on success.
        incremental: If True, entries whose cache file exists are skipped.
        delay: Seconds to sleep after each attempted download.

    Returns:
        ``(ok, fail, fetched_bytes)``. Skipped entries count as ok;
        ``fetched_bytes`` is the total size of the files downloaded here.
    """
    ok = fail = fetched_bytes = 0
    for item in tqdm(items, desc=desc):
        cache_path = cache_path_of(item)
        backup = None
        if cache_path.exists():
            if incremental:
                ok += 1
                continue
            # The download helpers return early when the target file exists,
            # so a forced refresh has to move the cached copy aside first.
            backup = cache_path.with_name(cache_path.name + ".bak")
            cache_path.replace(backup)

        success = False
        try:
            success = fetch(item)
        finally:
            if backup is not None:
                if success and cache_path.exists():
                    backup.unlink()
                else:
                    # Refresh failed or was interrupted: keep the old copy.
                    backup.replace(cache_path)

        if success:
            ok += 1
            if cache_path.exists():
                fetched_bytes += cache_path.stat().st_size
        else:
            fail += 1
        time.sleep(delay)

    return ok, fail, fetched_bytes


def run(incremental: bool = True, limit: int = 0, delay: float = 1.0):
    """Download every PDF listed in papers.json into the local cache.

    Args:
        incremental: True skips files that are already cached. False
            downloads them again; the previous copy is kept if the new
            download fails.
        limit: 0 processes everything. N > 0 processes only the first N
            arXiv entries and the first N HF entries (the limit is applied
            to each list separately).
        delay: Seconds to wait after each attempted download.
    """
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    HF_DIR.mkdir(parents=True, exist_ok=True)

    arxiv_list, hf_list = collect_urls()
    total = len(arxiv_list) + len(hf_list)
    log.info(f"Found {len(arxiv_list)} arXiv + {len(hf_list)} HF = {total} PDFs to download")
    log.info(f"Incremental: {incremental}, Delay: {delay}s")

    if not incremental:
        log.warning("Non-incremental mode: will re-download existing files")

    # Count existing (over the full lists, before any limit is applied)
    arxiv_existing = sum(1 for aid, _ in arxiv_list if (ARXIV_DIR / f"{aid}.pdf").exists())
    hf_existing = sum(1 for _, name in hf_list if _hf_cache_path(name).exists())
    log.info(f"Already cached: {arxiv_existing} arXiv + {hf_existing} HF")

    if limit > 0:
        arxiv_list = arxiv_list[:limit]
        hf_list = hf_list[:limit]

    with httpx.Client(
        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
        timeout=30,
        follow_redirects=True,
    ) as client:
        # arXiv entries are (arxiv_id, title)
        arxiv_ok, arxiv_fail, arxiv_bytes = _download_batch(
            arxiv_list,
            "arXiv",
            lambda item: ARXIV_DIR / f"{item[0]}.pdf",
            lambda item: download_arxiv(client, item[0], item[1]),
            incremental,
            delay,
        )
        # HF entries are (url, filename)
        hf_ok, hf_fail, hf_bytes = _download_batch(
            hf_list,
            "HF   ",
            lambda item: _hf_cache_path(item[1]),
            lambda item: download_hf(client, item[0], item[1]),
            incremental,
            delay,
        )

    ok = arxiv_ok + hf_ok
    fail = arxiv_fail + hf_fail
    total_size = arxiv_bytes + hf_bytes
    log.info(f"Done: {ok} OK, {fail} failed, {total_size/1024/1024:.1f} MB total")


if __name__ == "__main__":
    import argparse

    # Command-line entry point: map the flags onto run()'s keyword arguments.
    cli = argparse.ArgumentParser(description="下载论文 PDF 到本地缓存")
    cli.add_argument(
        "--no-incremental",
        action="store_true",
        help="重新下载所有 (默认跳过已有)",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=0,
        help="限制下载数量 (0=全部)",
    )
    cli.add_argument(
        "--delay",
        type=float,
        default=1.0,
        help="请求间延迟 (秒)",
    )
    opts = cli.parse_args()

    # The flag is phrased negatively on the CLI; run() takes the positive form.
    run(
        incremental=not opts.no_incremental,
        limit=opts.limit,
        delay=opts.delay,
    )