diff --git a/Dockerfile b/Dockerfile
index 98566b6..8e7d53a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,33 @@
 FROM python:3.11-slim
 
-RUN apt-get update && apt-get install -y --no-install-recommends poppler-utils && rm -rf /var/lib/apt/lists/*
+# libgl1 is used instead of libgl1-mesa-glx: the latter is a transitional
+# package that newer Debian releases no longer ship.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    poppler-utils \
+    libgl1 \
+    libglib2.0-0 \
+    libsm6 libxext6 libxrender-dev libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
 
+# Install PyTorch CPU + pdf2zh
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir pdf2zh
+
+# Pre-download the layout model before the source tree is copied, so this
+# layer stays cached when only application code changes.
+RUN python3 -c "from pdf2zh.doclayout import OnnxModel; OnnxModel.from_pretrained()"
+
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+
 COPY . .
 
 VOLUME ["/app/papers", "/app/data"]
 EXPOSE 8000
 
 ENV PORT=8000
 ENV LOG_LEVEL=info
+ENV DEEPSEEK_MODEL=deepseek-chat
 CMD ["sh", "-c", "python3 -m uvicorn api.server:app --host 0.0.0.0 --port ${PORT} --log-level ${LOG_LEVEL}"]
diff --git a/api/batch_translate.py b/api/batch_translate.py
new file mode 100644
index 0000000..0007fda
--- /dev/null
+++ b/api/batch_translate.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""Batch-translate arXiv papers — pdf2zh + DeepSeek V4 Flash.
+
+Usage: batch_translate.py [ARXIV_ID ...]
+
+Collects the arXiv ids listed in DATA_FILE and translates each paper whose
+source PDF exists at PAPERS_DIR/arxiv/<id>.pdf, keeping the "-mono" output
+as TRANSLATED_DIR/<id>.pdf. Papers that already have an output file are
+skipped. When ids are given on the command line, only those are processed.
+"""
+import json, os, sys, time
+
+PAPERS_DIR = "/app/papers"
+TRANSLATED_DIR = os.path.join(PAPERS_DIR, "translated")
+DATA_FILE = "/app/data/papers.json"
+os.makedirs(TRANSLATED_DIR, exist_ok=True)
+
+# Set DeepSeek credentials; values already present in the environment win.
+os.environ.setdefault("DEEPSEEK_API_KEY", "")
+os.environ.setdefault("DEEPSEEK_MODEL", "deepseek-chat")
+if not os.environ["DEEPSEEK_API_KEY"]:
+    # Warn up front rather than only surfacing this as one failure per paper.
+    print("WARNING: DEEPSEEK_API_KEY is empty; translations will probably fail",
+          file=sys.stderr)
+
+# Load paper list
+with open(DATA_FILE, encoding="utf-8") as f:
+    data = json.load(f)
+
+papers = []
+for mod in data.values():
+    for area in mod.get('areas', []):
+        for s in ('mainline', 'branches', 'forward'):
+            for p in area.get(s, []):
+                if p.get('arxiv'):
+                    papers.append(p['arxiv'])
+# Drop repeats (order preserved) so a paper listed in several places is
+# processed and counted once.
+papers = list(dict.fromkeys(papers))
+
+if len(sys.argv) > 1:
+    target = set(sys.argv[1:])
+    papers = [p for p in papers if p in target]
+
+total = len(papers)
+# done drives the [n/total] progress prefix; the others feed the summary.
+done = 0; translated = 0; skipped = 0; missing = []; failed = []
+
+from pdf2zh.doclayout import OnnxModel
+from pdf2zh.high_level import translate
+
+print(f"=== Batch translate {total} papers (DeepSeek V4 Flash) ===")
+model = OnnxModel.from_pretrained()
+
+for arxiv_id in papers:
+    done += 1
+    out_path = os.path.join(TRANSLATED_DIR, f"{arxiv_id}.pdf")
+    if os.path.exists(out_path):
+        skipped += 1
+        continue
+
+    pdf_path = os.path.join(PAPERS_DIR, "arxiv", f"{arxiv_id}.pdf")
+    if not os.path.exists(pdf_path):
+        missing.append(arxiv_id)
+        print(f"[{done}/{total}] {arxiv_id}: PDF not found")
+        continue
+
+    try:
+        translate(
+            [pdf_path], output=TRANSLATED_DIR,
+            lang_in='en', lang_out='zh',
+            service='deepseek', thread=4, model=model,
+        )
+        mono = os.path.join(TRANSLATED_DIR, f"{arxiv_id}-mono.pdf")
+        dual = os.path.join(TRANSLATED_DIR, f"{arxiv_id}-dual.pdf")
+        if os.path.exists(dual):
+            os.remove(dual)
+        if not os.path.exists(mono):
+            # Without the mono file there is nothing to publish, so do not
+            # report success.
+            raise FileNotFoundError(f"no translated output at {mono}")
+        os.replace(mono, out_path)
+        translated += 1
+        print(f"[{done}/{total}] {arxiv_id}: OK")
+    except Exception as e:
+        failed.append(arxiv_id)
+        print(f"[{done}/{total}] {arxiv_id}: FAILED - {e}")
+        time.sleep(5)
+
+    time.sleep(1)
+
+print(f"\n=== Done: {translated} translated, {skipped} skipped, {len(failed)} failed ===")
+if missing:
+    print(f"PDF not found: {missing}")
+if failed:
+    print(f"Failed: {failed}")
diff --git a/api/server.py b/api/server.py
index 3455571..3b4932e 100644
--- a/api/server.py
+++ b/api/server.py
@@ -482,6 +482,84 @@ def serve_translated(arxiv_id: str):
     return FileResponse(fp, media_type="application/pdf",
                         headers={"Content-Disposition": "inline"})
 
+# ─── Routes: Trigger translation ───────────────────────
+from concurrent.futures import ThreadPoolExecutor
+import subprocess, threading
+
+# _translate_lock guards _translating, which is shared between request
+# handlers and the worker thread.
+_translate_lock = threading.Lock()
+_translating = set()
+# One shared single-worker pool, so translations run one at a time instead of
+# each request spinning up its own executor that is never shut down.
+_translate_executor = ThreadPoolExecutor(max_workers=1)
+
+@app.post("/api/translate/{arxiv_id}")
+async def trigger_translation(arxiv_id: str):
+    """Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).
+
+    Responds with ``{"arxiv_id": ..., "status": ...}`` where status is
+    ``already_translated``, ``in_progress`` (queued or running) or
+    ``started``. Raises a 404 HTTPException when the source PDF is missing.
+    The translation runs on a background thread after this returns.
+    """
+    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    if not pdf_path.exists():
+        raise HTTPException(status_code=404, detail="PDF not found")
+
+    out_path = TRANSLATED_DIR / f"{arxiv_id}.pdf"
+    if out_path.exists():
+        return {"arxiv_id": arxiv_id, "status": "already_translated"}
+
+    # Test and reserve in one locked step so two concurrent requests for the
+    # same paper cannot both start a translation.
+    with _translate_lock:
+        if arxiv_id in _translating:
+            return {"arxiv_id": arxiv_id, "status": "in_progress"}
+        _translating.add(arxiv_id)
+
+    def do_translate():
+        try:
+            from pdf2zh.doclayout import OnnxModel
+            from pdf2zh.high_level import translate
+            model = OnnxModel.from_pretrained()
+            translate(
+                [str(pdf_path)], output=str(TRANSLATED_DIR),
+                lang_in='en', lang_out='zh',
+                service='deepseek', thread=4, model=model,
+            )
+            mono = TRANSLATED_DIR / f"{arxiv_id}-mono.pdf"
+            dual = TRANSLATED_DIR / f"{arxiv_id}-dual.pdf"
+            if dual.exists():
+                dual.unlink()
+            if mono.exists():
+                # replace() overwrites an existing out_path in a single step.
+                mono.replace(out_path)
+                log.info(f"Translated: {arxiv_id}")
+            else:
+                log.error(f"Translation failed for {arxiv_id}: no output at {mono}")
+        except Exception:
+            # Top-level boundary of the worker: log with traceback, never raise.
+            log.exception(f"Translation failed for {arxiv_id}")
+        finally:
+            with _translate_lock:
+                _translating.discard(arxiv_id)
+
+    try:
+        _translate_executor.submit(do_translate)
+    except Exception:
+        # Nothing was queued, so release the reservation before propagating.
+        with _translate_lock:
+            _translating.discard(arxiv_id)
+        raise
+    return {"arxiv_id": arxiv_id, "status": "started"}
+
+@app.get("/api/translate/status")
+def translation_status():
+    """Return the arXiv ids whose translation is queued or running."""
+    with _translate_lock:
+        return {"translating": sorted(_translating)}
+
 # ─── Health ─────────────────────────────────────────────
 @app.get("/api/health")
 def health():
diff --git a/papers/translated/1706.03762-dual.pdf b/papers/translated/1706.03762-dual.pdf
new file mode 100644
index 0000000..a00dfd1
Binary files /dev/null and b/papers/translated/1706.03762-dual.pdf differ
diff --git a/papers/translated/1706.03762.pdf b/papers/translated/1706.03762.pdf
index 8b194ba..8692721 100644
Binary files a/papers/translated/1706.03762.pdf and b/papers/translated/1706.03762.pdf differ
diff --git a/requirements.txt b/requirements.txt
index a4ff6e3..68a2bdf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,3 @@ pydantic>=2.10
 python-multipart>=0.0.19
 aiofiles>=24.0
 tqdm>=4.66
-fpdf2>=2.7