feat: pdf2zh + DeepSeek V4 Flash 翻译集成

This commit is contained in:
2026-06-02 12:14:59 +00:00
parent beae7102b8
commit 06724077c0
6 changed files with 143 additions and 2 deletions

View File

@@ -482,6 +482,59 @@ def serve_translated(arxiv_id: str):
return FileResponse(fp, media_type="application/pdf",
headers={"Content-Disposition": "inline"})
# ─── Routes: Trigger translation ───────────────────────
from concurrent.futures import ThreadPoolExecutor
import subprocess, threading
# Lock intended to guard `_translating`, which is shared between request
# handlers and the background translation threads.
_translate_lock = threading.Lock()
# arXiv ids whose translation is currently running.
_translating = set()
@app.post("/api/translate/{arxiv_id}")
async def trigger_translation(arxiv_id: str):
    """Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).

    The translation itself runs in a background worker thread; this handler
    returns immediately with a status describing what was done.

    Args:
        arxiv_id: Identifier used to locate the source PDF at
            ``PAPERS_DIR / "arxiv" / "<arxiv_id>.pdf"``.

    Returns:
        A dict with ``arxiv_id`` and ``status``, where ``status`` is one of
        ``"already_translated"``, ``"in_progress"`` or ``"started"``.

    Raises:
        HTTPException: 404 when the source PDF does not exist.
    """
    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail="PDF not found")
    out_path = TRANSLATED_DIR / f"{arxiv_id}.pdf"

    # Check and mark atomically: without the lock two concurrent requests for
    # the same paper could both pass the checks and start duplicate jobs.
    with _translate_lock:
        if out_path.exists():
            return {"arxiv_id": arxiv_id, "status": "already_translated"}
        if arxiv_id in _translating:
            return {"arxiv_id": arxiv_id, "status": "in_progress"}
        _translating.add(arxiv_id)

    def do_translate():
        """Run pdf2zh for this paper and move the result into place."""
        try:
            # Imported lazily so the heavy pdf2zh modules are only loaded
            # when a translation is actually requested.
            from pdf2zh.doclayout import OnnxModel
            from pdf2zh.high_level import translate

            model = OnnxModel.from_pretrained()
            translate(
                [str(pdf_path)], output=str(TRANSLATED_DIR),
                lang_in='en', lang_out='zh',
                service='deepseek', thread=4, model=model,
            )
            mono = TRANSLATED_DIR / f"{arxiv_id}-mono.pdf"
            dual = TRANSLATED_DIR / f"{arxiv_id}-dual.pdf"
            # Only the monolingual output is kept; the bilingual one is dropped.
            if dual.exists():
                dual.unlink()
            if mono.exists():
                # Path.replace overwrites an existing target in one step.
                mono.replace(out_path)
                log.info(f"Translated: {arxiv_id}")
            else:
                log.error(f"Translation produced no output for {arxiv_id}")
        except Exception as e:
            # Top-level boundary of the worker thread: log with traceback
            # instead of letting the error vanish inside the executor.
            log.exception(f"Translation failed for {arxiv_id}: {e}")
        finally:
            # Released under the same lock the handler uses for its checks,
            # so a request never observes "no output yet" and "not running"
            # for a job that is just finishing.
            with _translate_lock:
                _translating.discard(arxiv_id)

    executor = ThreadPoolExecutor(max_workers=1)
    try:
        executor.submit(do_translate)
    except BaseException:
        # Scheduling failed, so the worker's ``finally`` will never run;
        # undo the in-flight mark here.
        with _translate_lock:
            _translating.discard(arxiv_id)
        raise
    finally:
        # Do not wait for the job; this only lets the executor free its
        # resources once the submitted work has finished.
        executor.shutdown(wait=False)
    return {"arxiv_id": arxiv_id, "status": "started"}
@app.get("/api/translate/status")
def translation_status():
    """Report which papers are currently being translated.

    Returns:
        A dict whose ``translating`` key holds a list snapshot of the
        in-flight set ``_translating`` (in set iteration order).
    """
    in_flight = [*_translating]
    return {"translating": in_flight}
# ─── Health ─────────────────────────────────────────────
@app.get("/api/health")
def health():