fix: 翻译 API 支持 arXiv + HF 两种论文源

This commit is contained in:
2026-06-02 12:30:38 +00:00
parent 06724077c0
commit 7f16e5decf
4 changed files with 44 additions and 24 deletions

View File

@@ -489,23 +489,42 @@ import subprocess, threading
# Lock intended for the translation endpoints. It is not acquired anywhere in
# the visible part of this file — NOTE(review): confirm where it is used.
_translate_lock = threading.Lock()
# IDs of papers whose translation is currently running. The background worker
# adds the ID when it starts and discards it in its `finally` clause, so an
# entry disappears whether the translation succeeds or fails.
_translating = set()
@app.post("/api/translate/{arxiv_id}")
async def trigger_translation(arxiv_id: str):
"""Trigger pdf2zh translation for a paper (DeepSeek V4 Flash)"""
pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
if not pdf_path.exists():
raise HTTPException(status_code=404, detail="PDF not found")
@app.get("/api/translate/status")
def translation_status():
    """Report the IDs of the papers that are being translated right now.

    Returns a JSON-serialisable dict with a single key, ``"translating"``,
    holding a list snapshot of the module-level ``_translating`` set.
    """
    # Copy into a list: a set is not JSON-serialisable, and the snapshot is
    # detached from the set the background worker keeps mutating.
    in_progress = list(_translating)
    return {"translating": in_progress}
@app.post("/api/translate/{paper_id}")
async def trigger_translation(paper_id: str):
"""Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).
paper_id can be arxiv ID (e.g. 1706.03762) or a HF filename."""
# Find the PDF
pdf_path = None
# Try arxiv
candidate = PAPERS_DIR / "arxiv" / f"{paper_id}.pdf"
if candidate.exists():
pdf_path = candidate
else:
# Try HF papers directory
hf_dir = PAPERS_DIR / "hf"
if hf_dir.exists():
for f in hf_dir.glob("*.pdf"):
if paper_id in f.stem:
pdf_path = f
break
out_path = TRANSLATED_DIR / f"{arxiv_id}.pdf"
if not pdf_path:
raise HTTPException(status_code=404, detail=f"PDF not found for {paper_id}")
out_path = TRANSLATED_DIR / f"{paper_id}.pdf"
if out_path.exists():
return {"arxiv_id": arxiv_id, "status": "already_translated"}
return {"paper_id": paper_id, "status": "already_translated"}
if arxiv_id in _translating:
return {"arxiv_id": arxiv_id, "status": "in_progress"}
if paper_id in _translating:
return {"paper_id": paper_id, "status": "in_progress"}
def do_translate():
try:
_translating.add(arxiv_id)
_translating.add(paper_id)
from pdf2zh.doclayout import OnnxModel
from pdf2zh.high_level import translate
model = OnnxModel.from_pretrained()
@@ -514,22 +533,22 @@ async def trigger_translation(arxiv_id: str):
lang_in='en', lang_out='zh',
service='deepseek', thread=4, model=model,
)
mono = TRANSLATED_DIR / f"{arxiv_id}-mono.pdf"
dual = TRANSLATED_DIR / f"{arxiv_id}-dual.pdf"
mono = TRANSLATED_DIR / f"{paper_id}-mono.pdf"
dual = TRANSLATED_DIR / f"{paper_id}-dual.pdf"
if mono.exists():
if out_path.exists():
out_path.unlink()
mono.rename(out_path)
if dual.exists():
dual.unlink()
log.info(f"Translated: {arxiv_id}")
log.info(f"Translated: {paper_id}")
except Exception as e:
log.error(f"Translation failed for {arxiv_id}: {e}")
log.error(f"Translation failed for {paper_id}: {e}")
finally:
_translating.discard(arxiv_id)
_translating.discard(paper_id)
ThreadPoolExecutor(max_workers=1).submit(do_translate)
return {"arxiv_id": arxiv_id, "status": "started"}
return {"paper_id": paper_id, "status": "started"}
@app.get("/api/translate/status")
def translation_status():