fix: 翻译 API 支持 arXiv + HF 两种论文源

This commit is contained in:
2026-06-02 12:30:38 +00:00
parent 06724077c0
commit 7f16e5decf
4 changed files with 44 additions and 24 deletions

View File

@@ -2,9 +2,7 @@ FROM python:3.11-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
poppler-utils \
libgl1-mesa-glx \
libglib2.0-0 \
libsm6 libxext6 libxrender-dev libgomp1 \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
@@ -18,8 +16,7 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Pre-download the layout model
RUN python3 -c "from pdf2zh.doclayout import OnnxModel; OnnxModel.from_pretrained()"
LABEL org.opencontainers.image.description="LLM 论文图书馆 + pdf2zh 翻译引擎"
VOLUME ["/app/papers", "/app/data"]
EXPOSE 8000

View File

@@ -489,23 +489,42 @@ import subprocess, threading
# Module-level lock for translation bookkeeping.
# NOTE(review): not acquired anywhere in the visible code — confirm it is used elsewhere.
_translate_lock = threading.Lock()
# IDs of papers whose translation job is currently running: an ID is added when
# the background job starts and discarded when it finishes or fails.
_translating = set()
@app.post("/api/translate/{arxiv_id}")
async def trigger_translation(arxiv_id: str):
"""Trigger pdf2zh translation for a paper (DeepSeek V4 Flash)"""
pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
if not pdf_path.exists():
raise HTTPException(status_code=404, detail="PDF not found")
@app.get("/api/translate/status")
def translation_status():
    """Report the IDs of papers whose translation is currently running.

    Returns:
        A dict with a single key, "translating", holding a list copy of the
        module-level in-progress ID set.
    """
    # Copy the set into a list so the response body is JSON-serializable.
    in_flight = list(_translating)
    return {"translating": in_flight}
@app.post("/api/translate/{paper_id}")
async def trigger_translation(paper_id: str):
"""Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).
paper_id can be arxiv ID (e.g. 1706.03762) or a HF filename."""
# Find the PDF
pdf_path = None
# Try arxiv
candidate = PAPERS_DIR / "arxiv" / f"{paper_id}.pdf"
if candidate.exists():
pdf_path = candidate
else:
# Try HF papers directory
hf_dir = PAPERS_DIR / "hf"
if hf_dir.exists():
for f in hf_dir.glob("*.pdf"):
if paper_id in f.stem:
pdf_path = f
break
out_path = TRANSLATED_DIR / f"{arxiv_id}.pdf"
if not pdf_path:
raise HTTPException(status_code=404, detail=f"PDF not found for {paper_id}")
out_path = TRANSLATED_DIR / f"{paper_id}.pdf"
if out_path.exists():
return {"arxiv_id": arxiv_id, "status": "already_translated"}
return {"paper_id": paper_id, "status": "already_translated"}
if arxiv_id in _translating:
return {"arxiv_id": arxiv_id, "status": "in_progress"}
if paper_id in _translating:
return {"paper_id": paper_id, "status": "in_progress"}
def do_translate():
try:
_translating.add(arxiv_id)
_translating.add(paper_id)
from pdf2zh.doclayout import OnnxModel
from pdf2zh.high_level import translate
model = OnnxModel.from_pretrained()
@@ -514,22 +533,22 @@ async def trigger_translation(arxiv_id: str):
lang_in='en', lang_out='zh',
service='deepseek', thread=4, model=model,
)
mono = TRANSLATED_DIR / f"{arxiv_id}-mono.pdf"
dual = TRANSLATED_DIR / f"{arxiv_id}-dual.pdf"
mono = TRANSLATED_DIR / f"{paper_id}-mono.pdf"
dual = TRANSLATED_DIR / f"{paper_id}-dual.pdf"
if mono.exists():
if out_path.exists():
out_path.unlink()
mono.rename(out_path)
if dual.exists():
dual.unlink()
log.info(f"Translated: {arxiv_id}")
log.info(f"Translated: {paper_id}")
except Exception as e:
log.error(f"Translation failed for {arxiv_id}: {e}")
log.error(f"Translation failed for {paper_id}: {e}")
finally:
_translating.discard(arxiv_id)
_translating.discard(paper_id)
ThreadPoolExecutor(max_workers=1).submit(do_translate)
return {"arxiv_id": arxiv_id, "status": "started"}
return {"paper_id": paper_id, "status": "started"}
@app.get("/api/translate/status")
def translation_status():

View File

@@ -151,7 +151,9 @@ function renderPaper(p) {
const links = [];
if (pdfUrl) links.push(`<button class="paper-link" data-pdf="${encodeURIComponent(pdfUrl)}" data-title="${encodeURIComponent(p.title)}" onclick="openPdfBtn(this)">📄 阅读</button>`);
else if (p.arxiv) links.push(`<a class="paper-link" href="https://arxiv.org/abs/${p.arxiv}" target="_blank">📋 arXiv</a>`);
if (p.arxiv) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${p.arxiv}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
// Show translation button for ALL papers with arxiv or pdf
const paperId = p.arxiv || (p.pdf ? p.pdf.split('/').pop().replace('.pdf','') : null);
if (paperId) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${paperId}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
return `<div class="paper-item"><div class="paper-year">${p.year||'—'}</div><div class="paper-body" id="${id}">
<div class="paper-title">${p.title}</div>
<div class="paper-meta"><span>${p.authors||''}</span>${p.venue?`<span class="paper-venue">${p.venue}</span>`:''}${tags}</div>

View File

@@ -183,7 +183,9 @@ function renderPaper(p) {
const links = [];
if (pdfUrl) links.push(`<button class="paper-link" data-pdf="${encodeURIComponent(pdfUrl)}" data-title="${encodeURIComponent(p.title)}" onclick="openPdfBtn(this)">📄 阅读</button>`);
else if (p.arxiv) links.push(`<a class="paper-link" href="https://arxiv.org/abs/${p.arxiv}" target="_blank">📋 arXiv</a>`);
if (p.arxiv) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${p.arxiv}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
// Show translation button for ALL papers with arxiv or pdf
const paperId = p.arxiv || (p.pdf ? p.pdf.split('/').pop().replace('.pdf','') : null);
if (paperId) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${paperId}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
return `<div class="paper-item"><div class="paper-year">${p.year||'—'}</div><div class="paper-body" id="${id}">
<div class="paper-title">${p.title}</div>
<div class="paper-meta"><span>${p.authors||''}</span>${p.venue?`<span class="paper-venue">${p.venue}</span>`:''}${tags}</div>