fix: 翻译 API 支持 arXiv + HF 两种论文源
This commit is contained in:
@@ -2,9 +2,7 @@ FROM python:3.11-slim
|
|||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
libgl1-mesa-glx \
|
libgomp1 \
|
||||||
libglib2.0-0 \
|
|
||||||
libsm6 libxext6 libxrender-dev libgomp1 \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
@@ -18,8 +16,7 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# Pre-download the layout model
|
LABEL org.opencontainers.image.description="LLM 论文图书馆 + pdf2zh 翻译引擎"
|
||||||
RUN python3 -c "from pdf2zh.doclayout import OnnxModel; OnnxModel.from_pretrained()"
|
|
||||||
|
|
||||||
VOLUME ["/app/papers", "/app/data"]
|
VOLUME ["/app/papers", "/app/data"]
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|||||||
@@ -489,23 +489,42 @@ import subprocess, threading
|
|||||||
_translate_lock = threading.Lock()
|
_translate_lock = threading.Lock()
|
||||||
_translating = set()
|
_translating = set()
|
||||||
|
|
||||||
@app.post("/api/translate/{arxiv_id}")
|
@app.get("/api/translate/status")
|
||||||
async def trigger_translation(arxiv_id: str):
|
def translation_status():
|
||||||
"""Trigger pdf2zh translation for a paper (DeepSeek V4 Flash)"""
|
return {"translating": list(_translating)}
|
||||||
pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
|
|
||||||
if not pdf_path.exists():
|
@app.post("/api/translate/{paper_id}")
|
||||||
raise HTTPException(status_code=404, detail="PDF not found")
|
async def trigger_translation(paper_id: str):
|
||||||
|
"""Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).
|
||||||
|
paper_id can be arxiv ID (e.g. 1706.03762) or a HF filename."""
|
||||||
|
# Find the PDF
|
||||||
|
pdf_path = None
|
||||||
|
# Try arxiv
|
||||||
|
candidate = PAPERS_DIR / "arxiv" / f"{paper_id}.pdf"
|
||||||
|
if candidate.exists():
|
||||||
|
pdf_path = candidate
|
||||||
|
else:
|
||||||
|
# Try HF papers directory
|
||||||
|
hf_dir = PAPERS_DIR / "hf"
|
||||||
|
if hf_dir.exists():
|
||||||
|
for f in hf_dir.glob("*.pdf"):
|
||||||
|
if paper_id in f.stem:
|
||||||
|
pdf_path = f
|
||||||
|
break
|
||||||
|
|
||||||
out_path = TRANSLATED_DIR / f"{arxiv_id}.pdf"
|
if not pdf_path:
|
||||||
|
raise HTTPException(status_code=404, detail=f"PDF not found for {paper_id}")
|
||||||
|
|
||||||
|
out_path = TRANSLATED_DIR / f"{paper_id}.pdf"
|
||||||
if out_path.exists():
|
if out_path.exists():
|
||||||
return {"arxiv_id": arxiv_id, "status": "already_translated"}
|
return {"paper_id": paper_id, "status": "already_translated"}
|
||||||
|
|
||||||
if arxiv_id in _translating:
|
if paper_id in _translating:
|
||||||
return {"arxiv_id": arxiv_id, "status": "in_progress"}
|
return {"paper_id": paper_id, "status": "in_progress"}
|
||||||
|
|
||||||
def do_translate():
|
def do_translate():
|
||||||
try:
|
try:
|
||||||
_translating.add(arxiv_id)
|
_translating.add(paper_id)
|
||||||
from pdf2zh.doclayout import OnnxModel
|
from pdf2zh.doclayout import OnnxModel
|
||||||
from pdf2zh.high_level import translate
|
from pdf2zh.high_level import translate
|
||||||
model = OnnxModel.from_pretrained()
|
model = OnnxModel.from_pretrained()
|
||||||
@@ -514,22 +533,22 @@ async def trigger_translation(arxiv_id: str):
|
|||||||
lang_in='en', lang_out='zh',
|
lang_in='en', lang_out='zh',
|
||||||
service='deepseek', thread=4, model=model,
|
service='deepseek', thread=4, model=model,
|
||||||
)
|
)
|
||||||
mono = TRANSLATED_DIR / f"{arxiv_id}-mono.pdf"
|
mono = TRANSLATED_DIR / f"{paper_id}-mono.pdf"
|
||||||
dual = TRANSLATED_DIR / f"{arxiv_id}-dual.pdf"
|
dual = TRANSLATED_DIR / f"{paper_id}-dual.pdf"
|
||||||
if mono.exists():
|
if mono.exists():
|
||||||
if out_path.exists():
|
if out_path.exists():
|
||||||
out_path.unlink()
|
out_path.unlink()
|
||||||
mono.rename(out_path)
|
mono.rename(out_path)
|
||||||
if dual.exists():
|
if dual.exists():
|
||||||
dual.unlink()
|
dual.unlink()
|
||||||
log.info(f"Translated: {arxiv_id}")
|
log.info(f"Translated: {paper_id}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Translation failed for {arxiv_id}: {e}")
|
log.error(f"Translation failed for {paper_id}: {e}")
|
||||||
finally:
|
finally:
|
||||||
_translating.discard(arxiv_id)
|
_translating.discard(paper_id)
|
||||||
|
|
||||||
ThreadPoolExecutor(max_workers=1).submit(do_translate)
|
ThreadPoolExecutor(max_workers=1).submit(do_translate)
|
||||||
return {"arxiv_id": arxiv_id, "status": "started"}
|
return {"paper_id": paper_id, "status": "started"}
|
||||||
|
|
||||||
@app.get("/api/translate/status")
|
@app.get("/api/translate/status")
|
||||||
def translation_status():
|
def translation_status():
|
||||||
|
|||||||
@@ -151,7 +151,9 @@ function renderPaper(p) {
|
|||||||
const links = [];
|
const links = [];
|
||||||
if (pdfUrl) links.push(`<button class="paper-link" data-pdf="${encodeURIComponent(pdfUrl)}" data-title="${encodeURIComponent(p.title)}" onclick="openPdfBtn(this)">📄 阅读</button>`);
|
if (pdfUrl) links.push(`<button class="paper-link" data-pdf="${encodeURIComponent(pdfUrl)}" data-title="${encodeURIComponent(p.title)}" onclick="openPdfBtn(this)">📄 阅读</button>`);
|
||||||
else if (p.arxiv) links.push(`<a class="paper-link" href="https://arxiv.org/abs/${p.arxiv}" target="_blank">📋 arXiv</a>`);
|
else if (p.arxiv) links.push(`<a class="paper-link" href="https://arxiv.org/abs/${p.arxiv}" target="_blank">📋 arXiv</a>`);
|
||||||
if (p.arxiv) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${p.arxiv}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
|
// Show translation button for ALL papers with arxiv or pdf
|
||||||
|
const paperId = p.arxiv || (p.pdf ? p.pdf.split('/').pop().replace('.pdf','') : null);
|
||||||
|
if (paperId) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${paperId}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
|
||||||
return `<div class="paper-item"><div class="paper-year">${p.year||'—'}</div><div class="paper-body" id="${id}">
|
return `<div class="paper-item"><div class="paper-year">${p.year||'—'}</div><div class="paper-body" id="${id}">
|
||||||
<div class="paper-title">${p.title}</div>
|
<div class="paper-title">${p.title}</div>
|
||||||
<div class="paper-meta"><span>${p.authors||''}</span>${p.venue?`<span class="paper-venue">${p.venue}</span>`:''}${tags}</div>
|
<div class="paper-meta"><span>${p.authors||''}</span>${p.venue?`<span class="paper-venue">${p.venue}</span>`:''}${tags}</div>
|
||||||
|
|||||||
@@ -183,7 +183,9 @@ function renderPaper(p) {
|
|||||||
const links = [];
|
const links = [];
|
||||||
if (pdfUrl) links.push(`<button class="paper-link" data-pdf="${encodeURIComponent(pdfUrl)}" data-title="${encodeURIComponent(p.title)}" onclick="openPdfBtn(this)">📄 阅读</button>`);
|
if (pdfUrl) links.push(`<button class="paper-link" data-pdf="${encodeURIComponent(pdfUrl)}" data-title="${encodeURIComponent(p.title)}" onclick="openPdfBtn(this)">📄 阅读</button>`);
|
||||||
else if (p.arxiv) links.push(`<a class="paper-link" href="https://arxiv.org/abs/${p.arxiv}" target="_blank">📋 arXiv</a>`);
|
else if (p.arxiv) links.push(`<a class="paper-link" href="https://arxiv.org/abs/${p.arxiv}" target="_blank">📋 arXiv</a>`);
|
||||||
if (p.arxiv) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${p.arxiv}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
|
// Show translation button for ALL papers with arxiv or pdf
|
||||||
|
const paperId = p.arxiv || (p.pdf ? p.pdf.split('/').pop().replace('.pdf','') : null);
|
||||||
|
if (paperId) links.push(`<a class="paper-link trans-btn" href="/papers/translated/${paperId}.pdf" target="_blank" title="中文译文">📖 译文</a>`);
|
||||||
return `<div class="paper-item"><div class="paper-year">${p.year||'—'}</div><div class="paper-body" id="${id}">
|
return `<div class="paper-item"><div class="paper-year">${p.year||'—'}</div><div class="paper-body" id="${id}">
|
||||||
<div class="paper-title">${p.title}</div>
|
<div class="paper-title">${p.title}</div>
|
||||||
<div class="paper-meta"><span>${p.authors||''}</span>${p.venue?`<span class="paper-venue">${p.venue}</span>`:''}${tags}</div>
|
<div class="paper-meta"><span>${p.authors||''}</span>${p.venue?`<span class="paper-venue">${p.venue}</span>`:''}${tags}</div>
|
||||||
|
|||||||
Reference in New Issue
Block a user