feat: pdf2zh + DeepSeek V4 Flash 翻译集成
This commit is contained in:
16
Dockerfile
16
Dockerfile
@@ -1,16 +1,30 @@
|
|||||||
FROM python:3.11-slim

# System libraries installed for PDF handling and the layout model's runtime.
# libgl1 is used instead of libgl1-mesa-glx: the latter is only a transitional
# package and is no longer shipped by newer Debian releases, which breaks the
# build once the floating python:3.11-slim tag moves to such a base.
RUN apt-get update && apt-get install -y --no-install-recommends \
        poppler-utils \
        libgl1 \
        libglib2.0-0 \
        libsm6 libxext6 libxrender-dev libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install PyTorch CPU + pdf2zh
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir pdf2zh

# Pre-download the layout model.
# Done before the application source is copied so that this (slow) layer
# stays cached when only the source tree changes.
RUN python3 -c "from pdf2zh.doclayout import OnnxModel; OnnxModel.from_pretrained()"

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

VOLUME ["/app/papers", "/app/data"]
EXPOSE 8000
ENV PORT=8000
ENV LOG_LEVEL=info
# Default model name read by api/batch_translate.py; override at run time
# with `-e DEEPSEEK_MODEL=...`. DEEPSEEK_API_KEY must be supplied at run time.
ENV DEEPSEEK_MODEL=deepseek-chat

CMD ["sh", "-c", "python3 -m uvicorn api.server:app --host 0.0.0.0 --port ${PORT} --log-level ${LOG_LEVEL}"]
|
||||||
|
|||||||
75
api/batch_translate.py
Normal file
75
api/batch_translate.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
#!/usr/bin/env python3
"""Batch-translate arXiv papers with pdf2zh + DeepSeek V4 Flash.

Usage:
    python batch_translate.py              # every paper listed in DATA_FILE
    python batch_translate.py ID [ID ...]  # only the given arXiv IDs

For each arXiv ID found in DATA_FILE the script looks for
``PAPERS_DIR/arxiv/<id>.pdf``, runs pdf2zh on it and keeps the monolingual
result as ``TRANSLATED_DIR/<id>.pdf``. Papers that already have a translated
PDF are skipped, so the script can be re-run after a partial failure.
"""
import json
import os
import sys
import time

PAPERS_DIR = "/app/papers"
TRANSLATED_DIR = os.path.join(PAPERS_DIR, "translated")
DATA_FILE = "/app/data/papers.json"

# Keys of an "area" entry whose values are lists of paper records.
PAPER_SECTIONS = ("mainline", "branches", "forward")


def collect_arxiv_ids(data):
    """Return the arXiv IDs referenced in *data*, de-duplicated, in first-seen order.

    *data* is the parsed content of DATA_FILE: a mapping whose values may carry
    an ``areas`` list; each area may carry the lists named in PAPER_SECTIONS,
    whose items are dicts with an optional ``arxiv`` key. Records without a
    truthy ``arxiv`` value are ignored.
    """
    ordered = {}
    for module in data.values():
        for area in module.get("areas", []):
            for section in PAPER_SECTIONS:
                for paper in area.get(section, []):
                    arxiv_id = paper.get("arxiv")
                    if arxiv_id:
                        # dict keys keep insertion order and drop repeats, so a
                        # paper listed in several areas is handled only once.
                        ordered.setdefault(arxiv_id, None)
    return list(ordered)


def translate_one(arxiv_id, translate, model,
                  papers_dir=PAPERS_DIR, translated_dir=TRANSLATED_DIR):
    """Translate a single paper and return what happened.

    Args:
        arxiv_id: Identifier used to build the input and output file names.
        translate: Callable with the signature of ``pdf2zh.high_level.translate``.
        model: Layout model instance passed through to *translate*.
        papers_dir: Directory containing the ``arxiv/`` sub-directory of PDFs.
        translated_dir: Directory receiving the translated PDFs.

    Returns:
        ``"skipped"`` if the translated PDF already exists, ``"missing"`` if
        the source PDF is absent, ``"translated"`` on success.

    Raises:
        RuntimeError: *translate* returned but left no ``<id>-mono.pdf`` behind.
        Exception: Whatever *translate* itself raises.
    """
    out_path = os.path.join(translated_dir, f"{arxiv_id}.pdf")
    if os.path.exists(out_path):
        return "skipped"

    pdf_path = os.path.join(papers_dir, "arxiv", f"{arxiv_id}.pdf")
    if not os.path.exists(pdf_path):
        return "missing"

    translate(
        [pdf_path], output=translated_dir,
        lang_in='en', lang_out='zh',
        service='deepseek', thread=4, model=model,
    )

    mono = os.path.join(translated_dir, f"{arxiv_id}-mono.pdf")
    dual = os.path.join(translated_dir, f"{arxiv_id}-dual.pdf")
    # Only the monolingual PDF is kept; the bilingual one is discarded.
    if os.path.exists(dual):
        os.remove(dual)
    if not os.path.exists(mono):
        # Without this check a run that produced nothing was reported as OK.
        raise RuntimeError("pdf2zh produced no -mono.pdf output")
    os.replace(mono, out_path)
    return "translated"


def main(argv=None):
    """Run the batch translation; return a process exit code (1 if any paper failed)."""
    wanted = set(sys.argv[1:] if argv is None else argv)

    os.makedirs(TRANSLATED_DIR, exist_ok=True)

    # Default the model name, but never overwrite a missing API key with an
    # empty string: that only hides the misconfiguration until every request
    # fails. Warn instead and let pdf2zh resolve credentials its own way.
    os.environ.setdefault("DEEPSEEK_MODEL", "deepseek-chat")
    if not os.environ.get("DEEPSEEK_API_KEY"):
        print("WARNING: DEEPSEEK_API_KEY is not set; translation requests "
              "will probably fail", file=sys.stderr)

    with open(DATA_FILE, encoding="utf-8") as f:
        papers = collect_arxiv_ids(json.load(f))
    if wanted:
        papers = [p for p in papers if p in wanted]
    total = len(papers)

    # Imported here (not at module top) so the helpers above stay importable
    # without pdf2zh installed, and so the import runs once rather than per paper.
    from pdf2zh.doclayout import OnnxModel
    from pdf2zh.high_level import translate

    print(f"=== Batch translate {total} papers (DeepSeek V4 Flash) ===")
    model = OnnxModel.from_pretrained()

    counts = {"translated": 0, "skipped": 0, "missing": 0}
    failed = []
    for index, arxiv_id in enumerate(papers, start=1):
        try:
            status = translate_one(arxiv_id, translate, model)
        except Exception as e:  # one bad paper must not abort the whole batch
            failed.append(arxiv_id)
            print(f"[{index}/{total}] {arxiv_id}: FAILED - {e}")
            time.sleep(5)  # back off before the next paper
            continue

        counts[status] += 1
        if status == "missing":
            print(f"[{index}/{total}] {arxiv_id}: PDF not found")
        elif status == "translated":
            print(f"[{index}/{total}] {arxiv_id}: OK")
            time.sleep(1)  # brief pause between translations

    # Each outcome is counted separately; previously failed and missing
    # papers were folded into the "translated" figure.
    print(f"\n=== Done: {counts['translated']} translated, "
          f"{counts['skipped']} skipped, {counts['missing']} missing, "
          f"{len(failed)} failed ===")
    if failed:
        print(f"Failed: {failed}")
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -482,6 +482,59 @@ def serve_translated(arxiv_id: str):
|
|||||||
return FileResponse(fp, media_type="application/pdf",
|
return FileResponse(fp, media_type="application/pdf",
|
||||||
headers={"Content-Disposition": "inline"})
|
headers={"Content-Disposition": "inline"})
|
||||||
|
|
||||||
|
# ─── Routes: Trigger translation ───────────────────────
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
import subprocess, threading
|
||||||
|
|
||||||
|
_translate_lock = threading.Lock()
|
||||||
|
_translating = set()
|
||||||
|
|
||||||
|
@app.post("/api/translate/{arxiv_id}")
async def trigger_translation(arxiv_id: str):
    """Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).

    Starts the translation on a background thread and returns immediately.

    Returns a dict with ``arxiv_id`` and a ``status`` of:
      * ``"already_translated"`` - the translated PDF already exists;
      * ``"in_progress"`` - a translation for this ID is already registered;
      * ``"started"`` - a new background translation was submitted.

    Raises:
        HTTPException: 404 when the source PDF is not present.
    """
    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail="PDF not found")

    out_path = TRANSLATED_DIR / f"{arxiv_id}.pdf"
    if out_path.exists():
        return {"arxiv_id": arxiv_id, "status": "already_translated"}

    # Check-and-register must be one atomic step. Previously the ID was only
    # added once the worker thread started running, so two requests arriving
    # close together could both pass the check and translate the same paper.
    with _translate_lock:
        if arxiv_id in _translating:
            return {"arxiv_id": arxiv_id, "status": "in_progress"}
        _translating.add(arxiv_id)

    def do_translate():
        try:
            from pdf2zh.doclayout import OnnxModel
            from pdf2zh.high_level import translate
            model = OnnxModel.from_pretrained()
            translate(
                [str(pdf_path)], output=str(TRANSLATED_DIR),
                lang_in='en', lang_out='zh',
                service='deepseek', thread=4, model=model,
            )
            mono = TRANSLATED_DIR / f"{arxiv_id}-mono.pdf"
            dual = TRANSLATED_DIR / f"{arxiv_id}-dual.pdf"
            # Only the monolingual PDF is served; drop the bilingual one.
            if dual.exists():
                dual.unlink()
            if not mono.exists():
                # Do not report success when nothing was produced.
                log.error(f"Translation failed for {arxiv_id}: no output file produced")
                return
            # replace() overwrites an existing target in a single step.
            mono.replace(out_path)
            log.info(f"Translated: {arxiv_id}")
        except Exception as e:
            log.error(f"Translation failed for {arxiv_id}: {e}")
        finally:
            with _translate_lock:
                _translating.discard(arxiv_id)

    executor = ThreadPoolExecutor(max_workers=1)
    try:
        executor.submit(do_translate)
    except Exception:
        # The job never started, so nothing will clear the registration.
        with _translate_lock:
            _translating.discard(arxiv_id)
        raise
    finally:
        # Let the submitted job finish, then release the worker thread
        # instead of leaving one idle executor behind per request.
        executor.shutdown(wait=False)
    return {"arxiv_id": arxiv_id, "status": "started"}
|
||||||
|
|
||||||
|
@app.get("/api/translate/status")
def translation_status():
    """Report the arXiv IDs currently held in the ``_translating`` set."""
    in_flight = [*_translating]
    return {"translating": in_flight}
|
||||||
|
|
||||||
# ─── Health ─────────────────────────────────────────────
|
# ─── Health ─────────────────────────────────────────────
|
||||||
@app.get("/api/health")
|
@app.get("/api/health")
|
||||||
def health():
|
def health():
|
||||||
|
|||||||
BIN
papers/translated/1706.03762-dual.pdf
Normal file
BIN
papers/translated/1706.03762-dual.pdf
Normal file
Binary file not shown.
Binary file not shown.
@@ -5,4 +5,3 @@ pydantic>=2.10
|
|||||||
python-multipart>=0.0.19
|
python-multipart>=0.0.19
|
||||||
aiofiles>=24.0
|
aiofiles>=24.0
|
||||||
tqdm>=4.66
|
tqdm>=4.66
|
||||||
fpdf2>=2.7
|
|
||||||
|
|||||||
Reference in New Issue
Block a user