Files
llm-library/api/batch_translate.py
LaoWang b8bce5d00b feat: Docker内pdf2zh翻译跑通 + 批量翻译
- Dockerfile: libgl1-mesa-glx→libgl1, libglib2.0-0→libglib2.0-0t64
- batch_docker.py: 容器内批量翻译脚本(thread=2)
- 模型预下载到 models/ 目录纳入镜像
- 修复索引模板多次修改导致的混乱
- 底栏 spdis链接 + DeepSeek维护说明
2026-06-02 16:44:12 +00:00

76 lines
2.3 KiB
Python

#!/usr/bin/env python3
"""Batch-translate arXiv papers with pdf2zh and DeepSeek V4 Flash.

Usage: batch_translate.py [ARXIV_ID ...]

Reads the paper catalogue from DATA_FILE, collects every arXiv id it lists,
and translates PAPERS_DIR/arxiv/<id>.pdf into TRANSLATED_DIR/<id>.pdf.
Papers whose translated PDF already exists are skipped. When arXiv ids are
given on the command line, only those ids are processed.
"""
import json
import os
import sys
import time

PAPERS_DIR = "/app/papers"
TRANSLATED_DIR = os.path.join(PAPERS_DIR, "translated")
DATA_FILE = "/app/data/papers.json"

# Keys of an "area" entry whose values are lists of paper records.
PAPER_SECTIONS = ("mainline", "branches", "forward")


def collect_arxiv_ids(data: dict) -> list:
    """Return the arXiv ids listed in *data*, de-duplicated, in first-seen order.

    *data* maps module names to dicts holding an ``areas`` list; each area may
    hold paper lists under the PAPER_SECTIONS keys, and each paper may carry a
    non-empty ``arxiv`` id. Missing or empty keys are treated as "no papers".
    """
    ids = [
        paper["arxiv"]
        for mod in data.values()
        for area in (mod.get("areas") or [])
        for section in PAPER_SECTIONS
        for paper in (area.get(section) or [])
        if paper.get("arxiv")
    ]
    # The same paper can be listed in several sections/areas; translate it once.
    return list(dict.fromkeys(ids))


def finalize_output(arxiv_id: str, out_dir: str = TRANSLATED_DIR) -> bool:
    """Move ``<id>-mono.pdf`` to ``<id>.pdf`` and delete ``<id>-dual.pdf``.

    Returns True when the mono PDF existed (i.e. a translated file is now in
    place at ``<id>.pdf``), False when there was no mono PDF to move.
    """
    mono = os.path.join(out_dir, f"{arxiv_id}-mono.pdf")
    dual = os.path.join(out_dir, f"{arxiv_id}-dual.pdf")
    produced = os.path.exists(mono)
    if produced:
        # os.replace overwrites atomically, so a stale target cannot break the move.
        os.replace(mono, os.path.join(out_dir, f"{arxiv_id}.pdf"))
    if os.path.exists(dual):
        os.remove(dual)
    return produced


def main(argv=None) -> None:
    """Run the batch translation.

    *argv* is the list of arXiv ids to restrict the run to; it defaults to
    ``sys.argv[1:]``. An empty list means "every paper in DATA_FILE".
    """
    args = sys.argv[1:] if argv is None else list(argv)

    os.makedirs(TRANSLATED_DIR, exist_ok=True)

    # Make sure the DeepSeek settings exist in the environment (empty key and
    # "deepseek-chat" model as fallbacks), but do not hide a missing key.
    os.environ.setdefault("DEEPSEEK_API_KEY", "")
    os.environ.setdefault("DEEPSEEK_MODEL", "deepseek-chat")
    if not os.environ["DEEPSEEK_API_KEY"]:
        print("WARNING: DEEPSEEK_API_KEY is not set or empty", file=sys.stderr)

    # Explicit encoding: the container locale must not decide how JSON is read.
    with open(DATA_FILE, encoding="utf-8") as f:
        papers = collect_arxiv_ids(json.load(f))

    if args:
        target = set(args)
        papers = [p for p in papers if p in target]

    total = len(papers)
    done = 0
    skipped = 0
    translated = 0
    missing = []
    failed = []

    print(f"=== Batch translate {total} papers (DeepSeek V4 Flash) ===")

    model = None
    translate = None
    for arxiv_id in papers:
        done += 1
        out_path = os.path.join(TRANSLATED_DIR, f"{arxiv_id}.pdf")
        if os.path.exists(out_path):
            skipped += 1
            continue

        pdf_path = os.path.join(PAPERS_DIR, "arxiv", f"{arxiv_id}.pdf")
        if not os.path.exists(pdf_path):
            missing.append(arxiv_id)
            print(f"[{done}/{total}] {arxiv_id}: PDF not found")
            continue

        if model is None:
            # Import and load lazily: nothing is loaded when every paper is
            # skipped. Kept outside the try below so a load failure aborts the
            # run instead of being reported once per paper.
            from pdf2zh.doclayout import OnnxModel
            from pdf2zh.high_level import translate
            model = OnnxModel.from_pretrained()

        try:
            translate(
                [pdf_path], output=TRANSLATED_DIR,
                lang_in='en', lang_out='zh',
                service='deepseek', model=model,
            )
            if not finalize_output(arxiv_id):
                raise RuntimeError("no translated PDF was produced")
            translated += 1
            print(f"[{done}/{total}] {arxiv_id}: OK")
        except Exception as e:
            # Best-effort batch: record the failure and carry on with the rest.
            failed.append(arxiv_id)
            print(f"[{done}/{total}] {arxiv_id}: FAILED - {e}")
            time.sleep(5)  # extra back-off after an error
        time.sleep(1)  # short pause between papers

    print(f"\n=== Done: {translated} translated, {skipped} skipped, {len(failed)} failed ===")
    if missing:
        print(f"Missing PDFs: {missing}")
    if failed:
        print(f"Failed: {failed}")


if __name__ == "__main__":
    main()