- Dockerfile: libgl1-mesa-glx→libgl1, libglib2.0-0→libglib2.0-0t64 - batch_docker.py: 容器内批量翻译脚本(thread=2) - 模型预下载到 models/ 目录纳入镜像 - 修复索引模板多次修改导致的混乱 - 底栏 spdis链接 + DeepSeek维护说明
76 lines
2.3 KiB
Python
76 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
"""批量翻译 arXiv 论文 — pdf2zh + DeepSeek V4 Flash"""
|
|
import json, os, sys, time
|
|
|
|
PAPERS_DIR = "/app/papers"
TRANSLATED_DIR = os.path.join(PAPERS_DIR, "translated")
DATA_FILE = "/app/data/papers.json"

# Keys of each area entry that hold lists of paper records.
PAPER_SECTIONS = ("mainline", "branches", "forward")


def collect_arxiv_ids(data):
    """Return the arXiv IDs found in the paper index, in first-seen order.

    Args:
        data: Parsed contents of DATA_FILE: a mapping whose values are
            dicts with an optional ``areas`` list; each area may carry
            ``mainline`` / ``branches`` / ``forward`` lists of paper dicts.

    Returns:
        A list of the truthy ``arxiv`` values, with duplicates removed so a
        paper referenced from several sections is only processed once.
    """
    ids = []
    for module in data.values():
        for area in module.get("areas", []):
            for section in PAPER_SECTIONS:
                for paper in area.get(section, []):
                    arxiv_id = paper.get("arxiv")
                    if arxiv_id:
                        ids.append(arxiv_id)
    # dict preserves insertion order, so this de-duplicates stably.
    return list(dict.fromkeys(ids))


def finalize_output(arxiv_id, translated_dir=TRANSLATED_DIR):
    """Promote the translated PDF to its final name and drop the dual PDF.

    The script expects translate() to leave ``<id>-mono.pdf`` and
    ``<id>-dual.pdf`` in ``translated_dir``; only the mono file is kept,
    renamed to ``<id>.pdf``.

    Returns:
        True if the mono file existed and was renamed, False otherwise.
    """
    mono = os.path.join(translated_dir, f"{arxiv_id}-mono.pdf")
    dual = os.path.join(translated_dir, f"{arxiv_id}-dual.pdf")
    out_path = os.path.join(translated_dir, f"{arxiv_id}.pdf")

    produced = os.path.exists(mono)
    if produced:
        os.replace(mono, out_path)
    if os.path.exists(dual):
        os.remove(dual)
    return produced


def format_summary(translated, skipped, missing, failed):
    """Build the final report line from the per-outcome counters.

    Each outcome is counted separately so that missing PDFs and failures
    are not reported as translated papers.
    """
    return (
        f"\n=== Done: {translated} translated, {skipped} skipped, "
        f"{missing} missing, {len(failed)} failed ==="
    )


def main(argv=None):
    """Translate every indexed paper that does not have an output yet.

    Args:
        argv: Optional list of arXiv IDs restricting the run; defaults to
            the command-line arguments (``sys.argv[1:]``).
    """
    args = sys.argv[1:] if argv is None else list(argv)

    os.makedirs(TRANSLATED_DIR, exist_ok=True)

    # Make sure both variables exist in the environment before pdf2zh is
    # imported; values already set by the caller are left untouched.
    os.environ.setdefault("DEEPSEEK_API_KEY", "")
    os.environ.setdefault("DEEPSEEK_MODEL", "deepseek-chat")
    if not os.environ["DEEPSEEK_API_KEY"]:
        print("WARNING: DEEPSEEK_API_KEY is empty", file=sys.stderr)

    # Explicit encoding: the index must not depend on the container locale.
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    papers = collect_arxiv_ids(data)
    if args:
        wanted = set(args)
        papers = [p for p in papers if p in wanted]

    total = len(papers)
    translated = 0
    skipped = 0
    missing = 0
    failed = []

    # Imported here (not at module top) so the helpers above stay importable
    # without pdf2zh, and once rather than per paper so a broken install
    # aborts immediately instead of failing every paper in turn.
    from pdf2zh.doclayout import OnnxModel
    from pdf2zh.high_level import translate

    print(f"=== Batch translate {total} papers (DeepSeek V4 Flash) ===")
    model = OnnxModel.from_pretrained()

    for index, arxiv_id in enumerate(papers, start=1):
        out_path = os.path.join(TRANSLATED_DIR, f"{arxiv_id}.pdf")
        if os.path.exists(out_path):
            skipped += 1
            continue

        pdf_path = os.path.join(PAPERS_DIR, "arxiv", f"{arxiv_id}.pdf")
        if not os.path.exists(pdf_path):
            print(f"[{index}/{total}] {arxiv_id}: PDF not found")
            missing += 1
            continue

        try:
            translate(
                [pdf_path], output=TRANSLATED_DIR,
                lang_in='en', lang_out='zh',
                service='deepseek', model=model,
            )
            if not finalize_output(arxiv_id, TRANSLATED_DIR):
                # translate() returned but left nothing to publish.
                raise RuntimeError("no translated PDF was produced")
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            failed.append(arxiv_id)
            print(f"[{index}/{total}] {arxiv_id}: FAILED - {e}")
            time.sleep(5)  # extra back-off after an error
        else:
            translated += 1
            print(f"[{index}/{total}] {arxiv_id}: OK")

        time.sleep(1)  # pause between translation attempts

    print(format_summary(translated, skipped, missing, failed))
    if failed:
        print(f"Failed: {failed}")


if __name__ == "__main__":
    main()