feat: pdf2zh + DeepSeek V4 Flash 翻译集成
This commit is contained in:
75
api/batch_translate.py
Normal file
75
api/batch_translate.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python3
"""Batch-translate arXiv papers with pdf2zh and the DeepSeek service.

Usage:
    batch_translate.py [ARXIV_ID ...]

The paper catalogue is read from DATA_FILE. Every catalogued paper whose
source PDF exists at PAPERS_DIR/arxiv/<arxiv_id>.pdf is translated from
English to Chinese and stored as TRANSLATED_DIR/<arxiv_id>.pdf. Papers that
already have a translated file are skipped. When arXiv IDs are passed on the
command line, only those catalogued papers are processed.
"""
import json
import os
import sys
import time

PAPERS_DIR = "/app/papers"
TRANSLATED_DIR = os.path.join(PAPERS_DIR, "translated")
DATA_FILE = "/app/data/papers.json"

# Keys of an "area" entry in the catalogue that hold lists of papers.
SECTIONS = ("mainline", "branches", "forward")


def collect_arxiv_ids(data):
    """Return the arXiv IDs found in the catalogue, in first-seen order.

    Args:
        data: Parsed catalogue, a mapping whose values are dicts with an
            optional ``areas`` list. Each area may contain the lists named
            in SECTIONS, whose items are dicts with an optional ``arxiv`` key.

    Returns:
        A list of arXiv IDs without duplicates. An ID listed in several
        sections is returned once so that it is only translated once.
    """
    ordered = []
    seen = set()
    for module in data.values():
        for area in module.get("areas", []):
            for section in SECTIONS:
                for paper in area.get(section, []):
                    arxiv_id = paper.get("arxiv")
                    if arxiv_id and arxiv_id not in seen:
                        seen.add(arxiv_id)
                        ordered.append(arxiv_id)
    return ordered


def filter_requested(arxiv_ids, requested):
    """Restrict ``arxiv_ids`` to the IDs named in ``requested``.

    Args:
        arxiv_ids: Candidate IDs, in processing order.
        requested: IDs asked for on the command line; empty means "all".

    Returns:
        A new list that keeps the order of ``arxiv_ids``.
    """
    if not requested:
        return list(arxiv_ids)
    wanted = set(requested)
    return [arxiv_id for arxiv_id in arxiv_ids if arxiv_id in wanted]


def run_batch(arxiv_ids, translate_pdf, *, papers_dir=PAPERS_DIR,
              translated_dir=TRANSLATED_DIR, pause=1.0, failure_pause=5.0):
    """Translate each paper and report what happened to it.

    Args:
        arxiv_ids: IDs to process, in order.
        translate_pdf: Callable ``(pdf_path, output_dir)`` that performs the
            translation. It is expected to leave ``<arxiv_id>-mono.pdf`` (and
            possibly ``<arxiv_id>-dual.pdf``) in ``output_dir``.
        papers_dir: Directory containing the ``arxiv`` folder of source PDFs.
        translated_dir: Directory receiving ``<arxiv_id>.pdf`` results.
        pause: Seconds to sleep after every translation attempt.
        failure_pause: Extra seconds to sleep after a failed attempt.

    Returns:
        A dict with the keys ``translated``, ``skipped``, ``missing`` and
        ``failed``, each holding the list of arXiv IDs in that state.
    """
    total = len(arxiv_ids)
    outcome = {"translated": [], "skipped": [], "missing": [], "failed": []}

    for position, arxiv_id in enumerate(arxiv_ids, start=1):
        out_path = os.path.join(translated_dir, f"{arxiv_id}.pdf")
        if os.path.exists(out_path):
            # Already translated by an earlier run.
            outcome["skipped"].append(arxiv_id)
            continue

        pdf_path = os.path.join(papers_dir, "arxiv", f"{arxiv_id}.pdf")
        if not os.path.exists(pdf_path):
            print(f"[{position}/{total}] {arxiv_id}: PDF not found")
            outcome["missing"].append(arxiv_id)
            continue

        mono = os.path.join(translated_dir, f"{arxiv_id}-mono.pdf")
        dual = os.path.join(translated_dir, f"{arxiv_id}-dual.pdf")
        try:
            translate_pdf(pdf_path, translated_dir)
            # Only the translated-only file is kept; drop the bilingual one.
            if os.path.exists(dual):
                os.remove(dual)
            if not os.path.exists(mono):
                # Without this check the paper would be reported as OK while
                # no output exists, and it would be retried on every run.
                raise FileNotFoundError(f"expected output {mono} was not produced")
            os.replace(mono, out_path)
        except Exception as exc:
            # Deliberately broad: one bad paper must not abort the batch.
            outcome["failed"].append(arxiv_id)
            print(f"[{position}/{total}] {arxiv_id}: FAILED - {exc}")
            time.sleep(failure_pause)
        else:
            outcome["translated"].append(arxiv_id)
            print(f"[{position}/{total}] {arxiv_id}: OK")

        # Short pause between papers, on top of any failure back-off.
        time.sleep(pause)

    return outcome


def format_summary(outcome):
    """Return the final summary line for a ``run_batch`` result."""
    return (
        f"\n=== Done: {len(outcome['translated'])} translated, "
        f"{len(outcome['skipped'])} skipped, "
        f"{len(outcome['failed'])} failed ==="
    )


def main(argv=None):
    """Run the batch translation.

    Args:
        argv: Optional list of arXiv IDs; defaults to ``sys.argv[1:]``.
    """
    requested = sys.argv[1:] if argv is None else list(argv)

    os.makedirs(TRANSLATED_DIR, exist_ok=True)

    # DeepSeek credentials: keep values already in the environment, otherwise
    # fall back to an empty key and the default model name.
    os.environ.setdefault("DEEPSEEK_API_KEY", "")
    os.environ.setdefault("DEEPSEEK_MODEL", "deepseek-chat")
    if not os.environ["DEEPSEEK_API_KEY"]:
        print("warning: DEEPSEEK_API_KEY is not set; translations may fail",
              file=sys.stderr)

    # Load paper list. The encoding is explicit so that parsing does not
    # depend on the locale of the environment the script runs in.
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)
    papers = filter_requested(collect_arxiv_ids(data), requested)

    # Imported here rather than at module level so the helpers above can be
    # imported without pdf2zh installed, and imported once instead of on
    # every loop iteration.
    from pdf2zh.doclayout import OnnxModel
    from pdf2zh.high_level import translate

    print(f"=== Batch translate {len(papers)} papers (DeepSeek V4 Flash) ===")
    model = OnnxModel.from_pretrained()

    def translate_pdf(pdf_path, output_dir):
        translate(
            [pdf_path], output=output_dir,
            lang_in='en', lang_out='zh',
            service='deepseek', thread=4, model=model,
        )

    outcome = run_batch(papers, translate_pdf)

    print(format_summary(outcome))
    if outcome["failed"]:
        print(f"Failed: {outcome['failed']}")
    if outcome["missing"]:
        print(f"Missing PDFs: {outcome['missing']}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user