#!/usr/bin/env python3
"""Batch-translate arXiv papers with pdf2zh + DeepSeek V4 Flash.

Run without arguments to translate every paper listed in ``DATA_FILE``;
pass one or more arXiv ids as arguments to restrict the run to those papers.
Papers whose translated PDF already exists are skipped, so the script can be
re-run safely after a partial failure.
"""
import json
import os
import sys
import time

PAPERS_DIR = "/app/papers"
TRANSLATED_DIR = os.path.join(PAPERS_DIR, "translated")
DATA_FILE = "/app/data/papers.json"

# Keys of each "area" entry that hold lists of paper records.
SECTIONS = ("mainline", "branches", "forward")


def collect_arxiv_ids(data):
    """Return the arXiv ids found in *data*, de-duplicated in first-seen order.

    *data* is the parsed content of ``DATA_FILE``: a mapping whose values may
    carry an ``areas`` list; each area may carry the lists named in
    ``SECTIONS``; each paper record may carry an ``arxiv`` id. Records with a
    missing or empty id are ignored.
    """
    ordered = []
    seen = set()
    for module in data.values():
        for area in module.get("areas", []):
            for section in SECTIONS:
                for paper in area.get(section, []):
                    arxiv_id = paper.get("arxiv")
                    # A paper may be listed in several sections; queue it once.
                    if arxiv_id and arxiv_id not in seen:
                        seen.add(arxiv_id)
                        ordered.append(arxiv_id)
    return ordered


def select_papers(papers, requested):
    """Restrict *papers* to the ids in *requested*, keeping the original order.

    An empty *requested* means "no filter" and returns all papers.
    """
    if not requested:
        return list(papers)
    wanted = set(requested)
    return [arxiv_id for arxiv_id in papers if arxiv_id in wanted]


def finalize_output(arxiv_id, out_dir):
    """Rename ``<id>-mono.pdf`` to ``<id>.pdf`` and drop ``<id>-dual.pdf``.

    Returns the final path. Raises ``FileNotFoundError`` when the mono file is
    absent, so that a run which produced no output is not reported as success.
    """
    mono = os.path.join(out_dir, f"{arxiv_id}-mono.pdf")
    dual = os.path.join(out_dir, f"{arxiv_id}-dual.pdf")
    final = os.path.join(out_dir, f"{arxiv_id}.pdf")
    if not os.path.exists(mono):
        raise FileNotFoundError(f"translated file was not produced: {mono}")
    os.replace(mono, final)
    if os.path.exists(dual):
        os.remove(dual)
    return final


def format_summary(translated, skipped, missing, failed):
    """Build the closing summary line from the four per-outcome counts."""
    return (
        f"\n=== Done: {translated} translated, {skipped} skipped, "
        f"{missing} missing, {failed} failed ==="
    )


def main(argv=None):
    """Translate the selected papers and print per-paper progress.

    *argv* is the list of arXiv ids to restrict the run to; it defaults to
    ``sys.argv[1:]``.
    """
    requested = sys.argv[1:] if argv is None else list(argv)

    os.makedirs(TRANSLATED_DIR, exist_ok=True)

    # Make sure both variables exist in the environment before translating,
    # falling back to defaults when the caller did not set them.
    os.environ.setdefault("DEEPSEEK_API_KEY", "")
    os.environ.setdefault("DEEPSEEK_MODEL", "deepseek-chat")
    if not os.environ["DEEPSEEK_API_KEY"]:
        print(
            "WARNING: DEEPSEEK_API_KEY is empty; translations will likely fail",
            file=sys.stderr,
        )

    # Load paper list. JSON is read as UTF-8 regardless of the locale.
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    papers = select_papers(collect_arxiv_ids(data), requested)
    total = len(papers)

    # Imported here so that importing this module does not require pdf2zh.
    from pdf2zh.doclayout import OnnxModel
    from pdf2zh.high_level import translate

    print(f"=== Batch translate {total} papers (DeepSeek V4 Flash) ===")
    model = OnnxModel.from_pretrained()

    translated = 0
    skipped = 0
    missing = []
    failed = []

    for index, arxiv_id in enumerate(papers, start=1):
        out_path = os.path.join(TRANSLATED_DIR, f"{arxiv_id}.pdf")
        if os.path.exists(out_path):
            # Already translated by an earlier run.
            skipped += 1
            continue

        pdf_path = os.path.join(PAPERS_DIR, "arxiv", f"{arxiv_id}.pdf")
        if not os.path.exists(pdf_path):
            missing.append(arxiv_id)
            print(f"[{index}/{total}] {arxiv_id}: PDF not found")
            continue

        try:
            translate(
                [pdf_path],
                output=TRANSLATED_DIR,
                lang_in="en",
                lang_out="zh",
                service="deepseek",
                model=model,
            )
            finalize_output(arxiv_id, TRANSLATED_DIR)
        except Exception as e:
            # Best-effort batch: record the failure and move on to the next
            # paper, pausing longer before the next request.
            failed.append(arxiv_id)
            print(f"[{index}/{total}] {arxiv_id}: FAILED - {e}")
            time.sleep(5)
        else:
            translated += 1
            print(f"[{index}/{total}] {arxiv_id}: OK")

        # Short pause between consecutive translation attempts.
        time.sleep(1)

    print(format_summary(translated, skipped, len(missing), len(failed)))
    if missing:
        print(f"Missing: {missing}")
    if failed:
        print(f"Failed: {failed}")


if __name__ == "__main__":
    main()