diff --git a/api/server.py b/api/server.py
index a03dd0e..547bbb1 100644
--- a/api/server.py
+++ b/api/server.py
@@ -26,6 +26,34 @@ API_KEY = os.environ.get("LLM_LIB_API_KEY", "change-me")
 log = logging.getLogger("llm-library")
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 
+# ─── Security helpers ──────────────────────────────────
+import re
+SAFE_ID = re.compile(r'^[a-zA-Z0-9_.\-]+$')
+
+def safe_paper_id(raw: str) -> str:
+    """Validate a paper ID taken from a request and return it unchanged.
+
+    Raises HTTPException(400) when the ID is empty, longer than 128 characters,
+    contains '..', or has any character outside [a-zA-Z0-9_.-].
+    """
+    # fullmatch(), not match(): with match() the '$' anchor also accepts a trailing newline.
+    if not raw or len(raw) > 128 or not SAFE_ID.fullmatch(raw) or '..' in raw:
+        raise HTTPException(status_code=400, detail="Invalid paper ID")
+    return raw
+
+def safe_pdf_path(base: Path, paper_id: str) -> Path:
+    """Return the resolved path of <base>/<paper_id>.pdf, guaranteed to lie inside base.
+
+    Raises HTTPException(400) when the ID is invalid or when the resolved path
+    (symlinks followed) is not located under the resolved base directory.
+    """
+    pid = safe_paper_id(paper_id)
+    fp = (base / f"{pid}.pdf").resolve()
+    # Compare path components, not string prefixes: "/x/arxiv_old" starts with "/x/arxiv".
+    if base.resolve() not in fp.parents:
+        raise HTTPException(status_code=400, detail="Invalid path")
+    return fp
+
 # ─── App ───────────────────────────────────────────────
 app = FastAPI(
     title="LLM 论文图书馆",
@@ -280,7 +308,7 @@ def delete_paper(
 @app.get("/papers/arxiv/{arxiv_id}")
 def serve_arxiv_pdf(arxiv_id: str):
     """从本地缓存提供 arXiv PDF(无 .pdf 后缀路由防 IDM 拦截)"""
-    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    pdf_path = safe_pdf_path(PAPERS_DIR / "arxiv", arxiv_id)
     if not pdf_path.exists():
         raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}")
     return FileResponse(
@@ -448,8 +476,8 @@ def translate_status(arxiv_id: str):
 @app.post("/api/download/{arxiv_id}")
 def download_single_pdf(arxiv_id: str, _=Depends(verify_api_key)):
     """按需下载单篇 arXiv PDF"""
     import subprocess, sys
-    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
+    pdf_path = safe_pdf_path(PAPERS_DIR / "arxiv", arxiv_id)
     if pdf_path.exists():
         return {"ok": True, "arxiv_id": arxiv_id, "status": "cached"}
 
@@ -470,13 +498,16 @@ TRANSLATED_DIR = PAPERS_DIR / "translated"
 @app.get("/api/translated/{arxiv_id}")
 def check_translation(arxiv_id: str):
     """Check if translation exists for a paper"""
-    fn = f"{arxiv_id}.pdf"
-    return {"arxiv_id": arxiv_id, "exists": (TRANSLATED_DIR / fn).exists()}
+    try:
+        fp = safe_pdf_path(TRANSLATED_DIR, arxiv_id)
+        return {"arxiv_id": arxiv_id, "exists": fp.exists()}
+    except HTTPException:
+        return {"arxiv_id": arxiv_id, "exists": False}
 
 @app.get("/papers/translated/{arxiv_id}.pdf")
 def serve_translated(arxiv_id: str):
     """Serve translated PDF from cache"""
-    fp = TRANSLATED_DIR / f"{arxiv_id}.pdf"
+    fp = safe_pdf_path(TRANSLATED_DIR, arxiv_id)
     if not fp.exists():
         raise HTTPException(status_code=404, detail="Translation not found")
     return FileResponse(fp, media_type="application/pdf",
@@ -497,25 +528,26 @@ def translation_status():
 async def trigger_translation(paper_id: str, _=Depends(verify_api_key)):
     """Trigger pdf2zh translation for a paper (DeepSeek V4 Flash).
     paper_id can be arxiv ID (e.g. 1706.03762) or a HF filename."""
+    safe_paper_id(paper_id)  # validate
     # Find the PDF
     pdf_path = None
     # Try arxiv
-    candidate = PAPERS_DIR / "arxiv" / f"{paper_id}.pdf"
+    candidate = safe_pdf_path(PAPERS_DIR / "arxiv", paper_id)
     if candidate.exists():
         pdf_path = candidate
     else:
         # Try HF papers directory
-        hf_dir = PAPERS_DIR / "hf"
+        hf_dir = (PAPERS_DIR / "hf").resolve()
         if hf_dir.exists():
             for f in hf_dir.glob("*.pdf"):
-                if paper_id in f.stem:
+                if paper_id in f.stem and hf_dir in f.resolve().parents:
                     pdf_path = f
                     break
 
     if not pdf_path:
         raise HTTPException(status_code=404, detail=f"PDF not found for {paper_id}")
 
-    out_path = TRANSLATED_DIR / f"{paper_id}.pdf"
+    out_path = safe_pdf_path(TRANSLATED_DIR, paper_id)
     if out_path.exists():
         return {"paper_id": paper_id, "status": "already_translated"}
 