feat: LLM 论文图书馆 — 初始提交
- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理
- 180 篇论文数据 (data/papers.json): 9 模块、32 子领域
- 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读
- 底部状态栏: arXiv/HF 连通性检测
- PDF 加载: arXiv 优先 (5s 超时) → HK 本地兜底
- Docker 化部署 (Dockerfile + start.sh + nginx.conf)
- arXiv + HF 批量下载器 (api/downloader.py)
This commit is contained in:
1
api/__init__.py
Normal file
1
api/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# LLM 论文图书馆 — API 模块
|
||||
57
api/backfill.py
Normal file
57
api/backfill.py
Normal file
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Backfill all uncached PDFs, skipping dead arXiv IDs"""
|
||||
import json, subprocess, sys, time
|
||||
from pathlib import Path
|
||||
|
||||
# Paper catalogue that main() loads and walks (module -> areas -> sections).
PAPERS_JSON = Path(__file__).resolve().parent.parent / "data" / "papers.json"
# Directory main() scans for already-downloaded "<arxiv_id>.pdf" files.
ARXIV_DIR = Path(__file__).resolve().parent.parent / "papers" / "arxiv"
# NOTE(review): not referenced by the visible part of this script — confirm
# whether it is still needed.
LOG_FILE = Path("/app/papers/backfill.log")
|
||||
|
||||
def main():
    """Download every arXiv PDF referenced in papers.json that is not cached.

    Reads PAPERS_JSON, collects the distinct ``arxiv`` IDs from the
    mainline/branches/forward lists of every area, and fetches the missing
    ones into ARXIV_DIR with ``wget``.  A download counts as successful only
    when wget exits 0 and the file is larger than 5000 bytes; anything else is
    deleted so that it is retried on the next run.  Progress is printed to
    stdout; nothing is returned.
    """
    with open(PAPERS_JSON) as f:
        data = json.load(f)

    # Collect all arxiv IDs
    arxiv_ids = set()
    for mod in data.values():
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for p in area.get(section, []):
                    aid = p.get("arxiv")
                    if aid:
                        arxiv_ids.add(aid)

    # wget cannot create the target directory itself.
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    cached = {p.stem for p in ARXIV_DIR.glob("*.pdf")}
    # Sorted so that repeated runs walk the IDs in a reproducible order.
    missing = sorted(arxiv_ids - cached)
    print(f"Total: {len(arxiv_ids)}, Cached: {len(cached)}, Missing: {len(missing)}")

    if not missing:
        print("All caught up!")
        return

    ok, fail = 0, 0
    for aid in missing:
        url = f"https://arxiv.org/pdf/{aid}.pdf"
        dest = ARXIV_DIR / f"{aid}.pdf"
        try:
            r = subprocess.run(
                ["wget", "-q", "-T", "15", "-O", str(dest), url],
                timeout=20
            )
            # Measure before any cleanup so the FAIL line reports the real size.
            size = dest.stat().st_size if dest.exists() else 0
        except (subprocess.SubprocessError, OSError) as e:
            # Timeout, wget not installed, or a filesystem error: skip this ID.
            dest.unlink(missing_ok=True)
            fail += 1
            print(f" ERR {aid} {e}")
        else:
            if r.returncode == 0 and size > 5000:
                ok += 1
                print(f" OK {aid} ({size//1024} KB)")
            else:
                # Small bodies are treated as error pages, not papers.
                dest.unlink(missing_ok=True)
                fail += 1
                print(f" FAIL {aid} (rc={r.returncode}, sz={size})")
        time.sleep(0.8)  # Be nice to arXiv

    print(f"\nDone: {ok} ok, {fail} failed")


if __name__ == "__main__":
    main()
|
||||
13
api/check_trans.py
Normal file
13
api/check_trans.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import urllib.request, json
|
||||
# Manual smoke check: fetch the translation of one paper from a locally
# running server and print a short preview of paragraphs 0 and 5.
with urllib.request.urlopen("http://127.0.0.1:8000/api/translate/2005.14165") as r:
    data = json.loads(r.read())

paragraphs = data["paragraphs"]
if paragraphs:
    p = paragraphs[0]
    page1 = p["page"]
    en1 = p["en"][:60]
    zh1 = p["zh"][:80]
    print(f"page={page1}, en={en1}")
    print(f"zh={zh1}")
else:
    print("no paragraphs returned")

# Short papers may have fewer than six paragraphs; report that instead of
# dying with an IndexError.
if len(paragraphs) > 5:
    p5 = paragraphs[5]
    page5 = p5["page"]
    zh5 = p5["zh"][:80]
    print(f"para[5] page={page5}, zh={zh5}")
else:
    print(f"para[5] unavailable: only {len(paragraphs)} paragraphs returned")
|
||||
198
api/downloader.py
Normal file
198
api/downloader.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
LLM 论文图书馆 — PDF 下载器
|
||||
从 arXiv 和 HuggingFace 下载论文 PDF 到本地缓存
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from tqdm import tqdm
|
||||
|
||||
# Project root is the parent of the directory holding this file.
ROOT = Path(__file__).resolve().parent.parent
DATA_FILE = ROOT / "data" / "papers.json"
ARXIV_DIR = ROOT / "papers" / "arxiv"
HF_DIR = ROOT / "papers" / "hf"
LOG_FILE = ROOT / "papers" / "download.log"

# logging.FileHandler opens its file as soon as it is constructed, which
# happens at import time here — before run() has had a chance to create the
# cache directories.  Make sure the log directory exists first.
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

log = logging.getLogger("downloader")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    handlers=[
        # Log lines contain paper titles and an em dash, so pin the encoding
        # instead of depending on the process locale.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
|
||||
|
||||
|
||||
def collect_urls() -> tuple[list[tuple[str, str]], list[tuple[str, str]]]:
    """Gather every PDF that papers.json says should be downloaded.

    Walks the mainline/branches/forward lists of every area of every module
    and de-duplicates by arXiv ID and by ``pdf`` URL (first occurrence wins).

    Returns:
        arxiv_list: [(arxiv_id, title), ...]
        hf_list: [(url, filename), ...] where filename is the last URL path
            segment with ".pdf" removed.
    """
    with open(DATA_FILE) as fh:
        catalogue = json.load(fh)

    # Dicts keep insertion order, so they double as "seen" sets and as the
    # ordered result lists.
    arxiv_entries = {}
    hf_entries = {}

    for module in catalogue.values():
        for area in module.get("areas", []):
            for section_key in ("mainline", "branches", "forward"):
                for paper in area.get(section_key, []):
                    arxiv_id = paper.get("arxiv")
                    if arxiv_id and arxiv_id not in arxiv_entries:
                        arxiv_entries[arxiv_id] = paper.get("title", "")
                    pdf_url = paper.get("pdf")
                    if pdf_url and pdf_url not in hf_entries:
                        # Derive a file name from the last URL segment
                        hf_entries[pdf_url] = pdf_url.split("/")[-1].replace(".pdf", "")

    return list(arxiv_entries.items()), list(hf_entries.items())
|
||||
|
||||
|
||||
def download_arxiv(client: httpx.Client, arxiv_id: str, title: str) -> bool:
    """Download one arXiv PDF into ARXIV_DIR.

    Args:
        client: HTTP client used for the request.
        arxiv_id: arXiv identifier; also the cache file stem.
        title: Paper title, used only in log messages.

    Returns:
        True if the PDF is in the cache afterwards (already present or
        freshly downloaded), False on any failure.  Never raises.
    """
    pdf_path = ARXIV_DIR / f"{arxiv_id}.pdf"
    if pdf_path.exists():
        log.debug(f"Skip (exists): {arxiv_id}")
        return True

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # The body is written here first and renamed afterwards, so an interrupted
    # write can never leave a truncated "<id>.pdf" that later runs would treat
    # as a valid cache hit.
    tmp_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        resp = client.get(url, follow_redirects=True, timeout=30)
        resp.raise_for_status()

        # Verify it's actually a PDF (arxiv returns HTML for missing papers)
        content_type = resp.headers.get("content-type", "")
        if "pdf" not in content_type and not resp.content.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {arxiv_id} — {title[:60]}")
            return False

        tmp_path.write_bytes(resp.content)
        tmp_path.replace(pdf_path)
        size_kb = len(resp.content) / 1024
        log.info(f"OK: {arxiv_id} ({size_kb:.0f} KB) — {title[:60]}")
        return True
    except httpx.HTTPError as e:
        log.error(f"HTTP error {arxiv_id}: {e}")
        return False
    except Exception as e:
        # Best-effort batch job: log, clean up the partial file, keep going.
        tmp_path.unlink(missing_ok=True)
        log.error(f"Error {arxiv_id}: {e}")
        return False
|
||||
|
||||
|
||||
def download_hf(client: httpx.Client, url: str, filename: str) -> bool:
    """Download one PDF from an explicit URL into HF_DIR.

    Args:
        client: HTTP client used for the request.
        url: Full URL of the PDF.
        filename: Desired cache file stem; ".." is removed and "/" replaced
            by "_" before it is used as a file name.

    Returns:
        True if the PDF is in the cache afterwards (already present or
        freshly downloaded), False on any failure.  Never raises.
    """
    safe_name = filename.replace("..", "").replace("/", "_")
    pdf_path = HF_DIR / f"{safe_name}.pdf"
    if pdf_path.exists():
        log.debug(f"Skip (exists): {safe_name}")
        return True

    # Write to a sibling temp file and rename, so a crash mid-write cannot
    # leave a truncated file that later runs would skip as "already cached".
    tmp_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        resp = client.get(url, follow_redirects=True, timeout=60)
        resp.raise_for_status()

        # Only the magic bytes are checked here, not the content type.
        if not resp.content.startswith(b"%PDF"):
            log.warning(f"Not a PDF: {safe_name}")
            return False

        tmp_path.write_bytes(resp.content)
        tmp_path.replace(pdf_path)
        size_kb = len(resp.content) / 1024
        log.info(f"OK (HF): {safe_name} ({size_kb:.0f} KB)")
        return True
    except httpx.HTTPError as e:
        log.error(f"HTTP error {safe_name}: {e}")
        return False
    except Exception as e:
        # Best-effort batch job: log, clean up the partial file, keep going.
        tmp_path.unlink(missing_ok=True)
        log.error(f"Error {safe_name}: {e}")
        return False
|
||||
|
||||
|
||||
def _hf_cache_path(name: str) -> Path:
    """Return the cache path download_hf() writes for *name* (same sanitising)."""
    safe_name = name.replace("..", "").replace("/", "_")
    return HF_DIR / f"{safe_name}.pdf"


def _download_batch(items, desc, cache_path, fetch, incremental, delay):
    """Download *items* one by one and return ``(ok, fail, bytes_downloaded)``.

    ``cache_path(item)`` gives the file an item is cached at and
    ``fetch(item)`` performs the download and returns True on success.
    """
    ok = fail = 0
    downloaded = 0
    for item in tqdm(items, desc=desc):
        target = cache_path(item)
        backup = None
        if target.exists():
            if incremental:
                ok += 1
                continue
            # The download helpers return early when the file exists, so a
            # forced refresh has to move the old copy out of the way first.
            # It is kept as a backup and restored if the refresh fails.
            backup = target.with_name(target.name + ".bak")
            target.replace(backup)

        if fetch(item):
            ok += 1
            if target.exists():
                downloaded += target.stat().st_size
            if backup is not None:
                backup.unlink(missing_ok=True)
        else:
            fail += 1
            if backup is not None:
                backup.replace(target)
        time.sleep(delay)
    return ok, fail, downloaded


def run(incremental: bool = True, limit: int = 0, delay: float = 1.0):
    """Download every PDF listed in papers.json into the local cache.

    Args:
        incremental: True skips files that are already cached; False
            re-downloads them (the old copy is restored if that fails).
        limit: 0 means everything; N > 0 keeps only the first N arXiv entries
            and the first N HF entries.
        delay: Seconds to sleep after each download attempt.
    """
    ARXIV_DIR.mkdir(parents=True, exist_ok=True)
    HF_DIR.mkdir(parents=True, exist_ok=True)

    arxiv_list, hf_list = collect_urls()
    total = len(arxiv_list) + len(hf_list)
    log.info(f"Found {len(arxiv_list)} arXiv + {len(hf_list)} HF = {total} PDFs to download")
    log.info(f"Incremental: {incremental}, Delay: {delay}s")

    if not incremental:
        log.warning("Non-incremental mode: will re-download existing files")

    # Count existing
    arxiv_existing = sum(1 for aid, _ in arxiv_list if (ARXIV_DIR / f"{aid}.pdf").exists())
    hf_existing = sum(1 for _, name in hf_list if _hf_cache_path(name).exists())
    log.info(f"Already cached: {arxiv_existing} arXiv + {hf_existing} HF")

    if limit > 0:
        arxiv_list = arxiv_list[:limit]
        hf_list = hf_list[:limit]

    with httpx.Client(
        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
        timeout=30,
        follow_redirects=True,
    ) as client:
        # Download arXiv
        arxiv_ok, arxiv_fail, arxiv_bytes = _download_batch(
            arxiv_list,
            "arXiv",
            lambda item: ARXIV_DIR / f"{item[0]}.pdf",
            lambda item: download_arxiv(client, item[0], item[1]),
            incremental,
            delay,
        )
        # Download HF
        hf_ok, hf_fail, hf_bytes = _download_batch(
            hf_list,
            "HF ",
            lambda item: _hf_cache_path(item[1]),
            lambda item: download_hf(client, item[0], item[1]),
            incremental,
            delay,
        )

    ok = arxiv_ok + hf_ok
    fail = arxiv_fail + hf_fail
    total_size = float(arxiv_bytes + hf_bytes)
    log.info(f"Done: {ok} OK, {fail} failed, {total_size/1024/1024:.1f} MB total")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line front end: every option maps onto one argument of run().
    cli = argparse.ArgumentParser(description="下载论文 PDF 到本地缓存")
    cli.add_argument(
        "--no-incremental",
        action="store_true",
        help="重新下载所有 (默认跳过已有)",
    )
    cli.add_argument("--limit", type=int, default=0, help="限制下载数量 (0=全部)")
    cli.add_argument("--delay", type=float, default=1.0, help="请求间延迟 (秒)")
    options = cli.parse_args()

    run(
        incremental=not options.no_incremental,
        limit=options.limit,
        delay=options.delay,
    )
|
||||
103
api/extract_data.py
Normal file
103
api/extract_data.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Build papers.json by regex-extracting paper entries from llm_library.html"""
|
||||
import re, json, os
|
||||
|
||||
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
html_path = os.path.join(ROOT, 'llm_library.html')

with open(html_path, 'r') as f:
    html = f.read()

# Paper entries are matched directly with a regex instead of parsing the
# surrounding JavaScript object.
# Pattern: { title:"...", authors:"...", year:..., venue:"...", arxiv:"...", tags:[...] }
# (either arxiv:"..." or pdf:"..." or neither may precede tags)
paper_re = re.compile(
    r'\{\s*title:\s*"([^"]*)",\s*authors:\s*"([^"]*)",\s*year:\s*(\d+),\s*venue:\s*"([^"]*)",\s*'
    r'(?:arxiv:\s*"([^"]*)",\s*|pdf:\s*"([^"]*)",\s*|)'
    r'tags:\s*\[(.*?)\]\s*\}',
    re.DOTALL
)

# Header patterns used to work out which module / area a paper sits in.
module_re = re.compile(r'\n\s*(\w+):\s*\{\s*\{?\s*name:\s*"([^"]*)"')
module_fallback_re = re.compile(r'(\w+):\s*\{[^}]*name:\s*"([^"]*)"')
area_re = re.compile(r'id:\s*"(\w+)"[^}]*name:\s*"([^"]*)"')


def _last_match(pattern, text, end):
    """Return the right-most match of *pattern* in ``text[:end]``, or None.

    A paper belongs to the *nearest* header above it; ``re.search`` would
    return the first (farthest) header instead.
    """
    last = None
    for last in pattern.finditer(text, 0, end):
        pass
    return last


papers = []
for m in paper_re.finditer(html):
    title = m.group(1)
    authors = m.group(2)
    year = int(m.group(3))
    venue = m.group(4)
    arxiv = m.group(5) or None
    pdf = m.group(6) or None
    tags_str = m.group(7)
    tags = re.findall(r'"([^"]*)"', tags_str)

    # Find which module/area this paper belongs to: the closest header that
    # starts before the paper entry.
    pos = m.start()

    # Find module id (strict pattern first, broader pattern as a fallback)
    mod_match = _last_match(module_re, html, pos)
    if not mod_match:
        mod_match = _last_match(module_fallback_re, html, pos)
    if mod_match:
        mod_id = mod_match.group(1)
        mod_name = mod_match.group(2)
    else:
        mod_id = 'unknown'
        mod_name = 'Unknown'

    # Find area id
    area_match = _last_match(area_re, html, pos)
    if area_match:
        area_id = area_match.group(1)
        area_name = area_match.group(2)
    else:
        area_id = 'unknown'
        area_name = 'Unknown'

    papers.append({
        'module': mod_id,
        'module_name': mod_name,
        'area': area_id,
        'area_name': area_name,
        'title': title,
        'authors': authors,
        'year': year,
        'venue': venue,
        'arxiv': arxiv,
        'pdf': pdf,
        'tags': tags,
    })

print(f'Extracted {len(papers)} papers')

# The papers are kept as a flat list for verification; the nested
# module -> area -> section structure is not rebuilt here.

# Also extract module metadata (printed only, not written to the output file)
modules = {}
for m in re.finditer(r"(\w+):\s*\{\s*name:\s*\"([^\"]+)\"[^}]*icon:\s*\"([^\"]*)\"[^}]*desc:\s*\"([^\"]*)\"", html):
    mod_id = m.group(1)
    modules[mod_id] = {
        'name': m.group(2),
        'icon': m.group(3),
        'desc': m.group(4),
        'color': mod_id,
        'areas': []
    }

print(f'Found {len(modules)} modules')
for mod_id, mod in modules.items():
    print(f'  {mod_id}: {mod["name"]}')

# Save the flat list for now.
# NOTE(review): this writes a JSON *list* to data/papers.json, while the other
# scripts in this project read that file as a dict of modules — confirm the
# file is regenerated in nested form before they run.
output_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'papers.json')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
    json.dump(papers, f, ensure_ascii=False, indent=2)

print(f'Saved {len(papers)} papers (flat) to {output_path}')
|
||||
107
api/parse_papers.py
Normal file
107
api/parse_papers.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Parse llm_library.html PAPER_DATA block → nested papers.json"""
|
||||
import re, json, os
|
||||
|
||||
HTML = '/app/working/workspaces/default/llm_library.html'
JSON = '/app/working/workspaces/default/llm-library/data/papers.json'

SECTION_KEYS = ('mainline', 'branches', 'forward')


def _count_papers(area):
    """Number of papers across the three section lists of one area."""
    return sum(len(area.get(key, [])) for key in SECTION_KEYS)


def _first_group(pattern, text, default=''):
    """Group 1 of the first match of *pattern* in *text*, else *default*."""
    found = re.search(pattern, text)
    return found.group(1) if found else default


with open(HTML) as f:
    html = f.read()

# The catalogue sits between the PAPER_DATA declaration and the "APP STATE"
# marker.
# NOTE(review): the marker is 20 characters long but 22 are skipped — confirm
# the two extra characters are intentional.
s = html.index('const PAPER_DATA = {')
e = html.index('APP STATE')
block = html[s+22:e]

modules = {}
current_mod = None
current_area = None
current_section = 'mainline'

# Line-oriented parse: indentation depth decides what a line means.
for line in block.split('\n'):
    stripped = line.strip()
    if not stripped:
        continue
    indent = len(line) - len(line.lstrip())

    # Module start: "  arch: {" at indent 2
    if indent == 2 and re.match(r'^\w+:\s*\{', stripped):
        # Flush the previous module, but only if it picked up a name.
        if current_mod and current_mod.get('name'):
            modules[current_mod['id']] = current_mod
        mid = stripped.split(':')[0]
        current_mod = {'id': mid, 'name': '', 'icon': '', 'desc': '', 'color': mid, 'areas': []}
        current_area = None
        current_section = 'mainline'
        continue

    if not current_mod:
        continue

    # Module metadata at indent 4
    if indent == 4:
        meta = re.match(r'(\w+):\s*"([^"]*)"', stripped)
        if meta and meta.group(1) in ('name', 'icon', 'desc', 'color'):
            current_mod[meta.group(1)] = meta.group(2)

    # Area header at indent 8
    if indent == 8:
        area_id = re.match(r'id:\s*"(\w+)"', stripped)
        if area_id:
            current_area = {'id': area_id.group(1), 'name': '', 'mainline': [], 'branches': [], 'forward': []}
            current_mod['areas'].append(current_area)
            current_section = 'mainline'
            continue
        area_name = re.match(r'name:\s*"([^"]+)"', stripped)
        if area_name and current_area:
            current_area['name'] = area_name.group(1)
            continue
        section_start = re.match(r'(mainline|branches|forward):\s*\[', stripped)
        if section_start:
            current_section = section_start.group(1)

    # Paper entry
    if not (stripped.startswith('{ title:') and 'tags:' in stripped and current_area):
        continue

    title = re.search(r'title:\s*"([^"]+)"', stripped)
    year = re.search(r'year:\s*(\d+)', stripped)
    tags_m = re.search(r'tags:\s*\[(.*?)\]', stripped, re.DOTALL)
    if not (title and year and tags_m):
        continue

    entry = {
        'title': title.group(1),
        'authors': _first_group(r'authors:\s*"([^"]*?)"', stripped),
        'year': int(year.group(1)),
        'venue': _first_group(r'venue:\s*"([^"]*?)"', stripped),
        'tags': re.findall(r'"([^"]*)"', tags_m.group(1))
    }
    # arxiv / pdf are optional and only stored when present.
    arxiv = _first_group(r'arxiv:\s*"(\S+?)"', stripped, None)
    if arxiv:
        entry['arxiv'] = arxiv
    pdf = _first_group(r'pdf:\s*"(https:[^"]+)"', stripped, None)
    if pdf:
        entry['pdf'] = pdf
    current_area[current_section].append(entry)

# Save last module
if current_mod and current_mod.get('name'):
    modules[current_mod['id']] = current_mod

# Count
total = sum(_count_papers(a) for m in modules.values() for a in m.get('areas', []))

print(f'Parsed: {len(modules)} modules, {total} papers')
for mid, m in sorted(modules.items()):
    pc = sum(_count_papers(a) for a in m['areas'])
    print(f'  {mid}: {m["name"]} — {len(m["areas"])} areas, {pc} papers')

os.makedirs(os.path.dirname(JSON), exist_ok=True)
with open(JSON, 'w') as f:
    json.dump(modules, f, ensure_ascii=False, indent=2)
print(f'\nSaved to {JSON}')
|
||||
484
api/server.py
Normal file
484
api/server.py
Normal file
@@ -0,0 +1,484 @@
|
||||
"""
|
||||
LLM 论文图书馆 — FastAPI 后端
|
||||
提供 REST API 进行论文查询、管理、PDF 代理服务
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import hashlib
|
||||
import secrets
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, HTTPException, Query, Depends, Request
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
# ─── Config ────────────────────────────────────────────
|
||||
# Project root is the parent of the directory holding this file.
ROOT = Path(__file__).resolve().parent.parent
# JSON catalogue read by load_data() and rewritten by save_data().
DATA_FILE = ROOT / "data" / "papers.json"
# Root of the local PDF cache ("arxiv" and "hf" sub-directories are used below).
PAPERS_DIR = ROOT / "papers"
# Token expected by verify_api_key().
# NOTE(review): when LLM_LIB_API_KEY is unset the key falls back to the
# well-known literal "change-me" — confirm deployments always set it.
API_KEY = os.environ.get("LLM_LIB_API_KEY", "change-me")

log = logging.getLogger("llm-library")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# ─── App ───────────────────────────────────────────────
app = FastAPI(
    title="LLM 论文图书馆",
    description="大模型论文知识库 API — 查询、搜索、管理论文",
    version="0.1.0",
)

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): confirm this is intended outside of local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# ─── Auth ──────────────────────────────────────────────
|
||||
def verify_api_key(request: Request):
    """Authorise a write request (POST/PUT/DELETE) with the shared API key.

    The token is taken from an ``Authorization: Bearer <token>`` header or,
    when that header is absent, from the ``api_key`` query parameter.

    Returns:
        True when the token matches API_KEY.

    Raises:
        HTTPException: 401 when the token is missing or wrong.
    """
    auth = request.headers.get("Authorization", "")
    if auth.startswith("Bearer "):
        token = auth[7:]
    else:
        token = request.query_params.get("api_key", "")

    # Constant-time comparison so response timing does not reveal how much of
    # a guessed token is correct.  Both sides are encoded to bytes because
    # compare_digest() rejects non-ASCII str arguments.
    if not token or not secrets.compare_digest(
        token.encode("utf-8"), API_KEY.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid or missing API key")
    return True
|
||||
|
||||
# ─── Data loading ──────────────────────────────────────
|
||||
def load_data():
    """Return the parsed contents of DATA_FILE, or ``{}`` if it does not exist."""
    if DATA_FILE.exists():
        with open(DATA_FILE, 'r') as fh:
            return json.load(fh)
    return {}
|
||||
|
||||
def save_data(data):
    """Write *data* to DATA_FILE as pretty-printed, non-ASCII-escaped JSON.

    The JSON is written to a sibling temporary file which then replaces
    DATA_FILE, so a serialisation error or a crash mid-write cannot truncate
    the only copy of the catalogue.

    Raises:
        OSError / TypeError / ValueError: propagated from writing or
            serialising; DATA_FILE is left untouched in that case.
    """
    # NOTE(review): os.replace() needs DATA_FILE's directory to be writable;
    # confirm the file is not bind-mounted on its own in deployment.
    tmp_path = DATA_FILE.with_name(DATA_FILE.name + ".tmp")
    try:
        with open(tmp_path, 'w') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, DATA_FILE)
    except Exception:
        tmp_path.unlink(missing_ok=True)
        raise
|
||||
|
||||
# ─── Paper CRUD helpers ────────────────────────────────
|
||||
def find_paper(data, module_id, area_id, title):
    """Locate a paper by exact title inside one area of one module.

    Args:
        data: Catalogue dict keyed by module id.
        module_id: Key of the module to look in.
        area_id: ``id`` of the area inside that module.
        title: Exact paper title to match.

    Returns:
        ``(module, area, section, index)`` for the first match, where
        *section* is "mainline", "branches" or "forward" and *index* is the
        position in that list; ``(None, None, None, None)`` if nothing
        matches.  Areas without an ``id`` and papers without a ``title`` are
        skipped instead of raising KeyError.
    """
    not_found = (None, None, None, None)
    mod = data.get(module_id)
    if not mod:
        return not_found

    for area in mod.get("areas", []):
        if area.get("id") != area_id:
            continue
        for section in ("mainline", "branches", "forward"):
            for i, p in enumerate(area.get(section, [])):
                if p.get("title") == title:
                    return mod, area, section, i
    return not_found
|
||||
|
||||
# ─── Routes: Query ─────────────────────────────────────
|
||||
@app.get("/api/stats")
def get_stats():
    """Return library totals: modules, areas, papers and per-section counts."""
    library = load_data()
    per_section = dict.fromkeys(("mainline", "branches", "forward"), 0)
    area_total = 0

    for module in library.values():
        module_areas = module.get("areas", [])
        area_total += len(module_areas)
        for area in module_areas:
            for key in per_section:
                per_section[key] += len(area.get(key, []))

    return {
        "modules": len(library),
        "areas": area_total,
        # Every paper lives in exactly one section list.
        "papers": sum(per_section.values()),
        "sections": per_section,
        "data_file": str(DATA_FILE),
    }
|
||||
|
||||
@app.get("/api/modules")
def list_modules():
    """List every module with its metadata and counts (no paper details)."""
    summaries = []
    for module_id, module in load_data().items():
        summary = {
            "id": module_id,
            "name": module["name"],
            "icon": module["icon"],
            "desc": module["desc"],
        }
        module_areas = module.get("areas", [])
        summary["area_count"] = len(module_areas)
        paper_total = 0
        for area in module_areas:
            for key in ("mainline", "branches", "forward"):
                paper_total += len(area.get(key, []))
        summary["paper_count"] = paper_total
        summaries.append(summary)
    return summaries
|
||||
|
||||
@app.get("/api/modules/{module_id}")
def get_module(module_id: str):
    """Return one module with all of its papers, or 404 if it is unknown."""
    module = load_data().get(module_id)
    if module:
        return module
    raise HTTPException(status_code=404, detail=f"Module '{module_id}' not found")
|
||||
|
||||
@app.get("/api/papers")
def search_papers(
    q: str = Query(default="", description="搜索关键词: 标题/作者"),
    module: Optional[str] = Query(default=None),
    tag: Optional[str] = Query(default=None, description="起点/关键节点/前沿/前瞻/支线"),
    limit: int = Query(default=50, ge=1, le=200),
):
    """Search papers by keyword, module and/or tag.

    *q* is matched case-insensitively as a substring of title + authors,
    *module* restricts the search to one module id, and *tag* must be one of
    the paper's tags.  Each hit is the paper's own fields plus its module,
    area and section; the search stops once *limit* hits are collected.
    """
    library = load_data()
    needle = q.lower()
    hits = []

    for mid, mod in library.items():
        if module and mid != module:
            continue
        for area in mod.get("areas", []):
            for section in ("mainline", "branches", "forward"):
                for paper in area.get(section, []):
                    # Filter by tag
                    if tag and tag not in paper.get("tags", []):
                        continue
                    # Filter by query
                    haystack = (paper.get("title", "") + paper.get("authors", "")).lower()
                    if needle and needle not in haystack:
                        continue
                    hits.append({
                        "module_id": mid,
                        "module_name": mod["name"],
                        "area_id": area["id"],
                        "area_name": area["name"],
                        "section": section,
                        **paper,
                    })
                    if len(hits) >= limit:
                        return hits
                # The limit is re-checked at every nesting level, exactly
                # where the scan is allowed to stop.
                if len(hits) >= limit:
                    return hits
            if len(hits) >= limit:
                return hits
        if len(hits) >= limit:
            return hits
    return hits
|
||||
|
||||
# ─── Routes: Management (写操作, 需 API Key) ────────────
|
||||
class PaperCreate(BaseModel):
    """Request body for POST /api/papers: where to insert the paper plus its fields."""

    # Target location: module key and the ``id`` of the area inside it.
    module_id: str
    area_id: str
    section: str = "mainline"  # mainline / branches / forward
    # Bibliographic fields; only title and year are required.
    title: str
    authors: str = ""
    year: int
    venue: str = ""
    # Optional sources; add_paper() stores them only when they are non-empty.
    arxiv: Optional[str] = None
    pdf: Optional[str] = None
    tags: list[str] = []
|
||||
|
||||
class PaperUpdate(BaseModel):
    """Request body for PUT /api/papers: fields left as None are not changed."""

    authors: Optional[str] = None
    year: Optional[int] = None
    venue: Optional[str] = None
    arxiv: Optional[str] = None
    pdf: Optional[str] = None
    tags: Optional[list[str]] = None
    section: Optional[str] = None  # move to different section
|
||||
|
||||
@app.post("/api/papers", dependencies=[Depends(verify_api_key)])
def add_paper(paper: PaperCreate):
    """Append a new paper to one section of an existing area and persist it.

    Responds 404 when the module or area does not exist and 400 when the
    section is not mainline/branches/forward.
    """
    library = load_data()

    module = library.get(paper.module_id)
    if not module:
        raise HTTPException(status_code=404, detail="Module not found")

    target_area = None
    for candidate in module["areas"]:
        if candidate["id"] == paper.area_id:
            target_area = candidate
            break
    if not target_area:
        raise HTTPException(status_code=404, detail="Area not found")

    if paper.section not in ("mainline", "branches", "forward"):
        raise HTTPException(status_code=400, detail="section must be mainline/branches/forward")

    record = {
        "title": paper.title,
        "authors": paper.authors,
        "year": paper.year,
        "venue": paper.venue,
        "tags": paper.tags,
    }
    # Source links are stored only when given.
    for optional_key in ("arxiv", "pdf"):
        value = getattr(paper, optional_key)
        if value:
            record[optional_key] = value

    target_area.setdefault(paper.section, []).append(record)
    save_data(library)
    log.info(f"Added paper: {paper.title}")
    return {"ok": True, "title": paper.title}
|
||||
|
||||
@app.put("/api/papers")
def update_paper(
    module_id: str,
    area_id: str,
    title: str,
    update: PaperUpdate,
    _=Depends(verify_api_key),
):
    """Update the fields of one paper and optionally move it to another section.

    The paper is identified by module, area and exact title.  Responds 404
    when it is not found and 400 when the requested section is invalid.
    """
    library = load_data()
    mod, area, section, idx = find_paper(library, module_id, area_id, title)
    if mod is None:
        raise HTTPException(status_code=404, detail="Paper not found")

    record = area[section][idx]
    # Only fields that were actually supplied (not None) are overwritten.
    for name in ("authors", "year", "venue", "arxiv", "pdf", "tags"):
        new_value = getattr(update, name)
        if new_value is None:
            continue
        record[name] = new_value

    # Move to different section?
    destination = update.section
    if destination and destination != section:
        if destination not in ("mainline", "branches", "forward"):
            raise HTTPException(status_code=400, detail="Invalid section")
        moved = area[section].pop(idx)
        area.setdefault(destination, []).append(moved)

    save_data(library)
    log.info(f"Updated paper: {title}")
    return {"ok": True, "title": title}
|
||||
|
||||
@app.delete("/api/papers")
def delete_paper(
    module_id: str,
    area_id: str,
    title: str,
    _=Depends(verify_api_key),
):
    """Remove one paper, identified by module, area and exact title; 404 if absent."""
    library = load_data()
    located = find_paper(library, module_id, area_id, title)
    if located[0] is None:
        raise HTTPException(status_code=404, detail="Paper not found")

    _, area, section, idx = located
    del area[section][idx]
    save_data(library)
    log.info(f"Deleted paper: {title}")
    return {"ok": True, "title": title}
|
||||
|
||||
# ─── Routes: PDF proxy ──────────────────────────────────
|
||||
@app.get("/papers/arxiv/{arxiv_id}.pdf")
@app.get("/papers/arxiv/{arxiv_id}")
def serve_arxiv_pdf(arxiv_id: str):
    """Serve an arXiv PDF from the local cache.

    Both ``/papers/arxiv/<id>`` and ``/papers/arxiv/<id>.pdf`` are accepted
    (the suffix-less form avoids download-manager interception).

    Raises:
        HTTPException: 404 when the ID is not a plain file name or the PDF is
            not cached.
    """
    # Decorators apply bottom-up, so the suffix-less route is registered first
    # and also captures ".../<id>.pdf" with the suffix still attached.  Strip
    # it, as serve_hf_pdf() does, so the lookup does not hit "<id>.pdf.pdf".
    safe_id = arxiv_id.removesuffix(".pdf")
    # The ID becomes part of a filesystem path: accept plain file names only.
    if not safe_id or ".." in safe_id or "/" in safe_id or "\\" in safe_id:
        raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}")

    pdf_path = PAPERS_DIR / "arxiv" / f"{safe_id}.pdf"
    if not pdf_path.is_file():
        raise HTTPException(status_code=404, detail=f"PDF not in local cache: {arxiv_id}")
    return FileResponse(
        pdf_path, media_type="application/pdf",
        headers={"Cache-Control": "public, max-age=86400"},
    )
|
||||
|
||||
@app.get("/papers/hf/{filename}.pdf")
@app.get("/papers/hf/{filename}")
def serve_hf_pdf(filename: str):
    """Serve a HuggingFace PDF from the local cache (with or without ".pdf").

    The name is sanitised (".." removed, "/" replaced by "_", trailing ".pdf"
    dropped) before it is used as a file name; 404 when the file is missing.
    """
    cleaned = filename.replace("..", "")
    cleaned = cleaned.replace("/", "_")
    cleaned = cleaned.removesuffix(".pdf")

    cached_pdf = PAPERS_DIR / "hf" / f"{cleaned}.pdf"
    if cached_pdf.exists():
        return FileResponse(
            cached_pdf,
            media_type="application/pdf",
            headers={"Cache-Control": "public, max-age=86400"},
        )
    raise HTTPException(status_code=404, detail=f"PDF not in local cache: {filename}")
|
||||
|
||||
# ─── Routes: Translation ───────────────────────────────
|
||||
# Directory holding one "<arxiv_id>.json" translation result per paper.
# It is created at import time so the translate routes can write to it.
TRANSLATE_CACHE = ROOT / "data" / "translations"
TRANSLATE_CACHE.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def extract_pdf_text_with_pages(pdf_path: Path, max_chars: int = 12000) -> list[dict]:
    """Extract the text of a PDF page by page with ``pdftotext`` (Poppler).

    Args:
        pdf_path: PDF to read; its stem is taken to be the arXiv ID.
        max_chars: Stop adding pages once this many characters have been
            collected (the page that crosses the limit is kept whole).

    Returns:
        ``[{"page": <1-based page number>, "text": <stripped text>}, ...]``
        with empty pages skipped.

    Raises:
        HTTPException: 500 when pdftotext is missing, times out, fails, or
            yields no text.
    """
    import re
    import subprocess

    try:
        result = subprocess.run(
            ["pdftotext", "-layout", "-q", str(pdf_path), "-"],
            capture_output=True, text=True, timeout=30,
            # Decode explicitly and leniently so an odd byte sequence or a
            # non-UTF-8 process locale cannot abort the request.
            encoding="utf-8", errors="replace",
        )
    except (OSError, subprocess.TimeoutExpired) as exc:
        # pdftotext is not installed, or it hung on this file.
        log.error(f"pdftotext could not run: {exc}")
        raise HTTPException(status_code=500, detail="PDF text extraction failed") from exc

    if result.returncode != 0:
        log.error(f"pdftotext failed: {result.stderr}")
        raise HTTPException(status_code=500, detail="PDF text extraction failed")

    # Remove the arXiv stamp ("arXiv:<id>..." up to the next blank line).  A
    # stamp written as "arXiv:<id>.pdf..." starts with the same prefix, so
    # this one substitution covers it as well.
    text = re.sub(
        r'arXiv:' + re.escape(pdf_path.stem) + r'.*?\n\n', '', result.stdout,
        flags=re.DOTALL,
    )

    # pdftotext separates pages with a form feed.
    result_pages = []
    total = 0
    for page_no, page_text in enumerate(text.split('\f'), start=1):
        pt = page_text.strip()
        if not pt:
            continue
        result_pages.append({"page": page_no, "text": pt})
        total += len(pt)
        if total >= max_chars:
            break

    if not result_pages:
        raise HTTPException(status_code=500, detail="No text extracted from PDF")

    return result_pages
|
||||
|
||||
|
||||
def split_text_with_pages(page_texts: list[dict], max_len: int = 400) -> list[dict]:
    """Split per-page text into paragraph-sized chunks, keeping the page number.

    Paragraphs are separated by blank lines.  A paragraph longer than
    *max_len* is cut at sentence ends (". ", "? ", "! ") and the sentences
    are packed greedily into chunks of at most *max_len* characters.  A single
    sentence longer than *max_len* is emitted whole.

    Args:
        page_texts: ``[{"page": int, "text": str}, ...]``.
        max_len: Target maximum chunk length in characters.

    Returns:
        ``[{"page": int, "text": str}, ...]`` in reading order.
    """
    import re

    # Split on the space that follows sentence-ending punctuation.  The
    # look-behind keeps the punctuation with its sentence and, unlike a "|"
    # sentinel, leaves literal pipes in the text untouched.
    sentence_break = re.compile(r"(?<=[.?!]) ")

    chunks = []
    for pt in page_texts:
        page = pt["page"]
        for para in (raw.strip() for raw in pt["text"].split("\n\n")):
            if not para:
                continue
            if len(para) <= max_len:
                chunks.append({"page": page, "text": para})
                continue

            current = ""
            for sentence in sentence_break.split(para):
                sentence = sentence.strip()
                if not sentence:
                    continue
                # +1 accounts for the joining space.
                if len(current) + len(sentence) + 1 <= max_len:
                    current = (current + " " + sentence).strip()
                else:
                    if current:
                        chunks.append({"page": page, "text": current})
                    current = sentence
            if current:
                chunks.append({"page": page, "text": current})
    return chunks
|
||||
|
||||
|
||||
def translate_text(text: str, source: str = "en", target: str = "zh") -> str:
    """Translate *text* through the free MyMemory HTTP API.

    Returns the translated string, or *text* unchanged when the request fails
    or the response does not report status 200 with response data.
    """
    import urllib.request
    import urllib.parse

    query = urllib.parse.urlencode({
        "q": text,
        "langpair": f"{source}|{target}",
        "mt": "1",  # Force machine translation, not memory
        "de": "me@llm-library.local",
    })
    endpoint = f"https://api.mymemory.translated.net/get?{query}"

    try:
        with urllib.request.urlopen(endpoint, timeout=15) as resp:
            payload = json.loads(resp.read())
    except Exception as e:
        # Best effort: a failed call degrades to the untranslated text.
        log.warning(f"Translation API error: {e}")
        return text

    succeeded = payload.get("responseStatus") == 200 and payload.get("responseData")
    if not succeeded:
        return text
    return payload["responseData"]["translatedText"]
|
||||
|
||||
|
||||
@app.get("/api/translate/{arxiv_id}")
def translate_paper(arxiv_id: str):
    """Translate the body of a locally cached arXiv PDF, paragraph by paragraph.

    Responds 404 when the PDF is not cached.  A previously stored result in
    TRANSLATE_CACHE is returned as is; otherwise the text is extracted, split,
    translated chunk by chunk, written to the cache and returned as
    ``{"arxiv_id", "paragraphs": [{"page", "en", "zh"}, ...], "count"}``.
    """
    source_pdf = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if not source_pdf.exists():
        raise HTTPException(status_code=404, detail=f"PDF not cached: {arxiv_id}")

    cache_file = TRANSLATE_CACHE / f"{arxiv_id}.json"
    if cache_file.exists():
        with open(cache_file) as fh:
            return json.load(fh)

    # Extract text with page numbers
    log.info(f"Extracting text from {arxiv_id}")
    chunks = split_text_with_pages(extract_pdf_text_with_pages(source_pdf))
    chunk_total = len(chunks)
    log.info(f"Translating {chunk_total} paragraphs for {arxiv_id}")

    paragraphs = []
    for number, chunk in enumerate(chunks, start=1):
        # Progress line for paragraphs 1, 11, 21, ...
        if number % 10 == 1:
            log.info(f" [{arxiv_id}] translating paragraph {number}/{chunk_total}")
        english = chunk["text"]
        paragraphs.append({
            "page": chunk["page"],
            "en": english,
            "zh": translate_text(english),
        })

    payload = {"arxiv_id": arxiv_id, "paragraphs": paragraphs, "count": len(paragraphs)}
    with open(cache_file, "w") as fh:
        json.dump(payload, fh, ensure_ascii=False)
    return payload
|
||||
|
||||
|
||||
@app.get("/api/translate/{arxiv_id}/status")
def translate_status(arxiv_id: str):
    """Report whether a cached translation and the source PDF exist for an ID."""
    translation_file = TRANSLATE_CACHE / f"{arxiv_id}.json"
    source_pdf = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    status = {"arxiv_id": arxiv_id}
    status["cached"] = translation_file.exists()
    status["pdf_exists"] = source_pdf.exists()
    return status
|
||||
|
||||
# ─── Routes: PDF download on-demand ────────────────────
|
||||
@app.post("/api/download/{arxiv_id}")
def download_single_pdf(arxiv_id: str):
    """Download one arXiv PDF into the local cache on demand.

    The requested paper is fetched directly.  Delegating to the batch
    downloader with ``--limit 1`` only ever tried the first catalogue entry,
    so any other ID was reported as "failed".

    Returns:
        ``{"ok": bool, "arxiv_id": str, "status": str}`` where status is
        "cached", "downloaded", "failed" or "timeout".
    """
    import http.client
    import socket
    import urllib.error
    import urllib.parse
    import urllib.request

    # NOTE(review): unlike the other write routes this one is not protected by
    # verify_api_key — confirm that is intentional.

    def outcome(ok: bool, status: str) -> dict:
        return {"ok": ok, "arxiv_id": arxiv_id, "status": status}

    # The ID ends up in a URL and a file name: accept plain names only.
    if not arxiv_id or ".." in arxiv_id or "/" in arxiv_id or "\\" in arxiv_id:
        return outcome(False, "failed")

    pdf_path = PAPERS_DIR / "arxiv" / f"{arxiv_id}.pdf"
    if pdf_path.exists():
        return outcome(True, "cached")

    request = urllib.request.Request(
        f"https://arxiv.org/pdf/{urllib.parse.quote(arxiv_id)}.pdf",
        headers={"User-Agent": "LLM-Library-Downloader/0.1"},
    )
    try:
        with urllib.request.urlopen(request, timeout=30) as resp:
            body = resp.read()
    except (socket.timeout, TimeoutError):
        return outcome(False, "timeout")
    except urllib.error.URLError as exc:
        # Connect timeouts arrive wrapped in URLError.
        if isinstance(exc.reason, (socket.timeout, TimeoutError)):
            return outcome(False, "timeout")
        log.warning(f"Download failed for {arxiv_id}: {exc}")
        return outcome(False, "failed")
    except (OSError, http.client.HTTPException) as exc:
        log.warning(f"Download failed for {arxiv_id}: {exc}")
        return outcome(False, "failed")

    # arXiv answers with an HTML page for unknown IDs.
    if not body.startswith(b"%PDF"):
        log.warning(f"Not a PDF: {arxiv_id}")
        return outcome(False, "failed")

    # Write to a temp file and rename so a partial write is never served.
    tmp_path = pdf_path.with_name(pdf_path.name + ".part")
    try:
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path.write_bytes(body)
        tmp_path.replace(pdf_path)
    except OSError as exc:
        tmp_path.unlink(missing_ok=True)
        log.warning(f"Could not store {arxiv_id}: {exc}")
        return outcome(False, "failed")
    return outcome(True, "downloaded")
|
||||
|
||||
# ─── Health ─────────────────────────────────────────────
|
||||
@app.get("/api/health")
def health():
    """Liveness probe: report a fixed status and the API version."""
    return dict(status="ok", version="0.1.0")
|
||||
|
||||
# ─── Mount static frontend (at /) ──────────────────────
|
||||
# Static files mounted after API routes to avoid conflicts
|
||||
static_dir = ROOT / "static"
# Serve the frontend from "/" only when the directory exists and has at least
# one entry; done after the API routes are declared so they take precedence.
if static_dir.exists() and next(static_dir.iterdir(), None) is not None:
    app.mount(
        "/",
        StaticFiles(directory=str(static_dir), html=True),
        name="static",
    )
|
||||
|
||||
# ─── Main ───────────────────────────────────────────────
|
||||
def main():
    """Start the API with uvicorn on 0.0.0.0:8000 with auto-reload enabled."""
    from uvicorn import run as serve

    serve("api.server:app", host="0.0.0.0", port=8000, reload=True)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user