- FastAPI 后端: REST API + Bearer Token 鉴权 + PDF 代理 - 180 篇论文数据 (data/papers.json): 9 模块、32 子领域 - 前端: 数据驱动、卡片径向渐变光效、PDF 页面内阅读 - 底部状态栏: arXiv/HF 连通性检测 - PDF 加载: arXiv 优先(5s超时) → HK 本地兜底 - Docker 化部署 (Dockerfile + start.sh + nginx.conf) - arXiv + HF 批量下载器 (api/downloader.py)
2175 lines
60 KiB
JSON
2175 lines
60 KiB
JSON
{
|
||
"arch": {
|
||
"id": "arch",
|
||
"name": "模型架构设计",
|
||
"icon": "🏗️",
|
||
"desc": "Transformer 变体、注意力机制、MoE、位置编码、SSM 等",
|
||
"color": "arch",
|
||
"areas": [
|
||
{
|
||
"id": "transformer",
|
||
"name": "Transformer 核心架构",
|
||
"mainline": [
|
||
{
|
||
"title": "Attention Is All You Need",
|
||
"authors": "Vaswani et al.",
|
||
"year": 2017,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1706.03762"
|
||
},
|
||
{
|
||
"title": "GPT-2 / GPT-3: Language Models are Few-Shot Learners",
|
||
"authors": "Radford et al. / Brown et al.",
|
||
"year": 2020,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2005.14165"
|
||
},
|
||
{
|
||
"title": "LLaMA: Open and Efficient Foundation Language Models",
|
||
"authors": "Touvron et al.",
|
||
"year": 2023,
|
||
"venue": "Meta AI",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2302.13971"
|
||
},
|
||
{
|
||
"title": "Llama 3 / Llama 4 (Scout/Maverick/Behemoth, MoE, 10M context)",
|
||
"authors": "Meta AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2601.11659"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V3 / V3.2: MoE + MLA + MTP + Sparse Attention",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2412.19437"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V4: 1.6T MoE + CSA+HCA Hybrid Attention + 1M Context",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "DeepSeek",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf"
|
||
},
|
||
{
|
||
"title": "Kimi K2: Open Agentic Intelligence (1T MoE, 128K ctx, Muon optimizer)",
|
||
"authors": "Moonshot AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2507.20534"
|
||
},
|
||
{
|
||
"title": "Kimi K2.5: Visual Agentic Intelligence (native multimodal, Agent Swarm)",
|
||
"authors": "Moonshot AI",
|
||
"year": 2026,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2602.02276"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "T5: Exploring the Limits of Transfer Learning",
|
||
"authors": "Raffel et al.",
|
||
"year": 2020,
|
||
"venue": "JMLR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "1910.10683"
|
||
},
|
||
{
|
||
"title": "PaLM / PaLM-2: Pathways Language Model",
|
||
"authors": "Chowdhery et al.",
|
||
"year": 2022,
|
||
"venue": "JMLR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2204.02311"
|
||
},
|
||
{
|
||
"title": "Gemma 2 / Gemma 3 系列",
|
||
"authors": "Google DeepMind",
|
||
"year": 2024,
|
||
"venue": "Google",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2408.00118"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "attention",
|
||
"name": "注意力机制演进",
|
||
"mainline": [
|
||
{
|
||
"title": "Multi-Head Attention (MHA)",
|
||
"authors": "Vaswani et al.",
|
||
"year": 2017,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1706.03762"
|
||
},
|
||
{
|
||
"title": "MQA: Fast Transformer Decoding — One Write-Head Is All You Need",
|
||
"authors": "Shazeer",
|
||
"year": 2019,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "1911.02150"
|
||
},
|
||
{
|
||
"title": "GQA: Training Generalized Multi-Query Transformer Models",
|
||
"authors": "Ainslie et al.",
|
||
"year": 2023,
|
||
"venue": "EMNLP",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2305.13245"
|
||
},
|
||
{
|
||
"title": "MLA: Multi-head Latent Attention (DeepSeek-V2/V3)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2405.04434"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V4: CSA+HCA Hybrid Attention (KV Cache → 10% of V3.2)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "DeepSeek",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf"
|
||
},
|
||
{
|
||
"title": "FlashAttention-3 / Sparse Attention 工程化",
|
||
"authors": "Dao et al. / DeepSeek",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2407.08608"
|
||
},
|
||
{
|
||
"title": "TEST",
|
||
"authors": "Test",
|
||
"year": 2026,
|
||
"venue": "Test",
|
||
"tags": [
|
||
"test"
|
||
]
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "FlashAttention-1 / FlashAttention-2: IO-Aware Exact Attention",
|
||
"authors": "Dao et al.",
|
||
"year": 2022,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2205.14135"
|
||
},
|
||
{
|
||
"title": "Differential Transformer (DIFF Transformer)",
|
||
"authors": "Ye et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2410.05258"
|
||
}
|
||
],
|
||
"forward": [
|
||
{
|
||
"title": "Engram: Conditional Memory via Scalable Lookup (N-gram O(1), 分离记忆与推理)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前瞻"
|
||
],
|
||
"arxiv": "2601.07372"
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"id": "moe",
|
||
"name": "MoE 混合专家",
|
||
"mainline": [
|
||
{
|
||
"title": "Sparsely-Gated MoE Layer: Outrageously Large Neural Networks",
|
||
"authors": "Shazeer et al.",
|
||
"year": 2017,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1701.06538"
|
||
},
|
||
{
|
||
"title": "Switch Transformers: Scaling to Trillion Parameter Models",
|
||
"authors": "Fedus et al.",
|
||
"year": 2022,
|
||
"venue": "JMLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2101.03961"
|
||
},
|
||
{
|
||
"title": "Mixtral of Experts (8x7B, 开放 MoE)",
|
||
"authors": "Jiang et al.",
|
||
"year": 2024,
|
||
"venue": "Mistral AI",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2401.04088"
|
||
},
|
||
{
|
||
"title": "DeepSeekMoE: Fine-Grained Expert Segmentation + Shared Experts",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2401.06066"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V4: 1.6T参数 MoE + 领域专家独立训练后合并 + On-Policy Distillation",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "DeepSeek",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "GLaM: Efficient Scaling with Mixture-of-Experts",
|
||
"authors": "Du et al.",
|
||
"year": 2022,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2112.06905"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V3 Multi-Token Prediction (MTP)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2412.19437"
|
||
},
|
||
{
|
||
"title": "Muon: Muon is Scalable for LLM Training (Kimi 核心优化器)",
|
||
"authors": "Liu et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2502.16982"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "posenc",
|
||
"name": "位置编码",
|
||
"mainline": [
|
||
{
|
||
"title": "Sinusoidal Positional Encoding",
|
||
"authors": "Vaswani et al.",
|
||
"year": 2017,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1706.03762"
|
||
},
|
||
{
|
||
"title": "RoPE: Rotary Position Embedding",
|
||
"authors": "Su et al.",
|
||
"year": 2021,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2104.09864"
|
||
},
|
||
{
|
||
"title": "ALiBi: Train Short, Test Long",
|
||
"authors": "Press et al.",
|
||
"year": 2022,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2108.12409"
|
||
},
|
||
{
|
||
"title": "YaRN: Efficient Context Window Extension of LLMs",
|
||
"authors": "Peng et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2309.00071"
|
||
},
|
||
{
|
||
"title": "NoPE + 10M Token Context (Llama 4 Scout)",
|
||
"authors": "Kazemnejad et al. / Meta",
|
||
"year": 2025,
|
||
"venue": "NeurIPS / Meta",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2305.19466"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "ReRoPE / SelfExtend: Training-Free Length Extension",
|
||
"authors": "Su et al. / Jin et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2401.01325"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "norm",
|
||
"name": "归一化与激活函数",
|
||
"mainline": [
|
||
{
|
||
"title": "Layer Normalization",
|
||
"authors": "Ba et al.",
|
||
"year": 2016,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1607.06450"
|
||
},
|
||
{
|
||
"title": "RMSNorm: Root Mean Square Layer Normalization",
|
||
"authors": "Zhang & Sennrich",
|
||
"year": 2019,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "1910.07467"
|
||
},
|
||
{
|
||
"title": "DeepNorm: DeepNet — Scaling Transformers to 1,000 Layers",
|
||
"authors": "Wang et al.",
|
||
"year": 2022,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2203.00555"
|
||
},
|
||
{
|
||
"title": "SwiGLU / GLU Variants (成为行业标配)",
|
||
"authors": "Shazeer",
|
||
"year": 2020,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2002.05202"
|
||
},
|
||
{
|
||
"title": "mHC: Manifold-Constrained Hyper-Connections (DeepSeek-V4 基础)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2512.24880"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "ssm",
|
||
"name": "新兴架构 (SSM / Linear Attention)",
|
||
"mainline": [
|
||
{
|
||
"title": "S4: Efficiently Modeling Long Sequences with Structured State Spaces",
|
||
"authors": "Gu et al.",
|
||
"year": 2022,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2111.00396"
|
||
},
|
||
{
|
||
"title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
|
||
"authors": "Gu & Dao",
|
||
"year": 2023,
|
||
"venue": "COLM",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2312.00752"
|
||
},
|
||
{
|
||
"title": "RWKV: Reinventing RNNs for the Transformer Era",
|
||
"authors": "Peng et al.",
|
||
"year": 2023,
|
||
"venue": "EMNLP",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2305.13048"
|
||
},
|
||
{
|
||
"title": "Mamba-2: Transformers are SSMs",
|
||
"authors": "Dao & Gu",
|
||
"year": 2024,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2405.21060"
|
||
},
|
||
{
|
||
"title": "Kimi Linear (K2.5 混合架构: Transformer + Linear Attention)",
|
||
"authors": "Moonshot AI",
|
||
"year": 2026,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2602.02276"
|
||
},
|
||
{
|
||
"title": "Titans: Learning to Memorize at Test Time (Neural Memory)",
|
||
"authors": "Behrouz et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2501.00663"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "RetNet: Retentive Network — A Successor to Transformer",
|
||
"authors": "Sun et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2307.08621"
|
||
},
|
||
{
|
||
"title": "Hymba: Hybrid Mamba-Transformer",
|
||
"authors": "NVIDIA / Meta",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2411.13676"
|
||
}
|
||
],
|
||
"forward": [
|
||
{
|
||
"title": "Titans: Learning to Memorize at Test Time (神经记忆, 超越 Transformer)",
|
||
"authors": "Behrouz et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前瞻"
|
||
],
|
||
"arxiv": "2501.00663"
|
||
},
|
||
{
|
||
"title": "MatMul-free Language Modeling (消除矩阵乘法, 类脑计算)",
|
||
"authors": "Zhu et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前瞻"
|
||
],
|
||
"arxiv": "2406.02528"
|
||
}
|
||
]
|
||
}
|
||
]
|
||
},
|
||
"multi": {
|
||
"id": "multi",
|
||
"name": "多模态",
|
||
"icon": "🖼️",
|
||
"desc": "视觉-语言、音频、视频、Omni 统一模态",
|
||
"color": "multi",
|
||
"areas": [
|
||
{
|
||
"id": "vision",
|
||
"name": "视觉-语言模型",
|
||
"mainline": [
|
||
{
|
||
"title": "CLIP: Learning Transferable Visual Models (对比学习奠基)",
|
||
"authors": "Radford et al.",
|
||
"year": 2021,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2103.00020"
|
||
},
|
||
{
|
||
"title": "BLIP-2: Bootstrapping Language-Image Pre-training (Q-Former)",
|
||
"authors": "Li et al.",
|
||
"year": 2023,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2301.12597"
|
||
},
|
||
{
|
||
"title": "LLaVA / LLaVA-1.5: Visual Instruction Tuning",
|
||
"authors": "Liu et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2304.08485"
|
||
},
|
||
{
|
||
"title": "GPT-4o System Card: 原生多模态 (文本/图像/音频端到端)",
|
||
"authors": "OpenAI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2410.21276"
|
||
},
|
||
{
|
||
"title": "Qwen3-VL / InternVL3 / Kimi K2.5 Visual: 开源 SOTA 视觉-语言",
|
||
"authors": "Alibaba / Shanghai AI Lab / Moonshot",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2602.02276"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "SigLIP: Sigmoid Loss for Language Image Pre-training",
|
||
"authors": "Zhai et al.",
|
||
"year": 2023,
|
||
"venue": "ICCV",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2303.15343"
|
||
},
|
||
{
|
||
"title": "Flamingo: Visual Language Model for Few-Shot Learning",
|
||
"authors": "Alayrac et al.",
|
||
"year": 2022,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2204.14198"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "omni",
|
||
"name": "Omni 统一多模态",
|
||
"mainline": [
|
||
{
|
||
"title": "Gemini 1.0 Technical Report: 原生多模态 (文本/图像/音频/视频)",
|
||
"authors": "Google DeepMind",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2312.11805"
|
||
},
|
||
{
|
||
"title": "GPT-4o System Card: 端到端 omni 多模态",
|
||
"authors": "OpenAI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2410.21276"
|
||
},
|
||
{
|
||
"title": "Chameleon: Mixed-Modal Early-Fusion Foundation Models",
|
||
"authors": "Team Chameleon (Meta)",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2405.09818"
|
||
},
|
||
{
|
||
"title": "Qwen3.5-Omni Technical Report: 全模态 (文本/图像/音频/视频/语音)",
|
||
"authors": "Alibaba Qwen",
|
||
"year": 2026,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2604.15804"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "EMU3: Next-Token Prediction is All You Need (BAAI 统一多模态)",
|
||
"authors": "Wang et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2409.18869"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "audio",
|
||
"name": "音频/语音模型",
|
||
"mainline": [
|
||
{
|
||
"title": "Whisper / Whisper-large-v3: Robust Speech Recognition",
|
||
"authors": "Radford et al.",
|
||
"year": 2023,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2212.04356"
|
||
},
|
||
{
|
||
"title": "Qwen-Audio / Qwen2-Audio: 通用音频理解",
|
||
"authors": "Alibaba",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2311.07919"
|
||
},
|
||
{
|
||
"title": "CosyVoice 2: Scalable Streaming Speech Synthesis",
|
||
"authors": "Du et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2412.10117"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "Moshi: Real-time Speech Dialogue (Kyutai)",
|
||
"authors": "Défossez et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2410.00037"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "video",
|
||
"name": "视频理解",
|
||
"mainline": [
|
||
{
|
||
"title": "Video-LLaMA / VideoChat: 视频-语言对话",
|
||
"authors": "Zhang et al.",
|
||
"year": 2023,
|
||
"venue": "EMNLP",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2306.02858"
|
||
},
|
||
{
|
||
"title": "Gemini 1.5 Pro: 1M token 长上下文视频理解",
|
||
"authors": "Google DeepMind",
|
||
"year": 2024,
|
||
"venue": "Google",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2403.05530"
|
||
},
|
||
{
|
||
"title": "LLaVA-OneVision / Qwen3-VL 视频能力",
|
||
"authors": "Liu et al. / Qwen",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2408.03326"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
}
|
||
]
|
||
},
|
||
"data": {
|
||
"id": "data",
|
||
"name": "数据工程",
|
||
"icon": "📊",
|
||
"desc": "采集清洗、数据配比、合成数据、质量筛选",
|
||
"color": "data",
|
||
"areas": [
|
||
{
|
||
"id": "dedup",
|
||
"name": "数据清洗与去重",
|
||
"mainline": [
|
||
{
|
||
"title": "Deduplicating Training Data Makes Language Models Better",
|
||
"authors": "Lee et al.",
|
||
"year": 2022,
|
||
"venue": "ACL",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2107.06499"
|
||
},
|
||
{
|
||
"title": "The Pile: An 800GB Dataset of Diverse Text",
|
||
"authors": "Gao et al.",
|
||
"year": 2020,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2101.00027"
|
||
},
|
||
{
|
||
"title": "CCNet / RefinedWeb / FineWeb: 大规模高质量 Web 数据",
|
||
"authors": "Penedo et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2306.01116"
|
||
},
|
||
{
|
||
"title": "DCLM: DataComp-LM — 数据筛选标准化基准",
|
||
"authors": "Li et al.",
|
||
"year": 2024,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2406.11794"
|
||
},
|
||
{
|
||
"title": "Dolma / OLMo / datatrove: 全开放数据处理管线",
|
||
"authors": "Soldaini et al.",
|
||
"year": 2024,
|
||
"venue": "ACL",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2402.00159"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "C4 / mC4: Colossal Clean Crawled Corpus (T5)",
|
||
"authors": "Raffel et al.",
|
||
"year": 2020,
|
||
"venue": "JMLR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "1910.10683"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "mixing",
|
||
"name": "数据配比与课程学习",
|
||
"mainline": [
|
||
{
|
||
"title": "DoReMi: Optimizing Data Mixtures Speeds Up LM Pretraining",
|
||
"authors": "Xie et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2305.10429"
|
||
},
|
||
{
|
||
"title": "DOGE: Domain-General Reweighting for LLM Pretraining",
|
||
"authors": "Fan et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2405.13063"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "Scaling Data-Constrained Language Models (多轮重复训练)",
|
||
"authors": "Muennighoff et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.16264"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "synthesis",
|
||
"name": "合成数据生成",
|
||
"mainline": [
|
||
{
|
||
"title": "Self-Instruct: Aligning LM with Self-Generated Instructions",
|
||
"authors": "Wang et al.",
|
||
"year": 2023,
|
||
"venue": "ACL",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2212.10560"
|
||
},
|
||
{
|
||
"title": "Evol-Instruct (WizardLM) / Orca: 渐进式指令演化",
|
||
"authors": "Xu et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2304.12244"
|
||
},
|
||
{
|
||
"title": "Magpie: Alignment Data Synthesis from Scratch",
|
||
"authors": "Xu et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2406.06859"
|
||
},
|
||
{
|
||
"title": "Phi-4: 合成数据驱动的推理训练 + 代码合成",
|
||
"authors": "Microsoft Research",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2412.08905"
|
||
},
|
||
{
|
||
"title": "DeepSeek-R1 冷启动数据合成 (长推理链)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2501.12948"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "Alpaca: A Strong, Replicable Instruction-Following Model (Stanford CRFM)",
|
||
"authors": "Taori et al.",
|
||
"year": 2023,
|
||
"venue": "Stanford",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2303.08774"
|
||
}
|
||
],
|
||
"forward": []
|
||
}
|
||
]
|
||
},
|
||
"pretrain": {
|
||
"id": "pretrain",
|
||
"name": "预训练",
|
||
"icon": "🔥",
|
||
"desc": "训练目标、分布式并行、训练稳定性、Scaling Law、长上下文",
|
||
"color": "pretrain",
|
||
"areas": [
|
||
{
|
||
"id": "scaling",
|
||
"name": "Scaling Law 与计算优化",
|
||
"mainline": [
|
||
{
|
||
"title": "Scaling Laws for Neural Language Models (Kaplan)",
|
||
"authors": "Kaplan et al.",
|
||
"year": 2020,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2001.08361"
|
||
},
|
||
{
|
||
"title": "Chinchilla: Training Compute-Optimal Large Language Models",
|
||
"authors": "Hoffmann et al.",
|
||
"year": 2022,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2203.15556"
|
||
},
|
||
{
|
||
"title": "Scaling Data-Constrained Language Models (重复训练 scaling)",
|
||
"authors": "Muennighoff et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2305.16264"
|
||
},
|
||
{
|
||
"title": "Qwen3 / DeepSeek-V3 实践: 超 Chinchilla ~15×~60× tokens",
|
||
"authors": "Alibaba / DeepSeek",
|
||
"year": 2025,
|
||
"venue": "Industry",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2503.20630"
|
||
},
|
||
{
|
||
"title": "Inference-Aware Scaling Laws (部署成本纳入 scaling)",
|
||
"authors": "Sardana & Frankle",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2401.00448"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "distributed",
|
||
"name": "分布式训练系统",
|
||
"mainline": [
|
||
{
|
||
"title": "Megatron-LM: Training Multi-Billion Parameter Models (TP/PP)",
|
||
"authors": "Shoeybi et al.",
|
||
"year": 2020,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1909.08053"
|
||
},
|
||
{
|
||
"title": "ZeRO / ZeRO++ / ZeRO-3: Memory Optimizations (DeepSpeed)",
|
||
"authors": "Rajbhandari et al.",
|
||
"year": 2020,
|
||
"venue": "SC",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "1910.02054"
|
||
},
|
||
{
|
||
"title": "3D 并行 + 序列并行 + 专家并行 (DeepSeek-V3 部署)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2412.19437"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "GPipe / PipeDream: Pipeline Parallelism 基础",
|
||
"authors": "Huang et al.",
|
||
"year": 2019,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "1811.06965"
|
||
},
|
||
{
|
||
"title": "Ring Attention / Striped Attention: 长序列分布式训练",
|
||
"authors": "Liu et al. / Brandon et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2310.01889"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "stability",
|
||
"name": "训练稳定性与精度",
|
||
"mainline": [
|
||
{
|
||
"title": "Mixed Precision Training (FP16 → 标准)",
|
||
"authors": "Micikevicius et al.",
|
||
"year": 2018,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1710.03740"
|
||
},
|
||
{
|
||
"title": "FP8 Training for LLMs (H100/B200 原生支持)",
|
||
"authors": "Peng et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2310.18313"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "GLM / BLOOM: 千卡训练稳定性工程经验",
|
||
"authors": "Zeng et al. / BigScience",
|
||
"year": 2023,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2210.02414"
|
||
}
|
||
],
|
||
"forward": []
|
||
}
|
||
]
|
||
},
|
||
"post": {
|
||
"id": "post",
|
||
"name": "后训练",
|
||
"icon": "🎯",
|
||
"desc": "SFT、RLHF/DPO/GRPO、推理增强、安全对齐",
|
||
"color": "post",
|
||
"areas": [
|
||
{
|
||
"id": "sft",
|
||
"name": "SFT 监督微调",
|
||
"mainline": [
|
||
{
|
||
"title": "Finetuned Language Models Are Zero-Shot Learners (FLAN)",
|
||
"authors": "Wei et al.",
|
||
"year": 2022,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2109.01652"
|
||
},
|
||
{
|
||
"title": "LIMA: Less Is More for Alignment (1,000 高质量样本)",
|
||
"authors": "Zhou et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2305.11206"
|
||
},
|
||
{
|
||
"title": "Tulu 3 / Orca 3: 系统化后训练管线",
|
||
"authors": "AllenAI / MSR",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2411.15124"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "LoRA: Low-Rank Adaptation of LLMs",
|
||
"authors": "Hu et al.",
|
||
"year": 2022,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2106.09685"
|
||
},
|
||
{
|
||
"title": "QLoRA: Efficient Finetuning of Quantized LLMs",
|
||
"authors": "Dettmers et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.14314"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "alignment",
|
||
"name": "偏好对齐 (RLHF → DPO → GRPO)",
|
||
"mainline": [
|
||
{
|
||
"title": "InstructGPT: Training Language Models to Follow Instructions (RLHF + PPO)",
|
||
"authors": "Ouyang et al.",
|
||
"year": 2022,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2203.02155"
|
||
},
|
||
{
|
||
"title": "Constitutional AI: Harmlessness from AI Feedback",
|
||
"authors": "Bai et al.",
|
||
"year": 2022,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2212.08073"
|
||
},
|
||
{
|
||
"title": "DPO: Direct Preference Optimization",
|
||
"authors": "Rafailov et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2305.18290"
|
||
},
|
||
{
|
||
"title": "DeepSeek-R1 / GRPO: 纯 RL 驱动推理涌现 (无人工标注)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2501.12948"
|
||
},
|
||
{
|
||
"title": "SimPO / ORPO / KTO: 参考模型不可知的对齐方法",
|
||
"authors": "Meng et al.",
|
||
"year": 2024,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2405.14734"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "RLAIF: Constitutional AI — RL from AI Feedback",
|
||
"authors": "Bai et al.",
|
||
"year": 2022,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2212.08073"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "reasoning",
|
||
"name": "推理增强 (Reasoning)",
|
||
"mainline": [
|
||
{
|
||
"title": "Chain-of-Thought Prompting Elicits Reasoning in LLMs",
|
||
"authors": "Wei et al.",
|
||
"year": 2022,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2201.11903"
|
||
},
|
||
{
|
||
"title": "Self-Consistency Improves Chain of Thought Reasoning",
|
||
"authors": "Wang et al.",
|
||
"year": 2023,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2203.11171"
|
||
},
|
||
{
|
||
"title": "STaR: Self-Taught Reasoner / ReST (自学习推理链)",
|
||
"authors": "Zelikman et al.",
|
||
"year": 2022,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2203.14465"
|
||
},
|
||
{
|
||
"title": "OpenAI o1 System Card / o3 System Card (推理时 Scaling)",
|
||
"authors": "OpenAI",
|
||
"year": 2024,
|
||
"venue": "OpenAI",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2412.16720"
|
||
},
|
||
{
|
||
"title": "DeepSeek-R1: 开源推理模型 (RL + Cold-Start SFT)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2501.12948"
|
||
},
|
||
{
|
||
"title": "Kimi K2-Thinking (扩展推理, 256K上下文)",
|
||
"authors": "Moonshot AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2507.20534"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "Tree of Thoughts: Deliberate Problem Solving with LLMs",
|
||
"authors": "Yao et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.10601"
|
||
},
|
||
{
|
||
"title": "PRM/ORM: Let's Verify Step by Step (过程/结果奖励模型)",
|
||
"authors": "Lightman et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.20050"
|
||
}
|
||
],
|
||
"forward": [
|
||
{
|
||
"title": "Quiet-STaR: Language Models Can Teach Themselves to Think Before Speaking",
|
||
"authors": "Zelikman et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前瞻"
|
||
],
|
||
"arxiv": "2403.09629"
|
||
}
|
||
]
|
||
}
|
||
]
|
||
},
|
||
"compress": {
|
||
"id": "compress",
|
||
"name": "模型压缩",
|
||
"icon": "📦",
|
||
"desc": "量化、剪枝、蒸馏、KV Cache 压缩",
|
||
"color": "compress",
|
||
"areas": [
|
||
{
|
||
"id": "quant",
|
||
"name": "量化 (Quantization)",
|
||
"mainline": [
|
||
{
|
||
"title": "GPTQ: Accurate Post-Training Quantization for GPT",
|
||
"authors": "Frantar et al.",
|
||
"year": 2023,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2210.17323"
|
||
},
|
||
{
|
||
"title": "AWQ: Activation-aware Weight Quantization",
|
||
"authors": "Lin et al.",
|
||
"year": 2024,
|
||
"venue": "MLSys",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2306.00978"
|
||
},
|
||
{
|
||
"title": "SmoothQuant: Accurate and Efficient Post-Training Quantization",
|
||
"authors": "Xiao et al.",
|
||
"year": 2023,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2211.10438"
|
||
},
|
||
{
|
||
"title": "QuaRot / SpinQuant: Outlier-Free 4-bit Quantization",
|
||
"authors": "Ashkboos et al.",
|
||
"year": 2025,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2404.00456"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V4 FP4 QAT: 4-bit 量化感知训练 (99.7% recall)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "DeepSeek",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "FP8 / FP4 推理 (NVIDIA B200 原生支持)",
|
||
"authors": "Micikevicius et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2310.18313"
|
||
}
|
||
],
|
||
"forward": [
|
||
{
|
||
"title": "BitNet b1.58: 1.58-bit LLM (三值量化 {−1,0,1})",
|
||
"authors": "Wang et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前瞻"
|
||
],
|
||
"arxiv": "2402.17764"
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"id": "prune",
|
||
"name": "剪枝 (Pruning)",
|
||
"mainline": [
|
||
{
|
||
"title": "SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot",
|
||
"authors": "Frantar & Alistarh",
|
||
"year": 2023,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2301.00774"
|
||
},
|
||
{
|
||
"title": "Wanda: A Simple and Effective Pruning Approach for LLMs",
|
||
"authors": "Sun et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2306.11695"
|
||
},
|
||
{
|
||
"title": "SliceGPT: Compress LLMs by Deleting Rows/Columns",
|
||
"authors": "Ashkboos et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2401.15024"
|
||
},
|
||
{
|
||
"title": "LLM-Pruner / ShortGPT: 结构化剪枝 + 层剪枝",
|
||
"authors": "Ma et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2305.11627"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "distill",
|
||
"name": "知识蒸馏",
|
||
"mainline": [
|
||
{
|
||
"title": "Distilling the Knowledge in a Neural Network",
|
||
"authors": "Hinton et al.",
|
||
"year": 2015,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "1503.02531"
|
||
},
|
||
{
|
||
"title": "Orca: Progressive Learning from Complex Explanation Traces",
|
||
"authors": "Mukherjee et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2306.02707"
|
||
},
|
||
{
|
||
"title": "DeepSeek-R1 蒸馏 (R1 → Qwen/Llama 小模型)",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2501.12948"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V4 On-Policy Distillation: 10 Teacher Models → 1 Student",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "DeepSeek",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "kvcache",
|
||
"name": "KV Cache 压缩",
|
||
"mainline": [
|
||
{
|
||
"title": "StreamingLLM: Efficient Streaming Language Models with Attention Sinks",
|
||
"authors": "Xiao et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2309.17453"
|
||
},
|
||
{
|
||
"title": "H2O: Heavy-Hitter Oracle for Efficient KV Cache Eviction",
|
||
"authors": "Zhang et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2306.14048"
|
||
},
|
||
{
|
||
"title": "FastGen / KIVI / CacheGen: 自适应 KV 压缩与量化",
|
||
"authors": "Ge et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR / SIGCOMM",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2310.07240"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V4 CSA+HCA: KV Cache 降至 V3.2 的 10%, FLOPs 降至 27%",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2026,
|
||
"venue": "DeepSeek",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf"
|
||
},
|
||
{
|
||
"title": "GQA/MQA: 从架构层面减少 KV 头 (Llama3/DeepSeek 标配)",
|
||
"authors": "Ainslie et al.",
|
||
"year": 2023,
|
||
"venue": "EMNLP",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2305.13245"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
}
|
||
]
|
||
},
|
||
"deploy": {
|
||
"id": "deploy",
|
||
"name": "部署推理",
|
||
"icon": "🚀",
|
||
"desc": "推理引擎、加速技术、服务化、边缘部署、成本优化",
|
||
"color": "deploy",
|
||
"areas": [
|
||
{
|
||
"id": "engine",
|
||
"name": "推理引擎与框架",
|
||
"mainline": [
|
||
{
|
||
"title": "vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention",
|
||
"authors": "Kwon et al.",
|
||
"year": 2023,
|
||
"venue": "SOSP",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2309.06180"
|
||
},
|
||
{
|
||
"title": "SGLang: Efficient LLM Programming and Serving",
|
||
"authors": "Zheng et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2312.07104"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "MII / DeepSpeed-FastGen: Dynamic SplitFuse",
|
||
"authors": "Microsoft",
|
||
"year": 2024,
|
||
"venue": "Industry",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2401.08671"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "accel",
|
||
"name": "推理加速技术",
|
||
"mainline": [
|
||
{
|
||
"title": "Speculative Decoding: Fast Inference from Transformers",
|
||
"authors": "Leviathan et al. / Chen et al.",
|
||
"year": 2023,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2211.17192"
|
||
},
|
||
{
|
||
"title": "Medusa: Simple LLM Inference Acceleration with Multiple Heads",
|
||
"authors": "Cai et al.",
|
||
"year": 2024,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2401.10774"
|
||
},
|
||
{
|
||
"title": "EAGLE / EAGLE-2: 推测解码框架 (无需 draft model)",
|
||
"authors": "Li et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2401.15077"
|
||
},
|
||
{
|
||
"title": "Continuous Batching + Prefill-Decode Disaggregation",
|
||
"authors": "Patel et al. / Yu et al.",
|
||
"year": 2024,
|
||
"venue": "OSDI",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2401.08671"
|
||
},
|
||
{
|
||
"title": "DeepSeek-V3 Multi-Token Prediction (MTP) + V4 Flash 推理",
|
||
"authors": "DeepSeek-AI",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2412.19437"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "FlashDecoding / FlashInfer: GPU 算子级加速",
|
||
"authors": "Dao et al. / Ye et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2310.12049"
|
||
},
|
||
{
|
||
"title": "KV Cache Offloading (GPU ↔ CPU 动态迁移)",
|
||
"authors": "Sheng et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2303.06865"
|
||
}
|
||
],
|
||
"forward": []
|
||
}
|
||
]
|
||
},
|
||
"agent": {
|
||
"id": "agent",
|
||
"name": "Agent & 应用",
|
||
"icon": "🤖",
|
||
"desc": "Tool Use、RAG、Multi-Agent、MCP/A2A、Computer Use",
|
||
"color": "agent",
|
||
"areas": [
|
||
{
|
||
"id": "react",
|
||
"name": "Agent 核心 (ReAct / Tool Use / Computer Use)",
|
||
"mainline": [
|
||
{
|
||
"title": "ReAct: Synergizing Reasoning and Acting in Language Models",
|
||
"authors": "Yao et al.",
|
||
"year": 2023,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2210.03629"
|
||
},
|
||
{
|
||
"title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
|
||
"authors": "Schick et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2302.04761"
|
||
},
|
||
{
|
||
"title": "SWE-Agent: Agent-Computer Interfaces for Software Engineering",
|
||
"authors": "Yang et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2405.15793"
|
||
},
|
||
{
|
||
"title": "SEAgent: Self-Evolving Computer Use Agent (GRPO 自主学习)",
|
||
"authors": "Sun et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2508.04700"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "Voyager: Open-Ended Embodied Agent with LLMs (Minecraft)",
|
||
"authors": "Wang et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.16291"
|
||
},
|
||
{
|
||
"title": "Claude Opus 4 & Sonnet 4 System Card (Agent 安全评估)",
|
||
"authors": "Anthropic",
|
||
"year": 2025,
|
||
"venue": "Anthropic",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"pdf": "https://www-cdn.anthropic.com/6be99a52cb68eb70eb9572b4cafad13df32ed995.pdf"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "rag",
|
||
"name": "RAG 检索增强生成",
|
||
"mainline": [
|
||
{
|
||
"title": "RAG: Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
|
||
"authors": "Lewis et al.",
|
||
"year": 2020,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2005.11401"
|
||
},
|
||
{
|
||
"title": "Self-RAG: Learning to Retrieve, Generate, and Critique",
|
||
"authors": "Asai et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2310.11511"
|
||
},
|
||
{
|
||
"title": "GraphRAG: From Local to Global — Graph-based RAG",
|
||
"authors": "Edge et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2404.16130"
|
||
},
|
||
{
|
||
"title": "Agentic RAG / Corrective RAG / Adaptive RAG",
|
||
"authors": "Yan et al. / Asai et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2401.15884"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "HyDE: Precise Zero-Shot Dense Retrieval without Relevant Labels",
|
||
"authors": "Gao et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2212.10496"
|
||
},
|
||
{
|
||
"title": "ColBERT / ColPali: Late-Interaction / Visual RAG",
|
||
"authors": "Khattab et al.",
|
||
"year": 2022,
|
||
"venue": "SIGIR",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2112.01488"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "multiagent",
|
||
"name": "Multi-Agent & 协议",
|
||
"mainline": [
|
||
{
|
||
"title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
|
||
"authors": "Wu et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2308.08155"
|
||
},
|
||
{
|
||
"title": "MetaGPT / ChatDev / CrewAI: 多 Agent 协作框架",
|
||
"authors": "Hong et al. / Qian et al.",
|
||
"year": 2024,
|
||
"venue": "ACL",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2308.00352"
|
||
},
|
||
{
|
||
"title": "Kimi K2.5 Agent Swarm: 100 子 Agent / 1,500 工具调用",
|
||
"authors": "Moonshot AI",
|
||
"year": 2026,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2602.02276"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "Gorilla: Large Language Model Connected with Massive APIs",
|
||
"authors": "Patil et al.",
|
||
"year": 2023,
|
||
"venue": "ICML",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.15334"
|
||
}
|
||
],
|
||
"forward": []
|
||
}
|
||
]
|
||
},
|
||
"eval": {
|
||
"id": "eval",
|
||
"name": "评估体系",
|
||
"icon": "📏",
|
||
"desc": "通用、推理、编码、Agent、安全评估",
|
||
"color": "eval",
|
||
"areas": [
|
||
{
|
||
"id": "general",
|
||
"name": "通用能力评估",
|
||
"mainline": [
|
||
{
|
||
"title": "MMLU: Measuring Massive Multitask Language Understanding",
|
||
"authors": "Hendrycks et al.",
|
||
"year": 2021,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2009.03300"
|
||
},
|
||
{
|
||
"title": "BIG-bench: Beyond the Imitation Game (204 tasks)",
|
||
"authors": "Srivastava et al.",
|
||
"year": 2023,
|
||
"venue": "TMLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2206.04615"
|
||
},
|
||
{
|
||
"title": "MMLU-Pro: A More Robust and Challenging Benchmark",
|
||
"authors": "Wang et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2406.01574"
|
||
},
|
||
{
|
||
"title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark",
|
||
"authors": "White et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2406.19314"
|
||
},
|
||
{
|
||
"title": "Humanity's Last Exam (HLE): 人类知识极限测试",
|
||
"authors": "Phan et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2501.14249"
|
||
}
|
||
],
|
||
"branches": [
|
||
{
|
||
"title": "C-Eval / CMMLU: 中文评估",
|
||
"authors": "Huang et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"支线"
|
||
],
|
||
"arxiv": "2305.08322"
|
||
}
|
||
],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "reasoning",
|
||
"name": "推理 & 数学评估",
|
||
"mainline": [
|
||
{
|
||
"title": "GSM8K / MATH: 数学推理基础",
|
||
"authors": "Cobbe et al. / Hendrycks et al.",
|
||
"year": 2021,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2103.03874"
|
||
},
|
||
{
|
||
"title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
|
||
"authors": "Rein et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2311.12022"
|
||
},
|
||
{
|
||
"title": "BBH: Challenging BIG-Bench Tasks (BIG-Bench Hard)",
|
||
"authors": "Suzgun et al.",
|
||
"year": 2023,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2210.09261"
|
||
},
|
||
{
|
||
"title": "HLE (Humanity's Last Exam): 3000问题极限测试",
|
||
"authors": "Phan et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2501.14249"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "code",
|
||
"name": "编码评估",
|
||
"mainline": [
|
||
{
|
||
"title": "HumanEval / MBPP: 函数级代码生成",
|
||
"authors": "Chen et al. / Austin et al.",
|
||
"year": 2021,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2107.03374"
|
||
},
|
||
{
|
||
"title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
|
||
"authors": "Jimenez et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2310.06770"
|
||
},
|
||
{
|
||
"title": "LiveCodeBench: Holistic and Contamination-Free Coding",
|
||
"authors": "Jain et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2408.07935"
|
||
},
|
||
{
|
||
"title": "BigCodeBench: Benchmarking Code Generation with Diverse Tasks",
|
||
"authors": "Zhuo et al.",
|
||
"year": 2025,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"前沿"
|
||
],
|
||
"arxiv": "2406.15877"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
},
|
||
{
|
||
"id": "agent_eval",
|
||
"name": "Agent 评估",
|
||
"mainline": [
|
||
{
|
||
"title": "BFCL: Berkeley Function Calling Leaderboard",
|
||
"authors": "Yan et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"起点"
|
||
],
|
||
"arxiv": "2402.16053"
|
||
},
|
||
{
|
||
"title": "τ-bench: Agent Tool Use & Task Completion",
|
||
"authors": "Yao et al.",
|
||
"year": 2024,
|
||
"venue": "arXiv",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2406.12045"
|
||
},
|
||
{
|
||
"title": "WebArena / VisualWebArena: 真实 Web 任务 Agent 评估",
|
||
"authors": "Zhou et al.",
|
||
"year": 2024,
|
||
"venue": "ICLR",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2307.13854"
|
||
},
|
||
{
|
||
"title": "GAIA: A Benchmark for General AI Assistants",
|
||
"authors": "Mialon et al.",
|
||
"year": 2023,
|
||
"venue": "NeurIPS",
|
||
"tags": [
|
||
"关键节点"
|
||
],
|
||
"arxiv": "2311.12983"
|
||
}
|
||
],
|
||
"branches": [],
|
||
"forward": []
|
||
}
|
||
]
|
||
}
|
||
} |