{ "arch": { "id": "arch", "name": "模型架构设计", "icon": "🏗️", "desc": "Transformer 变体、注意力机制、MoE、位置编码、SSM 等", "color": "arch", "areas": [ { "id": "transformer", "name": "Transformer 核心架构", "mainline": [ { "title": "Attention Is All You Need", "authors": "Vaswani et al.", "year": 2017, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "1706.03762" }, { "title": "GPT-2 / GPT-3: Language Models are Few-Shot Learners", "authors": "Radford et al. / Brown et al.", "year": 2020, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2005.14165" }, { "title": "LLaMA: Open and Efficient Foundation Language Models", "authors": "Touvron et al.", "year": 2023, "venue": "Meta AI", "tags": [ "关键节点" ], "arxiv": "2302.13971" }, { "title": "DeepSeek-V3 / V3.2: MoE + MLA + MTP + Sparse Attention", "authors": "DeepSeek-AI", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2412.19437" }, { "title": "Kimi K2: Open Agentic Intelligence (1T MoE, 128K ctx, Muon optimizer)", "authors": "Moonshot AI", "year": 2025, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2507.20534" }, { "title": "Llama 3 / Llama 4 (Scout/Maverick/Behemoth, MoE, 10M context)", "authors": "Meta AI", "year": 2025, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2601.11659" }, { "title": "DeepSeek-V4: 1.6T MoE + CSA+HCA Hybrid Attention + 1M Context", "authors": "DeepSeek-AI", "year": 2026, "venue": "DeepSeek", "tags": [ "前沿" ], "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" }, { "title": "Kimi K2.5: Visual Agentic Intelligence (native multimodal, Agent Swarm)", "authors": "Moonshot AI", "year": 2026, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2602.02276" } ], "branches": [ { "title": "T5: Exploring the Limits of Transfer Learning", "authors": "Raffel et al.", "year": 2020, "venue": "JMLR", "tags": [ "支线" ], "arxiv": "1910.10683" }, { "title": "PaLM / PaLM-2: Pathways Language Model", "authors": "Chowdhery et al.", "year": 2022, "venue": "JMLR", "tags": [ "支线" ], "arxiv": 
"2204.02311" }, { "title": "Gemma 2 / Gemma 3 系列", "authors": "Google DeepMind", "year": 2024, "venue": "Google", "tags": [ "支线" ], "arxiv": "2408.00118" } ], "forward": [] }, { "id": "attention", "name": "注意力机制演进", "mainline": [ { "title": "Multi-Head Attention (MHA)", "authors": "Vaswani et al.", "year": 2017, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "1706.03762" }, { "title": "MQA: Fast Transformer Decoding — One Write-Head Is All You Need", "authors": "Shazeer", "year": 2019, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "1911.02150" }, { "title": "GQA: Training Generalized Multi-Query Transformer Models", "authors": "Ainslie et al.", "year": 2023, "venue": "EMNLP", "tags": [ "关键节点" ], "arxiv": "2305.13245" }, { "title": "MLA: Multi-head Latent Attention (DeepSeek-V2/V3)", "authors": "DeepSeek-AI", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2405.04434" }, { "title": "FlashAttention-3 / Sparse Attention 工程化", "authors": "Dao et al. / DeepSeek", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2407.08608" }, { "title": "MiniMax-01: Lightning Attention + MoE (456B, 45.9B active)", "authors": "MiniMax", "year": 2025, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2501.08313" }, { "title": "DeepSeek-V4: CSA+HCA Hybrid Attention (KV Cache → 10% of V3.2)", "authors": "DeepSeek-AI", "year": 2026, "venue": "DeepSeek", "tags": [ "前沿" ], "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" } ], "branches": [ { "title": "FlashAttention-1 / FlashAttention-2: IO-Aware Exact Attention", "authors": "Dao et al.", "year": 2022, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "2205.14135" }, { "title": "Differential Transformer (DIFF Transformer)", "authors": "Ye et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2410.05258" } ], "forward": [ { "title": "Engram: Conditional Memory via Scalable Lookup (N-gram O(1), 分离记忆与推理)", "authors": "DeepSeek-AI", "year": 2026, "venue": "arXiv", "tags": 
[ "前瞻" ], "arxiv": "2601.07372" } ] }, { "id": "moe", "name": "MoE 混合专家", "mainline": [ { "title": "Sparsely-Gated MoE Layer: Outrageously Large Neural Networks", "authors": "Shazeer et al.", "year": 2017, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "1701.06538" }, { "title": "Switch Transformers: Scaling to Trillion Parameter Models", "authors": "Fedus et al.", "year": 2022, "venue": "JMLR", "tags": [ "关键节点" ], "arxiv": "2101.03961" }, { "title": "DeepSeekMoE: Fine-Grained Expert Segmentation + Shared Experts", "authors": "DeepSeek-AI", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2401.06066" }, { "title": "Mixtral of Experts (8x7B, 开放 MoE)", "authors": "Jiang et al.", "year": 2024, "venue": "Mistral AI", "tags": [ "关键节点" ], "arxiv": "2401.04088" }, { "title": "DeepSeek-V4: 1.6T参数 MoE + 领域专家独立训练后合并 + On-Policy Distillation", "authors": "DeepSeek-AI", "year": 2026, "venue": "DeepSeek", "tags": [ "前沿" ], "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" } ], "branches": [ { "title": "GLaM: Efficient Scaling with Mixture-of-Experts", "authors": "Du et al.", "year": 2022, "venue": "ICML", "tags": [ "支线" ], "arxiv": "2112.06905" }, { "title": "DeepSeek-V3 Multi-Token Prediction (MTP)", "authors": "DeepSeek-AI", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2412.19437" }, { "title": "Muon: Muon is Scalable for LLM Training (Kimi 核心优化器)", "authors": "Jordan et al.", "year": 2025, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2502.16982" } ], "forward": [] }, { "id": "posenc", "name": "位置编码", "mainline": [ { "title": "Sinusoidal Positional Encoding", "authors": "Vaswani et al.", "year": 2017, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "1706.03762" }, { "title": "RoPE: Rotary Position Embedding", "authors": "Su et al.", "year": 2021, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2104.09864" }, { "title": "ALiBi: Train Short, Test Long", "authors": "Press et al.", "year": 2022, "venue": "ICLR", 
"tags": [ "关键节点" ], "arxiv": "2108.12409" }, { "title": "YaRN: Efficient Context Window Extension of LLMs", "authors": "Peng et al.", "year": 2023, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2309.00071" }, { "title": "NoPE + 10M Token Context (Llama 4 Scout)", "authors": "Kazemnejad et al. / Meta", "year": 2025, "venue": "ICML / Meta", "tags": [ "前沿" ], "arxiv": "2305.19466" } ], "branches": [ { "title": "ReRoPE / SelfExtend: Training-Free Length Extension", "authors": "Su et al. / Jin et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2401.01325" } ], "forward": [] }, { "id": "norm", "name": "归一化与激活函数", "mainline": [ { "title": "Layer Normalization", "authors": "Ba et al.", "year": 2016, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "1607.06450" }, { "title": "RMSNorm: Root Mean Square Layer Normalization", "authors": "Zhang & Sennrich", "year": 2019, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "1910.07467" }, { "title": "SwiGLU / GLU Variants (成为行业标配)", "authors": "Shazeer et al.", "year": 2020, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2002.05202" }, { "title": "DeepNorm: DeepNet — Scaling Transformers to 1,000 Layers", "authors": "Wang et al.", "year": 2022, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2203.00555" }, { "title": "mHC: Manifold-Constrained Hyper-Connections (DeepSeek-V4 基础)", "authors": "DeepSeek-AI", "year": 2026, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2512.24880" } ], "branches": [], "forward": [] }, { "id": "ssm", "name": "新兴架构 (SSM / Linear Attention)", "mainline": [ { "title": "S4: Efficiently Modeling Long Sequences with Structured State Spaces", "authors": "Gu et al.", "year": 2022, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "2111.00396" }, { "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces", "authors": "Gu & Dao", "year": 2023, "venue": "COLM", "tags": [ "关键节点" ], "arxiv": "2312.00752" }, { "title": "RWKV: Reinventing RNNs for the Transformer Era", "authors": "Peng et 
al.", "year": 2023, "venue": "EMNLP", "tags": [ "关键节点" ], "arxiv": "2305.13048" }, { "title": "Mamba-2: Transformers are SSMs", "authors": "Dao & Gu", "year": 2024, "venue": "ICML", "tags": [ "关键节点" ], "arxiv": "2405.21060" }, { "title": "Titans: Learning to Memorize at Test Time (Neural Memory)", "authors": "Behrouz et al.", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2501.00663" }, { "title": "Kimi Linear (K2.5 混合架构: Transformer + Linear Attention)", "authors": "Moonshot AI", "year": 2026, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2602.02276" } ], "branches": [ { "title": "RetNet: Retentive Network — A Successor to Transformer", "authors": "Sun et al.", "year": 2023, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2307.08621" }, { "title": "xLSTM: Extended Long Short-Term Memory (sLSTM + mLSTM, matrix memory, linear attention revival)", "authors": "Beck et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2405.04517" }, { "title": "Hymba: Hybrid Mamba-Transformer", "authors": "NVIDIA / Meta", "year": 2025, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2411.13676" } ], "forward": [ { "title": "Byte Latent Transformer: Patches Scale Better Than Tokens (tokenization-free, entropy-based patching)", "authors": "Pagnoni et al. / Meta FAIR", "year": 2024, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2412.09871" }, { "title": "Large Concept Models: Language Modeling in a Sentence Representation Space (SONAR, 200 languages)", "authors": "Barrault et al. 
/ Meta FAIR", "year": 2024, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2412.08821" }, { "title": "MatMul-free Language Modeling (消除矩阵乘法, 类脑计算)", "authors": "Zhu et al.", "year": 2024, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2406.02528" }, { "title": "LLaDA: Large Language Diffusion with mAsking (diffusion-based LLM, 8B, non-autoregressive generation)", "authors": "Nie et al.", "year": 2025, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2502.09992" }, { "title": "Titans: Learning to Memorize at Test Time (神经记忆, 超越 Transformer)", "authors": "Behrouz et al.", "year": 2025, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2501.00663" } ] } ] }, "multi": { "id": "multi", "name": "多模态", "icon": "🖼️", "desc": "视觉-语言、音频、视频、Omni 统一模态", "color": "multi", "areas": [ { "id": "vision", "name": "视觉-语言模型", "mainline": [ { "title": "CLIP: Learning Transferable Visual Models (对比学习奠基)", "authors": "Radford et al.", "year": 2021, "venue": "ICML", "tags": [ "起点" ], "arxiv": "2103.00020" }, { "title": "BLIP-2: Bootstrapping Language-Image Pre-training (Q-Former)", "authors": "Li et al.", "year": 2023, "venue": "ICML", "tags": [ "关键节点" ], "arxiv": "2301.12597" }, { "title": "LLaVA / LLaVA-1.5: Visual Instruction Tuning", "authors": "Liu et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2304.08485" }, { "title": "GPT-4o System Card: 原生多模态 (文本/图像/音频端到端)", "authors": "OpenAI", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2410.21276" }, { "title": "Qwen3-VL / InternVL3 / Kimi K2.5 Visual: 开源 SOTA 视觉-语言", "authors": "Alibaba / Shanghai AI Lab / Moonshot", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2602.02276" }, { "title": "GPT-4V / GPT-4o / GPT-5.4: Multimodal Frontier (vision + speech + text native, real-time voice)", "authors": "OpenAI", "year": 2026, "venue": "OpenAI", "tags": [ "关键节点" ], "arxiv": "" } ], "branches": [ { "title": "Flamingo: Visual Language Model for Few-Shot Learning", "authors": "Alayrac et al.", "year": 
2022, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "2204.14198" }, { "title": "SigLIP: Sigmoid Loss for Language Image Pre-training", "authors": "Zhai et al.", "year": 2023, "venue": "ICCV", "tags": [ "支线" ], "arxiv": "2303.15343" } ], "forward": [] }, { "id": "omni", "name": "Omni 统一多模态", "mainline": [ { "title": "Gemini 1.0 Technical Report: 原生多模态 (文本/图像/音频/视频)", "authors": "Google DeepMind", "year": 2023, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "2312.11805" }, { "title": "Chameleon: Mixed-Modal Early-Fusion Foundation Models", "authors": "Team Chameleon (Meta)", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2405.09818" }, { "title": "GPT-4o System Card: 端到端 omni 多模态", "authors": "OpenAI", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2410.21276" }, { "title": "Qwen3.5-Omni Technical Report: 全模态 (文本/图像/音频/视频/语音)", "authors": "Alibaba Qwen", "year": 2026, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2604.15804" } ], "branches": [ { "title": "EMU3: Next-Token Prediction is All You Need (BAAI 统一多模态)", "authors": "Wang et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2409.18869" } ], "forward": [] }, { "id": "audio", "name": "音频/语音模型", "mainline": [ { "title": "Whisper / Whisper-large-v3: Robust Speech Recognition", "authors": "Radford et al.", "year": 2023, "venue": "ICML", "tags": [ "起点" ], "arxiv": "2212.04356" }, { "title": "CosyVoice 2: Scalable Streaming Speech Synthesis", "authors": "Du et al.", "year": 2024, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2412.10117" }, { "title": "Qwen-Audio / Qwen2-Audio: 通用音频理解", "authors": "Alibaba", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2311.07919" } ], "branches": [ { "title": "Moshi: Real-time Speech Dialogue (Kyutai)", "authors": "Défossez et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2410.00037" } ], "forward": [] }, { "id": "video", "name": "视频理解", "mainline": [ { "title": "Video-LLaMA / VideoChat: 视频-语言对话", 
"authors": "Zhang et al.", "year": 2023, "venue": "EMNLP", "tags": [ "起点" ], "arxiv": "2306.02858" }, { "title": "Gemini 1.5 Pro: 1M token 长上下文视频理解", "authors": "Google DeepMind", "year": 2024, "venue": "Google", "tags": [ "关键节点" ], "arxiv": "2403.05530" }, { "title": "LLaVA-OneVision / Qwen3-VL 视频能力", "authors": "Liu et al. / Qwen", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2408.03326" } ], "branches": [], "forward": [] } ] }, "data": { "id": "data", "name": "数据工程", "icon": "📊", "desc": "采集清洗、数据配比、合成数据、质量筛选", "color": "data", "areas": [ { "id": "dedup", "name": "数据清洗与去重", "mainline": [ { "title": "The Pile: An 800GB Dataset of Diverse Text", "authors": "Gao et al.", "year": 2020, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2101.00027" }, { "title": "Deduplicating Training Data Makes Language Models Better", "authors": "Lee et al.", "year": 2022, "venue": "ACL", "tags": [ "起点" ], "arxiv": "2107.06499" }, { "title": "CCNet / RefinedWeb / FineWeb: 大规模高质量 Web 数据", "authors": "Penedo et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2306.01116" }, { "title": "DCLM: DataComp-LM — 数据筛选标准化基准", "authors": "Li et al.", "year": 2024, "venue": "NeurIPS", "tags": [ "前沿" ], "arxiv": "2406.11794" }, { "title": "Dolma / OLMo / datatrove: 全开放数据处理管线", "authors": "Soldaini et al.", "year": 2024, "venue": "ACL", "tags": [ "前沿" ], "arxiv": "2402.00159" } ], "branches": [ { "title": "C4 / mC4: Colossal Clean Crawled Corpus (T5)", "authors": "Raffel et al.", "year": 2020, "venue": "JMLR", "tags": [ "支线" ], "arxiv": "1910.10683" } ], "forward": [] }, { "id": "mixing", "name": "数据配比与课程学习", "mainline": [ { "title": "DoReMi: Optimizing Data Mixtures Speeds Up LM Pretraining", "authors": "Xie et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "2305.10429" }, { "title": "DOGE: Domain-General Reweighting for LLM Pretraining", "authors": "Fan et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2405.13063" } ], 
"branches": [ { "title": "Scaling Data-Constrained Language Models (多轮重复训练)", "authors": "Muennighoff et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "2305.16264" } ], "forward": [] }, { "id": "synthesis", "name": "合成数据生成", "mainline": [ { "title": "Evol-Instruct (WizardLM) / Orca: 渐进式指令演化", "authors": "Xu et al.", "year": 2023, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2304.12244" }, { "title": "Self-Instruct: Aligning LM with Self-Generated Instructions", "authors": "Wang et al.", "year": 2023, "venue": "ACL", "tags": [ "起点" ], "arxiv": "2212.10560" }, { "title": "Magpie: Alignment Data Synthesis from Scratch", "authors": "Xu et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2406.08464" }, { "title": "DeepSeek-R1 冷启动数据合成 (长推理链)", "authors": "DeepSeek-AI", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2501.12948" }, { "title": "Phi-4: 合成数据驱动的推理训练 + 代码合成", "authors": "Microsoft Research", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2412.08905" } ], "branches": [ { "title": "Alpaca: A Strong, Replicable Instruction-Following Model (Stanford CRFM)", "authors": "Taori et al.", "year": 2023, "venue": "Stanford", "tags": [ "支线" ], "arxiv": "" } ], "forward": [] } ] }, "pretrain": { "id": "pretrain", "name": "预训练", "icon": "🔥", "desc": "训练目标、分布式并行、训练稳定性、Scaling Law、长上下文", "color": "pretrain", "areas": [ { "id": "scaling", "name": "Scaling Law 与计算优化", "mainline": [ { "title": "Scaling Laws for Neural Language Models (Kaplan)", "authors": "Kaplan et al.", "year": 2020, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "2001.08361" }, { "title": "Chinchilla: Training Compute-Optimal Large Language Models", "authors": "Hoffmann et al.", "year": 2022, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2203.15556" }, { "title": "Scaling Data-Constrained Language Models (重复训练 scaling)", "authors": "Muennighoff et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2305.16264" }, 
{ "title": "Inference-Aware Scaling Laws (部署成本纳入 scaling)", "authors": "Sardana & Frankle", "year": 2024, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2401.00448" }, { "title": "Qwen3 / DeepSeek-V3 实践: 超 Chinchilla ~15×~60× tokens", "authors": "Alibaba / DeepSeek", "year": 2025, "venue": "Industry", "tags": [ "前沿" ], "arxiv": "2503.20630" } ], "branches": [], "forward": [] }, { "id": "distributed", "name": "分布式训练系统", "mainline": [ { "title": "Megatron-LM: Training Multi-Billion Parameter Models (TP/PP)", "authors": "Shoeybi et al.", "year": 2020, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "1909.08053" }, { "title": "ZeRO / ZeRO++ / ZeRO-3: Memory Optimizations (DeepSpeed)", "authors": "Rajbhandari et al.", "year": 2020, "venue": "SC", "tags": [ "关键节点" ], "arxiv": "1910.02054" }, { "title": "3D 并行 + 序列并行 + 专家并行 (DeepSeek-V3 部署)", "authors": "DeepSeek-AI", "year": 2024, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2412.19437" } ], "branches": [ { "title": "GPipe / PipeDream: Pipeline Parallelism 基础", "authors": "Huang et al.", "year": 2019, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "1811.06965" }, { "title": "Ring Attention / Striped Attention: 长序列分布式训练", "authors": "Liu et al. / Brandon et al.", "year": 2023, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2310.01889" } ], "forward": [] }, { "id": "stability", "name": "训练稳定性与精度", "mainline": [ { "title": "Mixed Precision Training (FP16 → 标准)", "authors": "Micikevicius et al.", "year": 2018, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "1710.03740" }, { "title": "FP8 Training for LLMs (H100/B200 原生支持)", "authors": "Micikevicius et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2310.18313" } ], "branches": [ { "title": "GLM / BLOOM: 千卡训练稳定性工程经验", "authors": "Zeng et al. 
/ BigScience", "year": 2023, "venue": "ICLR", "tags": [ "支线" ], "arxiv": "2210.02414" } ], "forward": [] } ] }, "post": { "id": "post", "name": "后训练", "icon": "🎯", "desc": "SFT、RLHF/DPO/GRPO、推理增强、安全对齐", "color": "post", "areas": [ { "id": "sft", "name": "SFT 监督微调", "mainline": [ { "title": "Finetuned Language Models Are Zero-Shot Learners (FLAN)", "authors": "Wei et al.", "year": 2022, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "2109.01652" }, { "title": "LIMA: Less Is More for Alignment (1,000 高质量样本)", "authors": "Zhou et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2305.11206" }, { "title": "Tulu 3 / Orca 3: 系统化后训练管线", "authors": "AllenAI / MSR", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2411.15124" } ], "branches": [ { "title": "LoRA: Low-Rank Adaptation of LLMs", "authors": "Hu et al.", "year": 2022, "venue": "ICLR", "tags": [ "支线" ], "arxiv": "2106.09685" }, { "title": "QLoRA: Efficient Finetuning of Quantized LLMs", "authors": "Dettmers et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "2305.14314" } ], "forward": [] }, { "id": "alignment", "name": "偏好对齐 (RLHF → DPO → GRPO)", "mainline": [ { "title": "Constitutional AI: Harmlessness from AI Feedback", "authors": "Bai et al.", "year": 2022, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2212.08073" }, { "title": "InstructGPT: Training Language Models to Follow Instructions (RLHF + PPO)", "authors": "Ouyang et al.", "year": 2022, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "2203.02155" }, { "title": "DPO: Direct Preference Optimization", "authors": "Rafailov et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2305.18290" }, { "title": "SimPO / ORPO / KTO: 参考模型不可知的对齐方法", "authors": "Meng et al.", "year": 2024, "venue": "NeurIPS", "tags": [ "前沿" ], "arxiv": "2405.14734" }, { "title": "DeepSeek-R1 / GRPO: 纯 RL 驱动推理涌现 (无人工标注)", "authors": "DeepSeek-AI", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2501.12948" } ], 
"branches": [ { "title": "RLAIF: Constitutional AI — RL from AI Feedback", "authors": "Bai et al.", "year": 2022, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2212.08073" } ], "forward": [] }, { "id": "reasoning", "name": "推理增强 (Reasoning)", "mainline": [ { "title": "Chain-of-Thought Prompting Elicits Reasoning in LLMs", "authors": "Wei et al.", "year": 2022, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "2201.11903" }, { "title": "STaR: Self-Taught Reasoner / ReST (自学习推理链)", "authors": "Zelikman et al.", "year": 2022, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2203.14465" }, { "title": "Self-Consistency Improves Chain of Thought Reasoning", "authors": "Wang et al.", "year": 2023, "venue": "ICLR", "tags": [ "关键节点" ], "arxiv": "2203.11171" }, { "title": "OpenAI o1 System Card / o3 System Card (推理时 Scaling)", "authors": "OpenAI", "year": 2024, "venue": "OpenAI", "tags": [ "前沿" ], "arxiv": "2412.16720" }, { "title": "DeepSeek-R1: 开源推理模型 (RL + Cold-Start SFT)", "authors": "DeepSeek-AI", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2501.12948" }, { "title": "Kimi K2-Thinking (扩展推理, 256K上下文)", "authors": "Moonshot AI", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2507.20534" }, { "title": "MiniMax-M1: CISPO (Clipped IS-weight Policy Optimization) RL for reasoning emergence", "authors": "MiniMax", "year": 2025, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2506.13585" } ], "branches": [ { "title": "Tree of Thoughts: Deliberate Problem Solving with LLMs", "authors": "Yao et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "2305.10601" }, { "title": "PRM/ORM: Let's Verify Step by Step (过程/结果奖励模型)", "authors": "Lightman et al.", "year": 2024, "venue": "ICLR", "tags": [ "支线" ], "arxiv": "2305.20050" } ], "forward": [ { "title": "Coconut: Training LLMs to Reason in a Continuous Latent Space (摆脱文字链, 潜在空间连续推理)", "authors": "Hao et al. 
/ Meta FAIR", "year": 2024, "venue": "NeurIPS 2024", "tags": [ "前瞻" ], "arxiv": "2412.06769" }, { "title": "Quiet-STaR: Language Models Can Teach Themselves to Think Before Speaking", "authors": "Zelikman et al.", "year": 2024, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2403.09629" } ] } ] }, "compress": { "id": "compress", "name": "模型压缩", "icon": "📦", "desc": "量化、剪枝、蒸馏、KV Cache 压缩", "color": "compress", "areas": [ { "id": "quant", "name": "量化 (Quantization)", "mainline": [ { "title": "GPTQ: Accurate Post-Training Quantization for GPT", "authors": "Frantar et al.", "year": 2023, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "2210.17323" }, { "title": "AWQ: Activation-aware Weight Quantization", "authors": "Lin et al.", "year": 2024, "venue": "MLSys", "tags": [ "关键节点" ], "arxiv": "2306.00978" }, { "title": "SmoothQuant: Accurate and Efficient Post-Training Quantization", "authors": "Xiao et al.", "year": 2024, "venue": "ICML", "tags": [ "关键节点" ], "arxiv": "2211.10438" }, { "title": "QuaRot / SpinQuant: Outlier-Free 4-bit Quantization", "authors": "Ashkboos et al.", "year": 2025, "venue": "ICLR", "tags": [ "前沿" ], "arxiv": "2404.00456" }, { "title": "DeepSeek-V4 FP4 QAT: 4-bit 量化感知训练 (99.7% recall)", "authors": "DeepSeek-AI", "year": 2026, "venue": "DeepSeek", "tags": [ "前沿" ], "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" } ], "branches": [ { "title": "FP8 / FP4 推理 (NVIDIA B200 原生支持)", "authors": "Micikevicius et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2310.18313" } ], "forward": [ { "title": "BitNet b1.58: 1.58-bit LLM (三值量化 {−1,0,1})", "authors": "Wang et al.", "year": 2024, "venue": "arXiv", "tags": [ "前瞻" ], "arxiv": "2402.17764" } ] }, { "id": "prune", "name": "剪枝 (Pruning)", "mainline": [ { "title": "SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot", "authors": "Frantar & Alistarh", "year": 2023, "venue": "ICML", "tags": [ "起点" ], "arxiv": "2301.00774" }, { "title": 
"LLM-Pruner / ShortGPT: 结构化剪枝 + 层剪枝", "authors": "Ma et al.", "year": 2024, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2305.11627" }, { "title": "SliceGPT: Compress LLMs by Deleting Rows/Columns", "authors": "Ashkboos et al.", "year": 2024, "venue": "ICLR", "tags": [ "关键节点" ], "arxiv": "2401.15024" }, { "title": "Wanda: A Simple and Effective Pruning Approach for LLMs", "authors": "Sun et al.", "year": 2024, "venue": "ICLR", "tags": [ "关键节点" ], "arxiv": "2306.11695" } ], "branches": [], "forward": [] }, { "id": "distill", "name": "知识蒸馏", "mainline": [ { "title": "Distilling the Knowledge in a Neural Network", "authors": "Hinton et al.", "year": 2015, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "1503.02531" }, { "title": "Orca: Progressive Learning from Complex Explanation Traces", "authors": "Mukherjee et al.", "year": 2023, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2306.02707" }, { "title": "DeepSeek-R1 蒸馏 (R1 → Qwen/Llama 小模型)", "authors": "DeepSeek-AI", "year": 2025, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2501.12948" }, { "title": "DeepSeek-V4 On-Policy Distillation: 10 Teacher Models → 1 Student", "authors": "DeepSeek-AI", "year": 2026, "venue": "DeepSeek", "tags": [ "前沿" ], "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" } ], "branches": [], "forward": [] }, { "id": "kvcache", "name": "KV Cache 压缩", "mainline": [ { "title": "GQA/MQA: 从架构层面减少 KV 头 (Llama3/DeepSeek 标配)", "authors": "Ainslie et al.", "year": 2023, "venue": "EMNLP", "tags": [ "前沿" ], "arxiv": "2305.13245" }, { "title": "H2O: Heavy-Hitter Oracle for Efficient KV Cache Eviction", "authors": "Zhang et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2306.14048" }, { "title": "FastGen / KIVI / CacheGen: 自适应 KV 压缩与量化", "authors": "Ge et al.", "year": 2024, "venue": "ICLR / SIGCOMM", "tags": [ "关键节点" ], "arxiv": "2310.01801" }, { "title": "StreamingLLM: Efficient Streaming Language Models with Attention Sinks", 
"authors": "Xiao et al.", "year": 2024, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "2309.17453" }, { "title": "DeepSeek-V4 CSA+HCA: KV Cache 降至 V3.2 的 10%, FLOPs 降至 27%", "authors": "DeepSeek-AI", "year": 2026, "venue": "DeepSeek", "tags": [ "前沿" ], "pdf": "https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro/resolve/main/DeepSeek_V4.pdf" } ], "branches": [], "forward": [] } ] }, "deploy": { "id": "deploy", "name": "部署推理", "icon": "🚀", "desc": "推理引擎、加速技术、服务化、边缘部署、成本优化", "color": "deploy", "areas": [ { "id": "engine", "name": "推理引擎与框架", "mainline": [ { "title": "vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention", "authors": "Kwon et al.", "year": 2023, "venue": "SOSP", "tags": [ "起点" ], "arxiv": "2309.06180" }, { "title": "SGLang: Efficient LLM Programming and Serving", "authors": "Zheng et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2312.07104" } ], "branches": [ { "title": "MII / DeepSpeed-FastGen: Dynamic SplitFuse", "authors": "Microsoft", "year": 2024, "venue": "Industry", "tags": [ "支线" ], "arxiv": "2401.08671" } ], "forward": [] }, { "id": "accel", "name": "推理加速技术", "mainline": [ { "title": "Speculative Decoding: Fast Inference from Transformers", "authors": "Leviathan et al. / Chen et al.", "year": 2023, "venue": "ICML", "tags": [ "起点" ], "arxiv": "2211.17192" }, { "title": "Continuous Batching + Prefill-Decode Disaggregation", "authors": "Patel et al. 
/ Yu et al.", "year": 2024, "venue": "OSDI", "tags": [ "前沿" ], "arxiv": "2401.08671" }, { "title": "EAGLE / EAGLE-2: 推测解码框架 (无需 draft model)", "authors": "Li et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2401.15077" }, { "title": "Medusa: Simple LLM Inference Acceleration with Multiple Heads", "authors": "Cai et al.", "year": 2024, "venue": "ICML", "tags": [ "关键节点" ], "arxiv": "2401.10774" }, { "title": "DeepSeek-V3 Multi-Token Prediction (MTP) + V4 Flash 推理", "authors": "DeepSeek-AI", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2412.19437" } ], "branches": [ { "title": "KV Cache Offloading (GPU ↔ CPU 动态迁移)", "authors": "Sheng et al.", "year": 2023, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2303.06865" }, { "title": "FlashDecoding / FlashInfer: GPU 算子级加速", "authors": "Dao et al. / Ye et al.", "year": 2024, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2310.12049" } ], "forward": [] } ] }, "agent": { "id": "agent", "name": "Agent & 应用", "icon": "🤖", "desc": "Tool Use、RAG、Multi-Agent、MCP/A2A、Computer Use", "color": "agent", "areas": [ { "id": "react", "name": "Agent 核心 (ReAct / Tool Use / Computer Use)", "mainline": [ { "title": "Generative Agents: Interactive Simulacra of Human Behavior (Stanford AI Town, memory-stream architecture, 25 agents)", "authors": "Park et al. 
/ Stanford", "year": 2023, "venue": "UIST 2023", "tags": [ "起点" ], "arxiv": "2304.03442" }, { "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "authors": "Yao et al.", "year": 2023, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "2210.03629" }, { "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", "authors": "Schick et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "关键节点" ], "arxiv": "2302.04761" }, { "title": "SWE-Agent: Agent-Computer Interfaces for Software Engineering", "authors": "Yang et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2405.15793" }, { "title": "SEAgent: Self-Evolving Computer Use Agent (GRPO 自主学习)", "authors": "Sun et al.", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2508.04700" } ], "branches": [ { "title": "Voyager: Open-Ended Embodied Agent with LLMs (Minecraft)", "authors": "Wang et al.", "year": 2023, "venue": "NeurIPS", "tags": [ "支线" ], "arxiv": "2305.16291" }, { "title": "Claude Opus 4 & Sonnet 4 System Card (Agent 安全评估)", "authors": "Anthropic", "year": 2025, "venue": "Anthropic", "tags": [ "支线" ], "pdf": "https://www-cdn.anthropic.com/6be99a52cb68eb70eb9572b4cafad13df32ed995.pdf" } ], "forward": [] }, { "id": "rag", "name": "RAG 检索增强生成", "mainline": [ { "title": "RAG: Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "authors": "Lewis et al.", "year": 2020, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "2005.11401" }, { "title": "GraphRAG: From Local to Global — Graph-based RAG", "authors": "Edge et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2404.16130" }, { "title": "Self-RAG: Learning to Retrieve, Generate, and Critique", "authors": "Asai et al.", "year": 2024, "venue": "ICLR", "tags": [ "关键节点" ], "arxiv": "2310.11511" }, { "title": "Agentic RAG / Corrective RAG / Adaptive RAG", "authors": "Yan et al. 
/ Asai et al.", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2401.15884" } ], "branches": [ { "title": "ColBERT / ColPali: Late-Interaction / Visual RAG", "authors": "Khattab et al.", "year": 2022, "venue": "SIGIR", "tags": [ "支线" ], "arxiv": "2112.01488" }, { "title": "HyDE: Precise Zero-Shot Dense Retrieval without Relevant Labels", "authors": "Gao et al.", "year": 2023, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2212.10496" } ], "forward": [] }, { "id": "multiagent", "name": "Multi-Agent & 协议", "mainline": [ { "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", "authors": "Wu et al.", "year": 2023, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "2308.08155" }, { "title": "MetaGPT / ChatDev / CrewAI: 多 Agent 协作框架", "authors": "Hong et al. / Qian et al.", "year": 2024, "venue": "ACL", "tags": [ "关键节点" ], "arxiv": "2308.00352" }, { "title": "Kimi K2.5 Agent Swarm: 100 子 Agent / 1,500 工具调用", "authors": "Moonshot AI", "year": 2026, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2602.02276" } ], "branches": [ { "title": "Gorilla: Large Language Model Connected with Massive APIs", "authors": "Patil et al.", "year": 2023, "venue": "ICML", "tags": [ "支线" ], "arxiv": "2305.15334" } ], "forward": [] } ] }, "eval": { "id": "eval", "name": "评估体系", "icon": "📏", "desc": "通用、推理、编码、Agent、安全评估", "color": "eval", "areas": [ { "id": "general", "name": "通用能力评估", "mainline": [ { "title": "MMLU: Measuring Massive Multitask Language Understanding", "authors": "Hendrycks et al.", "year": 2021, "venue": "ICLR", "tags": [ "起点" ], "arxiv": "2009.03300" }, { "title": "BIG-bench: Beyond the Imitation Game (204 tasks)", "authors": "Srivastava et al.", "year": 2023, "venue": "TMLR", "tags": [ "关键节点" ], "arxiv": "2206.04615" }, { "title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark", "authors": "White et al.", "year": 2024, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2406.19314" }, { "title": "MMLU-Pro: A More Robust and 
Challenging Benchmark", "authors": "Wang et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2406.01574" }, { "title": "Humanity's Last Exam (HLE): 人类知识极限测试", "authors": "Phan et al.", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2501.14249" } ], "branches": [ { "title": "C-Eval / CMMLU: 中文评估", "authors": "Huang et al.", "year": 2023, "venue": "arXiv", "tags": [ "支线" ], "arxiv": "2305.08322" } ], "forward": [] }, { "id": "reasoning", "name": "推理 & 数学评估", "mainline": [ { "title": "GSM8K / MATH: 数学推理基础", "authors": "Cobbe et al. / Hendrycks et al.", "year": 2021, "venue": "NeurIPS", "tags": [ "起点" ], "arxiv": "2103.03874" }, { "title": "BBH: Challenging BIG-Bench Tasks (BIG-Bench Hard)", "authors": "Suzgun et al.", "year": 2023, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2210.09261" }, { "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", "authors": "Rein et al.", "year": 2023, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2311.12022" }, { "title": "HLE (Humanity's Last Exam): 3000 问题极限测试", "authors": "Phan et al.", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2501.14249" } ], "branches": [], "forward": [] }, { "id": "code", "name": "编码评估", "mainline": [ { "title": "HumanEval / MBPP: 函数级代码生成", "authors": "Chen et al. 
/ Austin et al.", "year": 2021, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "2107.03374" }, { "title": "LiveCodeBench: Holistic and Contamination-Free Coding", "authors": "Jain et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2403.07974" }, { "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", "authors": "Jimenez et al.", "year": 2024, "venue": "ICLR", "tags": [ "关键节点" ], "arxiv": "2310.06770" }, { "title": "BigCodeBench: Benchmarking Code Generation with Diverse Tasks", "authors": "Zhuo et al.", "year": 2025, "venue": "arXiv", "tags": [ "前沿" ], "arxiv": "2406.15877" } ], "branches": [], "forward": [] }, { "id": "agent_eval", "name": "Agent 评估", "mainline": [ { "title": "GAIA: A Benchmark for General AI Assistants", "authors": "Mialon et al.", "year": 2023, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2311.12983" }, { "title": "BFCL: Berkeley Function Calling Leaderboard", "authors": "Yan et al.", "year": 2024, "venue": "arXiv", "tags": [ "起点" ], "arxiv": "2402.16053" }, { "title": "WebArena / VisualWebArena: 真实 Web 任务 Agent 评估", "authors": "Zhou et al.", "year": 2024, "venue": "ICLR", "tags": [ "关键节点" ], "arxiv": "2307.13854" }, { "title": "τ-bench: Agent Tool Use & Task Completion", "authors": "Yao et al.", "year": 2024, "venue": "arXiv", "tags": [ "关键节点" ], "arxiv": "2406.12045" } ], "branches": [], "forward": [] } ] } }