[ PROMPT_NODE_22364 ]
architectures
[ SKILL_DOCUMENTATION ]
# MoE 模型架构
关于不同专家混合(Mixture of Experts, MoE)架构及其设计模式的综合指南。
## 目录
- Mixtral 8x7B (Mistral AI)
- DeepSeek-V3 (DeepSeek AI)
- Switch Transformers (Google)
- GLaM (Google)
- 对比表
## Mixtral 8x7B (Mistral AI - 2024)
### 架构概览
**参数量:**
- 总计:47B 参数
- 每个 Token 激活:13B(8 个专家中激活 2 个)
- 每个专家:约 5.6B 参数(仅指前馈网络部分;注意力层、嵌入层等参数由所有专家共享,因此总量约为 47B,而非 8 × 7B = 56B)
**关键特性:**
- **Top-2 路由**:每个 Token 被路由至 2 个专家
- **每层 8 个专家**:稀疏激活
- **SMoE 架构**:稀疏专家混合模型
- **分组查询注意力 (GQA)**:高效的注意力机制
### 层结构
python
# Mixtral Transformer 块
class MixtralDecoderLayer(nn.Module):
    """Single decoder layer: self-attention followed by a sparse MoE block.

    Each sub-layer normalizes its input first and then adds its output back
    onto the un-normalized input (residual connection).

    Args:
        config: Model configuration. Only ``config.hidden_size`` is read
            directly here; the object is also passed unchanged to
            ``MixtralAttention`` and ``MixtralSparseMoeBlock``.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size

        # Self-attention sub-layer.
        self.self_attn = MixtralAttention(config)

        # Mixture-of-experts feed-forward sub-layer.
        self.block_sparse_moe = MixtralSparseMoeBlock(config)

        # One normalization module in front of each sub-layer.
        self.input_layernorm = MixtralRMSNorm(config.hidden_size)
        self.post_attention_layernorm = MixtralRMSNorm(config.hidden_size)

    def forward(self, hidden_states, attention_mask=None):
        """Run the attention sub-layer and then the MoE sub-layer.

        Args:
            hidden_states: Input to the layer.
            attention_mask: Optional mask, forwarded positionally to
                ``self.self_attn`` as its second argument.

        Returns:
            The hidden states after both residual sub-layers.
        """
        # NOTE(review): assumes self_attn and block_sparse_moe each return a
        # single value that can be added to the residual -- confirm against
        # their definitions.

        # Self-attention: normalize, attend, add the residual back.
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(hidden_states, attention_mask)
        hidden_states = residual + hidden_states

        # MoE feed-forward: normalize, route through experts, add residual.
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.block_sparse_moe(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states
### 稀疏 MoE 块
python
class MixtralSparseMoeBlock(nn.Module):
def __init__(self, config):
super().__init__()
self.hidden_dim = config.hidden_size
self.ffn_dim = config.intermediate_size
self.num_experts = config.num_local_experts # 8
self.top_k = config.num_experts_per_tok # 2
# 路由(门控网络)
self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
# 8 个专家 FFN
self.experts = nn.ModuleList([
MixtralBlockSparseTop2MLP(config)
for _ in range(self.num_experts)
])
def forward(self, hidden_states):
batch_size, sequence_length, hidden_dim = hidden_states.shape