# NanoGPT Training Guide

## The Training Loop (~300 lines)

NanoGPT's `train.py` is a self-contained training script with minimal dependencies.

### Full Training Script Structure
```python
# train.py (simplified)
import os
import time
import math
import pickle
import numpy as np
import torch
from model import GPTConfig, GPT
# training config
batch_size = 12  # micro-batch size
block_size = 1024  # context length
gradient_accumulation_steps = 5 * 8  # simulates a larger effective batch (~0.5M tokens per step, see below)
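# effective tokens per optimizer step:
#   batch_size * block_size * gradient_accumulation_steps
#   = 12 * 1024 * 40 = 491,520 (~0.5M tokens)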
# model config
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0
# optimizer config
learning_rate = 6e-4
max_iters = 600000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0
# learning-rate schedule
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5
# system
device = 'cuda'
dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16'
ptdtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
# note: with 'float16', the full train.py additionally wraps backward in a GradScaler
compile = True  # requires PyTorch 2.0
# data loader: pre-tokenized uint16 token ids, as written by data/<dataset>/prepare.py
data_dir = os.path.join('data', 'openwebtext')
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy(data[i:i+block_size].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(data[i+1:i+1+block_size].astype(np.int64)) for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y
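# each batch: x, y are (batch_size, block_size) int64 tensors on `device`;
# y is x shifted one position left, so y[:, t] is the target for x[:, t]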
# learning-rate schedule: linear warmup, then cosine decay down to min_lr
def get_lr(it):
    # linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # past the decay horizon, hold at the minimum learning rate
    if it > lr_decay_iters:
        return min_lr
    # cosine decay in between
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)
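# quick sanity check with the values above:
#   get_lr(0)      -> 0.0   (start of linear warmup)
#   get_lr(2000)   -> 6e-4  (peak LR at the end of warmup)
#   get_lr(600000) -> 6e-5  (cosine fully decayed to min_lr)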
# model init (pass the config values defined above)
model = GPT(GPTConfig(n_layer=n_layer, n_head=n_head, n_embd=n_embd,
                      block_size=block_size, dropout=dropout))
model.to(device)
# compile the model (PyTorch 2.0)
if compile:
    print("compiling the model...")
    model = torch.compile(model)
# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device)
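# in nanoGPT's model.py, configure_optimizers builds an AdamW optimizer that
# applies weight decay only to >=2-D parameters (matmul weights, embeddings)
# and opts into the fused AdamW kernel when running on CUDA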
# training loop
for iter_num in range(max_iters):

    # set the learning rate for this iteration
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # gradient accumulation over micro-steps
    for micro_step in range(gradient_accumulation_steps):
        X, Y = get_batch('train')
        with torch.amp.autocast(device_type='cuda', dtype=ptdtype):
            logits, loss = model(X, Y)
        # scale the loss so gradients average over the accumulated micro-steps
        loss = loss / gradient_accumulation_steps
        loss.backward()

    # gradient clipping
    if grad_clip != 0.0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # update weights and flush gradients
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)
```
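The full `train.py` goes on to periodically estimate train/val loss (and checkpoint when the validation loss improves). Below is a minimal sketch of that evaluation helper in the same style; `eval_iters` is nanoGPT's config knob for how many batches to average over:

```python
eval_iters = 200  # batches to average per split (nanoGPT's default)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()  # disable dropout for evaluation
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            with torch.amp.autocast(device_type='cuda', dtype=ptdtype):
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  # restore training mode
    return out
```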