DeepSpeed笔记--利用Accelerate实现DeepSpeed加速
1--参考文档
Accelerate官方文档
accelerate+deepspeed多机多卡训练-适用集群环境
DeepSpeed & Accelerate
2--安装过程
# 安装accelerate
pip install accelerate
pip install importlib-metadata
# 获取默认配置文件
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"
# 默认保存地址
# /home/liujinfu/.cache/huggingface/accelerate/default_config.yaml
# 查看配好的环境
accelerate env
# 查看环境是否配好
accelerate test
3--测试代码
# 加载库
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from accelerate import Accelerator, DeepSpeedPlugin

# 定义测试网络
class TestNet(nn.Module):
    """Small two-layer MLP (Linear -> ReLU -> Linear) used to exercise the
    Accelerate + DeepSpeed training pipeline.

    Args:
        input_dim: size of each input feature vector.
        output_dim: size of the hidden layer and of the output.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super(TestNet, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.fc2 = nn.Linear(in_features=output_dim, out_features=output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the network output for a batch shaped (batch, input_dim)."""
        x = torch.relu(self.fc1(x))
        # BUG FIX: original read `torch.fc2(x)` — `torch` has no attribute
        # `fc2`, so the first forward pass raised AttributeError.
        x = self.fc2(x)
        return x


if __name__ == "__main__":
    input_dim = 8
    output_dim = 64
    batch_size = 8
    dataset_size = 1000

    # 随机生成数据 (random synthetic data)
    input_data = torch.randn(dataset_size, input_dim)
    targets = torch.randn(dataset_size, output_dim)

    # 创建数据集
    dataset = TensorDataset(input_data, targets)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size)

    # 初始化模型
    model = TestNet(input_dim=input_dim, output_dim=output_dim)

    # 创建 DeepSpeed 配置 — ZeRO stage 2 with gradient clipping at 1.0
    deepspeed = DeepSpeedPlugin(zero_stage=2, gradient_clipping=1.0)
    accelerator = Accelerator(deepspeed_plugin=deepspeed)

    # 创建训练配置
    optimizator = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = nn.MSELoss()

    # 初始化 — accelerator.prepare wraps model/optimizer/dataloader for DeepSpeed
    model, optimizator, dataloader = accelerator.prepare(model, optimizator, dataloader)

    # 训练模型
    for epoch in range(10):
        model.train()
        for batch in dataloader:
            # NOTE: original unpacked into `labels`, shadowing the outer
            # dataset tensor; renamed for clarity (behavior unchanged).
            inputs, batch_targets = batch
            # 清理梯度
            optimizator.zero_grad()
            outputs = model(inputs)
            loss = loss_func(outputs, batch_targets)
            accelerator.backward(loss)  # 核心改动: replaces plain loss.backward()
            optimizator.step()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

    # 保存模型 — sync all ranks before any rank writes the checkpoint
    accelerator.wait_for_everyone()
    accelerator.save(model.state_dict(), "test_model.pth")
4--代码运行
未完待续!