PyTorch 神经网络基础
torch.nn模块简介
torch.nn是PyTorch中构建神经网络的核心模块,提供了各种层、激活函数、损失函数等组件。
python
import torch
import torch.nn as nn
import torch.nn.functional as F
# 基本的神经网络层
linear = nn.Linear(10, 5) # 线性层:输入10维,输出5维
conv = nn.Conv2d(3, 16, 3) # 卷积层:3通道输入,16通道输出,3x3卷积核
lstm = nn.LSTM(10, 20, 2)  # LSTM层:输入10维,隐藏20维,2层
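下面用随机张量做一个简单的前向传播示例(仅作演示,批量大小和张量尺寸均为假设值),验证上面几种层的输入输出形状:
python
x_fc = torch.randn(4, 10)              # 批量4,特征10
print(linear(x_fc).shape)              # torch.Size([4, 5])
x_img = torch.randn(4, 3, 32, 32)      # 批量4,3通道,32x32图像
print(conv(x_img).shape)               # torch.Size([4, 16, 30, 30]),无padding时空间尺寸缩小
x_seq = torch.randn(20, 4, 10)         # 序列长20,批量4,特征10(默认seq_len在前)
out, (h, c) = lstm(x_seq)
print(out.shape)                       # torch.Size([20, 4, 20])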
构建第一个神经网络
1. 使用nn.Sequential
python
import torch
import torch.nn as nn
# 最简单的方式:Sequential容器
model = nn.Sequential(
    nn.Linear(784, 128),   # 输入层到隐藏层
    nn.ReLU(),             # 激活函数
    nn.Linear(128, 64),    # 隐藏层
    nn.ReLU(),
    nn.Linear(64, 10)      # 输出层
)
# 测试模型
x = torch.randn(32, 784) # 批量大小32,特征784
output = model(x)
print(f"输出形状: {output.shape}") # [32, 10]2. 自定义nn.Module
python
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
# 创建模型实例
model = MLP(784, 128, 10)
print(model)
# 查看模型参数
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"总参数数量: {total_params}")
print(f"可训练参数数量: {trainable_params}")常用神经网络层
1. 线性层
python
# 全连接层
linear = nn.Linear(in_features=100, out_features=50, bias=True)
# 查看参数
print(f"权重形状: {linear.weight.shape}") # [50, 100]
print(f"偏置形状: {linear.bias.shape}") # [50]
# 自定义初始化
nn.init.xavier_uniform_(linear.weight)
nn.init.zeros_(linear.bias)
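可以用一个随机张量验证线性层的输入输出形状(示例中的批量大小为假设值):
python
x = torch.randn(8, 100)                # 批量8,输入特征100
y = linear(x)
print(f"线性层输出形状: {y.shape}")    # [8, 50]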
2. 卷积层
python
# 2D卷积
conv2d = nn.Conv2d(
    in_channels=3,      # 输入通道数
    out_channels=16,    # 输出通道数
    kernel_size=3,      # 卷积核大小
    stride=1,           # 步长
    padding=1,          # 填充
    bias=True
)
# 1D卷积(用于序列数据)
conv1d = nn.Conv1d(in_channels=10, out_channels=20, kernel_size=3)
# 转置卷积(反卷积)
conv_transpose = nn.ConvTranspose2d(16, 3, 3, stride=2, padding=1)
# 测试卷积层
x = torch.randn(32, 3, 64, 64) # [batch, channels, height, width]
output = conv2d(x)
print(f"卷积输出形状: {output.shape}") # [32, 16, 64, 64]3. 池化层
python
# 最大池化
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
# 平均池化
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
# 自适应池化(输出固定尺寸)
adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))
# 全局平均池化
global_pool = nn.AdaptiveAvgPool2d((1, 1))
# 测试池化
x = torch.randn(32, 16, 64, 64)
pooled = maxpool(x)
print(f"池化后形状: {pooled.shape}") # [32, 16, 32, 32]4. 循环层
python
# LSTM层
lstm = nn.LSTM(
    input_size=100,       # 输入特征维度
    hidden_size=128,      # 隐藏状态维度
    num_layers=2,         # 层数
    batch_first=True,     # 输入形状为(batch, seq, feature)
    dropout=0.1,          # dropout概率
    bidirectional=False   # 是否双向
)
# GRU层
gru = nn.GRU(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
# 简单RNN
rnn = nn.RNN(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
# 测试LSTM
x = torch.randn(32, 50, 100) # [batch, seq_len, input_size]
output, (hidden, cell) = lstm(x)
print(f"LSTM输出形状: {output.shape}") # [32, 50, 128]
print(f"隐藏状态形状: {hidden.shape}") # [2, 32, 128]5. 注意力机制
python
# 多头注意力
attention = nn.MultiheadAttention(
    embed_dim=512,    # 嵌入维度
    num_heads=8,      # 注意力头数
    dropout=0.1,
    batch_first=True
)
# 测试注意力
query = torch.randn(32, 50, 512) # [batch, seq_len, embed_dim]
key = torch.randn(32, 50, 512)
value = torch.randn(32, 50, 512)
attn_output, attn_weights = attention(query, key, value)
print(f"注意力输出形状: {attn_output.shape}") # [32, 50, 512]激活函数
python
# 常用激活函数
relu = nn.ReLU()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
elu = nn.ELU()
gelu = nn.GELU()
swish = nn.SiLU() # Swish激活函数
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(dim=-1)
# 函数式接口
x = torch.randn(10)
y1 = F.relu(x)
y2 = F.gelu(x)
y3 = F.softmax(x, dim=0)
print(f"ReLU输出: {y1}")
print(f"GELU输出: {y2}")
print(f"Softmax输出: {y3}")正则化技术
1. Dropout
python
# Dropout层
dropout = nn.Dropout(p=0.5) # 50%的神经元被随机置零
# Dropout在训练和评估模式下行为不同
x = torch.randn(4, 10)
dropout.train()               # 训练模式:随机置零,并按1/(1-p)缩放
output_train = dropout(x)
dropout.eval()                # 评估模式:禁用dropout,输出等于输入
output_eval = dropout(x)
2. 批量归一化
python
# 1D批量归一化(用于全连接层)
bn1d = nn.BatchNorm1d(num_features=128)
# 2D批量归一化(用于卷积层)
bn2d = nn.BatchNorm2d(num_features=64)
# 层归一化
ln = nn.LayerNorm(normalized_shape=128)
# 组归一化
gn = nn.GroupNorm(num_groups=8, num_channels=64)
# 实例归一化
in_norm = nn.InstanceNorm2d(num_features=64)
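下面用随机张量演示几种归一化层的典型输入形状(批量大小等均为假设值),归一化不会改变张量形状:
python
x_conv = torch.randn(16, 64, 32, 32)   # [batch, channels, H, W]
print(bn2d(x_conv).shape)              # [16, 64, 32, 32],按通道归一化
print(gn(x_conv).shape)                # [16, 64, 32, 32],每8个通道为一组
x_fc = torch.randn(16, 128)            # [batch, features]
print(bn1d(x_fc).shape)                # [16, 128]
print(ln(x_fc).shape)                  # [16, 128],对最后一维做归一化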
3. 权重初始化
python
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)
# 应用初始化
model.apply(init_weights)
复杂网络结构
1. 残差连接
python
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 跳跃连接
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = self.shortcut(x)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += residual  # 残差连接
        out = F.relu(out)
        return out
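可以用一个小例子检查残差块在下采样(stride=2)时的输出形状(张量尺寸为假设值):
python
block = ResidualBlock(in_channels=64, out_channels=128, stride=2)
x = torch.randn(8, 64, 32, 32)
out = block(x)
print(f"残差块输出形状: {out.shape}")  # [8, 128, 16, 16]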
2. 注意力机制
python
class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.size()
        # 计算Q, K, V
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # 计算注意力分数
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(scores, dim=-1)
        # 应用注意力
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, embed_dim
        )
        return self.out(attn_output)
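同样可以做一个形状检查(嵌入维度、头数和序列长度均为假设值):
python
self_attn = SelfAttention(embed_dim=256, num_heads=8)
x = torch.randn(4, 20, 256)              # [batch, seq_len, embed_dim]
out = self_attn(x)
print(f"自注意力输出形状: {out.shape}")  # [4, 20, 256]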
模型管理
1. 参数访问和修改
python
# 访问所有参数
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")
# 访问特定层的参数
linear_layer = model.fc1
print(f"权重: {linear_layer.weight.shape}")
print(f"偏置: {linear_layer.bias.shape}")
# 冻结参数
for param in model.parameters():
    param.requires_grad = False
# 只训练特定层
for name, param in model.named_parameters():
    if 'fc3' in name:  # 只训练最后一层
        param.requires_grad = True
    else:
        param.requires_grad = False
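冻结参数后,构建优化器时通常只传入仍需训练的参数。下面是一个示意(优化器类型和学习率为假设值):
python
import torch.optim as optim
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),  # 只优化未冻结的参数
    lr=1e-3
)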
2. 模型状态管理
python
# 训练模式和评估模式
model.train()  # 训练模式:启用dropout,batch norm使用当前批次统计量
model.eval()   # 评估模式:禁用dropout,batch norm使用累积的运行统计量
# 检查模式
print(f"模型是否在训练模式: {model.training}")
# 移动到GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# 检查模型设备
print(f"模型设备: {next(model.parameters()).device}")3. 模型保存和加载
python
# 保存整个模型
torch.save(model, 'model.pth')
# 只保存参数(推荐)
torch.save(model.state_dict(), 'model_params.pth')
# 加载模型
model = torch.load('model.pth')  # 加载整个模型(较新版本的PyTorch可能需要传入weights_only=False)
# 加载参数
model = MLP(784, 128, 10) # 先创建模型结构
model.load_state_dict(torch.load('model_params.pth'))
# 保存训练状态
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
    'loss': loss
}
torch.save(checkpoint, 'checkpoint.pth')
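恢复训练时按相同的键读取checkpoint即可,下面是一个示意(假设模型和优化器已经按原结构创建好):
python
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1   # 从下一个epoch继续训练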
实际应用示例
1. 图像分类网络
python
class ImageClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super(ImageClassifier, self).__init__()
        self.features = nn.Sequential(
            # 第一个卷积块
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            # 第二个卷积块
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            # 第三个卷积块
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
# 测试模型
model = ImageClassifier(num_classes=10)
x = torch.randn(32, 3, 32, 32) # CIFAR-10尺寸
output = model(x)
print(f"分类输出形状: {output.shape}") # [32, 10]2. 序列到序列模型
python
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(Seq2Seq, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # 编码器
        self.encoder = nn.LSTM(input_size, hidden_size, num_layers,
                               batch_first=True, dropout=0.1)
        # 解码器
        self.decoder = nn.LSTM(output_size, hidden_size, num_layers,
                               batch_first=True, dropout=0.1)
        # 输出层
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, src, tgt):
        # 编码
        encoder_output, (hidden, cell) = self.encoder(src)
        # 解码
        decoder_output, _ = self.decoder(tgt, (hidden, cell))
        # 输出
        output = self.out(decoder_output)
        return output
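用随机张量验证一下输入输出形状(各维度均为假设值;解码端以真实目标序列作为输入,即teacher forcing):
python
seq2seq = Seq2Seq(input_size=32, hidden_size=64, output_size=16)
src = torch.randn(8, 20, 32)               # [batch, src_len, input_size]
tgt = torch.randn(8, 15, 16)               # [batch, tgt_len, output_size]
output = seq2seq(src, tgt)
print(f"Seq2Seq输出形状: {output.shape}")  # [8, 15, 16]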
调试技巧
1. 检查模型结构
python
# torchsummary和torchinfo都是第三方库,需要单独安装(pip install torchsummary / torchinfo)
from torchsummary import summary
# 打印模型摘要(torchsummary的input_size不含batch维度)
summary(model, input_size=(3, 32, 32))
# 或者使用torchinfo(input_size包含batch维度)
from torchinfo import summary
summary(model, input_size=(32, 3, 32, 32))
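如果不想引入额外依赖,也可以手动遍历子模块并统计参数量,下面是一个简单的示意:
python
def simple_summary(model):
    # 遍历模型的直接子模块,打印类型和参数量
    for name, module in model.named_children():
        num_params = sum(p.numel() for p in module.parameters())
        print(f"{name}: {module.__class__.__name__}, 参数量 {num_params}")

simple_summary(model)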
2. 梯度检查
python
# 检查梯度是否正常
def check_gradients(model):
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm()
            print(f"{name}: 梯度范数 = {grad_norm:.6f}")
        else:
print(f"{name}: 无梯度")3. 可视化网络
3. 可视化网络
python
import torch.nn as nn
import matplotlib.pyplot as plt
def visualize_weights(model, layer_name):
    for name, module in model.named_modules():
        if name == layer_name and isinstance(module, nn.Conv2d):
            weights = module.weight.data
            # 可视化第一个卷积核
            plt.imshow(weights[0, 0].cpu(), cmap='gray')
            plt.title(f'{layer_name} - First Filter')
            plt.show()
            break
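调用时传入卷积层在named_modules中的名称即可。例如上面ImageClassifier中第一个卷积层的名称是'features.0'(具体名称取决于模型结构):
python
visualize_weights(model, 'features.0')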
总结
神经网络是深度学习的核心,PyTorch提供了丰富的工具来构建各种网络结构:
- 基础组件:掌握各种层、激活函数、正则化技术
- 模型构建:学会使用Sequential和自定义Module
- 高级结构:理解残差连接、注意力机制等现代技术
- 模型管理:掌握参数访问、状态管理、保存加载
- 调试技巧:学会检查模型结构、梯度、权重等
这些知识将为后续的模型训练和优化打下坚实基础!