
PyTorch Neural Network Fundamentals

Introduction to the torch.nn Module

torch.nn is the core module for building neural networks in PyTorch. It provides layers, activation functions, loss functions, and other building blocks.

python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Basic neural network layers
linear = nn.Linear(10, 5)  # Linear layer: 10 input features, 5 output features
conv = nn.Conv2d(3, 16, 3)  # Conv layer: 3 input channels, 16 output channels, 3x3 kernel
lstm = nn.LSTM(10, 20, 2)   # LSTM layer: input size 10, hidden size 20, 2 layers
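
To get a feel for these layers, a quick forward pass shows the output shapes they produce; the batch size and sequence length below are arbitrary choices for illustration.

python
# Pass sample tensors through the layers defined above (shapes are illustrative)
x_lin = torch.randn(4, 10)          # [batch, in_features]
print(linear(x_lin).shape)          # torch.Size([4, 5])

x_img = torch.randn(4, 3, 32, 32)   # [batch, channels, height, width]
print(conv(x_img).shape)            # torch.Size([4, 16, 30, 30]) (no padding, so spatial size shrinks)

x_seq = torch.randn(6, 4, 10)       # [seq_len, batch, input_size] (the default batch_first=False layout)
out, (h, c) = lstm(x_seq)
print(out.shape)                    # torch.Size([6, 4, 20])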

Building Your First Neural Network

1. Using nn.Sequential

python
import torch
import torch.nn as nn

# The simplest approach: the Sequential container
model = nn.Sequential(
    nn.Linear(784, 128),    # input layer to hidden layer
    nn.ReLU(),              # activation function
    nn.Linear(128, 64),     # hidden layer
    nn.ReLU(),
    nn.Linear(64, 10)       # output layer
)

# Test the model
x = torch.randn(32, 784)  # batch size 32, 784 features
output = model(x)
print(f"Output shape: {output.shape}")  # [32, 10]

2. Custom nn.Module

python
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Create a model instance
model = MLP(784, 128, 10)
print(model)

# Inspect model parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

Common Neural Network Layers

1. Linear Layers

python
# Fully connected layer
linear = nn.Linear(in_features=100, out_features=50, bias=True)

# Inspect the parameters
print(f"Weight shape: {linear.weight.shape}")  # [50, 100]
print(f"Bias shape: {linear.bias.shape}")      # [50]

# Custom initialization
nn.init.xavier_uniform_(linear.weight)
nn.init.zeros_(linear.bias)
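
As a quick sanity check (not part of the original snippet), a forward pass confirms the expected output shape:

python
# Run a batch through the linear layer defined above
x = torch.randn(8, 100)            # [batch, in_features]
y = linear(x)
print(f"Output shape: {y.shape}")  # torch.Size([8, 50])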

2. Convolutional Layers

python
# 2D convolution
conv2d = nn.Conv2d(
    in_channels=3,      # number of input channels
    out_channels=16,    # number of output channels
    kernel_size=3,      # kernel size
    stride=1,           # stride
    padding=1,          # padding
    bias=True
)

# 1D convolution (for sequence data)
conv1d = nn.Conv1d(in_channels=10, out_channels=20, kernel_size=3)

# Transposed convolution (deconvolution)
conv_transpose = nn.ConvTranspose2d(16, 3, 3, stride=2, padding=1)

# Test the convolution layer
x = torch.randn(32, 3, 64, 64)  # [batch, channels, height, width]
output = conv2d(x)
print(f"Conv output shape: {output.shape}")  # [32, 16, 64, 64]

3. Pooling Layers

python
# Max pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)

# Average pooling
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)

# Adaptive pooling (fixed output size)
adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

# Global average pooling
global_pool = nn.AdaptiveAvgPool2d((1, 1))

# Test pooling
x = torch.randn(32, 16, 64, 64)
pooled = maxpool(x)
print(f"Shape after pooling: {pooled.shape}")  # [32, 16, 32, 32]

4. Recurrent Layers

python
# LSTM layer
lstm = nn.LSTM(
    input_size=100,     # input feature dimension
    hidden_size=128,    # hidden state dimension
    num_layers=2,       # number of layers
    batch_first=True,   # input shape is (batch, seq, feature)
    dropout=0.1,        # dropout probability (applied between layers)
    bidirectional=False # whether the LSTM is bidirectional
)

# GRU layer
gru = nn.GRU(input_size=100, hidden_size=128, num_layers=2, batch_first=True)

# Vanilla RNN
rnn = nn.RNN(input_size=100, hidden_size=128, num_layers=2, batch_first=True)

# Test the LSTM
x = torch.randn(32, 50, 100)  # [batch, seq_len, input_size]
output, (hidden, cell) = lstm(x)
print(f"LSTM output shape: {output.shape}")  # [32, 50, 128]
print(f"Hidden state shape: {hidden.shape}")  # [2, 32, 128]

5. Attention Mechanisms

python
# Multi-head attention
attention = nn.MultiheadAttention(
    embed_dim=512,      # embedding dimension
    num_heads=8,        # number of attention heads
    dropout=0.1,
    batch_first=True
)

# Test attention
query = torch.randn(32, 50, 512)  # [batch, seq_len, embed_dim]
key = torch.randn(32, 50, 512)
value = torch.randn(32, 50, 512)

attn_output, attn_weights = attention(query, key, value)
print(f"Attention output shape: {attn_output.shape}")  # [32, 50, 512]

Activation Functions

python
# Common activation functions
relu = nn.ReLU()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
elu = nn.ELU()
gelu = nn.GELU()
swish = nn.SiLU()  # Swish (SiLU) activation
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(dim=-1)

# Functional interface
x = torch.randn(10)
y1 = F.relu(x)
y2 = F.gelu(x)
y3 = F.softmax(x, dim=0)

print(f"ReLU output: {y1}")
print(f"GELU output: {y2}")
print(f"Softmax output: {y3}")

Regularization Techniques

1. Dropout

python
# Dropout layer
dropout = nn.Dropout(p=0.5)  # each element is zeroed with probability 0.5

# Dropout behaves differently in training and evaluation mode
x = torch.randn(4, 10)

dropout.train()              # training mode: dropout is active
output_train = dropout(x)    # roughly half the elements are zeroed, the rest scaled by 1/(1-p)

dropout.eval()               # evaluation mode: dropout is disabled
output_eval = dropout(x)     # output equals the input

# When dropout is a submodule of a model, model.train() / model.eval() toggle it automatically

2. Normalization Layers

python
# 1D batch normalization (for fully connected layers)
bn1d = nn.BatchNorm1d(num_features=128)

# 2D batch normalization (for convolutional layers)
bn2d = nn.BatchNorm2d(num_features=64)

# Layer normalization
ln = nn.LayerNorm(normalized_shape=128)

# Group normalization
gn = nn.GroupNorm(num_groups=8, num_channels=64)

# Instance normalization
in_norm = nn.InstanceNorm2d(num_features=64)
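
These layers are not exercised above; a minimal sketch (with input shapes chosen here purely for illustration) shows the layout each one expects:

python
# Shape check for the normalization layers defined above
x_fc = torch.randn(32, 128)            # [batch, features]
print(bn1d(x_fc).shape)                # torch.Size([32, 128])
print(ln(x_fc).shape)                  # torch.Size([32, 128])

x_conv = torch.randn(32, 64, 16, 16)   # [batch, channels, height, width]
print(bn2d(x_conv).shape)              # torch.Size([32, 64, 16, 16])
print(gn(x_conv).shape)                # torch.Size([32, 64, 16, 16])
print(in_norm(x_conv).shape)           # torch.Size([32, 64, 16, 16])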

3. Weight Initialization

python
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)

# Apply the initialization recursively to every submodule
model.apply(init_weights)

Advanced Network Structures

1. Residual Connections

python
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        
        # Skip connection
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )
    
    def forward(self, x):
        residual = self.shortcut(x)
        
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += residual  # residual connection
        out = F.relu(out)
        
        return out
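
A quick smoke test (not part of the original listing) confirms that the block handles a change in channels and stride:

python
# Smoke test for the residual block; sizes are illustrative
block = ResidualBlock(in_channels=64, out_channels=128, stride=2)
x = torch.randn(8, 64, 32, 32)
out = block(x)
print(f"Residual block output shape: {out.shape}")  # torch.Size([8, 128, 16, 16])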

2. Self-Attention

python
class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)
        
    def forward(self, x):
        batch_size, seq_len, embed_dim = x.size()
        
        # Compute Q, K, V and split into heads
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Compute the attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(scores, dim=-1)
        
        # Apply the attention weights
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, embed_dim
        )
        
        return self.out(attn_output)
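
A brief usage sketch (the dimensions are illustrative):

python
# Usage of the SelfAttention module defined above
attn = SelfAttention(embed_dim=512, num_heads=8)
x = torch.randn(32, 50, 512)             # [batch, seq_len, embed_dim]
out = attn(x)
print(f"Self-attention output shape: {out.shape}")  # torch.Size([32, 50, 512])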

Model Management

1. Accessing and Modifying Parameters

python
# Access all parameters
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Access the parameters of a specific layer
linear_layer = model.fc1
print(f"Weight: {linear_layer.weight.shape}")
print(f"Bias: {linear_layer.bias.shape}")

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Train only specific layers
for name, param in model.named_parameters():
    if 'fc3' in name:  # train only the last layer
        param.requires_grad = True
    else:
        param.requires_grad = False
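
It helps to confirm that the freeze took effect; a quick check, reusing the parameter-counting idiom from earlier:

python
# Count how many parameters remain trainable after freezing
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable / total parameters: {trainable} / {total}")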

2. Model State Management

python
# Training mode vs. evaluation mode
model.train()  # enables dropout; batch norm uses batch statistics
model.eval()   # disables dropout; batch norm uses running statistics

# Check the current mode
print(f"Is the model in training mode: {model.training}")

# Move to the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Check which device the model is on
print(f"Model device: {next(model.parameters()).device}")

3. Saving and Loading Models

python
# Save the entire model
torch.save(model, 'model.pth')

# Save only the parameters (recommended)
torch.save(model.state_dict(), 'model_params.pth')

# Load the entire model
model = torch.load('model.pth')

# Load the parameters
model = MLP(784, 128, 10)  # build the model structure first
model.load_state_dict(torch.load('model_params.pth'))

# Save the training state
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
    'loss': loss
}
torch.save(checkpoint, 'checkpoint.pth')
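
To resume training from such a checkpoint, the dictionary can be restored symmetrically (a sketch; optimizer is assumed to be the same optimizer instance whose state was saved):

python
# Sketch: resuming training from the checkpoint saved above
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1
last_loss = checkpoint['loss']
print(f"Resuming from epoch {start_epoch}, last loss {last_loss}")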

Practical Examples

1. Image Classification Network

python
class ImageClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super(ImageClassifier, self).__init__()
        self.features = nn.Sequential(
            # First convolutional block
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            
            # Second convolutional block
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            
            # Third convolutional block
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )
        
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# Test the model
model = ImageClassifier(num_classes=10)
x = torch.randn(32, 3, 32, 32)  # CIFAR-10-sized images
output = model(x)
print(f"Classification output shape: {output.shape}")  # [32, 10]

2. Sequence-to-Sequence Model

python
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(Seq2Seq, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Encoder
        self.encoder = nn.LSTM(input_size, hidden_size, num_layers, 
                              batch_first=True, dropout=0.1)
        
        # Decoder
        self.decoder = nn.LSTM(output_size, hidden_size, num_layers, 
                              batch_first=True, dropout=0.1)
        
        # Output layer
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, src, tgt):
        # Encode the source sequence
        encoder_output, (hidden, cell) = self.encoder(src)
        
        # Decode, initialized with the encoder's final state
        decoder_output, _ = self.decoder(tgt, (hidden, cell))
        
        # Project to the output size
        output = self.out(decoder_output)
        
        return output
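
A minimal shape check (the sizes here are arbitrary; in practice tgt comes from teacher forcing or step-by-step decoding):

python
# Smoke test for the Seq2Seq model above; dimensions are illustrative
seq2seq = Seq2Seq(input_size=16, hidden_size=32, output_size=8)
src = torch.randn(4, 12, 16)   # [batch, src_len, input_size]
tgt = torch.randn(4, 10, 8)    # [batch, tgt_len, output_size]
out = seq2seq(src, tgt)
print(f"Seq2Seq output shape: {out.shape}")  # torch.Size([4, 10, 8])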

Debugging Tips

1. Inspecting the Model Structure

python
from torchsummary import summary

# Print a model summary
summary(model, input_size=(3, 32, 32))

# Or use torchinfo
from torchinfo import summary
summary(model, input_size=(32, 3, 32, 32))

2. Checking Gradients

python
# Check whether the gradients look reasonable
def check_gradients(model):
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm()
            print(f"{name}: gradient norm = {grad_norm:.6f}")
        else:
            print(f"{name}: no gradient")

3. Visualizing the Network

python
import torch.nn as nn
import matplotlib.pyplot as plt

def visualize_weights(model, layer_name):
    for name, module in model.named_modules():
        if name == layer_name and isinstance(module, nn.Conv2d):
            weights = module.weight.data
            # Visualize the first input channel of the first filter
            plt.imshow(weights[0, 0].cpu(), cmap='gray')
            plt.title(f'{layer_name} - First Filter')
            plt.show()
            break

Summary

Neural networks are at the heart of deep learning, and PyTorch provides a rich toolkit for building all kinds of network structures:

  1. Basic components: master the various layers, activation functions, and regularization techniques
  2. Model building: learn to use Sequential containers and custom Modules
  3. Advanced structures: understand modern techniques such as residual connections and attention mechanisms
  4. Model management: master parameter access, state management, and saving/loading
  5. Debugging tips: learn to inspect model structure, gradients, weights, and more

This knowledge lays a solid foundation for the model training and optimization topics that follow!
