PyTorch 神经网络基础
torch.nn模块简介
torch.nn是PyTorch中构建神经网络的核心模块,提供了各种层、激活函数、损失函数等组件。
python
import torch
import torch.nn as nn
import torch.nn.functional as F
# 基本的神经网络层
linear = nn.Linear(10, 5) # 线性层:输入10维,输出5维
conv = nn.Conv2d(3, 16, 3) # 卷积层:3通道输入,16通道输出,3x3卷积核
lstm = nn.LSTM(10, 20, 2)  # LSTM层:输入10维,隐藏20维,2层
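下面用随机张量做一个简单的前向传播示例(仅作演示,批量大小和张量尺寸均为假设值),验证上面几种层的输入输出形状:
python
x_fc = torch.randn(4, 10)              # 批量4,特征10
print(linear(x_fc).shape)              # torch.Size([4, 5])
x_img = torch.randn(4, 3, 32, 32)      # 批量4,3通道,32x32图像
print(conv(x_img).shape)               # torch.Size([4, 16, 30, 30]),无padding时空间尺寸缩小
x_seq = torch.randn(20, 4, 10)         # 序列长20,批量4,特征10(默认seq_len在前)
out, (h, c) = lstm(x_seq)
print(out.shape)                       # torch.Size([20, 4, 20])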
构建第一个神经网络
1. 使用nn.Sequential
python
import torch
import torch.nn as nn
# 最简单的方式:Sequential容器
model = nn.Sequential(
    nn.Linear(784, 128),   # 输入层到隐藏层
    nn.ReLU(),             # 激活函数
    nn.Linear(128, 64),    # 隐藏层
    nn.ReLU(),
    nn.Linear(64, 10)      # 输出层
)
# 测试模型
x = torch.randn(32, 784) # 批量大小32,特征784
output = model(x)
print(f"输出形状: {output.shape}") # [32, 10]2. 自定义nn.Module
python
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
# 创建模型实例
model = MLP(784, 128, 10)
print(model)
# 查看模型参数
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"总参数数量: {total_params}")
print(f"可训练参数数量: {trainable_params}")常用神经网络层
1. 线性层
python
# 全连接层
linear = nn.Linear(in_features=100, out_features=50, bias=True)
# 查看参数
print(f"权重形状: {linear.weight.shape}") # [50, 100]
print(f"偏置形状: {linear.bias.shape}") # [50]
# 自定义初始化
nn.init.xavier_uniform_(linear.weight)
nn.init.zeros_(linear.bias)
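可以用一个随机张量验证线性层的输入输出形状(示例中的批量大小为假设值):
python
x = torch.randn(8, 100)                # 批量8,输入特征100
y = linear(x)
print(f"线性层输出形状: {y.shape}")    # [8, 50]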
2. 卷积层
python
# 2D卷积
conv2d = nn.Conv2d(
    in_channels=3,      # 输入通道数
    out_channels=16,    # 输出通道数
    kernel_size=3,      # 卷积核大小
    stride=1,           # 步长
    padding=1,          # 填充
    bias=True
)
# 1D卷积(用于序列数据)
conv1d = nn.Conv1d(in_channels=10, out_channels=20, kernel_size=3)
# 转置卷积(反卷积)
conv_transpose = nn.ConvTranspose2d(16, 3, 3, stride=2, padding=1)
# 测试卷积层
x = torch.randn(32, 3, 64, 64) # [batch, channels, height, width]
output = conv2d(x)
print(f"卷积输出形状: {output.shape}") # [32, 16, 64, 64]3. 池化层
python
# 最大池化
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
# 平均池化
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
# 自适应池化(输出固定尺寸)
adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))
# 全局平均池化
global_pool = nn.AdaptiveAvgPool2d((1, 1))
# 测试池化
x = torch.randn(32, 16, 64, 64)
pooled = maxpool(x)
print(f"池化后形状: {pooled.shape}") # [32, 16, 32, 32]4. 循环层
python
# LSTM层
lstm = nn.LSTM(
    input_size=100,       # 输入特征维度
    hidden_size=128,      # 隐藏状态维度
    num_layers=2,         # 层数
    batch_first=True,     # 输入形状为(batch, seq, feature)
    dropout=0.1,          # dropout概率
    bidirectional=False   # 是否双向
)
# GRU层
gru = nn.GRU(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
# 简单RNN
rnn = nn.RNN(input_size=100, hidden_size=128, num_layers=2, batch_first=True)
# 测试LSTM
x = torch.randn(32, 50, 100) # [batch, seq_len, input_size]
output, (hidden, cell) = lstm(x)
print(f"LSTM输出形状: {output.shape}") # [32, 50, 128]
print(f"隐藏状态形状: {hidden.shape}") # [2, 32, 128]5. 注意力机制
python
# 多头注意力
attention = nn.MultiheadAttention(
    embed_dim=512,    # 嵌入维度
    num_heads=8,      # 注意力头数
    dropout=0.1,
    batch_first=True
)
# 测试注意力
query = torch.randn(32, 50, 512) # [batch, seq_len, embed_dim]
key = torch.randn(32, 50, 512)
value = torch.randn(32, 50, 512)
attn_output, attn_weights = attention(query, key, value)
print(f"注意力输出形状: {attn_output.shape}") # [32, 50, 512]激活函数
python
# 常用激活函数
relu = nn.ReLU()
leaky_relu = nn.LeakyReLU(negative_slope=0.01)
elu = nn.ELU()
gelu = nn.GELU()
swish = nn.SiLU() # Swish激活函数
tanh = nn.Tanh()
sigmoid = nn.Sigmoid()
softmax = nn.Softmax(dim=-1)
# 函数式接口
x = torch.randn(10)
y1 = F.relu(x)
y2 = F.gelu(x)
y3 = F.softmax(x, dim=0)
print(f"ReLU输出: {y1}")
print(f"GELU输出: {y2}")
print(f"Softmax输出: {y3}")正则化技术
1. Dropout
python
# Dropout层
dropout = nn.Dropout(p=0.5) # 50%的神经元被随机置零
# Dropout在训练和评估模式下行为不同
x = torch.randn(4, 10)
dropout.train()               # 训练模式:随机置零,并按1/(1-p)缩放
output_train = dropout(x)
dropout.eval()                # 评估模式:禁用dropout,输出等于输入
output_eval = dropout(x)
2. 批量归一化
python
# 1D批量归一化(用于全连接层)
bn1d = nn.BatchNorm1d(num_features=128)
# 2D批量归一化(用于卷积层)
bn2d = nn.BatchNorm2d(num_features=64)
# 层归一化
ln = nn.LayerNorm(normalized_shape=128)
# 组归一化
gn = nn.GroupNorm(num_groups=8, num_channels=64)
# 实例归一化
in_norm = nn.InstanceNorm2d(num_features=64)
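下面用随机张量演示几种归一化层的典型输入形状(批量大小等均为假设值),归一化不会改变张量形状:
python
x_conv = torch.randn(16, 64, 32, 32)   # [batch, channels, H, W]
print(bn2d(x_conv).shape)              # [16, 64, 32, 32],按通道归一化
print(gn(x_conv).shape)                # [16, 64, 32, 32],每8个通道为一组
x_fc = torch.randn(16, 128)            # [batch, features]
print(bn1d(x_fc).shape)                # [16, 128]
print(ln(x_fc).shape)                  # [16, 128],对最后一维做归一化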
3. 权重初始化
python
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.BatchNorm2d):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)
# 应用初始化
model.apply(init_weights)
复杂网络结构
1. 残差连接
python
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 跳跃连接
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = self.shortcut(x)
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += residual  # 残差连接
        out = F.relu(out)
        return out
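可以用一个小例子检查残差块在下采样(stride=2)时的输出形状(张量尺寸为假设值):
python
block = ResidualBlock(in_channels=64, out_channels=128, stride=2)
x = torch.randn(8, 64, 32, 32)
out = block(x)
print(f"残差块输出形状: {out.shape}")  # [8, 128, 16, 16]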
2. 注意力机制
python
class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        batch_size, seq_len, embed_dim = x.size()
        # 计算Q, K, V
        Q = self.query(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.key(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.value(x).view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        # 计算注意力分数
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(scores, dim=-1)
        # 应用注意力
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch_size, seq_len, embed_dim
        )
        return self.out(attn_output)
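同样可以做一个形状检查(嵌入维度、头数和序列长度均为假设值):
python
self_attn = SelfAttention(embed_dim=256, num_heads=8)
x = torch.randn(4, 20, 256)              # [batch, seq_len, embed_dim]
out = self_attn(x)
print(f"自注意力输出形状: {out.shape}")  # [4, 20, 256]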
模型管理
1. 参数访问和修改
python
# 访问所有参数
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")
# 访问特定层的参数
linear_layer = model.fc1
print(f"权重: {linear_layer.weight.shape}")
print(f"偏置: {linear_layer.bias.shape}")
# 冻结参数
for param in model.parameters():
    param.requires_grad = False
# 只训练特定层
for name, param in model.named_parameters():
    if 'fc3' in name:  # 只训练最后一层
        param.requires_grad = True
    else:
        param.requires_grad = False
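冻结参数后,构建优化器时通常只传入仍需训练的参数。下面是一个示意(优化器类型和学习率为假设值):
python
import torch.optim as optim
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),  # 只优化未冻结的参数
    lr=1e-3
)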
2. 模型状态管理
python
# 训练模式和评估模式
model.train()  # 训练模式:启用dropout,batch norm使用当前批次统计量
model.eval()   # 评估模式:禁用dropout,batch norm使用累积的运行统计量
# 检查模式
print(f"模型是否在训练模式: {model.training}")
# 移动到GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# 检查模型设备
print(f"模型设备: {next(model.parameters()).device}")3. 模型保存和加载
python
# 保存整个模型
torch.save(model, 'model.pth')
# 只保存参数(推荐)
torch.save(model.state_dict(), 'model_params.pth')
# 加载模型
model = torch.load('model.pth')  # 加载整个模型(较新版本的PyTorch可能需要传入weights_only=False)
# 加载参数
model = MLP(784, 128, 10) # 先创建模型结构
model.load_state_dict(torch.load('model_params.pth'))
# 保存训练状态
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,
    'loss': loss
}
torch.save(checkpoint, 'checkpoint.pth')
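恢复训练时按相同的键读取checkpoint即可,下面是一个示意(假设模型和优化器已经按原结构创建好):
python
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1   # 从下一个epoch继续训练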
实际应用示例
1. 图像分类网络
python
class ImageClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super(ImageClassifier, self).__init__()
        self.features = nn.Sequential(
            # 第一个卷积块
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            # 第二个卷积块
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            # 第三个卷积块
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
        )
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
# 测试模型
model = ImageClassifier(num_classes=10)
x = torch.randn(32, 3, 32, 32) # CIFAR-10尺寸
output = model(x)
print(f"分类输出形状: {output.shape}") # [32, 10]2. 序列到序列模型
python
class Seq2Seq(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(Seq2Seq, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # 编码器
        self.encoder = nn.LSTM(input_size, hidden_size, num_layers,
                               batch_first=True, dropout=0.1)
        # 解码器
        self.decoder = nn.LSTM(output_size, hidden_size, num_layers,
                               batch_first=True, dropout=0.1)
        # 输出层
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, src, tgt):
        # 编码
        encoder_output, (hidden, cell) = self.encoder(src)
        # 解码
        decoder_output, _ = self.decoder(tgt, (hidden, cell))
        # 输出
        output = self.out(decoder_output)
        return output
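用随机张量验证一下输入输出形状(各维度均为假设值;解码端以真实目标序列作为输入,即teacher forcing):
python
seq2seq = Seq2Seq(input_size=32, hidden_size=64, output_size=16)
src = torch.randn(8, 20, 32)               # [batch, src_len, input_size]
tgt = torch.randn(8, 15, 16)               # [batch, tgt_len, output_size]
output = seq2seq(src, tgt)
print(f"Seq2Seq输出形状: {output.shape}")  # [8, 15, 16]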
调试技巧
1. 检查模型结构
python
# torchsummary和torchinfo都是第三方库,需要单独安装(pip install torchsummary / torchinfo)
from torchsummary import summary
# 打印模型摘要(torchsummary的input_size不含batch维度)
summary(model, input_size=(3, 32, 32))
# 或者使用torchinfo(input_size包含batch维度)
from torchinfo import summary
summary(model, input_size=(32, 3, 32, 32))
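如果不想引入额外依赖,也可以手动遍历子模块并统计参数量,下面是一个简单的示意:
python
def simple_summary(model):
    # 遍历模型的直接子模块,打印类型和参数量
    for name, module in model.named_children():
        num_params = sum(p.numel() for p in module.parameters())
        print(f"{name}: {module.__class__.__name__}, 参数量 {num_params}")

simple_summary(model)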
2. 梯度检查
python
# 检查梯度是否正常
def check_gradients(model):
    for name, param in model.named_parameters():
        if param.grad is not None:
            grad_norm = param.grad.norm()
            print(f"{name}: 梯度范数 = {grad_norm:.6f}")
        else:
print(f"{name}: 无梯度")3. 可视化网络
3. 可视化网络
python
import torch.nn as nn
import matplotlib.pyplot as plt
def visualize_weights(model, layer_name):
    for name, module in model.named_modules():
        if name == layer_name and isinstance(module, nn.Conv2d):
            weights = module.weight.data
            # 可视化第一个卷积核
            plt.imshow(weights[0, 0].cpu(), cmap='gray')
            plt.title(f'{layer_name} - First Filter')
            plt.show()
            break
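调用时传入卷积层在named_modules中的名称即可。例如上面ImageClassifier中第一个卷积层的名称是'features.0'(具体名称取决于模型结构):
python
visualize_weights(model, 'features.0')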
总结
神经网络是深度学习的核心,PyTorch提供了丰富的工具来构建各种网络结构:
- 基础组件:掌握各种层、激活函数、正则化技术
- 模型构建:学会使用Sequential和自定义Module
- 高级结构:理解残差连接、注意力机制等现代技术
- 模型管理:掌握参数访问、状态管理、保存加载
- 调试技巧:学会检查模型结构、梯度、权重等
这些知识将为后续的模型训练和优化打下坚实基础!