PyTorch 卷积神经网络
卷积神经网络简介
卷积神经网络(Convolutional Neural Network, CNN)是深度学习中最重要的架构之一,特别适用于图像处理任务。CNN通过卷积层、池化层和全连接层的组合,能够自动学习图像的层次化特征。
python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
# Basic CNN building blocks: one convolution, one max-pooling and one
# fully connected layer.
conv_layer = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
pool_layer = nn.MaxPool2d(kernel_size=2, stride=2)
fc_layer = nn.Linear(in_features=1024, out_features=10)

# Next section: Convolution layers in detail
1. 基本卷积操作
python
# 2D convolution with every constructor argument spelled out
conv2d = nn.Conv2d(
    in_channels=3,    # number of input channels
    out_channels=64,  # number of output channels (number of filters)
    kernel_size=3,    # kernel size
    stride=1,         # stride
    padding=1,        # padding
    dilation=1,       # dilation
    groups=1,         # grouped convolution
    bias=True,        # whether to use a bias term
)
# 1D convolution (for sequence data)
conv1d = nn.Conv1d(in_channels=100, out_channels=128, kernel_size=3)
# 3D convolution (for video data)
conv3d = nn.Conv3d(in_channels=3, out_channels=64, kernel_size=3)
# Transposed convolution ("deconvolution")
conv_transpose = nn.ConvTranspose2d(64, 3, kernel_size=3, stride=2, padding=1)

# Next section: 2. Convolution parameter calculation
python
def conv_output_size(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """Return the output length of a convolution along one spatial dimension.

    Args:
        input_size: Length of the input along the dimension.
        kernel_size: Kernel length along the dimension.
        stride: Step between consecutive kernel positions.
        padding: Padding added to *each* side of the input.
        dilation: Spacing between kernel elements.

    Returns:
        The integer output length (floor division by ``stride``).
    """
    # A dilated kernel spans dilation * (kernel_size - 1) + 1 input positions.
    effective_kernel = dilation * (kernel_size - 1) + 1
    return (input_size + 2 * padding - effective_kernel) // stride + 1
# Worked example: a 3x3 kernel with stride 1 and padding 1 on a 32x32 input
input_h, input_w = 32, 32
kernel_size, stride, padding = 3, 1, 1
output_h, output_w = (
    conv_output_size(size, kernel_size, stride, padding)
    for size in (input_h, input_w)
)
print(f"输出尺寸: {output_h} x {output_w}")  # 32 x 32

# Next section: 3. Different types of convolution
python
# Depthwise separable convolution
class DepthwiseSeparableConv(nn.Module):
    """Depthwise convolution followed by a 1x1 pointwise convolution.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of channels produced by the pointwise step.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        padding: Padding of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # Depthwise step: groups == in_channels, so each input channel is
        # convolved with its own filter.
        self.depthwise = nn.Conv2d(
            in_channels, in_channels, kernel_size, stride, padding, groups=in_channels
        )
        # Pointwise step: a 1x1 convolution that mixes channels.
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise convolution."""
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x
# Dilated (atrous) convolution; padding=2 with dilation=2 keeps the spatial size
dilated_conv = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)
# Grouped convolution: the 64 channels are split into 4 independent groups
group_conv = nn.Conv2d(64, 64, kernel_size=3, padding=1, groups=4)

# Next section: Pooling layers
1. 常用池化操作
python
# Max pooling over 2x2 windows with stride 2
max_pool = nn.MaxPool2d(2, stride=2)
# Average pooling with the same window and stride
avg_pool = nn.AvgPool2d(2, stride=2)
# Adaptive pooling: the output size is fixed, whatever the input size
adaptive_avg_pool = nn.AdaptiveAvgPool2d(output_size=(7, 7))
adaptive_max_pool = nn.AdaptiveMaxPool2d(output_size=(1, 1))  # global pooling
# Try max pooling on a random batch and compare the shapes
x = torch.randn(1, 64, 32, 32)
pooled = max_pool(x)
print(f"池化前: {x.shape}")  # shape [1, 64, 32, 32]
print(f"池化后: {pooled.shape}")  # shape [1, 64, 16, 16]

# Next section: 2. Custom pooling
python
class StochasticPool2d(nn.Module):
    """Stochastic pooling over 2D feature maps.

    In training mode one element is sampled from every pooling window with
    probability proportional to its (non-negative) activation. In evaluation
    mode the module applies plain average pooling.

    Args:
        kernel_size: Pooling window size (int or ``(h, w)`` pair).
        stride: Window stride; defaults to ``kernel_size`` when omitted.
    """

    def __init__(self, kernel_size, stride=None):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size

    @staticmethod
    def _pair(value):
        """Expand an int to an ``(h, w)`` tuple; pass pairs through."""
        return (value, value) if isinstance(value, int) else tuple(value)

    def forward(self, x):
        """Pool ``x``; indexing below requires a 4D ``(N, C, H, W)`` tensor."""
        if not self.training:
            # Evaluation: average pooling
            return F.avg_pool2d(x, self.kernel_size, self.stride)

        k_h, k_w = self._pair(self.kernel_size)
        s_h, s_w = self._pair(self.stride)
        # Sliding windows: (N, C, out_h, out_w, k_h, k_w), then flatten each
        # window so sampling can index into it.
        windows = x.unfold(2, k_h, s_h).unfold(3, k_w, s_w)
        out_shape = windows.shape[:4]
        windows = windows.reshape(*out_shape, k_h * k_w)

        # Sampling is not differentiable; gradients flow only through the
        # gather of the selected elements.
        with torch.no_grad():
            # Negative activations get zero probability.
            weights = windows.clamp(min=0)
            totals = weights.sum(dim=-1, keepdim=True)
            probs = torch.where(
                totals > 0,
                weights / totals.clamp(min=torch.finfo(x.dtype).tiny),
                # Windows with no positive activation are sampled uniformly.
                torch.full_like(weights, 1.0 / (k_h * k_w)),
            )
            picks = torch.multinomial(probs.reshape(-1, k_h * k_w), num_samples=1)
        return windows.gather(-1, picks.view(*out_shape, 1)).squeeze(-1)

# Next section: Classic CNN architectures
1. LeNet-5
python
class LeNet5(nn.Module):
    """LeNet-5 style classifier.

    The first classifier layer expects ``16 * 5 * 5`` features, which is what
    the feature extractor yields for single-channel 32x32 inputs
    (32 -> 28 -> 14 -> 10 -> 5).

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(16 * 5 * 5, 120),
            nn.Tanh(),
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

# Next section: 2. AlexNet
python
class AlexNet(nn.Module):
    """AlexNet-style image classifier for 3-channel inputs.

    The adaptive average pool forces a 6x6 feature map, so the classifier
    always receives ``256 * 6 * 6`` features.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

# Next section: 3. VGG networks
python
class VGG(nn.Module):
    """VGG-style classifier built around a caller-supplied feature extractor.

    Args:
        features: Module producing a feature map with 512 channels (the
            classifier's first layer expects ``512 * 7 * 7`` inputs after
            the 7x7 adaptive average pool).
        num_classes: Number of output logits.
    """

    def __init__(self, features, num_classes=1000):
        super().__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
def make_vgg_layers(cfg, batch_norm=False, in_channels=3):
    """Build the VGG feature extractor described by ``cfg``.

    Args:
        cfg: Sequence whose entries are either ``'M'`` (insert a 2x2 max-pool
            with stride 2) or an int (insert a 3x3 convolution with that many
            output channels, followed by ReLU).
        batch_norm: When true, insert ``BatchNorm2d`` between each
            convolution and its ReLU.
        in_channels: Number of channels of the network input.

    Returns:
        An ``nn.Sequential`` containing the layers in ``cfg`` order.
    """
    layers = []
    for v in cfg:
        if v == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        conv = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
        if batch_norm:
            layers += [conv, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
        else:
            layers += [conv, nn.ReLU(inplace=True)]
        # The next convolution consumes this convolution's output.
        in_channels = v
    return nn.Sequential(*layers)
# VGG layer configurations: ints are conv output channels, 'M' is a max-pool
vgg_configs = dict(
    VGG11=[
        64, 'M',
        128, 'M',
        256, 256, 'M',
        512, 512, 'M',
        512, 512, 'M',
    ],
    VGG16=[
        64, 64, 'M',
        128, 128, 'M',
        256, 256, 256, 'M',
        512, 512, 512, 'M',
        512, 512, 512, 'M',
    ],
    VGG19=[
        64, 64, 'M',
        128, 128, 'M',
        256, 256, 256, 256, 'M',
        512, 512, 512, 512, 'M',
        512, 512, 512, 512, 'M',
    ],
)
def vgg16(num_classes=1000, batch_norm=True):
    """Build a VGG model from the ``'VGG16'`` entry of ``vgg_configs``.

    Args:
        num_classes: Number of output logits.
        batch_norm: Whether the feature extractor uses batch normalization.
    """
    return VGG(make_vgg_layers(vgg_configs['VGG16'], batch_norm), num_classes)

# Next section: Modern CNN architectures
1. ResNet(残差网络)
python
class BasicBlock(nn.Module):
    """Basic ResNet block: two 3x3 conv + batch-norm layers with a shortcut.

    Args:
        in_planes: Number of input channels.
        planes: Number of channels of both convolutions.
        stride: Stride of the first convolution (and of the shortcut).
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity shortcut by default; a 1x1 projection is used whenever the
        # main path changes the spatial size or the channel count.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)  # residual connection
        out = F.relu(out)
        return out
class ResNet(nn.Module):
    """ResNet with a 3x3 stem, built from a configurable residual block.

    Args:
        block: Block class called as ``block(in_planes, planes, stride)`` and
            exposing an integer ``expansion`` class attribute.
        num_blocks: Four ints giving the number of blocks in each stage.
        num_classes: Number of output logits.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running input width for the next block; updated by _make_layer.
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``."""
        block_strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for block_stride in block_strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs the map here is 4x4, so
        # this equals the former fixed avg_pool2d(out, 4); other input sizes
        # now also reduce to the 512 * expansion features the linear expects.
        out = F.adaptive_avg_pool2d(out, 1)
        out = torch.flatten(out, 1)
        out = self.linear(out)
        return out
def ResNet18(num_classes=10):
    """Build a ResNet of ``BasicBlock``s with two blocks in each of the four stages.

    Args:
        num_classes: Number of output logits.
    """
    return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)

# Next section: 2. DenseNet (densely connected network)
python
class DenseBlock(nn.Module):
    """Dense block: each layer sees the concatenation of all earlier outputs.

    Args:
        in_channels: Number of channels of the block input.
        growth_rate: Number of channels each layer adds.
        num_layers: Number of BN -> ReLU -> 3x3 conv layers.
    """

    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            # Layer i receives the block input plus the i earlier outputs.
            layer_in = in_channels + i * growth_rate
            self.layers.append(
                nn.Sequential(
                    nn.BatchNorm2d(layer_in),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(layer_in, growth_rate, kernel_size=3, padding=1),
                )
            )

    def forward(self, x):
        """Return the input concatenated with every layer's output along dim 1."""
        features = [x]
        for layer in self.layers:
            new_feature = layer(torch.cat(features, 1))
            features.append(new_feature)
        return torch.cat(features, 1)
class TransitionLayer(nn.Module):
    """BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    Changes the channel count to ``out_channels`` and halves the spatial size.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of channels produced by the 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.transition = nn.Sequential(
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            nn.AvgPool2d(kernel_size=2, stride=2),
        )

    def forward(self, x):
        """Apply the transition to ``x``."""
        return self.transition(x)

# Next section: Attention mechanisms
1. 通道注意力(SE模块)
python
class SEBlock(nn.Module):
    """Squeeze-and-Excitation block (channel attention).

    Args:
        channels: Number of channels of the input feature map.
        reduction: Bottleneck ratio of the two-layer gating MLP.
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        # Keep at least one hidden unit: channels // reduction is 0 whenever
        # channels < reduction, which would leave the gate with no capacity.
        hidden = max(1, channels // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channels, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channels, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Scale each channel of the 4D input ``x`` by a gate in (0, 1)."""
        b, c, _, _ = x.size()
        # Squeeze: one descriptor per channel via global average pooling.
        y = self.avg_pool(x).view(b, c)
        # Excite: per-channel gates, reshaped to broadcast over H and W.
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

# Next section: 2. Spatial attention
python
class SpatialAttention(nn.Module):
    """Spatial attention computed from channel-wise mean and max maps.

    Args:
        kernel_size: Odd kernel size of the attention convolution.

    Raises:
        ValueError: If ``kernel_size`` is even. With ``padding =
            kernel_size // 2`` an even kernel enlarges the attention map by
            one pixel, so it could not be multiplied with the input.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        if kernel_size % 2 == 0:
            raise ValueError(f"kernel_size must be odd, got {kernel_size}")
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Scale every spatial position of ``x`` by a gate in (0, 1)."""
        # Collapse the channel dimension two ways, then stack the two maps.
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        attention = torch.cat([avg_out, max_out], dim=1)
        attention = self.conv(attention)
        return x * self.sigmoid(attention)

# Next section: Practical application examples
1. CIFAR-10图像分类
python
# Data preprocessing: both pipelines normalize with the same per-channel
# mean / std; only the training pipeline adds random crop and flip.
_CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
_CIFAR10_STD = (0.2023, 0.1994, 0.2010)

transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
    ]
)
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(_CIFAR10_MEAN, _CIFAR10_STD),
    ]
)

# Data loading: CIFAR-10 is downloaded into ./data when it is not there yet
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2
)
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=100, shuffle=False, num_workers=2
)
# Model definition
class CIFAR10CNN(nn.Module):
    """Three-stage CNN classifier for 3-channel images.

    Each stage is two 3x3 conv + BN + ReLU layers followed by 2x2 max
    pooling and dropout. The stages are laid out flat inside ``features`` so
    that layers keep the names ``features.0`` ... ``features.23``.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        layers = []
        in_channels = 3
        for out_channels in (64, 128, 256):
            layers += self._conv_stage(in_channels, out_channels)
            in_channels = out_channels
        self.features = nn.Sequential(*layers)
        self.classifier = nn.Sequential(
            # Global pooling makes the head independent of the input size.
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the eight layers of one convolutional stage as a list."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
        ]

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        x = self.features(x)
        x = self.classifier(x)
        return x
# Training setup: pick the device, then create the model, loss and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CIFAR10CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

# Next section: Visualization and analysis
1. 特征图可视化
python
def visualize_feature_maps(model, input_tensor, layer_name):
    """Plot up to 32 channels of one layer's output for the first sample.

    The model is run in evaluation mode under ``torch.no_grad()``. The
    forward hook is removed and every submodule's training flag is restored
    before plotting, so the call leaves the model as it found it.

    Args:
        model: Module to run.
        input_tensor: Input passed to ``model``.
        layer_name: Name of the target layer as reported by
            ``model.named_modules()`` (e.g. ``'features.0'``).

    Raises:
        KeyError: If no module is named ``layer_name`` or the module did not
            run during the forward pass.
    """
    target = dict(model.named_modules()).get(layer_name)
    if target is None:
        raise KeyError(f"no module named {layer_name!r} in model")

    activation = {}

    def hook(module, inputs, output):
        activation[layer_name] = output.detach()

    # Remember per-module modes: model.eval() flips every submodule, and some
    # of them may have been in eval mode on purpose.
    previous_modes = {module: module.training for module in model.modules()}
    handle = target.register_forward_hook(hook)
    try:
        model.eval()
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
        for module, was_training in previous_modes.items():
            module.training = was_training

    if layer_name not in activation:
        raise KeyError(f"module {layer_name!r} was not executed by the forward pass")
    feature_maps = activation[layer_name]

    fig, axes = plt.subplots(4, 8, figsize=(16, 8))
    # Hide every axis up front so unused cells stay blank.
    for ax in axes.flat:
        ax.axis('off')
    for i in range(min(32, feature_maps.shape[1])):
        ax = axes[i // 8, i % 8]
        ax.imshow(feature_maps[0, i].cpu(), cmap='viridis')
        ax.set_title(f'Channel {i}')
    plt.tight_layout()
    plt.show()
# Usage example: show the feature maps of the first convolution
sample_input = torch.randn(1, 3, 32, 32).to(device)
visualize_feature_maps(model, sample_input, 'features.0')

# Next section: 2. Visualizing convolution kernels
python
def visualize_conv_filters(model, layer_name):
    """Plot up to 16 filters of a ``nn.Conv2d`` layer in a 4x4 grid.

    Filters with three input channels are min-max normalized and shown as RGB
    images; otherwise only the first input channel is shown in grayscale.
    Nothing is plotted when ``layer_name`` does not name a ``nn.Conv2d``.

    Args:
        model: Module containing the layer.
        layer_name: Layer name as reported by ``model.named_modules()``.
    """
    module = dict(model.named_modules()).get(layer_name)
    if not isinstance(module, nn.Conv2d):
        return

    weights = module.weight.detach().cpu()
    # Show only the first 16 filters.
    num_filters = min(16, weights.shape[0])
    fig, axes = plt.subplots(4, 4, figsize=(8, 8))
    # Hide every axis up front so unused cells stay blank.
    for ax in axes.flat:
        ax.axis('off')
    for i in range(num_filters):
        ax = axes[i // 4, i % 4]
        if weights.shape[1] == 3:
            # RGB input: show all three channels as one colour image.
            filter_img = weights[i].permute(1, 2, 0)
            low = filter_img.min()
            # Clamp the range so a constant filter maps to 0 instead of NaN.
            span = (filter_img.max() - low).clamp(min=torch.finfo(filter_img.dtype).eps)
            ax.imshow((filter_img - low) / span)
        else:
            ax.imshow(weights[i, 0], cmap='gray')
        ax.set_title(f'Filter {i}')
    plt.tight_layout()
    plt.show()
# Usage example: show the filters of the first convolution
visualize_conv_filters(model, 'features.0')

# Next section: Summary
卷积神经网络是计算机视觉的基础,本章介绍了:
- 基础概念:卷积层、池化层的原理和实现
- 经典架构:LeNet、AlexNet、VGG等经典网络
- 现代架构与技术:ResNet的残差连接、DenseNet的密集连接、注意力机制等
- 实际应用:CIFAR-10图像分类的数据预处理、模型定义与优化器配置
- 可视化分析:特征图和卷积核的可视化方法
掌握CNN将为你在计算机视觉领域的深入学习打下坚实基础!