Skip to content

PyTorch 卷积神经网络

卷积神经网络简介

卷积神经网络(Convolutional Neural Network, CNN)是深度学习中最重要的架构之一,特别适用于图像处理任务。CNN通过卷积层、池化层和全连接层的组合,能够自动学习图像的层次化特征。

python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

# Basic CNN building blocks: a convolution, a max-pool and a fully connected layer
conv_layer = nn.Conv2d(3, 16, kernel_size=3, padding=1)
pool_layer = nn.MaxPool2d(2, stride=2)
fc_layer = nn.Linear(1024, 10)

卷积层详解

1. 基本卷积操作

python
# 2D convolution with every constructor argument spelled out.
# Positional order: in_channels (input channels), out_channels (number of
# filters), kernel_size, stride, padding, dilation, groups, bias.
conv2d = nn.Conv2d(3, 64, 3, 1, 1, 1, 1, True)

# 1D convolution (for sequence data)
conv1d = nn.Conv1d(100, 128, kernel_size=3)

# 3D convolution (for video data)
conv3d = nn.Conv3d(3, 64, kernel_size=3)

# Transposed convolution (sometimes called "deconvolution")
conv_transpose = nn.ConvTranspose2d(
    in_channels=64,
    out_channels=3,
    kernel_size=3,
    stride=2,
    padding=1,
)

2. 卷积参数计算

python
def conv_output_size(input_size, kernel_size, stride=1, padding=0, dilation=1):
    """Return the output length of a convolution along one spatial dimension.

    Args:
        input_size: Input length along the dimension.
        kernel_size: Kernel length along the dimension.
        stride: Step between consecutive kernel positions.
        padding: Padding added to each side of the input.
        dilation: Spacing between kernel elements.

    Returns:
        The number of output positions, using floor division by ``stride``.
    """
    # A dilated kernel spans dilation * (kernel_size - 1) + 1 input positions.
    effective_kernel = dilation * (kernel_size - 1) + 1
    padded_size = input_size + 2 * padding
    return (padded_size - effective_kernel) // stride + 1

# Worked example: a 3x3 kernel with stride 1 and padding 1 keeps a 32x32 input at 32x32
input_h = input_w = 32
kernel_size, stride, padding = 3, 1, 1

output_h, output_w = (
    conv_output_size(side, kernel_size, stride, padding)
    for side in (input_h, input_w)
)
print(f"输出尺寸: {output_h} x {output_w}")  # 32 x 32

3. 不同类型的卷积

python
# Depthwise separable convolution
class DepthwiseSeparableConv(nn.Module):
    """Depthwise convolution followed by a 1x1 pointwise convolution.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels produced by the pointwise convolution.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution.
        padding: Padding of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # groups=in_channels gives every input channel its own spatial filter.
        self.depthwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=in_channels,
        )
        # The 1x1 convolution mixes information across channels.
        self.pointwise = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
        )

    def forward(self, x):
        """Apply the depthwise filter, then the pointwise channel mix."""
        return self.pointwise(self.depthwise(x))

# Dilated (atrous) convolution
dilated_conv = nn.Conv2d(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    padding=2,
    dilation=2,
)

# Grouped convolution
group_conv = nn.Conv2d(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    padding=1,
    groups=4,
)

池化层

1. 常用池化操作

python
# Max pooling
max_pool = nn.MaxPool2d(2, stride=2)

# Average pooling
avg_pool = nn.AvgPool2d(2, stride=2)

# Adaptive pooling (fixed output size regardless of input size)
adaptive_avg_pool = nn.AdaptiveAvgPool2d(output_size=(7, 7))
adaptive_max_pool = nn.AdaptiveMaxPool2d(output_size=(1, 1))  # global pooling

# Try max pooling on a random batch
x = torch.randn(1, 64, 32, 32)
pooled = max_pool(x)
for label, tensor in (("池化前", x), ("池化后", pooled)):
    # Expected: [1, 64, 32, 32] before and [1, 64, 16, 16] after
    print(f"{label}: {tensor.shape}")

2. 自定义池化

python
class StochasticPool2d(nn.Module):
    """Stochastic pooling over 2D windows.

    Each pooling window is turned into a probability distribution by
    normalising its non-negative activations (negative values are clamped to
    a probability of zero). A window with no positive activation falls back
    to a uniform distribution.

    * Training mode: one activation per window is sampled from that
      distribution.
    * Eval mode: the probability-weighted average of the window is returned,
      i.e. the expected value of the training-time sample.

    Args:
        kernel_size: Window size, an int or an ``(h, w)`` pair.
        stride: Window step, an int or an ``(h, w)`` pair. Defaults to
            ``kernel_size`` (non-overlapping windows).
    """

    def __init__(self, kernel_size, stride=None):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size

    @staticmethod
    def _as_pair(value):
        """Expand an int to ``(value, value)``; unpack a 2-sequence as is."""
        if isinstance(value, int):
            return value, value
        height, width = value
        return height, width

    def forward(self, x):
        """Pool a ``(N, C, H, W)`` tensor to ``(N, C, out_h, out_w)``."""
        kernel_h, kernel_w = self._as_pair(self.kernel_size)
        stride_h, stride_w = self._as_pair(self.stride)
        window_len = kernel_h * kernel_w

        # Two unfolds give (N, C, out_h, out_w, kernel_h, kernel_w); the last
        # two axes are then merged so each window is one flat vector.
        windows = x.unfold(2, kernel_h, stride_h).unfold(3, kernel_w, stride_w)
        batch, channels, out_h, out_w = windows.shape[:4]
        windows = windows.reshape(batch, channels, out_h, out_w, window_len)

        weights = windows.detach().clamp(min=0)
        totals = weights.sum(dim=-1, keepdim=True)
        has_mass = totals > 0
        # Replace zero totals by one so the division never produces NaN; those
        # windows take the uniform branch of torch.where below.
        safe_totals = torch.where(has_mass, totals, torch.ones_like(totals))
        uniform = torch.full_like(weights, 1.0 / window_len)
        probs = torch.where(has_mass, weights / safe_totals, uniform)

        if self.training:
            picks = torch.multinomial(probs.reshape(-1, window_len), 1)
            picks = picks.reshape(batch, channels, out_h, out_w, 1)
            return windows.gather(-1, picks).squeeze(-1)

        return (probs * windows).sum(dim=-1)

经典CNN架构

1. LeNet-5

python
class LeNet5(nn.Module):
    """LeNet-5 style network: two conv/tanh/avg-pool stages and three linear layers.

    The classifier's ``16 * 5 * 5`` input width matches single-channel
    32x32 inputs (32 -> 28 -> 14 -> 10 -> 5 through the feature stack).

    Args:
        num_classes: Size of the final output layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        feature_layers = [
            nn.Conv2d(1, 6, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(6, 16, kernel_size=5),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*feature_layers)

        head_layers = [
            nn.Linear(16 * 5 * 5, 120),
            nn.Tanh(),
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        feature_maps = self.features(x)
        # Collapse everything except the batch dimension.
        flattened = feature_maps.view(feature_maps.shape[0], -1)
        return self.classifier(flattened)

2. AlexNet

python
class AlexNet(nn.Module):
    """AlexNet-style classifier: five conv layers and a three-layer linear head.

    An adaptive average pool fixes the feature map at 6x6 before flattening,
    so the first linear layer always receives ``256 * 6 * 6`` values.

    Args:
        num_classes: Size of the final output layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        conv_stack = [
            # Stage 1: large strided kernel, then overlapping max-pool
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Stage 2
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Stages 3-5: three 3x3 convolutions, pooled once at the end
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))

        head = [
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(pooled.flatten(1))

3. VGG网络

python
class VGG(nn.Module):
    """VGG-style classifier wrapped around a caller-supplied feature extractor.

    Args:
        features: Module producing 512-channel feature maps. The head's
            first linear layer is sized for ``512 * 7 * 7`` inputs.
        num_classes: Size of the final output layer.
    """

    def __init__(self, features, num_classes=1000):
        super().__init__()
        self.features = features
        # Fix the spatial size at 7x7 regardless of the input resolution.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        head = [
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(pooled.flatten(1))

def make_vgg_layers(cfg, batch_norm=False):
    """Build the VGG feature extractor described by ``cfg``.

    Args:
        cfg: Sequence whose entries are either the string ``'M'`` (a 2x2
            max-pool with stride 2) or an output channel count for a 3x3
            convolution with padding 1.
        batch_norm: When true, insert ``BatchNorm2d`` between every
            convolution and its ReLU.

    Returns:
        An ``nn.Sequential`` of the generated layers; the first convolution
        takes 3 input channels.
    """
    modules = []
    channels = 3

    for entry in cfg:
        if entry == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue

        modules.append(nn.Conv2d(channels, entry, kernel_size=3, padding=1))
        if batch_norm:
            modules.append(nn.BatchNorm2d(entry))
        modules.append(nn.ReLU(inplace=True))
        # The next convolution consumes what this one produced.
        channels = entry

    return nn.Sequential(*modules)

# VGG configurations. Every variant has five stages of widths 64, 128, 256,
# 512, 512; a stage is `depth` 3x3 convolutions followed by one 'M' (max-pool).
vgg_configs = {
    name: [
        entry
        for width, depth in zip((64, 128, 256, 512, 512), depths)
        for entry in [width] * depth + ['M']
    ]
    for name, depths in (
        ('VGG11', (1, 1, 2, 2, 2)),
        ('VGG16', (2, 2, 3, 3, 3)),
        ('VGG19', (2, 2, 4, 4, 4)),
    )
}

def vgg16(num_classes=1000, batch_norm=True):
    """Build a VGG model from the ``'VGG16'`` configuration.

    Args:
        num_classes: Size of the final output layer.
        batch_norm: Whether the feature extractor uses batch normalisation.
    """
    feature_extractor = make_vgg_layers(vgg_configs['VGG16'], batch_norm)
    return VGG(feature_extractor, num_classes)

现代CNN架构

1. ResNet(残差网络)

python
class BasicBlock(nn.Module):
    """Basic ResNet block: two 3x3 conv/BN layers plus a shortcut connection.

    Args:
        in_planes: Channels of the input tensor.
        planes: Channels of both convolutions (times ``expansion`` on output).
        stride: Stride of the first convolution and of the projection shortcut.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        self.conv1 = nn.Conv2d(
            in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        shape_preserved = stride == 1 and in_planes == out_planes
        if shape_preserved:
            # Empty Sequential acts as the identity.
            self.shortcut = nn.Sequential()
        else:
            # 1x1 projection so the shortcut matches the main branch's shape.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        out = self.conv1(x)
        out = F.relu(self.bn1(out))
        out = self.conv2(out)
        out = self.bn2(out)
        out += self.shortcut(x)  # residual connection
        return F.relu(out)

class ResNet(nn.Module):
    """ResNet with a stride-1 3x3 stem and four residual stages.

    Args:
        block: Block class, called as ``block(in_planes, planes, stride)``
            and exposing an ``expansion`` class attribute.
        num_blocks: Four integers, the number of blocks in each stage.
        num_classes: Size of the final output layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count; updated by _make_layer as stages are built.
        self.in_planes = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        block_strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for block_stride in block_strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. A fixed 4x4 window only reduces the map to
        # 1x1 when layer4 emits 4x4 maps, which ties the model to one input
        # resolution; pooling to 1x1 adaptively gives the same result in that
        # case and keeps the linear layer's input width correct for other sizes.
        out = F.adaptive_avg_pool2d(out, 1)
        out = torch.flatten(out, 1)
        return self.linear(out)

def ResNet18():
    """Build a ResNet from ``BasicBlock`` with two blocks in each of the four stages."""
    blocks_per_stage = [2, 2, 2, 2]
    return ResNet(BasicBlock, blocks_per_stage)

2. DenseNet(密集连接网络)

python
class DenseBlock(nn.Module):
    """Dense block: every layer receives the concatenation of all earlier outputs.

    Args:
        in_channels: Channels of the block input.
        growth_rate: Channels added by each layer.
        num_layers: Number of BN -> ReLU -> 3x3 conv layers.
    """

    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        self.layers = nn.ModuleList()

        # Each layer sees the block input plus growth_rate channels per
        # earlier layer.
        width = in_channels
        for _ in range(num_layers):
            self.layers.append(
                nn.Sequential(
                    nn.BatchNorm2d(width),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(width, growth_rate, kernel_size=3, padding=1),
                )
            )
            width += growth_rate

    def forward(self, x):
        """Return the input concatenated with every layer's output along dim 1."""
        collected = [x]
        for layer in self.layers:
            stacked = torch.cat(collected, dim=1)
            collected.append(layer(stacked))
        return torch.cat(collected, dim=1)

class TransitionLayer(nn.Module):
    """Transition between dense blocks: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    Args:
        in_channels: Channels of the input tensor.
        out_channels: Channels produced by the 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = [
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            # The 1x1 convolution changes the channel count ...
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            # ... and the pool halves the spatial resolution.
            nn.AvgPool2d(kernel_size=2, stride=2),
        ]
        self.transition = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the transition stack to ``x``."""
        out = self.transition(x)
        return out

注意力机制

1. 通道注意力(SE模块)

python
class SEBlock(nn.Module):
    """Squeeze-and-Excitation block: rescale each channel by a learned gate.

    Args:
        channels: Channels of the input tensor.
        reduction: Divisor for the bottleneck width (``channels // reduction``).
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        bottleneck = channels // reduction
        # Squeeze: one average value per channel.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # Excitation: bottleneck MLP ending in a sigmoid gate in (0, 1).
        self.fc = nn.Sequential(
            nn.Linear(channels, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channels, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return ``x`` with every channel multiplied by its gate value."""
        batch, channels, _height, _width = x.size()
        squeezed = self.avg_pool(x).view(batch, channels)
        gates = self.fc(squeezed).view(batch, channels, 1, 1)
        # Broadcasting over the spatial dimensions scales whole channels.
        return x * gates

2. 空间注意力

python
class SpatialAttention(nn.Module):
    """Spatial attention: gate every location using channel-wise mean and max maps.

    Args:
        kernel_size: Kernel size of the convolution over the two pooled maps;
            padding is ``kernel_size // 2``.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=2,
            out_channels=1,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
            bias=False,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by a single-channel attention map."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        gate = self.sigmoid(self.conv(pooled))
        return x * gate

实际应用示例

1. CIFAR-10图像分类

python
# Preprocessing: the training pipeline adds random crop/flip augmentation,
# the test pipeline only converts to tensors and normalizes.
transform_train = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.4914, 0.4822, 0.4465),
            (0.2023, 0.1994, 0.2010),
        ),
    ]
)

transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            (0.4914, 0.4822, 0.4465),
            (0.2023, 0.1994, 0.2010),
        ),
    ]
)

# Load the data (downloaded into ./data when requested via download=True)
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)

testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=100,
    shuffle=False,
    num_workers=2,
)

# 定义模型
class CIFAR10CNN(nn.Module):
    """Three double-convolution stages followed by a pooled two-layer classifier.

    Args:
        num_classes: Size of the final output layer.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the layers of one stage: 2 x (conv, BN, ReLU), max-pool, dropout."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Dropout(0.25),
        ]

    def __init__(self, num_classes=10):
        super().__init__()

        # Stages are unpacked into one flat Sequential so that layers keep
        # the names features.0 ... features.23.
        self.features = nn.Sequential(
            *self._conv_stage(3, 64),
            *self._conv_stage(64, 128),
            *self._conv_stage(128, 256),
        )

        self.classifier = nn.Sequential(
            # Global average pooling leaves one value per channel.
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(256, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        return self.classifier(self.features(x))

# Training setup: device, model, loss function and optimizer
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = CIFAR10CNN()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.001,
    weight_decay=0.01,
)

可视化和分析

1. 特征图可视化

python
def visualize_feature_maps(model, input_tensor, layer_name):
    """Plot up to 32 channels of one layer's output for the first sample.

    A temporary forward hook captures the layer's output during a single
    no-grad forward pass in eval mode. The hook is always removed and the
    model's previous train/eval mode is restored, even if the forward pass
    raises.

    Args:
        model: Module to run.
        input_tensor: Input batch passed to ``model``; only sample 0 is plotted.
        layer_name: Name of the target layer as reported by
            ``model.named_modules()`` (for example ``'features.0'``).

    Raises:
        KeyError: If ``layer_name`` is not a module of ``model`` or the layer
            did not run during the forward pass.
    """
    target = dict(model.named_modules()).get(layer_name)
    if target is None:
        raise KeyError(f"model has no module named {layer_name!r}")

    captured = {}

    def capture_output(module, inputs, output):
        captured[layer_name] = output.detach()

    # Register the hook only for the duration of this call.
    handle = target.register_forward_hook(capture_output)
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            model(input_tensor)
    finally:
        handle.remove()
        model.train(was_training)

    if layer_name not in captured:
        raise KeyError(f"module {layer_name!r} was not executed in the forward pass")
    feature_maps = captured[layer_name]

    # 4 x 8 grid; cells beyond the available channels are left blank.
    _, axes = plt.subplots(4, 8, figsize=(16, 8))
    shown = min(32, feature_maps.shape[1])
    for index, ax in enumerate(axes.flat):
        ax.axis('off')
        if index >= shown:
            continue
        ax.imshow(feature_maps[0, index].cpu(), cmap='viridis')
        ax.set_title(f'Channel {index}')

    plt.tight_layout()
    plt.show()

# Usage example: show the output of the first convolution for a random image
sample_input = torch.randn(1, 3, 32, 32)
sample_input = sample_input.to(device)
visualize_feature_maps(model, sample_input, layer_name='features.0')

2. 卷积核可视化

python
def visualize_conv_filters(model, layer_name):
    """Plot up to 16 filters of a ``Conv2d`` layer in a 4 x 4 grid.

    Filters with exactly 3 input channels are drawn as RGB images after
    min-max scaling to [0, 1]; otherwise only input channel 0 is drawn in
    grayscale.

    Args:
        model: Module containing the layer.
        layer_name: Name of the layer as reported by ``model.named_modules()``.

    Returns:
        None. If ``layer_name`` does not name a ``Conv2d`` module, a warning
        is issued and nothing is plotted.
    """
    import warnings

    module = dict(model.named_modules()).get(layer_name)
    if not isinstance(module, nn.Conv2d):
        warnings.warn(
            f"{layer_name!r} is not a Conv2d module of the model; nothing to plot",
            stacklevel=2,
        )
        return

    weights = module.weight.detach().cpu()
    is_rgb = weights.shape[1] == 3

    # Only the first 16 filters are shown; cells beyond that stay blank.
    num_filters = min(16, weights.shape[0])
    _, axes = plt.subplots(4, 4, figsize=(8, 8))

    for index, ax in enumerate(axes.flat):
        ax.axis('off')
        if index >= num_filters:
            continue

        if is_rgb:
            # (C, H, W) -> (H, W, C), the layout imshow expects for colour.
            filter_img = weights[index].permute(1, 2, 0)
            low, high = filter_img.min(), filter_img.max()
            if high > low:
                filter_img = (filter_img - low) / (high - low)
            else:
                # A constant filter would divide by zero; draw it as black.
                filter_img = torch.zeros_like(filter_img)
            ax.imshow(filter_img)
        else:
            ax.imshow(weights[index, 0], cmap='gray')

        ax.set_title(f'Filter {index}')

    plt.tight_layout()
    plt.show()

# Usage example: show the filters of the model's first convolution
visualize_conv_filters(model, layer_name='features.0')

总结

卷积神经网络是计算机视觉的基础,本章介绍了:

  1. 基础概念:卷积层、池化层的原理和实现
  2. 经典架构:LeNet、AlexNet、VGG等经典网络
  3. 现代架构与技术:ResNet残差网络、DenseNet密集连接,以及通道/空间注意力机制
  4. 实际应用:CIFAR-10图像分类的数据预处理、模型定义与优化器配置
  5. 可视化分析:特征图和卷积核的可视化方法

掌握CNN将为你在计算机视觉领域的深入学习打下坚实基础!

本站内容仅供学习和研究使用。