PyTorch 自动微分

什么是自动微分？

自动微分（Automatic Differentiation，简称 AD）是PyTorch的核心特性之一，由 autograd 模块实现，它能够自动计算张量运算的梯度。这对于深度学习中的反向传播算法至关重要。

python

import torch

# Create a tensor that tracks gradients
x = torch.tensor(2.0, requires_grad=True)
y = x ** 2 + 3 * x + 1

# Backpropagate from y to populate x.grad with dy/dx
y.backward()

print(f"x = {x}")
print(f"y = {y}")
print(f"dy/dx = {x.grad}")  # expected: 2*2 + 3 = 7

计算图

PyTorch使用动态计算图来跟踪操作和计算梯度：

python

import torch

x = torch.tensor(1.0, requires_grad=True)
y = torch.tensor(2.0, requires_grad=True)

# Build the computation graph
z = x * y + x ** 2
w = z.mean()  # z is a scalar here, so its mean has the same value as z

print(f"计算图中的函数: {w.grad_fn}")
print(f"z的梯度函数: {z.grad_fn}")

# Backpropagate from w
w.backward()

print(f"dw/dx = {x.grad}")  # 2*x + y = 2*1 + 2 = 4
print(f"dw/dy = {y.grad}")  # x = 1

requires_grad属性

1. 基本用法

python

# Request gradient tracking at creation time
x = torch.randn(3, 4, requires_grad=True)

# Enable gradient tracking afterwards
y = torch.randn(3, 4)
y.requires_grad_(True)  # modifies y in place

# Check whether each tensor tracks gradients
print(f"x需要梯度: {x.requires_grad}")
print(f"y需要梯度: {y.requires_grad}")

2. 梯度传播规则

python

x = torch.randn(2, 2, requires_grad=True)
y = torch.randn(2, 2, requires_grad=False)

# If at least one operand requires grad, so does the result
z = x + y
print(f"z需要梯度: {z.requires_grad}")  # True

# If no operand requires grad, neither does the result
a = torch.randn(2, 2)
b = torch.randn(2, 2)
c = a + b
print(f"c需要梯度: {c.requires_grad}")  # False

梯度计算

1. 标量函数的梯度

python

# Function of a single variable
x = torch.tensor(3.0, requires_grad=True)
y = x ** 3 - 2 * x ** 2 + x - 1

y.backward()
print(f"dy/dx = {x.grad}")  # 3*x^2 - 4*x + 1 = 3*9 - 4*3 + 1 = 16

# Function of several variables (x and y are rebound to new leaf tensors)
x = torch.tensor(1.0, requires_grad=True)
y = torch.tensor(2.0, requires_grad=True)
z = x ** 2 + y ** 2 + 2 * x * y

z.backward()
print(f"dz/dx = {x.grad}")  # 2*x + 2*y = 2*1 + 2*2 = 6
print(f"dz/dy = {y.grad}")  # 2*y + 2*x = 2*2 + 2*1 = 6

2. 向量函数的梯度

python

# A non-scalar output needs an explicit gradient argument for backward()
x = torch.randn(3, requires_grad=True)
y = x * 2

# The gradient argument must have the same shape as y
gradient = torch.ones_like(y)
y.backward(gradient)

print(f"x的梯度: {x.grad}")  # expected: every entry equals 2

3. 雅可比向量积

python

x = torch.randn(3, requires_grad=True)
y = x ** 2

# backward(v) computes the vector-Jacobian product J^T * v
v = torch.tensor([1.0, 1.0, 1.0])
y.backward(v)

print(f"雅可比向量积: {x.grad}")  # equals 2*x because v is all ones

梯度累积

python

x = torch.tensor(1.0, requires_grad=True)

# First backward pass
y1 = x ** 2
y1.backward()
print(f"第一次梯度: {x.grad}")  # 2

# Second backward pass (the new gradient is added to x.grad)
y2 = x ** 3
y2.backward()
print(f"累积后梯度: {x.grad}")  # 2 + 3 = 5

# Reset the accumulated gradient in place
x.grad.zero_()
print(f"清零后梯度: {x.grad}")  # 0

高阶导数

python

# Compute a second-order derivative
x = torch.tensor(2.0, requires_grad=True)
y = x ** 4

# First derivative; create_graph=True keeps it differentiable
grad1 = torch.autograd.grad(y, x, create_graph=True)[0]
print(f"一阶导数: {grad1}")  # 4 * x^3 = 32

# Second derivative, obtained by differentiating grad1 with respect to x
grad2 = torch.autograd.grad(grad1, x)[0]
print(f"二阶导数: {grad2}")  # 12 * x^2 = 48

控制梯度计算

1. torch.no_grad()

python

x = torch.randn(3, requires_grad=True)

# Inside a no_grad context no computation graph is built
with torch.no_grad():
    y = x ** 2
    print(f"y需要梯度: {y.requires_grad}")  # False

# Decorator form
@torch.no_grad()
def inference(x):
    """Return ``x ** 2 + 1`` computed with gradient tracking disabled."""
    return x ** 2 + 1

result = inference(x)
print(f"推理结果需要梯度: {result.requires_grad}")  # False

2. detach()方法

python

x = torch.randn(3, requires_grad=True)
y = x ** 2

# Detach the tensor to stop gradient propagation through it
y_detached = y.detach()
z = y_detached * 2

print(f"y需要梯度: {y.requires_grad}")  # True
print(f"y_detached需要梯度: {y_detached.requires_grad}")  # False
print(f"z需要梯度: {z.requires_grad}")  # False

3. torch.set_grad_enabled()

python

# Control gradient tracking globally.
# The previous mode is remembered and restored in a finally block so that an
# exception in between cannot leave gradient tracking disabled for the rest of
# the program, and so that a caller that had already disabled it is respected.
previous_mode = torch.is_grad_enabled()
torch.set_grad_enabled(False)
try:
    x = torch.randn(3, requires_grad=True)
    y = x ** 2
    print(f"全局禁用时y需要梯度: {y.requires_grad}")  # False
finally:
    torch.set_grad_enabled(previous_mode)  # restore the earlier setting

自定义autograd函数

python

class MySquare(torch.autograd.Function):
    """Element-wise square with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input ** 2`` and keep ``input`` for the backward pass."""
        squared = input ** 2
        # The derivative depends on the input, so stash it on the context.
        ctx.save_for_backward(input)
        return squared

    @staticmethod
    def backward(ctx, grad_output):
        """Chain rule: upstream gradient times d(x^2)/dx = 2 * x."""
        (saved_input,) = ctx.saved_tensors
        scaled_upstream = grad_output * 2
        return scaled_upstream * saved_input

# Use the custom function through its apply entry point
my_square = MySquare.apply

x = torch.tensor(3.0, requires_grad=True)
y = my_square(x)
y.backward()

print(f"自定义函数梯度: {x.grad}")  # 2 * 3 = 6

梯度检查

python

def gradient_check(func, inputs, eps=1e-6):
    """Compare autograd gradients with central finite differences.

    Args:
        func: Callable that takes the tensors in ``inputs`` positionally and
            returns a tensor. A non-scalar result is reduced with ``sum()``.
            It is also called with float64 copies of the inputs.
        inputs: Sequence of floating-point tensors with ``requires_grad=True``.
            They are not modified.
        eps: Step size of the central difference.

    Returns:
        A list holding, for each input, the largest absolute difference
        between the analytical and the numerical gradient. The same values
        are printed.
    """
    inputs = list(inputs)

    # Analytical gradients via autograd.
    outputs = func(*inputs)
    if outputs.numel() != 1:
        outputs = outputs.sum()
    analytical_grads = torch.autograd.grad(outputs, inputs)

    # Numerical gradients are evaluated on detached float64 copies:
    #  * autograd rejects in-place writes to leaf tensors that require grad;
    #  * a step of 1e-6 is below float32 resolution for values around 1.
    probes = [inp.detach().double().contiguous().clone() for inp in inputs]
    numerical_grads = []
    with torch.no_grad():
        for probe in probes:
            grad = torch.zeros_like(probe)
            # Flat views share storage with probe/grad and also cover the
            # 0-dim case, where view(-1) yields a single element.
            flat_probe = probe.view(-1)
            flat_grad = grad.view(-1)

            for k in range(flat_probe.numel()):
                old_value = flat_probe[k].item()

                # f(x + eps)
                flat_probe[k] = old_value + eps
                pos_output = func(*probes).sum()

                # f(x - eps)
                flat_probe[k] = old_value - eps
                neg_output = func(*probes).sum()

                # Central difference
                flat_grad[k] = (pos_output - neg_output) / (2 * eps)

                # Restore the original value before moving on
                flat_probe[k] = old_value

            numerical_grads.append(grad)

    # Compare both gradient estimates
    diffs = []
    for i, (analytical, numerical) in enumerate(zip(analytical_grads, numerical_grads)):
        diff = torch.abs(analytical.double() - numerical).max()
        print(f"输入{i}的梯度差异: {diff:.8f}")
        diffs.append(diff.item())
    return diffs

# Test
def test_func(x, y):
    """Return ``x ** 2 + y ** 3`` (gradients: ``2 * x`` and ``3 * y ** 2``)."""
    return x ** 2 + y ** 3

x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)
gradient_check(test_func, [x, y])

常见问题和解决方案

1. 梯度爆炸

python

# Gradient clipping
def clip_gradients(model, max_norm):
    """Clip the global gradient norm of ``model``'s parameters in place.

    Args:
        model: Module whose parameter gradients are rescaled.
        max_norm: Maximum allowed total gradient norm.

    Returns:
        The value returned by ``torch.nn.utils.clip_grad_norm_``, i.e. the
        total gradient norm measured before clipping. Handy for logging.
    """
    return torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

# 使用示例
# clip_gradients(model, max_norm=1.0)

2. 梯度消失

python

# Inspect gradient magnitudes
def check_gradients(model):
    """Print and return the global L2 norm of all parameter gradients.

    Parameters whose ``.grad`` is ``None`` are skipped.

    Args:
        model: Module whose ``parameters()`` are inspected.

    Returns:
        The square root of the summed squared per-parameter gradient norms,
        as a Python float (``0.0`` when no parameter has a gradient).
    """
    squared_sum = 0.0
    for p in model.parameters():
        if p.grad is None:
            continue
        # detach() replaces the legacy .data attribute
        param_norm = p.grad.detach().norm(2)
        squared_sum += param_norm.item() ** 2
    total_norm = squared_sum ** 0.5
    print(f"梯度总范数: {total_norm}")
    return total_norm

3. 内存泄漏

python

# NOTE(review): `optimizer`, `model` and `x` are not defined in this fragment;
# presumably they come from the surrounding training code.

# Clear accumulated gradients promptly
optimizer.zero_grad()

# Detach tensors that do not need gradients
prediction = model(x).detach()

# Run inference under torch.no_grad()
with torch.no_grad():
    prediction = model(x)

实际应用示例

1. 简单的线性回归

python

import torch
import torch.nn as nn

# Generate data: y = 3 * x + 2 plus small Gaussian noise
torch.manual_seed(42)
x = torch.randn(100, 1)
y = 3 * x + 2 + 0.1 * torch.randn(100, 1)

# Trainable parameters
w = torch.randn(1, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)

# Training loop
learning_rate = 0.01
for epoch in range(100):
    # Forward pass
    y_pred = x @ w + b
    loss = ((y_pred - y) ** 2).mean()
    
    # Backward pass
    loss.backward()
    
    # Update the parameters without recording the update in the graph
    with torch.no_grad():
        w -= learning_rate * w.grad
        b -= learning_rate * b.grad
        
        # Reset the gradients so they do not accumulate across epochs
        w.grad.zero_()
        b.grad.zero_()
    
    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

print(f"学习到的参数: w={w.item():.2f}, b={b.item():.2f}")

2. 神经网络中的梯度流

python

import torch.nn as nn

class SimpleNet(nn.Module):
    """Fully connected network 10 -> 50 -> 20 -> 1 with ReLU activations."""

    def __init__(self):
        super().__init__()
        # Modules are created in network order and wrapped in a Sequential.
        stack = [
            nn.Linear(10, 50),
            nn.ReLU(),
            nn.Linear(50, 20),
            nn.ReLU(),
            nn.Linear(20, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        out = self.layers(x)
        return out

# Create the model and data
model = SimpleNet()
x = torch.randn(32, 10)
y = torch.randn(32, 1)

# Forward pass
output = model(x)
loss = nn.MSELoss()(output, y)

# Backward pass
loss.backward()

# Inspect the gradient norm of every parameter
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: 梯度范数 = {param.grad.norm().item():.6f}")

总结

自动微分是PyTorch的核心功能，理解它对于深度学习至关重要：

计算图：理解动态计算图的构建和执行
梯度计算：掌握标量和向量函数的梯度计算
梯度控制：学会使用no_grad、detach等控制梯度
性能优化：避免不必要的梯度计算，及时清理内存
调试技巧：使用梯度检查验证实现的正确性

掌握这些概念将为后续的神经网络训练打下坚实基础！

PyTorch 自动微分 ​

什么是自动微分？ ​

计算图 ​

requires_grad属性 ​

1. 基本用法 ​

2. 梯度传播规则 ​

梯度计算 ​

1. 标量函数的梯度 ​

2. 向量函数的梯度 ​

3. 雅可比向量积 ​

梯度累积 ​

高阶导数 ​

控制梯度计算 ​

1. torch.no_grad() ​

2. detach()方法 ​

3. torch.set_grad_enabled() ​

自定义autograd函数 ​

梯度检查 ​

常见问题和解决方案 ​

1. 梯度爆炸 ​

2. 梯度消失 ​

3. 内存泄漏 ​

实际应用示例 ​

1. 简单的线性回归 ​

2. 神经网络中的梯度流 ​

总结 ​

PyTorch 自动微分

什么是自动微分？

计算图

requires_grad属性

1. 基本用法

2. 梯度传播规则

梯度计算

1. 标量函数的梯度

2. 向量函数的梯度

3. 雅可比向量积

梯度累积

高阶导数

控制梯度计算

1. torch.no_grad()

2. detach()方法

3. torch.set_grad_enabled()

自定义autograd函数

梯度检查

常见问题和解决方案

1. 梯度爆炸

2. 梯度消失

3. 内存泄漏

实际应用示例

1. 简单的线性回归

2. 神经网络中的梯度流

总结