
Chapter 8: Support Vector Machines

The Support Vector Machine (SVM) is one of the most powerful and elegant algorithms in machine learning. It solves classification and regression problems by finding an optimal separating hyperplane, and it excels on high-dimensional data and nonlinear problems.

8.1 What Is a Support Vector Machine?

The core idea of an SVM is to find an optimal decision boundary (a hyperplane) that maximizes the margin between the classes. This boundary is determined by a small number of critical data points: the support vectors.
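
For linearly separable data, this intuition becomes a convex optimization problem. In the standard hard-margin formulation (with labels $y_i \in \{-1, +1\}$):

$$\min_{\mathbf{w},\,b}\ \frac{1}{2}\lVert\mathbf{w}\rVert^2 \quad \text{subject to} \quad y_i(\mathbf{w}^\top\mathbf{x}_i + b) \ge 1,\ i = 1, \dots, n$$

The width of the margin is $2/\lVert\mathbf{w}\rVert$, so minimizing $\lVert\mathbf{w}\rVert$ maximizes the margin, and only the points that satisfy the constraint with equality (the support vectors) determine the solution.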

8.1.1 Core Concepts

  • Hyperplane: an (n-1)-dimensional subspace that separates data in n-dimensional space
  • Support vectors: the data points closest to the decision boundary
  • Margin: the distance from the support vectors to the decision boundary
  • Kernel function: a function that implicitly maps data into a higher-dimensional space

8.1.2 Strengths of SVMs

  • Effective in high dimensions: remains effective even when the number of features is large
  • Memory-efficient: predictions use only the support vectors
  • Flexible: different kernel functions handle nonlinear problems
  • Strong generalization: grounded in the principle of structural risk minimization

8.1.3 Weaknesses of SVMs

  • Slow to train on large datasets: high time complexity
  • Sensitive to noise: outliers can shift the decision boundary
  • Requires feature scaling: sensitive to feature magnitudes
  • No native probability output: predicted probabilities are not provided directly (see the sketch below)
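
The last limitation can be partially worked around in scikit-learn: constructing the classifier with probability=True fits a probability calibration (Platt scaling) on top of the decision function, at extra training cost. A minimal sketch on a small synthetic dataset (chosen here only to illustrate the API):

python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Small synthetic problem, just to demonstrate predict_proba
X, y = make_classification(n_samples=200, n_features=4, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

# probability=True enables Platt scaling via internal cross-validation;
# training is slower, and predict_proba can disagree slightly with predict
clf = SVC(kernel='rbf', probability=True, random_state=42).fit(X_tr, y_tr)
print(clf.predict_proba(X_te[:3]))  # per-class probability estimates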

8.2 Setting Up the Environment and Data

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_circles, make_moons, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, validation_curve
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, r2_score, roc_curve, auc
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Set a random seed for reproducibility
np.random.seed(42)

# Configure plot styling
plt.style.use('seaborn-v0_8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # CJK-capable font; only needed if labels contain CJK text
plt.rcParams['axes.unicode_minus'] = False    # render minus signs correctly with this font

8.3 Linear SVMs

8.3.1 Linearly Separable Data

python
# Create linearly separable data
def create_linearly_separable_data():
    """Create a linearly separable binary classification dataset"""
    np.random.seed(42)
    
    # Class 1 (labeled 1)
    class1_x = np.random.normal(2, 0.5, 50)
    class1_y = np.random.normal(2, 0.5, 50)
    
    # Class 2 (labeled 0)
    class2_x = np.random.normal(-2, 0.5, 50)
    class2_y = np.random.normal(-2, 0.5, 50)
    
    X = np.vstack([np.column_stack([class1_x, class1_y]), 
                   np.column_stack([class2_x, class2_y])])
    y = np.hstack([np.ones(50), np.zeros(50)])
    
    return X, y

X_linear, y_linear = create_linearly_separable_data()

# Visualize the data
plt.figure(figsize=(10, 8))
colors = ['red', 'blue']
for i, color in enumerate(colors):
    mask = y_linear == i
    plt.scatter(X_linear[mask, 0], X_linear[mask, 1], 
                c=color, label=f'Class {i}', alpha=0.7, s=50)

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Linearly Separable Data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Data shape: {X_linear.shape}")
print(f"Class distribution: {np.bincount(y_linear.astype(int))}")

8.3.2 Training a Linear SVM

python
# Split the data
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(
    X_linear, y_linear, test_size=0.2, random_state=42, stratify=y_linear
)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_linear)
X_test_scaled = scaler.transform(X_test_linear)

# Create a linear SVM classifier
linear_svm = SVC(kernel='linear', C=1.0, random_state=42)
linear_svm.fit(X_train_scaled, y_train_linear)

# Predict
y_pred_linear = linear_svm.predict(X_test_scaled)

# Evaluate
accuracy_linear = accuracy_score(y_test_linear, y_pred_linear)
print(f"Linear SVM accuracy: {accuracy_linear:.4f}")

print("\nDetailed classification report:")
print(classification_report(y_test_linear, y_pred_linear))

# Inspect the support vectors
print(f"\nSupport vectors per class: {linear_svm.n_support_}")
print(f"Total support vectors: {len(linear_svm.support_)}")
print(f"Support vector fraction: {len(linear_svm.support_) / len(X_train_scaled) * 100:.2f}%")

8.3.3 Visualizing the Decision Boundary and Support Vectors

python
def plot_svm_decision_boundary(X, y, model, scaler=None, title="SVM decision boundary"):
    """Plot the SVM decision boundary, margins, and support vectors"""
    plt.figure(figsize=(12, 8))
    
    # Build a mesh over the original (unscaled) feature space
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    
    if scaler is not None:
        # The model was trained on scaled features, so scale the grid before
        # predicting, and map the support vectors back to the original space
        grid_for_model = scaler.transform(grid_points)
        support_vectors = scaler.inverse_transform(model.support_vectors_)
    else:
        grid_for_model = grid_points
        support_vectors = model.support_vectors_
    
    # Predicted classes over the grid
    Z = model.predict(grid_for_model).reshape(xx.shape)
    
    # Decision-function values (used to draw the margins)
    decision_values = model.decision_function(grid_for_model).reshape(xx.shape)
    
    # Shade the decision regions
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    
    # Draw the decision boundary (solid) and the margins (dashed)
    plt.contour(xx, yy, decision_values, levels=[-1, 0, 1], 
                colors=['red', 'black', 'red'], linestyles=['--', '-', '--'],
                linewidths=[2, 3, 2])
    
    # Plot the data points
    colors = ['red', 'blue']
    for i, color in enumerate(colors):
        mask = y == i
        plt.scatter(X[mask, 0], X[mask, 1], 
                   c=color, label=f'Class {i}', alpha=0.7, s=50)
    
    # Highlight the support vectors
    plt.scatter(support_vectors[:, 0], support_vectors[:, 1], 
                s=200, facecolors='none', edgecolors='black', 
                linewidths=2, label='Support vectors')
    
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Plot the linear SVM's decision boundary
plot_svm_decision_boundary(X_train_linear, y_train_linear, linear_svm, scaler, 
                          "Linear SVM Decision Boundary and Support Vectors")

8.3.4 The Effect of the C Parameter
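
Real data are rarely perfectly separable, so scikit-learn actually solves the soft-margin problem, which is where C enters. In the standard formulation with slack variables $\xi_i$:

$$\min_{\mathbf{w},\,b,\,\boldsymbol{\xi}}\ \frac{1}{2}\lVert\mathbf{w}\rVert^2 + C\sum_{i=1}^{n}\xi_i \quad \text{subject to} \quad y_i(\mathbf{w}^\top\mathbf{x}_i + b) \ge 1 - \xi_i,\ \xi_i \ge 0$$

A small C tolerates margin violations (wider margin, more support vectors), while a large C penalizes them heavily (narrower margin, tighter fit to the training data). The experiment below makes this trade-off visible.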

python
# Compare the effect of different C values
C_values = [0.1, 1, 10, 100]
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Effect of Different C Values on a Linear SVM', fontsize=16)

for i, C in enumerate(C_values):
    row = i // 2
    col = i % 2
    
    # Train the SVM
    svm_c = SVC(kernel='linear', C=C, random_state=42)
    svm_c.fit(X_train_scaled, y_train_linear)
    
    # Predict
    y_pred_c = svm_c.predict(X_test_scaled)
    accuracy_c = accuracy_score(y_test_linear, y_pred_c)
    
    # Build a grid for visualization
    h = 0.02
    x_min, x_max = X_train_linear[:, 0].min() - 1, X_train_linear[:, 0].max() + 1
    y_min, y_max = X_train_linear[:, 1].min() - 1, X_train_linear[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                       np.arange(y_min, y_max, h))
    
    grid_points_scaled = scaler.transform(np.c_[xx.ravel(), yy.ravel()])
    Z = svm_c.predict(grid_points_scaled)
    Z = Z.reshape(xx.shape)
    
    decision_values = svm_c.decision_function(grid_points_scaled)
    decision_values = decision_values.reshape(xx.shape)
    
    # Plot
    axes[row, col].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    axes[row, col].contour(xx, yy, decision_values, levels=[-1, 0, 1], 
                          colors=['red', 'black', 'red'], 
                          linestyles=['--', '-', '--'], linewidths=[1, 2, 1])
    
    # Plot the data points
    for j, color in enumerate(['red', 'blue']):
        mask = y_train_linear == j
        axes[row, col].scatter(X_train_linear[mask, 0], X_train_linear[mask, 1], 
                              c=color, alpha=0.7, s=30)
    
    # Support vectors
    support_vectors = scaler.inverse_transform(svm_c.support_vectors_)
    axes[row, col].scatter(support_vectors[:, 0], support_vectors[:, 1], 
                          s=100, facecolors='none', edgecolors='black', linewidths=1)
    
    axes[row, col].set_title(f'C={C}, accuracy={accuracy_c:.3f}, SVs={len(svm_c.support_)}')
    axes[row, col].set_xlabel('Feature 1')
    axes[row, col].set_ylabel('Feature 2')
    axes[row, col].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summarize the effect of C on performance
print("Effect of C on the model:")
print("C\tAccuracy\t#SV")
print("-" * 30)
for C in C_values:
    svm_c = SVC(kernel='linear', C=C, random_state=42)
    svm_c.fit(X_train_scaled, y_train_linear)
    y_pred_c = svm_c.predict(X_test_scaled)
    accuracy_c = accuracy_score(y_test_linear, y_pred_c)
    print(f"{C}\t{accuracy_c:.4f}\t{len(svm_c.support_)}")

8.4 Nonlinear SVMs and Kernel Functions
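
The idea behind the kernel trick: rather than explicitly mapping data through a feature map $\phi$, a kernel computes inner products in the mapped space directly, $K(\mathbf{x}, \mathbf{x}') = \phi(\mathbf{x})^\top\phi(\mathbf{x}')$. The kernels compared in this section, as parameterized in scikit-learn:

$$K_{\text{linear}}(\mathbf{x},\mathbf{x}') = \mathbf{x}^\top\mathbf{x}', \qquad K_{\text{poly}}(\mathbf{x},\mathbf{x}') = (\gamma\,\mathbf{x}^\top\mathbf{x}' + r)^d$$

$$K_{\text{rbf}}(\mathbf{x},\mathbf{x}') = \exp\bigl(-\gamma\lVert\mathbf{x}-\mathbf{x}'\rVert^2\bigr), \qquad K_{\text{sigmoid}}(\mathbf{x},\mathbf{x}') = \tanh(\gamma\,\mathbf{x}^\top\mathbf{x}' + r)$$

where $\gamma$ (gamma), $r$ (coef0), and $d$ (degree) are the kernel hyperparameters.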

8.4.1 Nonlinear Data

python
# Create nonlinear datasets
X_circles, y_circles = make_circles(n_samples=200, noise=0.1, factor=0.3, random_state=42)
X_moons, y_moons = make_moons(n_samples=200, noise=0.1, random_state=42)

# Visualize the nonlinear data
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Concentric circles
for i, color in enumerate(['red', 'blue']):
    mask = y_circles == i
    axes[0].scatter(X_circles[mask, 0], X_circles[mask, 1], 
                   c=color, label=f'Class {i}', alpha=0.7)
axes[0].set_title('Concentric Circles')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Two moons
for i, color in enumerate(['red', 'blue']):
    mask = y_moons == i
    axes[1].scatter(X_moons[mask, 0], X_moons[mask, 1], 
                   c=color, label=f'Class {i}', alpha=0.7)
axes[1].set_title('Two Moons')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

8.4.2 Comparing Kernel Functions

python
# Compare kernel functions on the nonlinear datasets
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
datasets = [('Circles', X_circles, y_circles), ('Moons', X_moons, y_moons)]

for dataset_name, X_data, y_data in datasets:
    print(f"\nKernel performance on the {dataset_name} dataset:")
    print("Kernel\t\tAccuracy\t#SV")
    print("-" * 40)
    
    # Preprocess the data
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=42, stratify=y_data
    )
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Evaluate each kernel
    kernel_results = {}
    
    for kernel in kernels:
        if kernel == 'poly':
            svm = SVC(kernel=kernel, degree=3, C=1.0, random_state=42)
        else:
            svm = SVC(kernel=kernel, C=1.0, random_state=42)
        
        svm.fit(X_train_scaled, y_train)
        y_pred = svm.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        
        kernel_results[kernel] = {
            'accuracy': accuracy,
            'n_support': len(svm.support_),
            'model': svm
        }
        
        print(f"{kernel}\t\t{accuracy:.4f}\t\t{len(svm.support_)}")
    
    # Visualize the decision boundary of each kernel
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'{dataset_name} Dataset - Decision Boundaries by Kernel', fontsize=16)
    
    for i, kernel in enumerate(kernels):
        row = i // 2
        col = i % 2
        
        model = kernel_results[kernel]['model']
        accuracy = kernel_results[kernel]['accuracy']
        n_support = kernel_results[kernel]['n_support']
        
        # Build a grid
        h = 0.02
        x_min, x_max = X_data[:, 0].min() - 0.5, X_data[:, 0].max() + 0.5
        y_min, y_max = X_data[:, 1].min() - 0.5, X_data[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                           np.arange(y_min, y_max, h))
        
        # Predict on the grid points (scaled to match training)
        grid_points = scaler.transform(np.c_[xx.ravel(), yy.ravel()])
        Z = model.predict(grid_points)
        Z = Z.reshape(xx.shape)
        
        # Shade the decision regions
        axes[row, col].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
        
        # Plot the data points
        for j, color in enumerate(['red', 'blue']):
            mask = y_data == j
            axes[row, col].scatter(X_data[mask, 0], X_data[mask, 1], 
                                 c=color, alpha=0.7, s=30)
        
        # Support vectors
        support_vectors = scaler.inverse_transform(model.support_vectors_)
        axes[row, col].scatter(support_vectors[:, 0], support_vectors[:, 1], 
                             s=100, facecolors='none', edgecolors='black', linewidths=1)
        
        axes[row, col].set_title(f'{kernel} kernel (accuracy={accuracy:.3f}, SVs={n_support})')
        axes[row, col].set_xlabel('Feature 1')
        axes[row, col].set_ylabel('Feature 2')
        axes[row, col].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

8.4.3 Tuning the RBF Kernel
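
Before tuning, it helps to see what gamma means. Writing the RBF kernel with $\gamma = 1/(2\sigma^2)$:

$$K(\mathbf{x},\mathbf{x}') = \exp\!\left(-\frac{\lVert\mathbf{x}-\mathbf{x}'\rVert^2}{2\sigma^2}\right)$$

gamma is the inverse squared width of a Gaussian centered on each support vector: small gamma means wide, smooth influence (simpler boundaries, possible underfitting), while large gamma means narrow, local influence (wiggly boundaries, possible overfitting).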

python
# Effect of the RBF kernel's gamma parameter
def analyze_rbf_parameters():
    """Analyze the effect of the RBF kernel's gamma parameter"""
    
    # Use the circles dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X_circles, y_circles, test_size=0.2, random_state=42, stratify=y_circles
    )
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Gamma values to compare
    gamma_values = [0.01, 0.1, 1, 10]
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Effect of Different Gamma Values (RBF Kernel)', fontsize=16)
    
    print("Effect of the RBF gamma parameter:")
    print("gamma\tAccuracy\t#SV")
    print("-" * 30)
    
    for i, gamma in enumerate(gamma_values):
        row = i // 2
        col = i % 2
        
        # Train the SVM
        svm_rbf = SVC(kernel='rbf', gamma=gamma, C=1.0, random_state=42)
        svm_rbf.fit(X_train_scaled, y_train)
        
        # Predict
        y_pred = svm_rbf.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        
        print(f"{gamma}\t{accuracy:.4f}\t{len(svm_rbf.support_)}")
        
        # Visualize
        h = 0.02
        x_min, x_max = X_circles[:, 0].min() - 0.5, X_circles[:, 0].max() + 0.5
        y_min, y_max = X_circles[:, 1].min() - 0.5, X_circles[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                           np.arange(y_min, y_max, h))
        
        grid_points = scaler.transform(np.c_[xx.ravel(), yy.ravel()])
        Z = svm_rbf.predict(grid_points)
        Z = Z.reshape(xx.shape)
        
        axes[row, col].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
        
        # Plot the data points
        for j, color in enumerate(['red', 'blue']):
            mask = y_circles == j
            axes[row, col].scatter(X_circles[mask, 0], X_circles[mask, 1], 
                                 c=color, alpha=0.7, s=30)
        
        # Support vectors
        support_vectors = scaler.inverse_transform(svm_rbf.support_vectors_)
        axes[row, col].scatter(support_vectors[:, 0], support_vectors[:, 1], 
                             s=100, facecolors='none', edgecolors='black', linewidths=1)
        
        axes[row, col].set_title(f'gamma={gamma} (accuracy={accuracy:.3f}, SVs={len(svm_rbf.support_)})')
        axes[row, col].set_xlabel('Feature 1')
        axes[row, col].set_ylabel('Feature 2')
        axes[row, col].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

analyze_rbf_parameters()

8.5 SVM Regression
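
Support vector regression (SVR) inverts the classification picture: instead of maximizing a margin that keeps points out, it fits a tube of radius ε around the regression function and penalizes only the points that fall outside it. The standard ε-insensitive loss is

$$L_\varepsilon\bigl(y, f(\mathbf{x})\bigr) = \max\bigl(0,\ \lvert y - f(\mathbf{x})\rvert - \varepsilon\bigr)$$

Points inside the tube contribute no loss and do not become support vectors; C again trades off the flatness of $f$ against tube violations.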

8.5.1 Linear Regression

python
# Create regression data
np.random.seed(42)
X_reg = np.linspace(0, 10, 100).reshape(-1, 1)
y_reg = 2 * X_reg.ravel() + 1 + 0.5 * np.random.randn(100)

# Split the data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train a linear SVR
linear_svr = SVR(kernel='linear', C=1.0, epsilon=0.1)
linear_svr.fit(X_train_reg, y_train_reg)

# Predict
y_pred_reg = linear_svr.predict(X_test_reg)

# Evaluate
r2 = r2_score(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

print(f"Linear SVR performance:")
print(f"R² score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Support vectors: {len(linear_svr.support_)}")

# Visualize the regression results
plt.figure(figsize=(12, 8))

# Plot the data points
plt.scatter(X_train_reg, y_train_reg, alpha=0.6, label='Training data', color='blue')
plt.scatter(X_test_reg, y_test_reg, alpha=0.6, label='Test data', color='green')

# Plot the regression line
X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
y_plot = linear_svr.predict(X_plot)
plt.plot(X_plot, y_plot, color='red', linewidth=2, label='SVR prediction')

# Draw the epsilon tube
epsilon = 0.1
plt.fill_between(X_plot.ravel(), y_plot - epsilon, y_plot + epsilon, 
                alpha=0.2, color='red', label=f'ε-tube (ε={epsilon})')

# Highlight the support vectors
support_vectors_x = X_train_reg[linear_svr.support_]
support_vectors_y = y_train_reg[linear_svr.support_]
plt.scatter(support_vectors_x, support_vectors_y, s=200, 
           facecolors='none', edgecolors='black', linewidths=2, label='Support vectors')

plt.xlabel('X')
plt.ylabel('y')
plt.title(f'Linear SVR (R²={r2:.3f}, SVs={len(linear_svr.support_)})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

8.5.2 Nonlinear Regression

python
# Create nonlinear regression data
np.random.seed(42)
X_nonlinear = np.linspace(0, 4*np.pi, 100).reshape(-1, 1)
y_nonlinear = np.sin(X_nonlinear.ravel()) + 0.1 * np.random.randn(100)

X_train_nl, X_test_nl, y_train_nl, y_test_nl = train_test_split(
    X_nonlinear, y_nonlinear, test_size=0.2, random_state=42
)

# Compare SVR with different kernels
svr_kernels = ['linear', 'poly', 'rbf']
svr_results = {}

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for i, kernel in enumerate(svr_kernels):
    if kernel == 'poly':
        svr = SVR(kernel=kernel, degree=3, C=1.0, epsilon=0.1)
    else:
        svr = SVR(kernel=kernel, C=1.0, epsilon=0.1)
    
    svr.fit(X_train_nl, y_train_nl)
    y_pred_nl = svr.predict(X_test_nl)
    
    r2_nl = r2_score(y_test_nl, y_pred_nl)
    rmse_nl = np.sqrt(mean_squared_error(y_test_nl, y_pred_nl))
    
    svr_results[kernel] = {'r2': r2_nl, 'rmse': rmse_nl, 'n_support': len(svr.support_)}
    
    # Visualize
    axes[i].scatter(X_train_nl, y_train_nl, alpha=0.6, label='Training data', s=20)
    axes[i].scatter(X_test_nl, y_test_nl, alpha=0.6, label='Test data', color='green', s=20)
    
    # Prediction curve
    X_plot_nl = np.linspace(0, 4*np.pi, 200).reshape(-1, 1)
    y_plot_nl = svr.predict(X_plot_nl)
    axes[i].plot(X_plot_nl, y_plot_nl, color='red', linewidth=2, label='SVR prediction')
    
    # Support vectors
    support_vectors_x = X_train_nl[svr.support_]
    support_vectors_y = y_train_nl[svr.support_]
    axes[i].scatter(support_vectors_x, support_vectors_y, s=100, 
                   facecolors='none', edgecolors='black', linewidths=1, label='Support vectors')
    
    axes[i].set_xlabel('X')
    axes[i].set_ylabel('y')
    axes[i].set_title(f'{kernel} kernel SVR\nR²={r2_nl:.3f}, SVs={len(svr.support_)}')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Performance comparison
print("SVR performance by kernel:")
print("Kernel\t\tR²\t\tRMSE\t\t#SV")
print("-" * 50)
for kernel, results in svr_results.items():
    print(f"{kernel}\t\t{results['r2']:.4f}\t\t{results['rmse']:.4f}\t\t{results['n_support']}")

8.6 Hyperparameter Tuning

8.6.1 Grid Search

python
# Hyperparameter tuning on the breast cancer dataset
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target

X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(
    X_cancer, y_cancer, test_size=0.2, random_state=42, stratify=y_cancer
)

# Build a pipeline (standardization + SVM)
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=42))
])

# Define the parameter grid
param_grid = [
    {
        'svm__kernel': ['linear'],
        'svm__C': [0.1, 1, 10, 100]
    },
    {
        'svm__kernel': ['rbf'],
        'svm__C': [0.1, 1, 10, 100],
        'svm__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
    },
    {
        'svm__kernel': ['poly'],
        'svm__C': [0.1, 1, 10],
        'svm__degree': [2, 3, 4],
        'svm__gamma': ['scale', 'auto']
    }
]

# Grid search
print("Running the SVM hyperparameter grid search...")
grid_search = GridSearchCV(
    svm_pipeline,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_cancer, y_train_cancer)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Evaluate the best model on the test set
best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test_cancer)
test_accuracy = accuracy_score(y_test_cancer, y_pred_best)
print(f"Test accuracy: {test_accuracy:.4f}")

# Detailed evaluation
print("\nDetailed evaluation of the best SVM model:")
print(classification_report(y_test_cancer, y_pred_best, 
                          target_names=['malignant', 'benign']))
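
Beyond the single best model, GridSearchCV records every candidate's scores in cv_results_, which helps reveal how sensitive performance is to each parameter. A minimal sketch continuing from grid_search above:

python
# Rank all candidates by mean cross-validation score
results_df = pd.DataFrame(grid_search.cv_results_)
top5 = results_df.sort_values('rank_test_score').head(5)
print(top5[['params', 'mean_test_score', 'std_test_score']])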

8.6.2 Validation Curve Analysis

python
# Plot validation curves for C and gamma
def plot_validation_curve_svm():
    """Plot validation curves for the SVM"""
    
    # Use standardized data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_cancer)
    
    # Validation curve for C
    C_range = np.logspace(-3, 3, 10)
    train_scores, val_scores = validation_curve(
        SVC(kernel='rbf', gamma='scale', random_state=42),
        X_train_scaled, y_train_cancer,
        param_name='C', param_range=C_range,
        cv=5, scoring='accuracy', n_jobs=-1
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    plt.figure(figsize=(12, 5))
    
    # C validation curve
    plt.subplot(1, 2, 1)
    plt.semilogx(C_range, train_mean, 'o-', color='blue', label='Training score')
    plt.fill_between(C_range, train_mean - train_std, train_mean + train_std, 
                     alpha=0.1, color='blue')
    
    plt.semilogx(C_range, val_mean, 'o-', color='red', label='Validation score')
    plt.fill_between(C_range, val_mean - val_std, val_mean + val_std, 
                     alpha=0.1, color='red')
    
    plt.xlabel('C')
    plt.ylabel('Accuracy')
    plt.title('SVM Validation Curve for C')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Validation curve for gamma
    gamma_range = np.logspace(-4, 1, 10)
    train_scores_gamma, val_scores_gamma = validation_curve(
        SVC(kernel='rbf', C=1.0, random_state=42),
        X_train_scaled, y_train_cancer,
        param_name='gamma', param_range=gamma_range,
        cv=5, scoring='accuracy', n_jobs=-1
    )
    
    train_mean_gamma = np.mean(train_scores_gamma, axis=1)
    train_std_gamma = np.std(train_scores_gamma, axis=1)
    val_mean_gamma = np.mean(val_scores_gamma, axis=1)
    val_std_gamma = np.std(val_scores_gamma, axis=1)
    
    plt.subplot(1, 2, 2)
    plt.semilogx(gamma_range, train_mean_gamma, 'o-', color='blue', label='Training score')
    plt.fill_between(gamma_range, train_mean_gamma - train_std_gamma, 
                     train_mean_gamma + train_std_gamma, alpha=0.1, color='blue')
    
    plt.semilogx(gamma_range, val_mean_gamma, 'o-', color='red', label='Validation score')
    plt.fill_between(gamma_range, val_mean_gamma - val_std_gamma, 
                     val_mean_gamma + val_std_gamma, alpha=0.1, color='red')
    
    plt.xlabel('gamma')
    plt.ylabel('Accuracy')
    plt.title('SVM Validation Curve for gamma')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

plot_validation_curve_svm()

8.7 Practical Applications

8.7.1 Text Classification Example

python
# Create a simple text classification dataset
from sklearn.feature_extraction.text import TfidfVectorizer

# Example text data
texts = [
    "Machine learning is an important branch of artificial intelligence",
    "Deep learning performs impressively on image recognition",
    "Support vector machines are a classic classification algorithm",
    "The weather is lovely today, perfect for a walk outside",
    "The plot of this movie is wonderfully engaging",
    "The restaurant's food is tasty and the service is great too",
    "Neural networks can handle complex nonlinear problems",
    "Data preprocessing is a key step in machine learning",
    "Natural language processing technology is advancing rapidly",
    "A bright sunny afternoon lifts the mood",
    "This book is very interesting and well worth recommending",
    "The mall is crowded and offers a rich variety of goods"
]

# Labels: 0 = technology, 1 = everyday life
labels = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]

# Vectorize the text
vectorizer = TfidfVectorizer(max_features=100, stop_words=None)
X_text = vectorizer.fit_transform(texts).toarray()

print(f"Text feature matrix shape: {X_text.shape}")
print(f"Class distribution: {np.bincount(labels)}")

# Split the data
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, labels, test_size=0.3, random_state=42, stratify=labels
)

# Train an SVM text classifier
text_svm = SVC(kernel='linear', C=1.0, random_state=42)
text_svm.fit(X_train_text, y_train_text)

# Predict
y_pred_text = text_svm.predict(X_test_text)
accuracy_text = accuracy_score(y_test_text, y_pred_text)

print(f"\nText classification SVM accuracy: {accuracy_text:.4f}")
print(f"Support vectors: {len(text_svm.support_)}")

# Extract the most informative feature words
feature_names = vectorizer.get_feature_names_out()
coef = text_svm.coef_[0]

# Positive weights push the decision toward class 1 (everyday life),
# negative weights toward class 0 (technology)
top_positive = np.argsort(coef)[-10:]
top_negative = np.argsort(coef)[:10]

print("\nMost informative everyday-life words:")
for idx in reversed(top_positive):
    print(f"{feature_names[idx]}: {coef[idx]:.4f}")

print("\nMost informative technology words:")
for idx in top_negative:
    print(f"{feature_names[idx]}: {coef[idx]:.4f}")

8.7.2 Handling High-Dimensional Data

python
# Create a high-dimensional dataset
X_high_dim, y_high_dim = make_classification(
    n_samples=1000,
    n_features=1000,
    n_informative=100,
    n_redundant=50,
    n_clusters_per_class=1,
    random_state=42
)

print(f"High-dimensional data shape: {X_high_dim.shape}")

# Split the data
X_train_hd, X_test_hd, y_train_hd, y_test_hd = train_test_split(
    X_high_dim, y_high_dim, test_size=0.2, random_state=42, stratify=y_high_dim
)

# Compare linear SVM and RBF SVM on high-dimensional data
import time

models = {
    'Linear SVM': SVC(kernel='linear', C=1.0, random_state=42),
    'RBF SVM': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    'LinearSVC': LinearSVC(C=1.0, random_state=42, max_iter=1000)  # solver optimized for linear SVMs
}

print("\nSVM performance on high-dimensional data:")
print("Model\t\tTrain time\tPredict time\tAccuracy\t#SV")
print("-" * 70)

# Standardize the data
scaler = StandardScaler()
X_train_hd_scaled = scaler.fit_transform(X_train_hd)
X_test_hd_scaled = scaler.transform(X_test_hd)

for name, model in models.items():
    # Training time
    start_time = time.time()
    model.fit(X_train_hd_scaled, y_train_hd)
    train_time = time.time() - start_time
    
    # Prediction time
    start_time = time.time()
    y_pred_hd = model.predict(X_test_hd_scaled)
    pred_time = time.time() - start_time
    
    # Accuracy
    accuracy_hd = accuracy_score(y_test_hd, y_pred_hd)
    
    # Support vector count (LinearSVC does not expose support_)
    if hasattr(model, 'support_'):
        n_support = len(model.support_)
    else:
        n_support = "N/A"
    
    print(f"{name}\t\t{train_time:.3f}s\t\t{pred_time:.3f}s\t\t{accuracy_hd:.4f}\t\t{n_support}")

# Visualize training time vs. accuracy
model_names = list(models.keys())
train_times = []
accuracies = []

for name, model in models.items():
    start_time = time.time()
    model.fit(X_train_hd_scaled, y_train_hd)
    train_time = time.time() - start_time
    
    y_pred = model.predict(X_test_hd_scaled)
    accuracy = accuracy_score(y_test_hd, y_pred)
    
    train_times.append(train_time)
    accuracies.append(accuracy)

plt.figure(figsize=(10, 6))
colors = ['blue', 'red', 'green']
for i, (name, train_time, accuracy) in enumerate(zip(model_names, train_times, accuracies)):
    plt.scatter(train_time, accuracy, s=200, c=colors[i], alpha=0.7, label=name)
    plt.annotate(name, (train_time, accuracy), xytext=(5, 5), 
                textcoords='offset points', fontsize=10)

plt.xlabel('Training time (s)')
plt.ylabel('Accuracy')
plt.title('High-Dimensional Data: Training Time vs. Accuracy')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
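
When the sample count rather than the feature count becomes the bottleneck, kernel SVC's roughly quadratic training cost bites. A common workaround is a linear model trained by stochastic gradient descent, which optimizes the same hinge loss as a linear SVM. A hedged sketch, assuming the high-dimensional variables above are still in scope:

python
from sklearn.linear_model import SGDClassifier

# loss='hinge' makes this a linear SVM fitted by SGD; it scales to very large n
sgd_svm = SGDClassifier(loss='hinge', alpha=1e-4, random_state=42)
sgd_svm.fit(X_train_hd_scaled, y_train_hd)
print(f"SGD linear SVM accuracy: {sgd_svm.score(X_test_hd_scaled, y_test_hd):.4f}")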

8.8 Summary of SVM Strengths and Weaknesses

8.8.1 Performance Comparison

python
def comprehensive_svm_comparison():
    """Comprehensively compare SVMs with other algorithms"""
    
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    
    # Use the wine dataset
    wine = load_wine()
    X_wine, y_wine = wine.data, wine.target
    
    X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
        X_wine, y_wine, test_size=0.2, random_state=42, stratify=y_wine
    )
    
    # Standardize
    scaler = StandardScaler()
    X_train_wine_scaled = scaler.fit_transform(X_train_wine)
    X_test_wine_scaled = scaler.transform(X_test_wine)
    
    # Define the algorithms
    algorithms = {
        'SVM (RBF)': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
        'SVM (linear)': SVC(kernel='linear', C=1.0, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'k-NN': KNeighborsClassifier(n_neighbors=5)
    }
    
    results = {}
    
    print("Comprehensive algorithm comparison:")
    print("Algorithm\t\tTrain time\tAccuracy\tCV score")
    print("-" * 60)
    
    for name, algorithm in algorithms.items():
        # Scale-sensitive models use standardized features; the tree ensemble uses raw data
        start_time = time.time()
        if name != 'Random Forest':
            algorithm.fit(X_train_wine_scaled, y_train_wine)
            y_pred = algorithm.predict(X_test_wine_scaled)
            cv_scores = cross_val_score(algorithm, X_train_wine_scaled, y_train_wine, cv=5)
        else:
            algorithm.fit(X_train_wine, y_train_wine)
            y_pred = algorithm.predict(X_test_wine)
            cv_scores = cross_val_score(algorithm, X_train_wine, y_train_wine, cv=5)
        
        train_time = time.time() - start_time
        
        # Performance metrics
        accuracy = accuracy_score(y_test_wine, y_pred)
        cv_mean = np.mean(cv_scores)
        
        results[name] = {
            'train_time': train_time,
            'accuracy': accuracy,
            'cv_score': cv_mean
        }
        
        print(f"{name}\t{train_time:.4f}s\t\t{accuracy:.4f}\t\t{cv_mean:.4f}")
    
    # Visual comparison
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    
    names = list(results.keys())
    train_times = [results[name]['train_time'] for name in names]
    accuracies = [results[name]['accuracy'] for name in names]
    cv_scores = [results[name]['cv_score'] for name in names]
    
    # Training time
    axes[0].bar(names, train_times, color='skyblue', alpha=0.7)
    axes[0].set_title('Training Time')
    axes[0].set_ylabel('Time (s)')
    axes[0].tick_params(axis='x', rotation=45)
    
    # Accuracy
    axes[1].bar(names, accuracies, color='lightgreen', alpha=0.7)
    axes[1].set_title('Test Accuracy')
    axes[1].set_ylabel('Accuracy')
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].set_ylim(0.8, 1.0)
    
    # Cross-validation score
    axes[2].bar(names, cv_scores, color='lightcoral', alpha=0.7)
    axes[2].set_title('Cross-Validation Score')
    axes[2].set_ylabel('CV score')
    axes[2].tick_params(axis='x', rotation=45)
    axes[2].set_ylim(0.8, 1.0)
    
    plt.tight_layout()
    plt.show()
    
    return results

comparison_results = comprehensive_svm_comparison()

8.8.2 Usage Recommendations

python
def svm_usage_guide():
    """SVM usage guide"""
    
    print("SVM usage guidelines and best practices:")
    print("=" * 50)
    
    guidelines = {
        "Data preprocessing": [
            "Always standardize or normalize features",
            "Handle missing values and outliers",
            "Consider feature selection to reduce dimensionality"
        ],
        "Kernel selection": [
            "Linear kernel: linearly separable or high-dimensional sparse data",
            "RBF kernel: a solid default for most nonlinear problems",
            "Polynomial kernel: specific nonlinear relationships",
            "Custom kernels: specialized domain problems"
        ],
        "Parameter tuning": [
            "C controls regularization strength; select it with cross-validation",
            "gamma is the key RBF parameter; it governs decision-boundary complexity",
            "Use grid search or randomized search for tuning"
        ],
        "Good fits": [
            "High-dimensional data (e.g., text classification, gene expression)",
            "Small to medium-sized datasets",
            "Scenarios that require stable performance",
            "Binary classification (natively supported)"
        ],
        "Poor fits": [
            "Large datasets (>100k samples)",
            "Scenarios that need probability outputs",
            "Extremely latency-sensitive real-time prediction",
            "Very noisy data"
        ]
    }
    
    for category, items in guidelines.items():
        print(f"\n{category}:")
        for item in items:
            print(f"  • {item}")
    
    print("\n" + "=" * 50)
    print("Rules of thumb for parameter selection:")
    print("• C: start from 0.1, 1, 10, 100")
    print("• gamma: start from 'scale', 0.001, 0.01, 0.1, 1")
    print("• Evaluate parameter combinations with cross-validation")
    print("• Watch for overfitting: training accuracy far above validation accuracy")

svm_usage_guide()

8.9 Exercises

Exercise 1: Basic SVM

  1. Train a linear SVM classifier on the iris dataset
  2. Visualize the decision boundary and support vectors
  3. Analyze how different C values affect the model

Exercise 2: Kernel Comparison

  1. Create a complex nonlinear dataset
  2. Compare the performance of different kernels (linear, polynomial, RBF, sigmoid)
  3. Analyze the scenarios each kernel is suited to

Exercise 3: SVM Regression

  1. Train an SVR model on the California housing dataset (the Boston housing dataset has been removed from recent scikit-learn releases)
  2. Compare the regression performance of linear and RBF kernels
  3. Analyze how the epsilon parameter affects the model

Exercise 4: High-Dimensional Data

  1. Create a high-dimensional dataset (more than 1000 features)
  2. Compare SVMs with other algorithms on high-dimensional data
  3. Analyze how feature selection affects SVM performance

8.10 Summary

In this chapter we explored support vector machines in depth:

Core Concepts

  • SVM principles: maximum-margin classifiers, support vectors, hyperplanes
  • The kernel trick: mapping data into higher-dimensional spaces to handle nonlinear problems
  • Parameter tuning: the roles of C and gamma and how to choose them

Key Techniques

  • Linear SVMs: for linearly separable and approximately linearly separable problems
  • Nonlinear SVMs: kernel functions for complex nonlinear problems
  • SVM regression: the principles and applications of support vector regression
  • Hyperparameter optimization: grid search and cross-validation

Practical Skills

  • Data preprocessing: the importance of feature standardization
  • Kernel selection: choosing a kernel to match the data
  • Performance evaluation: assessing SVM models thoroughly
  • Real applications: text classification and high-dimensional data

Key Takeaways

  • SVMs excel on high-dimensional data and small to medium-sized datasets
  • Feature standardization is mandatory, and SVMs are sensitive to their parameters
  • The choice of kernel has a major impact on performance
  • Well suited to classification and regression tasks that demand stable performance

8.11 Next Steps

You now have a solid command of support vector machines. In the next chapter, Naive Bayes, we turn to probability-based classification and see how Bayes' theorem is applied in machine learning.


Chapter Checklist

  • ✅ Understood the mathematical principles and geometric intuition of SVMs
  • ✅ Implemented linear and nonlinear SVMs
  • ✅ Learned to choose and apply different kernel functions
  • ✅ Understood the principles and practice of SVM regression
  • ✅ Mastered SVM hyperparameter tuning
  • ✅ Able to apply SVMs sensibly to real problems
