Chapter 8: Support Vector Machines
The support vector machine (SVM) is one of the most powerful and elegant algorithms in machine learning. It solves classification and regression problems by finding an optimal separating hyperplane, and it performs particularly well on high-dimensional data and nonlinear problems.
8.1 What Is a Support Vector Machine?
The core idea of an SVM is to find an optimal decision boundary (a hyperplane) that maximizes the margin between the classes. This boundary is determined by only a handful of critical data points, the support vectors.
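Formally, for a weight vector $\mathbf{w}$ and bias $b$, the hyperplane is the set of points where $\mathbf{w}\cdot\mathbf{x} + b = 0$. With labels $y_i \in \{-1, +1\}$ scaled so that $y_i(\mathbf{w}\cdot\mathbf{x}_i + b) \ge 1$ for every training point, the margin width is $2/\lVert\mathbf{w}\rVert$, so the hard-margin SVM solves:

$$\min_{\mathbf{w},\,b}\ \frac{1}{2}\lVert\mathbf{w}\rVert^2 \quad \text{subject to} \quad y_i(\mathbf{w}\cdot\mathbf{x}_i + b) \ge 1,\quad i = 1,\dots,n$$

Only the points that meet the constraint with equality (the support vectors) determine the solution; removing any other point leaves the boundary unchanged.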
8.1.1 Core Concepts
- Hyperplane: an (n-1)-dimensional subspace that separates data in an n-dimensional space
- Support vectors: the data points closest to the decision boundary
- Margin: the distance from the support vectors to the decision boundary
- Kernel function: a function that implicitly maps data into a higher-dimensional space (see the sketch after this list)
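To make the kernel idea concrete, here is a minimal standalone sketch (not part of this chapter's later pipeline) showing that the degree-2 polynomial kernel $K(\mathbf{x}, \mathbf{z}) = (\mathbf{x}\cdot\mathbf{z})^2$ equals an ordinary inner product in a higher-dimensional feature space, without that space ever being constructed during the kernel evaluation:

```python
import numpy as np

def phi(v):
    """Explicit degree-2 feature map for a 2-D vector."""
    return np.array([v[0]**2, np.sqrt(2) * v[0] * v[1], v[1]**2])

x = np.array([1.0, 2.0])
z = np.array([3.0, 0.5])
print(np.dot(x, z) ** 2)       # kernel value, computed in the original 2-D space: 16.0
print(np.dot(phi(x), phi(z)))  # the same value via the explicit 3-D mapping: 16.0
```

This is exactly what a kernelized SVM does at scale: it works with K(x, z) directly and never materializes the mapping φ.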
8.1.2 Strengths of SVM
- Effective on high-dimensional data: remains effective even when there are many features
- Memory efficient: only the support vectors are needed for prediction
- Flexible: handles nonlinear problems through different kernel functions
- Strong generalization: grounded in the structural risk minimization principle
8.1.3 Weaknesses of SVM
- Slow to train on large datasets: high time complexity in the number of samples
- Sensitive to noise: outliers can shift the decision boundary
- Requires feature scaling: sensitive to the scale of the features
- No native probability output: prediction probabilities are not provided directly (see the sketch below)
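On the last point: scikit-learn can still attach probability estimates to an SVM via Platt scaling. A minimal sketch (passing probability=True makes SVC fit an extra internally cross-validated sigmoid, which slows training noticeably):

```python
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=4, random_state=42)
# probability=True enables predict_proba via cross-validated Platt scaling
clf = SVC(kernel='rbf', probability=True, random_state=42).fit(X, y)
print(clf.predict_proba(X[:3]))  # per-class probabilities for three samples
```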
8.2 Setting Up the Environment and Data
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_circles, make_moons, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, validation_curve
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
mean_squared_error, r2_score, roc_curve, auc
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
# Set the random seed
np.random.seed(42)
# Set the plotting style
plt.style.use('seaborn-v0_8')
```

8.3 Linear SVM
8.3.1 Linearly Separable Data
```python
# Create linearly separable data
def create_linearly_separable_data():
    """Create a linearly separable binary classification dataset."""
    np.random.seed(42)
    # Class 1
    class1_x = np.random.normal(2, 0.5, 50)
    class1_y = np.random.normal(2, 0.5, 50)
    # Class 2
    class2_x = np.random.normal(-2, 0.5, 50)
    class2_y = np.random.normal(-2, 0.5, 50)
    X = np.vstack([np.column_stack([class1_x, class1_y]),
                   np.column_stack([class2_x, class2_y])])
    y = np.hstack([np.ones(50), np.zeros(50)])
    return X, y
X_linear, y_linear = create_linearly_separable_data()
# Visualize the data
plt.figure(figsize=(10, 8))
colors = ['red', 'blue']
for i, color in enumerate(colors):
    mask = y_linear == i
    plt.scatter(X_linear[mask, 0], X_linear[mask, 1],
                c=color, label=f'Class {i}', alpha=0.7, s=50)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Linearly separable data')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print(f"Data shape: {X_linear.shape}")
print(f"Class distribution: {np.bincount(y_linear.astype(int))}")
```

8.3.2 Training a Linear SVM
```python
# Split the data
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(
    X_linear, y_linear, test_size=0.2, random_state=42, stratify=y_linear
)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_linear)
X_test_scaled = scaler.transform(X_test_linear)
# Create a linear SVM classifier
linear_svm = SVC(kernel='linear', C=1.0, random_state=42)
linear_svm.fit(X_train_scaled, y_train_linear)
# Predict
y_pred_linear = linear_svm.predict(X_test_scaled)
# Evaluate
accuracy_linear = accuracy_score(y_test_linear, y_pred_linear)
print(f"Linear SVM accuracy: {accuracy_linear:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_test_linear, y_pred_linear))
# Inspect the support vectors
print(f"\nSupport vectors per class: {linear_svm.n_support_}")
print(f"Total support vectors: {len(linear_svm.support_)}")
print(f"Share of training samples: {len(linear_svm.support_) / len(X_train_scaled) * 100:.2f}%")
```

8.3.3 Visualizing the Decision Boundary and Support Vectors
```python
def plot_svm_decision_boundary(X, y, model, scaler=None, title="SVM decision boundary"):
    """Plot an SVM's decision boundary, margins, and support vectors."""
    plt.figure(figsize=(12, 8))
    # Build a grid over the original (unscaled) input space
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    if scaler is not None:
        # The model was trained on standardized features, so the grid must be
        # scaled before prediction and the support vectors unscaled for plotting
        grid_for_model = scaler.transform(grid_points)
        support_vectors = scaler.inverse_transform(model.support_vectors_)
    else:
        grid_for_model = grid_points
        support_vectors = model.support_vectors_
    Z = model.predict(grid_for_model).reshape(xx.shape)
    # Decision-function values are used to draw the margins
    decision_values = model.decision_function(grid_for_model).reshape(xx.shape)
    # Shade the predicted regions
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    # Decision boundary (solid) and margins (dashed)
    plt.contour(xx, yy, decision_values, levels=[-1, 0, 1],
                colors=['red', 'black', 'red'], linestyles=['--', '-', '--'],
                linewidths=[2, 3, 2])
    # Plot the data points
    colors = ['red', 'blue']
    for i, color in enumerate(colors):
        mask = y == i
        plt.scatter(X[mask, 0], X[mask, 1],
                    c=color, label=f'Class {i}', alpha=0.7, s=50)
    # Highlight the support vectors
    plt.scatter(support_vectors[:, 0], support_vectors[:, 1],
                s=200, facecolors='none', edgecolors='black',
                linewidths=2, label='Support vectors')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Plot the linear SVM's decision boundary
plot_svm_decision_boundary(X_train_linear, y_train_linear, linear_svm, scaler,
                           "Linear SVM decision boundary and support vectors")
```

8.3.4 The Effect of the C Parameter
```python
# Compare the effect of different C values
C_values = [0.1, 1, 10, 100]
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Effect of different C values on a linear SVM', fontsize=16)
for i, C in enumerate(C_values):
    row = i // 2
    col = i % 2
    # Train the SVM
    svm_c = SVC(kernel='linear', C=C, random_state=42)
    svm_c.fit(X_train_scaled, y_train_linear)
    # Predict
    y_pred_c = svm_c.predict(X_test_scaled)
    accuracy_c = accuracy_score(y_test_linear, y_pred_c)
    # Build a grid for visualization
    h = 0.02
    x_min, x_max = X_train_linear[:, 0].min() - 1, X_train_linear[:, 0].max() + 1
    y_min, y_max = X_train_linear[:, 1].min() - 1, X_train_linear[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid_points_scaled = scaler.transform(np.c_[xx.ravel(), yy.ravel()])
    Z = svm_c.predict(grid_points_scaled)
    Z = Z.reshape(xx.shape)
    decision_values = svm_c.decision_function(grid_points_scaled)
    decision_values = decision_values.reshape(xx.shape)
    # Plot
    axes[row, col].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
    axes[row, col].contour(xx, yy, decision_values, levels=[-1, 0, 1],
                           colors=['red', 'black', 'red'],
                           linestyles=['--', '-', '--'], linewidths=[1, 2, 1])
    # Data points
    for j, color in enumerate(['red', 'blue']):
        mask = y_train_linear == j
        axes[row, col].scatter(X_train_linear[mask, 0], X_train_linear[mask, 1],
                               c=color, alpha=0.7, s=30)
    # Support vectors
    support_vectors = scaler.inverse_transform(svm_c.support_vectors_)
    axes[row, col].scatter(support_vectors[:, 0], support_vectors[:, 1],
                           s=100, facecolors='none', edgecolors='black', linewidths=1)
    axes[row, col].set_title(f'C={C}, accuracy={accuracy_c:.3f}, support vectors={len(svm_c.support_)}')
    axes[row, col].set_xlabel('Feature 1')
    axes[row, col].set_ylabel('Feature 2')
    axes[row, col].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Summarize the effect of C on performance
print("Effect of C on the model:")
print("C\tAccuracy\tSupport vectors")
print("-" * 30)
for C in C_values:
    svm_c = SVC(kernel='linear', C=C, random_state=42)
    svm_c.fit(X_train_scaled, y_train_linear)
    y_pred_c = svm_c.predict(X_test_scaled)
    accuracy_c = accuracy_score(y_test_linear, y_pred_c)
    print(f"{C}\t{accuracy_c:.4f}\t{len(svm_c.support_)}")
```

8.4 Nonlinear SVMs and Kernel Functions
8.4.1 Nonlinear Data
```python
# Create nonlinear datasets
X_circles, y_circles = make_circles(n_samples=200, noise=0.1, factor=0.3, random_state=42)
X_moons, y_moons = make_moons(n_samples=200, noise=0.1, random_state=42)
# Visualize the nonlinear data
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
# Concentric circles
for i, color in enumerate(['red', 'blue']):
    mask = y_circles == i
    axes[0].scatter(X_circles[mask, 0], X_circles[mask, 1],
                    c=color, label=f'Class {i}', alpha=0.7)
axes[0].set_title('Concentric circles')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
# Crescent moons
for i, color in enumerate(['red', 'blue']):
    mask = y_moons == i
    axes[1].scatter(X_moons[mask, 0], X_moons[mask, 1],
                    c=color, label=f'Class {i}', alpha=0.7)
axes[1].set_title('Crescent moons')
axes[1].set_xlabel('Feature 1')
axes[1].set_ylabel('Feature 2')
axes[1].legend()
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
```

8.4.2 Comparing Kernel Functions
```python
# Compare kernels on the nonlinear datasets
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
datasets = [('concentric circles', X_circles, y_circles), ('crescent moons', X_moons, y_moons)]
for dataset_name, X_data, y_data in datasets:
    print(f"\nKernel performance on the {dataset_name} dataset:")
    print("Kernel\t\tAccuracy\tSupport vectors")
    print("-" * 40)
    # Preprocess the data
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=42, stratify=y_data
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Try each kernel
    kernel_results = {}
    for kernel in kernels:
        if kernel == 'poly':
            svm = SVC(kernel=kernel, degree=3, C=1.0, random_state=42)
        else:
            svm = SVC(kernel=kernel, C=1.0, random_state=42)
        svm.fit(X_train_scaled, y_train)
        y_pred = svm.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        kernel_results[kernel] = {
            'accuracy': accuracy,
            'n_support': len(svm.support_),
            'model': svm
        }
        print(f"{kernel}\t\t{accuracy:.4f}\t\t{len(svm.support_)}")
    # Visualize each kernel's decision boundary
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'{dataset_name} dataset - decision boundaries by kernel', fontsize=16)
    for i, kernel in enumerate(kernels):
        row = i // 2
        col = i % 2
        model = kernel_results[kernel]['model']
        accuracy = kernel_results[kernel]['accuracy']
        n_support = kernel_results[kernel]['n_support']
        # Build a grid
        h = 0.02
        x_min, x_max = X_data[:, 0].min() - 0.5, X_data[:, 0].max() + 0.5
        y_min, y_max = X_data[:, 1].min() - 0.5, X_data[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        # Predict on the grid
        grid_points = scaler.transform(np.c_[xx.ravel(), yy.ravel()])
        Z = model.predict(grid_points)
        Z = Z.reshape(xx.shape)
        # Shade the predicted regions
        axes[row, col].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
        # Data points
        for j, color in enumerate(['red', 'blue']):
            mask = y_data == j
            axes[row, col].scatter(X_data[mask, 0], X_data[mask, 1],
                                   c=color, alpha=0.7, s=30)
        # Support vectors
        support_vectors = scaler.inverse_transform(model.support_vectors_)
        axes[row, col].scatter(support_vectors[:, 0], support_vectors[:, 1],
                               s=100, facecolors='none', edgecolors='black', linewidths=1)
        axes[row, col].set_title(f'{kernel} kernel (accuracy={accuracy:.3f}, support vectors={n_support})')
        axes[row, col].set_xlabel('Feature 1')
        axes[row, col].set_ylabel('Feature 2')
        axes[row, col].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
```

8.4.3 Tuning the RBF Kernel Parameters
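The RBF kernel places a Gaussian bump on every support vector:

$$K(\mathbf{x}, \mathbf{x}') = \exp\bigl(-\gamma\,\lVert\mathbf{x}-\mathbf{x}'\rVert^2\bigr), \qquad \gamma = \frac{1}{2\sigma^2}$$

A small gamma means wide bumps and a smooth, nearly linear boundary (risking underfitting); a large gamma means each support vector influences only its immediate neighborhood, producing a wiggly boundary that can overfit. scikit-learn's default gamma='scale' sets $\gamma = 1 / (n_{\text{features}} \cdot \mathrm{Var}(X))$.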
```python
# Effect of the RBF kernel's gamma parameter
def analyze_rbf_parameters():
    """Analyze the effect of the RBF kernel's gamma parameter."""
    # Use the concentric-circles data
    X_train, X_test, y_train, y_test = train_test_split(
        X_circles, y_circles, test_size=0.2, random_state=42, stratify=y_circles
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Gamma values to compare
    gamma_values = [0.01, 0.1, 1, 10]
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Effect of different gamma values with the RBF kernel', fontsize=16)
    print("Effect of the RBF kernel's gamma parameter:")
    print("gamma\tAccuracy\tSupport vectors")
    print("-" * 30)
    for i, gamma in enumerate(gamma_values):
        row = i // 2
        col = i % 2
        # Train the SVM
        svm_rbf = SVC(kernel='rbf', gamma=gamma, C=1.0, random_state=42)
        svm_rbf.fit(X_train_scaled, y_train)
        # Predict
        y_pred = svm_rbf.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{gamma}\t{accuracy:.4f}\t{len(svm_rbf.support_)}")
        # Visualize
        h = 0.02
        x_min, x_max = X_circles[:, 0].min() - 0.5, X_circles[:, 0].max() + 0.5
        y_min, y_max = X_circles[:, 1].min() - 0.5, X_circles[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        grid_points = scaler.transform(np.c_[xx.ravel(), yy.ravel()])
        Z = svm_rbf.predict(grid_points)
        Z = Z.reshape(xx.shape)
        axes[row, col].contourf(xx, yy, Z, alpha=0.3, cmap='RdYlBu')
        # Data points
        for j, color in enumerate(['red', 'blue']):
            mask = y_circles == j
            axes[row, col].scatter(X_circles[mask, 0], X_circles[mask, 1],
                                   c=color, alpha=0.7, s=30)
        # Support vectors
        support_vectors = scaler.inverse_transform(svm_rbf.support_vectors_)
        axes[row, col].scatter(support_vectors[:, 0], support_vectors[:, 1],
                               s=100, facecolors='none', edgecolors='black', linewidths=1)
        axes[row, col].set_title(f'gamma={gamma} (accuracy={accuracy:.3f}, support vectors={len(svm_rbf.support_)})')
        axes[row, col].set_xlabel('Feature 1')
        axes[row, col].set_ylabel('Feature 2')
        axes[row, col].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

analyze_rbf_parameters()
```

8.5 SVM Regression
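Support vector regression (SVR) inverts the classification picture: instead of maximizing a margin that separates classes, it fits a tube of radius ε around the prediction and ignores any error that stays inside the tube. The ε-insensitive loss is:

$$L_\varepsilon\bigl(y, f(\mathbf{x})\bigr) = \max\bigl(0,\ \lvert y - f(\mathbf{x})\rvert - \varepsilon\bigr)$$

Only the training points on or outside the tube become support vectors, which is why the plots below mark so few of them.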
8.5.1 Linear SVR
```python
# Create regression data
np.random.seed(42)
X_reg = np.linspace(0, 10, 100).reshape(-1, 1)
y_reg = 2 * X_reg.ravel() + 1 + 0.5 * np.random.randn(100)
# Split the data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Train a linear SVR
linear_svr = SVR(kernel='linear', C=1.0, epsilon=0.1)
linear_svr.fit(X_train_reg, y_train_reg)
# Predict
y_pred_reg = linear_svr.predict(X_test_reg)
# Evaluate
r2 = r2_score(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
print("Linear SVR performance:")
print(f"R² score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Support vectors: {len(linear_svr.support_)}")
# Visualize the regression results
plt.figure(figsize=(12, 8))
# Data points
plt.scatter(X_train_reg, y_train_reg, alpha=0.6, label='Training data', color='blue')
plt.scatter(X_test_reg, y_test_reg, alpha=0.6, label='Test data', color='green')
# Regression line
X_plot = np.linspace(0, 10, 100).reshape(-1, 1)
y_plot = linear_svr.predict(X_plot)
plt.plot(X_plot, y_plot, color='red', linewidth=2, label='SVR prediction')
# Epsilon tube
epsilon = 0.1
plt.fill_between(X_plot.ravel(), y_plot - epsilon, y_plot + epsilon,
                 alpha=0.2, color='red', label=f'ε-tube (ε={epsilon})')
# Highlight the support vectors
support_vectors_x = X_train_reg[linear_svr.support_]
support_vectors_y = y_train_reg[linear_svr.support_]
plt.scatter(support_vectors_x, support_vectors_y, s=200,
            facecolors='none', edgecolors='black', linewidths=2, label='Support vectors')
plt.xlabel('X')
plt.ylabel('y')
plt.title(f'Linear SVR (R²={r2:.3f}, support vectors={len(linear_svr.support_)})')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
```

8.5.2 Nonlinear SVR
```python
# Create nonlinear regression data
np.random.seed(42)
X_nonlinear = np.linspace(0, 4*np.pi, 100).reshape(-1, 1)
y_nonlinear = np.sin(X_nonlinear.ravel()) + 0.1 * np.random.randn(100)
X_train_nl, X_test_nl, y_train_nl, y_test_nl = train_test_split(
    X_nonlinear, y_nonlinear, test_size=0.2, random_state=42
)
# Compare SVR kernels
svr_kernels = ['linear', 'poly', 'rbf']
svr_results = {}
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for i, kernel in enumerate(svr_kernels):
    if kernel == 'poly':
        svr = SVR(kernel=kernel, degree=3, C=1.0, epsilon=0.1)
    else:
        svr = SVR(kernel=kernel, C=1.0, epsilon=0.1)
    svr.fit(X_train_nl, y_train_nl)
    y_pred_nl = svr.predict(X_test_nl)
    r2_nl = r2_score(y_test_nl, y_pred_nl)
    rmse_nl = np.sqrt(mean_squared_error(y_test_nl, y_pred_nl))
    svr_results[kernel] = {'r2': r2_nl, 'rmse': rmse_nl, 'n_support': len(svr.support_)}
    # Visualize
    axes[i].scatter(X_train_nl, y_train_nl, alpha=0.6, label='Training data', s=20)
    axes[i].scatter(X_test_nl, y_test_nl, alpha=0.6, label='Test data', color='green', s=20)
    # Prediction curve
    X_plot_nl = np.linspace(0, 4*np.pi, 200).reshape(-1, 1)
    y_plot_nl = svr.predict(X_plot_nl)
    axes[i].plot(X_plot_nl, y_plot_nl, color='red', linewidth=2, label='SVR prediction')
    # Support vectors
    support_vectors_x = X_train_nl[svr.support_]
    support_vectors_y = y_train_nl[svr.support_]
    axes[i].scatter(support_vectors_x, support_vectors_y, s=100,
                    facecolors='none', edgecolors='black', linewidths=1, label='Support vectors')
    axes[i].set_xlabel('X')
    axes[i].set_ylabel('y')
    axes[i].set_title(f'{kernel} kernel SVR\nR²={r2_nl:.3f}, support vectors={len(svr.support_)}')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Compare performance
print("SVR performance by kernel:")
print("Kernel\t\tR²\t\tRMSE\t\tSupport vectors")
print("-" * 50)
for kernel, results in svr_results.items():
    print(f"{kernel}\t\t{results['r2']:.4f}\t\t{results['rmse']:.4f}\t\t{results['n_support']}")
```

8.6 Hyperparameter Tuning
8.6.1 Grid Search
```python
# Tune hyperparameters on the breast cancer dataset
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(
X_cancer, y_cancer, test_size=0.2, random_state=42, stratify=y_cancer
)
# Build a pipeline (standardization + SVM) so scaling is refit within each CV fold
svm_pipeline = Pipeline([
('scaler', StandardScaler()),
('svm', SVC(random_state=42))
])
# Define the parameter grid
param_grid = [
{
'svm__kernel': ['linear'],
'svm__C': [0.1, 1, 10, 100]
},
{
'svm__kernel': ['rbf'],
'svm__C': [0.1, 1, 10, 100],
'svm__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
},
{
'svm__kernel': ['poly'],
'svm__C': [0.1, 1, 10],
'svm__degree': [2, 3, 4],
'svm__gamma': ['scale', 'auto']
}
]
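# An alternative worth knowing (a sketch, not used below): RandomizedSearchCV samples
# a fixed number of parameter combinations instead of enumerating the full grid,
# which scales far better when the grid is large.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

random_search = RandomizedSearchCV(
    svm_pipeline,
    param_distributions={
        'svm__kernel': ['rbf'],
        'svm__C': loguniform(1e-2, 1e2),      # sample C on a log scale
        'svm__gamma': loguniform(1e-4, 1e0),  # sample gamma on a log scale
    },
    n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, random_state=42
)
# random_search.fit(X_train_cancer, y_train_cancer) would run it; below we use the exhaustive grid search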
# Run the grid search
print("Running the SVM hyperparameter grid search...")
grid_search = GridSearchCV(
svm_pipeline,
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train_cancer, y_train_cancer)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")
# Evaluate the best model on the test set
best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test_cancer)
test_accuracy = accuracy_score(y_test_cancer, y_pred_best)
print(f"测试集准确率: {test_accuracy:.4f}")
# 详细评估
print("\n最佳SVM模型详细评估:")
print(classification_report(y_test_cancer, y_pred_best,
target_names=['恶性', '良性']))8.6.2 验证曲线分析
```python
# Plot validation curves for the C and gamma parameters
def plot_validation_curve_svm():
    """Plot validation curves for an SVM."""
    # Standardize the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_cancer)
    # Validation curve for C
    C_range = np.logspace(-3, 3, 10)
    train_scores, val_scores = validation_curve(
        SVC(kernel='rbf', gamma='scale', random_state=42),
        X_train_scaled, y_train_cancer,
        param_name='C', param_range=C_range,
        cv=5, scoring='accuracy', n_jobs=-1
    )
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    plt.figure(figsize=(12, 5))
    # C validation curve
    plt.subplot(1, 2, 1)
    plt.semilogx(C_range, train_mean, 'o-', color='blue', label='Training score')
    plt.fill_between(C_range, train_mean - train_std, train_mean + train_std,
                     alpha=0.1, color='blue')
    plt.semilogx(C_range, val_mean, 'o-', color='red', label='Validation score')
    plt.fill_between(C_range, val_mean - val_std, val_mean + val_std,
                     alpha=0.1, color='red')
    plt.xlabel('C')
    plt.ylabel('Accuracy')
    plt.title('SVM validation curve for C')
    plt.legend()
    plt.grid(True, alpha=0.3)
    # Validation curve for gamma
    gamma_range = np.logspace(-4, 1, 10)
    train_scores_gamma, val_scores_gamma = validation_curve(
        SVC(kernel='rbf', C=1.0, random_state=42),
        X_train_scaled, y_train_cancer,
        param_name='gamma', param_range=gamma_range,
        cv=5, scoring='accuracy', n_jobs=-1
    )
    train_mean_gamma = np.mean(train_scores_gamma, axis=1)
    train_std_gamma = np.std(train_scores_gamma, axis=1)
    val_mean_gamma = np.mean(val_scores_gamma, axis=1)
    val_std_gamma = np.std(val_scores_gamma, axis=1)
    plt.subplot(1, 2, 2)
    plt.semilogx(gamma_range, train_mean_gamma, 'o-', color='blue', label='Training score')
    plt.fill_between(gamma_range, train_mean_gamma - train_std_gamma,
                     train_mean_gamma + train_std_gamma, alpha=0.1, color='blue')
    plt.semilogx(gamma_range, val_mean_gamma, 'o-', color='red', label='Validation score')
    plt.fill_between(gamma_range, val_mean_gamma - val_std_gamma,
                     val_mean_gamma + val_std_gamma, alpha=0.1, color='red')
    plt.xlabel('gamma')
    plt.ylabel('Accuracy')
    plt.title('SVM validation curve for gamma')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

plot_validation_curve_svm()
```

8.7 Practical Application Examples
8.7.1 Text Classification Example
```python
# Build a small text classification dataset
from sklearn.feature_extraction.text import TfidfVectorizer
# Sample documents
texts = [
    "Machine learning is an important branch of artificial intelligence",
    "Deep learning performs remarkably well in image recognition",
    "The support vector machine is a classic classification algorithm",
    "The weather is lovely today, perfect for a walk outside",
    "The plot of this movie is really gripping",
    "The restaurant's food is tasty and the service is good too",
    "Neural networks can handle complex nonlinear problems",
    "Data preprocessing is an important step in machine learning",
    "Natural language processing technology is advancing rapidly",
    "A sunny afternoon always lifts my mood",
    "This book is interesting and well worth recommending",
    "The shopping mall is crowded and stocks a wide variety of goods"
]
# Labels: 0 = technology, 1 = everyday life
labels = [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]
# Vectorize the text
vectorizer = TfidfVectorizer(max_features=100, stop_words=None)
X_text = vectorizer.fit_transform(texts).toarray()
print(f"Text feature dimensions: {X_text.shape}")
print(f"Class distribution: {np.bincount(labels)}")
# Split the data
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, labels, test_size=0.3, random_state=42, stratify=labels
)
# Train an SVM text classifier
text_svm = SVC(kernel='linear', C=1.0, random_state=42)
text_svm.fit(X_train_text, y_train_text)
# Predict
y_pred_text = text_svm.predict(X_test_text)
accuracy_text = accuracy_score(y_test_text, y_pred_text)
print(f"\nText classification SVM accuracy: {accuracy_text:.4f}")
print(f"Support vectors: {len(text_svm.support_)}")
# Inspect the most informative words
feature_names = vectorizer.get_feature_names_out()
coef = text_svm.coef_[0]
# Positive coefficients push the decision toward class 1 (everyday life),
# negative coefficients toward class 0 (technology)
top_positive = np.argsort(coef)[-10:]
top_negative = np.argsort(coef)[:10]
print("\nWords most indicative of the everyday-life class:")
for idx in reversed(top_positive):
    print(f"{feature_names[idx]}: {coef[idx]:.4f}")
print("\nWords most indicative of the technology class:")
for idx in top_negative:
    print(f"{feature_names[idx]}: {coef[idx]:.4f}")
```

8.7.2 Handling High-Dimensional Data
```python
# Create a high-dimensional dataset
X_high_dim, y_high_dim = make_classification(
n_samples=1000,
n_features=1000,
n_informative=100,
n_redundant=50,
n_clusters_per_class=1,
random_state=42
)
print(f"高维数据形状: {X_high_dim.shape}")
# 分割数据
X_train_hd, X_test_hd, y_train_hd, y_test_hd = train_test_split(
X_high_dim, y_high_dim, test_size=0.2, random_state=42, stratify=y_high_dim
)
# Compare linear SVM and RBF SVM on the high-dimensional data
import time
models = {
    'Linear SVM': SVC(kernel='linear', C=1.0, random_state=42),
    'RBF SVM': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
    'LinearSVC': LinearSVC(C=1.0, random_state=42, max_iter=1000)  # liblinear-based solver, optimized for linear SVMs
}
print("\nSVM performance comparison on high-dimensional data:")
print("Model\t\tTrain time\tPredict time\tAccuracy\tSupport vectors")
print("-" * 70)
# Standardize the data
scaler = StandardScaler()
X_train_hd_scaled = scaler.fit_transform(X_train_hd)
X_test_hd_scaled = scaler.transform(X_test_hd)
for name, model in models.items():
    # Training time
    start_time = time.time()
    model.fit(X_train_hd_scaled, y_train_hd)
    train_time = time.time() - start_time
    # Prediction time
    start_time = time.time()
    y_pred_hd = model.predict(X_test_hd_scaled)
    pred_time = time.time() - start_time
    # Accuracy
    accuracy_hd = accuracy_score(y_test_hd, y_pred_hd)
    # Support vector count (LinearSVC does not expose support_)
    if hasattr(model, 'support_'):
        n_support = len(model.support_)
    else:
        n_support = "N/A"
    print(f"{name}\t\t{train_time:.3f}s\t\t{pred_time:.3f}s\t\t{accuracy_hd:.4f}\t\t{n_support}")
# Visualize the relationship between training time and accuracy
model_names = list(models.keys())
train_times = []
accuracies = []
for name, model in models.items():
start_time = time.time()
model.fit(X_train_hd_scaled, y_train_hd)
train_time = time.time() - start_time
y_pred = model.predict(X_test_hd_scaled)
accuracy = accuracy_score(y_test_hd, y_pred)
train_times.append(train_time)
accuracies.append(accuracy)
plt.figure(figsize=(10, 6))
colors = ['blue', 'red', 'green']
for i, (name, train_time, accuracy) in enumerate(zip(model_names, train_times, accuracies)):
plt.scatter(train_time, accuracy, s=200, c=colors[i], alpha=0.7, label=name)
plt.annotate(name, (train_time, accuracy), xytext=(5, 5),
textcoords='offset points', fontsize=10)
plt.xlabel('Training time (seconds)')
plt.ylabel('Accuracy')
plt.title('High-dimensional data: training time vs. accuracy')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
```

8.8 Strengths and Weaknesses of SVM
8.8.1 Performance Comparison
```python
def comprehensive_svm_comparison():
    """Compare SVM against several other algorithms."""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    # Use the wine dataset
    wine = load_wine()
    X_wine, y_wine = wine.data, wine.target
    X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
        X_wine, y_wine, test_size=0.2, random_state=42, stratify=y_wine
    )
    # Standardize
    scaler = StandardScaler()
    X_train_wine_scaled = scaler.fit_transform(X_train_wine)
    X_test_wine_scaled = scaler.transform(X_test_wine)
    # Define the algorithms
    algorithms = {
        'SVM (RBF)': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42),
        'SVM (linear)': SVC(kernel='linear', C=1.0, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'KNN': KNeighborsClassifier(n_neighbors=5)
    }
    results = {}
    print("Algorithm performance comparison:")
    print("Algorithm\t\tTrain time\tAccuracy\tCV score")
    print("-" * 60)
    for name, algorithm in algorithms.items():
        # Train and evaluate; scale-sensitive models get standardized features
        start_time = time.time()
        if name != 'Random Forest':
            algorithm.fit(X_train_wine_scaled, y_train_wine)
            y_pred = algorithm.predict(X_test_wine_scaled)
            cv_scores = cross_val_score(algorithm, X_train_wine_scaled, y_train_wine, cv=5)
        else:
            algorithm.fit(X_train_wine, y_train_wine)
            y_pred = algorithm.predict(X_test_wine)
            cv_scores = cross_val_score(algorithm, X_train_wine, y_train_wine, cv=5)
        train_time = time.time() - start_time
        # Metrics
        accuracy = accuracy_score(y_test_wine, y_pred)
        cv_mean = np.mean(cv_scores)
        results[name] = {
            'train_time': train_time,
            'accuracy': accuracy,
            'cv_score': cv_mean
        }
        print(f"{name}\t{train_time:.4f}s\t\t{accuracy:.4f}\t\t{cv_mean:.4f}")
    # Visualize the comparison
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    names = list(results.keys())
    train_times = [results[name]['train_time'] for name in names]
    accuracies = [results[name]['accuracy'] for name in names]
    cv_scores = [results[name]['cv_score'] for name in names]
    # Training time
    axes[0].bar(names, train_times, color='skyblue', alpha=0.7)
    axes[0].set_title('Training time')
    axes[0].set_ylabel('Time (seconds)')
    axes[0].tick_params(axis='x', rotation=45)
    # Test accuracy
    axes[1].bar(names, accuracies, color='lightgreen', alpha=0.7)
    axes[1].set_title('Test accuracy')
    axes[1].set_ylabel('Accuracy')
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].set_ylim(0.8, 1.0)
    # Cross-validation score
    axes[2].bar(names, cv_scores, color='lightcoral', alpha=0.7)
    axes[2].set_title('Cross-validation score')
    axes[2].set_ylabel('CV score')
    axes[2].tick_params(axis='x', rotation=45)
    axes[2].set_ylim(0.8, 1.0)
    plt.tight_layout()
    plt.show()
    return results

comparison_results = comprehensive_svm_comparison()
```

8.8.2 Usage Recommendations
```python
def svm_usage_guide():
    """Print SVM usage guidelines."""
    print("SVM usage guide and best practices:")
    print("=" * 50)
    guidelines = {
        "Data preprocessing": [
            "Always standardize or normalize the features",
            "Handle missing values and outliers",
            "Consider feature selection to reduce dimensionality"
        ],
        "Kernel selection": [
            "Linear kernel: linearly separable data or high-dimensional sparse data",
            "RBF kernel: the general-purpose choice, suits most nonlinear problems",
            "Polynomial kernel: specific nonlinear relationships",
            "Custom kernels: specialized domain problems"
        ],
        "Parameter tuning": [
            "C: controls regularization strength; choose via cross-validation",
            "gamma: the key RBF parameter; governs decision-boundary complexity",
            "Tune with grid search or random search"
        ],
        "Good fits": [
            "High-dimensional data (e.g., text classification, gene expression data)",
            "Small-to-medium datasets",
            "Scenarios that require stable performance",
            "Binary classification (natively supported)"
        ],
        "Poor fits": [
            "Large datasets (more than ~100k samples)",
            "Scenarios that require probability outputs",
            "Extremely latency-sensitive real-time prediction",
            "Very noisy data"
        ]
    }
    for category, items in guidelines.items():
        print(f"\n{category}:")
        for item in items:
            print(f"  • {item}")
    print("\n" + "=" * 50)
    print("Rules of thumb for parameter selection:")
    print("• C: start with 0.1, 1, 10, 100")
    print("• gamma: start with 'scale', 0.001, 0.01, 0.1, 1")
    print("• Evaluate parameter combinations with cross-validation")
    print("• Watch for overfitting: training accuracy far above validation accuracy")

svm_usage_guide()
```

8.9 Exercises
Exercise 1: Basic SVM
- Train a linear SVM classifier on the iris dataset
- Visualize the decision boundary and the support vectors
- Analyze how different C values affect the model
Exercise 2: Kernel Comparison
- Create a complex nonlinear dataset
- Compare the performance of different kernels (linear, polynomial, RBF, sigmoid)
- Analyze which scenarios each kernel suits
Exercise 3: SVM Regression
- Train an SVR model on the California housing dataset (the Boston housing dataset has been removed from scikit-learn)
- Compare the regression performance of the linear and RBF kernels
- Analyze how the epsilon parameter affects the model
Exercise 4: High-Dimensional Data
- Create a high-dimensional dataset (more than 1,000 features)
- Compare SVM with other algorithms on the high-dimensional data
- Analyze how feature selection affects SVM performance
8.10 Summary
In this chapter we examined support vector machines from several angles:
Core concepts
- SVM principles: the maximum-margin classifier, support vectors, and hyperplanes
- The kernel trick: mapping data into higher-dimensional spaces to handle nonlinear problems
- Parameter tuning: what C and gamma do and how to choose them
Key techniques
- Linear SVM: for linearly separable and approximately linearly separable problems
- Nonlinear SVM: handling complex nonlinear problems through kernel functions
- SVM regression: the principles and applications of support vector regression
- Hyperparameter optimization: grid search and cross-validation
Practical skills
- Data preprocessing: the importance of feature standardization
- Kernel selection: choosing a kernel that matches the data
- Performance evaluation: assessing SVM models across several metrics
- Real applications: text classification and high-dimensional data
Key takeaways
- SVM performs strongly on high-dimensional data and small-to-medium datasets
- Feature standardization is mandatory, and the model is sensitive to its parameters
- The choice of kernel has a major impact on performance
- Well suited to classification and regression tasks that demand stable performance
8.11 What's Next
You now have a solid command of the support vector machine. In the next chapter, on Naive Bayes, we will turn to probability-based classification and see how Bayes' theorem is applied in machine learning.
Chapter checklist:
- ✅ Understood the mathematical principles and geometric intuition behind SVM
- ✅ Implemented linear and nonlinear SVMs
- ✅ Learned to choose and apply different kernel functions
- ✅ Understood the principles and practice of SVM regression
- ✅ Mastered SVM hyperparameter tuning
- ✅ Able to apply SVM sensibly to real problems