Skip to content

超参数调优

超参数调优是机器学习中提升模型性能的关键步骤。与模型参数不同,超参数是在训练开始前设定的配置参数,它们控制着学习过程本身。

什么是超参数?

超参数是机器学习算法中需要在训练前手动设置的参数,它们不能通过训练数据直接学习得到。常见的超参数包括:

  • 学习率
  • 正则化参数
  • 树的深度
  • 聚类数量
  • 核函数参数

网格搜索是最直观的超参数调优方法:它会对参数网格中列出的每一种参数组合逐一进行交叉验证评估,因此计算量会随着参数个数和候选值数量的增加而成倍增长。

python
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data set and hold out 20% of the samples as a test set.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Candidate values for every random-forest hyperparameter being tuned.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator whose hyperparameters are searched.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search: 5-fold cross-validation, accuracy as the selection
# metric, all CPU cores (n_jobs=-1), progress messages enabled.
search_options = {
    'cv': 5,
    'scoring': 'accuracy',
    'n_jobs': -1,
    'verbose': 1,
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **search_options)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print("最佳参数:", grid_search.best_params_)
print("最佳得分:", grid_search.best_score_)

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("测试集得分:", test_score)

当参数空间很大时,随机搜索通常比网格搜索更高效:它只从给定的参数分布中随机抽取固定数量(由 n_iter 指定)的参数组合进行评估,而不是穷举所有组合。

python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Sampling distributions (scipy `randint`) or explicit candidate lists for
# each hyperparameter.
param_distributions = dict(
    n_estimators=randint(50, 500),
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=['sqrt', 'log2', None],
)

# Randomized search: evaluate 100 sampled combinations with 5-fold CV.
# random_state fixes the sampling so the run is repeatable.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

print("随机搜索最佳参数:", random_search.best_params_)
print("随机搜索最佳得分:", random_search.best_score_)

贝叶斯优化

贝叶斯优化是一种更智能的超参数调优方法,它利用先前的评估结果来指导下一次参数选择。

python
# 需要安装: pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Search space: integer ranges for the count-like hyperparameters and
# categorical choices for the rest.
search_spaces = dict(
    n_estimators=Integer(50, 500),
    max_depth=Categorical([3, 5, 7, 10, None]),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    max_features=Categorical(['sqrt', 'log2', None]),
)

# Bayesian search: 50 evaluations, each scored by 5-fold CV accuracy.
bayes_options = {
    'n_iter': 50,
    'cv': 5,
    'scoring': 'accuracy',
    'n_jobs': -1,
    'random_state': 42,
}
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_spaces,
    **bayes_options,
)
bayes_search.fit(X_train, y_train)

print("贝叶斯优化最佳参数:", bayes_search.best_params_)
print("贝叶斯优化最佳得分:", bayes_search.best_score_)

不同算法的常见超参数

支持向量机 (SVM)

python
from sklearn.svm import SVC

# The grid is split into sub-grids because `gamma` is a coefficient of the
# 'rbf' and 'poly' kernels only. Crossing it with the 'linear' kernel would
# refit the same linear model once per gamma value without changing the result.
svm_params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    },
]

# 5-fold cross-validated grid search over the SVM sub-grids (not fitted here).
svm_grid = GridSearchCV(SVC(), svm_params, cv=5)

逻辑回归

python
from sklearn.linear_model import LogisticRegression

# Not every penalty/solver pair is valid, so the grid is split into sub-grids
# that contain only supported combinations:
#   * 'liblinear' and 'saga' both support the 'l1' and 'l2' penalties;
#   * 'elasticnet' is supported by 'saga' only and additionally requires
#     `l1_ratio` (the L1/L2 mixing weight) to be set.
# A single crossed grid would include 'elasticnet' + 'liblinear' and
# 'elasticnet' without `l1_ratio`, whose fits fail.
lr_params = [
    {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [100, 500, 1000],
    },
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [100, 500, 1000],
    },
]

# 5-fold cross-validated grid search over the valid sub-grids (not fitted here).
lr_grid = GridSearchCV(LogisticRegression(), lr_params, cv=5)

梯度提升

python
from sklearn.ensemble import GradientBoostingClassifier

# Candidate values for the main gradient-boosting hyperparameters.
gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0]
}

# subsample < 1.0 trains each tree on a random subset of the rows, so the
# estimator is seeded (like the other estimators in this tutorial) to make
# the comparison between candidates repeatable from run to run.
gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42), gb_params, cv=5
)

验证曲线分析

验证曲线帮助我们理解单个超参数对模型性能的影响。

python
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
import numpy as np

# Evaluate the forest at several n_estimators values with 5-fold CV.
param_range = [10, 50, 100, 200, 300, 400, 500]
train_scores, test_scores = validation_curve(
    RandomForestClassifier(random_state=42),
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=param_range,
    scoring='accuracy',
    cv=5,
)

# Aggregate across folds: one mean and one standard deviation per value.
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# Draw each curve with a shaded band of +/- one standard deviation.
plt.figure(figsize=(10, 6))
curves = (
    (train_mean, train_std, 'blue', '训练得分'),
    (test_mean, test_std, 'red', '验证得分'),
)
for curve_mean, curve_std, curve_color, curve_label in curves:
    plt.plot(param_range, curve_mean, 'o-', color=curve_color, label=curve_label)
    plt.fill_between(
        param_range,
        curve_mean - curve_std,
        curve_mean + curve_std,
        alpha=0.1,
        color=curve_color,
    )

plt.xlabel('n_estimators')
plt.ylabel('准确率')
plt.title('随机森林验证曲线')
plt.legend()
plt.grid(True)
plt.show()

学习曲线分析

学习曲线显示模型性能如何随训练样本数量变化。

python
from sklearn.model_selection import learning_curve

# Score the forest on ten training-set fractions from 10% to 100%,
# using 5-fold cross-validation at each size.
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train,
    y_train,
    train_sizes=size_fractions,
    scoring='accuracy',
    cv=5,
)

# Plot the fold-averaged training and validation accuracy against set size.
plt.figure(figsize=(10, 6))
for fold_scores, curve_label in ((train_scores, '训练得分'), (test_scores, '验证得分')):
    plt.plot(train_sizes, np.mean(fold_scores, axis=1), 'o-', label=curve_label)
plt.xlabel('训练样本数量')
plt.ylabel('准确率')
plt.title('学习曲线')
plt.legend()
plt.grid(True)
plt.show()

超参数调优最佳实践

1. 分层调优

python
# Step 1: coarse search over a few widely spaced values (3-fold CV).
coarse_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 7, None]
}

coarse_search = GridSearchCV(rf, coarse_params, cv=3)
coarse_search.fit(X_train, y_train)

# Step 2: fine search centred on the winner of the coarse search, so the
# second stage actually refines the first instead of using fixed values.
coarse_best = coarse_search.best_params_
best_n_estimators = coarse_best['n_estimators']
best_max_depth = coarse_best['max_depth']

if best_max_depth is None:
    # Unlimited depth won the coarse stage: compare it with two limits deeper
    # than any coarse candidate. The values 10 and 15 are a heuristic choice.
    fine_depths = [10, 15, None]
else:
    # max(1, ...) keeps the lower neighbour a valid (positive) depth.
    fine_depths = [max(1, best_max_depth - 2), best_max_depth, best_max_depth + 2]

fine_params = {
    'n_estimators': [
        max(1, best_n_estimators - 20),
        best_n_estimators,
        best_n_estimators + 20,
    ],
    'max_depth': fine_depths,
    'min_samples_split': [2, 3, 4]
}

# The fine stage uses 5-fold CV for a more reliable estimate.
fine_search = GridSearchCV(rf, fine_params, cv=5)
fine_search.fit(X_train, y_train)

2. 早停策略

python
from sklearn.ensemble import GradientBoostingClassifier

# Early stopping driven by an internal validation split: allow up to 1000
# boosting stages, with validation_fraction=0.2 and n_iter_no_change=10
# controlling when training stops.
early_stopping_config = {
    'n_estimators': 1000,
    'validation_fraction': 0.2,
    'n_iter_no_change': 10,
    'random_state': 42,
}
gb = GradientBoostingClassifier(**early_stopping_config)
gb.fit(X_train, y_train)

# n_estimators_ is the number of stages actually fitted.
fitted_stage_count = gb.n_estimators_
print(f"最优迭代次数: {fitted_stage_count}")

3. 交叉验证策略选择

python
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

# Stratified K-fold, intended for classification problems.
stratified_cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# Time-series splitter, intended for temporally ordered data.
ts_cv = TimeSeriesSplit(n_splits=5)

# Grid search that uses the stratified splitter and macro-averaged F1.
grid_search_stratified = GridSearchCV(
    rf,
    param_grid,
    scoring='f1_macro',
    cv=stratified_cv,
)

多目标优化

有时我们需要同时优化多个指标。

python
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

# Metrics to record for every candidate: plain accuracy plus macro-averaged
# F1, precision and recall.
scoring = {'accuracy': 'accuracy'}
for metric_name, metric_fn in (
    ('f1', f1_score),
    ('precision', precision_score),
    ('recall', recall_score),
):
    scoring[metric_name] = make_scorer(metric_fn, average='macro')

# Multi-metric grid search; refit='f1' makes the F1 score the criterion that
# picks (and refits) the best model. Training scores are recorded as well.
multi_score_search = GridSearchCV(
    rf,
    param_grid,
    scoring=scoring,
    refit='f1',
    cv=5,
    return_train_score=True,
)
multi_score_search.fit(X_train, y_train)

# Print every metric's mean test score for the selected candidate.
results = multi_score_search.cv_results_
best_row = multi_score_search.best_index_
for metric in scoring:
    metric_value = results[f'mean_test_{metric}'][best_row]
    print(f"最佳{metric}: {metric_value:.4f}")

实战案例:完整的调优流程

python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the breast-cancer data set and hold out 20% of it as a test set.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline: standardise the features, then fit a random forest.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Parameter grid. Pipeline parameters are addressed as
# '<step name>__<parameter name>'.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__class_weight': [None, 'balanced']
}

# Grid search selecting on ROC AUC with 5-fold CV, using all CPU cores.
grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Evaluate the final model on the test set with the SAME metric the search
# optimised. `final_model.score(...)` would return accuracy, which cannot be
# compared with the cross-validated ROC AUC printed next to it.
final_model = grid_search.best_estimator_
positive_class_proba = final_model.predict_proba(X_test)[:, 1]
test_score = roc_auc_score(y_test, positive_class_proba)

print(f"最佳参数: {grid_search.best_params_}")
print(f"交叉验证得分: {grid_search.best_score_:.4f}")
print(f"测试集得分: {test_score:.4f}")

# Feature-importance analysis: read the importances from the fitted forest
# inside the pipeline and rank the features from most to least important.
feature_importance = final_model.named_steps['classifier'].feature_importances_
feature_names = cancer.feature_names

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\n前10个重要特征:")
print(importance_df.head(10))

总结

超参数调优是提升模型性能的重要手段:

  1. 选择合适的搜索策略:网格搜索适合小参数空间,随机搜索适合大参数空间,贝叶斯优化适合单次评估代价较高的场景
  2. 使用交叉验证:确保结果的可靠性
  3. 分析验证曲线:理解参数对性能的影响
  4. 考虑计算成本:平衡搜索精度和时间成本
  5. 避免过拟合:不要在测试集上调参

下一章我们将学习模型选择策略,了解如何在多个算法中选择最适合的模型。

本站内容仅供学习和研究使用。