超参数调优
超参数调优是机器学习中提升模型性能的关键步骤。与模型参数不同,超参数是在训练开始前设定的配置参数,它们控制着学习过程本身。
什么是超参数?
超参数是机器学习算法中需要在训练前手动设置的参数,它们不能通过训练数据直接学习得到。常见的超参数包括:
- 学习率
- 正则化参数
- 树的深度
- 聚类数量
- 核函数参数
网格搜索 (Grid Search)
网格搜索是最直观的超参数调优方法,它会穷举参数网格中列出的每一种参数组合,并用交叉验证逐一评估。例如下面的网格共有 3×4×3×3=108 种组合,配合 5 折交叉验证需要训练 540 次模型(不含最终的重新拟合)。
python
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# Load the iris dataset and hold out 20% of the samples as a test set.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Candidate values per random-forest hyperparameter; the grid search
# evaluates every combination of them.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator that the search clones and re-fits for each candidate.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring="accuracy",
    n_jobs=-1,  # use all CPU cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print("最佳参数:", grid_search.best_params_)
print("最佳得分:", grid_search.best_score_)

# Evaluate the refitted best model on the held-out test set.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("测试集得分:", test_score)

随机搜索 (Random Search)
当参数空间很大时,随机搜索比网格搜索更高效。
python
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
# Search space: scipy distributions are sampled, plain lists are drawn from.
param_distributions = dict(
    n_estimators=randint(50, 500),
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=["sqrt", "log2", None],
)

# Randomized search: evaluates a fixed budget of sampled settings instead
# of the full Cartesian product.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=100,  # try 100 sampled combinations
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search.fit(X_train, y_train)

print("随机搜索最佳参数:", random_search.best_params_)
print("随机搜索最佳得分:", random_search.best_score_)

贝叶斯优化
贝叶斯优化是一种更智能的超参数调优方法,它利用先前的评估结果来指导下一次参数选择。
python
# 需要安装: pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
# Search space expressed with scikit-optimize dimension objects.
search_spaces = dict(
    n_estimators=Integer(50, 500),
    max_depth=Categorical([3, 5, 7, 10, None]),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    max_features=Categorical(["sqrt", "log2", None]),
)

# Bayesian search with a budget of 50 evaluated settings, 5-fold CV each.
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_spaces,
    n_iter=50,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42,
)
bayes_search.fit(X_train, y_train)

print("贝叶斯优化最佳参数:", bayes_search.best_params_)
print("贝叶斯优化最佳得分:", bayes_search.best_score_)

不同算法的常见超参数
支持向量机 (SVM)
python
from sklearn.svm import SVC
# SVM search space, written as one sub-grid per kernel family.
# Per the scikit-learn SVC documentation, `gamma` is a coefficient of the
# 'rbf' and 'poly' kernels only; crossing it with 'linear' would make the
# grid search fit six identical models for every value of C. GridSearchCV
# accepts a list of grids and explores each one separately.
svm_params = [
    {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear'],
    },
    {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'poly'],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    },
]
svm_grid = GridSearchCV(SVC(), svm_params, cv=5)

逻辑回归
python
from sklearn.linear_model import LogisticRegression
# Logistic-regression search space, split so that every candidate is a
# valid solver/penalty pairing (see the scikit-learn LogisticRegression
# documentation):
#   * 'liblinear' and 'saga' both support 'l1' and 'l2';
#   * 'elasticnet' is supported by 'saga' only and requires `l1_ratio`.
# A single flat grid would generate candidates that fail to fit and are
# scored as NaN by the grid search.
lr_params = [
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 500, 1000],
    },
    {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],  # mix between l2 (0) and l1 (1)
        'max_iter': [100, 500, 1000],
    },
]
lr_grid = GridSearchCV(LogisticRegression(), lr_params, cv=5)

梯度提升
python
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosting search space: three candidate values per hyperparameter.
gb_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
)
gb_grid = GridSearchCV(GradientBoostingClassifier(), gb_params, cv=5)

验证曲线分析
验证曲线帮助我们理解单个超参数对模型性能的影响。
python
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
import numpy as np
# Sweep n_estimators and record train/validation accuracy with 5-fold CV.
param_range = [10, 50, 100, 200, 300, 400, 500]
train_scores, test_scores = validation_curve(
    RandomForestClassifier(random_state=42),
    X_train,
    y_train,
    param_name="n_estimators",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
)

# Summarise each parameter value by the mean and standard deviation taken
# over axis 1 of the score arrays.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# Draw both curves with a +/- one-standard-deviation band around each.
# NOTE(review): the CJK labels below presumably need a matplotlib font with
# Chinese glyphs configured - confirm on the target machine.
plt.figure(figsize=(10, 6))
for mean, std, color, label in (
    (train_mean, train_std, "blue", "训练得分"),
    (test_mean, test_std, "red", "验证得分"),
):
    plt.plot(param_range, mean, "o-", color=color, label=label)
    plt.fill_between(param_range, mean - std, mean + std, alpha=0.1, color=color)
plt.xlabel("n_estimators")
plt.ylabel("准确率")
plt.title("随机森林验证曲线")
plt.legend()
plt.grid(True)
plt.show()

学习曲线分析
学习曲线显示模型性能如何随训练样本数量变化。
python
from sklearn.model_selection import learning_curve
# Compute train/validation accuracy at ten training-set fractions between
# 10% and 100%, using 5-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train,
    y_train,
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring="accuracy",
)

# Plot the axis-1 mean of each score array against the training-set size.
plt.figure(figsize=(10, 6))
for fold_scores, curve_label in (
    (train_scores, "训练得分"),
    (test_scores, "验证得分"),
):
    plt.plot(train_sizes, fold_scores.mean(axis=1), "o-", label=curve_label)
plt.xlabel("训练样本数量")
plt.ylabel("准确率")
plt.title("学习曲线")
plt.legend()
plt.grid(True)
plt.show()

超参数调优最佳实践
1. 分层调优
python
# Step 1 - coarse search: a wide, sparse grid evaluated with 3-fold CV.
coarse_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 7, None],
}
coarse_search = GridSearchCV(rf, coarse_params, cv=3)
coarse_search.fit(X_train, y_train)

# Step 2 - fine search: centre a narrow grid on the coarse winner, so the
# second stage actually refines the first instead of using fixed values.
# (With a coarse winner of n_estimators=100, max_depth=7 this yields
# [80, 100, 120] and [5, 7, 9].)
coarse_best = coarse_search.best_params_
best_n = coarse_best['n_estimators']
best_depth = coarse_best['max_depth']
fine_params = {
    'n_estimators': [best_n - 20, best_n, best_n + 20],
    # max_depth=None is scikit-learn's "no depth limit"; it has no numeric
    # neighbourhood to explore, so it is kept as the only candidate.
    'max_depth': (
        [None]
        if best_depth is None
        else [max(1, best_depth - 2), best_depth, best_depth + 2]
    ),
    'min_samples_split': [2, 3, 4],
}
fine_search = GridSearchCV(rf, fine_params, cv=5)
fine_search.fit(X_train, y_train)

2. 早停策略
python
from sklearn.ensemble import GradientBoostingClassifier
# Early stopping on an internal validation split: allow up to 1000 boosting
# stages, with validation_fraction=0.2 and n_iter_no_change=10 controlling
# the hold-out size and the patience. fit() returns the estimator itself,
# so construction and training are chained.
gb = GradientBoostingClassifier(
    random_state=42,
    n_estimators=1000,
    n_iter_no_change=10,
    validation_fraction=0.2,
).fit(X_train, y_train)
print(f"最优迭代次数: {gb.n_estimators_}")

3. 交叉验证策略选择
python
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
# Time-series cross-validation (for temporally ordered data), 5 splits.
ts_cv = TimeSeriesSplit(n_splits=5)
# Stratified k-fold (for classification problems): 5 shuffled, seeded splits.
stratified_cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
grid_search_stratified = GridSearchCV(
rf, param_grid, cv=stratified_cv, scoring='f1_macro'
)

多目标优化
有时我们需要同时优化多个指标。
python
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
# Scorers to track: plain accuracy plus macro-averaged F1, precision and
# recall (each wrapped with make_scorer so `average` can be passed).
scoring = {
    "accuracy": "accuracy",
    **{
        metric_name: make_scorer(metric_fn, average="macro")
        for metric_name, metric_fn in (
            ("f1", f1_score),
            ("precision", precision_score),
            ("recall", recall_score),
        )
    },
}

# Multi-metric grid search; with several scorers `refit` must name the one
# that picks the best model - here the F1 score.
multi_score_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring=scoring,
    refit="f1",
    return_train_score=True,
)
multi_score_search.fit(X_train, y_train)

# Per-candidate results for every metric.
results = multi_score_search.cv_results_
for metric in scoring.keys():
    print(f"最佳{metric}: {results[f'mean_test_{metric}'][multi_score_search.best_index_]:.4f}")

实战案例:完整的调优流程
python
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Load the breast-cancer dataset and hold out 20% of it for testing.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline: standardise the features, then classify with a random forest.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

# Parameter grid; inside a pipeline each key is addressed as
# "<step name>__<parameter name>", hence the "classifier__" prefix.
param_grid = {
    f"classifier__{param_name}": candidates
    for param_name, candidates in {
        "n_estimators": [100, 200, 300],
        "max_depth": [5, 10, None],
        "min_samples_split": [2, 5, 10],
        "class_weight": [None, "balanced"],
    }.items()
}

# Grid search over the whole pipeline, scored by ROC AUC.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
# Evaluate the final (refitted) model on the held-out test set.
final_model = grid_search.best_estimator_
# Use grid_search.score rather than final_model.score: the former applies
# the scorer the search was run with ('roc_auc'), whereas the pipeline's own
# score() reports accuracy. This keeps the test figure directly comparable
# with the cross-validation score printed next to it.
test_score = grid_search.score(X_test, y_test)
print(f"最佳参数: {grid_search.best_params_}")
print(f"交叉验证得分: {grid_search.best_score_:.4f}")
print(f"测试集得分: {test_score:.4f}")
# Feature-importance analysis: pair each feature name with the importance
# reported by the pipeline's random-forest step, highest first.
feature_names = cancer.feature_names
feature_importance = final_model.named_steps["classifier"].feature_importances_
importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": feature_importance}
)
importance_df = importance_df.sort_values(by="importance", ascending=False)
print("\n前10个重要特征:")
print(importance_df.head(10))

总结
超参数调优是提升模型性能的重要手段:
- 选择合适的搜索策略:网格搜索适合小参数空间,随机搜索适合大参数空间,贝叶斯优化适合单次评估代价较高、希望用较少迭代找到好参数的场景
- 使用交叉验证:确保结果的可靠性
- 分析验证曲线:理解参数对性能的影响
- 考虑计算成本:平衡搜索精度和时间成本
- 避免过拟合:不要在测试集上调参
下一章我们将学习模型选择策略,了解如何在多个算法中选择最适合的模型。