性能指标详解
选择合适的评估指标对于正确评估模型性能至关重要。不同的问题类型需要不同的评估指标,本章将详细介绍各种性能指标的含义、计算方法和使用场景。
分类问题评估指标
1. 基础指标:准确率、精确率、召回率、F1分数
python
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import seaborn as sns
# Build a synthetic 3-class dataset and hold out 20% of it for testing
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=10,
    n_classes=3,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# Fit the classifier, then keep both hard predictions and class probabilities
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)
def calculate_basic_metrics(y_true, y_pred):
    """Compute and print accuracy plus macro/weighted precision, recall and F1.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.

    Returns:
        dict with the key ``accuracy`` and, for each of ``macro`` and
        ``weighted``, the keys ``precision_*``, ``recall_*`` and ``f1_*``.
    """
    results = {'accuracy': accuracy_score(y_true, y_pred)}

    # Macro averaging first, then weighted averaging, so the dict keeps the
    # accuracy -> macro -> weighted key order.
    for averaging in ('macro', 'weighted'):
        results[f'precision_{averaging}'] = precision_score(y_true, y_pred, average=averaging)
        results[f'recall_{averaging}'] = recall_score(y_true, y_pred, average=averaging)
        results[f'f1_{averaging}'] = f1_score(y_true, y_pred, average=averaging)

    print("=== 基础分类指标 ===")
    accuracy_value = results['accuracy']
    print(f"准确率 (Accuracy): {accuracy_value:.4f}")

    section_titles = (
        ('macro', "\n宏平均 (Macro Average):"),
        ('weighted', "\n加权平均 (Weighted Average):"),
    )
    for averaging, title in section_titles:
        precision_value = results[f'precision_{averaging}']
        recall_value = results[f'recall_{averaging}']
        f1_value = results[f'f1_{averaging}']
        print(title)
        print(f" 精确率 (Precision): {precision_value:.4f}")
        print(f" 召回率 (Recall): {recall_value:.4f}")
        print(f" F1分数: {f1_value:.4f}")

    return results
# 计算基础指标
basic_metrics = calculate_basic_metrics(y_test, y_pred)

2. 混淆矩阵
python
def plot_confusion_matrix(y_true, y_pred, class_names=None):
    """Draw the confusion matrix as a heatmap and print per-class metrics.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        class_names: Optional display names, one per class. When omitted,
            names are generated from the number of distinct values in
            ``y_true``.

    Returns:
        The confusion matrix computed by ``confusion_matrix``.
    """
    matrix = confusion_matrix(y_true, y_pred)

    if class_names is None:
        n_labels = len(np.unique(y_true))
        class_names = [f'类别 {idx}' for idx in range(n_labels)]

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('混淆矩阵')
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.show()

    # Per-class precision / recall / F1 / support, keyed by display name
    print("\n=== 各类别详细指标 ===")
    per_class = classification_report(
        y_true, y_pred, target_names=class_names, output_dict=True
    )
    for label in class_names:
        row = per_class[label]
        print(f"{label}:")
        print(f" 精确率: {row['precision']:.4f}")
        print(f" 召回率: {row['recall']:.4f}")
        print(f" F1分数: {row['f1-score']:.4f}")
        print(f" 支持度: {row['support']}")

    return matrix
# 绘制混淆矩阵
cm = plot_confusion_matrix(y_test, y_pred, ['类别A', '类别B', '类别C'])

3. ROC曲线和AUC
python
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
from itertools import cycle
def plot_roc_curves(y_true, y_pred_proba, class_names=None):
    """Plot one-vs-rest ROC curves per class plus the micro-average, and print AUCs.

    Args:
        y_true: Ground-truth labels. They are binarised against
            ``range(n_classes)``, so they are assumed to be the integers
            ``0 .. n_classes - 1``.
        y_pred_proba: 2-D array of per-class scores; column ``i`` is used as
            the score for class ``i``.
        class_names: Optional display names, one per column of
            ``y_pred_proba``.

    Returns:
        dict mapping each class index (and the key ``"micro"``) to its AUC.
    """
    n_classes = y_pred_proba.shape[1]
    if class_names is None:
        class_names = [f'类别 {i}' for i in range(n_classes)]

    # One indicator column per class
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    if n_classes == 2:
        # label_binarize yields a single column for binary targets; expand it
        # so that column i lines up with y_pred_proba[:, i].
        y_true_bin = np.hstack((1 - y_true_bin, y_true_bin))

    # Per-class ROC curve and AUC
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve
    fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    plt.figure(figsize=(10, 8))
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'green'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'{class_names[i]} (AUC = {roc_auc[i]:.2f})')
    plt.plot(fpr["micro"], tpr["micro"], color='deeppink', linestyle=':', linewidth=4,
             label=f'微平均 (AUC = {roc_auc["micro"]:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2, label='随机分类器')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假正率 (False Positive Rate)')
    plt.ylabel('真正率 (True Positive Rate)')
    plt.title('ROC曲线')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

    print("=== AUC 分数 ===")
    for i, class_name in enumerate(class_names):
        print(f"{class_name}: {roc_auc[i]:.4f}")
    print(f"微平均: {roc_auc['micro']:.4f}")

    # Aggregate one-vs-rest AUC. For a binary problem roc_auc_score needs the
    # 1-D positive-class scores rather than the two-column probability matrix.
    auc_input = y_pred_proba[:, 1] if n_classes == 2 else y_pred_proba
    try:
        macro_auc = roc_auc_score(y_true, auc_input, multi_class='ovr', average='macro')
        weighted_auc = roc_auc_score(y_true, auc_input, multi_class='ovr', average='weighted')
    except ValueError as exc:
        # Only input-validation failures are treated as "cannot compute";
        # anything else is a real bug and should propagate.
        print(f"无法计算多类别AUC: {exc}")
    else:
        print(f"宏平均 AUC: {macro_auc:.4f}")
        print(f"加权平均 AUC: {weighted_auc:.4f}")

    return roc_auc
# 绘制ROC曲线
roc_results = plot_roc_curves(y_test, y_pred_proba, ['类别A', '类别B', '类别C'])

4. 精确率-召回率曲线
python
from sklearn.metrics import precision_recall_curve, average_precision_score
def plot_precision_recall_curves(y_true, y_pred_proba, class_names=None):
    """Plot one-vs-rest precision-recall curves plus the micro-average, and print APs.

    Args:
        y_true: Ground-truth labels. They are binarised against
            ``range(n_classes)``, so they are assumed to be the integers
            ``0 .. n_classes - 1``.
        y_pred_proba: 2-D array of per-class scores; column ``i`` is used as
            the score for class ``i``.
        class_names: Optional display names, one per column of
            ``y_pred_proba``.

    Returns:
        dict mapping each class index (and the key ``"micro"``) to its
        average precision.
    """
    n_classes = y_pred_proba.shape[1]
    if class_names is None:
        class_names = [f'类别 {i}' for i in range(n_classes)]

    # One indicator column per class
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    if n_classes == 2:
        # label_binarize yields a single column for binary targets; expand it
        # so that column i lines up with y_pred_proba[:, i].
        y_true_bin = np.hstack((1 - y_true_bin, y_true_bin))

    # Per-class PR curve and average precision
    precision = {}
    recall = {}
    average_precision = {}
    for i in range(n_classes):
        precision[i], recall[i], _ = precision_recall_curve(y_true_bin[:, i], y_pred_proba[:, i])
        average_precision[i] = average_precision_score(y_true_bin[:, i], y_pred_proba[:, i])

    # Micro-average: pool every (sample, class) decision into one curve
    precision["micro"], recall["micro"], _ = precision_recall_curve(
        y_true_bin.ravel(), y_pred_proba.ravel()
    )
    average_precision["micro"] = average_precision_score(y_true_bin, y_pred_proba, average="micro")

    plt.figure(figsize=(10, 8))
    colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(recall[i], precision[i], color=color, lw=2,
                 label=f'{class_names[i]} (AP = {average_precision[i]:.2f})')
    plt.plot(recall["micro"], precision["micro"], color='gold', linestyle=':', linewidth=4,
             label=f'微平均 (AP = {average_precision["micro"]:.2f})')
    plt.xlabel('召回率 (Recall)')
    plt.ylabel('精确率 (Precision)')
    plt.title('精确率-召回率曲线')
    plt.legend(loc="lower left")
    plt.grid(True)
    plt.show()

    print("=== 平均精确率 (Average Precision) ===")
    for i, class_name in enumerate(class_names):
        print(f"{class_name}: {average_precision[i]:.4f}")
    print(f"微平均: {average_precision['micro']:.4f}")

    return average_precision
# 绘制PR曲线
pr_results = plot_precision_recall_curves(y_test, y_pred_proba, ['类别A', '类别B', '类别C'])

回归问题评估指标
python
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
mean_squared_error, mean_absolute_error, r2_score,
mean_absolute_percentage_error, explained_variance_score
)
# Build a synthetic regression dataset and hold out 20% of it for testing
X_reg, y_reg = make_regression(
    n_samples=1000,
    n_features=10,
    noise=10,
    random_state=42,
)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42,
)

# Fit the regressor and predict on the held-out split
reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X_train_reg, y_train_reg)
y_pred_reg = reg_model.predict(X_test_reg)
def calculate_regression_metrics(y_true, y_pred):
    """Compute and print standard regression metrics plus a residual summary.

    Args:
        y_true: Ground-truth target values (array-like supporting ``-``).
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        dict with keys ``mse``, ``rmse``, ``mae``, ``r2``, ``mape`` (expressed
        in percent), ``explained_variance`` and ``residuals``
        (``y_true - y_pred``).
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    try:
        # scikit-learn reports MAPE as a fraction (1.0 == 100%); scale it so
        # it matches both the "%" in the printout and the fallback below.
        mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    except NameError:
        # Fallback for when the scikit-learn helper is not in scope
        # (presumably an older release without it -- verify).
        mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    explained_var = explained_variance_score(y_true, y_pred)

    print("=== 回归评估指标 ===")
    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"均方根误差 (RMSE): {rmse:.4f}")
    print(f"平均绝对误差 (MAE): {mae:.4f}")
    print(f"决定系数 (R²): {r2:.4f}")
    print(f"平均绝对百分比误差 (MAPE): {mape:.4f}%")
    print(f"解释方差分数: {explained_var:.4f}")

    # Residual summary
    residuals = y_true - y_pred
    print(f"\n=== 残差分析 ===")
    print(f"残差均值: {np.mean(residuals):.4f}")
    print(f"残差标准差: {np.std(residuals):.4f}")
    print(f"残差最大值: {np.max(residuals):.4f}")
    print(f"残差最小值: {np.min(residuals):.4f}")

    return {
        'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2,
        'mape': mape, 'explained_variance': explained_var,
        'residuals': residuals
    }
# 计算回归指标
reg_metrics = calculate_regression_metrics(y_test_reg, y_pred_reg)

回归可视化分析
python
def plot_regression_analysis(y_true, y_pred, metrics):
    """Draw a 2x2 diagnostic figure for a regression model.

    Panels: predicted vs. true values, residuals vs. predictions, residual
    histogram, and a normal Q-Q plot of the residuals.

    Args:
        y_true: Ground-truth target values (must provide ``.min()``/``.max()``).
        y_pred: Predicted target values.
        metrics: Mapping that provides at least the keys ``"r2"`` and
            ``"residuals"``.
    """
    from scipy import stats

    _, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_fit, ax_resid), (ax_hist, ax_qq) = axes
    residuals = metrics['residuals']

    # 1. Predicted vs. true values, with the y = x reference line
    lower, upper = y_true.min(), y_true.max()
    ax_fit.scatter(y_true, y_pred, alpha=0.6)
    ax_fit.plot([lower, upper], [lower, upper], 'r--', lw=2)
    ax_fit.set_xlabel('真实值')
    ax_fit.set_ylabel('预测值')
    ax_fit.set_title(f'预测值 vs 真实值 (R² = {metrics["r2"]:.3f})')
    ax_fit.grid(True)

    # 2. Residuals against predictions
    ax_resid.scatter(y_pred, residuals, alpha=0.6)
    ax_resid.axhline(y=0, color='r', linestyle='--')
    ax_resid.set_xlabel('预测值')
    ax_resid.set_ylabel('残差')
    ax_resid.set_title('残差图')
    ax_resid.grid(True)

    # 3. Residual histogram
    ax_hist.hist(residuals, bins=30, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('残差')
    ax_hist.set_ylabel('频数')
    ax_hist.set_title('残差分布')
    ax_hist.grid(True)

    # 4. Q-Q plot against the normal distribution
    stats.probplot(residuals, dist="norm", plot=ax_qq)
    ax_qq.set_title('残差Q-Q图')
    ax_qq.grid(True)

    plt.tight_layout()
    plt.show()
# 绘制回归分析图
plot_regression_analysis(y_test_reg, y_pred_reg, reg_metrics)

聚类评估指标
python
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import (
adjusted_rand_score, normalized_mutual_info_score,
silhouette_score, calinski_harabasz_score, davies_bouldin_score
)
# Generate four 2-D blobs to cluster
X_cluster, y_cluster_true = make_blobs(
    n_samples=300,
    n_features=2,
    centers=4,
    cluster_std=0.60,
    random_state=42,
)

# Fit K-Means and keep the cluster assigned to each sample
kmeans = KMeans(n_clusters=4, random_state=42)
y_cluster_pred = kmeans.fit_predict(X_cluster)
def calculate_clustering_metrics(X, y_true, y_pred):
    """Compute and print external and internal clustering quality scores.

    Args:
        X: Feature matrix that was clustered.
        y_true: Reference cluster labels (used by ARI and NMI only).
        y_pred: Cluster labels assigned by the algorithm.

    Returns:
        dict with keys ``ari``, ``nmi``, ``silhouette``,
        ``calinski_harabasz`` and ``davies_bouldin``.
    """
    scores = {
        # External scores: compare the assignment with reference labels
        'ari': adjusted_rand_score(y_true, y_pred),
        'nmi': normalized_mutual_info_score(y_true, y_pred),
        # Internal scores: use only the data and the assignment
        'silhouette': silhouette_score(X, y_pred),
        'calinski_harabasz': calinski_harabasz_score(X, y_pred),
        'davies_bouldin': davies_bouldin_score(X, y_pred),
    }

    print("=== 聚类评估指标 ===")
    print(f"调整兰德指数 (ARI): {scores['ari']:.4f}")
    print(f"标准化互信息 (NMI): {scores['nmi']:.4f}")
    print(f"轮廓系数 (Silhouette): {scores['silhouette']:.4f}")
    print(f"Calinski-Harabasz指数: {scores['calinski_harabasz']:.4f}")
    print(f"Davies-Bouldin指数: {scores['davies_bouldin']:.4f}")

    return scores
# Compute the clustering metrics for the K-Means assignment
cluster_metrics = calculate_clustering_metrics(X_cluster, y_cluster_true, y_cluster_pred)
# Visualise the clustering result
def plot_clustering_results(X, y_true, y_pred):
    """Show reference and predicted cluster labels side by side.

    Args:
        X: Feature matrix; its first two columns are used as the x/y
            coordinates of the scatter plots.
        y_true: Reference cluster labels (left panel colours).
        y_pred: Predicted cluster labels (right panel colours).
    """
    _, axes = plt.subplots(1, 2, figsize=(12, 5))

    panels = ((y_true, '真实聚类'), (y_pred, '预测聚类'))
    for ax, (labels, title) in zip(axes, panels):
        ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
        ax.set_title(title)
        ax.set_xlabel('特征1')
        ax.set_ylabel('特征2')

    plt.tight_layout()
    plt.show()
plot_clustering_results(X_cluster, y_cluster_true, y_cluster_pred)

交叉验证评估
python
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer
def comprehensive_cross_validation(X, y, model, problem_type='classification'):
    """Cross-validate ``model`` on several metrics and print a summary.

    Args:
        X: Feature matrix.
        y: Target vector.
        model: Estimator passed to ``cross_validate``.
        problem_type: ``'classification'`` selects accuracy and macro
            precision/recall/F1 on a shuffled stratified 5-fold split; any
            other value selects R2, negative MSE and negative MAE with
            ``cv=5``.

    Returns:
        The dict returned by ``cross_validate`` (train scores included).
    """
    if problem_type == 'classification':
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, average='macro'),
            'recall': make_scorer(recall_score, average='macro'),
            'f1': make_scorer(f1_score, average='macro'),
        }
    else:
        cv = 5
        scoring = {
            'r2': 'r2',
            'neg_mse': 'neg_mean_squared_error',
            'neg_mae': 'neg_mean_absolute_error',
        }

    cv_results = cross_validate(
        model, X, y,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
        n_jobs=-1,
    )

    n_folds = cv if isinstance(cv, int) else cv.n_splits
    print(f"=== {problem_type.upper()} 交叉验证结果 ===")
    print(f"交叉验证折数: {n_folds}")

    for metric in scoring:
        held_out = cv_results[f'test_{metric}']
        fitted = cv_results[f'train_{metric}']
        # Gap between mean train and mean test score
        gap = fitted.mean() - held_out.mean()
        print(f"\n{metric.upper()}:")
        print(f" 测试集: {held_out.mean():.4f} (+/- {held_out.std() * 2:.4f})")
        print(f" 训练集: {fitted.mean():.4f} (+/- {fitted.std() * 2:.4f})")
        print(f" 过拟合程度: {gap:.4f}")

    # Per-fold timing reported by cross_validate
    fit_times = cv_results['fit_time']
    score_times = cv_results['score_time']
    print(f"\n时间分析:")
    print(f" 平均训练时间: {fit_times.mean():.4f}s (+/- {fit_times.std() * 2:.4f}s)")
    print(f" 平均评估时间: {score_times.mean():.4f}s (+/- {score_times.std() * 2:.4f}s)")

    return cv_results
# Cross-validate the classification model on the 3-class dataset (X, y)
print("=== 分类模型评估 ===")
clf_cv_results = comprehensive_cross_validation(
X, y, RandomForestClassifier(n_estimators=100, random_state=42), 'classification'
)
# Visual separator between the two reports
print("\n" + "="*50)
# Cross-validate the regression model
print("=== 回归模型评估 ===")
reg_cv_results = comprehensive_cross_validation(
X_reg, y_reg, RandomForestRegressor(n_estimators=100, random_state=42), 'regression'
)

模型比较和统计检验
python
from scipy import stats
from sklearn.model_selection import cross_val_score
def compare_models_statistically(X, y, models, model_names, cv=5, scoring='accuracy'):
    """Cross-validate several models and compare them with paired t-tests.

    Args:
        X: Feature matrix.
        y: Target vector.
        models: Estimators to evaluate.
        model_names: Display names, one per entry of ``models``.
        cv: Cross-validation strategy forwarded to ``cross_val_score``.
        scoring: Scoring name or callable forwarded to ``cross_val_score``.

    Returns:
        Tuple ``(results_df, p_values_df)``: per-fold scores with one column
        per model, and a symmetric model-by-model table of paired t-test
        p-values whose diagonal is NaN (a model is not tested against itself).
    """
    # One array of per-fold scores per model.
    # NOTE(review): the paired test assumes every model is scored on the same
    # folds, i.e. that ``cv`` produces identical splits on each call -- confirm
    # when passing a shuffling splitter without a fixed seed.
    all_scores = [
        cross_val_score(model, X, y, cv=cv, scoring=scoring)
        for model in models
    ]

    results_df = pd.DataFrame(all_scores, index=model_names).T
    print(f"=== 模型性能比较 ({scoring}) ===")
    print(results_df.describe())

    print(f"\n=== 配对t检验 (p值) ===")
    n_models = len(models)
    p_values = np.zeros((n_models, n_models))
    for i in range(n_models):
        # A 0.0 on the diagonal would read as "significantly different from
        # itself"; mark the self-comparison as undefined instead.
        p_values[i, i] = np.nan
        for j in range(i + 1, n_models):
            # The two-sided p-value does not depend on argument order, so each
            # pair is tested once and mirrored.
            _, p_value = stats.ttest_rel(all_scores[i], all_scores[j])
            p_values[i, j] = p_value
            p_values[j, i] = p_value
    p_values_df = pd.DataFrame(p_values, index=model_names, columns=model_names)
    print(p_values_df)

    # Box plot of the per-fold scores
    plt.figure(figsize=(10, 6))
    results_df.boxplot()
    plt.title(f'模型性能比较 ({scoring})')
    plt.ylabel(scoring)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    return results_df, p_values_df
# Compare several classification models.
# GradientBoostingClassifier and SVC are not imported by the earlier snippets,
# so bring them into scope here to avoid a NameError.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

models_to_compare = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoostingClassifier(n_estimators=100, random_state=42),
    SVC(random_state=42)
]
model_names = ['随机森林', '梯度提升', 'SVM']
comparison_results, p_values = compare_models_statistically(
X, y, models_to_compare, model_names, cv=5, scoring='accuracy'
)

自定义评估指标
python
from sklearn.metrics import make_scorer
def custom_business_metric(y_true, y_pred):
    """Return the total business value of a set of binary predictions.

    Example payoff scheme (labels are 1 = positive, 0 = negative):
      - true positive:  +10
      - true negative:  +1
      - false positive: -5
      - false negative: -2

    Args:
        y_true: Ground-truth binary labels (any array-like, including lists).
        y_pred: Predicted binary labels, same length as ``y_true``.

    Returns:
        The summed payoff over all samples.
    """
    # Coerce to arrays so the comparisons below are element-wise; with plain
    # lists ``y_true == 1`` is a single False and every count would be 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = np.sum((y_true == 1) & (y_pred == 1))  # true positives
    tn = np.sum((y_true == 0) & (y_pred == 0))  # true negatives
    fp = np.sum((y_true == 0) & (y_pred == 1))  # false positives
    fn = np.sum((y_true == 1) & (y_pred == 0))  # false negatives

    business_value = tp * 10 + tn * 1 + fp * (-5) + fn * (-2)
    return business_value
# Build a binary classification dataset for the demonstration
X_binary, y_binary = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)

# Wrap the business metric as a scorer (higher is better)
business_scorer = make_scorer(custom_business_metric, greater_is_better=True)

# Evaluate the model with the custom metric
model_binary = RandomForestClassifier(n_estimators=100, random_state=42)
business_scores = cross_val_score(
    model_binary, X_binary, y_binary, cv=5, scoring=business_scorer,
)
print("=== 自定义业务指标评估 ===")
print(f"业务价值分数: {business_scores.mean():.2f} (+/- {business_scores.std() * 2:.2f})")

# Same model and folds scored with plain accuracy, for comparison
accuracy_scores = cross_val_score(
    model_binary, X_binary, y_binary, cv=5, scoring='accuracy',
)
print(f"准确率: {accuracy_scores.mean():.4f} (+/- {accuracy_scores.std() * 2:.4f})")

评估指标选择指南
python
def metric_selection_guide():
    """Print a text decision tree for choosing evaluation metrics.

    The tree covers classification, regression and clustering problems.
    Returns nothing; the guide is written to standard output.
    """
    # The literal is flush-left on purpose: its text is printed verbatim.
    print("""
=== 评估指标选择指南 ===
分类问题:
├── 平衡数据集
│ ├── 整体性能 → 准确率 (Accuracy)
│ ├── 各类别性能 → 宏平均 F1分数
│ └── 概率预测 → AUC-ROC
│
├── 不平衡数据集
│ ├── 关注少数类 → 召回率, AUC-PR
│ ├── 精确预测 → 精确率
│ └── 平衡考虑 → F1分数, 加权平均指标
│
├── 多类别问题
│ ├── 宏平均 → 各类别等权重
│ ├── 微平均 → 样本等权重
│ └── 加权平均 → 按类别样本数加权
│
└── 业务场景
├── 医疗诊断 → 召回率 (避免漏诊)
├── 垃圾邮件 → 精确率 (避免误判)
└── 推荐系统 → AUC, Top-K准确率
回归问题:
├── 误差大小
│ ├── 平均误差 → MAE
│ ├── 大误差敏感 → MSE, RMSE
│ └── 相对误差 → MAPE
│
├── 解释性
│ ├── 拟合优度 → R²
│ ├── 方差解释 → 解释方差分数
│ └── 基线比较 → 相对改进
│
└── 业务场景
├── 价格预测 → MAPE (相对误差重要)
├── 销量预测 → MAE (绝对误差重要)
└── 风险评估 → MSE (大误差代价高)
聚类问题:
├── 有真实标签
│ ├── 聚类质量 → ARI, NMI
│ └── 标签一致性 → 调整互信息
│
├── 无真实标签
│ ├── 簇内紧密度 → 轮廓系数
│ ├── 簇间分离度 → Calinski-Harabasz指数
│ └── 簇的紧凑性 → Davies-Bouldin指数
│
└── 选择聚类数
├── 肘部法则 → 簇内平方和
├── 轮廓分析 → 轮廓系数
└── Gap统计 → 与随机数据比较
""")
# 显示指标选择指南
metric_selection_guide()

总结
选择合适的评估指标是机器学习项目成功的关键:
关键原则:
- 问题导向:根据具体问题类型选择指标
- 业务相关:考虑实际业务场景和成本
- 数据特征:考虑数据平衡性、噪声等
- 多指标评估:使用多个指标全面评估
- 统计显著性:进行统计检验确保结果可靠
常用组合:
- 分类:准确率 + F1分数 + AUC
- 回归:R² + RMSE + MAE
- 聚类:轮廓系数 + Calinski-Harabasz指数
注意事项:
- 避免在测试集上反复调优
- 使用交叉验证获得稳定估计
- 考虑计算成本和解释性
- 根据业务需求自定义指标
下一章我们将学习管道与工作流,了解如何构建高效的机器学习管道。