第5章:逻辑回归实战
逻辑回归是机器学习中最重要的分类算法之一。尽管名字中有"回归",但它实际上是一个分类算法。本章将深入探讨逻辑回归的原理、实现和应用。
5.1 什么是逻辑回归?
逻辑回归使用逻辑函数(Sigmoid函数)来建模二分类问题的概率。它不直接预测类别,而是预测样本属于某个类别的概率。
5.1.1 数学原理
Sigmoid函数:
σ(z) = 1 / (1 + e^(-z))
其中 z = β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ
概率预测:
P(y=1|x) = σ(β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ)
P(y=0|x) = 1 - P(y=1|x)
决策边界:
- 当 P(y=1|x) ≥ 0.5 时,预测为类别1
- 当 P(y=1|x) < 0.5 时,预测为类别0
5.1.2 与线性回归的区别
| 特征 | 线性回归 | 逻辑回归 |
|---|---|---|
| 目标 | 预测连续值 | 预测概率/分类 |
| 输出范围 | (-∞, +∞) | (0, 1) |
| 激活函数 | 无 | Sigmoid |
| 损失函数 | 均方误差 | 对数损失(负对数似然) |
5.2 准备环境和数据
python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_curve, auc,
precision_recall_curve, log_loss
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
# 设置随机种子
np.random.seed(42)
# 设置图形样式
plt.style.use('seaborn-v0_8')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
5.3 二分类逻辑回归
5.3.1 生成二分类数据
python
# Generate a synthetic two-feature binary classification data set.
X_binary, y_binary = make_classification(
    n_samples=1000,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    n_clusters_per_class=1,
    random_state=42,
)

# Wrap the arrays in a DataFrame for easier inspection.
df_binary = pd.DataFrame(X_binary, columns=['特征1', '特征2'])
df_binary['标签'] = y_binary

print("二分类数据集信息:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df_binary.info()
print("\n类别分布:")
print(df_binary['标签'].value_counts())

# Scatter plot of the samples, one colour per class label.
plt.figure(figsize=(10, 8))
colors = ['red', 'blue']
for label, color in zip([0, 1], colors):
    mask = y_binary == label
    plt.scatter(X_binary[mask, 0], X_binary[mask, 1],
                c=color, label=f'类别 {label}', alpha=0.7)
plt.xlabel('特征1')
plt.ylabel('特征2')
plt.title('二分类数据分布')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
5.3.2 训练二分类逻辑回归模型
python
# Hold out 20% of the samples for testing; stratify keeps the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_binary,
    y_binary,
    test_size=0.2,
    stratify=y_binary,
    random_state=42,
)

# Standardise the features with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression model on the standardised training data.
logistic_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Report the fitted intercept and coefficient vector.
print("逻辑回归模型参数:")
print(f"截距: {logistic_model.intercept_[0]:.4f}")
print(f"系数: {logistic_model.coef_[0]}")

# Hard labels and per-class probabilities for the test split.
y_pred = logistic_model.predict(X_test_scaled)
y_pred_proba = logistic_model.predict_proba(X_test_scaled)
print(f"\n预测示例(前5个样本):")
for i in range(5):
print(f"样本 {i+1}: 真实={y_test[i]}, 预测={y_pred[i]}, "
f"概率=[{y_pred_proba[i][0]:.3f}, {y_pred_proba[i][1]:.3f}]")
5.3.3 决策边界可视化
python
def plot_decision_boundary(X, y, model, scaler=None, title="决策边界"):
    """Plot a binary classifier's P(y=1) surface and its 0.5 decision boundary.

    Args:
        X: 2-D array whose first two columns are plotted on the x and y axes.
        y: Array of labels; samples labelled 0 and 1 are drawn.
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Optional fitted transformer. When given, the grid points are
            passed through ``scaler.transform`` before prediction, so ``X`` can
            be supplied in the original (unscaled) feature space.
        title: Title of the figure.
    """
    plt.figure(figsize=(10, 8))

    # Dense evaluation grid covering the data with a margin of 1 on each side.
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Predict P(y=1) on every grid point.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # Explicit None check: the decision to scale must not depend on the
    # truthiness of the transformer object.
    if scaler is not None:
        grid_points = scaler.transform(grid_points)
    Z = model.predict_proba(grid_points)[:, 1].reshape(xx.shape)

    # Filled probability contours plus the P = 0.5 boundary as a dashed line.
    plt.contourf(xx, yy, Z, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='P(y=1)')
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linestyles='--', linewidths=2)

    # Overlay the samples, one colour per class.
    for label, color in zip([0, 1], ['red', 'blue']):
        mask = y == label
        plt.scatter(X[mask, 0], X[mask, 1],
                    c=color, label=f'类别 {label}', alpha=0.7, edgecolors='black')

    plt.xlabel('特征1')
    plt.ylabel('特征2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
# 绘制决策边界
plot_decision_boundary(X_train, y_train, logistic_model, scaler, "逻辑回归决策边界")
5.3.4 Sigmoid函数可视化
python
# Evaluate the sigmoid on 100 evenly spaced points in [-10, 10].
z = np.linspace(-10, 10, 100)
sigmoid = 1 / (1 + np.exp(-z))

plt.figure(figsize=(10, 6))
plt.plot(z, sigmoid, 'b-', linewidth=2, label='Sigmoid函数')
# Dashed guides at the 0.5 probability threshold and at z = 0.
_guide_style = {'linestyle': '--', 'alpha': 0.7}
plt.axhline(y=0.5, color='r', label='决策阈值', **_guide_style)
plt.axvline(x=0, color='g', label='z=0', **_guide_style)
plt.xlabel('z = β₀ + β₁x₁ + β₂x₂')
plt.ylabel('P(y=1|x)')
plt.title('Sigmoid函数')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# For the first ten test samples, pair the linear score z (decision_function)
# with the probability of class 1 that the model derives from it.
sample_features = X_test_scaled[:10]
probabilities = logistic_model.predict_proba(sample_features)[:, 1]
linear_combination = logistic_model.decision_function(sample_features)

print("线性组合到概率的转换示例:")
print("线性组合(z)\t概率P(y=1)\t预测类别")
print("-" * 40)
for i in range(len(sample_features)):
pred_class = 1 if probabilities[i] >= 0.5 else 0
print(f"{linear_combination[i]:8.3f}\t{probabilities[i]:8.3f}\t{pred_class:8d}")
5.4 模型评估
5.4.1 基本评估指标
python
def evaluate_classification_model(y_true, y_pred, y_pred_proba=None, model_name="模型"):
    """Print and return standard classification metrics.

    Precision, recall and F1 are computed with ``average='weighted'``, so the
    same function serves both the binary and the multi-class examples.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_pred_proba: Optional predicted class probabilities; when provided the
            log loss is reported as well.
        model_name: Name shown in the report header.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``log_loss`` (``None`` when ``y_pred_proba`` is not given).
    """
    print(f"{model_name} 评估结果:")
    print("-" * 50)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"准确率 (Accuracy): {accuracy:.4f}")
    print(f"精确率 (Precision): {precision:.4f}")
    print(f"召回率 (Recall): {recall:.4f}")
    print(f"F1得分: {f1:.4f}")

    # Compute the log loss once and reuse it for both the report and the
    # returned dictionary.
    logloss = None
    if y_pred_proba is not None:
        logloss = log_loss(y_true, y_pred_proba)
        print(f"对数损失 (Log Loss): {logloss:.4f}")

    print("\n详细分类报告:")
    print(classification_report(y_true, y_pred))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'log_loss': logloss,
    }
# 评估模型
metrics = evaluate_classification_model(
y_test, y_pred, y_pred_proba, "逻辑回归"
)
5.4.2 混淆矩阵
python
# Confusion matrix of the binary model, drawn as an annotated heat map.
cm = confusion_matrix(y_test, y_pred)
_class_labels = ['类别0', '类别1']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=_class_labels,
    yticklabels=_class_labels,
)
plt.title('混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.show()

# Flattening the 2x2 matrix row by row yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = cm.ravel()
print("混淆矩阵分析:")
for caption, count in (
    ("真负例 (TN)", tn),
    ("假正例 (FP)", fp),
    ("假负例 (FN)", fn),
    ("真正例 (TP)", tp),
):
    print(f"{caption}: {count}")

# Recompute the headline metrics by hand from the four counts.
print(f"\n手动计算的指标:")
print(f"准确率: {(tp + tn) / (tp + tn + fp + fn):.4f}")
print(f"精确率: {tp / (tp + fp):.4f}")
print(f"召回率: {tp / (tp + fn):.4f}")
print(f"特异性: {tn / (tn + fp):.4f}")
5.4.3 ROC曲线和AUC
python
# ROC curve from the predicted probability of class 1.
positive_scores = y_pred_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label=f'ROC曲线 (AUC = {roc_auc:.3f})')
# The diagonal is the reference line of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
        label='随机分类器')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('假正例率 (FPR)')
ax.set_ylabel('真正例率 (TPR)')
ax.set_title('ROC曲线')
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.show()

print(f"AUC得分: {roc_auc:.4f}")
# Precision/recall at roughly ten of the ROC thresholds.
print("\n不同阈值下的性能:")
print("阈值\t\tFPR\t\tTPR\t\t精确率\t\t召回率")
print("-" * 60)
# max(1, ...) keeps the stride valid when roc_curve returns fewer than ten
# thresholds; a stride of 0 would make range() raise ValueError.
stride = max(1, len(thresholds) // 10)
for i in range(0, len(thresholds), stride):
    threshold = thresholds[i]
    y_pred_threshold = (y_pred_proba[:, 1] >= threshold).astype(int)
    # Skip thresholds at which every sample receives the same label, so that
    # precision is never computed with zero predicted positives.
    if len(np.unique(y_pred_threshold)) > 1:
        precision_thresh = precision_score(y_test, y_pred_threshold)
        recall_thresh = recall_score(y_test, y_pred_threshold)
        print(f"{threshold:.3f}\t\t{fpr[i]:.3f}\t\t{tpr[i]:.3f}\t\t{precision_thresh:.3f}\t\t{recall_thresh:.3f}")

# 5.4.4 精确率-召回率曲线
python
# Precision-recall curve from the predicted probability of class 1.
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(
    y_test, y_pred_proba[:, 1]
)
pr_auc = auc(recall_curve, precision_curve)

# Reference level: the fraction of positive samples in the test split.
baseline = np.sum(y_test) / len(y_test)

plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.plot(recall_curve, precision_curve, color='blue', lw=2,
        label=f'PR曲线 (AUC = {pr_auc:.3f})')
ax.axhline(y=baseline, color='red', linestyle='--',
           label=f'随机分类器 (精确率 = {baseline:.3f})')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('召回率')
ax.set_ylabel('精确率')
ax.set_title('精确率-召回率曲线')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()
print(f"PR-AUC得分: {pr_auc:.4f}")
5.5 多分类逻辑回归
5.5.1 加载多分类数据
python
# Load the wine data set bundled with scikit-learn.
wine_data = load_wine()
X_wine, y_wine = wine_data.data, wine_data.target
feature_names_wine = wine_data.feature_names
target_names_wine = wine_data.target_names

# Distinct labels and how many samples carry each of them.
unique, counts = np.unique(y_wine, return_counts=True)

print("葡萄酒数据集信息:")
print(f"样本数: {X_wine.shape[0]}")
print(f"特征数: {X_wine.shape[1]}")
print(f"类别数: {len(unique)}")
print(f"类别名称: {target_names_wine}")

# Bar chart of the class distribution.
plt.figure(figsize=(8, 6))
plt.bar(target_names_wine, counts, color=['red', 'green', 'blue'], alpha=0.7)
plt.title('葡萄酒数据集类别分布')
plt.xlabel('葡萄酒类型')
plt.ylabel('样本数量')
plt.show()
for i, name in enumerate(target_names_wine):
print(f"{name}: {counts[i]} 样本")
5.5.2 特征分析
python
# Put features and target into one DataFrame for analysis.
df_wine = pd.DataFrame(X_wine, columns=feature_names_wine)
df_wine['wine_type'] = y_wine

# Four features picked for visual inspection.
important_features = ['alcohol', 'flavanoids', 'color_intensity', 'proline']

# One histogram panel per feature, with the three wine types overlaid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('重要特征的分布', fontsize=16)
# axes.ravel() walks the 2x2 grid row by row, i.e. panel k is (k // 2, k % 2).
for ax, feature in zip(axes.ravel(), important_features):
    for wine_type in range(3):
        values = df_wine.loc[df_wine['wine_type'] == wine_type, feature]
        ax.hist(values, alpha=0.6, label=target_names_wine[wine_type], bins=15)
    ax.set_title(feature)
    ax.set_xlabel(feature)
    ax.set_ylabel('频次')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Correlation heat map of the selected features together with the target.
plt.figure(figsize=(12, 10))
correlation_columns = important_features + ['wine_type']
correlation_matrix = df_wine[correlation_columns].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('重要特征相关性矩阵')
plt.tight_layout()
plt.show()
5.5.3 训练多分类逻辑回归
python
# Hold out 20% of the wine samples for testing; stratify keeps the class ratio.
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=42, stratify=y_wine
)

# Standardise the features with statistics learned from the training split only.
scaler_wine = StandardScaler()
X_train_wine_scaled = scaler_wine.fit_transform(X_train_wine)
X_test_wine_scaled = scaler_wine.transform(X_test_wine)

# Multinomial (softmax) logistic regression.
# The explicit multi_class='multinomial' argument is omitted on purpose: the
# parameter is deprecated since scikit-learn 1.5, and with the lbfgs solver a
# problem with three or more classes is fitted as a multinomial model by
# default. A One-vs-Rest model is obtained by wrapping the estimator in
# sklearn.multiclass.OneVsRestClassifier instead.
logistic_multi = LogisticRegression(
    solver='lbfgs',
    random_state=42,
    max_iter=1000,
)
logistic_multi.fit(X_train_wine_scaled, y_train_wine)

print("多分类逻辑回归模型信息:")
print(f"类别数: {len(logistic_multi.classes_)}")
print(f"系数矩阵形状: {logistic_multi.coef_.shape}")
print(f"截距: {logistic_multi.intercept_}")

# Hard labels and per-class probabilities for the test split.
y_pred_wine = logistic_multi.predict(X_test_wine_scaled)
y_pred_proba_wine = logistic_multi.predict_proba(X_test_wine_scaled)
# 评估
wine_metrics = evaluate_classification_model(
y_test_wine, y_pred_wine, y_pred_proba_wine, "多分类逻辑回归"
)
5.5.4 多分类混淆矩阵
python
# Confusion matrix of the multi-class model, labelled with the wine class names.
cm_wine = confusion_matrix(y_test_wine, y_pred_wine)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_wine,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names_wine,
    yticklabels=target_names_wine,
)
plt.title('多分类混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.show()
# 每个类别的性能
print("各类别详细性能:")
for i, class_name in enumerate(target_names_wine):
class_precision = precision_score(y_test_wine, y_pred_wine,
labels=[i], average=None)[0]
class_recall = recall_score(y_test_wine, y_pred_wine,
labels=[i], average=None)[0]
class_f1 = f1_score(y_test_wine, y_pred_wine,
labels=[i], average=None)[0]
print(f"{class_name}:")
print(f" 精确率: {class_precision:.4f}")
print(f" 召回率: {class_recall:.4f}")
print(f" F1得分: {class_f1:.4f}")
5.5.5 One-vs-Rest vs Multinomial比较
python
# Compare the One-vs-Rest and multinomial multi-class strategies.
# LogisticRegression's multi_class parameter is deprecated since scikit-learn
# 1.5, so One-vs-Rest is built with OneVsRestClassifier, while the plain lbfgs
# estimator already fits a multinomial model for three or more classes.
from sklearn.multiclass import OneVsRestClassifier


def _make_base_logistic():
    """Return a fresh lbfgs LogisticRegression with the settings used here."""
    return LogisticRegression(solver='lbfgs', random_state=42, max_iter=1000)


strategy_models = {
    'ovr': OneVsRestClassifier(_make_base_logistic()),
    'multinomial': _make_base_logistic(),
}
strategies = list(strategy_models)
strategy_results = {}

for strategy, strategy_model in strategy_models.items():
    strategy_model.fit(X_train_wine_scaled, y_train_wine)
    # Dedicated names are used so the earlier global y_pred / accuracy / f1
    # values are not overwritten by this comparison.
    strategy_pred = strategy_model.predict(X_test_wine_scaled)
    strategy_accuracy = accuracy_score(y_test_wine, strategy_pred)
    strategy_f1 = f1_score(y_test_wine, strategy_pred, average='weighted')
    strategy_results[strategy] = {'accuracy': strategy_accuracy, 'f1': strategy_f1}
    print(f"{strategy.upper()} 策略:")
    print(f" 准确率: {strategy_accuracy:.4f}")
    print(f" F1得分: {strategy_f1:.4f}")
    print()

# Side-by-side bar charts: one row per strategy, one panel per metric.
strategies_df = pd.DataFrame(strategy_results).T
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
strategies_df['accuracy'].plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('准确率比较')
axes[0].set_ylabel('准确率')
axes[0].tick_params(axis='x', rotation=0)
strategies_df['f1'].plot(kind='bar', ax=axes[1], color='lightcoral')
axes[1].set_title('F1得分比较')
axes[1].set_ylabel('F1得分')
axes[1].tick_params(axis='x', rotation=0)
plt.tight_layout()
plt.show()
5.6 正则化逻辑回归
5.6.1 L1和L2正则化
python
# Build a 50-feature data set (10 informative, 10 redundant) to show the
# effect of regularization.
X_high_dim, y_high_dim = make_classification(
    n_samples=500,
    n_features=50,
    n_informative=10,
    n_redundant=10,
    n_clusters_per_class=1,
    random_state=42
)
X_train_hd, X_test_hd, y_train_hd, y_test_hd = train_test_split(
    X_high_dim, y_high_dim, test_size=0.2, random_state=42
)

# Standardise with statistics learned from the training split only.
scaler_hd = StandardScaler()
X_train_hd_scaled = scaler_hd.fit_transform(X_train_hd)
X_test_hd_scaled = scaler_hd.transform(X_test_hd)

# Regularization settings to compare; 'none' is only the label used in the
# results table, the estimator itself receives penalty=None.
penalties = ['none', 'l1', 'l2', 'elasticnet']
C_values = [0.01, 0.1, 1, 10, 100]
results = {}


def _test_accuracy(estimator):
    """Fit on the scaled training split and return accuracy on the test split."""
    estimator.fit(X_train_hd_scaled, y_train_hd)
    return accuracy_score(y_test_hd, estimator.predict(X_test_hd_scaled))


for penalty in penalties:
    if penalty == 'none':
        # The string 'none' was deprecated in scikit-learn 1.2 and is rejected
        # by current releases; None is the supported way to disable the penalty.
        unregularized = LogisticRegression(penalty=None, solver='lbfgs',
                                           random_state=42, max_iter=1000)
        results[penalty] = _test_accuracy(unregularized)
    elif penalty == 'elasticnet':
        elastic = LogisticRegression(penalty=penalty, solver='saga',
                                     C=1.0, l1_ratio=0.5,
                                     random_state=42, max_iter=1000)
        results[penalty] = _test_accuracy(elastic)
    else:
        # liblinear supports the L1 penalty; lbfgs is used for L2.
        solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
        # Choose C by 5-fold cross-validation on the training split. Picking the
        # C with the best *test* accuracy would tune on the test set and make
        # the reported score optimistic.
        best_cv_score = -np.inf
        best_C = None
        for C in C_values:
            candidate = LogisticRegression(penalty=penalty, C=C, solver=solver,
                                           random_state=42, max_iter=1000)
            cv_score = cross_val_score(candidate, X_train_hd_scaled, y_train_hd,
                                       cv=5, scoring='accuracy').mean()
            if cv_score > best_cv_score:
                best_cv_score = cv_score
                best_C = C
        tuned = LogisticRegression(penalty=penalty, C=best_C, solver=solver,
                                   random_state=42, max_iter=1000)
        results[f'{penalty} (C={best_C})'] = _test_accuracy(tuned)
print("正则化方法比较:")
for method, accuracy in results.items():
print(f"{method}: {accuracy:.4f}")
5.6.2 正则化路径可视化
python
from sklearn.linear_model import LogisticRegressionCV
# Grid of inverse regularization strengths shared by the CV search and the plots.
C_range = np.logspace(-4, 2, 20)

# Cross-validated choice of C for the L1 penalty.
l1_model = LogisticRegressionCV(
    penalty='l1',
    solver='liblinear',
    Cs=C_range,
    cv=5,
    random_state=42
)
l1_model.fit(X_train_hd_scaled, y_train_hd)

# Cross-validated choice of C for the L2 penalty.
l2_model = LogisticRegressionCV(
    penalty='l2',
    solver='lbfgs',
    Cs=C_range,
    cv=5,
    random_state=42
)
l2_model.fit(X_train_hd_scaled, y_train_hd)

print(f"L1最佳C: {l1_model.C_[0]:.4f}")
print(f"L2最佳C: {l2_model.C_[0]:.4f}")


def _coefficient_path(penalty, solver):
    """Refit once per value in C_range; one row of coefficients per C."""
    rows = []
    for C in C_range:
        path_model = LogisticRegression(penalty=penalty, C=C, solver=solver,
                                        random_state=42, max_iter=1000)
        path_model.fit(X_train_hd_scaled, y_train_hd)
        rows.append(path_model.coef_[0])
    return np.array(rows)


def _draw_coefficient_path(ax, coefs, title):
    """Plot the coefficients of the first ten features against C on a log axis."""
    for i in range(min(10, coefs.shape[1])):
        ax.plot(C_range, coefs[:, i], label=f'特征{i+1}')
    ax.set_xscale('log')
    ax.set_xlabel('C (正则化强度的倒数)')
    ax.set_ylabel('系数值')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')


# Coefficient paths: L1 in the left panel, L2 in the right panel.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
coefs_l1 = _coefficient_path('l1', 'liblinear')
_draw_coefficient_path(axes[0], coefs_l1, 'L1正则化路径')
coefs_l2 = _coefficient_path('l2', 'lbfgs')
_draw_coefficient_path(axes[1], coefs_l2, 'L2正则化路径')
plt.tight_layout()
plt.show()

# Refit both penalties at their cross-validated C to compare sparsity.
l1_final = LogisticRegression(penalty='l1', C=l1_model.C_[0],
                              solver='liblinear', random_state=42)
l1_final.fit(X_train_hd_scaled, y_train_hd)
l2_final = LogisticRegression(penalty='l2', C=l2_model.C_[0],
                              solver='lbfgs', random_state=42)
l2_final.fit(X_train_hd_scaled, y_train_hd)

l1_coefficients = l1_final.coef_[0]
print(f"L1正则化非零系数数量: {np.sum(l1_coefficients != 0)}/{len(l1_coefficients)}")
print(f"L2正则化非零系数数量: {np.sum(l2_final.coef_[0] != 0)}/{len(l2_final.coef_[0])}")
5.7 超参数调优
5.7.1 网格搜索
python
# Search space: five values of C for each of the L1 and L2 penalties.
# liblinear is the only solver listed because it supports both penalties.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# 5-fold cross-validated grid search scored by accuracy, using all CPU cores.
base_estimator = LogisticRegression(random_state=42, max_iter=1000)
grid_search = GridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_hd_scaled, y_train_hd)

print("网格搜索结果:")
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")

# Accuracy of the best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_hd_scaled)
test_accuracy = accuracy_score(y_test_hd, y_pred_best)
print(f"测试集准确率: {test_accuracy:.4f}")

# Heat map of the mean CV score: one row per penalty, one column per C.
results_df = pd.DataFrame(grid_search.cv_results_)
pivot_table = results_df.pivot_table(
    index='param_penalty',
    columns='param_C',
    values='mean_test_score',
)
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_table, annot=True, cmap='viridis', fmt='.4f')
plt.title('网格搜索结果热力图')
plt.xlabel('C值')
plt.ylabel('正则化类型')
plt.show()
5.7.2 学习曲线分析
python
from sklearn.model_selection import learning_curve
def plot_learning_curve_classification(estimator, X, y, title="学习曲线"):
    """Plot training and validation accuracy against training-set size.

    Runs ``learning_curve`` with 5-fold cross-validation at ten training-set
    fractions from 10% to 100%, then draws the mean accuracy of each curve with
    a shaded band of one standard deviation across folds.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target labels.
        title: Title of the figure.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator,
        X,
        y,
        cv=5,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy',
    )

    plt.figure(figsize=(10, 6))
    curves = (
        (fit_scores, 'blue', '训练得分'),
        (cv_scores, 'red', '验证得分'),
    )
    for scores, color, label in curves:
        # Aggregate over the fold axis: one mean/std per training-set size.
        center = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, center, 'o-', color=color, label=label)
        plt.fill_between(sizes, center - spread, center + spread,
                         alpha=0.1, color=color)

    plt.xlabel('训练样本数')
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
# 绘制最佳模型的学习曲线
plot_learning_curve_classification(
best_model, X_train_hd_scaled, y_train_hd,
"最佳逻辑回归模型学习曲线"
)
5.8 实际应用案例
5.8.1 乳腺癌诊断案例
python
# Load the breast cancer data set bundled with scikit-learn.
cancer_data = load_breast_cancer()
X_cancer, y_cancer = cancer_data.data, cancer_data.target
feature_names_cancer = cancer_data.feature_names
target_names_cancer = cancer_data.target_names

print("乳腺癌数据集信息:")
print(f"样本数: {X_cancer.shape[0]}")
print(f"特征数: {X_cancer.shape[1]}")
print(f"类别: {target_names_cancer}")

# Class distribution; label 1 is reported as benign and label 0 as malignant.
unique, counts = np.unique(y_cancer, return_counts=True)
print(f"良性: {counts[1]} 样本")
print(f"恶性: {counts[0]} 样本")

# Hold out 20% of the samples for testing; stratify keeps the class ratio.
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(
    X_cancer,
    y_cancer,
    test_size=0.2,
    stratify=y_cancer,
    random_state=42,
)

# Scaling and classification in one pipeline, so the scaler is fitted only on
# the data passed to fit().
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
]
cancer_pipeline = Pipeline(pipeline_steps)
cancer_pipeline.fit(X_train_cancer, y_train_cancer)

# Hard labels and per-class probabilities for the test split.
y_pred_cancer = cancer_pipeline.predict(X_test_cancer)
y_pred_proba_cancer = cancer_pipeline.predict_proba(X_test_cancer)

print("\n乳腺癌诊断模型评估:")
cancer_metrics = evaluate_classification_model(
y_test_cancer, y_pred_cancer, y_pred_proba_cancer, "乳腺癌诊断模型"
)
5.8.2 特征重要性分析
python
# Use the absolute value of each fitted coefficient as its importance score.
classifier = cancer_pipeline.named_steps['classifier']
feature_importance = np.abs(classifier.coef_[0])

# Table of features ordered from most to least important.
importance_df = pd.DataFrame(
    {'feature': feature_names_cancer, 'importance': feature_importance}
)
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 largest scores; the y axis is inverted so the
# largest bar is drawn at the top.
top_features = importance_df.head(15)
bar_positions = range(len(top_features))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('特征重要性(系数绝对值)')
plt.title('乳腺癌诊断模型 - 前15个重要特征')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("前10个最重要的特征:")
for i, (_, row) in enumerate(top_features.head(10).iterrows()):
print(f"{i+1:2d}. {row['feature']}: {row['importance']:.4f}")
5.8.3 模型解释和预测示例
python
# 预测新样本
def predict_cancer_diagnosis(model, scaler, sample_features, feature_names):
    """Predict the diagnosis for one sample and print the main feature contributions.

    Args:
        model: Fitted pipeline exposing ``named_steps['classifier']``, a linear
            classifier with ``coef_``, ``predict`` and ``predict_proba``.
        scaler: Fitted transformer used to scale ``sample_features``; it should
            be the scaler the classifier was trained with.
        sample_features: 1-D sequence of raw (unscaled) feature values.
        feature_names: Sequence of feature names aligned with ``sample_features``.

    Returns:
        None. The prediction and the contributions are printed.
    """
    classifier = model.named_steps['classifier']

    # Scale the sample exactly once.
    sample_scaled = scaler.transform([sample_features])

    # Predict with the bare classifier: sample_scaled is already scaled, and
    # sending it through the whole pipeline would apply the scaler a second
    # time and distort the probabilities.
    proba = classifier.predict_proba(sample_scaled)[0]
    prediction = classifier.predict(sample_scaled)[0]

    print("乳腺癌诊断预测结果:")
    print(f"预测类别: {'良性' if prediction == 1 else '恶性'}")
    print(f"恶性概率: {proba[0]:.3f}")
    print(f"良性概率: {proba[1]:.3f}")

    # Per-feature contribution to the linear score: scaled value * coefficient.
    coefficients = classifier.coef_[0]
    print("\n重要特征贡献分析:")
    feature_contributions = sample_scaled[0] * coefficients

    # The five contributions with the largest magnitude, biggest first.
    top_indices = np.argsort(np.abs(feature_contributions))[-5:]
    for idx in reversed(top_indices):
        contribution = feature_contributions[idx]
        # A negative contribution lowers the score of class 1 (benign).
        direction = "支持恶性" if contribution < 0 else "支持良性"
        print(f"{feature_names[idx]}: {contribution:.3f} ({direction})")
# 使用测试集中的一个样本进行演示
sample_idx = 0
sample_features = X_test_cancer[sample_idx]
true_label = y_test_cancer[sample_idx]
print(f"真实标签: {'良性' if true_label == 1 else '恶性'}")
predict_cancer_diagnosis(cancer_pipeline,
cancer_pipeline.named_steps['scaler'],
sample_features,
feature_names_cancer)
5.9 练习题
练习1:基础逻辑回归
- 使用 `make_classification` 生成一个二分类数据集
- 训练逻辑回归模型并绘制决策边界
- 分析不同阈值对分类结果的影响
练习2:多分类问题
- 使用鸢尾花数据集训练多分类逻辑回归
- 比较 One-vs-Rest 和 Multinomial 策略的性能
- 分析每个类别的分类难度
练习3:不平衡数据处理
- 创建一个不平衡的二分类数据集(比例1:9)
- 使用不同的评估指标评估模型性能
- 尝试使用 `class_weight='balanced'` 参数改善性能
练习4:特征选择
- 使用高维数据集(特征数 > 100)
- 比较L1和L2正则化的特征选择效果
- 分析正则化强度对模型性能的影响
5.10 小结
在本章中,我们深入学习了逻辑回归的各个方面:
核心概念
- 逻辑回归原理:Sigmoid函数、概率预测、决策边界
- 多分类策略:One-vs-Rest、Multinomial
- 正则化方法:L1、L2、ElasticNet
主要技术
- 模型训练:二分类、多分类逻辑回归
- 性能评估:准确率、精确率、召回率、F1、AUC
- 可视化技术:ROC曲线、PR曲线、决策边界
- 超参数调优:网格搜索、交叉验证
实践技能
- 数据预处理:标准化、特征选择
- 模型解释:系数分析、特征重要性
- 实际应用:医疗诊断、分类预测
- 性能优化:正则化、阈值调整
关键要点
- 逻辑回归是线性分类器,适用于类别之间大致可以用线性决策边界分开的问题
- Sigmoid函数将线性组合映射到概率空间
- 正则化可以防止过拟合并进行特征选择
- 评估指标的选择取决于具体的业务需求
5.11 下一步
现在你已经掌握了逻辑回归这个重要的分类算法!在下一章决策树算法中,我们将学习一个完全不同的算法——决策树,它具有很好的可解释性,是理解更复杂集成方法的基础。
章节要点回顾:
- ✅ 理解了逻辑回归的数学原理和Sigmoid函数
- ✅ 掌握了二分类和多分类逻辑回归的实现
- ✅ 学会了使用多种评估指标评估分类模型
- ✅ 了解了正则化在逻辑回归中的应用
- ✅ 掌握了ROC曲线和PR曲线的绘制和解释
- ✅ 能够构建完整的分类预测系统