Skip to content

第5章:逻辑回归实战

逻辑回归是机器学习中最重要的分类算法之一。尽管名字中有"回归",但它实际上是一个分类算法。本章将深入探讨逻辑回归的原理、实现和应用。

5.1 什么是逻辑回归?

逻辑回归使用逻辑函数(Sigmoid函数)来建模二分类问题的概率。它不直接预测类别,而是预测样本属于某个类别的概率。

5.1.1 数学原理

Sigmoid函数

σ(z) = 1 / (1 + e^(-z))

其中 z = β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ

概率预测

P(y=1|x) = σ(β₀ + β₁x₁ + β₂x₂ + ... + βₙxₙ)
P(y=0|x) = 1 - P(y=1|x)

决策边界

  • 当 P(y=1|x) ≥ 0.5 时,预测为类别1
  • 当 P(y=1|x) < 0.5 时,预测为类别0

5.1.2 与线性回归的区别

| 特征 | 线性回归 | 逻辑回归 |
| --- | --- | --- |
| 目标 | 预测连续值 | 预测概率/分类 |
| 输出范围 | (-∞, +∞) | [0, 1] |
| 激活函数 | 无(恒等映射) | Sigmoid |
| 损失函数 | 均方误差 | 对数损失(负对数似然) |

5.2 准备环境和数据

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc,
    precision_recall_curve, log_loss
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator so repeated runs give the same numbers.
np.random.seed(42)

# Apply the style sheet first: a style sheet may set its own font list, so the
# font overrides below have to come after it.
plt.style.use('seaborn-v0_8')
# CJK-capable fonts in priority order; matplotlib uses the first one that is
# installed. SimHei stays first so nothing changes where it is available; the
# remaining entries are fallbacks for systems that do not ship SimHei.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Draw the minus sign as ASCII '-' because many CJK fonts lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

5.3 二分类逻辑回归

5.3.1 生成二分类数据

python
# Synthesize a two-class dataset with two informative features.
X_binary, y_binary = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)

# Tabular copy of the data, used only for the summaries printed below.
df_binary = pd.DataFrame(X_binary, columns=['特征1', '特征2'])
df_binary['标签'] = y_binary

print("二分类数据集信息:")
print(df_binary.info())
print("\n类别分布:")
print(df_binary['标签'].value_counts())

# Scatter the samples, one colour per class.
plt.figure(figsize=(10, 8))
colors = ['red', 'blue']
for label, color in zip([0, 1], colors):
    class_points = X_binary[y_binary == label]
    plt.scatter(class_points[:, 0], class_points[:, 1],
                c=color, label=f'类别 {label}', alpha=0.7)

plt.title('二分类数据分布')
plt.xlabel('特征1')
plt.ylabel('特征2')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

5.3.2 训练二分类逻辑回归模型

python
# Hold out 20% of the samples for testing; stratify keeps the class ratio the
# same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_binary, y_binary,
    test_size=0.2, stratify=y_binary, random_state=42,
)

# Standardize with statistics learned from the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a logistic regression with scikit-learn's default hyperparameters.
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train_scaled, y_train)

# Report the learned intercept and coefficients.
print("逻辑回归模型参数:")
print(f"截距: {logistic_model.intercept_[0]:.4f}")
print(f"系数: {logistic_model.coef_[0]}")

# Per-class probabilities and hard labels for the test split.
y_pred_proba = logistic_model.predict_proba(X_test_scaled)
y_pred = logistic_model.predict(X_test_scaled)

# Show the first five test samples next to their predictions.
print("\n预测示例(前5个样本):")
for sample_no, (truth, guess, (p_class0, p_class1)) in enumerate(
        zip(y_test[:5], y_pred[:5], y_pred_proba[:5]), start=1):
    print(f"样本 {sample_no}: 真实={truth}, 预测={guess}, "
          f"概率=[{p_class0:.3f}, {p_class1:.3f}]")

5.3.3 决策边界可视化

python
def plot_decision_boundary(X, y, model, scaler=None, title="决策边界"):
    """Plot a classifier's class-1 probability surface and its 0.5 boundary.

    A grid with 0.02 spacing is laid over the first two columns of ``X``
    (padded by 1 unit on every side). ``model.predict_proba`` is evaluated at
    each grid point and column 1 of the result is drawn as filled contours,
    with the 0.5 level overlaid as a dashed black line. Samples labelled 0
    and 1 are scattered on top.

    Args:
        X: 2-D array; columns 0 and 1 are used as the plot axes.
        y: Labels aligned with ``X``; only the values 0 and 1 are drawn.
        model: Fitted estimator exposing ``predict_proba``.
        scaler: Optional fitted transformer applied to the grid points before
            prediction. Pass the transformer the model was trained with when
            ``X`` is in the original, unscaled feature space.
        title: Figure title.
    """
    plt.figure(figsize=(10, 8))

    # Build the evaluation grid in the same space as X.
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Flatten the grid to (n_points, 2) and map it into the model's space.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # Explicit None check: truthiness of an estimator object is not a reliable
    # "was a scaler supplied" signal (objects defining __len__ can be falsy).
    if scaler is not None:
        grid_points = scaler.transform(grid_points)

    Z = model.predict_proba(grid_points)[:, 1]
    Z = Z.reshape(xx.shape)

    # Filled contours of the class-1 probability.
    plt.contourf(xx, yy, Z, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='P(y=1)')

    # The decision boundary is the P(y=1) = 0.5 level set.
    plt.contour(xx, yy, Z, levels=[0.5], colors='black', linestyles='--', linewidths=2)

    # Overlay the samples, one colour per class.
    colors = ['red', 'blue']
    for i, label in enumerate([0, 1]):
        mask = y == label
        plt.scatter(X[mask, 0], X[mask, 1],
                   c=colors[i], label=f'类别 {label}', alpha=0.7, edgecolors='black')

    plt.xlabel('特征1')
    plt.ylabel('特征2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Draw the boundary over the unscaled training data; the fitted scaler maps
# the grid into the space the model was trained in.
plot_decision_boundary(
    X_train,
    y_train,
    logistic_model,
    scaler=scaler,
    title="逻辑回归决策边界",
)

5.3.4 Sigmoid函数可视化

python
# Evaluate the sigmoid on 100 evenly spaced points in [-10, 10].
z = np.linspace(-10, 10, 100)
sigmoid = 1 / (1 + np.exp(-z))

plt.figure(figsize=(10, 6))
plt.plot(z, sigmoid, 'b-', linewidth=2, label='Sigmoid函数')
# Reference lines: the 0.5 decision threshold and z = 0, where it is crossed.
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.7, label='决策阈值')
plt.axvline(x=0, color='g', linestyle='--', alpha=0.7, label='z=0')
plt.title('Sigmoid函数')
plt.xlabel('z = β₀ + β₁x₁ + β₂x₂')
plt.ylabel('P(y=1|x)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# For the first ten test samples, show the linear score z next to the
# probability the model derives from it.
sample_features = X_test_scaled[:10]
linear_combination = logistic_model.decision_function(sample_features)
probabilities = logistic_model.predict_proba(sample_features)[:, 1]

print("线性组合到概率的转换示例:")
print("线性组合(z)\t概率P(y=1)\t预测类别")
print("-" * 40)
for score, prob in zip(linear_combination, probabilities):
    pred_class = int(prob >= 0.5)
    print(f"{score:8.3f}\t{prob:8.3f}\t{pred_class:8d}")

5.4 模型评估

5.4.1 基本评估指标

python
def evaluate_classification_model(y_true, y_pred, y_pred_proba=None, model_name="模型"):
    """Print and return standard classification metrics.

    Precision, recall and F1 are support-weighted averages over the classes
    (``average='weighted'``). Log loss is computed only when predicted
    probabilities are supplied. A per-class ``classification_report`` is
    printed as well.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_pred_proba: Optional predicted probabilities passed to ``log_loss``.
        model_name: Name shown in the report header.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``
        and ``'log_loss'`` (``None`` when ``y_pred_proba`` is ``None``).
    """
    print(f"{model_name} 评估结果:")
    print("-" * 50)

    # Label-based metrics.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"准确率 (Accuracy): {accuracy:.4f}")
    print(f"精确率 (Precision): {precision:.4f}")
    print(f"召回率 (Recall): {recall:.4f}")
    print(f"F1得分: {f1:.4f}")

    # Probability-based metric: computed once and reused for both the printed
    # line and the returned dict.
    logloss = None
    if y_pred_proba is not None:
        logloss = log_loss(y_true, y_pred_proba)
        print(f"对数损失 (Log Loss): {logloss:.4f}")

    print("\n详细分类报告:")
    print(classification_report(y_true, y_pred))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'log_loss': logloss
    }

# Score the binary model on the held-out test split.
metrics = evaluate_classification_model(
    y_true=y_test,
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
    model_name="逻辑回归",
)

5.4.2 混淆矩阵

python
# Confusion matrix for the binary model (rows: true label, columns: predicted).
cm = confusion_matrix(y_test, y_pred)

binary_tick_labels = ['类别0', '类别1']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=binary_tick_labels,
    yticklabels=binary_tick_labels,
)
plt.title('混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.show()

# Flattening a 2x2 matrix row by row yields TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("混淆矩阵分析:")
for caption, count in (
    ("真负例 (TN)", tn),
    ("假正例 (FP)", fp),
    ("假负例 (FN)", fn),
    ("真正例 (TP)", tp),
):
    print(f"{caption}: {count}")

# Recompute the headline metrics by hand from the four cells.
manual_metrics = (
    ("准确率", (tp + tn) / (tp + tn + fp + fn)),
    ("精确率", tp / (tp + fp)),
    ("召回率", tp / (tp + fn)),
    ("特异性", tn / (tn + fp)),
)
print("\n手动计算的指标:")
for metric_name, metric_value in manual_metrics:
    print(f"{metric_name}: {metric_value:.4f}")

5.4.3 ROC曲线和AUC

python
# ROC curve and AUC from the predicted class-1 probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the chance diagonal.
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, 
         label=f'ROC曲线 (AUC = {roc_auc:.3f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', 
         label='随机分类器')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('假正例率 (FPR)')
plt.ylabel('真正例率 (TPR)')
plt.title('ROC曲线')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

print(f"AUC得分: {roc_auc:.4f}")

# Performance at roughly ten evenly spaced thresholds.
print("\n不同阈值下的性能:")
print("阈值\t\tFPR\t\tTPR\t\t精确率\t\t召回率")
print("-" * 60)

# roc_curve can return fewer than 10 thresholds, in which case
# len(thresholds) // 10 is 0 and range() raises ValueError; clamp the step to 1.
threshold_step = max(1, len(thresholds) // 10)
for i in range(0, len(thresholds), threshold_step):
    threshold = thresholds[i]
    y_pred_threshold = (y_pred_proba[:, 1] >= threshold).astype(int)
    
    # Skip thresholds where every sample gets the same predicted label.
    if len(np.unique(y_pred_threshold)) > 1:
        precision_thresh = precision_score(y_test, y_pred_threshold)
        recall_thresh = recall_score(y_test, y_pred_threshold)
        print(f"{threshold:.3f}\t\t{fpr[i]:.3f}\t\t{tpr[i]:.3f}\t\t{precision_thresh:.3f}\t\t{recall_thresh:.3f}")

5.4.4 精确率-召回率曲线

python
# Precision-recall curve from the predicted class-1 probabilities, plus the
# area under it (recall on the x axis, precision on the y axis).
positive_scores = y_pred_proba[:, 1]
precision_curve, recall_curve, pr_thresholds = precision_recall_curve(
    y_test, positive_scores
)
pr_auc = auc(recall_curve, precision_curve)

# Baseline: the fraction of positive samples in the test split.
baseline = np.sum(y_test) / len(y_test)

plt.figure(figsize=(10, 8))
plt.plot(recall_curve, precision_curve, color='blue', lw=2,
         label=f'PR曲线 (AUC = {pr_auc:.3f})')
plt.axhline(y=baseline, color='red', linestyle='--',
            label=f'随机分类器 (精确率 = {baseline:.3f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('精确率-召回率曲线')
plt.xlabel('召回率')
plt.ylabel('精确率')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"PR-AUC得分: {pr_auc:.4f}")

5.5 多分类逻辑回归

5.5.1 加载多分类数据

python
# Load scikit-learn's bundled wine dataset and unpack the parts used below.
wine_data = load_wine()
X_wine, y_wine = wine_data.data, wine_data.target
feature_names_wine = wine_data.feature_names
target_names_wine = wine_data.target_names

# Distinct class labels and how many samples carry each one.
unique, counts = np.unique(y_wine, return_counts=True)

print("葡萄酒数据集信息:")
print(f"样本数: {X_wine.shape[0]}")
print(f"特征数: {X_wine.shape[1]}")
print(f"类别数: {len(unique)}")
print(f"类别名称: {target_names_wine}")

# Bar chart of the class distribution.
plt.figure(figsize=(8, 6))
plt.bar(target_names_wine, counts, color=['red', 'green', 'blue'], alpha=0.7)
plt.title('葡萄酒数据集类别分布')
plt.xlabel('葡萄酒类型')
plt.ylabel('样本数量')
plt.show()

for name, class_count in zip(target_names_wine, counts):
    print(f"{name}: {class_count} 样本")

5.5.2 特征分析

python
# DataFrame view of the wine data with the class label as an extra column.
df_wine = pd.DataFrame(X_wine, columns=feature_names_wine)
df_wine['wine_type'] = y_wine

# Hand-picked features to visualize.
important_features = ['alcohol', 'flavanoids', 'color_intensity', 'proline']

# One histogram panel per feature on a 2x2 grid, with the three classes overlaid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('重要特征的分布', fontsize=16)

for panel, feature in zip(axes.ravel(), important_features):
    for wine_type in range(3):
        class_values = df_wine.loc[df_wine['wine_type'] == wine_type, feature]
        panel.hist(class_values, alpha=0.6,
                   label=target_names_wine[wine_type], bins=15)

    panel.set_title(feature)
    panel.set_xlabel(feature)
    panel.set_ylabel('频次')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation heatmap of the selected features and the class label.
plt.figure(figsize=(12, 10))
selected_columns = important_features + ['wine_type']
correlation_matrix = df_wine[selected_columns].corr()
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
)
plt.title('重要特征相关性矩阵')
plt.tight_layout()
plt.show()

5.5.3 训练多分类逻辑回归

python
# Stratified 80/20 split of the wine data.
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=42, stratify=y_wine
)

# Standardize with statistics learned from the training split only.
scaler_wine = StandardScaler()
X_train_wine_scaled = scaler_wine.fit_transform(X_train_wine)
X_test_wine_scaled = scaler_wine.transform(X_test_wine)

# Multiclass logistic regression.
# The `multi_class` argument is deprecated since scikit-learn 1.5 and is
# scheduled for removal, so it is no longer passed. With the lbfgs solver and
# more than two classes the multinomial (softmax) model is already the
# default, so the fitted model is unchanged. For One-vs-Rest, wrap the
# estimator in sklearn.multiclass.OneVsRestClassifier instead.
logistic_multi = LogisticRegression(
    solver='lbfgs',
    random_state=42,
    max_iter=1000
)

logistic_multi.fit(X_train_wine_scaled, y_train_wine)

print("多分类逻辑回归模型信息:")
print(f"类别数: {len(logistic_multi.classes_)}")
print(f"系数矩阵形状: {logistic_multi.coef_.shape}")
print(f"截距: {logistic_multi.intercept_}")

# Hard labels and per-class probabilities for the test split.
y_pred_wine = logistic_multi.predict(X_test_wine_scaled)
y_pred_proba_wine = logistic_multi.predict_proba(X_test_wine_scaled)

# Report the standard metrics.
wine_metrics = evaluate_classification_model(
    y_test_wine, y_pred_wine, y_pred_proba_wine, "多分类逻辑回归"
)

5.5.4 多分类混淆矩阵

python
# Confusion matrix for the three-class wine model.
cm_wine = confusion_matrix(y_test_wine, y_pred_wine)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_wine,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names_wine,
    yticklabels=target_names_wine,
)
plt.title('多分类混淆矩阵')
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.show()

# Per-class precision / recall / F1. Each metric is computed once for all
# class ids (average=None returns one score per requested label) and then
# reported class by class.
class_ids = list(range(len(target_names_wine)))
per_class_precision = precision_score(y_test_wine, y_pred_wine,
                                      labels=class_ids, average=None)
per_class_recall = recall_score(y_test_wine, y_pred_wine,
                                labels=class_ids, average=None)
per_class_f1 = f1_score(y_test_wine, y_pred_wine,
                        labels=class_ids, average=None)

print("各类别详细性能:")
for class_name, class_precision, class_recall, class_f1 in zip(
        target_names_wine, per_class_precision, per_class_recall, per_class_f1):
    print(f"{class_name}:")
    print(f"  精确率: {class_precision:.4f}")
    print(f"  召回率: {class_recall:.4f}")
    print(f"  F1得分: {class_f1:.4f}")

5.5.5 One-vs-Rest vs Multinomial比较

python
# Compare One-vs-Rest with multinomial logistic regression.
# `LogisticRegression(multi_class=...)` is deprecated since scikit-learn 1.5
# and scheduled for removal, so the two strategies are built explicitly:
#   - 'ovr': one binary logistic regression per class via OneVsRestClassifier
#   - 'multinomial': plain LogisticRegression, which with lbfgs fits the
#     multinomial (softmax) model when there are more than two classes
from sklearn.multiclass import OneVsRestClassifier

strategy_models = {
    'ovr': OneVsRestClassifier(
        LogisticRegression(solver='lbfgs', random_state=42, max_iter=1000)
    ),
    'multinomial': LogisticRegression(
        solver='lbfgs', random_state=42, max_iter=1000
    ),
}
strategies = list(strategy_models)
strategy_results = {}

for strategy, model in strategy_models.items():
    model.fit(X_train_wine_scaled, y_train_wine)
    y_pred = model.predict(X_test_wine_scaled)
    
    accuracy = accuracy_score(y_test_wine, y_pred)
    f1 = f1_score(y_test_wine, y_pred, average='weighted')
    
    strategy_results[strategy] = {'accuracy': accuracy, 'f1': f1}
    
    print(f"{strategy.upper()} 策略:")
    print(f"  准确率: {accuracy:.4f}")
    print(f"  F1得分: {f1:.4f}")
    print()

# Side-by-side bar charts of the two metrics (one row per strategy).
strategies_df = pd.DataFrame(strategy_results).T
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

strategies_df['accuracy'].plot(kind='bar', ax=axes[0], color='skyblue')
axes[0].set_title('准确率比较')
axes[0].set_ylabel('准确率')
axes[0].tick_params(axis='x', rotation=0)

strategies_df['f1'].plot(kind='bar', ax=axes[1], color='lightcoral')
axes[1].set_title('F1得分比较')
axes[1].set_ylabel('F1得分')
axes[1].tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

5.6 正则化逻辑回归

5.6.1 L1和L2正则化

python
# High-dimensional synthetic dataset (50 features, 10 of them informative)
# for comparing regularization schemes.
X_high_dim, y_high_dim = make_classification(
    n_samples=500,
    n_features=50,
    n_informative=10,
    n_redundant=10,
    n_clusters_per_class=1,
    random_state=42
)

X_train_hd, X_test_hd, y_train_hd, y_test_hd = train_test_split(
    X_high_dim, y_high_dim, test_size=0.2, random_state=42
)

# Standardize with statistics learned from the training split only.
scaler_hd = StandardScaler()
X_train_hd_scaled = scaler_hd.fit_transform(X_train_hd)
X_test_hd_scaled = scaler_hd.transform(X_test_hd)

# Regularization schemes to compare. 'none' is only the display key; the
# estimator itself receives penalty=None (see below).
penalties = ['none', 'l1', 'l2', 'elasticnet']
C_values = [0.01, 0.1, 1, 10, 100]

results = {}

for penalty in penalties:
    if penalty == 'none':
        # The string 'none' is rejected by scikit-learn >= 1.4; the supported
        # way to disable regularization is penalty=None.
        model = LogisticRegression(penalty=None, solver='lbfgs', 
                                 random_state=42, max_iter=1000)
        model.fit(X_train_hd_scaled, y_train_hd)
        y_pred = model.predict(X_test_hd_scaled)
        accuracy = accuracy_score(y_test_hd, y_pred)
        results[f'{penalty}'] = accuracy
        
    elif penalty == 'elasticnet':
        # Elastic net is only supported by the saga solver.
        model = LogisticRegression(penalty=penalty, solver='saga', 
                                 C=1.0, l1_ratio=0.5,
                                 random_state=42, max_iter=1000)
        model.fit(X_train_hd_scaled, y_train_hd)
        y_pred = model.predict(X_test_hd_scaled)
        accuracy = accuracy_score(y_test_hd, y_pred)
        results[f'{penalty}'] = accuracy
        
    else:
        # Choose C by 5-fold cross-validation on the TRAINING split. Picking
        # the C with the highest test accuracy would leak test information
        # into model selection and make the reported score optimistic.
        solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
        best_cv_score = -np.inf
        best_C = None
        
        for C in C_values:
            candidate = LogisticRegression(penalty=penalty, C=C, solver=solver,
                                         random_state=42, max_iter=1000)
            cv_score = cross_val_score(candidate, X_train_hd_scaled, y_train_hd,
                                       cv=5, scoring='accuracy').mean()
            
            if cv_score > best_cv_score:
                best_cv_score = cv_score
                best_C = C
        
        # Refit on the whole training split with the selected C and score the
        # untouched test split once.
        model = LogisticRegression(penalty=penalty, C=best_C, solver=solver,
                                 random_state=42, max_iter=1000)
        model.fit(X_train_hd_scaled, y_train_hd)
        y_pred = model.predict(X_test_hd_scaled)
        best_accuracy = accuracy_score(y_test_hd, y_pred)
        
        results[f'{penalty} (C={best_C})'] = best_accuracy

print("正则化方法比较:")
for method, accuracy in results.items():
    print(f"{method}: {accuracy:.4f}")

5.6.2 正则化路径可视化

python
from sklearn.linear_model import LogisticRegressionCV

# Grid of C values shared by the cross-validated search and the path plots.
C_range = np.logspace(-4, 2, 20)

# Cross-validated choice of C for an L1-penalized model.
l1_model = LogisticRegressionCV(
    penalty='l1',
    solver='liblinear',
    Cs=C_range,
    cv=5,
    random_state=42,
)
l1_model.fit(X_train_hd_scaled, y_train_hd)

# Cross-validated choice of C for an L2-penalized model.
l2_model = LogisticRegressionCV(
    penalty='l2',
    solver='lbfgs',
    Cs=C_range,
    cv=5,
    random_state=42,
)
l2_model.fit(X_train_hd_scaled, y_train_hd)

print(f"L1最佳C: {l1_model.C_[0]:.4f}")
print(f"L2最佳C: {l2_model.C_[0]:.4f}")

# Coefficient paths: refit at every C and record the coefficient vector.
# The left panel is L1, the right panel is L2.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
path_specs = (
    ('l1', 'liblinear', 'L1正则化路径'),
    ('l2', 'lbfgs', 'L2正则化路径'),
)
coef_paths = {}

for panel, (penalty_name, solver_name, panel_title) in zip(axes, path_specs):
    path_rows = []
    for C in C_range:
        model = LogisticRegression(penalty=penalty_name, C=C, solver=solver_name,
                                   random_state=42, max_iter=1000)
        model.fit(X_train_hd_scaled, y_train_hd)
        path_rows.append(model.coef_[0])
    path = np.array(path_rows)
    coef_paths[penalty_name] = path

    # Only the first ten features are drawn to keep the legend readable.
    for feature_idx in range(min(10, path.shape[1])):
        panel.plot(C_range, path[:, feature_idx], label=f'特征{feature_idx+1}')

    panel.set_xscale('log')
    panel.set_xlabel('C (正则化强度的倒数)')
    panel.set_ylabel('系数值')
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)
    panel.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

coefs_l1 = coef_paths['l1']
coefs_l2 = coef_paths['l2']

plt.tight_layout()
plt.show()

# Sparsity comparison: refit at the cross-validated C and count the non-zero
# coefficients of each model.
l1_final = LogisticRegression(penalty='l1', C=l1_model.C_[0],
                              solver='liblinear', random_state=42)
l1_final.fit(X_train_hd_scaled, y_train_hd)

l2_final = LogisticRegression(penalty='l2', C=l2_model.C_[0],
                              solver='lbfgs', random_state=42)
l2_final.fit(X_train_hd_scaled, y_train_hd)

l1_weights = l1_final.coef_[0]
l2_weights = l2_final.coef_[0]
print(f"L1正则化非零系数数量: {np.count_nonzero(l1_weights)}/{len(l1_weights)}")
print(f"L2正则化非零系数数量: {np.count_nonzero(l2_weights)}/{len(l2_weights)}")

5.7 超参数调优

5.7.1 网格搜索

python
# Hyperparameter grid. liblinear is used because it accepts both the l1 and
# the l2 penalty.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}

# Exhaustive 5-fold cross-validated search scored by accuracy, using all cores.
base_estimator = LogisticRegression(random_state=42, max_iter=1000)
grid_search = GridSearchCV(
    base_estimator,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_hd_scaled, y_train_hd)

print("网格搜索结果:")
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")

# Score the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_hd_scaled)
test_accuracy = accuracy_score(y_test_hd, y_pred_best)
print(f"测试集准确率: {test_accuracy:.4f}")

# Heatmap of the mean cross-validation score per (penalty, C) combination.
results_df = pd.DataFrame(grid_search.cv_results_)
pivot_table = results_df.pivot_table(
    values='mean_test_score',
    index='param_penalty',
    columns='param_C',
)

plt.figure(figsize=(10, 8))
sns.heatmap(pivot_table, annot=True, cmap='viridis', fmt='.4f')
plt.title('网格搜索结果热力图')
plt.xlabel('C值')
plt.ylabel('正则化类型')
plt.show()

5.7.2 学习曲线分析

python
from sklearn.model_selection import learning_curve

def plot_learning_curve_classification(estimator, X, y, title="学习曲线"):
    """Plot training and validation accuracy against training-set size.

    Runs ``learning_curve`` with 5-fold cross-validation at ten training
    fractions from 10% to 100%, then draws the per-size mean accuracy of each
    curve with a shaded band of one standard deviation across folds.

    Args:
        estimator: Estimator passed to ``learning_curve``.
        X: Feature matrix.
        y: Target labels.
        title: Figure title.
    """
    sizes, fold_train_scores, fold_val_scores = learning_curve(
        estimator, X, y,
        cv=5,
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy',
    )

    plt.figure(figsize=(10, 6))

    # Each curve: mean across folds, plus a +/- one standard deviation band.
    curves = (
        (fold_train_scores, 'blue', '训练得分'),
        (fold_val_scores, 'red', '验证得分'),
    )
    for fold_scores, colour, legend_text in curves:
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        plt.plot(sizes, centre, 'o-', color=colour, label=legend_text)
        plt.fill_between(sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)

    plt.xlabel('训练样本数')
    plt.ylabel('准确率')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Learning curve of the estimator selected by the grid search.
plot_learning_curve_classification(
    estimator=best_model,
    X=X_train_hd_scaled,
    y=y_train_hd,
    title="最佳逻辑回归模型学习曲线",
)

5.8 实际应用案例

5.8.1 乳腺癌诊断案例

python
# Load scikit-learn's bundled breast-cancer dataset and unpack it.
cancer_data = load_breast_cancer()
X_cancer, y_cancer = cancer_data.data, cancer_data.target
feature_names_cancer = cancer_data.feature_names
target_names_cancer = cancer_data.target_names

n_cases, n_measurements = X_cancer.shape
print("乳腺癌数据集信息:")
print(f"样本数: {n_cases}")
print(f"特征数: {n_measurements}")
print(f"类别: {target_names_cancer}")

# Class distribution; the code below reports label 1 as benign and label 0
# as malignant.
unique, counts = np.unique(y_cancer, return_counts=True)
benign_count = counts[1]
malignant_count = counts[0]
print(f"良性: {benign_count} 样本")
print(f"恶性: {malignant_count} 样本")

# Stratified 80/20 split.
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(
    X_cancer, y_cancer,
    test_size=0.2, stratify=y_cancer, random_state=42,
)

# Scaling and classification bundled in one pipeline, so the scaler is fitted
# on the training data and applied automatically at prediction time.
cancer_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
])
cancer_pipeline.fit(X_train_cancer, y_train_cancer)

# Predict on the raw (unscaled) test features and evaluate.
y_pred_cancer = cancer_pipeline.predict(X_test_cancer)
y_pred_proba_cancer = cancer_pipeline.predict_proba(X_test_cancer)

print("\n乳腺癌诊断模型评估:")
cancer_metrics = evaluate_classification_model(
    y_test_cancer, y_pred_cancer, y_pred_proba_cancer, "乳腺癌诊断模型"
)

5.8.2 特征重要性分析

python
# Use the absolute value of each coefficient as the feature's importance.
classifier = cancer_pipeline.named_steps['classifier']
feature_importance = np.abs(classifier.coef_[0])

# Rank the features from most to least important.
importance_df = pd.DataFrame({
    'feature': feature_names_cancer,
    'importance': feature_importance,
})
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features, largest on top.
top_features = importance_df.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('特征重要性(系数绝对值)')
plt.title('乳腺癌诊断模型 - 前15个重要特征')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("前10个最重要的特征:")
top_ten = top_features.head(10)
for rank, (feature_name, score) in enumerate(
        zip(top_ten['feature'], top_ten['importance']), start=1):
    print(f"{rank:2d}. {feature_name}: {score:.4f}")

5.8.3 模型解释和预测示例

python
# 预测新样本
def predict_cancer_diagnosis(model, scaler, sample_features, feature_names):
    """Print the predicted diagnosis for one sample and its top feature contributions.

    The sample is scaled once with ``scaler`` and then scored by the
    pipeline's final ``'classifier'`` step. It must not be passed back through
    ``model`` itself: the pipeline would apply its own scaling step to the
    already-scaled values and report probabilities for a doubly-scaled sample.

    Args:
        model: Fitted pipeline-like object exposing ``named_steps`` with a
            ``'classifier'`` entry that has ``predict_proba``, ``predict``
            and ``coef_``.
        scaler: Fitted transformer matching the one the classifier was
            trained with.
        sample_features: 1-D sequence of raw (unscaled) feature values.
        feature_names: Names aligned with ``sample_features``.

    Returns:
        None. The results are printed.
    """
    classifier = model.named_steps['classifier']

    # Scale the single sample (wrapped in a list to make it 2-D).
    sample_scaled = scaler.transform([sample_features])

    # Score with the bare classifier so the features are scaled exactly once.
    proba = classifier.predict_proba(sample_scaled)[0]
    prediction = classifier.predict(sample_scaled)[0]

    print("乳腺癌诊断预测结果:")
    print(f"预测类别: {'良性' if prediction == 1 else '恶性'}")
    print(f"恶性概率: {proba[0]:.3f}")
    print(f"良性概率: {proba[1]:.3f}")

    # Per-feature contribution to the linear score: scaled value * coefficient.
    coefficients = classifier.coef_[0]

    print("\n重要特征贡献分析:")
    feature_contributions = sample_scaled[0] * coefficients

    # Indices of the five largest contributions by magnitude (ascending order).
    top_indices = np.argsort(np.abs(feature_contributions))[-5:]

    # Report from largest to smallest; a negative contribution lowers the
    # class-1 score, which the output labels as supporting "malignant".
    for idx in reversed(top_indices):
        contribution = feature_contributions[idx]
        direction = "支持恶性" if contribution < 0 else "支持良性"
        print(f"{feature_names[idx]}: {contribution:.3f} ({direction})")

# Demonstrate on the first sample of the test split.
sample_idx = 0
sample_features, true_label = X_test_cancer[sample_idx], y_test_cancer[sample_idx]

true_label_text = '良性' if true_label == 1 else '恶性'
print(f"真实标签: {true_label_text}")
predict_cancer_diagnosis(
    model=cancer_pipeline,
    scaler=cancer_pipeline.named_steps['scaler'],
    sample_features=sample_features,
    feature_names=feature_names_cancer,
)

5.9 练习题

练习1:基础逻辑回归

  1. 使用 make_classification 生成一个二分类数据集
  2. 训练逻辑回归模型并绘制决策边界
  3. 分析不同阈值对分类结果的影响

练习2:多分类问题

  1. 使用鸢尾花数据集训练多分类逻辑回归
  2. 比较 One-vs-Rest 和 Multinomial 策略的性能
  3. 分析每个类别的分类难度

练习3:不平衡数据处理

  1. 创建一个不平衡的二分类数据集(比例1:9)
  2. 使用不同的评估指标评估模型性能
  3. 尝试使用 class_weight='balanced' 参数改善性能

练习4:特征选择

  1. 使用高维数据集(特征数 > 100)
  2. 比较L1和L2正则化的特征选择效果
  3. 分析正则化强度对模型性能的影响

5.10 小结

在本章中,我们深入学习了逻辑回归的各个方面:

核心概念

  • 逻辑回归原理:Sigmoid函数、概率预测、决策边界
  • 多分类策略:One-vs-Rest、Multinomial
  • 正则化方法:L1、L2、ElasticNet

主要技术

  • 模型训练:二分类、多分类逻辑回归
  • 性能评估:准确率、精确率、召回率、F1、AUC
  • 可视化技术:ROC曲线、PR曲线、决策边界
  • 超参数调优:网格搜索、交叉验证

实践技能

  • 数据预处理:标准化、特征选择
  • 模型解释:系数分析、特征重要性
  • 实际应用:医疗诊断、分类预测
  • 性能优化:正则化、阈值调整

关键要点

  • 逻辑回归是线性分类器,适用于类别大致线性可分(决策边界近似为直线或超平面)的问题
  • Sigmoid函数将线性组合映射到概率空间
  • 正则化可以防止过拟合并进行特征选择
  • 评估指标的选择取决于具体的业务需求

5.11 下一步

现在你已经掌握了逻辑回归这个重要的分类算法!在下一章决策树算法中,我们将学习一个完全不同的算法——决策树,它具有很好的可解释性,是理解更复杂集成方法的基础。


章节要点回顾

  • ✅ 理解了逻辑回归的数学原理和Sigmoid函数
  • ✅ 掌握了二分类和多分类逻辑回归的实现
  • ✅ 学会了使用多种评估指标评估分类模型
  • ✅ 了解了正则化在逻辑回归中的应用
  • ✅ 掌握了ROC曲线和PR曲线的绘制和解释
  • ✅ 能够构建完整的分类预测系统

本站内容仅供学习和研究使用。