Chapter 6: Decision Tree Algorithms
Decision trees are among the most intuitive and easily interpreted algorithms in machine learning. They make predictions through a series of if-else tests, much like a human reasoning process. This chapter covers the principles, implementation, and applications of decision trees in detail.
6.1 What Is a Decision Tree?
A decision tree is a tree-structured algorithm for classification and regression that learns a hierarchy of decision rules from data. Each internal node tests one feature, each branch corresponds to an outcome of that test, and each leaf node holds a class label or a numeric value.
6.1.1 Anatomy of a Decision Tree
- Root node: the top of the tree, containing all training samples
- Internal nodes: tests on individual features
- Branches: the outcomes of those tests
- Leaf nodes: the predicted class labels or regression values (see the sketch below)
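To make this structure concrete, here is a minimal hand-written sketch of a tiny tree expressed as nested if-else rules. The feature names and thresholds are illustrative (they mirror the classic iris splits), not learned from data:
```python
def tiny_tree(petal_length, petal_width):
    # Root node: test on petal_length
    if petal_length <= 2.45:
        return "setosa"          # leaf node: class label
    # Internal node: a second test, on petal_width
    elif petal_width <= 1.75:
        return "versicolor"      # leaf node
    else:
        return "virginica"       # leaf node

print(tiny_tree(1.4, 0.2))  # setosa
print(tiny_tree(5.9, 2.1))  # virginica
```
Learning a decision tree amounts to discovering a set of rules like these automatically from data.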
6.1.2 Strengths of Decision Trees
- Easy to understand and explain: the decision process is transparent
- Little data preprocessing required: trees can in principle handle both numeric and categorical features (note that scikit-learn's implementation expects numeric input; see the encoding sketch below)
- Support for multi-output problems: several targets can be predicted at once
- Models can be validated: the learned splits can be checked with statistical tests
- Robust to outliers: splits depend only on the ordering of feature values
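Because scikit-learn trees require numeric input, string-valued categorical features must be encoded first. A minimal sketch on made-up data, using the `LabelEncoder` imported in section 6.2 (note it is technically intended for target labels; `OrdinalEncoder` is the feature-oriented equivalent):
```python
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Hypothetical toy data for illustration only
df = pd.DataFrame({
    'color': ['red', 'green', 'red', 'blue', 'green', 'blue'],
    'size':  [1.0, 2.5, 1.2, 3.1, 2.2, 2.9],
    'label': [0, 0, 0, 1, 1, 1],
})

# Encode the string-valued column to integers before fitting
df['color_encoded'] = LabelEncoder().fit_transform(df['color'])

clf = DecisionTreeClassifier(max_depth=2, random_state=42)
clf.fit(df[['color_encoded', 'size']], df['label'])
print(clf.predict(pd.DataFrame({'color_encoded': [0], 'size': [3.0]})))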
6.1.3 Weaknesses of Decision Trees
- Prone to overfitting, especially for deep trees
- Unstable: small changes in the data can produce a completely different tree (a short demonstration follows this list)
- Biased feature selection: information gain favors features with many distinct levels
- Awkward at expressing linear relationships: many splits are needed to approximate a straight line
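The instability point is easy to demonstrate. A minimal sketch (synthetic data, assumed parameters) that refits a tree after dropping just five samples and compares the resulting structures:
```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, n_features=4, random_state=0)

tree_full = DecisionTreeClassifier(random_state=0).fit(X, y)
# Remove the first five samples and refit
tree_perturbed = DecisionTreeClassifier(random_state=0).fit(X[5:], y[5:])

print("Full data:      depth =", tree_full.get_depth(),
      " leaves =", tree_full.get_n_leaves())
print("5 rows removed: depth =", tree_perturbed.get_depth(),
      " leaves =", tree_perturbed.get_n_leaves())
# The feature chosen at the root may already differ
print("Root split feature:",
      tree_full.tree_.feature[0], "vs", tree_perturbed.tree_.feature[0])
```
This sensitivity is exactly what ensemble methods such as random forests turn into an advantage, as we will see in the next chapter.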
6.2 Setting Up the Environment and Data
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_iris, load_wine, make_regression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree, export_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, r2_score
)
import graphviz
from sklearn.tree import export_graphviz
import warnings
warnings.filterwarnings('ignore')
# Set the random seed for reproducibility
np.random.seed(42)
# Plotting style
plt.style.use('seaborn-v0_8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # a font with CJK glyphs, if needed
plt.rcParams['axes.unicode_minus'] = False
```
6.3 How Decision Trees Are Built
6.3.1 Foundations in Information Theory
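For reference before the code, here are the three quantities it computes. For a node $S$ whose classes occur with proportions $p_i$,

$$
H(S) = -\sum_i p_i \log_2 p_i, \qquad \mathrm{Gini}(S) = 1 - \sum_i p_i^2,
$$

and the information gain of splitting $S$ into children $S_L$ and $S_R$ is

$$
IG = H(S) - \frac{|S_L|}{|S|}\,H(S_L) - \frac{|S_R|}{|S|}\,H(S_R).
$$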
```python
def calculate_entropy(y):
    """Compute the entropy of a label array."""
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    # The small constant avoids log2(0) for zero-probability classes
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))
    return entropy

def calculate_gini(y):
    """Compute the Gini impurity of a label array."""
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    gini = 1 - np.sum(probabilities ** 2)
    return gini

def calculate_information_gain(y, y_left, y_right):
    """Compute the information gain of a split."""
    n = len(y)
    n_left, n_right = len(y_left), len(y_right)
    entropy_before = calculate_entropy(y)
    entropy_after = (n_left / n) * calculate_entropy(y_left) + (n_right / n) * calculate_entropy(y_right)
    information_gain = entropy_before - entropy_after
    return information_gain

# Demonstrate the impurity measures on small label sets
y_pure = np.array([1, 1, 1, 1, 1])    # pure
y_mixed = np.array([0, 0, 1, 1, 1])   # mixed
y_impure = np.array([0, 0, 1, 1, 2])  # impure

print("Impurity measures for different label sets:")
print("Dataset\t\tEntropy\t\tGini impurity")
print("-" * 40)
print(f"Pure [1,1,1,1,1]\t{calculate_entropy(y_pure):.4f}\t\t{calculate_gini(y_pure):.4f}")
print(f"Mixed [0,0,1,1,1]\t{calculate_entropy(y_mixed):.4f}\t\t{calculate_gini(y_mixed):.4f}")
print(f"Impure [0,0,1,1,2]\t{calculate_entropy(y_impure):.4f}\t\t{calculate_gini(y_impure):.4f}")
```
6.3.2 Visualizing the Split Criteria
```python
# Visualize the common impurity measures for a binary problem
def plot_impurity_measures():
    """Plot entropy, Gini impurity, and misclassification rate."""
    p = np.linspace(0.01, 0.99, 100)
    # Impurity of a two-class node as a function of p = P(class 1)
    entropy = -p * np.log2(p) - (1 - p) * np.log2(1 - p)
    gini = 2 * p * (1 - p)
    misclassification = 1 - np.maximum(p, 1 - p)
    plt.figure(figsize=(10, 6))
    plt.plot(p, entropy, label='Entropy', linewidth=2)
    plt.plot(p, gini, label='Gini impurity', linewidth=2)
    plt.plot(p, misclassification, label='Misclassification rate', linewidth=2)
    plt.xlabel('Probability of class 1')
    plt.ylabel('Impurity')
    plt.title('Comparison of impurity measures')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

plot_impurity_measures()
```
6.4 Classification Decision Trees
6.4.1 A Simple Binary Classification Example
```python
# Create a simple binary classification dataset
X_simple, y_simple = make_classification(
    n_samples=200,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    n_clusters_per_class=1,
    random_state=42
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_simple, y_simple, test_size=0.2, random_state=42
)

# Create the decision tree classifier
dt_classifier = DecisionTreeClassifier(
    criterion='gini',        # split criterion
    max_depth=3,             # maximum tree depth
    min_samples_split=20,    # minimum samples required to split a node
    min_samples_leaf=10,     # minimum samples required at a leaf
    random_state=42
)

# Train the model
dt_classifier.fit(X_train, y_train)

# Predict
y_pred = dt_classifier.predict(X_test)
y_pred_proba = dt_classifier.predict_proba(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision tree accuracy: {accuracy:.4f}")
print("\nClassification report:")
print(classification_report(y_test, y_pred))
```
6.4.2 Visualizing the Decision Boundary
```python
def plot_decision_tree_boundary(X, y, model, title="Decision tree boundary"):
    """Plot the decision boundary of a fitted tree on 2-D data."""
    plt.figure(figsize=(12, 8))
    # Build an evaluation grid
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predict every grid point
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Draw the decision regions
    plt.contourf(xx, yy, Z, alpha=0.8, cmap='RdYlBu')
    # Draw the data points
    colors = ['red', 'blue']
    for i, color in enumerate(colors):
        idx = np.where(y == i)[0]
        plt.scatter(X[idx, 0], X[idx, 1], c=color,
                    label=f'Class {i}', edgecolors='black')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()
    plt.colorbar()
    plt.show()

# Plot the boundary learned above
plot_decision_tree_boundary(X_train, y_train, dt_classifier, "Decision tree classification boundary")
```
6.4.3 Visualizing the Tree Structure
```python
# Visualize the tree structure
plt.figure(figsize=(15, 10))
plot_tree(dt_classifier,
          feature_names=['Feature 1', 'Feature 2'],
          class_names=['Class 0', 'Class 1'],
          filled=True,
          rounded=True,
          fontsize=10)
plt.title('Decision tree structure')
plt.show()

# The same tree as text rules
tree_rules = export_text(dt_classifier,
                         feature_names=['Feature 1', 'Feature 2'])
print("Decision tree rules (text form):")
print(tree_rules)
```
6.4.4 Feature Importance
```python
# Feature importance analysis
feature_importance = dt_classifier.feature_importances_
feature_names = ['Feature 1', 'Feature 2']

plt.figure(figsize=(8, 6))
plt.bar(feature_names, feature_importance, color=['skyblue', 'lightcoral'])
plt.title('Decision tree feature importance')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()

print("Feature importances:")
for name, importance in zip(feature_names, feature_importance):
    print(f"{name}: {importance:.4f}")
```
6.5 Regression Decision Trees
6.5.1 Creating Regression Data
```python
# Create a regression dataset
X_reg, y_reg = make_regression(
    n_samples=200,
    n_features=1,
    noise=10,
    random_state=42
)
# Add some nonlinearity
X_reg = X_reg.flatten()
y_reg = y_reg + 0.1 * X_reg**2

# Split the data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg.reshape(-1, 1), y_reg, test_size=0.2, random_state=42
)

# Fit regression trees of different depths
depths = [2, 5, 10, None]
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Regression trees of different depths', fontsize=16)

for i, depth in enumerate(depths):
    row = i // 2
    col = i % 2
    # Train the model
    dt_regressor = DecisionTreeRegressor(
        max_depth=depth,
        min_samples_split=20,
        min_samples_leaf=10,
        random_state=42
    )
    dt_regressor.fit(X_train_reg, y_train_reg)
    # Predict and score
    y_pred_reg = dt_regressor.predict(X_test_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
    # Plot the piecewise-constant prediction
    X_plot = np.linspace(X_reg.min(), X_reg.max(), 100).reshape(-1, 1)
    y_plot = dt_regressor.predict(X_plot)
    axes[row, col].scatter(X_train_reg, y_train_reg, alpha=0.6, label='Training data')
    axes[row, col].scatter(X_test_reg, y_test_reg, alpha=0.6, color='green', label='Test data')
    axes[row, col].plot(X_plot, y_plot, color='red', linewidth=2, label='Tree prediction')
    depth_str = str(depth) if depth is not None else 'unlimited'
    axes[row, col].set_title(f'depth={depth_str}, R²={r2:.3f}, RMSE={rmse:.1f}')
    axes[row, col].set_xlabel('X')
    axes[row, col].set_ylabel('y')
    axes[row, col].legend()
    axes[row, col].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
```
6.5.2 How a Regression Tree Splits
```python
# Demonstrate the split points chosen by a regression tree
def demonstrate_regression_splits():
    """Visualize where a shallow regression tree splits a 1-D input."""
    # Simple one-dimensional data
    np.random.seed(42)
    X_demo = np.linspace(0, 10, 50).reshape(-1, 1)
    y_demo = np.sin(X_demo.flatten()) + 0.1 * np.random.randn(50)
    # Fit a shallow tree
    dt_demo = DecisionTreeRegressor(max_depth=3, random_state=42)
    dt_demo.fit(X_demo, y_demo)
    # Access the underlying tree structure
    tree = dt_demo.tree_
    plt.figure(figsize=(12, 8))
    # Original data
    plt.scatter(X_demo, y_demo, alpha=0.6, color='blue', label='Training data')
    # Prediction curve
    X_plot = np.linspace(0, 10, 200).reshape(-1, 1)
    y_plot = dt_demo.predict(X_plot)
    plt.plot(X_plot, y_plot, color='red', linewidth=2, label='Tree prediction')

    # Mark every split threshold with a vertical line
    def get_split_points(node_id, depth=0):
        # Internal nodes have distinct children; leaves have both set to -1
        if tree.children_left[node_id] != tree.children_right[node_id]:
            split_value = tree.threshold[node_id]
            plt.axvline(x=split_value, color='green', linestyle='--', alpha=0.7)
            plt.text(split_value, plt.ylim()[1] - 0.1 * (depth + 1),
                     f'split {depth+1}', rotation=90, ha='right')
            get_split_points(tree.children_left[node_id], depth + 1)
            get_split_points(tree.children_right[node_id], depth + 1)

    get_split_points(0)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Split points of a regression tree')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    # Print the learned rules
    print("Tree split structure:")
    print(export_text(dt_demo, feature_names=['X']))

demonstrate_regression_splits()
```
6.6 Decision Tree Hyperparameters
6.6.1 The Main Hyperparameters
```python
# Demonstrate the effect of different hyperparameters
def compare_hyperparameters():
    """Compare how hyperparameters affect a decision tree."""
    # Use the iris dataset
    iris = load_iris()
    X_iris, y_iris = iris.data, iris.target
    X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
        X_iris, y_iris, test_size=0.2, random_state=42
    )
    # Hyperparameter values of interest
    hyperparams = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10],
        'criterion': ['gini', 'entropy']
    }
    results = []  # collected for later reference
    # Effect of max_depth
    print("Effect of max_depth:")
    print("Depth\tTrain acc.\tTest acc.\tLeaves")
    print("-" * 50)
    for depth in hyperparams['max_depth']:
        dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
        dt.fit(X_train_iris, y_train_iris)
        train_acc = dt.score(X_train_iris, y_train_iris)
        test_acc = dt.score(X_test_iris, y_test_iris)
        n_leaves = dt.get_n_leaves()
        depth_str = str(depth) if depth is not None else 'unlimited'
        print(f"{depth_str}\t{train_acc:.4f}\t\t{test_acc:.4f}\t\t{n_leaves}")
        results.append({
            'param': 'max_depth',
            'value': depth_str,
            'train_acc': train_acc,
            'test_acc': test_acc,
            'n_leaves': n_leaves
        })
    # Visualize the effect of max_depth
    depths_numeric = [3, 5, 10, 20]  # 20 stands in for None on the x-axis
    train_accs = []
    test_accs = []
    for depth in depths_numeric:
        dt = DecisionTreeClassifier(max_depth=depth if depth != 20 else None, random_state=42)
        dt.fit(X_train_iris, y_train_iris)
        train_accs.append(dt.score(X_train_iris, y_train_iris))
        test_accs.append(dt.score(X_test_iris, y_test_iris))
    plt.figure(figsize=(10, 6))
    plt.plot(depths_numeric, train_accs, 'o-', label='Training accuracy', linewidth=2)
    plt.plot(depths_numeric, test_accs, 'o-', label='Test accuracy', linewidth=2)
    plt.xlabel('Maximum depth')
    plt.ylabel('Accuracy')
    plt.title('Effect of tree depth on performance')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(depths_numeric, ['3', '5', '10', 'unlimited'])
    plt.show()

compare_hyperparameters()
```
6.6.2 Grid Search Optimization
```python
# Optimize the hyperparameters with a grid search
param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Use the wine dataset
wine = load_wine()
X_wine, y_wine = wine.data, wine.target
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, test_size=0.2, random_state=42
)

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
print("Running grid search...")
grid_search.fit(X_train_wine, y_train_wine)

print("Grid search results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Evaluate the best model on the held-out test set
best_dt = grid_search.best_estimator_
test_accuracy = best_dt.score(X_test_wine, y_test_wine)
print(f"Test set accuracy: {test_accuracy:.4f}")

# Visualize the grid search results
results_df = pd.DataFrame(grid_search.cv_results_)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# max_depth vs. performance; cast to str so None is not silently dropped by groupby
depth_results = results_df.groupby(
    results_df['param_max_depth'].astype(str))['mean_test_score'].mean()
axes[0].bar(range(len(depth_results)), depth_results.values)
axes[0].set_xticks(range(len(depth_results)))
axes[0].set_xticklabels(depth_results.index)
axes[0].set_xlabel('Maximum depth')
axes[0].set_ylabel('Mean CV score')
axes[0].set_title('Effect of maximum depth')

# criterion vs. performance
criterion_results = results_df.groupby('param_criterion')['mean_test_score'].mean()
axes[1].bar(criterion_results.index, criterion_results.values, color=['orange', 'green'])
axes[1].set_xlabel('Split criterion')
axes[1].set_ylabel('Mean CV score')
axes[1].set_title('Effect of the split criterion')

plt.tight_layout()
plt.show()
```
6.7 Overfitting and Pruning
6.7.1 Demonstrating Overfitting
```python
def demonstrate_overfitting():
    """Demonstrate how a deep tree overfits noisy data."""
    # Create noisy data
    np.random.seed(42)
    X_noise = np.random.uniform(-3, 3, 200).reshape(-1, 1)
    y_noise = np.sin(X_noise.flatten()) + 0.3 * np.random.randn(200)
    X_train_noise, X_test_noise, y_train_noise, y_test_noise = train_test_split(
        X_noise, y_noise, test_size=0.3, random_state=42
    )
    # Trees of increasing complexity
    complexities = [
        {'max_depth': 2, 'min_samples_leaf': 20},   # simple
        {'max_depth': 5, 'min_samples_leaf': 10},   # moderate
        {'max_depth': None, 'min_samples_leaf': 1}  # complex
    ]
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    for i, params in enumerate(complexities):
        dt = DecisionTreeRegressor(random_state=42, **params)
        dt.fit(X_train_noise, y_train_noise)
        # R² on train and test
        train_score = dt.score(X_train_noise, y_train_noise)
        test_score = dt.score(X_test_noise, y_test_noise)
        # Plot the fit
        X_plot = np.linspace(-3, 3, 300).reshape(-1, 1)
        y_plot = dt.predict(X_plot)
        axes[i].scatter(X_train_noise, y_train_noise, alpha=0.6, label='Training data')
        axes[i].scatter(X_test_noise, y_test_noise, alpha=0.6, color='green', label='Test data')
        axes[i].plot(X_plot, y_plot, color='red', linewidth=2, label='Tree prediction')
        # The noise-free target function
        y_true = np.sin(X_plot.flatten())
        axes[i].plot(X_plot, y_true, color='black', linestyle='--', alpha=0.7, label='True function')
        complexity_name = ['Simple model', 'Moderate complexity', 'Complex model'][i]
        axes[i].set_title(f'{complexity_name}\nTrain R²={train_score:.3f}, Test R²={test_score:.3f}')
        axes[i].set_xlabel('X')
        axes[i].set_ylabel('y')
        axes[i].legend()
        axes[i].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

demonstrate_overfitting()
```
6.7.2 Learning Curve Analysis
```python
from sklearn.model_selection import learning_curve

def plot_learning_curves():
    """Diagnose overfitting with learning curves."""
    # Use the wine dataset
    wine = load_wine()
    X, y = wine.data, wine.target
    # Models of different complexity (reusing the grid search from 6.6.2)
    models = {
        'Simple tree': DecisionTreeClassifier(max_depth=3, min_samples_leaf=10, random_state=42),
        'Complex tree': DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, random_state=42),
        'Best tree': grid_search.best_estimator_
    }
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    for i, (name, model) in enumerate(models.items()):
        train_sizes, train_scores, val_scores = learning_curve(
            model, X, y, cv=5, n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 10),
            scoring='accuracy'
        )
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        val_mean = np.mean(val_scores, axis=1)
        val_std = np.std(val_scores, axis=1)
        axes[i].plot(train_sizes, train_mean, 'o-', color='blue', label='Training score')
        axes[i].fill_between(train_sizes, train_mean - train_std, train_mean + train_std,
                             alpha=0.1, color='blue')
        axes[i].plot(train_sizes, val_mean, 'o-', color='red', label='Validation score')
        axes[i].fill_between(train_sizes, val_mean - val_std, val_mean + val_std,
                             alpha=0.1, color='red')
        axes[i].set_xlabel('Number of training samples')
        axes[i].set_ylabel('Accuracy')
        axes[i].set_title(f'Learning curve: {name}')
        axes[i].legend()
        axes[i].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
plot_learning_curves()
```
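This section's title also promises pruning. Besides the pre-pruning parameters used above (max_depth, min_samples_leaf, and so on), scikit-learn supports post-pruning via minimal cost-complexity pruning through the ccp_alpha parameter. A minimal sketch on the wine data (self-contained, so it redoes the split from section 6.6.2):
```python
# Cost-complexity (post-)pruning: larger ccp_alpha prunes more aggressively
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_wine(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# The pruning path lists the alphas at which subtrees get collapsed
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(X_tr, y_tr)
ccp_alphas = path.ccp_alphas[:-1]  # drop the last alpha, which leaves only the root

train_scores, test_scores = [], []
for alpha in ccp_alphas:
    tree = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42).fit(X_tr, y_tr)
    train_scores.append(tree.score(X_tr, y_tr))
    test_scores.append(tree.score(X_te, y_te))

best_alpha = ccp_alphas[int(np.argmax(test_scores))]
print(f"Best ccp_alpha on this split: {best_alpha:.5f}")

plt.figure(figsize=(8, 5))
plt.plot(ccp_alphas, train_scores, 'o-', label='Training accuracy')
plt.plot(ccp_alphas, test_scores, 'o-', label='Test accuracy')
plt.xlabel('ccp_alpha')
plt.ylabel('Accuracy')
plt.title('Cost-complexity pruning: accuracy vs. alpha')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
```
Picking alpha against the test split here is for illustration only; in practice, choose ccp_alpha by cross-validation (for example, with GridSearchCV over the path's alphas).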
6.8 A Real-World Application
6.8.1 Customer Churn Prediction
```python
# Create a synthetic customer-churn dataset
def create_customer_churn_dataset():
    """Build a synthetic customer churn dataset."""
    np.random.seed(42)
    n_samples = 1000
    # Generate the features
    age = np.random.normal(40, 15, n_samples)
    monthly_charges = np.random.normal(70, 20, n_samples)
    # Average tenure of 24 months, clipped so total charges stay positive
    tenure_months = np.clip(np.random.normal(24, 12, n_samples), 1, None)
    total_charges = monthly_charges * tenure_months
    contract_length = np.random.choice([1, 12, 24], n_samples, p=[0.3, 0.4, 0.3])
    tech_support = np.random.choice([0, 1], n_samples, p=[0.6, 0.4])
    online_security = np.random.choice([0, 1], n_samples, p=[0.5, 0.5])
    # Generate the target: churn probability driven by the features
    churn_prob = (
        0.01 * (age < 30).astype(int) +              # younger customers churn more
        0.02 * (monthly_charges > 80).astype(int) +  # high-fee customers churn more
        0.03 * (contract_length == 1).astype(int) +  # short contracts churn more
        -0.02 * tech_support +                       # tech support reduces churn
        -0.01 * online_security +                    # online security reduces churn
        0.1                                          # baseline churn rate
    )
    churn = np.random.binomial(1, np.clip(churn_prob, 0, 1), n_samples)
    data = pd.DataFrame({
        'Age': age,
        'MonthlyCharges': monthly_charges,
        'TotalCharges': total_charges,
        'ContractLength': contract_length,
        'TechSupport': tech_support,
        'OnlineSecurity': online_security,
        'Churn': churn
    })
    return data

# Build the dataset
churn_data = create_customer_churn_dataset()
print("Customer churn dataset info:")
print(churn_data.info())
print("\nChurn rate:")
print(churn_data['Churn'].value_counts(normalize=True))

# Feature analysis
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Customer features vs. churn', fontsize=16)
features = ['Age', 'MonthlyCharges', 'TotalCharges', 'ContractLength', 'TechSupport', 'OnlineSecurity']
for i, feature in enumerate(features):
    row = i // 3
    col = i % 3
    if feature in ['TechSupport', 'OnlineSecurity']:
        # Categorical features: grouped bar chart
        churn_data.groupby([feature, 'Churn']).size().unstack().plot(kind='bar', ax=axes[row, col])
        axes[row, col].set_title(f'{feature} vs. churn')
        axes[row, col].set_xlabel(feature)
        axes[row, col].tick_params(axis='x', rotation=0)
    else:
        # Numeric features: overlaid histograms by churn status
        for churn_status in [0, 1]:
            data_subset = churn_data[churn_data['Churn'] == churn_status][feature]
            axes[row, col].hist(data_subset, alpha=0.6,
                                label=f'Churn={churn_status}', bins=20)
        axes[row, col].set_title(f'Distribution of {feature}')
        axes[row, col].set_xlabel(feature)
        axes[row, col].legend()
plt.tight_layout()
plt.show()
```
6.8.2 Building the Churn Prediction Model
```python
# Prepare the data
X_churn = churn_data.drop('Churn', axis=1)
y_churn = churn_data['Churn']
X_train_churn, X_test_churn, y_train_churn, y_test_churn = train_test_split(
    X_churn, y_churn, test_size=0.2, random_state=42, stratify=y_churn
)

# Train the decision tree
churn_dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=50,
    min_samples_leaf=20,
    random_state=42
)
churn_dt.fit(X_train_churn, y_train_churn)

# Predict and evaluate
y_pred_churn = churn_dt.predict(X_test_churn)
y_pred_proba_churn = churn_dt.predict_proba(X_test_churn)

print("Churn model evaluation:")
print(f"Accuracy: {accuracy_score(y_test_churn, y_pred_churn):.4f}")
print("\nDetailed classification report:")
print(classification_report(y_test_churn, y_pred_churn,
                            target_names=['Retained', 'Churned']))

# Confusion matrix
cm = confusion_matrix(y_test_churn, y_pred_churn)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Retained', 'Churned'],
            yticklabels=['Retained', 'Churned'])
plt.title('Churn prediction confusion matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# Feature importance
feature_importance = churn_dt.feature_importances_
importance_df = pd.DataFrame({
    'feature': X_churn.columns,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
plt.barh(importance_df['feature'], importance_df['importance'])
plt.title('Churn model feature importance')
plt.xlabel('Importance')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("Features ranked by importance:")
for _, row in importance_df.iterrows():
    print(f"{row['feature']}: {row['importance']:.4f}")
```
6.8.3 Interpreting the Decision Rules
```python
# Visualize the churn tree
plt.figure(figsize=(20, 12))
plot_tree(churn_dt,
          feature_names=X_churn.columns,
          class_names=['Retained', 'Churned'],
          filled=True,
          rounded=True,
          fontsize=8)
plt.title('Customer churn decision tree')
plt.show()

# Extract the decision rules
def extract_decision_rules(tree, feature_names, class_names):
    """Print the rules of a fitted decision tree."""
    tree_ = tree.tree_
    # tree_.feature holds -2 (TREE_UNDEFINED) at leaf positions
    feature_name = [
        feature_names[i] if i != -2 else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node, depth, parent_rule=""):
        indent = "  " * depth
        if tree_.feature[node] != -2:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            left_rule = f"{parent_rule} AND {name} <= {threshold:.2f}"
            right_rule = f"{parent_rule} AND {name} > {threshold:.2f}"
            print(f"{indent}if {name} <= {threshold:.2f}:")
            recurse(tree_.children_left[node], depth + 1, left_rule)
            print(f"{indent}else:  # if {name} > {threshold:.2f}")
            recurse(tree_.children_right[node], depth + 1, right_rule)
        else:
            # Leaf node
            value = tree_.value[node][0]
            predicted_class = class_names[np.argmax(value)]
            confidence = np.max(value) / np.sum(value)
            # Drop the leading " AND " prefix; str.strip() would remove characters, not the prefix
            rule = parent_rule[len(" AND "):] if parent_rule.startswith(" AND ") else parent_rule
            print(f"{indent}Prediction: {predicted_class} (confidence: {confidence:.3f})")
            print(f"{indent}Rule: {rule}")
            print()

    recurse(0, 0)

print("Decision tree rules:")
extract_decision_rules(churn_dt, X_churn.columns, ['Retained', 'Churned'])

# Example predictions
sample_customers = pd.DataFrame({
    'Age': [25, 45, 60],
    'MonthlyCharges': [90, 50, 30],
    'TotalCharges': [1800, 1200, 720],
    'ContractLength': [1, 12, 24],
    'TechSupport': [0, 1, 1],
    'OnlineSecurity': [0, 1, 1]
})
predictions = churn_dt.predict(sample_customers)
probabilities = churn_dt.predict_proba(sample_customers)

print("Churn predictions for sample customers:")
for i, (_, customer) in enumerate(sample_customers.iterrows()):
    print(f"\nCustomer {i+1}:")
    print(f"  Features: {dict(customer)}")
    print(f"  Prediction: {'Churned' if predictions[i] == 1 else 'Retained'}")
    print(f"  Churn probability: {probabilities[i][1]:.3f}")
```
6.9 Decision Trees vs. Other Algorithms
6.9.1 Comparing Algorithms
```python
import time
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Algorithms to compare
algorithms = {
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),  # extra iterations to ensure convergence
    'SVM': SVC(random_state=42, probability=True),
    'k-NN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB()
}

# Compare on the iris dataset
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42
)

# Subjective interpretability ratings
interpretability = {
    'Decision Tree': 'high',
    'Logistic Regression': 'medium',
    'SVM': 'low',
    'k-NN': 'medium',
    'Naive Bayes': 'medium'
}

results = {}
print("Algorithm comparison:")
print("Algorithm\t\tTrain time\tAccuracy\tInterpretability")
print("-" * 60)
for name, algorithm in algorithms.items():
    # Training time
    start_time = time.time()
    algorithm.fit(X_train_iris, y_train_iris)
    training_time = time.time() - start_time
    # Accuracy
    accuracy = algorithm.score(X_test_iris, y_test_iris)
    results[name] = {
        'training_time': training_time,
        'accuracy': accuracy,
        'interpretability': interpretability[name]
    }
    print(f"{name}\t{training_time:.4f}s\t\t{accuracy:.4f}\t\t{interpretability[name]}")

# Visualize the comparison
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Accuracy
accuracies = [results[name]['accuracy'] for name in algorithms.keys()]
axes[0].bar(algorithms.keys(), accuracies, color='skyblue')
axes[0].set_title('Accuracy by algorithm')
axes[0].set_ylabel('Accuracy')
axes[0].tick_params(axis='x', rotation=45)
# Training time
times = [results[name]['training_time'] for name in algorithms.keys()]
axes[1].bar(algorithms.keys(), times, color='lightcoral')
axes[1].set_title('Training time by algorithm')
axes[1].set_ylabel('Training time (s)')
axes[1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
```
6.10 Exercises
Exercise 1: Basic decision trees
- Generate a binary classification dataset with `make_classification`
- Train a decision tree and visualize its decision boundary
- Analyze how different depths affect model performance
Exercise 2: Regression trees
- Create a regression dataset with a nonlinear relationship
- Compare decision tree regression with linear regression
- Analyze how the tree handles the nonlinearity
Exercise 3: Hyperparameter optimization
- Optimize a decision tree's hyperparameters with grid search
- Analyze how each hyperparameter affects overfitting
- Plot validation curves to find the best parameters
Exercise 4: A real application
- Pick a real dataset (for example, Titanic survival prediction)
- Build a decision tree classifier
- Interpret the model's decision rules and analyze feature importances
6.11 Summary
In this chapter we studied decision trees from several angles:
Core concepts
- How trees decide: information gain, Gini impurity, split criteria
- How trees are built: recursive splitting, stopping conditions, pruning
- Classification and regression: decision trees for both task types
Key techniques
- Classification trees: discrete target variables
- Regression trees: continuous target variables
- Hyperparameter tuning: depth control and minimum sample counts
- Model visualization: tree diagrams and decision boundaries
Practical skills
- Controlling overfitting: pruning and complexity constraints
- Feature importance: split-based feature selection
- Model interpretation: rule extraction and decision-path analysis
- Real applications: customer churn prediction, medical diagnosis
Key takeaways
- Decision trees are highly interpretable, making them a good fit when the decision process must be understood
- They overfit easily; pruning and hyperparameter control are essential
- They are sensitive to small changes in the data, which is precisely what ensemble methods exploit
- They perform feature selection automatically and capture nonlinear relationships
6.12 What's Next
You have now mastered decision trees, a foundational algorithm. In the next chapter, on random forests and ensemble methods, we will learn how combining many decision trees yields stronger, more stable models.
Chapter checklist:
- ✅ Understood how decision trees are built and how splits are chosen
- ✅ Implemented both classification and regression trees
- ✅ Learned to control overfitting and tune hyperparameters
- ✅ Explored tree visualization and interpretation techniques
- ✅ Mastered feature importance analysis through a practical application
- ✅ Can build interpretable predictive models