Skip to content

第9章:朴素贝叶斯

朴素贝叶斯是基于贝叶斯定理的概率分类算法,以其简单、高效和良好的性能而闻名。尽管有"朴素"的假设,但在许多实际应用中表现出色,特别是在文本分类和垃圾邮件过滤等领域。

9.1 什么是朴素贝叶斯?

朴素贝叶斯基于贝叶斯定理,并假设特征之间相互独立(这就是"朴素"假设的来源)。尽管这个假设在现实中往往不成立,但朴素贝叶斯仍然在许多场景下表现良好。

9.1.1 贝叶斯定理

贝叶斯定理描述了在已知某些条件下,事件发生的概率:

P(A|B) = P(B|A) × P(A) / P(B)

在分类问题中:

P(类别|特征) = P(特征|类别) × P(类别) / P(特征)

9.1.2 朴素假设

朴素贝叶斯假设所有特征在给定类别的条件下相互独立:

P(x₁, x₂, ..., xₙ|y) = P(x₁|y) × P(x₂|y) × ... × P(xₙ|y)

9.1.3 朴素贝叶斯的优势

  • 训练速度快:只需要计算概率分布
  • 预测速度快:简单的概率计算
  • 内存效率高:只需存储概率参数
  • 处理多分类问题:天然支持多分类
  • 对小数据集友好:不需要大量训练数据
  • 提供概率输出:给出预测的置信度

9.1.4 朴素贝叶斯的劣势

  • 独立性假设:现实中特征往往相关
  • 对输入敏感:需要平滑处理零概率
  • 连续特征处理:需要假设分布类型

9.2 准备环境和数据

python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_iris, load_wine, fetch_20newsgroups
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc, precision_recall_curve
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')  # tutorial-only: silence sklearn/matplotlib warnings

# Fix the global RNG seed so every run of this chapter reproduces the same data and figures
np.random.seed(42)

# Plot styling: seaborn look, SimHei so Chinese axis labels render,
# and keep the minus sign readable while a CJK font is active
plt.style.use('seaborn-v0_8')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

9.3 高斯朴素贝叶斯

9.3.1 基本原理

高斯朴素贝叶斯假设每个特征在给定类别下服从正态分布。

python
# Demonstrate the working principle of Gaussian Naive Bayes
def demonstrate_gaussian_nb_principle():
    """Build a toy two-class Gaussian dataset and visualize it.

    Draws 100 samples per class (class 0 centered at (2, 2), class 1 at
    (-2, -2), unit std), then shows the raw scatter plus a per-class
    histogram of each feature — the (approximately) Gaussian
    class-conditional distributions that GNB assumes.

    Returns:
        (X, y): a (200, 2) feature matrix and 0/1 labels.
    """
    np.random.seed(42)  # reproducible draws

    # The order of these four calls matters: it fixes the RNG stream.
    c0_f1 = np.random.normal(2, 1, 100)
    c0_f2 = np.random.normal(2, 1, 100)
    c1_f1 = np.random.normal(-2, 1, 100)
    c1_f2 = np.random.normal(-2, 1, 100)

    X = np.vstack([np.column_stack([c0_f1, c0_f2]),
                   np.column_stack([c1_f1, c1_f2])])
    y = np.hstack([np.zeros(100), np.ones(100)])

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    colors = ['red', 'blue']

    # Left panel: raw scatter of both classes.
    for cls, color in enumerate(colors):
        pts = X[y == cls]
        axes[0].scatter(pts[:, 0], pts[:, 1], c=color, alpha=0.6, label=f'类别 {cls}')
    axes[0].set_xlabel('特征 1')
    axes[0].set_ylabel('特征 2')
    axes[0].set_title('原始数据分布')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Middle/right panels: per-class histogram of each feature.
    for feat, (ax, panel_title) in enumerate(zip(axes[1:], ['特征 1 的分布', '特征 2 的分布'])):
        for cls, color in enumerate(colors):
            ax.hist(X[y == cls, feat], bins=20, alpha=0.6, color=color, label=f'类别 {cls}')
        ax.set_xlabel(f'特征 {feat + 1}')
        ax.set_ylabel('频次')
        ax.set_title(panel_title)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return X, y

X_demo, y_demo = demonstrate_gaussian_nb_principle()

9.3.2 训练高斯朴素贝叶斯

python
# Split the toy data; stratify keeps the class ratio identical in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=42, stratify=y_demo
)

# Fit a Gaussian Naive Bayes model (learns per-class feature means/variances)
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Hard class predictions plus per-class posterior probabilities
y_pred = gnb.predict(X_test)
y_pred_proba = gnb.predict_proba(X_test)

# Evaluate on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f"高斯朴素贝叶斯准确率: {accuracy:.4f}")

print("\n详细分类报告:")
print(classification_report(y_test, y_pred))

# Inspect the learned parameters
print(f"\n模型参数:")
print(f"类别先验概率: {gnb.class_prior_}")
print(f"特征均值:")
for i, class_mean in enumerate(gnb.theta_):
    print(f"  类别 {i}: {class_mean}")
print(f"特征方差:")
# BUGFIX: scikit-learn renamed `sigma_` to `var_` in 1.0 and removed
# `sigma_` in 1.2, so the original attribute access raises AttributeError
# on current versions.
for i, class_var in enumerate(gnb.var_):
    print(f"  类别 {i}: {class_var}")

9.3.3 决策边界可视化

python
def plot_nb_decision_boundary(X, y, model, title="朴素贝叶斯决策边界"):
    """Plot a fitted binary classifier's P(class=1) surface and its 0.5 boundary.

    Args:
        X: (n_samples, 2) feature matrix used for the scatter overlay.
        y: 0/1 labels that pick the scatter colors.
        model: fitted classifier exposing predict_proba. Assumed binary —
            only column 1 of predict_proba is used.
        title: figure title.
    """
    plt.figure(figsize=(10, 8))

    # Dense grid covering the data range plus a 1-unit margin
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Posterior P(class=1) at each grid point. (The original also called
    # model.predict over the whole grid but never used the result; that
    # redundant pass is removed — the 0.5 contour below IS the boundary.)
    Z_proba = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z_proba = Z_proba.reshape(xx.shape)

    # Filled probability contours with a colorbar
    plt.contourf(xx, yy, Z_proba, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='P(类别=1)')

    # Decision boundary: the P = 0.5 level set
    plt.contour(xx, yy, Z_proba, levels=[0.5], colors='black', linestyles='--', linewidths=2)

    # Overlay the data points
    colors = ['red', 'blue']
    for i, color in enumerate(colors):
        mask = y == i
        plt.scatter(X[mask, 0], X[mask, 1],
                   c=color, label=f'类别 {i}', alpha=0.7, edgecolors='black')

    plt.xlabel('特征 1')
    plt.ylabel('特征 2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Draw the boundary learned on the training split
plot_nb_decision_boundary(X_train, y_train, gnb, "高斯朴素贝叶斯决策边界")

9.3.4 与其他算法比较

python
# Compare naive Bayes against other classic classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Benchmark dataset: iris (150 samples, 4 features, 3 balanced classes)
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42, stratify=y_iris
)

# Candidate algorithms. SVC no longer sets probability=True: predict_proba
# is never used in this comparison, and enabling it only adds an expensive
# internal cross-validation (Platt scaling) without changing predict().
algorithms = {
    '高斯朴素贝叶斯': GaussianNB(),
    '逻辑回归': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
    '决策树': DecisionTreeClassifier(random_state=42)
}

results = {}

print("算法性能比较(鸢尾花数据集):")
print("算法\t\t\t准确率\t\t交叉验证得分")
print("-" * 50)

for name, algorithm in algorithms.items():
    # Fit on the training split, score on the held-out split
    algorithm.fit(X_train_iris, y_train_iris)
    y_pred_iris = algorithm.predict(X_test_iris)

    accuracy_iris = accuracy_score(y_test_iris, y_pred_iris)
    # 5-fold CV over the whole dataset gives a more stable estimate
    cv_scores = cross_val_score(algorithm, X_iris, y_iris, cv=5)
    cv_mean = np.mean(cv_scores)

    results[name] = {'accuracy': accuracy_iris, 'cv_score': cv_mean}
    print(f"{name}\t{accuracy_iris:.4f}\t\t{cv_mean:.4f}")

# Side-by-side bar charts: test accuracy vs. CV score
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in names]
cv_scores = [results[name]['cv_score'] for name in names]

axes[0].bar(names, accuracies, color='skyblue', alpha=0.7)
axes[0].set_title('测试集准确率比较')
axes[0].set_ylabel('准确率')
axes[0].tick_params(axis='x', rotation=45)
axes[0].set_ylim(0.8, 1.0)

axes[1].bar(names, cv_scores, color='lightgreen', alpha=0.7)
axes[1].set_title('交叉验证得分比较')
axes[1].set_ylabel('CV得分')
axes[1].tick_params(axis='x', rotation=45)
axes[1].set_ylim(0.8, 1.0)

plt.tight_layout()
plt.show()

9.4 多项式朴素贝叶斯

9.4.1 文本分类应用

多项式朴素贝叶斯特别适合处理离散特征,如文本数据中的词频。

python
# Build a small three-topic text corpus: 8 documents each for
# tech (label 0), sports (label 1), and food (label 2).
texts = [
    # tech
    "人工智能技术发展迅速,机器学习算法不断改进",
    "深度学习在图像识别领域取得重大突破",
    "云计算和大数据技术推动数字化转型",
    "区块链技术在金融领域应用广泛",
    "物联网设备连接数量快速增长",
    "5G网络建设加速推进",
    "自动驾驶汽车技术日趋成熟",
    "量子计算研究取得新进展",
    
    # sports
    "足球比赛精彩激烈,球员表现出色",
    "篮球联赛进入季后赛阶段",
    "游泳运动员打破世界纪录",
    "网球公开赛决赛即将开始",
    "马拉松比赛吸引众多跑者参与",
    "体操运动员展现完美技巧",
    "羽毛球世锦赛激战正酣",
    "滑雪运动在冬季备受欢迎",
    
    # food
    "川菜以麻辣著称,口味独特",
    "粤菜注重原汁原味,制作精细",
    "意大利面条搭配各种酱料",
    "日式料理追求食材新鲜",
    "法式甜点制作工艺复杂",
    "烧烤美食深受大众喜爱",
    "海鲜料理营养丰富美味",
    "素食餐厅越来越受欢迎"
]

labels = [0]*8 + [1]*8 + [2]*8  # 0 = tech, 1 = sports, 2 = food
label_names = ['科技', '体育', '美食']

print(f"文本数据集信息:")
print(f"总文本数: {len(texts)}")
print(f"类别分布: {np.bincount(labels)}")

# Vectorize to term counts.
# NOTE(review): CountVectorizer's default tokenizer splits on non-word
# characters only; these Chinese texts are not word-segmented, so each
# punctuation-delimited clause presumably becomes a single "token" rather
# than individual words — acceptable for this demo, but real Chinese text
# classification would need a segmenter (e.g. jieba). TODO confirm.
vectorizer = CountVectorizer(max_features=100)
X_text = vectorizer.fit_transform(texts)

print(f"特征维度: {X_text.shape}")
print(f"特征词汇: {len(vectorizer.get_feature_names_out())}")

# Stratified split keeps the 8/8/8 topic balance in both halves
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, labels, test_size=0.3, random_state=42, stratify=labels
)

# Fit a multinomial NB model on the term counts
mnb = MultinomialNB(alpha=1.0)  # alpha is the Laplace/Lidstone smoothing strength
mnb.fit(X_train_text, y_train_text)

# Hard predictions plus class posteriors
y_pred_text = mnb.predict(X_test_text)
y_pred_proba_text = mnb.predict_proba(X_test_text)

# Held-out evaluation
accuracy_text = accuracy_score(y_test_text, y_pred_text)
print(f"\n多项式朴素贝叶斯文本分类准确率: {accuracy_text:.4f}")

print("\n详细分类报告:")
print(classification_report(y_test_text, y_pred_text, target_names=label_names))

9.4.2 特征重要性分析

python
# Inspect which vocabulary items dominate each topic class
feature_names = vectorizer.get_feature_names_out()

# log P(word | class), shape (n_classes, n_features)
feature_log_prob = mnb.feature_log_prob_

print("每个类别最重要的特征词:")
print("=" * 50)

for cls_idx, class_name in enumerate(label_names):
    print(f"\n{class_name}类最重要的词:")

    log_probs = feature_log_prob[cls_idx]
    # Indices of the ten largest conditional probabilities, best first
    best = np.argsort(log_probs)[::-1][:10]

    for rank, feat_idx in enumerate(best, start=1):
        print(f"  {rank:2d}. {feature_names[feat_idx]}: {np.exp(log_probs[feat_idx]):.4f}")

# Horizontal bar chart of the top-10 words per class (kept in ascending
# order so the most probable word sits at the top of each panel)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for cls_idx, class_name in enumerate(label_names):
    probs = np.exp(feature_log_prob[cls_idx])
    order = np.argsort(probs)[-10:]

    axes[cls_idx].barh(range(len(order)), [probs[j] for j in order])
    axes[cls_idx].set_yticks(range(len(order)))
    axes[cls_idx].set_yticklabels([feature_names[j] for j in order])
    axes[cls_idx].set_xlabel('概率')
    axes[cls_idx].set_title(f'{class_name}类重要特征词')
    axes[cls_idx].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

9.4.3 平滑参数的影响

python
# How does the Laplace smoothing strength (alpha) affect performance?
alpha_values = [0.1, 0.5, 1.0, 2.0, 5.0]
alpha_results = {}

print("拉普拉斯平滑参数alpha的影响:")
print("alpha\t准确率\t\t交叉验证得分")
print("-" * 40)

for alpha in alpha_values:
    model = MultinomialNB(alpha=alpha)
    model.fit(X_train_text, y_train_text)

    # Held-out accuracy at this smoothing level
    test_acc = accuracy_score(y_test_text, model.predict(X_test_text))

    # 5-fold CV over the full corpus for a more stable estimate
    cv_mean = np.mean(cross_val_score(model, X_text, labels, cv=5))

    alpha_results[alpha] = {'accuracy': test_acc, 'cv_score': cv_mean}
    print(f"{alpha}\t{test_acc:.4f}\t\t{cv_mean:.4f}")

# Plot both metrics against alpha on a log-scaled x axis
plt.figure(figsize=(10, 6))
alphas = list(alpha_results.keys())
accuracies = [alpha_results[alpha]['accuracy'] for alpha in alphas]
cv_scores = [alpha_results[alpha]['cv_score'] for alpha in alphas]

for vals, lbl, marker in ((accuracies, '测试准确率', 'o-'),
                          (cv_scores, '交叉验证得分', 's-')):
    plt.plot(alphas, vals, marker, label=lbl, linewidth=2, markersize=8)

plt.xlabel('Alpha参数')
plt.ylabel('性能得分')
plt.title('拉普拉斯平滑参数对性能的影响')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xscale('log')
plt.show()

9.5 伯努利朴素贝叶斯

9.5.1 二值特征处理

伯努利朴素贝叶斯适合处理二值特征,如文档中是否包含某个词。

python
# Binarize the text features: each entry records only whether a word occurs
binary_vectorizer = CountVectorizer(binary=True, max_features=50)
X_binary = binary_vectorizer.fit_transform(texts)

print(f"二值特征数据形状: {X_binary.shape}")
print(f"特征示例(前5个文档的前10个特征):")
print(X_binary[:5, :10].toarray())

# Stratified split of the binary matrix
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_binary, labels, test_size=0.3, random_state=42, stratify=labels
)

# Bernoulli NB models word presence/absence explicitly (including absences)
bnb = BernoulliNB(alpha=1.0)
bnb.fit(X_train_binary, y_train_binary)

y_pred_binary = bnb.predict(X_test_binary)
accuracy_binary = accuracy_score(y_test_binary, y_pred_binary)

print(f"\n伯努利朴素贝叶斯准确率: {accuracy_binary:.4f}")

# Fit a multinomial model on the same binary matrix for a direct comparison
print("\n多项式 vs 伯努利朴素贝叶斯比较:")
print("模型\t\t\t准确率")
print("-" * 30)

mnb_binary = MultinomialNB(alpha=1.0)
mnb_binary.fit(X_train_binary, y_train_binary)
accuracy_mnb_binary = accuracy_score(y_test_binary, mnb_binary.predict(X_test_binary))

print(f"多项式朴素贝叶斯\t{accuracy_mnb_binary:.4f}")
print(f"伯努利朴素贝叶斯\t{accuracy_binary:.4f}")

9.5.2 特征选择的影响

python
# Sweep the vocabulary size and compare multinomial vs. Bernoulli NB.
# NOTE(review): max_features larger than the corpus's actual vocabulary
# simply keeps the full vocabulary, so on this tiny corpus the largest
# settings presumably produce identical feature sets — verify if the curve
# flattens.
feature_numbers = [10, 20, 50, 100, 200]
performance_comparison = {}

for n_features in feature_numbers:
    # One count-based and one binary vectorizer at this vocabulary cap
    vec_count = CountVectorizer(max_features=n_features)
    vec_binary = CountVectorizer(binary=True, max_features=n_features)
    
    # Vectorize the corpus under both schemes
    X_count = vec_count.fit_transform(texts)
    X_bin = vec_binary.fit_transform(texts)
    
    # Same split parameters on same-length inputs keep the rows aligned
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        X_count, labels, test_size=0.3, random_state=42, stratify=labels
    )
    X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
        X_bin, labels, test_size=0.3, random_state=42, stratify=labels
    )
    
    # One model per representation
    mnb_temp = MultinomialNB(alpha=1.0)
    bnb_temp = BernoulliNB(alpha=1.0)
    
    mnb_temp.fit(X_train_c, y_train_c)
    bnb_temp.fit(X_train_b, y_train_b)
    
    # Predict on the held-out split
    y_pred_mnb_temp = mnb_temp.predict(X_test_c)
    y_pred_bnb_temp = bnb_temp.predict(X_test_b)
    
    # Held-out accuracy for both variants
    acc_mnb = accuracy_score(y_test_c, y_pred_mnb_temp)
    acc_bnb = accuracy_score(y_test_b, y_pred_bnb_temp)
    
    performance_comparison[n_features] = {
        'multinomial': acc_mnb,
        'bernoulli': acc_bnb
    }

# Plot accuracy against vocabulary size for both variants
plt.figure(figsize=(10, 6))
features = list(performance_comparison.keys())
mnb_accs = [performance_comparison[f]['multinomial'] for f in features]
bnb_accs = [performance_comparison[f]['bernoulli'] for f in features]

plt.plot(features, mnb_accs, 'o-', label='多项式朴素贝叶斯', linewidth=2, markersize=8)
plt.plot(features, bnb_accs, 's-', label='伯努利朴素贝叶斯', linewidth=2, markersize=8)

plt.xlabel('特征数量')
plt.ylabel('准确率')
plt.title('特征数量对朴素贝叶斯性能的影响')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("特征数量对性能的影响:")
print("特征数\t多项式NB\t伯努利NB")
print("-" * 35)
for f in features:
    print(f"{f}\t{performance_comparison[f]['multinomial']:.4f}\t\t{performance_comparison[f]['bernoulli']:.4f}")

9.6 补集朴素贝叶斯

9.6.1 处理不平衡数据

补集朴素贝叶斯(Complement Naive Bayes)特别适合处理不平衡的文本分类问题。

python
# Build a deliberately imbalanced corpus from `texts` (8 docs per class:
# tech at 0-7, sports at 8-15, food at 16-23).
# BUGFIX: the original used `texts[:12] + texts[16:20]` (16 documents) with
# `[0]*12 + [1]*4 + [2]*4` (20 labels) — a length mismatch that crashes
# train_test_split — and it also mislabelled four sports texts as tech.
# Keep all 8 tech texts plus 4 from each minority class (8:4:4 imbalance).
imbalanced_texts = texts[:8] + texts[8:12] + texts[16:20]
imbalanced_labels = [0]*8 + [1]*4 + [2]*4

print("不平衡数据集:")
print(f"类别分布: {np.bincount(imbalanced_labels)}")
print(f"类别比例: {np.bincount(imbalanced_labels) / len(imbalanced_labels)}")

# Vectorize the reduced corpus
imbalanced_vectorizer = CountVectorizer(max_features=50)
X_imbalanced = imbalanced_vectorizer.fit_transform(imbalanced_texts)

# Stratified split keeps at least one document of each class in the test set
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imbalanced, imbalanced_labels, test_size=0.3, random_state=42, stratify=imbalanced_labels
)

# Compare the NB variants on the imbalanced data
nb_algorithms = {
    '多项式朴素贝叶斯': MultinomialNB(alpha=1.0),
    '补集朴素贝叶斯': ComplementNB(alpha=1.0),
    '伯努利朴素贝叶斯': BernoulliNB(alpha=1.0)
}

print("\n不平衡数据集上的性能比较:")
print("算法\t\t\t准确率\t\t宏平均F1")
print("-" * 50)

from sklearn.metrics import f1_score

for name, algorithm in nb_algorithms.items():
    if name == '伯努利朴素贝叶斯':
        # Bernoulli NB expects presence/absence features — binarize the counts
        X_train_temp = (X_train_imb > 0).astype(int)
        X_test_temp = (X_test_imb > 0).astype(int)
    else:
        X_train_temp = X_train_imb
        X_test_temp = X_test_imb
    
    algorithm.fit(X_train_temp, y_train_imb)
    y_pred_imb = algorithm.predict(X_test_temp)
    
    accuracy_imb = accuracy_score(y_test_imb, y_pred_imb)
    # Macro F1 weights every class equally — more informative than raw
    # accuracy when the class distribution is skewed
    f1_macro = f1_score(y_test_imb, y_pred_imb, average='macro')
    
    print(f"{name}\t{accuracy_imb:.4f}\t\t{f1_macro:.4f}")

# Per-class breakdown for the complement variant
cnb = ComplementNB(alpha=1.0)
cnb.fit(X_train_imb, y_train_imb)
y_pred_cnb = cnb.predict(X_test_imb)

print(f"\n补集朴素贝叶斯详细分类报告:")
print(classification_report(y_test_imb, y_pred_cnb, target_names=label_names))

9.7 实际应用案例

9.7.1 垃圾邮件过滤

python
# Build a tiny spam-filtering dataset: 8 spam and 8 legitimate emails
spam_emails = [
    "恭喜您中奖了!立即点击领取大奖!",
    "免费获得iPhone,仅限今天!",
    "投资理财,月收益30%,无风险!",
    "减肥药效果神奇,一周瘦10斤!",
    "贷款无需抵押,当天放款!",
    "点击链接获得免费礼品!",
    "特价商品,限时抢购!",
    "网络兼职,日赚500元!"
]

normal_emails = [
    "明天的会议改到下午3点,请准时参加。",
    "您的订单已发货,预计3天内到达。",
    "感谢您参加我们的产品发布会。",
    "请查收本月的工作报告。",
    "周末聚餐的地点定在市中心餐厅。",
    "项目进度更新,请查看附件。",
    "生日快乐!祝您身体健康!",
    "课程安排有调整,请注意查看。"
]

# Concatenate; spam first, so labels line up positionally
all_emails = spam_emails + normal_emails
email_labels = [1]*len(spam_emails) + [0]*len(normal_emails)  # 1 = spam, 0 = legitimate

print("邮件分类数据集:")
print(f"总邮件数: {len(all_emails)}")
print(f"垃圾邮件: {sum(email_labels)}")
print(f"正常邮件: {len(email_labels) - sum(email_labels)}")

# TF-IDF features. NOTE(review): the default tokenizer does not segment
# Chinese, so tokens are presumably whole punctuation-delimited clauses —
# fine for this demo, but real use would need a segmenter. TODO confirm.
email_vectorizer = TfidfVectorizer(max_features=100, stop_words=None)
X_emails = email_vectorizer.fit_transform(all_emails)

# Stratified split keeps the spam/normal ratio in both halves
X_train_email, X_test_email, y_train_email, y_test_email = train_test_split(
    X_emails, email_labels, test_size=0.3, random_state=42, stratify=email_labels
)

# Fit a multinomial NB spam classifier on the TF-IDF weights
spam_classifier = MultinomialNB(alpha=1.0)
spam_classifier.fit(X_train_email, y_train_email)

# Hard predictions plus class posteriors
y_pred_email = spam_classifier.predict(X_test_email)
y_pred_proba_email = spam_classifier.predict_proba(X_test_email)

# Held-out evaluation
accuracy_email = accuracy_score(y_test_email, y_pred_email)
print(f"\n垃圾邮件分类准确率: {accuracy_email:.4f}")

print("\n详细分类报告:")
print(classification_report(y_test_email, y_pred_email, 
                          target_names=['正常邮件', '垃圾邮件']))

# Inspect the most indicative features per class. Labels are the ints 0/1,
# so classes_ is sorted [0, 1]: row 1 of feature_log_prob_ is the spam class.
feature_names_email = email_vectorizer.get_feature_names_out()
feature_log_prob_email = spam_classifier.feature_log_prob_

print("\n垃圾邮件的重要特征词:")
spam_prob = np.exp(feature_log_prob_email[1])  # spam class (label 1)
top_spam_indices = np.argsort(spam_prob)[-10:]

for i, idx in enumerate(reversed(top_spam_indices)):
    word = feature_names_email[idx]
    prob = spam_prob[idx]
    print(f"  {i+1:2d}. {word}: {prob:.4f}")

print("\n正常邮件的重要特征词:")
normal_prob = np.exp(feature_log_prob_email[0])  # legitimate class (label 0)
top_normal_indices = np.argsort(normal_prob)[-10:]

for i, idx in enumerate(reversed(top_normal_indices)):
    word = feature_names_email[idx]
    prob = normal_prob[idx]
    print(f"  {i+1:2d}. {word}: {prob:.4f}")

9.7.2 情感分析

python
# Build a tiny sentiment corpus: 6 positive and 6 negative reviews
positive_reviews = [
    "这部电影太精彩了,演员表演出色!",
    "服务态度很好,菜品味道不错。",
    "产品质量优秀,物超所值。",
    "课程内容丰富,老师讲解清晰。",
    "环境优美,设施完善。",
    "工作人员热情友好,体验很棒。"
]

negative_reviews = [
    "电影剧情拖沓,浪费时间。",
    "服务差劲,态度恶劣。",
    "产品质量有问题,不推荐购买。",
    "课程内容过时,讲解不清楚。",
    "环境嘈杂,设施陈旧。",
    "工作人员不专业,体验糟糕。"
]

# Positive first, so labels line up positionally (1 = positive, 0 = negative)
all_reviews = positive_reviews + negative_reviews
sentiment_labels = [1]*len(positive_reviews) + [0]*len(negative_reviews)

print("情感分析数据集:")
print(f"总评论数: {len(all_reviews)}")
print(f"正面评论: {sum(sentiment_labels)}")
print(f"负面评论: {len(sentiment_labels) - sum(sentiment_labels)}")

# TF-IDF features over the whole (tiny) corpus
sentiment_vectorizer = TfidfVectorizer(max_features=50)
X_sentiment = sentiment_vectorizer.fit_transform(all_reviews)

# Train on everything — 12 reviews leave no room for a held-out split
sentiment_classifier = MultinomialNB(alpha=1.0)
sentiment_classifier.fit(X_sentiment, sentiment_labels)

# Score three unseen reviews
test_reviews = [
    "这个产品真的很棒,强烈推荐!",
    "质量太差了,完全不值这个价格。",
    "服务还可以,但是有改进空间。"
]

test_vectors = sentiment_vectorizer.transform(test_reviews)
test_predictions = sentiment_classifier.predict(test_vectors)
test_probabilities = sentiment_classifier.predict_proba(test_vectors)

print("\n新评论情感分析结果:")
for review, pred, probs in zip(test_reviews, test_predictions, test_probabilities):
    verdict = "正面" if pred == 1 else "负面"
    print(f"评论: {review}")
    print(f"情感: {verdict} (置信度: {np.max(probs):.3f})")
    print()

9.8 朴素贝叶斯的优化技巧

9.8.1 特征工程

python
# How feature-engineering choices affect naive Bayes on this corpus
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Reuse the 24-document topic corpus from section 9.4
original_texts = texts
original_labels = labels

# 1. Compare vectorization schemes: raw counts, TF-IDF weights, presence/absence
vectorizers = {
    'CountVectorizer': CountVectorizer(max_features=100),
    'TfidfVectorizer': TfidfVectorizer(max_features=100),
    'Binary CountVectorizer': CountVectorizer(binary=True, max_features=100)
}

print("不同向量化方法的影响:")
print("方法\t\t\t准确率")
print("-" * 35)

for name, vectorizer in vectorizers.items():
    X_vec = vectorizer.fit_transform(original_texts)
    X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(
        X_vec, original_labels, test_size=0.3, random_state=42, stratify=original_labels
    )
    
    # Binary features pair naturally with Bernoulli NB; counts/TF-IDF with multinomial
    if 'Binary' in name:
        nb_vec = BernoulliNB(alpha=1.0)
    else:
        nb_vec = MultinomialNB(alpha=1.0)
    
    nb_vec.fit(X_train_vec, y_train_vec)
    y_pred_vec = nb_vec.predict(X_test_vec)
    accuracy_vec = accuracy_score(y_test_vec, y_pred_vec)
    
    print(f"{name}\t{accuracy_vec:.4f}")

# 2. Effect of chi-squared feature selection at increasing k
print(f"\n特征选择的影响:")
print("特征数\t\t准确率")
print("-" * 25)

tfidf_vec = TfidfVectorizer(max_features=200)
X_tfidf = tfidf_vec.fit_transform(original_texts)

k_values = [10, 20, 50, 100, 150]
for k in k_values:
    # ROBUSTNESS: SelectKBest raises when k exceeds the actual number of
    # features, which easily happens on this small corpus (the vocabulary
    # may be well below 150) — clamp k to what is available.
    selector = SelectKBest(chi2, k=min(k, X_tfidf.shape[1]))
    X_selected = selector.fit_transform(X_tfidf, original_labels)
    
    X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
        X_selected, original_labels, test_size=0.3, random_state=42, stratify=original_labels
    )
    
    nb_sel = MultinomialNB(alpha=1.0)
    nb_sel.fit(X_train_sel, y_train_sel)
    y_pred_sel = nb_sel.predict(X_test_sel)
    accuracy_sel = accuracy_score(y_test_sel, y_pred_sel)
    
    print(f"{k}\t\t{accuracy_sel:.4f}")

9.8.2 集成朴素贝叶斯

python
# Ensemble several NB variants trained on different text representations
from sklearn.ensemble import VotingClassifier  # NOTE(review): unused — the vote below is done manually
from collections import Counter

# 1. Raw term counts
count_vec = CountVectorizer(max_features=100)
X_count = count_vec.fit_transform(original_texts)

# 2. TF-IDF weights
tfidf_vec = TfidfVectorizer(max_features=100)
X_tfidf = tfidf_vec.fit_transform(original_texts)

# 3. Binary presence/absence
binary_vec = CountVectorizer(binary=True, max_features=100)
X_binary = binary_vec.fit_transform(original_texts)

# Identical random_state/stratify on equal-length inputs produce row-aligned
# splits, so the three feature matrices stay paired with the same y_test.
X_train_count, X_test_count, y_train, y_test = train_test_split(
    X_count, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(
    X_tfidf, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)
X_train_binary, X_test_binary, _, _ = train_test_split(
    X_binary, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)

# Train one model per representation
nb_count = MultinomialNB(alpha=1.0)
nb_tfidf = MultinomialNB(alpha=1.0)
nb_binary = BernoulliNB(alpha=1.0)

nb_count.fit(X_train_count, y_train)
nb_tfidf.fit(X_train_tfidf, y_train)
nb_binary.fit(X_train_binary, y_train)

# Per-model predictions on the shared test rows
y_pred_count = nb_count.predict(X_test_count)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)
y_pred_binary = nb_binary.predict(X_test_binary)

# Hard majority vote. ROBUSTNESS: Counter.most_common breaks 1-1-1 ties
# deterministically (first vote seen wins), whereas the original
# `max(set(votes), key=votes.count)` depended on set iteration order.
ensemble_pred = []
for i in range(len(y_test)):
    votes = [y_pred_count[i], y_pred_tfidf[i], y_pred_binary[i]]
    ensemble_pred.append(Counter(votes).most_common(1)[0][0])

# Report individual and ensemble accuracy
print("朴素贝叶斯集成结果:")
print("模型\t\t\t准确率")
print("-" * 35)
print(f"词频多项式NB\t\t{accuracy_score(y_test, y_pred_count):.4f}")
print(f"TF-IDF多项式NB\t\t{accuracy_score(y_test, y_pred_tfidf):.4f}")
print(f"二值伯努利NB\t\t{accuracy_score(y_test, y_pred_binary):.4f}")
print(f"投票集成\t\t{accuracy_score(y_test, ensemble_pred):.4f}")

# Bar chart of the four accuracies
models = ['词频NB', 'TF-IDF NB', '二值NB', '投票集成']
accuracies = [
    accuracy_score(y_test, y_pred_count),
    accuracy_score(y_test, y_pred_tfidf),
    accuracy_score(y_test, y_pred_binary),
    accuracy_score(y_test, ensemble_pred)
]

plt.figure(figsize=(10, 6))
bars = plt.bar(models, accuracies, color=['skyblue', 'lightgreen', 'lightcoral', 'gold'], alpha=0.7)
plt.title('朴素贝叶斯集成效果比较')
plt.ylabel('准确率')
plt.ylim(0.6, 1.0)

# Annotate each bar with its value
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{acc:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

9.9 练习题

练习1:基础朴素贝叶斯

  1. 使用葡萄酒数据集训练高斯朴素贝叶斯分类器
  2. 分析模型学习到的参数(均值和方差)
  3. 比较标准化前后的性能差异

练习2:文本分类

  1. 收集或创建一个多类别文本数据集
  2. 比较多项式和伯努利朴素贝叶斯的性能
  3. 分析不同平滑参数对性能的影响

练习3:特征工程

  1. 使用新闻数据集进行文本分类
  2. 比较不同向量化方法(Count、TF-IDF、Binary)的效果
  3. 使用特征选择技术提升模型性能

练习4:不平衡数据处理

  1. 创建一个严重不平衡的分类数据集
  2. 比较不同朴素贝叶斯算法的表现
  3. 尝试使用采样技术改善性能

9.10 小结

在本章中,我们深入学习了朴素贝叶斯算法的各个方面:

核心概念

  • 贝叶斯定理:概率推理的数学基础
  • 朴素假设:特征独立性假设及其影响
  • 不同变体:高斯、多项式、伯努利、补集朴素贝叶斯

主要技术

  • 高斯朴素贝叶斯:处理连续特征,假设正态分布
  • 多项式朴素贝叶斯:处理离散特征,适合文本分类
  • 伯努利朴素贝叶斯:处理二值特征
  • 补集朴素贝叶斯:处理不平衡数据

实践技能

  • 文本分类:垃圾邮件过滤、情感分析
  • 特征工程:向量化、特征选择
  • 参数调优:平滑参数的选择和影响
  • 集成方法:组合不同朴素贝叶斯模型

关键要点

  • 朴素贝叶斯简单高效,适合快速原型开发
  • 在文本分类等高维稀疏数据上表现优秀
  • 需要合适的平滑技术处理零概率问题
  • 独立性假设虽然"朴素",但在实践中往往有效

9.11 下一步

现在你已经掌握了朴素贝叶斯这个重要的概率分类算法!在下一章K近邻算法中,我们将学习一个完全不同的方法——基于实例的学习,了解"近朱者赤"的机器学习思想。


章节要点回顾

  • ✅ 理解了贝叶斯定理和朴素假设
  • ✅ 掌握了不同类型朴素贝叶斯的应用场景
  • ✅ 学会了文本分类的完整流程
  • ✅ 了解了特征工程对朴素贝叶斯的重要性
  • ✅ 掌握了处理不平衡数据的技巧
  • ✅ 能够构建实用的文本分类系统

本站内容仅供学习和研究使用。