Chapter 9: Naive Bayes
Naive Bayes is a family of probabilistic classifiers based on Bayes' theorem, known for being simple, fast, and surprisingly effective. Despite its "naive" assumption, it performs well in many real-world applications, especially text classification and spam filtering.
9.1 What Is Naive Bayes?
Naive Bayes applies Bayes' theorem under the assumption that features are mutually independent given the class (hence "naive"). Although this assumption rarely holds exactly in practice, Naive Bayes still performs well in many settings.
9.1.1 Bayes' Theorem
Bayes' theorem describes how the probability of an event changes once some evidence is observed:
P(A|B) = P(B|A) × P(A) / P(B)
In a classification problem this reads:
P(class|features) = P(features|class) × P(class) / P(features)
9.1.2 The Naive Assumption
Naive Bayes assumes that, given the class, all features are conditionally independent of one another:
P(x₁, x₂, ..., xₙ|y) = P(x₁|y) × P(x₂|y) × ... × P(xₙ|y)
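To make this factorization concrete, here is a minimal from-scratch sketch that scores a sample with three binary features under two classes and normalizes by P(x). The priors and per-feature probabilities are made-up numbers purely for illustration:
python
import numpy as np

# Made-up parameters: 2 classes, 3 binary features
prior = np.array([0.6, 0.4])                # P(y)
p_feature = np.array([[0.8, 0.1, 0.5],      # P(x_i = 1 | y = 0)
                      [0.2, 0.7, 0.5]])     # P(x_i = 1 | y = 1)
x = np.array([1, 0, 1])                     # the sample to classify

# The naive assumption: the joint likelihood is a product of per-feature terms
likelihood = np.prod(np.where(x == 1, p_feature, 1 - p_feature), axis=1)
posterior = prior * likelihood
posterior /= posterior.sum()                # divide by P(x)
print(posterior)                            # ≈ [0.947 0.053] -> predict class 0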
9.1.3 Strengths of Naive Bayes
- Fast training: only per-class probability estimates need to be computed
- Fast prediction: classification is a handful of probability multiplications
- Memory efficient: only the probability parameters are stored
- Naturally multi-class: no one-vs-rest machinery is needed
- Friendly to small datasets: it does not require much training data
- Probabilistic output: predictions come with confidence scores
9.1.4 Weaknesses of Naive Bayes
- The independence assumption: in reality, features are often correlated
- Zero-probability sensitivity: smoothing is needed for feature values unseen in training
- Continuous features: a distributional form (e.g., Gaussian) must be assumed
9.2 Setting Up the Environment and Data
python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_iris, load_wine, fetch_20newsgroups
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
roc_curve, auc, precision_recall_curve
)
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
# Set the random seed for reproducibility
np.random.seed(42)
# Configure the plotting style
plt.style.use('seaborn-v0_8')
plt.rcParams['font.sans-serif'] = ['SimHei']  # a CJK-capable font, needed to render the Chinese text samples in plots
plt.rcParams['axes.unicode_minus'] = False

9.3 Gaussian Naive Bayes
9.3.1 Basic Principle
Gaussian Naive Bayes assumes that, conditioned on the class, each feature follows a normal distribution.
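In other words, for every class the model stores one mean and one variance per feature and plugs them into the normal density. A minimal sketch of that per-feature likelihood, with hand-picked parameters matching the demo data below:
python
import numpy as np

def gaussian_pdf(x, mu, var):
    """Normal density N(x; mu, var) — the per-feature likelihood GaussianNB evaluates."""
    return np.exp(-(x - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)

# Assumed parameters: class 0 centred at 2, class 1 centred at -2 (variance 1)
print(gaussian_pdf(1.5, mu=2.0, var=1.0))    # high likelihood under class 0
print(gaussian_pdf(1.5, mu=-2.0, var=1.0))   # tiny likelihood under class 1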
python
# Demonstrate the principle behind Gaussian Naive Bayes
def demonstrate_gaussian_nb_principle():
    """Demonstrate the basic principle of Gaussian Naive Bayes."""
    # Create a simple binary-classification dataset
    np.random.seed(42)
    # Class 0: means [2, 2], standard deviations [1, 1]
    class0_x1 = np.random.normal(2, 1, 100)
    class0_x2 = np.random.normal(2, 1, 100)
    # Class 1: means [-2, -2], standard deviations [1, 1]
    class1_x1 = np.random.normal(-2, 1, 100)
    class1_x2 = np.random.normal(-2, 1, 100)
    X = np.vstack([np.column_stack([class0_x1, class0_x2]),
                   np.column_stack([class1_x1, class1_x2])])
    y = np.hstack([np.zeros(100), np.ones(100)])
    # Visualize the data and the per-feature distributions
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    # Raw data
    colors = ['red', 'blue']
    for i, color in enumerate(colors):
        mask = y == i
        axes[0].scatter(X[mask, 0], X[mask, 1], c=color, alpha=0.6, label=f'Class {i}')
    axes[0].set_xlabel('Feature 1')
    axes[0].set_ylabel('Feature 2')
    axes[0].set_title('Raw data distribution')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    # Distribution of feature 1
    for i, color in enumerate(colors):
        mask = y == i
        axes[1].hist(X[mask, 0], bins=20, alpha=0.6, color=color, label=f'Class {i}')
    axes[1].set_xlabel('Feature 1')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Distribution of feature 1')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    # Distribution of feature 2
    for i, color in enumerate(colors):
        mask = y == i
        axes[2].hist(X[mask, 1], bins=20, alpha=0.6, color=color, label=f'Class {i}')
    axes[2].set_xlabel('Feature 2')
    axes[2].set_ylabel('Frequency')
    axes[2].set_title('Distribution of feature 2')
    axes[2].legend()
    axes[2].grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    return X, y

X_demo, y_demo = demonstrate_gaussian_nb_principle()

9.3.2 Training a Gaussian Naive Bayes Classifier
python
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_demo, y_demo, test_size=0.2, random_state=42, stratify=y_demo
)
# Create a Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Predict
y_pred = gnb.predict(X_test)
y_pred_proba = gnb.predict_proba(X_test)
# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Gaussian Naive Bayes accuracy: {accuracy:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_test, y_pred))
# Inspect the learned parameters
print("\nModel parameters:")
print(f"Class priors: {gnb.class_prior_}")
print("Feature means:")
for i, class_mean in enumerate(gnb.theta_):
    print(f"  Class {i}: {class_mean}")
print("Feature variances:")
for i, class_var in enumerate(gnb.var_):  # called `sigma_` before scikit-learn 1.0; removed in 1.2
    print(f"  Class {i}: {class_var}")

9.3.3 Visualizing the Decision Boundary
python
def plot_nb_decision_boundary(X, y, model, title="Naive Bayes decision boundary"):
    """Plot the decision boundary of a Naive Bayes classifier."""
    plt.figure(figsize=(10, 8))
    # Build a grid over the feature space
    h = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Predicted probability of class 1 for each grid point
    Z_proba = model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z_proba = Z_proba.reshape(xx.shape)
    # Filled probability contours
    plt.contourf(xx, yy, Z_proba, levels=50, alpha=0.8, cmap='RdYlBu')
    plt.colorbar(label='P(class = 1)')
    # The decision boundary sits at P = 0.5
    plt.contour(xx, yy, Z_proba, levels=[0.5], colors='black', linestyles='--', linewidths=2)
    # Plot the data points
    colors = ['red', 'blue']
    for i, color in enumerate(colors):
        mask = y == i
        plt.scatter(X[mask, 0], X[mask, 1],
                    c=color, label=f'Class {i}', alpha=0.7, edgecolors='black')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title(title)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Plot the decision boundary
plot_nb_decision_boundary(X_train, y_train, gnb, "Gaussian Naive Bayes decision boundary")

9.3.4 Comparison with Other Algorithms
python
# Compare Naive Bayes with other algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Use the Iris dataset for the comparison
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42, stratify=y_iris
)
# Define the algorithms
algorithms = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}
results = {}
print("Algorithm comparison (Iris dataset):")
print("Algorithm\t\t\tAccuracy\tCV score")
print("-" * 50)
for name, algorithm in algorithms.items():
    # Train and predict
    algorithm.fit(X_train_iris, y_train_iris)
    y_pred_iris = algorithm.predict(X_test_iris)
    # Performance metrics
    accuracy_iris = accuracy_score(y_test_iris, y_pred_iris)
    cv_scores = cross_val_score(algorithm, X_iris, y_iris, cv=5)
    cv_mean = np.mean(cv_scores)
    results[name] = {'accuracy': accuracy_iris, 'cv_score': cv_mean}
    print(f"{name}\t{accuracy_iris:.4f}\t\t{cv_mean:.4f}")
# Visualize the comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in names]
cv_scores = [results[name]['cv_score'] for name in names]
# Test accuracy
axes[0].bar(names, accuracies, color='skyblue', alpha=0.7)
axes[0].set_title('Test-set accuracy')
axes[0].set_ylabel('Accuracy')
axes[0].tick_params(axis='x', rotation=45)
axes[0].set_ylim(0.8, 1.0)
# Cross-validation scores
axes[1].bar(names, cv_scores, color='lightgreen', alpha=0.7)
axes[1].set_title('Cross-validation score')
axes[1].set_ylabel('CV score')
axes[1].tick_params(axis='x', rotation=45)
axes[1].set_ylim(0.8, 1.0)
plt.tight_layout()
plt.show()

9.4 Multinomial Naive Bayes
9.4.1 Application to Text Classification
Multinomial Naive Bayes is particularly well suited to discrete features such as word counts in text.
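One caveat before the code: CountVectorizer knows nothing about Chinese word boundaries, so on the unsegmented sentences below its default token pattern extracts punctuation-delimited phrases rather than individual words. The toy examples still run, but for real Chinese text you would segment first — a sketch assuming the third-party jieba package is installed (it is not used elsewhere in this chapter):
python
import jieba  # third-party Chinese word segmenter (pip install jieba); an assumption here

raw = "人工智能技术发展迅速,机器学习算法不断改进"
segmented = " ".join(jieba.lcut(raw))  # insert spaces so CountVectorizer can tokenize words
print(segmented)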
python
# Create a small text-classification corpus (the Chinese samples are the data itself, so they are kept as-is)
texts = [
    # Technology
    "人工智能技术发展迅速,机器学习算法不断改进",
    "深度学习在图像识别领域取得重大突破",
    "云计算和大数据技术推动数字化转型",
    "区块链技术在金融领域应用广泛",
    "物联网设备连接数量快速增长",
    "5G网络建设加速推进",
    "自动驾驶汽车技术日趋成熟",
    "量子计算研究取得新进展",
    # Sports
    "足球比赛精彩激烈,球员表现出色",
    "篮球联赛进入季后赛阶段",
    "游泳运动员打破世界纪录",
    "网球公开赛决赛即将开始",
    "马拉松比赛吸引众多跑者参与",
    "体操运动员展现完美技巧",
    "羽毛球世锦赛激战正酣",
    "滑雪运动在冬季备受欢迎",
    # Food
    "川菜以麻辣著称,口味独特",
    "粤菜注重原汁原味,制作精细",
    "意大利面条搭配各种酱料",
    "日式料理追求食材新鲜",
    "法式甜点制作工艺复杂",
    "烧烤美食深受大众喜爱",
    "海鲜料理营养丰富美味",
    "素食餐厅越来越受欢迎"
]
labels = [0]*8 + [1]*8 + [2]*8  # 0 = technology, 1 = sports, 2 = food
label_names = ['Technology', 'Sports', 'Food']
print("Text dataset info:")
print(f"Total texts: {len(texts)}")
print(f"Class distribution: {np.bincount(labels)}")
# Vectorize the texts
vectorizer = CountVectorizer(max_features=100)
X_text = vectorizer.fit_transform(texts)
print(f"Feature matrix shape: {X_text.shape}")
print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")
# Split the data
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text, labels, test_size=0.3, random_state=42, stratify=labels
)
# Train a Multinomial Naive Bayes classifier
mnb = MultinomialNB(alpha=1.0)  # alpha is the Laplace smoothing parameter
mnb.fit(X_train_text, y_train_text)
# Predict
y_pred_text = mnb.predict(X_test_text)
y_pred_proba_text = mnb.predict_proba(X_test_text)
# Evaluate
accuracy_text = accuracy_score(y_test_text, y_pred_text)
print(f"\nMultinomial Naive Bayes text-classification accuracy: {accuracy_text:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_test_text, y_pred_text, target_names=label_names))

9.4.2 Feature Importance Analysis
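MultinomialNB exposes feature_log_prob_, the per-class log of P(word|class); exponentiating a row therefore recovers a probability distribution over the whole vocabulary, which is what the ranking below relies on. A quick sanity check:
python
# Each row of exp(feature_log_prob_) is P(word | class) and sums to 1
print(np.exp(mnb.feature_log_prob_).sum(axis=1))  # -> [1. 1. 1.]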
python
# Analyze the most informative feature words
feature_names = vectorizer.get_feature_names_out()
# Per-class log probabilities of each feature
feature_log_prob = mnb.feature_log_prob_
print("Most informative feature words per class:")
print("=" * 50)
for i, class_name in enumerate(label_names):
    print(f"\nTop words for {class_name}:")
    # Log probabilities for this class
    class_prob = feature_log_prob[i]
    # Indices of the highest-probability words
    top_indices = np.argsort(class_prob)[-10:]
    for j, idx in enumerate(reversed(top_indices)):
        word = feature_names[idx]
        prob = np.exp(class_prob[idx])
        print(f"  {j+1:2d}. {word}: {prob:.4f}")
# Visualize feature importance
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for i, class_name in enumerate(label_names):
    class_prob = np.exp(feature_log_prob[i])
    top_indices = np.argsort(class_prob)[-10:]
    top_words = [feature_names[idx] for idx in top_indices]
    top_probs = [class_prob[idx] for idx in top_indices]
    axes[i].barh(range(len(top_words)), top_probs)
    axes[i].set_yticks(range(len(top_words)))
    axes[i].set_yticklabels(top_words)
    axes[i].set_xlabel('Probability')
    axes[i].set_title(f'Top feature words: {class_name}')
    axes[i].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

9.4.3 The Effect of the Smoothing Parameter
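Before sweeping alpha, it helps to see exactly what Laplace smoothing computes: every word count is inflated by alpha before normalizing, P(w|c) = (count(w,c) + α) / (total count in c + α × |V|), so a word never seen in class c keeps a small nonzero probability instead of zeroing out the whole product. A worked toy example:
python
import numpy as np

counts = np.array([3, 1, 0, 0])  # counts of a 4-word vocabulary within one class
for alpha in [0.0, 1.0]:
    p = (counts + alpha) / (counts.sum() + alpha * len(counts))
    print(alpha, p)
# alpha=0.0 -> [0.75  0.25  0.    0.   ]  (unseen words get probability 0)
# alpha=1.0 -> [0.5   0.25  0.125 0.125]  (every word keeps some mass)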
python
# Analyze the effect of the Laplace smoothing parameter alpha
alpha_values = [0.1, 0.5, 1.0, 2.0, 5.0]
alpha_results = {}
print("Effect of the Laplace smoothing parameter alpha:")
print("alpha\tAccuracy\tCV score")
print("-" * 40)
for alpha in alpha_values:
    mnb_alpha = MultinomialNB(alpha=alpha)
    mnb_alpha.fit(X_train_text, y_train_text)
    # Test-set performance
    y_pred_alpha = mnb_alpha.predict(X_test_text)
    accuracy_alpha = accuracy_score(y_test_text, y_pred_alpha)
    # Cross-validation
    cv_scores = cross_val_score(mnb_alpha, X_text, labels, cv=5)
    cv_mean = np.mean(cv_scores)
    alpha_results[alpha] = {'accuracy': accuracy_alpha, 'cv_score': cv_mean}
    print(f"{alpha}\t{accuracy_alpha:.4f}\t\t{cv_mean:.4f}")
# Visualize the effect of alpha
plt.figure(figsize=(10, 6))
alphas = list(alpha_results.keys())
accuracies = [alpha_results[alpha]['accuracy'] for alpha in alphas]
cv_scores = [alpha_results[alpha]['cv_score'] for alpha in alphas]
plt.plot(alphas, accuracies, 'o-', label='Test accuracy', linewidth=2, markersize=8)
plt.plot(alphas, cv_scores, 's-', label='CV score', linewidth=2, markersize=8)
plt.xlabel('alpha')
plt.ylabel('Score')
plt.title('Effect of Laplace smoothing on performance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xscale('log')
plt.show()

9.5 Bernoulli Naive Bayes
9.5.1 Working with Binary Features
Bernoulli Naive Bayes is designed for binary features, such as whether a document contains a given word.
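Unlike the multinomial model, the Bernoulli model also scores the absence of a word: P(x|y) = ∏ᵢ pᵢ^xᵢ × (1-pᵢ)^(1-xᵢ), where pᵢ = P(word i present | y). A minimal sketch with made-up per-class probabilities:
python
import numpy as np

p = np.array([0.9, 0.2, 0.6])   # assumed P(word present | class) for 3 words
x = np.array([1, 0, 1])         # document encoded as presence/absence
# Present words contribute p_i, absent words contribute (1 - p_i)
likelihood = np.prod(p ** x * (1 - p) ** (1 - x))
print(likelihood)               # 0.9 * 0.8 * 0.6 = 0.432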
python
# Create binary features: does each word occur in the document?
binary_vectorizer = CountVectorizer(binary=True, max_features=50)
X_binary = binary_vectorizer.fit_transform(texts)
print(f"Binary feature matrix shape: {X_binary.shape}")
print("Feature sample (first 10 features of the first 5 documents):")
print(X_binary[:5, :10].toarray())
# Split the data
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_binary, labels, test_size=0.3, random_state=42, stratify=labels
)
# Train a Bernoulli Naive Bayes classifier
bnb = BernoulliNB(alpha=1.0)
bnb.fit(X_train_binary, y_train_binary)
# Predict
y_pred_binary = bnb.predict(X_test_binary)
accuracy_binary = accuracy_score(y_test_binary, y_pred_binary)
print(f"\nBernoulli Naive Bayes accuracy: {accuracy_binary:.4f}")
# Compare Multinomial and Bernoulli Naive Bayes
print("\nMultinomial vs. Bernoulli Naive Bayes:")
print("Model\t\t\tAccuracy")
print("-" * 30)
# Multinomial Naive Bayes on the same binary data
mnb_binary = MultinomialNB(alpha=1.0)
mnb_binary.fit(X_train_binary, y_train_binary)
y_pred_mnb_binary = mnb_binary.predict(X_test_binary)
accuracy_mnb_binary = accuracy_score(y_test_binary, y_pred_mnb_binary)
print(f"Multinomial NB\t\t{accuracy_mnb_binary:.4f}")
print(f"Bernoulli NB\t\t{accuracy_binary:.4f}")

9.5.2 The Effect of the Number of Features
python
# Analyze how the number of features affects performance
feature_numbers = [10, 20, 50, 100, 200]
performance_comparison = {}
for n_features in feature_numbers:
    # Vectorizers with different vocabulary sizes
    vec_count = CountVectorizer(max_features=n_features)
    vec_binary = CountVectorizer(binary=True, max_features=n_features)
    # Vectorize
    X_count = vec_count.fit_transform(texts)
    X_bin = vec_binary.fit_transform(texts)
    # Split the data
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        X_count, labels, test_size=0.3, random_state=42, stratify=labels
    )
    X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
        X_bin, labels, test_size=0.3, random_state=42, stratify=labels
    )
    # Train the models
    mnb_temp = MultinomialNB(alpha=1.0)
    bnb_temp = BernoulliNB(alpha=1.0)
    mnb_temp.fit(X_train_c, y_train_c)
    bnb_temp.fit(X_train_b, y_train_b)
    # Predict
    y_pred_mnb_temp = mnb_temp.predict(X_test_c)
    y_pred_bnb_temp = bnb_temp.predict(X_test_b)
    # Accuracy
    acc_mnb = accuracy_score(y_test_c, y_pred_mnb_temp)
    acc_bnb = accuracy_score(y_test_b, y_pred_bnb_temp)
    performance_comparison[n_features] = {
        'multinomial': acc_mnb,
        'bernoulli': acc_bnb
    }
# Visualize the effect of the number of features
plt.figure(figsize=(10, 6))
features = list(performance_comparison.keys())
mnb_accs = [performance_comparison[f]['multinomial'] for f in features]
bnb_accs = [performance_comparison[f]['bernoulli'] for f in features]
plt.plot(features, mnb_accs, 'o-', label='Multinomial NB', linewidth=2, markersize=8)
plt.plot(features, bnb_accs, 's-', label='Bernoulli NB', linewidth=2, markersize=8)
plt.xlabel('Number of features')
plt.ylabel('Accuracy')
plt.title('Effect of the number of features on Naive Bayes performance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
print("Effect of the number of features:")
print("Features\tMultinomial NB\tBernoulli NB")
print("-" * 35)
for f in features:
    print(f"{f}\t{performance_comparison[f]['multinomial']:.4f}\t\t{performance_comparison[f]['bernoulli']:.4f}")

9.6 Complement Naive Bayes
9.6.1 Handling Imbalanced Data
Complement Naive Bayes is designed for imbalanced text-classification problems.
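The key idea (Rennie et al., 2003) is to estimate each class's word statistics from the complement — every document not in the class — so that small classes borrow statistical strength from the rest of the corpus. A rough sketch of the complement counts only, omitting the weight normalization that scikit-learn's ComplementNB additionally applies:
python
import numpy as np

# Toy term-count matrix: 4 documents x 3 terms, imbalanced classes [0, 0, 0, 1]
X = np.array([[2, 1, 0],
              [1, 0, 0],
              [3, 1, 1],
              [0, 0, 4]])
y = np.array([0, 0, 0, 1])
alpha = 1.0
for c in np.unique(y):
    comp = X[y != c].sum(axis=0)  # term counts from all documents *outside* class c
    p_comp = (comp + alpha) / (comp.sum() + alpha * X.shape[1])
    print(f"class {c}: complement P = {p_comp}")
# The minority class (1) is described by the 3 majority documents, and vice versa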
python
# Create an imbalanced text dataset.
# texts[:12] covers all 8 technology texts plus the first 4 sports texts;
# texts[16:20] adds 4 food texts, giving an 8/4/4 class split
imbalanced_texts = texts[:12] + texts[16:20]
imbalanced_labels = [0]*8 + [1]*4 + [2]*4  # 8 technology, 4 sports, 4 food
print("Imbalanced dataset:")
print(f"Class distribution: {np.bincount(imbalanced_labels)}")
print(f"Class proportions: {np.bincount(imbalanced_labels) / len(imbalanced_labels)}")
# Vectorize
imbalanced_vectorizer = CountVectorizer(max_features=50)
X_imbalanced = imbalanced_vectorizer.fit_transform(imbalanced_texts)
# Split the data
X_train_imb, X_test_imb, y_train_imb, y_test_imb = train_test_split(
    X_imbalanced, imbalanced_labels, test_size=0.3, random_state=42, stratify=imbalanced_labels
)
# Compare the Naive Bayes variants on the imbalanced data
nb_algorithms = {
    'Multinomial NB': MultinomialNB(alpha=1.0),
    'Complement NB': ComplementNB(alpha=1.0),
    'Bernoulli NB': BernoulliNB(alpha=1.0)
}
print("\nPerformance on the imbalanced dataset:")
print("Algorithm\t\tAccuracy\tMacro F1")
print("-" * 50)
from sklearn.metrics import f1_score
for name, algorithm in nb_algorithms.items():
    if name == 'Bernoulli NB':
        # Binarize the features for Bernoulli Naive Bayes
        X_train_temp = (X_train_imb > 0).astype(int)
        X_test_temp = (X_test_imb > 0).astype(int)
    else:
        X_train_temp = X_train_imb
        X_test_temp = X_test_imb
    algorithm.fit(X_train_temp, y_train_imb)
    y_pred_imb = algorithm.predict(X_test_temp)
    accuracy_imb = accuracy_score(y_test_imb, y_pred_imb)
    f1_macro = f1_score(y_test_imb, y_pred_imb, average='macro')
    print(f"{name}\t{accuracy_imb:.4f}\t\t{f1_macro:.4f}")
# A closer look at Complement Naive Bayes
cnb = ComplementNB(alpha=1.0)
cnb.fit(X_train_imb, y_train_imb)
y_pred_cnb = cnb.predict(X_test_imb)
print("\nComplement Naive Bayes classification report:")
print(classification_report(y_test_imb, y_pred_cnb, target_names=label_names))

9.7 Practical Applications
9.7.1 Spam Filtering
python
# Build a small spam-filtering dataset (Chinese email samples kept as-is)
spam_emails = [
    "恭喜您中奖了!立即点击领取大奖!",
    "免费获得iPhone,仅限今天!",
    "投资理财,月收益30%,无风险!",
    "减肥药效果神奇,一周瘦10斤!",
    "贷款无需抵押,当天放款!",
    "点击链接获得免费礼品!",
    "特价商品,限时抢购!",
    "网络兼职,日赚500元!"
]
normal_emails = [
    "明天的会议改到下午3点,请准时参加。",
    "您的订单已发货,预计3天内到达。",
    "感谢您参加我们的产品发布会。",
    "请查收本月的工作报告。",
    "周末聚餐的地点定在市中心餐厅。",
    "项目进度更新,请查看附件。",
    "生日快乐!祝您身体健康!",
    "课程安排有调整,请注意查看。"
]
# Combine the data
all_emails = spam_emails + normal_emails
email_labels = [1]*len(spam_emails) + [0]*len(normal_emails)  # 1 = spam, 0 = normal
print("Email classification dataset:")
print(f"Total emails: {len(all_emails)}")
print(f"Spam emails: {sum(email_labels)}")
print(f"Normal emails: {len(email_labels) - sum(email_labels)}")
# Vectorize the texts
email_vectorizer = TfidfVectorizer(max_features=100, stop_words=None)
X_emails = email_vectorizer.fit_transform(all_emails)
# Split the data
X_train_email, X_test_email, y_train_email, y_test_email = train_test_split(
    X_emails, email_labels, test_size=0.3, random_state=42, stratify=email_labels
)
# Train a Multinomial Naive Bayes classifier
spam_classifier = MultinomialNB(alpha=1.0)
spam_classifier.fit(X_train_email, y_train_email)
# Predict
y_pred_email = spam_classifier.predict(X_test_email)
y_pred_proba_email = spam_classifier.predict_proba(X_test_email)
# Evaluate
accuracy_email = accuracy_score(y_test_email, y_pred_email)
print(f"\nSpam classification accuracy: {accuracy_email:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_test_email, y_pred_email,
                            target_names=['Normal', 'Spam']))
# Analyze the informative features
feature_names_email = email_vectorizer.get_feature_names_out()
feature_log_prob_email = spam_classifier.feature_log_prob_
print("\nTop feature words for spam:")
spam_prob = np.exp(feature_log_prob_email[1])  # the spam class
top_spam_indices = np.argsort(spam_prob)[-10:]
for i, idx in enumerate(reversed(top_spam_indices)):
    word = feature_names_email[idx]
    prob = spam_prob[idx]
    print(f"  {i+1:2d}. {word}: {prob:.4f}")
print("\nTop feature words for normal email:")
normal_prob = np.exp(feature_log_prob_email[0])  # the normal class
top_normal_indices = np.argsort(normal_prob)[-10:]
for i, idx in enumerate(reversed(top_normal_indices)):
    word = feature_names_email[idx]
    prob = normal_prob[idx]
    print(f"  {i+1:2d}. {word}: {prob:.4f}")
9.7.2 Sentiment Analysis
python
# Build a small sentiment-analysis dataset (Chinese reviews kept as-is)
positive_reviews = [
    "这部电影太精彩了,演员表演出色!",
    "服务态度很好,菜品味道不错。",
    "产品质量优秀,物超所值。",
    "课程内容丰富,老师讲解清晰。",
    "环境优美,设施完善。",
    "工作人员热情友好,体验很棒。"
]
negative_reviews = [
    "电影剧情拖沓,浪费时间。",
    "服务差劲,态度恶劣。",
    "产品质量有问题,不推荐购买。",
    "课程内容过时,讲解不清楚。",
    "环境嘈杂,设施陈旧。",
    "工作人员不专业,体验糟糕。"
]
# Combine the data
all_reviews = positive_reviews + negative_reviews
sentiment_labels = [1]*len(positive_reviews) + [0]*len(negative_reviews)  # 1 = positive, 0 = negative
print("Sentiment analysis dataset:")
print(f"Total reviews: {len(all_reviews)}")
print(f"Positive reviews: {sum(sentiment_labels)}")
print(f"Negative reviews: {len(sentiment_labels) - sum(sentiment_labels)}")
# Vectorize the texts
sentiment_vectorizer = TfidfVectorizer(max_features=50)
X_sentiment = sentiment_vectorizer.fit_transform(all_reviews)
# Train a Naive Bayes classifier (on the full set here, since the data is tiny)
sentiment_classifier = MultinomialNB(alpha=1.0)
sentiment_classifier.fit(X_sentiment, sentiment_labels)
# Try it on new reviews
test_reviews = [
    "这个产品真的很棒,强烈推荐!",
    "质量太差了,完全不值这个价格。",
    "服务还可以,但是有改进空间。"
]
test_vectors = sentiment_vectorizer.transform(test_reviews)
test_predictions = sentiment_classifier.predict(test_vectors)
test_probabilities = sentiment_classifier.predict_proba(test_vectors)
print("\nSentiment predictions for new reviews:")
for i, review in enumerate(test_reviews):
    sentiment = "positive" if test_predictions[i] == 1 else "negative"
    confidence = np.max(test_probabilities[i])
    print(f"Review: {review}")
    print(f"Sentiment: {sentiment} (confidence: {confidence:.3f})")
    print()

9.8 Tips for Optimizing Naive Bayes
9.8.1 Feature Engineering
python
# Demonstrate how different feature-engineering choices affect Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Reuse the original text dataset
original_texts = texts
original_labels = labels
# 1. Different vectorization methods
vectorizers = {
    'CountVectorizer': CountVectorizer(max_features=100),
    'TfidfVectorizer': TfidfVectorizer(max_features=100),
    'Binary CountVectorizer': CountVectorizer(binary=True, max_features=100)
}
print("Effect of the vectorization method:")
print("Method\t\t\tAccuracy")
print("-" * 35)
for name, vectorizer in vectorizers.items():
    X_vec = vectorizer.fit_transform(original_texts)
    X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(
        X_vec, original_labels, test_size=0.3, random_state=42, stratify=original_labels
    )
    # Bernoulli NB for binary features, Multinomial NB otherwise
    if 'Binary' in name:
        nb_vec = BernoulliNB(alpha=1.0)
    else:
        nb_vec = MultinomialNB(alpha=1.0)
    nb_vec.fit(X_train_vec, y_train_vec)
    y_pred_vec = nb_vec.predict(X_test_vec)
    accuracy_vec = accuracy_score(y_test_vec, y_pred_vec)
    print(f"{name}\t{accuracy_vec:.4f}")
# 2. The effect of feature selection
print("\nEffect of feature selection:")
print("Features\tAccuracy")
print("-" * 25)
# Vectorize with TF-IDF
tfidf_vec = TfidfVectorizer(max_features=200)
X_tfidf = tfidf_vec.fit_transform(original_texts)
k_values = [10, 20, 50, 100, 150]
for k in k_values:
    # Guard: the tiny corpus may yield fewer features than k, and SelectKBest
    # raises an error when k exceeds the number of available features
    k_eff = min(k, X_tfidf.shape[1])
    # Select features with the chi-squared test
    selector = SelectKBest(chi2, k=k_eff)
    X_selected = selector.fit_transform(X_tfidf, original_labels)
    X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
        X_selected, original_labels, test_size=0.3, random_state=42, stratify=original_labels
    )
    nb_sel = MultinomialNB(alpha=1.0)
    nb_sel.fit(X_train_sel, y_train_sel)
    y_pred_sel = nb_sel.predict(X_test_sel)
    accuracy_sel = accuracy_score(y_test_sel, y_pred_sel)
    print(f"{k_eff}\t\t{accuracy_sel:.4f}")

9.8.2 Ensembles of Naive Bayes Models
python
# Build an ensemble of Naive Bayes models
from sklearn.ensemble import VotingClassifier  # used in the pipeline-based variant sketched after this block

# Prepare three different Naive Bayes models
# 1. Multinomial NB on raw word counts
count_vec = CountVectorizer(max_features=100)
X_count = count_vec.fit_transform(original_texts)
# 2. Multinomial NB on TF-IDF features
tfidf_vec = TfidfVectorizer(max_features=100)
X_tfidf = tfidf_vec.fit_transform(original_texts)
# 3. Bernoulli NB on binary features
binary_vec = CountVectorizer(binary=True, max_features=100)
X_binary = binary_vec.fit_transform(original_texts)
# Split the data; the shared random_state keeps the three splits row-aligned
X_train_count, X_test_count, y_train, y_test = train_test_split(
    X_count, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(
    X_tfidf, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)
X_train_binary, X_test_binary, _, _ = train_test_split(
    X_binary, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)
# Train the individual models
nb_count = MultinomialNB(alpha=1.0)
nb_tfidf = MultinomialNB(alpha=1.0)
nb_binary = BernoulliNB(alpha=1.0)
nb_count.fit(X_train_count, y_train)
nb_tfidf.fit(X_train_tfidf, y_train)
nb_binary.fit(X_train_binary, y_train)
# Predict
y_pred_count = nb_count.predict(X_test_count)
y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)
y_pred_binary = nb_binary.predict(X_test_binary)
# Simple majority-vote ensemble
ensemble_pred = []
for i in range(len(y_test)):
    votes = [y_pred_count[i], y_pred_tfidf[i], y_pred_binary[i]]
    ensemble_pred.append(max(set(votes), key=votes.count))
# Evaluate
print("Naive Bayes ensemble results:")
print("Model\t\t\tAccuracy")
print("-" * 35)
print(f"Count Multinomial NB\t{accuracy_score(y_test, y_pred_count):.4f}")
print(f"TF-IDF Multinomial NB\t{accuracy_score(y_test, y_pred_tfidf):.4f}")
print(f"Binary Bernoulli NB\t{accuracy_score(y_test, y_pred_binary):.4f}")
print(f"Majority vote\t\t{accuracy_score(y_test, ensemble_pred):.4f}")
# Visualize the ensemble comparison
models = ['Count NB', 'TF-IDF NB', 'Binary NB', 'Voting ensemble']
accuracies = [
    accuracy_score(y_test, y_pred_count),
    accuracy_score(y_test, y_pred_tfidf),
    accuracy_score(y_test, y_pred_binary),
    accuracy_score(y_test, ensemble_pred)
]
plt.figure(figsize=(10, 6))
bars = plt.bar(models, accuracies, color=['skyblue', 'lightgreen', 'lightcoral', 'gold'], alpha=0.7)
plt.title('Naive Bayes ensemble comparison')
plt.ylabel('Accuracy')
plt.ylim(0.6, 1.0)
# Annotate the bars with their values
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
             f'{acc:.3f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()
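The manual vote above can also be expressed with the VotingClassifier imported earlier by wrapping each vectorizer/model pair in a Pipeline, so that all three estimators operate on the same raw texts. A sketch:
python
voting_nb = VotingClassifier(estimators=[
    ('count', Pipeline([('vec', CountVectorizer(max_features=100)),
                        ('nb', MultinomialNB(alpha=1.0))])),
    ('tfidf', Pipeline([('vec', TfidfVectorizer(max_features=100)),
                        ('nb', MultinomialNB(alpha=1.0))])),
    ('binary', Pipeline([('vec', CountVectorizer(binary=True, max_features=100)),
                         ('nb', BernoulliNB(alpha=1.0))])),
], voting='hard')
train_texts, test_texts, train_y, test_y = train_test_split(
    original_texts, original_labels, test_size=0.3, random_state=42, stratify=original_labels
)
voting_nb.fit(train_texts, train_y)
print(f"VotingClassifier accuracy: {voting_nb.score(test_texts, test_y):.4f}")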
9.9 Exercises
Exercise 1: Naive Bayes basics
- Train a Gaussian Naive Bayes classifier on the Wine dataset
- Inspect the parameters the model learns (per-class means and variances)
- Compare performance with and without feature standardization
Exercise 2: Text classification
- Collect or build a multi-class text dataset
- Compare the performance of Multinomial and Bernoulli Naive Bayes
- Analyze how different smoothing parameters affect performance
Exercise 3: Feature engineering
- Classify a news dataset
- Compare vectorization methods (count, TF-IDF, binary)
- Use feature selection to improve model performance
Exercise 4: Handling imbalanced data
- Build a severely imbalanced classification dataset
- Compare how the Naive Bayes variants perform on it
- Try sampling techniques to improve performance
9.10 Summary
In this chapter we examined the Naive Bayes algorithm from several angles:
Core concepts
- Bayes' theorem: the mathematical foundation of probabilistic inference
- The naive assumption: conditional independence of features, and its consequences
- The variants: Gaussian, Multinomial, Bernoulli, and Complement Naive Bayes
Key techniques
- Gaussian Naive Bayes: continuous features, modeled with a normal distribution per class
- Multinomial Naive Bayes: discrete count features, well suited to text classification
- Bernoulli Naive Bayes: binary presence/absence features
- Complement Naive Bayes: imbalanced datasets
Practical skills
- Text classification: spam filtering and sentiment analysis
- Feature engineering: vectorization and feature selection
- Parameter tuning: choosing the smoothing parameter and understanding its effect
- Ensembling: combining different Naive Bayes models
Key takeaways
- Naive Bayes is simple and efficient, well suited to rapid prototyping
- It excels on high-dimensional sparse data such as text
- Appropriate smoothing is required to handle the zero-probability problem
- The independence assumption may be "naive", but it often works well in practice
9.11 Next Steps
You now have a solid grasp of Naive Bayes, an important probabilistic classifier. In the next chapter, on k-Nearest Neighbors, we turn to a completely different, instance-based approach to learning: you are judged by the company you keep.
Chapter checklist:
- ✅ Understood Bayes' theorem and the naive assumption
- ✅ Learned which Naive Bayes variant fits which scenario
- ✅ Worked through a complete text-classification workflow
- ✅ Saw why feature engineering matters for Naive Bayes
- ✅ Learned techniques for handling imbalanced data
- ✅ Can build a practical text-classification system