# 锦州市网站建设_网站建设公司_Windows Server_seo优化
# 2025/12/29 0:04:25 网站建设 项目流程

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

class NaiveBayesClassifier:
    """Hand-written Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution (the same assumption scikit-learn's ``GaussianNB`` makes).

    Attributes are populated by :meth:`fit`:
        classes: sorted array of the distinct labels seen in training.
        class_priors: mapping ``label -> P(label)``.
        feature_means: mapping ``label -> per-feature mean vector``.
        feature_vars: mapping ``label -> per-feature (population) variance``.
    """

    # Added to every variance so that a constant feature (variance 0) does
    # not cause a division by zero or log(0).
    _EPS = 1e-10

    def __init__(self):
        self.classes = None
        self.class_priors = {}
        self.feature_means = {}
        self.feature_vars = {}

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Args:
            X: feature matrix of shape (n_samples, n_features).
            y: label vector of shape (n_samples,).
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from a clean slate so a second fit() on different labels
        # does not keep statistics of classes that no longer exist.
        self.class_priors = {}
        self.feature_means = {}
        self.feature_vars = {}

        self.classes = np.unique(y)
        for cls in self.classes:
            X_cls = X[y == cls]
            self.class_priors[cls] = len(X_cls) / len(X)
            self.feature_means[cls] = np.mean(X_cls, axis=0)
            self.feature_vars[cls] = np.var(X_cls, axis=0)

    def _log_likelihood(self, x, mean, var):
        """Return the per-feature Gaussian log-density of ``x``.

        Working in log space avoids the underflow of ``exp`` to 0 (and the
        resulting ``log(0) = -inf``) for samples far away from the mean.
        """
        smoothed = var + self._EPS
        return -0.5 * (np.log(2.0 * np.pi * smoothed) + (x - mean) ** 2 / smoothed)

    def _calculate_likelihood(self, x, mean, var):
        """Return the per-feature Gaussian density of ``x``.

        Args:
            x: feature value(s).
            mean: mean of the distribution.
            var: variance of the distribution.

        Returns:
            The likelihood ``N(x; mean, var)``, element-wise.
        """
        return np.exp(self._log_likelihood(x, mean, var))

    def predict_single(self, x):
        """Predict the label of one sample.

        Args:
            x: feature vector of a single sample.

        Returns:
            The class whose log-posterior (log prior + summed per-feature
            log-likelihood) is largest.
        """
        x = np.asarray(x)
        log_posteriors = [
            np.log(self.class_priors[cls])
            + np.sum(
                self._log_likelihood(
                    x, self.feature_means[cls], self.feature_vars[cls]
                )
            )
            for cls in self.classes
        ]
        return self.classes[np.argmax(log_posteriors)]

    def predict(self, X):
        """Predict the labels of several samples.

        Args:
            X: feature matrix of shape (n_samples, n_features).

        Returns:
            Array with one predicted label per row of ``X``.
        """
        return np.array([self.predict_single(x) for x in X])

def load_and_explore_data():
    """Load the iris dataset and print a short exploratory summary.

    Prints the data shape, feature names, class names, class counts, the
    first five rows and the descriptive statistics of the data.

    Returns:
        A tuple ``(X, y, iris)``: the feature matrix, the label vector and
        the object returned by ``load_iris()``.
    """
    print("=== 1. 数据集加载与探索 ===")

    # Load the iris dataset.
    iris = load_iris()
    X, y = iris.data, iris.target

    print(f"数据集大小: {X.shape}")
    print(f"特征名称: {iris.feature_names}")
    print(f"目标类别: {iris.target_names}")
    print(f"类别分布: {np.bincount(y)}")

    # Build a DataFrame so the data is easier to inspect.
    df = pd.DataFrame(X, columns=iris.feature_names)
    df['target'] = y
    df['target_name'] = [iris.target_names[i] for i in y]

    print("\n数据集前5行:")
    print(df.head())

    print("\n数据集统计信息:")
    print(df.describe())

    return X, y, iris

def evaluate_model(y_true, y_pred, fold_num=None):
    """Compute and print classification metrics for one set of predictions.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        fold_num: optional fold number; when given it is used as a prefix
            in the printed heading.

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``
        and ``'f1'``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    prefix = f"第{fold_num}折 " if fold_num is not None else ""

    print(f"\n{prefix}性能指标:")
    print(f"准确度 (Accuracy): {accuracy:.4f}")
    print(f"精度 (Precision): {precision:.4f}")
    print(f"召回率 (Recall): {recall:.4f}")
    print(f"F1值 (F1-score): {f1:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

def cross_validation_experiment(X, y, n_splits=5, use_custom=True):
    """Run a stratified k-fold cross-validation of a Naive Bayes model.

    Folds are produced by ``StratifiedKFold`` with shuffling and a fixed
    ``random_state`` of 42, so both implementations see the same splits.

    Args:
        X: feature matrix.
        y: label vector.
        n_splits: number of folds.
        use_custom: if True evaluate ``NaiveBayesClassifier``, otherwise
            scikit-learn's ``GaussianNB``.

    Returns:
        List with one metrics dict (as returned by ``evaluate_model``) per
        fold.
    """
    print("\n=== 2. 五折交叉验证实验 ===")
    print(f"使用{'自定义' if use_custom else 'scikit-learn'}朴素贝叶斯算法")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    results = []
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        print(f"\n--- 第{fold}折 ---")
        print(f"训练集大小: {X_train.shape}, 测试集大小: {X_test.shape}")

        # Both models expose the same fit/predict interface, so only the
        # constructor differs between the two branches.
        nb = NaiveBayesClassifier() if use_custom else GaussianNB()
        nb.fit(X_train, y_train)
        y_pred = nb.predict(X_test)

        # Score this fold.
        results.append(evaluate_model(y_test, y_pred, fold))

    return results

def analyze_results(custom_results, sklearn_results):
    """Print per-fold metrics and summary statistics for both models.

    For each implementation the per-fold table, the column means and the
    column standard deviations are printed, followed by a side-by-side
    table of the mean metrics.

    Args:
        custom_results: list of per-fold metric dicts of the custom model.
        sklearn_results: list of per-fold metric dicts of the scikit-learn
            model.

    Returns:
        A tuple ``(custom_df, sklearn_df)`` with one DataFrame per model
        (one row per fold, one column per metric).
    """
    print("\n=== 3. 实验结果分析 ===")

    # DataFrames make the aggregate statistics one-liners.
    custom_df = pd.DataFrame(custom_results)
    sklearn_df = pd.DataFrame(sklearn_results)

    reports = (
        ("\n自定义朴素贝叶斯结果:", custom_df),
        ("\nscikit-learn朴素贝叶斯结果:", sklearn_df),
    )
    for heading, frame in reports:
        print(heading)
        print(frame)
        print("\n平均值:")
        print(frame.mean())
        print("\n标准差:")
        print(frame.std())

    # Side-by-side comparison of the mean metrics.
    print("\n=== 4. 两种实现对比 ===")
    comparison = pd.DataFrame({
        '自定义实现': custom_df.mean(),
        'scikit-learn': sklearn_df.mean(),
    })
    print(comparison)

    return custom_df, sklearn_df

def visualize_results(custom_df, sklearn_df):
    """Plot the metric comparison and a confusion matrix.

    Two figures are produced, saved to disk and shown:
      * ``naive_bayes_comparison.png`` - a 2x2 grid of box plots comparing
        the per-fold accuracy, precision, recall and F1 of both models.
      * ``confusion_matrix.png`` - confusion-matrix heat map of
        scikit-learn's ``GaussianNB`` on the first fold of a 5-fold
        stratified split of the iris data.

    Args:
        custom_df: per-fold metrics of the custom implementation.
        sklearn_df: per-fold metrics of the scikit-learn implementation.
    """
    # NOTE(review): titles/labels are Chinese; matplotlib needs a font with
    # CJK glyphs configured, otherwise they may render as empty boxes.
    _, axes = plt.subplots(2, 2, figsize=(15, 10))

    metrics = ['accuracy', 'precision', 'recall', 'f1']
    titles = ['准确度', '精度', '召回率', 'F1值']

    # axes.flat walks the grid row by row, i.e. (0,0), (0,1), (1,0), (1,1).
    for ax, metric, title in zip(axes.flat, metrics, titles):
        ax.boxplot([custom_df[metric], sklearn_df[metric]])
        # Label the boxes afterwards instead of via boxplot(labels=...),
        # which newer matplotlib releases deprecate.
        ax.set_xticklabels(['自定义', 'scikit-learn'])
        ax.set_title(f'{title}对比')
        ax.set_ylabel(title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('naive_bayes_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Confusion matrix on the FIRST fold of the same split configuration
    # that the cross-validation experiment uses.
    iris = load_iris()
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    train_idx, test_idx = next(skf.split(iris.data, iris.target))
    X_train, X_test = iris.data[train_idx], iris.data[test_idx]
    y_train, y_test = iris.target[train_idx], iris.target[test_idx]

    # The scikit-learn model is used for the confusion matrix.
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=iris.target_names,
        yticklabels=iris.target_names,
    )
    plt.title('混淆矩阵 - 朴素贝叶斯分类')
    plt.xlabel('预测类别')
    plt.ylabel('真实类别')
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

def main():
    """Run the whole experiment: load data, cross-validate, analyse, plot."""
    print("朴素贝叶斯算法实验")
    print("=" * 50)

    # 1. Load and explore the data.
    X, y, iris = load_and_explore_data()

    # 2. Cross-validate the custom Naive Bayes implementation.
    custom_results = cross_validation_experiment(X, y, n_splits=5, use_custom=True)

    # 3. Cross-validate scikit-learn's Naive Bayes.
    sklearn_results = cross_validation_experiment(X, y, n_splits=5, use_custom=False)

    # 4. Analyse the results.
    custom_df, sklearn_df = analyze_results(custom_results, sklearn_results)

    # 5. Visualise the results.
    visualize_results(custom_df, sklearn_df)

    print("\n=== 5. 实验总结 ===")
    print("实验完成!主要发现:")
    print("1. 自定义朴素贝叶斯算法与scikit-learn实现效果相近")
    print("2. 两种实现都能达到较高的分类准确度(>0.9)")
    print("3. 五折交叉验证结果显示模型具有良好的泛化能力")
    print("4. 混淆矩阵显示各类别的分类效果良好")

# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

# 需要专业的网站建设服务?

# 联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

# 立即咨询