实验内容:
(1)从 scikit-learn 库中加载 iris 数据集或本地读取,进行数据分析,去除数据集中类别标签行;
(2)采用五折交叉验证划分训练集和测试集,使用训练集对K均值聚类算法进行训练;
(3)使用五折交叉验证对模型性能(准确度、精度、召回率和F1值)进行测试,与测试集中的样本距离最近的聚类中心类别即为该样本的预测类别;
(4)通过对测试结果进行比较分析,评估模型性能;
执行代码
<details>
<summary>点击查看代码</summary>

```python
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
import sys
import io# 设置输出编码为UTF-8,解决Windows PowerShell中文乱码问题
# Switch the console streams to UTF-8 so the Chinese output is not garbled in
# Windows PowerShell.
if sys.platform == 'win32':
    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name)
        if hasattr(_stream, 'reconfigure'):
            # Python 3.7+: change the encoding in place and keep the same
            # stream object.
            _stream.reconfigure(encoding='utf-8')
        elif hasattr(_stream, 'buffer'):
            # Older interpreters: wrap the underlying binary buffer instead.
            setattr(sys, _stream_name,
                    io.TextIOWrapper(_stream.buffer, encoding='utf-8'))
        # Streams offering neither attribute (or a missing stream) are left
        # untouched rather than raising AttributeError.

# Suppress all warnings for the remainder of the script.
warnings.filterwarnings('ignore')

# ============================================================================
# 步骤1:加载数据并分析(无监督聚类不使用标签训练)
# ============================================================================
print("=" * 80)
print("步骤1:从scikit-learn库中加载iris数据集,进行数据分析(去除标签后聚类)")
print("=" * 80)
print()

# load_iris() returns a container whose data / target / feature_names /
# target_names attributes are read below.
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
target_names = iris.target_names

print(f"数据集形状: {X.shape}")
print(f"特征数量: {X.shape[1]}")
print(f"样本数量: {X.shape[0]}")
print(f"类别数量: {len(np.unique(y))}")
print()
print(f"特征名称: {feature_names}")
print(f"类别名称: {target_names}")
print()

# Summary statistics of the feature columns; the species column holds the
# class name of every sample.
print("数据统计信息:")
iris_df = pd.DataFrame(X, columns=feature_names)
iris_df['species'] = [target_names[i] for i in y]
print(iris_df.describe())
print()

# Number and share of samples per class.
print("类别分布:")
class_counts = np.bincount(y, minlength=len(target_names))
for name, count in zip(target_names, class_counts):
    print(f" {name}: {count} 个样本 ({count/len(y)*100:.1f}%)")
print()

print("缺失值检查:")
print(f" 缺失值数量: {iris_df.isnull().sum().sum()}")
print()

# Pairwise correlation between the feature columns only.
print("特征相关性分析:")
correlation = iris_df[feature_names].corr()
print(correlation)
print()# ============================================================================
# 步骤2:五折交叉验证训练K均值聚类模型
# ============================================================================
print("=" * 80)
print("步骤2:五折交叉验证训练K均值聚类模型(无监督)")
print("=" * 80)
print()

# Standardise the features.
# NOTE: this scaler is fitted on the complete dataset, so its mean/std include
# the samples that later serve as test folds.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("数据标准化完成")
print(f" 标准化前 - 均值: {X.mean(axis=0).round(2)}, 标准差: {X.std(axis=0).round(2)}")
print(f" 标准化后 - 均值: {X_scaled.mean(axis=0).round(2)}, 标准差: {X_scaled.std(axis=0).round(2)}")
print()

# Five shuffled folds with a fixed seed so the split is reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

print("创建K均值模型...")
print()

# Per-fold scores, plus the pooled true/predicted labels of all test folds.
fold_results = {'fold': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
all_y_true = []
all_y_pred = []

print("开始五折交叉验证...")
print("-" * 80)


def assign_labels_from_clusters(model, X_train, y_train):
    """Map every cluster index to a class label by majority vote.

    Args:
        model: Fitted clustering estimator exposing ``n_clusters`` and
            ``labels_`` (the cluster index assigned to each training sample).
        X_train: Training features. Not used by the vote; kept so the
            signature stays unchanged for existing callers.
        y_train: Non-negative integer class labels aligned one-to-one with
            ``model.labels_``. Any array-like is accepted.

    Returns:
        dict: ``{cluster_index: label}`` with one entry per cluster. A cluster
        without training members maps to label ``0``; on a tie the smallest
        label wins.
    """
    assignments = np.asarray(model.labels_)
    labels = np.asarray(y_train)

    cluster_labels = {}
    for k in range(model.n_clusters):
        members = labels[assignments == k]
        # No members means no votes: fall back to class 0 instead of calling
        # argmax on an empty histogram, which would raise.
        cluster_labels[k] = np.bincount(members).argmax() if members.size else 0
    return cluster_labels


fold_num = 1
# Train and evaluate one K-means model per fold. The split depends only on the
# number of samples, so splitting X_scaled yields the fold indices.
for fold_num, (train_idx, test_idx) in enumerate(kfold.split(X_scaled), start=1):
    print(f"\n第 {fold_num} 折:")
    print(f" 训练集大小: {len(train_idx)} ({len(train_idx)/len(X)*100:.1f}%)")
    print(f" 测试集大小: {len(test_idx)} ({len(test_idx)/len(X)*100:.1f}%)")

    # Fit the scaler on the training fold only and reuse its statistics for
    # the test fold, so no test-set information leaks into preprocessing.
    fold_scaler = StandardScaler().fit(X[train_idx])
    X_train = fold_scaler.transform(X[train_idx])
    X_test = fold_scaler.transform(X[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # Unsupervised clustering; n_clusters=3 matches the three iris species.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10, max_iter=300)
    kmeans.fit(X_train)

    # Labels are used only after training, to name each cluster.
    cluster_to_label = assign_labels_from_clusters(kmeans, X_train, y_train)

    # Prediction: nearest cluster centre, then the label mapped to that cluster.
    test_clusters = kmeans.predict(X_test)
    y_pred = np.array([cluster_to_label[c] for c in test_clusters])

    all_y_true.extend(y_test)
    all_y_pred.extend(y_pred)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    fold_results['fold'].append(fold_num)
    fold_results['accuracy'].append(acc)
    fold_results['precision'].append(prec)
    fold_results['recall'].append(rec)
    fold_results['f1'].append(f1)

    print(f" 准确度 (Accuracy): {acc:.4f}")
    print(f" 精度 (Precision): {prec:.4f}")
    print(f" 召回率 (Recall): {rec:.4f}")
    print(f" F1值 (F1-Score): {f1:.4f}")

print("\n" + "=" * 80)
print("五折交叉验证完成!")
print("=" * 80)
print()# ============================================================================
# 步骤3:计算整体指标并展示详细结果
# ============================================================================
print("=" * 80)
print("步骤3:使用五折交叉验证对模型性能(准确度、精度、召回率和F1值)进行测试")
print("=" * 80)
print()

# Pooled metrics over the predictions collected from all five test folds.
overall_accuracy = accuracy_score(all_y_true, all_y_pred)
overall_precision = precision_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
overall_recall = recall_score(all_y_true, all_y_pred, average='weighted', zero_division=0)
overall_f1 = f1_score(all_y_true, all_y_pred, average='weighted', zero_division=0)

print("整体评估指标(基于所有5折的预测结果):")
print(f" 准确度 (Accuracy): {overall_accuracy:.4f}")
print(f" 精度 (Precision): {overall_precision:.4f}")
print(f" 召回率 (Recall): {overall_recall:.4f}")
print(f" F1值 (F1-Score): {overall_f1:.4f}")
print()

# Mean / std / min / max of each per-fold score.
print("五折交叉验证统计结果:")
print(f"{'指标':<15} {'平均值':<12} {'标准差':<12} {'最小值':<12} {'最大值':<12}")
print("-" * 70)
for row_label, metric_key in (('准确度', 'accuracy'), ('精度', 'precision'),
                              ('召回率', 'recall'), ('F1值', 'f1')):
    values = fold_results[metric_key]
    print(f"{row_label:<15} {np.mean(values):<12.4f} {np.std(values):<12.4f} {np.min(values):<12.4f} {np.max(values):<12.4f}")
print()

print("按类别显示详细评估指标:")
# Pass the class ids explicitly so position i of every per-class array always
# refers to target_names[i], even if a class never shows up in the predictions.
class_ids = np.arange(len(target_names))
precision_per_class = precision_score(all_y_true, all_y_pred, labels=class_ids, average=None, zero_division=0)
recall_per_class = recall_score(all_y_true, all_y_pred, labels=class_ids, average=None, zero_division=0)
f1_per_class = f1_score(all_y_true, all_y_pred, labels=class_ids, average=None, zero_division=0)

print(f"{'类别':<20} {'精度':<12} {'召回率':<12} {'F1值':<12}")
print("-" * 60)
for i, class_name in enumerate(target_names):
    print(f"{class_name:<20} {precision_per_class[i]:<12.4f} {recall_per_class[i]:<12.4f} {f1_per_class[i]:<12.4f}")
print()

print("混淆矩阵(Confusion Matrix):")
# confusion_matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(all_y_true, all_y_pred, labels=class_ids)
print("\n真实类别(行) vs 预测类别(列):")
print(f"{'':<15}" + "".join(f"{name:<15}" for name in target_names))
for name, row in zip(target_names, cm):
    print(f"{name:<15}" + "".join(f"{count:<15}" for count in row))
print()

print("分类报告(Classification Report):")
# classification_report: precision / recall / F1 / support per class.
report = classification_report(all_y_true, all_y_pred, labels=class_ids, target_names=target_names)
print("\n" + report)
print()# ============================================================================
# 步骤4:结果分析
# ============================================================================
print("=" * 80)
print("步骤4:通过对测试结果进行比较分析,评估模型性能")
print("=" * 80)
print()

print("模型性能分析")
print("=" * 80)
print()

# Pooled accuracy next to the mean and spread of the per-fold accuracies.
print("1. 准确度分析:")
print(f" 整体准确度 {overall_accuracy:.4f},五折平均 {np.mean(fold_results['accuracy']):.4f},标准差 {np.std(fold_results['accuracy']):.4f}")
print()

print("2. 精度分析:")
print(f" 整体精度 {overall_precision:.4f},反映预测为某类时的正确率。")
print()

print("3. 召回率分析:")
print(f" 整体召回率 {overall_recall:.4f},反映实际样本被聚类到对应类别的比例。")
print()

print("4. F1值分析:")
print(f" 整体F1值 {overall_f1:.4f},综合衡量精度与召回率。")
print()

# One summary line per class, in target_names order.
print("5. 类别分析:")
for i, class_name in enumerate(target_names):
    print(f" {class_name}: 精度 {precision_per_class[i]:.4f}, 召回率 {recall_per_class[i]:.4f}, F1值 {f1_per_class[i]:.4f}")
print()

print("实验完成!")
```

</details>