K-Means
K-Means是一種經典的無監督學習算法,用于將數據集劃分為K個簇(clusters),使得同一簇內的數據點相似度高,不同簇間的相似度低。它在數據挖掘、模式識別和機器學習中廣泛應用,如客戶細分、圖像壓縮和異常檢測。下面我將逐步介紹其核心原理、算法步驟、優缺點和應用場景。
要求:
理解并掌握K-Means算法,理解算法的原理,能夠實現算法,并對給定的數據集進行聚類
代碼實現:
import numpy as np
import matplotlib.pyplot as plt
import random
#算法實現
class KMeans:
    """Plain K-Means clustering.

    Attributes:
        n_clusters: number of clusters K.
        max_iter: maximum number of assign/update iterations.
        centroids: (n_clusters, n_features) float array, set by fit().
        labels: (n_samples,) cluster index of each training sample, set by fit().
    """

    def __init__(self, n_clusters=3, max_iter=300):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.centroids = None
        self.labels = None

    def fit(self, X):
        """Cluster X, shape (n_samples, n_features), into n_clusters groups.

        Iterates nearest-centroid assignment and centroid recomputation
        until the centroids stop moving or max_iter is reached.
        """
        # Randomly pick distinct samples as the initial centroids.
        indices = random.sample(range(len(X)), self.n_clusters)
        # Cast to float so centroid means and np.allclose behave even if
        # X carries an integer dtype.
        self.centroids = X[indices].astype(float)
        for _ in range(self.max_iter):
            # Assign every sample to its nearest centroid.
            distances = self._calc_distances(X)
            self.labels = np.argmin(distances, axis=1)
            # Recompute each centroid as the mean of its members.
            # An empty cluster keeps its previous centroid: the original
            # code took .mean() of an empty slice, which produces NaN and
            # silently corrupts all later iterations.
            new_centroids = np.array([
                X[self.labels == k].mean(axis=0)
                if np.any(self.labels == k)
                else self.centroids[k]
                for k in range(self.n_clusters)
            ])
            # Converged when centroids no longer move.
            if np.allclose(self.centroids, new_centroids):
                break
            self.centroids = new_centroids

    def _calc_distances(self, X):
        """Return the (n_samples, n_clusters) Euclidean distance matrix."""
        # Broadcasting replaces the original O(n*k) Python double loop.
        diff = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        return np.linalg.norm(diff, axis=2)

    def predict(self, X):
        """Return the nearest-centroid index for each row of X."""
        return np.argmin(self._calc_distances(X), axis=1)
def load_data(file_path):
    """Load a comma-separated dataset of "feat1,feat2,label" lines.

    Only the first two numeric columns are used as features; the third
    column is kept as the class label.

    Args:
        file_path: path of the text file to read.

    Returns:
        (X, labels): X is an (n, 2) float array, labels an (n,) string array.
    """
    data = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at EOF); the
            # original code crashed on them with float('').
            if not line:
                continue
            values = line.split(',')
            # First two features + class label.
            data.append([float(values[0]), float(values[1])])
            labels.append(values[2])
    return np.array(data), np.array(labels)
def evaluate_clustering(true_labels, pred_labels):
    """Score a clustering against ground truth via majority voting.

    Each cluster is mapped to the true label that occurs most often
    inside it; accuracy is the fraction of samples whose mapped label
    matches their true label.

    Args:
        true_labels: (n,) array of ground-truth class names.
        pred_labels: (n,) array of cluster ids produced by K-Means.

    Returns:
        (accuracy, label_mapping) where label_mapping maps
        cluster id -> majority true label of that cluster.
    """
    # Build the cluster-id -> majority-label mapping.
    label_mapping = {}
    for cluster_id in np.unique(pred_labels):
        cluster_labels = true_labels[pred_labels == cluster_id]
        # Majority vote via np.unique counts instead of the original
        # list(...).count scan per distinct label.
        values, counts = np.unique(cluster_labels, return_counts=True)
        label_mapping[cluster_id] = values[np.argmax(counts)]
    # Vectorized accuracy instead of an index loop.
    mapped = np.array([label_mapping[p] for p in pred_labels])
    accuracy = float(np.mean(mapped == true_labels))
    return accuracy, label_mapping


if __name__ == "__main__":
    # Load the dataset (first two iris features + species label).
    file_path = "D:/課程/數據挖掘/實驗六/實驗6-iris-聚類.txt"
    X, true_labels = load_data(file_path)
    # Train K-Means with K=3 (iris has three species).
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)
    # Cluster assignments found during fit.
    pred_labels = kmeans.labels
    # Evaluate the clustering against the ground truth.
    accuracy, mapping = evaluate_clustering(true_labels, pred_labels)
    print(f"聚類準確率: {accuracy:.2%}")
    print("聚類標籤映射關係:")
    for cluster_id, species in mapping.items():
        print(f"聚類{cluster_id} -> {species}")

    plt.figure(figsize=(12, 5))
    # Left: true label distribution.
    plt.subplot(121)
    colors = {'Iris-setosa': 'red', 'Iris-versicolor': 'green', 'Iris-virginica': 'blue'}
    for species in np.unique(true_labels):
        plt.scatter(X[true_labels == species, 0],
                    X[true_labels == species, 1],
                    label=species, c=colors[species], alpha=0.6)
    plt.title('真實標籤分布')
    plt.xlabel('花萼長度')
    plt.ylabel('花萼寬度')
    plt.legend()
    # Right: K-Means clustering result.
    plt.subplot(122)
    for cluster_id in range(3):
        plt.scatter(X[pred_labels == cluster_id, 0],
                    X[pred_labels == cluster_id, 1],
                    label=f'聚類{cluster_id}', alpha=0.6)
    # Mark the final centroid positions.
    plt.scatter(kmeans.centroids[:, 0], kmeans.centroids[:, 1],
                marker='X', s=200, c='black', label='質心')
    plt.title('K-Means聚類結果')
    plt.xlabel('花萼長度')
    plt.ylabel('花萼寬度')
    plt.legend()
    plt.tight_layout()
    plt.savefig('kmeans_clustering_result.png')
    plt.show()
運行結果:
左圖為真實的標簽分布,右圖為K-Means的聚類結果