NaiveBayes
The core of Naive Bayes is Bayes' theorem, which describes how to update the probability of an event in light of new evidence.
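For intuition, here is a minimal numeric sketch of that update (the numbers are made up purely for illustration): given a prior P(C) and the likelihood P(X|C) of the evidence under each class, Bayes' theorem gives the posterior P(C|X) = P(X|C) * P(C) / P(X).

# Bayes' rule: P(C|X) = P(X|C) * P(C) / P(X)
# Toy numbers (assumed for illustration): prior of 0.3 for class C,
# likelihood 0.8 for the evidence under C, 0.1 under the alternative.
prior_c = 0.3
likelihood_x_given_c = 0.8
likelihood_x_given_not_c = 0.1

# Total probability of the evidence, P(X)
evidence = likelihood_x_given_c * prior_c + likelihood_x_given_not_c * (1 - prior_c)

# Posterior: the updated belief in C after seeing X
posterior_c = likelihood_x_given_c * prior_c / evidence
print(f"P(C|X) = {posterior_c:.3f}")  # 0.774: the evidence raised P(C) from 0.3 to ~0.77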
Requirements:
1. Implement the Naive Bayes classification algorithm and verify that it is correct. Apply it to the given Data_User_Modeling dataset: select part of the dataset as known results (the training set), then use the remaining data as a test set to evaluate how the algorithm classifies.
2. Re-select the training samples and the test set, then compare and analyze the classification results (see the split-comparison sketch after the evaluation code below).
3. Select part of the dataset as training samples and perform the classification; then repeatedly take data from the test set and add it to the training set, comparing and analyzing the classification results at each step.
Code implementation:
import pandas as pd
import numpy as np
from math import pi, sqrt, exp

# ========================== Data preprocessing ==========================
# Path to the data file (adjust to your actual path)
DATA_PATH = r"D:\課程\數據挖掘\實驗四\實驗4-Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"

# Read the Excel data
excel_file = pd.ExcelFile(DATA_PATH)
df = excel_file.parse('Training_Data')  # assumes the worksheet is named 'Training_Data'

# Data cleaning: normalize the target variable (lowercase, strip whitespace)
df['UNS'] = df['UNS'].str.strip().str.lower()

# Split into training and test sets (70% train, 30% test)
train_df = df.sample(frac=0.7, random_state=42)
test_df = df.drop(train_df.index)

# Separate features and labels
X_train = train_df.drop('UNS', axis=1)
y_train = train_df['UNS']
X_test = test_df.drop('UNS', axis=1)
y_test = test_df['UNS']

# ========================== Naive Bayes implementation ==========================
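# The classifier below assumes each continuous feature follows a class-conditional
# Gaussian distribution, so the likelihood of a feature value x under class c is
#     p(x | c) = exp(-(x - mean_c)^2 / (2 * var_c)) / sqrt(2 * pi * var_c)
# where mean_c and var_c are estimated from the training rows of class c.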
class NaiveBayesClassifier:
    def __init__(self):
        self.classes = None  # class labels
        self.mean = {}       # per-class feature means
        self.var = {}        # per-class feature variances
        self.prior = {}      # class prior probabilities

    def fit(self, X, y):
        """Train the model by computing per-class statistics."""
        self.classes = np.unique(y)       # all class labels
        n_samples, n_features = X.shape   # sample and feature counts
        for cls in self.classes:
            cls_data = X[y == cls]                        # rows belonging to this class
            self.mean[cls] = cls_data.mean(axis=0)        # per-feature means
            self.var[cls] = cls_data.var(axis=0, ddof=1)  # unbiased variance (ddof=1)
            self.prior[cls] = len(cls_data) / n_samples   # class prior probability

    def _gaussian_probability(self, x, mean, var):
        """Gaussian probability density function (for continuous features)."""
        exponent = exp(-((x - mean) ** 2) / (2 * var))
        denominator = sqrt(2 * pi * var)
        return exponent / denominator

    def predict(self, X):
        """Predict the class of each sample."""
        predictions = []
        for _, sample in X.iterrows():
            posteriors = {}
            for cls in self.classes:
                # Log prior (working in the log domain avoids underflow)
                log_posterior = np.log(self.prior[cls])
                # Likelihood under the feature-independence assumption:
                # the product of per-feature densities becomes a sum of logs
                for feature in X.columns:
                    prob = self._gaussian_probability(
                        sample[feature],
                        self.mean[cls][feature],
                        self.var[cls][feature])
                    log_posterior += np.log(prob if prob > 0 else 1e-10)  # guard zero probability
                posteriors[cls] = log_posterior
            # Choose the class with the highest posterior
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)

# ========================== Model training and evaluation ==========================
# Create the classifier and train it
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

# Predict the test set
y_pred = nb_classifier.predict(X_test)

# Compute the accuracy
accuracy = np.sum(y_pred == y_test) / len(y_test)
print(f"Naive Bayes classification accuracy: {accuracy * 100:.2f}%")
# ========================== Optional: incremental learning experiment (per requirement 3) ==========================
def incremental_learning_evaluation(initial_train, test_data, steps=5):
    """Gradually add test data to the training set and observe accuracy changes."""
    train_data = initial_train.copy()
    X_incr = test_data.drop('UNS', axis=1)
    y_incr = test_data['UNS']
    n_test = len(X_incr)
    step_size = n_test // steps if n_test >= steps else n_test  # avoid a zero step size
    for i in range(steps):
        start_idx = i * step_size
        end_idx = (i + 1) * step_size
        add_X = X_incr.iloc[start_idx:end_idx]
        add_y = y_incr.iloc[start_idx:end_idx]
        # Merge the new rows into the training data
        train_data = pd.concat([train_data, pd.concat([add_X, add_y], axis=1)])
        current_X_train = train_data.drop('UNS', axis=1)
        current_y_train = train_data['UNS']
        # Retrain the model on the enlarged training set
        incr_classifier = NaiveBayesClassifier()
        incr_classifier.fit(current_X_train, current_y_train)
        # Evaluate on the remaining (not yet absorbed) test data
        remaining_X_test = X_incr.iloc[end_idx:]
        remaining_y_test = y_incr.iloc[end_idx:]
        if not remaining_X_test.empty:
            y_pred_incr = incr_classifier.predict(remaining_X_test)
            current_accuracy = np.sum(y_pred_incr == remaining_y_test) / len(remaining_y_test)
            print(f"Accuracy after adding {end_idx} rows: {current_accuracy * 100:.2f}%")
        else:
            print("All test data has been added to the training set")

# Run the incremental learning experiment (optional; uncomment to run)
# incremental_learning_evaluation(train_df, test_df, steps=5)
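When comparing the runs from requirements 2 and 3, a per-class breakdown can be more informative than overall accuracy alone. A minimal sketch using pandas' crosstab, assuming y_test and y_pred from the evaluation above:

# Confusion matrix: rows are true UNS labels, columns are predicted labels
confusion = pd.crosstab(y_test, pd.Series(y_pred, index=y_test.index),
                        rownames=['actual'], colnames=['predicted'])
print(confusion)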