基尼系數實現決策樹
基尼指數
$$\operatorname{Gini}(D)=1-\sum_{k=1}^{K}\left(\frac{\left|C_{k}\right|}{|D|}\right)^{2}$$
特征 $A$ 條件下集合 $D$ 的基尼指數:
$$\operatorname{Gini}(D, A)=\frac{\left|D_{1}\right|}{|D|} \operatorname{Gini}\left(D_{1}\right)+\frac{\left|D_{2}\right|}{|D|} \operatorname{Gini}\left(D_{2}\right)$$
import numpy as np


def calculate_gini(labels):
    """Return the Gini impurity of a 1-D label array.

    Gini(D) = 1 - sum_k (|C_k| / |D|)^2

    An empty label set is treated as pure (impurity 0); the original
    code divided by zero and returned 1 in that case.
    """
    if len(labels) == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / len(labels)
    return 1.0 - np.sum(probabilities ** 2)


def calculate_gini_index(data, labels, feature_index, threshold):
    """Return the weighted Gini index of splitting on
    ``data[:, feature_index] <= threshold``.

    Gini(D, A) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2)
    """
    # Partition the samples into the two children of the split.
    left_mask = data[:, feature_index] <= threshold
    right_mask = ~left_mask
    left_labels = labels[left_mask]
    right_labels = labels[right_mask]

    # Weighted average of the child impurities.
    left_weight = len(left_labels) / len(labels)
    right_weight = len(right_labels) / len(labels)
    return (left_weight * calculate_gini(left_labels)
            + right_weight * calculate_gini(right_labels))


def find_best_split(data, labels):
    """Return ``(feature_index, threshold)`` of the split with the
    lowest Gini index, or ``(-1, None)`` when no threshold actually
    separates the samples (every candidate leaves one side empty).
    """
    num_features = data.shape[1]
    best_gini_index = float('inf')
    best_feature_index = -1
    best_threshold = None
    for feature_index in range(num_features):
        feature_values = data[:, feature_index]
        for threshold in np.unique(feature_values):
            # Skip degenerate splits: with "<=", the largest unique
            # value sends every sample left, which would make the
            # recursion in create_decision_tree loop forever.
            left_count = np.count_nonzero(feature_values <= threshold)
            if left_count == 0 or left_count == len(labels):
                continue
            gini_index = calculate_gini_index(
                data, labels, feature_index, threshold)
            if gini_index < best_gini_index:
                best_gini_index = gini_index
                best_feature_index = feature_index
                best_threshold = threshold
    return best_feature_index, best_threshold


def _majority_label(labels):
    """Return the most frequent label (ties broken by smallest value)."""
    values, counts = np.unique(labels, return_counts=True)
    return values[np.argmax(counts)]


def create_decision_tree(data, labels):
    """Recursively build a CART-style decision tree.

    Leaf nodes are ``{'label': value}``; internal nodes are
    ``{'feature_index', 'threshold', 'left', 'right'}``.

    Raises ValueError on an empty training set.
    """
    if len(labels) == 0:
        raise ValueError("cannot build a tree from an empty dataset")

    # Base case: a pure node becomes a leaf with that label.
    if len(np.unique(labels)) == 1:
        return {'label': labels[0]}

    best_feature_index, best_threshold = find_best_split(data, labels)

    # No feature separates the samples (e.g. duplicate rows with
    # conflicting labels): fall back to a majority-vote leaf instead
    # of recursing forever.
    if best_feature_index == -1:
        return {'label': _majority_label(labels)}

    node = {
        'feature_index': best_feature_index,
        'threshold': best_threshold,
        'left': None,
        'right': None,
    }

    # Partition the data and recurse into both children.
    left_mask = data[:, best_feature_index] <= best_threshold
    right_mask = ~left_mask
    node['left'] = create_decision_tree(data[left_mask], labels[left_mask])
    node['right'] = create_decision_tree(data[right_mask], labels[right_mask])
    return node


def predict(node, sample):
    """Route ``sample`` down the tree and return the leaf's label."""
    if 'label' in node:
        return node['label']
    feature_value = sample[node['feature_index']]
    if feature_value <= node['threshold']:
        return predict(node['left'], sample)
    return predict(node['right'], sample)
# Example training set: ten samples with three integer features each.
data = np.array([
    [1, 2, 0],
    [1, 2, 1],
    [1, 3, 1],
    [2, 3, 1],
    [2, 3, 0],
    [2, 2, 0],
    [1, 1, 0],
    [1, 1, 1],
    [2, 1, 1],
    [1, 3, 0],
])
labels = np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1])

# Fit the decision tree on the example data.
decision_tree = create_decision_tree(data, labels)

# Samples to classify with the fitted tree.
test_data = np.array([
    [1, 2, 0],
    [2, 1, 1],
    [1, 3, 1],
    [2, 3, 0],
])

# Report the predicted label for every test sample.
for sample in test_data:
    prediction = predict(decision_tree, sample)
    print(f"樣本: {sample}, 預測標簽: {prediction}")