知識點回顧
- 規範的文件命名
- 規範的文件夾管理
- 機器學習項目的拆分
- 編碼格式和類型注解
作業:嘗試針對之前的心臟病項目ipynb,將它按照今天的示例項目整理成規範的形式,並思考哪些部分可以在未來復用。
心臟病項目目錄
目錄結構:
heart/
├── config/            # 集中存放項目的配置文件
├── data/              # 存放項目相關數據
│   ├── processed/     # 數據預處理後的數據
│   └── raw/           # 原始數據
├── experiments/       # 用於探索和測試
├── models/            # 存放訓練好的模型文件
├── reports/           # 存儲項目運行產生的各類報告和輸出文件
└── src/               # 存放項目的核心源代碼
    ├── data/          # 數據相關代碼
    ├── models/        # 模型相關代碼
    └── utils/         # 通用輔助函數代碼
心臟病項目拆分
導入依賴庫
# Silence library warnings. The filter is installed before the remaining
# imports so that warnings raised while importing are suppressed too.
import warnings
warnings.simplefilter('ignore')

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Random forest
from sklearn.ensemble import RandomForestClassifier
# Decision tree
from sklearn.tree import DecisionTreeClassifier
# Tree visualization
from sklearn.tree import export_graphviz
# Model evaluation
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
# Confusion matrix
from sklearn.metrics import confusion_matrix
# Train/test splitting
from sklearn.model_selection import train_test_split
# Explainability analysis
import shap

# Seed NumPy's global random generator.
np.random.seed(123)
# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None

# Equivalent of the `%matplotlib inline` notebook magic, written as plain
# Python so this cell also runs as a script, where get_ipython is undefined.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass
數據可視化
# Load the dataset; "heart.csv" is resolved relative to the working directory.
dt = pd.read_csv("heart.csv")

# Global plotting style.
sns.set(
    palette='pastel',
    rc={
        "figure.figsize": (10, 5),   # figure size
        "axes.titlesize": 14,        # title font size
        "axes.labelsize": 12,        # axis-label font size
        "xtick.labelsize": 10,       # x-axis tick label size
        "ytick.labelsize": 10,       # y-axis tick label size
    },
)

# Each figure below ends with its own plt.show() so that consecutive plots
# are not drawn on top of each other when this runs outside a notebook.

# --- Distribution of the target -------------------------------------------
a = sns.countplot(x='target', data=dt)
a.set_title('Distribution of Presence of Heart Disease')
# Assumes target 0 = no heart disease and 1 = heart disease -- TODO confirm
# against the dataset documentation.
a.set_xticklabels(['Absent', 'Present'])
plt.xlabel("Presence of Heart Disease")
plt.show()

# --- Distribution of patient age ------------------------------------------
g = sns.countplot(x='age', data=dt)
g.set_title('Distribution of Age')
plt.xlabel('Age')
plt.show()

# --- Distribution of patient sex ------------------------------------------
print(dt.sex.value_counts())
# Count plot of target, split by sex via the hue channel.
b = sns.countplot(x='target', data=dt, hue='sex')
# Assumes hue levels are ordered 0 = female, 1 = male -- TODO confirm.
plt.legend(['Female', 'Male'])
b.set_title('Distribution of Presence of Heart Disease by Sex')
b.set_xticklabels(['Absent', 'Present'])
plt.show()

# --- Serum cholesterol ----------------------------------------------------
# histplot(stat="density", kde=True) replaces the deprecated sns.distplot.
sns.histplot(dt['chol'].dropna(), kde=True, stat="density",
             color='darkblue', bins=40)
plt.show()

# --- Resting blood pressure -----------------------------------------------
sns.histplot(dt['trestbps'].dropna(), kde=True, stat="density",
             color='darkgreen', bins=10)
plt.show()

# --- Fasting blood sugar --------------------------------------------------
g = sns.countplot(x='fbs', data=dt)
g.set_title('Distribution of Fasting blood sugar')
plt.xlabel('Fasting blood sugar')
plt.show()

# --- Feature correlation heatmap ------------------------------------------
f, ax = plt.subplots(figsize=(12, 12))
# Pearson correlation between all columns, annotated to one decimal place.
sns.heatmap(dt.corr('pearson'), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()
數據預處理
# The raw column names are hard to read, so give every column a descriptive
# name. The list is positional: it must match the column order of the CSV.
dt.columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
    'fasting_blood_sugar', 'rest_ecg', 'max_heart_rate_achieved',
    'exercise_induced_angina', 'st_depression', 'st_slope',
    'num_major_vessels', 'thalassemia', 'target',
]

# Integer code -> readable label for every categorical column.
# NOTE(review): codes not listed here (e.g. 0 for chest_pain_type, st_slope
# and thalassemia) are left as raw integers -- verify the code ranges against
# the dataset version in use.
_CATEGORY_LABELS = {
    'sex': {0: 'female', 1: 'male'},
    'chest_pain_type': {
        1: 'typical angina',
        2: 'atypical angina',
        3: 'non-anginal pain',
        4: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    'st_slope': {1: 'upsloping', 2: 'flat', 3: 'downsloping'},
    'thalassemia': {1: 'normal', 2: 'fixed defect', 3: 'reversable defect'},
}

# Column dtypes before conversion.
print(dt.dtypes)

# Cast the categorical columns to object so they can hold string labels.
for _column in _CATEGORY_LABELS:
    dt[_column] = dt[_column].astype('object')

# Column dtypes after conversion.
print(dt.dtypes)

# Replace the integer codes with their labels. A single .loc assignment is
# used instead of chained indexing (dt[col][mask] = ...), which may write to
# a temporary copy and has no effect under pandas Copy-on-Write.
for _column, _labels in _CATEGORY_LABELS.items():
    for _code, _label in _labels.items():
        dt.loc[dt[_column] == _code, _column] = _label

# One-hot encode the categorical columns, dropping the first level of each.
dt = pd.get_dummies(dt, drop_first=True)

# Split into train and test sets: 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dt.drop(columns='target'),
    dt['target'],
    test_size=0.2,
    random_state=10,
)
模型創建和預測
# Build the model: a random forest of 10 trees, each at most 5 levels deep.
model = RandomForestClassifier(max_depth=5, n_estimators=10)
# Fit the forest on the training set.
model.fit(X_train, y_train)

# Hard class predictions for the test set.
y_predict = model.predict(X_test)
# predict_proba returns one row per sample and one column per class;
# keep only the column at index 1.
y_pred_quant = model.predict_proba(X_test)[:, 1]
# Same hard predictions under the name the evaluation code uses; reuse the
# result instead of running predict a second time.
y_pred_bin = y_predict
模型評估
# Build the confusion matrix. The result gets its own name so the imported
# confusion_matrix function is not overwritten and this cell can be re-run.
conf_mat = confusion_matrix(y_test, y_pred_bin)
print(conf_mat)

# scikit-learn lays the matrix out with true labels on the rows and
# predicted labels on the columns, so for a binary problem the flattened
# order is TN, FP, FN, TP.
tn, fp, fn, tp = conf_mat.ravel()

# Sensitivity (recall, true-positive rate) = TP / (TP + FN).
sensitivity = tp / (tp + fn)
print('靈敏度 : ', sensitivity)

# Specificity (true-negative rate) = TN / (TN + FP).
specificity = tn / (tn + fp)
print('特異度 : ', specificity)

# ROC curve: false-positive rate on x, true-positive rate on y.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant)

fig, ax = plt.subplots()
# The ROC curve itself.
ax.plot(fpr, tpr)
# Diagonal y = x reference line, drawn in axes coordinates.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve for heart disease classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()
可解釋性分析
# Explain the fitted tree model on the test set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Select the SHAP values for class index 1. Depending on the shap version
# the explainer returns either a list with one (samples, features) array per
# class, or a single (samples, features, classes) array; indexing the latter
# with [1] would pick sample 1 instead of class 1.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    class1_shap_values = shap_values[:, :, 1]
else:
    # Already a single (samples, features) array.
    class1_shap_values = shap_values

# Feature-importance bar chart.
shap.summary_plot(class1_shap_values, X_test, plot_type="bar")
# Beeswarm plot.
shap.summary_plot(class1_shap_values, X_test)
@浙大疏錦行