第一步,X 與 Y 的基礎處理
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Column holding the target label; it is copied to the 'Y' column below.
target_column = ['SeriousDlqin2yrs']

# Candidate explanatory variables to analyse.
feature_columns = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# Both aliases refer to the same list object as feature_columns.
x_list = feature_columns
features = x_list

df = pd.read_csv("cs-training.csv")

# Cast every feature and the target to float so later binning and
# aggregation operate on a uniform numeric dtype.
for column in x_list + target_column:
    df[column] = df[column].astype(float)

# Not used by the code visible here; kept in case code outside this
# section relies on it.
iv_values = []

# Short alias for the target column, passed later as Y='Y'.
df['Y'] = df['SeriousDlqin2yrs']
第二步,我們封裝一個計算KS的函數
def calculate_ks(y_true, y_score):
    """Return the Kolmogorov-Smirnov statistic of a score against a binary label.

    The statistic is the largest absolute gap between the cumulative share of
    bad samples (label 1) and the cumulative share of good samples (label 0)
    when samples are accumulated from the highest score to the lowest.

    The gap is evaluated only at distinct score values. Rows sharing a score
    cannot be separated by any threshold, so comparing them row by row would
    make the result depend on the arbitrary order of rows inside a tie.

    Args:
        y_true: Array-like of 0/1 labels, where 1 marks a bad sample.
        y_score: Array-like of scores with the same length as ``y_true``.

    Returns:
        The KS value as a float in [0, 1], or NaN when the input is empty or
        contains only one class (the statistic is undefined in those cases).
    """
    frame = pd.DataFrame({'y': y_true, 'score': y_score})

    n_total = len(frame)
    n_bad = frame['y'].sum()
    n_good = n_total - n_bad
    if n_total == 0 or n_bad == 0 or n_good == 0:
        return float('nan')

    # One row per distinct score, highest score first.
    per_score = (
        frame.groupby('score')['y']
        .agg(bad='sum', total='count')
        .sort_index(ascending=False)
    )

    cum_bad = per_score['bad'].cumsum() / n_bad
    cum_good = (per_score['total'] - per_score['bad']).cumsum() / n_good

    return float((cum_bad - cum_good).abs().max())
第三步,我們定義信貸特徵分析可視化函數 plot_feature_analysis(外置指標面板完整版)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve


def _bin_feature(ser, n_bins):
    """Bin *ser* into quantile bins, falling back to equal-width bins."""
    try:
        return pd.qcut(ser, q=n_bins, duplicates='drop')
    except Exception:
        # Deliberate best-effort: any quantile-binning failure falls back to
        # equal-width binning instead of aborting the feature.
        return pd.cut(ser, bins=n_bins, include_lowest=True)


def _format_bin_labels(bin_categories):
    """Build one 'left-right' label (rounded to 2 decimals) per bin interval."""
    labels = []
    for interval in bin_categories:
        if pd.isnull(interval):
            labels.append('Missing')
        else:
            left = round(interval.left, 2)
            right = round(interval.right, 2)
            labels.append(f"{left}-{right}")
    return labels


def plot_feature_analysis(df, features, Y, n_bins=10, figsize=(20, 100)):
    """Plot per-bin sample counts and bad rates for each feature.

    One subplot row is drawn per feature: bars show the sample count per bin
    (left axis), a dashed line shows the per-bin bad rate (right axis), and a
    text panel to the right of the axes lists coverage, AUC, KS and the
    overall bad rate. A feature whose analysis raises is reported with
    ``print`` and its axes are hidden; the remaining features are still drawn.

    Args:
        df: DataFrame containing the feature columns and the target column.
        features: List of feature column names to analyse.
        Y: Name of the target column.
        n_bins: Number of bins requested (default 10); fewer may result when
            duplicate quantile edges are dropped.
        figsize: Figure size as (width, height).

    Returns:
        None. The figure is left open on the current matplotlib state.
    """
    n_features = len(features)
    fig, axes = plt.subplots(
        n_features, 1,
        figsize=figsize,
        constrained_layout=False,
        squeeze=False,
    )

    # Leave the right 30% of the canvas free for the metric panels.
    fig.subplots_adjust(right=0.7)

    # Global style. NOTE: this runs after the figure has been created, so
    # settings read at figure creation (such as 'figure.dpi') are expected to
    # affect only figures created later.
    plt.rcParams.update({
        'font.sans-serif': 'SimHei',
        'axes.unicode_minus': False,
        'axes.titlesize': 20,
        'axes.titlepad': 20,
        'figure.dpi': 400,
    })

    for feature, ax in zip(features, axes.flat):
        ax2 = ax.twinx()  # secondary y-axis for the bad rate

        try:
            # === Data cleaning ===
            df_clean = df[[feature, Y]].dropna()
            coverage = len(df_clean) / len(df)
            y = df_clean[Y]
            overall_bad_rate = y.mean()

            # === Binning ===
            bins = _bin_feature(df_clean[feature], n_bins)
            bin_categories = bins.cat.categories
            bin_labels = _format_bin_labels(bin_categories)

            # === Per-bin statistics, one row per bin in category order ===
            grouped = (
                df_clean.assign(bin=bins)
                .groupby('bin', observed=False)
                .agg(count=(Y, 'count'), bad=(Y, 'sum'))
                .reindex(bin_categories)  # make sure every bin is present
                .fillna({'count': 0, 'bad': 0})
                .assign(
                    bad_rate=lambda x: x['bad'] / x['count'].replace(0, np.nan),
                    lift=lambda x: x['bad_rate'] / overall_bad_rate,
                )
                .reset_index()
                # Empty bins have an undefined rate; show them as 0.
                .fillna({'bad_rate': 0, 'lift': 0})
            )

            # Plot at numeric positions rather than against the label
            # strings: rounded labels can collide, and identical string
            # categories would be drawn on top of each other.
            positions = list(range(len(bin_labels)))

            # === Bars: sample count ===
            bars = ax.bar(
                positions, grouped['count'],
                width=0.8,
                alpha=0.7,
                color='#1f77b4',
                label='樣本量',
            )

            # === Line: bad rate ===
            ax2.plot(
                positions, grouped['bad_rate'],
                color='#d62728',
                marker='o',
                markersize=20,
                linewidth=3,
                linestyle='--',
                label='逾期率',
            )

            # === Value annotations ===
            for rect, br, lift, count, bad in zip(
                bars,
                grouped['bad_rate'],
                grouped['lift'],
                grouped['count'],
                grouped['bad'],
            ):
                x_center = rect.get_x() + rect.get_width() / 2

                # Sample and bad counts inside the bar.
                ax.text(
                    x_center, rect.get_height() * 0.6,
                    f"All: {count:,}\nBad: {bad:,}",
                    ha='center', va='center',
                    color='white', fontsize=13,
                    fontweight='bold',
                    linespacing=1.2,
                )

                # Bad rate just above the line marker.
                ax2.text(
                    x_center, br + 0.005,
                    f'{br:.2%}',
                    ha='center', va='bottom',
                    color='#d62728',
                    fontsize=18, fontweight='bold',
                )

                # Lift near the top of the bar.
                ax.text(
                    x_center, rect.get_height() * 0.95,
                    f'Lift: {lift:.2f}',
                    ha='center', va='top',
                    color='#FFFF00',
                    fontsize=15,
                    fontweight='bold',
                )

            # === Metrics: score each row with the bad rate of its own bin ===
            # cat.codes follows category order, which is the row order of
            # `grouped`. pd.factorize would number bins by first appearance
            # and map rows to the wrong bin's rate.
            bin_codes = bins.cat.codes.to_numpy()
            y_score = grouped['bad_rate'].to_numpy()[bin_codes]
            auc = roc_auc_score(y, y_score)
            ks = calculate_ks(y, y_score)

            # === Y axes ===
            ax.set_ylabel('樣本量', color='#1f77b4', fontsize=20)
            ax2.set_ylabel('逾期率', color='#d62728', fontsize=20)
            ax.tick_params(axis='y', colors='#1f77b4', labelsize=14)
            ax2.tick_params(axis='y', colors='#d62728', labelsize=14)

            # === X axis ===
            ax.set_xticks(positions)
            ax.set_xticklabels(
                bin_labels,
                rotation=45,
                ha='right',
                fontsize=16,
            )

            # === External metric panel (figure coordinates) ===
            ax_bbox = ax.get_position()
            panel_x = ax_bbox.x1 + 0.03  # 3% to the right of the axes
            panel_y = ax_bbox.y0 + ax_bbox.height * 0.6  # slightly above centre

            fig.text(
                x=panel_x,
                y=panel_y,
                s=f"特征分析指標\n━━━━━━━━━━━━\n"
                  f"特征名稱: {feature}\n"
                  f"分箱數量: {len(bin_labels)}\n"
                  f"特征覆蓋率: {coverage:.2%}\n"
                  f"AUC: {auc:.3f}\n"
                  f"KS值: {ks:.3f}\n"
                  f"全局逾期率: {overall_bad_rate:.2%}",
                fontsize=15,
                linespacing=1.8,
                va='top',
                ha='left',
                fontfamily='SimHei',
                bbox=dict(
                    boxstyle='round',
                    facecolor='#f8f9fa',
                    edgecolor='#ced4da',
                    alpha=0.95,
                    pad=0.8,
                ),
            )

            # === Grid lines ===
            ax.grid(True, axis='y', linestyle=':', alpha=0.7)
            ax2.grid(True, axis='y', linestyle=':', alpha=0.3)

        except Exception as e:
            # Best-effort per feature: report the failure, hide this row and
            # carry on with the remaining features.
            print(f"特征 {feature} 分析失敗: {e}")
            ax.set_visible(False)
            ax2.set_visible(False)
第四步,使用示例
if __name__ == "__main__":
    # Run the analysis. The author reports that the code above ran
    # successfully on a personal computer using open-source data.
    plot_feature_analysis(
        df,
        features,
        Y='Y',
        n_bins=8,
        figsize=(22, 80),
    )
    plt.show()