要求:
1、根據相關的信息預測通過電話推銷,用戶是否會在銀行進行存款
2、數據bank.csv,約4520條數據,17個屬性值。提示:17個屬性分別是年齡、工作類型、婚姻狀況、受教育背景、信用情況、賬戶余額、房貸、個人貸款、聯系電話是手機還是固定電話、最后聯系月份、最后聯系日、通話持續時間、本次活動中聯系的次數、最后一次接觸距離上一次接觸的時間、以前的活動中聯系的次數、上一次活動成功與否、是否會進行存款(目標變量)
代碼實現:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
# Plot configuration: use the SimHei font for sans-serif text (the plot labels
# below are Chinese) and disable the Unicode minus glyph on axes.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the local data set; fields are separated by ';' and quoted with '"'.
file_path = r'D:\課程\數據挖掘\實驗八\實驗8-bank.csv'
df = pd.read_csv(file_path, sep=';', quotechar='"')
# --- Data preprocessing ---
# Treat the literal string 'unknown' as a missing value in every column.
df = df.replace('unknown', np.nan)

# Encode the yes/no columns (including the target y) as 1/0.
yes_no_codes = {'yes': 1, 'no': 0, 'unknown': np.nan}
for col in ['default', 'housing', 'loan', 'y']:
    df[col] = df[col].map(yes_no_codes)

# Handle missing values: fill gaps in these categorical columns with the
# column's most frequent value, falling back to 'unknown' when the column has
# no mode at all.
# NOTE(review): this also overwrites every former 'unknown' in poutcome with
# the most frequent remaining outcome -- confirm that is intended.
column_modes = {
    name: df[name].mode()
    for name in ('job', 'education', 'contact', 'poutcome')
}
mode_values = {
    name: ('unknown' if modes.empty else modes[0])
    for name, modes in column_modes.items()
}
df = df.fillna(mode_values)

# Flag features.
# -1 in pdays is rewritten to 999, the special marker for "never contacted".
df['pdays'] = df['pdays'].replace(-1, 999)
# 1 when the customer has been contacted before, 0 otherwise.
df['contacted_before'] = np.where(df['pdays'] == 999, 0, 1)

# Report and drop rows whose target value is missing.
print(f"目標變量y中的缺失值數量: {df['y'].isna().sum()}")
df = df.dropna(subset=['y'])
# --- Feature encoding ---
categorical_features = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Standardise the numeric columns and one-hot encode the categorical ones.
# Categories not seen during fit are ignored at transform time.
# NOTE(review): 'default', 'housing', 'loan' and 'contacted_before' are in
# neither list, so with ColumnTransformer's default remainder='drop' they never
# reach the model -- confirm whether they were meant to be used.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ]
)

# Separate the predictors from the target column.
X = df.drop('y', axis=1)
y = df['y']

# --- Train/test split ---
# 80/20 split, stratified on y so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# --- Model training ---
# class_weight='balanced' reweights the classes by their frequency in y_train.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
)

# Fit the preprocessing on the training split only, then train the forest on
# the transformed training data.
X_train_processed = preprocessor.fit_transform(X_train)
rf.fit(X_train_processed, y_train)
# --- Model evaluation ---
# Reuse the already-fitted preprocessor; the test split is only transformed.
X_test_processed = preprocessor.transform(X_test)
y_pred = rf.predict(X_test_processed)
# Second column of predict_proba, i.e. the probability of the second class
# (y == 1 here, given the 0/1 target encoding).
y_proba = rf.predict_proba(X_test_processed)[:, 1]

print(f"準確率: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1分數: {f1_score(y_test, y_pred):.4f}")
print(f"AUC: {roc_auc_score(y_test, y_proba):.4f}")
# --- Confusion-matrix visualisation ---
# Rows are the true labels and columns the predicted labels; the same two
# class names are used for the tick labels on both axes.
class_names = ['不會存款', '會存款']
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('預測標簽')
plt.ylabel('真實標簽')
plt.title('混淆矩陣')
plt.show()
# --- Feature names ---
# Build the names in the transformer's output order: the numeric columns
# first, then the one-hot columns produced by the fitted 'cat' encoder.
cat_encoder = preprocessor.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(categorical_features)
all_features = np.concatenate([numeric_features, cat_features])

# --- Feature-importance plot ---
importances = rf.feature_importances_
# Show at most 15 bars; clamp so the plot still works with fewer features.
top_n = min(15, len(importances))
indices = np.argsort(importances)[::-1][:top_n]

plt.figure(figsize=(12, 8))
plt.title("Top 15 特征重要性")
# Reverse the descending order so the most important feature is drawn at the
# top of the horizontal bar chart.
plt.barh(range(top_n), importances[indices][::-1], align='center')
plt.yticks(range(top_n), all_features[indices][::-1])
plt.xlabel('重要性')
plt.tight_layout()
plt.show()
# Targeted-marketing recommendation
def marketing_recommendation(model, preprocessor, customer_data, threshold: float = 0.4) -> bool:
    """Decide whether a customer should be targeted by the deposit campaign.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        preprocessor: Fitted transformer exposing ``transform``; it is applied
            to ``customer_data`` before prediction.
        customer_data: Customer record(s) in the form the preprocessor
            expects. Only the first row is evaluated.
        threshold: Probability cut-off; the customer is recommended when the
            predicted probability is strictly greater than this value.
            Defaults to 0.4.

    Returns:
        True if the second-column probability from ``predict_proba`` for the
        first row exceeds ``threshold``, otherwise False.
    """
    processed_data = preprocessor.transform(customer_data)
    # First row, second column of the probability matrix.
    proba = model.predict_proba(processed_data)[0][1]
    # bool() so callers always get a plain Python bool, not a NumPy scalar.
    return bool(proba > threshold)
# Example customer record. The yes/no fields (default, housing, loan) are
# given as 1/0, matching the encoding applied to the training data.
sample_customer = pd.DataFrame({
    'age': [42],
    'job': ['management'],
    'marital': ['married'],
    'education': ['tertiary'],
    'default': [0],
    'balance': [1500],
    'housing': [1],
    'loan': [0],
    'contact': ['cellular'],
    'day': [15],
    'month': ['may'],
    'duration': [300],
    'campaign': [2],
    'pdays': [100],
    'previous': [3],
    'poutcome': ['success'],
})

# Print the campaign decision for the example customer.
if marketing_recommendation(rf, preprocessor, sample_customer):
    print("推薦對該客戶進行存款營銷")
else:
    print("不建議對該客戶進行存款營銷")
運行結果:
混淆矩陣:
Top15特征重要性
控制臺輸出:推薦對該客戶進行存款營銷