python打卡day23@浙大疏錦行

知識回顧:

1. 轉化器和估計器的概念

2. 管道工程

3. ColumnTransformer和Pipeline類

作業：

整理下全部邏輯的先后順序，看看能不能制作出適合所有機器學習的通用pipeline

一、導入數據庫

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time # 導入 time 庫
import warnings
# Silence every warning category for the rest of the script.
warnings.filterwarnings("ignore")

# Plot configuration: use the SimHei font (so CJK text in figures can render)
# and disable the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

二、導入pipeline

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# Load the raw dataset; 'heart.csv' is read relative to the working directory.
data = pd.read_csv('heart.csv')
print("原始數據加載完成，形狀為:", data.shape)

# Separate the label column from the feature columns.
y = data['target']
X = data.drop(['target'], axis=1)
print("\n特征和標簽分離完成。")
print("特征 X 的形狀:", X.shape)
print("標簽 y 的形狀:", y.shape)

# Split BEFORE any preprocessing so that imputers/encoders/scalers are fitted
# on the training fold only and nothing leaks from the test fold.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\n數據集劃分完成 (預處理之前)。")
print("X_train 形狀:", X_train.shape)
print("X_test 形狀:", X_test.shape)
print("y_train 形狀:", y_train.shape)
print("y_test 形狀:", y_test.shape)

# Column names grouped by dtype (object vs. everything else).
object_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# Features encoded with OrdinalEncoder, and the explicit category order for
# each one. The two lists are positional: ordinal_categories[i] belongs to
# ordinal_features[i].
ordinal_features = ['cp', 'restecg', 'slope', 'ca', 'thal']
ordinal_categories = [
    [0, 1, 2, 3],     # cp
    [0, 1, 2],        # restecg
    [0, 1, 2],        # slope
    [0, 1, 2, 3, 4],  # ca
    [0, 1, 2, 3],     # thal
]

三、構建有序特征pipeline

# Ordinal branch: fill missing values with the most frequent value, then map
# each feature onto its declared category order. Values outside the declared
# categories are encoded as -1 rather than raising.
ordinal_transformer = Pipeline(
    steps=[
        (
            'imputer',
            SimpleImputer(strategy='most_frequent'),
        ),
        (
            'encoder',
            OrdinalEncoder(
                categories=ordinal_categories,
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
    ]
)
print("有序特征處理 Pipeline 定義完成。")

# Columns handled by the nominal (one-hot) branch.
nominal_features = ['sex', 'fbs', 'exang']

四、構建標稱特征pipeline

# Nominal branch: fill missing values with the most frequent value, then
# one-hot encode into a dense array. Categories not seen during fit are
# ignored (encoded as all zeros) instead of raising.
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
print("標稱特征處理 Pipeline 定義完成。")

# Every feature column that is neither ordinal nor nominal is treated as
# continuous.
continuous_features = [f for f in X.columns if f not in ordinal_features + nominal_features]

# Continuous branch: fill missing values with the most frequent value, then
# standardize. The closing "])" was missing in the original, which made the
# script fail to parse.
continuous_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()),
])
print("連續特征處理 Pipeline 定義完成。")

五、串聯預處理器和模型

# Route each group of columns to its own preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_features),
    ],
    # What to do with columns not named in any list above: pass them through
    # unchanged.
    remainder='passthrough',
)
print("\nColumnTransformer (預處理器) 定義完成。")

# Full workflow: preprocessing followed by a random forest with default
# hyperparameters and a fixed seed.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)
print("\n完整的 Pipeline 定義完成。")
# Train the full pipeline on the training split, predict on the test split,
# and report wall-clock time plus classification metrics.
print("\n--- 1. 默認參數隨機森林 (訓練集 -> 測試集) ---")
start_time = time.time()
# fit() is called on the training data only; predict() then runs the same
# pipeline on the held-out test features.
pipeline.fit(X_train, y_train)
pipeline_pred = pipeline.predict(X_test)
end_time = time.time()
# The measured interval covers both fitting and prediction.
print(f"訓練與預測耗時: {end_time - start_time:.4f} 秒")
print("\n默認隨機森林 在測試集上的分類報告：")
print(classification_report(y_test, pipeline_pred))
print("默認隨機森林 在測試集上的混淆矩陣：")
print(confusion_matrix(y_test, pipeline_pred))

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/bicheng/80699.shtml
繁體地址，請注明出處：http://hk.pswp.cn/bicheng/80699.shtml
英文地址，請注明出處：http://en.pswp.cn/bicheng/80699.shtml

如若內容造成侵權/違法違規/事實不符，請聯系多彩編程網進行投訴反饋email:809451989@qq.com，一經查實，立即刪除！