Predict Calorie Expenditure
題意:
給出每個人的基本信息,預測運動后的卡路里消耗值。
數據處理:
1.構造出人體機能、運動相關的特征值。
2.所有特征值進行重新組合,注意維度爆炸
3.對連續信息分箱變成離散
建立模型:
1.xgb模型,lgb模型,cat模型
2.使用stack堆疊融合,使用3折交叉驗證
3.對xgb、lgb、cat進行K折交叉驗證,最終和stack進行結果融合。
代碼:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge


def init():
    """Quieten logs/warnings and widen pandas display options for EDA printing."""
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # only emit TensorFlow error-level logs
    warnings.simplefilter('ignore')  # suppress warning spam
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 1000)
    pd.set_option("display.max_rows", 1000)
    pd.set_option("display.max_columns", 1000)


def show_dataframe(df):
    """Print a quick EDA summary of *df*: dtypes, head, describe, duplicate count, missing values."""
    print("查看特征值和特征值類型\n" + str(df.dtypes) + "\n" + "-" * 100)
    print("查看前10行信息\n" + str(df.head()) + "\n" + "-" * 100)
    print("查看每個特征值的各種數據統計信息\n" + str(df.describe()) + "\n" + "-" * 100)
    print("輸出重復行的個數\n" + str(df.duplicated().sum()) + "\n" + "-" * 100)
    print("查看每列的缺失值個數\n" + str(df.isnull().sum()) + "\n" + "-" * 100)
    print("查看缺失值的具體信息\n" + str(df.info()) + "\n" + "-" * 100)
    # print("輸出X所有值出現的是什么,還有對應出現的次數\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)


def show_relation(data, colx, coly):
    """Plot the relation between feature *colx* and target *coly*.

    Categorical / low-cardinality (<20 unique) columns get a box plot,
    continuous columns a scatter plot.
    """
    if data[colx].dtype == 'object' or data[colx].dtype == 'category' or len(data[colx].unique()) < 20:
        seaborn.boxplot(x=colx, y=coly, data=data)
    else:
        plt.scatter(data[colx], data[coly])
        plt.xlabel(colx)
        plt.ylabel(coly)
    plt.show()


# Custom RMSLE scoring function (GridSearchCV maximizes scores, so return the negative RMSLE)
def rmsle_scorer(y_true, y_pred):
    """Return the negative RMSLE of *y_pred* vs *y_true*.

    Negative because GridSearchCV maximizes its scoring function.
    """
    y_pred = np.clip(y_pred, 1e-15, None)  # guard against log of zero/negative
    y_true = np.clip(y_true, 1e-15, None)
    log_error = np.log(y_pred + 1) - np.log(y_true + 1)
    rmsle = np.sqrt(np.mean(log_error ** 2))
    return -rmsle


if __name__ == '__main__':
    init()
    df_train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')
    # for col in df_train.columns:
    #     show_relation(df_train, col, 'Calories')

    # ---------- feature engineering (train + test together so encodings match) ----------
    df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)
    df_all['Sex'] = df_all['Sex'].map({'male': 0, 'female': 1})
    df_all = df_all.reset_index(drop=True)
    # BMI
    df_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2
    # Harris-Benedict basal metabolic rate, sex-specific coefficients
    df_all['BMR'] = 0
    df_all.loc[df_all['Sex'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])
    df_all.loc[df_all['Sex'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])
    # exercise-intensity features
    df_all['Max_HR'] = 220 - df_all['Age']  # age-predicted maximum heart rate
    df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']
    # interaction features
    df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Sex_Weight'] = df_all['Sex'] * df_all['Weight']
    # workout "power" proxy
    df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000
    # physiological interaction
    df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']
    # time-scale feature
    df_all['hour_of_day'] = df_all['Duration'] / 60 / 24
    # pairwise products of every current column (beware of dimensionality blow-up)
    numeric_cols = df_all.columns
    for i in range(len(numeric_cols)):
        feature_1 = numeric_cols[i]
        for j in range(i + 1, len(numeric_cols)):
            feature_2 = numeric_cols[j]
            df_all[f'{feature_1}_x_{feature_2}'] = df_all[feature_1] * df_all[feature_2]
    # NOTE(review): original had commented-out StandardScaler/RobustScaler experiments; omitted.
    # bin continuous features into 10 discrete buckets
    now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
    for i in now_col:
        df_all[i + "_box"] = pd.cut(df_all[i], bins=10, labels=False, right=False)

    X_train = df_all[:df_train.shape[0]]
    Y_train = np.log1p(df_train['Calories'])  # train in log space; undo with expm1 at the end
    x_test = df_all[df_train.shape[0]:]

    # xgb  (BUG FIX: was `model_xgb =estimator=XGBRegressor(...)` — stray chained assignment)
    model_xgb = XGBRegressor(
        random_state=42,
        n_estimators=8000,
        objective='reg:squarederror',
        eval_metric='rmse',
        device='cuda',
        learning_rate=0.05,
        max_depth=8,
        colsample_bytree=0.75,
        subsample=0.9,
        # reg_lambda=1,
        # reg_alpha=0.5,
        early_stopping_rounds=500,
    )
    # lgb
    model_lgb = lightgbm.LGBMRegressor(
        n_estimators=3000,       # many rounds, relying on early stopping
        learning_rate=0.03,      # smaller learning rate
        num_leaves=15,           # limit model complexity
        min_child_samples=25,    # larger minimum leaf size
        reg_alpha=0.1,           # L1 regularization
        reg_lambda=0.1,          # L2 regularization
        objective='regression_l1',  # MAE loss
        early_stopping_rounds=500,
    )
    # cat
    model_cat = CatBoostRegressor(
        iterations=3500,
        learning_rate=0.02,
        depth=12,
        loss_function='RMSE',
        l2_leaf_reg=3,
        random_seed=42,
        eval_metric='RMSE',
        early_stopping_rounds=200,
        verbose=1000,
        task_type='GPU',
    )

    # ---------- stacking ensemble ----------
    # Base models must have early stopping disabled (no eval_set available inside the stack).
    base_models = [
        ('xgb', XGBRegressor(early_stopping_rounds=None,
                             **{k: v for k, v in model_xgb.get_params().items() if k != 'early_stopping_rounds'})),
        ('lgb', LGBMRegressor(early_stopping_rounds=None,
                              **{k: v for k, v in model_lgb.get_params().items() if k != 'early_stopping_rounds'})),
        ('cat', CatBoostRegressor(early_stopping_rounds=None,
                                  **{k: v for k, v in model_cat.get_params().items() if k != 'early_stopping_rounds'})),
    ]
    meta_model = RidgeCV()
    model_stack = StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=3,                # 3-fold CV to build the meta features
        passthrough=False,   # meta model sees base predictions only
        verbose=1,
    )

    # ---------- K-fold CV for the three base models ----------
    FOLDS = 20
    KF = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    cat_features = ['Sex']
    oof_cat = np.zeros(len(df_train))
    pred_cat = np.zeros(len(df_test))
    oof_xgb = np.zeros(len(df_train))
    pred_xgb = np.zeros(len(df_test))
    oof_lgb = np.zeros(len(df_train))
    pred_lgb = np.zeros(len(df_test))
    for i, (train_idx, valid_idx) in enumerate(KF.split(X_train, Y_train)):
        print('#' * 15, i + 1, '#' * 15)
        ## SPLIT DS
        x_train, y_train = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        x_valid, y_valid = X_train.iloc[valid_idx], Y_train.iloc[valid_idx]
        ## CATBOOST fit
        model_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], cat_features=cat_features,
                      use_best_model=True, verbose=0)
        ## XGB fit
        model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=0)
        ## LGB fit
        model_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
        ## out-of-fold + accumulated test predictions
        oof_cat[valid_idx] = model_cat.predict(x_valid)
        pred_cat += model_cat.predict(x_test)
        oof_xgb[valid_idx] = model_xgb.predict(x_valid)
        pred_xgb += model_xgb.predict(x_test)
        oof_lgb[valid_idx] = model_lgb.predict(x_valid)
        pred_lgb += model_lgb.predict(x_test)
        cat_rmse = mean_squared_error(y_valid, oof_cat[valid_idx]) ** 0.5
        xgb_rmse = mean_squared_error(y_valid, oof_xgb[valid_idx]) ** 0.5
        lgb_rmse = mean_squared_error(y_valid, oof_lgb[valid_idx]) ** 0.5
        print(f'FOLD {i + 1} CATBOOST_RMSE = {cat_rmse:.4f} <=> XGB_RMSE = {xgb_rmse:.4f} <=> LGB_RMSE = {lgb_rmse:.4f}')

    # ---------- final blend ----------
    pred_cat /= FOLDS
    pred_xgb /= FOLDS
    pred_lgb /= FOLDS
    # BUG FIX: the stack was never fitted before predict (NotFittedError), and it was
    # fed the raw df_test instead of the engineered feature matrix x_test.
    model_stack.fit(X_train, Y_train)
    pred_stack = model_stack.predict(x_test)
    pred_all = np.expm1(pred_xgb) * 0.1 + np.expm1(pred_stack) * 0.80 + np.expm1(pred_cat) * 0.1
    submission = pd.DataFrame({'id': df_test['id'], 'Calories': pred_all})
    # physically sensible bounds: at least 1 kcal, at most 20 kcal per minute of exercise
    submission['Calories'] = np.clip(submission['Calories'], a_min=1, a_max=20 * df_test['Duration'])
    submission.to_csv('/kaggle/working/submission.csv', index=False)
代碼
使用k折交叉驗證,對預測結果再進行訓練預測。
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge


def init():
    """Quieten logs/warnings and widen pandas display options for EDA printing."""
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # only emit TensorFlow error-level logs
    warnings.simplefilter('ignore')  # suppress warning spam
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 1000)
    pd.set_option("display.max_rows", 1000)
    pd.set_option("display.max_columns", 1000)


def show_dataframe(df):
    """Print a quick EDA summary of *df*: dtypes, head, describe, duplicate count, missing values."""
    print("查看特征值和特征值類型\n" + str(df.dtypes) + "\n" + "-" * 100)
    print("查看前10行信息\n" + str(df.head()) + "\n" + "-" * 100)
    print("查看每個特征值的各種數據統計信息\n" + str(df.describe()) + "\n" + "-" * 100)
    print("輸出重復行的個數\n" + str(df.duplicated().sum()) + "\n" + "-" * 100)
    print("查看每列的缺失值個數\n" + str(df.isnull().sum()) + "\n" + "-" * 100)
    print("查看缺失值的具體信息\n" + str(df.info()) + "\n" + "-" * 100)
    # print("輸出X所有值出現的是什么,還有對應出現的次數\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)


def show_relation(data, colx, coly):
    """Plot the relation between feature *colx* and target *coly*.

    Categorical / low-cardinality (<20 unique) columns get a box plot,
    continuous columns a scatter plot.
    """
    if data[colx].dtype == 'object' or data[colx].dtype == 'category' or len(data[colx].unique()) < 20:
        seaborn.boxplot(x=colx, y=coly, data=data)
    else:
        plt.scatter(data[colx], data[coly])
        plt.xlabel(colx)
        plt.ylabel(coly)
    plt.show()


def show_score(model_name, pred):
    """Print MSE/MAE/R2 for *pred*.

    NOTE(review): reads the module-level ``y_val`` set inside the CV loop in
    ``__main__`` — only valid when called after that loop has run; confirm callers.
    """
    mse = mean_squared_error(y_val, pred)
    mae = mean_absolute_error(y_val, pred)
    score = r2_score(y_val, pred)
    print(model_name)
    print(f"{'MSE':<10}{mse:<15.4f}")
    print(f"{'MAE':<10}{mae:<15.4f}")
    print(f"{'R2':<10}{score:<15.4f}")
    print("-" * 100)


# Function to calculate RMSLE
def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error between *y_true* and *y_pred*."""
    return np.sqrt(np.mean(np.power(np.log1p(y_true) - np.log1p(y_pred), 2)))


if __name__ == '__main__':
    init()
    df_train = pd.read_csv('train.csv')
    df_test = pd.read_csv('test.csv')
    # for col in df_train.columns:
    #     show_relation(df_train, col, 'Calories')

    # ---------- feature engineering (train + test together so encodings match) ----------
    df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)
    df_all['Sex_encoded'] = df_all['Sex'].map({'male': 0, 'female': 1})
    df_all.drop(['Sex'], axis=1, inplace=True)
    df_all = df_all.reset_index(drop=True)
    # BMI
    df_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2
    # Harris-Benedict basal metabolic rate, sex-specific coefficients
    df_all['BMR'] = 0
    df_all.loc[df_all['Sex_encoded'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])
    df_all.loc[df_all['Sex_encoded'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])
    # exercise-intensity features
    df_all['Max_HR'] = 220 - df_all['Age']  # age-predicted maximum heart rate
    df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']
    # interaction features
    df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Sex_Weight'] = df_all['Sex_encoded'] * df_all['Weight']
    # workout "power" proxy
    df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000
    # physiological interaction
    df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']
    # time-scale feature
    df_all['hour_of_day'] = df_all['Duration'] / 60 / 24
    # pairwise products of every current column (beware of dimensionality blow-up)
    numeric_cols = df_all.columns
    for i in range(len(numeric_cols)):
        feature_1 = numeric_cols[i]
        for j in range(i + 1, len(numeric_cols)):
            feature_2 = numeric_cols[j]
            df_all[f'{feature_1}_x_{feature_2}'] = df_all[feature_1] * df_all[feature_2]
    # bin continuous features into 10 discrete buckets
    now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
    for i in now_col:
        df_all[i + "_box"] = pd.cut(df_all[i], bins=10, labels=False, right=False)
    # additional hand-crafted features
    baseline_temp = 37.0
    df_all['Temp_Change'] = df_all['Body_Temp'] - baseline_temp
    df_all['Intensity'] = df_all['Heart_Rate'] / df_all['Duration']
    df_all['Heart_Rate_Ratio'] = df_all['Heart_Rate'] / df_all['Age']
    # NOTE(review): the next few may duplicate columns already built by the pairwise
    # product loop above; overwriting with identical values is harmless.
    df_all['Duration_x_HeartRate'] = df_all['Duration'] * df_all['Heart_Rate']
    df_all['Weight_x_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Height_x_Duration'] = df_all['Height'] * df_all['Duration']
    df_all['Weight_x_Height'] = df_all['Weight'] * df_all['Height']
    df_all['Weight_x_Intensity'] = df_all['Weight'] * df_all['Intensity']
    df_all['Height_x_Intensity'] = df_all['Height'] * df_all['Intensity']

    X_train = df_all[:df_train.shape[0]]
    # BUG FIX: the target was log1p-transformed here AND again inside the CV loop
    # (double transform). Keep it in the original scale; log1p exactly once per fold.
    Y_train = df_train['Calories']
    x_test = df_all[df_train.shape[0]:]

    # xgb  (BUG FIX: was `model_xgb =estimator=XGBRegressor(...)` — stray chained assignment)
    model_xgb = XGBRegressor(
        random_state=42,
        n_estimators=8000,
        objective='reg:squarederror',
        eval_metric='rmse',
        device='cuda',
        learning_rate=0.05,
        max_depth=8,
        colsample_bytree=0.75,
        subsample=0.9,
        # reg_lambda=1,
        # reg_alpha=0.5,
        early_stopping_rounds=200,
    )
    # lgb
    model_lgb = lightgbm.LGBMRegressor(
        n_estimators=3000,       # many rounds, relying on early stopping
        learning_rate=0.03,      # smaller learning rate
        num_leaves=15,           # limit model complexity
        min_child_samples=25,    # larger minimum leaf size
        reg_alpha=0.1,           # L1 regularization
        reg_lambda=0.1,          # L2 regularization
        objective='regression_l1',  # MAE loss
        early_stopping_rounds=200,
        eval_metric='RMSE',
    )
    # cat
    model_cat = CatBoostRegressor(
        iterations=3500,
        learning_rate=0.02,
        depth=12,
        loss_function='RMSE',
        l2_leaf_reg=3,
        random_seed=42,
        eval_metric='RMSE',
        verbose=1000,
        task_type='GPU',
        early_stopping_rounds=200,
    )

    # ---------- K-fold CV: generate OOF and test predictions for the base models ----------
    print("🔄Generating Out-of-Fold (OOF) predictions and Test predictions for Base Models...\n" + "-" * 70 + "\n")
    # out-of-fold predictions (original scale)
    add_pred_val_cat = np.zeros(len(X_train))
    add_pred_val_xgb = np.zeros(len(X_train))
    add_pred_val_lgb = np.zeros(len(X_train))
    # test predictions, averaged across folds
    add_pred_test_cat = np.zeros(len(x_test))
    add_pred_test_xgb = np.zeros(len(x_test))
    add_pred_test_lgb = np.zeros(len(x_test))
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train, Y_train)):
        print(f"\n---Fold {fold + 1}/{kf.n_splits} ---")
        x_train, x_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train, y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]
        # the single log1p transform of the target for training
        y_train_log1p = np.log1p(y_train)
        y_val_log1p = np.log1p(y_val)
        # --- CatBoost ---
        print("   ? Training CatBoost...")
        model_cat.fit(x_train, y_train_log1p, eval_set=[(x_val, y_val_log1p)], verbose=0)
        pred_val_cat = model_cat.predict(x_val)
        pred_test_cat = model_cat.predict(x_test)
        # --- XGBoost ---
        print("   ? Training XGBoost...")
        model_xgb.fit(x_train, y_train_log1p, eval_set=[(x_val, y_val_log1p)], verbose=0)
        pred_val_xgb = model_xgb.predict(x_val)
        pred_test_xgb = model_xgb.predict(x_test)
        # --- LGBM ---
        print("   ? Training LGBM...")
        model_lgb.fit(x_train, y_train_log1p, eval_set=[(x_val, y_val_log1p)])
        pred_val_lgb = model_lgb.predict(x_val)
        pred_test_lgb = model_lgb.predict(x_test)
        # store predictions back in the original (kcal) scale
        add_pred_val_cat[val_index] = np.expm1(pred_val_cat)
        add_pred_val_xgb[val_index] = np.expm1(pred_val_xgb)
        add_pred_val_lgb[val_index] = np.expm1(pred_val_lgb)
        add_pred_test_cat += np.expm1(pred_test_cat) / kf.n_splits
        add_pred_test_xgb += np.expm1(pred_test_xgb) / kf.n_splits
        add_pred_test_lgb += np.expm1(pred_test_lgb) / kf.n_splits
        # calories cannot be negative
        add_pred_val_cat[add_pred_val_cat < 0] = 0
        add_pred_val_xgb[add_pred_val_xgb < 0] = 0
        add_pred_val_lgb[add_pred_val_lgb < 0] = 0
        # per-fold RMSLE (now genuinely on the original scale thanks to the fix above)
        print(f"   CatBoost RMSLE (Fold {fold + 1}): {rmsle(y_val, add_pred_val_cat[val_index]):.4f}")
        print(f"   XGBoost  RMSLE (Fold {fold + 1}): {rmsle(y_val, add_pred_val_xgb[val_index]):.4f}")
        print(f"   LGBM     RMSLE (Fold {fold + 1}): {rmsle(y_val, add_pred_val_lgb[val_index]):.4f}")

    # ---------- level-2 meta model on the OOF predictions ----------
    x_meta_train = pd.DataFrame({
        'cat_pred': add_pred_val_cat,
        'xgb_pred': add_pred_val_xgb,
        'lgbm_pred': add_pred_val_lgb,
    })
    # meta model learns in log space; the final expm1 maps back to calories
    y_meta_train = np.log1p(Y_train)
    x_meta_test = pd.DataFrame({
        'cat_pred': add_pred_test_cat,
        'xgb_pred': add_pred_test_xgb,
        'lgbm_pred': add_pred_test_lgb,
    })
    model_meta = Ridge(random_state=42)
    model_meta.fit(x_meta_train, y_meta_train)
    print(f" meta RMSLE :{rmsle(Y_train, np.expm1(model_meta.predict(x_meta_train))):.4f}")
    pred_all = np.expm1(model_meta.predict(x_meta_test))
    submission = pd.DataFrame({'id': df_test['id'], 'Calories': pred_all})
    # physically sensible bounds: between 1 and 20 kcal per minute of exercise
    submission['Calories'] = np.clip(submission['Calories'], a_min=df_test['Duration'], a_max=20 * df_test['Duration'])
    submission.to_csv('submission.csv', index=False)