【機器學習實戰】Datawhale夏令營：Baseline精讀筆記2

# AI夏令營 # Datawhale # 夏令營

在原有的Baseline上，除了交叉驗證，還有一種關鍵的優化方式，即特徵工程。

如何優化特徵，關係著我們能否提高模型預測的精準度。特徵工程往往需要對問題領域有深入了解的人員才能做好，因為我們要思考特徵轉換的方式。

除了SMILES特徵之外，還有很多特徵可以提取有價值的信息。比如InChI由一系列部分組成，提供了關於分子結構的詳細信息，包括開頭標識、分子式、連接表、氫原子計數、多可旋轉鍵計數、立體化學信息、同分異構體信息、混合物或互變異構體信息、電荷和自旋多重度信息等。

除此之外，要想提升模型的精準度，換模型也未嘗不可。

特征優化

提取分子式

從InChI字符串中，我們可以看到分子式直接給出在/C47H61N7O6S部分。這意味著分子由47個碳原子、61個氫原子、7個氮原子、6個氧原子和1個硫原子組成；

計算分子量

分子量可以通過將每種原子的原子質量乘以其數量，然後相加得到。

如

碳（C）的原子質量約為12.01 g/mol
氫（H）的原子質量約為1.008 g/mol
氮（N）的原子質量約為14.01 g/mol
氧（O）的原子質量約為16.00 g/mol
硫（S）的原子質量約為32.07 g/mol

乘以數量相加，我們就可以得到分子量。

原子計數

直接計算不同原子的個數，並展開成各自的特徵列。

import pandas as pd
import re

# Approximate atomic masses in g/mol, keyed by the element symbol exactly as it
# is written in a molecular formula (e.g. 'Cl', not 'CL').
atomic_masses = {
    'H': 1.008, 'He': 4.002602, 'Li': 6.94, 'Be': 9.0122, 'B': 10.81, 'C': 12.01,
    'N': 14.01, 'O': 16.00, 'F': 19.00, 'Ne': 20.180, 'Na': 22.990, 'Mg': 24.305,
    'Al': 26.982, 'Si': 28.085, 'P': 30.97, 'S': 32.07, 'Cl': 35.45, 'Ar': 39.95,
    'K': 39.10, 'Ca': 40.08, 'Sc': 44.956, 'Ti': 47.867, 'V': 50.942, 'Cr': 52.00,
    'Mn': 54.938, 'Fe': 55.845, 'Co': 58.933, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.38,
}


def parse_inchi(row):
    """Parse the molecular formula out of one row's InChI string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'InChI' entry.

    Returns:
        pd.Series with three entries:
        'Formula' (str, '' when no formula was found),
        'MolecularWeight' (sum of atomic mass * count; elements missing from
        ``atomic_masses`` contribute 0),
        'ElementCounts' (dict mapping element symbol -> atom count).
    """
    inchi_str = row['InChI']
    formula = ''
    molecular_weight = 0
    element_counts = {}

    # Missing values (NaN/None) are not strings; treat them like "no formula"
    # instead of letting re.search raise a TypeError.
    if isinstance(inchi_str, str):
        # The formula is the layer between "InChI=1S/" and the "/c" layer.
        formula_match = re.search(r"InChI=1S/([^/]+)/c", inchi_str)
        if formula_match:
            formula = formula_match.group(1)
            for element, count in re.findall(r"([A-Z][a-z]*)([0-9]*)", formula):
                count = int(count) if count else 1
                # Keep the symbol's original casing: upper-casing would turn
                # 'Cl' into 'CL', which is not a key of atomic_masses.
                molecular_weight += atomic_masses.get(element, 0) * count
                # Accumulate so that an element appearing more than once in
                # the formula (e.g. 'C2H6O.ClH') is not overwritten.
                element_counts[element] = element_counts.get(element, 0) + count

    return pd.Series({
        'Formula': formula,
        'MolecularWeight': molecular_weight,
        'ElementCounts': element_counts,
    })


# Apply the function to every row of the DataFrame
train[['Formula', 'MolecularWeight', 'ElementCounts']] = train.apply(parse_inchi, axis=1)

# Element symbols that each get their own count column
keys = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']

# Expand every row's element-count dict into one column per element
# (0 when the element is absent). All rows are collected first and the frame
# is built in a single call, instead of enlarging an empty DataFrame one cell
# at a time with `.at`. Rows whose entry is not a dict count as all zeros.
df_expanded = pd.DataFrame(
    [
        {key: (item.get(key, 0) if isinstance(item, dict) else 0) for key in keys}
        for item in train['ElementCounts'].values
    ],
    columns=keys,
)

模型融合

上次提到我們使用的是catboost模型，還沒有嘗試過lightgbm和xgboost。可以依次跑完這三個模型，然後對三個模型的結果取平均進行融合（這也是可以改進的地方）。

def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2023):
    """Train one boosting model with 5-fold CV; return OOF and test predictions.

    Args:
        clf: The ``lightgbm`` module for 'lgb', the ``xgboost`` module for
            'xgb', or the ``CatBoostClassifier`` class for 'cat'.
        train_x: Training features; rows are selected positionally via ``.iloc``.
        train_y: Binary labels, one per row of ``train_x``.
        test_x: Test features.
        clf_name: One of 'lgb', 'xgb', 'cat'.
        seed: ``random_state`` of the shuffled KFold split. The models' own
            seeds are fixed in their parameter dicts below.

    Returns:
        (oof, test_predict): 1-D arrays of positive-class probabilities, the
        out-of-fold predictions for ``train_x`` and the fold-averaged
        predictions for ``test_x``.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("clf_name must be 'lgb', 'xgb' or 'cat', got {!r}".format(clf_name))

    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # Index labels by position, like train_x.iloc, so a Series whose index is
    # not 0..n-1 cannot get misaligned with the features.
    labels = np.asarray(train_y).ravel()
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.35,
                'seed': 2024,
                'nthread': 16,
                'verbose': -1,
            }
            # Early stopping and log period are passed as callbacks; the
            # `early_stopping_rounds` / `verbose_eval` keywords of train()
            # are not accepted by current LightGBM releases.
            model = clf.train(
                params,
                train_matrix,
                num_boost_round=2000,
                valid_sets=[train_matrix, valid_matrix],
                categorical_feature=[],
                callbacks=[
                    clf.early_stopping(stopping_rounds=100),
                    clf.log_evaluation(period=1000),
                ],
            )
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            # No 'num_class' here: it belongs to the multi-class objectives
            # and contradicts 'binary:logistic'.
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.35,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16,
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=2000,
                              evals=watchlist, verbose_eval=1000,
                              early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        else:  # clf_name == "cat"
            # 'random_seed' used to appear twice in this dict (2024, then 11);
            # only the last entry ever took effect, so 11 is kept.
            params = {
                'learning_rate': 0.35,
                'depth': 5,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 100,
                'random_seed': 11,
                'allow_writing_files': False,
            }
            model = clf(iterations=2000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=1000,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            # predict_proba yields one column per class; keep only the
            # positive-class column so it fits the 1-D oof/test buffers.
            val_pred = model.predict_proba(val_x)[:, 1]
            test_pred = model.predict_proba(test_x)[:, 1]

        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        fold_f1 = f1_score(val_y, np.where(val_pred > 0.5, 1, 0))
        cv_scores.append(fold_f1)
        print(cv_scores)

    return oof, test_predict


# Following the demo: call cv_model as in the practice section of the baseline
# Train and predict with LightGBM
lgb_oof, lgb_test = cv_model(
    clf=lgb, train_x=x_train, train_y=y_train, test_x=x_test, clf_name='lgb'
)

# Train and predict with XGBoost
xgb_oof, xgb_test = cv_model(
    clf=xgb, train_x=x_train, train_y=y_train, test_x=x_test, clf_name='xgb'
)

# Train and predict with CatBoost
cat_oof, cat_test = cv_model(
    clf=CatBoostClassifier, train_x=x_train, train_y=y_train, test_x=x_test, clf_name='cat'
)

# Blend the three models by taking the plain average of their test predictions
final_test = (lgb_test + xgb_test + cat_test) / 3

或者可以用stacking的方法

代碼更正

要想在飛槳上跑數據，直接複製粘貼是不行的。由於部分模型庫版本更迭，下面給出完整的更正後代碼。

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import f1_score
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.feature_extraction.text import TfidfVectorizer
import tqdm, sys, os, gc, re, argparse, warnings
warnings.filterwarnings('ignore')

train = pd.read_excel('./data/data280993/traindata-new.xlsx')
test = pd.read_excel('./data/data280993/testdata-new.xlsx')

# Per the original note, the test data has no 'DC50 (nM)' / 'Dmax (%)'
# columns, so they are removed from the training data as well.
train = train.drop(columns=['DC50 (nM)', 'Dmax (%)'])

# Columns with fewer than 10 non-null values in the test set are dropped from
# both sets so that mostly-empty columns are not used later on.
drop_cols = [f for f in test.columns if test[f].notnull().sum() < 10]
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

# Stack train and test into one frame so feature engineering is applied
# uniformly to both.
data = pd.concat([train, test], axis=0, ignore_index=True)
cols = data.columns[2:]

# Parse each SMILES into a molecule and write it back out as an
# isomeric SMILES string (one string per row).
data['smiles_list'] = data['Smiles'].apply(
    lambda smi: ' '.join([Chem.MolToSmiles(Chem.MolFromSmiles(smi), isomericSmiles=True)])
)

# TF-IDF features computed over the SMILES strings
tfidf = TfidfVectorizer(max_df=0.9, min_df=1, sublinear_tf=True)
res = tfidf.fit_transform(data['smiles_list'])

# Densify the result into a DataFrame with one named column per TF-IDF term
tfidf_df = pd.DataFrame(res.toarray())
tfidf_df.columns = ['smiles_tfidf_{}'.format(i) for i in range(tfidf_df.shape[1])]

# Append the TF-IDF columns to data column-wise
data = pd.concat([data, tfidf_df], axis=1)

# Label (ordinal) encoding
def label_encode(series):
    """Encode the distinct values of *series* as integers 0, 1, 2, ...

    Codes follow the order of first appearance. Missing values stay missing
    in the result instead of receiving a code.

    The code table is built from the non-null values only. Pairing
    ``series.unique()`` (which includes NaN) with ``range(series.nunique())``
    (which does not count NaN) left the last category without a code whenever
    the column contained a missing value.

    Args:
        series: pandas Series to encode.

    Returns:
        pandas Series of the same length holding the integer codes.
    """
    categories = series.dropna().unique()
    mapping = {value: code for code, value in enumerate(categories)}
    return series.map(mapping)


# Encode every object-typed column among the original feature columns
for col in cols:
    if data[col].dtype == 'object':
        data[col] = label_encode(data[col])

# Rows with a Label form the training set; rows without one form the test set
train = data[data.Label.notnull()].reset_index(drop=True)
test = data[data.Label.isnull()].reset_index(drop=True)

lgb不支持特殊字符的輸入，因此我們需要重寫特征名稱：

import re


def strict_clean_feature_name(name):
    """Return *name* reduced to ASCII letters, digits and underscores.

    Every other character is removed. If the result is empty or starts with a
    digit, it is prefixed with 'f_'. Non-string labels (for example integer
    column names) are converted with ``str()`` first rather than raising.

    Args:
        name: Column label to clean.

    Returns:
        The cleaned name as a str.
    """
    # Keep only letters, digits and underscores
    name = re.sub(r'[^a-zA-Z0-9_]', '', str(name))
    # Make sure the name is non-empty and does not start with a digit
    if not name or name[0].isdigit():
        name = 'f_' + name
    return name


# Apply the new cleaning function
x_train.columns = list(map(strict_clean_feature_name, x_train.columns))
x_test.columns = list(map(strict_clean_feature_name, x_test.columns))

# Inspect the feature names again after cleaning
print(x_train.columns)

模型融合：

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Approximate atomic masses in g/mol, keyed by element symbol
atomic_masses = {
    'H': 1.008, 'He': 4.002602, 'Li': 6.94, 'Be': 9.0122, 'B': 10.81, 'C': 12.01,
    'N': 14.01, 'O': 16.00, 'F': 19.00, 'Ne': 20.180, 'Na': 22.990, 'Mg': 24.305,
    'Al': 26.982, 'Si': 28.085, 'P': 30.97, 'S': 32.07, 'Cl': 35.45, 'Ar': 39.95,
    'K': 39.10, 'Ca': 40.08, 'Sc': 44.956, 'Ti': 47.867, 'V': 50.942, 'Cr': 52.00,
    'Mn': 54.938, 'Fe': 55.845, 'Co': 58.933, 'Ni': 58.69, 'Cu': 63.55, 'Zn': 65.38,
}


def extract_element_counts(formula):
    """Return a dict mapping element symbol -> atom count for *formula*.

    *formula* is a molecular formula string such as 'C47H61N7O6S'. A symbol
    with no number counts as 1; a symbol appearing more than once is summed.
    """
    element_counts = {}
    for element, count in re.findall(r"([A-Z][a-z]*)([0-9]*)", formula):
        element_counts[element] = element_counts.get(element, 0) + (int(count) if count else 1)
    return element_counts


def calculate_molecular_weight(formula):
    """Return the sum of atomic mass * count over the elements of *formula*.

    Elements that are not listed in ``atomic_masses`` contribute 0.
    """
    return sum(
        atomic_masses.get(element, 0) * count
        for element, count in extract_element_counts(formula).items()
    )


def parse_inchi(row):
    """Parse the molecular formula out of one row's InChI string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'InChI' entry.

    Returns:
        pd.Series with entries 'Formula', 'MolecularWeight' and
        'ElementCounts'. All three are None when the InChI value is not a
        string or does not contain a formula layer followed by "/c".
    """
    inchi_str = row['InChI']
    # Non-string values (e.g. None/NaN) are handled like a failed match.
    formula_match = (
        re.search(r"InChI=1S/([^/]+)/c", inchi_str)
        if isinstance(inchi_str, str)
        else None
    )
    if formula_match is None:
        return pd.Series({
            'Formula': None,
            'MolecularWeight': None,
            'ElementCounts': None,
        })

    formula = formula_match.group(1)
    return pd.Series({
        'Formula': formula,
        'MolecularWeight': calculate_molecular_weight(formula),
        'ElementCounts': extract_element_counts(formula),
    })


# Apply the function to every row of the DataFrame
train[['Formula', 'MolecularWeight', 'ElementCounts']] = train.apply(parse_inchi, axis=1)

# Element symbols that each get their own count column
keys = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn']

# Expand every row's element-count dict into one column per element
# (0 when the element is absent). Rows without a dict (e.g. None for an
# unparsable InChI) become all zeros. All rows are collected first and the
# frame is built in a single call, instead of enlarging an empty DataFrame one
# cell at a time with `.at`.
df_expanded = pd.DataFrame(
    [
        {key: (item.get(key, 0) if isinstance(item, dict) else 0) for key in keys}
        for item in train['ElementCounts'].values
    ],
    columns=keys,
)
# Re-wrap of the expanded element-count frame, kept from the original script
df_expanded = pd.DataFrame(df_expanded)


def cv_model(clf, train_x, train_y, test_x, clf_name, seed = 2023):
    """Train one boosting model with 5-fold CV; return OOF and test predictions.

    Args:
        clf: The ``lightgbm`` module for 'lgb', the ``xgboost`` module for
            'xgb', or the ``CatBoostClassifier`` class for 'cat'.
        train_x: Training features; rows are selected positionally via ``.iloc``.
        train_y: Binary labels, one per row of ``train_x``.
        test_x: Test features.
        clf_name: One of 'lgb', 'xgb', 'cat'.
        seed: ``random_state`` of the shuffled KFold split. The models' own
            seeds are fixed in their parameter dicts below.

    Returns:
        (oof, test_predict): 1-D arrays of positive-class probabilities, the
        out-of-fold predictions for ``train_x`` and the fold-averaged
        predictions for ``test_x``.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("clf_name must be 'lgb', 'xgb' or 'cat', got {!r}".format(clf_name))

    folds = 5
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # Index labels by position, like train_x.iloc, so a Series whose index is
    # not 0..n-1 cannot get misaligned with the features.
    labels = np.asarray(train_y).ravel()
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.35,
                'seed': 2024,
                'verbose': -1,
            }
            # The callbacks come from `clf` (the lightgbm module passed in)
            # so this branch does not depend on a global named `lgb`.
            model = clf.train(
                params,
                train_matrix,
                num_boost_round=2000,
                valid_sets=[train_matrix, valid_matrix],
                categorical_feature=[],
                callbacks=[
                    clf.early_stopping(stopping_rounds=100),
                    clf.log_evaluation(period=1000),
                ],
            )
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            # No 'num_class' here: it belongs to the multi-class objectives
            # and contradicts 'binary:logistic'.
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.35,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16,
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = clf.train(xgb_params, train_matrix, num_boost_round=2000,
                              evals=watchlist, verbose_eval=1000,
                              early_stopping_rounds=100)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        else:  # clf_name == "cat"
            # 'random_seed' used to appear twice in this dict (2024, then 11);
            # only the last entry ever took effect, so 11 is kept.
            params = {
                'learning_rate': 0.35,
                'depth': 5,
                'bootstrap_type': 'Bernoulli',
                'od_type': 'Iter',
                'od_wait': 100,
                'random_seed': 11,
                'allow_writing_files': False,
            }
            model = clf(iterations=2000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=1000,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            # predict_proba yields one column per class; keep only the
            # positive-class column so it fits the 1-D oof/test buffers.
            val_pred = model.predict_proba(val_x)[:, 1]
            test_pred = model.predict_proba(test_x)[:, 1]

        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        fold_f1 = f1_score(val_y, np.where(val_pred > 0.5, 1, 0))
        cv_scores.append(fold_f1)
        print(cv_scores)

    return oof, test_predict


# Following the demo: call cv_model as in the practice section of the baseline
# Train and predict with LightGBM
lgb_oof, lgb_test = cv_model(
    clf=lgb, train_x=x_train, train_y=y_train, test_x=x_test, clf_name='lgb'
)

# Train and predict with XGBoost
xgb_oof, xgb_test = cv_model(
    clf=xgb, train_x=x_train, train_y=y_train, test_x=x_test, clf_name='xgb'
)

# Train and predict with CatBoost
cat_oof, cat_test = cv_model(
    clf=CatBoostClassifier, train_x=x_train, train_y=y_train, test_x=x_test, clf_name='cat'
)

# Blend the three models by taking the plain average of their test predictions
final_test = (lgb_test + xgb_test + cat_test) / 3

跑得比較慢。

Stacking

def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
    """Stack three base models' predictions with a Ridge meta-model.

    Args:
        oof_1, oof_2, oof_3: Out-of-fold predictions of the base models on the
            training rows (e.g. lgb_oof, xgb_oof, cat_oof). 1-D arrays or
            Series are both accepted.
        predictions_1, predictions_2, predictions_3: The same models'
            predictions on the test rows (e.g. lgb_test, xgb_test, cat_test).
        y: Training labels, one per training row.

    Returns:
        (oof, predictions): the meta-model's out-of-fold predictions for the
        training rows and its fold-averaged predictions for the test rows.
        Each training row is validated once per repeat, so ``oof`` holds the
        value written by the last repeat.
    """
    # Imported locally, like RepeatedKFold was, so the function does not rely
    # on imports made elsewhere in the file.
    from sklearn.linear_model import Ridge
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import RepeatedKFold

    # Build plain 2-D arrays (one column per base model). pd.concat only
    # accepts Series/DataFrame objects and rejects the numpy arrays returned
    # by cv_model.
    train_stack = np.column_stack([np.asarray(col).ravel() for col in (oof_1, oof_2, oof_3)])
    test_stack = np.column_stack(
        [np.asarray(col).ravel() for col in (predictions_1, predictions_2, predictions_3)]
    )
    # Positional labels, so a Series with a non-default index stays aligned.
    target = np.asarray(y).ravel()

    n_splits, n_repeats = 5, 2
    total_folds = n_splits * n_repeats

    oof = np.zeros((train_stack.shape[0],))
    predictions = np.zeros((test_stack.shape[0],))
    scores = []

    folds = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=2021)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, target)):
        print("fold n°{}".format(fold_+1))
        val_y = target[val_idx]

        clf = Ridge(random_state=2024)
        clf.fit(train_stack[trn_idx], target[trn_idx])

        oof[val_idx] = clf.predict(train_stack[val_idx])
        # Average the test predictions over every fold of every repeat.
        predictions += clf.predict(test_stack) / total_folds

        score_single = roc_auc_score(val_y, oof[val_idx])
        scores.append(score_single)
        # Report progress against the real number of folds (splits * repeats).
        print(f'{fold_+1}/{total_folds}', score_single)

    print('mean: ', np.mean(scores))
    return oof, predictions