Contents
Data Level
Data Augmentation
Data Normalization
Data Sampling
Model Structure Level
Simplify the Model
Add Regularization Layers
Early Stopping
Training Process Level
Use Cross-Validation
Use Ensemble Learning
Adjust the Learning Rate
Preventing overfitting is one of the central problems in machine learning: it is what lets a model perform well on new data rather than just on the data it was trained on. The methods below are grouped into three levels: the data level, the model structure level, and the training process level.
Data Level
Data Augmentation
Data augmentation applies transformations to the training data (such as rotation, scaling, and cropping) to increase its diversity, which makes it harder for the model to simply memorize the training set.
## Data level
# 1. Data augmentation
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert the image data to floats and normalize to [0, 1]
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# Expand the images to a 4D tensor (samples, height, width, channels)
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

# Create the data augmentation generator
datagen = ImageDataGenerator(
    rotation_range=20,       # random rotation range, in degrees
    width_shift_range=0.1,   # random horizontal shift range
    height_shift_range=0.1,  # random vertical shift range
    shear_range=0.2,         # shear intensity
    zoom_range=0.2,          # random zoom range
    horizontal_flip=False,   # no horizontal flips (flipped digits are usually meaningless)
    fill_mode='nearest'      # how newly created pixels are filled
)

# Pick one image to augment
sample_image = x_train[0]                       # the first training image
sample_image = np.expand_dims(sample_image, 0)  # add a batch dimension

# Generate augmented images with the generator
augmented_images = datagen.flow(sample_image, batch_size=1)

# Visualize the augmented images
plt.figure(figsize=(10, 6))
for i in range(10):  # generate and display 10 augmented images
    augmented_image = next(augmented_images)[0]  # fetch one augmented image
    plt.subplot(2, 5, i + 1)
    plt.imshow(augmented_image.squeeze(), cmap='gray')  # show as grayscale
    plt.axis('off')  # hide the axes
plt.show()
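The script above only visualizes augmented samples; to actually train on them, the generator can be handed straight to model.fit. A minimal sketch, assuming a compiled Keras model like the ones built in the later sections (the epoch count is illustrative):

# Sketch: train on augmented batches generated on the fly (assumes a compiled Keras `model`)
model.fit(
    datagen.flow(x_train, y_train, batch_size=128),  # each batch is randomly transformed
    epochs=10,                                       # illustrative value, not tuned
    validation_data=(x_test, y_test)                 # validation data stays unaugmented
)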
Data Normalization
Normalizing or standardizing the input data puts all features on a comparable scale; this stabilizes training and reduces the model's tendency to overfit scale artifacts in the data.
# 2. Data normalization
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from sklearn.preprocessing import StandardScaler

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert the image data to floats
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# Normalization (scale pixel values to [0, 1])
x_train_normalized = x_train / 255.0
x_test_normalized = x_test / 255.0

# Standardization (rescale to zero mean and unit standard deviation)
scaler = StandardScaler()
x_train_reshaped = x_train.reshape(-1, 28 * 28)  # flatten the images to a 2D array
x_test_reshaped = x_test.reshape(-1, 28 * 28)
x_train_standardized = scaler.fit_transform(x_train_reshaped)
x_test_standardized = scaler.transform(x_test_reshaped)

# Visualize the effect of normalization and standardization
def plot_images(images, title):
    plt.figure(figsize=(10, 2))
    for i in range(10):
        plt.subplot(1, 10, i + 1)
        plt.imshow(images[i], cmap='gray')
        plt.axis('off')
    plt.suptitle(title)
    plt.show()

# Show the original images
plot_images(x_train[:10], "Original Images")

# Show the normalized images
plot_images(x_train_normalized[:10], "Normalized Images")

# Show the standardized images
plot_images(x_train_standardized[:10].reshape(-1, 28, 28), "Standardized Images")
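Note that the scaler is fit on the training set only and merely applied to the test set, so the same statistics are reused at inference time. As an aside not in the original, recent TensorFlow versions can bake the same standardization into the model itself; a sketch assuming TensorFlow 2.6 or later:

# Sketch: standardization as a model layer (assumes TensorFlow >= 2.6)
from tensorflow.keras.layers import Normalization

normalizer = Normalization(axis=-1)
normalizer.adapt(x_train_reshaped)  # learn per-feature mean and variance from the training data
# Placing the adapted layer first in a Sequential model applies the same
# standardization consistently during both training and inference.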
Data Sampling
Data sampling balances the dataset either by under-sampling (removing majority-class samples) or over-sampling (adding minority-class samples), which reduces the model's overfitting to the majority class.
# 3. Data sampling
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert the image data to floats
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# Simulate an imbalanced dataset (keep only digits 0 and 1)
x_train_sampled = x_train[y_train < 2]
y_train_sampled = y_train[y_train < 2]

# Over-sampling (SMOTE)
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_train_sampled.reshape(-1, 28 * 28), y_train_sampled)

# Under-sampling (RandomUnderSampler)
undersampler = RandomUnderSampler(random_state=42)
x_undersampled, y_undersampled = undersampler.fit_resample(x_train_sampled.reshape(-1, 28 * 28), y_train_sampled)

# Visualize the effect of over- and under-sampling
def plot_sampled_images(images, labels, title):
    plt.figure(figsize=(10, 2))
    for i in range(10):
        plt.subplot(1, 10, i + 1)
        plt.imshow(images[i].reshape(28, 28), cmap='gray')
        plt.title(labels[i])
        plt.axis('off')
    plt.suptitle(title)
    plt.show()

# Show the over-sampled images
plot_sampled_images(x_resampled[:10], y_resampled[:10], "Over-sampled Images")

# Show the under-sampled images
plot_sampled_images(x_undersampled[:10], y_undersampled[:10], "Under-sampled Images")
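To confirm what the two samplers actually did, it helps to print the class counts before and after resampling; a small sketch using only NumPy:

# Check the class balance before and after resampling
def print_class_counts(labels, title):
    classes, counts = np.unique(labels, return_counts=True)
    print(title, dict(zip(classes.tolist(), counts.tolist())))

print_class_counts(y_train_sampled, 'Original:')            # digits 0 and 1 are only mildly imbalanced
print_class_counts(y_resampled, 'After SMOTE:')             # minority class synthesized up to parity
print_class_counts(y_undersampled, 'After undersampling:')  # majority class cut down to parity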
Model Structure Level
Simplify the Model
Choosing a simpler architecture or reducing model capacity is an effective way to curb overfitting, for example by using fewer layers or fewer neurons per layer.
## Model structure level
# 1. Simplify the model
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Build a simplified model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(28 * 28,)))  # fewer neurons
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history_simple = model.fit(
    x_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=128
)

# Visualize the training history
def plot_training_history(history, title):
    plt.figure(figsize=(12, 4))
    # Plot training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # Plot training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Training and Validation Accuracy ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()

plot_training_history(history_simple, "Simple Model")
Add Regularization Layers
Adding regularization to the model (such as Dropout layers or L1/L2 weight penalties) reduces its dependence on any individual training example.
# 2. Add regularization layers
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Build a model with regularization layers
model = Sequential()
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01), input_shape=(28 * 28,)))  # L2 regularization
model.add(Dropout(0.5))  # Dropout
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history_regularized = model.fit(
    x_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=128
)

# Visualize the training history
def plot_training_history(history, title):
    plt.figure(figsize=(12, 4))
    # Plot training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # Plot training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Training and Validation Accuracy ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()

plot_training_history(history_regularized, "Regularized Model")
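The prose above mentions L1 as well as L2 penalties, but the example only uses l2. For completeness, a sketch of the L1 and combined variants Keras ships with (the 0.01 coefficients are illustrative, not tuned values):

# Sketch: L1 and combined L1+L2 weight penalties (coefficients are illustrative)
from tensorflow.keras.regularizers import l1, l1_l2

layer_l1 = Dense(256, activation='relu', kernel_regularizer=l1(0.01))                    # encourages sparse weights
layer_l1_l2 = Dense(256, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01))  # elastic-net style penalty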
Early Stopping
Early stopping monitors the validation loss during training and halts once it stops improving, so the model never gets the chance to overfit in later epochs.
# 3. Early stopping
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Build the model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Configure early stopping: stop after 5 epochs without val_loss improvement
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history_early_stopping = model.fit(
    x_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=128,
    callbacks=[early_stopping]
)

# Visualize the training history
def plot_training_history(history, title):
    plt.figure(figsize=(12, 4))
    # Plot training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # Plot training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Training and Validation Accuracy ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()

plot_training_history(history_early_stopping, "Early Stopping")
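Since restore_best_weights=True rolls the model back to its best validation epoch, it can be useful to check where training actually halted; a small sketch using attributes the Keras callback exposes:

# Inspect where early stopping kicked in
print('Stopped at epoch:', early_stopping.stopped_epoch)  # stays 0 if all 50 epochs ran
print('Epochs actually run:', len(history_early_stopping.history['loss']))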
Training Process Level
Use Cross-Validation
Cross-validation gives a more reliable estimate of a model's generalization ability and avoids tuning the model to one particular train/validation split.
## Training process level
# 1. Use cross-validation
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import KFold

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Define the model
def create_model():
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(28 * 28,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
accuracies = []

for train_index, val_index in kf.split(x_train):
    print(f'Training on fold {fold_no}...')
    x_train_fold, x_val_fold = x_train[train_index], x_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    model = create_model()
    model.fit(x_train_fold, y_train_fold, epochs=10, batch_size=128, verbose=0)
    scores = model.evaluate(x_val_fold, y_val_fold, verbose=0)
    accuracies.append(scores[1])
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1] * 100}%')
    fold_no += 1

# Report the average cross-validation accuracy
print(f'Average accuracy: {np.mean(accuracies) * 100}%')  # 97.5766670703888%
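For classification tasks, a stratified split keeps the class proportions the same in every fold, which usually yields a less noisy estimate. A drop-in sketch (note that split then also needs the labels):

# Sketch: stratified folds preserve the label distribution in each split
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, val_index in skf.split(x_train, y_train):  # labels are required here
    pass  # reuse the same per-fold training loop as above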
Use Ensemble Learning
Ensemble learning combines several models to improve generalization: the individual models' errors tend to cancel out, which reduces overfitting.
# 2. Use ensemble learning
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.datasets import mnist

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Define several base models
model1 = LogisticRegression(max_iter=1000, random_state=42)
model2 = SVC(probability=True, random_state=42)
model3 = RandomForestClassifier(random_state=42)

# Create the ensemble (soft voting averages the predicted class probabilities)
ensemble_model = VotingClassifier(estimators=[('lr', model1), ('svc', model2), ('rf', model3)], voting='soft')

# Train the ensemble
ensemble_model.fit(x_train, y_train)

# Predict and evaluate
y_pred = ensemble_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble model accuracy: {accuracy * 100}%')  # 97.21%
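One practical caveat: SVC with probability=True scales poorly, so fitting this ensemble on all 60,000 training images takes a long time. When experimenting, it is common to fit on a subset first; a sketch (the 10,000-sample size is an arbitrary choice):

# Sketch: fit on a subset to keep SVC's runtime manageable (10,000 is arbitrary)
subset = 10000
ensemble_model.fit(x_train[:subset], y_train[:subset])
print(f'Subset accuracy: {accuracy_score(y_test, ensemble_model.predict(x_test)) * 100}%')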
Adjust the Learning Rate
Tuning the learning rate appropriately, for example by lowering it when the validation loss plateaus, keeps the model from over-fitting the training data late in training.
# 3. Adjust the learning rate
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Data preprocessing
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
x_train = x_train.reshape(-1, 28 * 28)
x_test = x_test.reshape(-1, 28 * 28)

# Define the model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(28 * 28,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Dynamically reduce the learning rate when the validation loss plateaus
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

# Train the model
history = model.fit(
    x_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=128,
    callbacks=[reduce_lr]
)

# Visualize the training history
def plot_training_history(history, title):
    plt.figure(figsize=(12, 4))
    # Plot training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    # Plot training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title(f'Training and Validation Accuracy ({title})')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()

plot_training_history(history, "Dynamic Learning Rate")