【機器學習】數據預處理
- 1. 下載/解壓數據
- 2. 數據預處理
- 3. 加載以及訓練代碼
- 3.1 使用PIL等加載代碼
- 3.2 使用OpenCV的方式來一張張加載代碼
- 3.3 h5的方式來加載大文件
- 最后總結
這個數據集大約140GB,包含128萬張訓練圖片
1. 下載/解壓數據
首先需要下載數據:
數據最后處理成如圖的格式,每個種類的數據都放到一個相同的文件夾中去,這里的文件夾名稱(種類名稱)最好改成整數,方便后續處理
2. 數據預處理
需要對數據做如下處理
- 處理成模型需要的224*224長寬的數據
- 處理成h5/npy之類大文件格式,從而減少CPU的IO開支
import h5py
import numpy as np
import os
from tqdm import tqdm
import cv2
from concurrent.futures import ThreadPoolExecutor
from sklearn.preprocessing import LabelEncoderdef process_image(file_path, size=(224, 224)):image = cv2.imread(file_path)if image is None:print(f"無法讀取圖像: {file_path}")return None# 調整圖像大小resized_image = cv2.resize(image, size)return resized_imagedef create_hdf5_datasets(input_dir, output_dir, images_per_file=1000, max_workers=8):# 獲取所有文件的列表all_files = []for root, dirs, files in os.walk(input_dir):for file_name in files:file_path = os.path.join(root, file_name)all_files.append(file_path)# 確保輸出目錄存在if not os.path.exists(output_dir):os.makedirs(output_dir)# 獲取所有標簽并進行編碼all_labels = [os.path.basename(os.path.dirname(file)) for file in all_files]label_encoder = LabelEncoder()label_encoder.fit(all_labels)file_count = 0total_files = len(all_files)# 使用多線程處理圖像with ThreadPoolExecutor(max_workers=max_workers) as executor:for i in range(0, total_files, images_per_file):chunk_files = all_files[i:i + images_per_file]processed_images = list(tqdm(executor.map(process_image, chunk_files), total=len(chunk_files), desc=f"Processing chunk {file_count + 1}"))# 過濾掉 None 值processed_images = [img for img in processed_images if img is not None]# 創建標簽數據(假設標簽為文件夾名稱)labels = [os.path.basename(os.path.dirname(file)) for file in chunk_files if cv2.imread(file) is not None]encoded_labels = label_encoder.transform(labels)# 寫入 HDF5 文件output_hdf5 = os.path.join(output_dir, f'train_{file_count + 1}.hdf5')with h5py.File(output_hdf5, 'w') as f:dataset_images = f.create_dataset("images", (len(processed_images), 224, 224, 3), dtype='uint8')dataset_labels = f.create_dataset("labels", (len(encoded_labels),), dtype='int')for j, img in enumerate(processed_images):dataset_images[j] = imgdataset_labels[j] = encoded_labels[j]file_count += 1print(f"Created {output_hdf5} with {len(processed_images)} images")print(f"Total HDF5 files created: {file_count}")# 示例用法
input_directory_path = 'E:\\data\\train' # replace with your source directory
output_directory_path = 'E:\\data\\hdf5\\train' # destination directory for the HDF5 chunks
create_hdf5_datasets(input_directory_path, output_directory_path, images_per_file=50000) # create several HDF5 files
這里就是將圖片分成若干份,每一份50000張圖,主要是我電腦內存32G 無法一次性加載,所以分割了一下。
3. 加載以及訓練代碼
3.1 使用PIL等加載代碼
這個方式是一張張的加載圖片,加載后再處理成模型需要的尺寸,在一張張加載圖片的時候速度較慢,會影響訓練速度
# Custom dataset: reads (path, label) rows from a CSV and loads images
# one at a time with PIL.
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, transform=None):
        self.data_frame = pd.read_csv(csv_file)
        self.transform = transform
        # Encode the string labels into contiguous integers once, up front.
        self.label_encoder = LabelEncoder()
        self.data_frame['label'] = self.label_encoder.fit_transform(self.data_frame['label'])

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        relative_path = self.data_frame.iloc[idx, 0]  # image path (column 0)
        # NOTE(review): `train_file` is a module-level base path defined
        # elsewhere in the original script — confirm it is set before use.
        image = Image.open(train_file + relative_path).convert('RGB')
        encoded = self.data_frame.iloc[idx, 1]  # label, already integer-encoded
        label = torch.tensor(encoded, dtype=torch.long)
        if self.transform:
            image = self.transform(image)
        return image, label

# Image transform pipeline
# Input pipeline: resize -> tensor -> normalisation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # Mean/std are the standard ImageNet statistics.
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
3.2 使用OpenCV的方式來一張張加載代碼
OpenCV確實能加速一點IO的速度。
import os
import pandas as pd
import cv2 # 導入 OpenCV 庫
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm # 導入 tqdm 庫
import time# 定義數據路徑
data_path = 'E:\\data\\ImageNet2012\\ILSVRC2012_img_train\\'# 定義自定義數據集類
class CustomDataset(torch.utils.data.Dataset):def __init__(self, csv_file, data_path, transform=None):self.data_frame = pd.read_csv(csv_file)self.data_path = data_pathself.transform = transformself.label_encoder = LabelEncoder()self.data_frame['label'] = self.label_encoder.fit_transform(self.data_frame['label']) # 將標簽編碼為整數def __len__(self):return len(self.data_frame)def __getitem__(self, idx):start_time = time.time()data_load_time = time.time() - start_timeimg_name = self.data_frame.iloc[idx, 0] # 圖像相對路徑img_path = os.path.join(self.data_path, img_name) # 生成完整的圖像路徑image = cv2.imread(img_path) # 使用 OpenCV 讀取圖像image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # 將圖像從 BGR 轉換為 RGBimage = cv2.resize(image, (224, 224)) # 調整圖像大小label = self.data_frame.iloc[idx, 1] # 從表格中讀取標簽,此時標簽已經被編碼為整數label = torch.tensor(label, dtype=torch.long) # 將標簽轉換為張量data_to_device_time = time.time() - start_time - data_load_timeif self.transform:image = self.transform(image)forward_time = time.time() - start_time - data_load_time - data_to_device_timeprint(f"Data load time: {data_load_time:.4f}, Data to device time: {data_to_device_time:.4f}, Forward time: {forward_time:.4f}")return image, label# 定義圖像轉換
# ToTensor + normalisation only — resizing already happens in the
# dataset via cv2.resize. Mean/std are the standard ImageNet statistics.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# Build the dataset from the CSV that lives next to the images.
csv_file = os.path.join(data_path, 'train.csv')
dataset = CustomDataset(csv_file=csv_file, data_path=data_path, transform=transform)

# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Loaders: shuffle only the training data.
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=512, shuffle=False)

# Pretrained ResNet-18 with its final layer resized to the class count.
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(dataset.data_frame['label'].unique()))

# Run on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss and optimiser.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
def train_model(model, dataloader, criterion, optimizer):model.train()running_loss = 0.0for inputs, labels in tqdm(dataloader, desc="Training"): # 使用 tqdm 包裝 dataloaderinputs, labels = inputs.to(device), labels.to(device) # 將數據移動到 GPU 上optimizer.zero_grad()outputs = model(inputs)loss = criterion(outputs, labels)loss.backward()optimizer.step()running_loss += loss.item() * inputs.size(0)epoch_loss = running_loss / len(dataloader.dataset)print(f'Training Loss: {epoch_loss:.4f}')# 測試函數
def test_model(model, dataloader, criterion):model.eval()correct = 0total = 0running_loss = 0.0with torch.no_grad():for inputs, labels in tqdm(dataloader, desc="Validation"): # 使用 tqdm 包裝 dataloaderinputs, labels = inputs.to(device), labels.to(device) # 將數據移動到 GPU 上outputs = model(inputs)loss = criterion(outputs, labels)running_loss += loss.item() * inputs.size(0)_, predicted = torch.max(outputs, 1)total += labels.size(0)correct += (predicted == labels).sum().item()accuracy = correct / totalepoch_loss = running_loss / len(dataloader.dataset)print(f'Test Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}')# 訓練和驗證循環
# Train for a fixed number of epochs, validating after each one.
epochs = 25
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_model(model, train_dataloader, criterion, optimizer)
    print("Validation:")
    test_model(model, val_dataloader, criterion)
print("Done!")
3.3 h5的方式來加載大文件
HDF5Dataset 類在初始化時只加載文件索引,而不是加載所有數據。在 getitem 方法中,它會根據索引動態加載所需的 HDF5 文件,并從中讀取圖像和標簽。這可以確保在每次訪問數據時只加載當前需要的 HDF5 文件,并在使用完成后自動從內存中移除。
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from tqdm import tqdm
import h5py# 定義數據路徑
# Directories holding the chunked HDF5 shards produced in section 2.
train_data_path = 'E:\\data\\hdf5\\train'
val_data_path = 'E:\\data\\hdf5\\val'
class HDF5Dataset(Dataset):def __init__(self, hdf5_dir, transform=None):self.hdf5_files = [os.path.join(hdf5_dir, f) for f in os.listdir(hdf5_dir) if f.endswith('.hdf5')]self.transform = transformself.file_indices = []self.load_file_indices()def load_file_indices(self):for file_idx, hdf5_file in enumerate(self.hdf5_files):with h5py.File(hdf5_file, 'r') as f:num_images = f['images'].shape[0]self.file_indices.extend([(file_idx, i) for i in range(num_images)])def __len__(self):return len(self.file_indices)def __getitem__(self, idx):file_idx, image_idx = self.file_indices[idx]hdf5_file = self.hdf5_files[file_idx]with h5py.File(hdf5_file, 'r') as f:image = f['images'][image_idx]label = f['labels'][image_idx]if self.transform:image = self.transform(image)# 將標簽轉換為張量label = torch.tensor(label, dtype=torch.long)return image, label# 定義圖像轉換
# ToTensor + normalisation — the HDF5 shards already store 224x224
# images. Mean/std are the standard ImageNet statistics.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# Datasets over the pre-built HDF5 shards.
train_dataset = HDF5Dataset(hdf5_dir=train_data_path, transform=transform)
val_dataset = HDF5Dataset(hdf5_dir=val_data_path, transform=transform)

# Loaders: shuffle only the training data.
train_dataloader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=256, shuffle=False)

# BUG FIX: the original sized the head with
# len(set(train_dataset.file_indices)), which counts (file, index)
# pairs — i.e. the number of *images*, not classes. Count the distinct
# stored integer labels instead.
unique_labels = set()
for hdf5_file in train_dataset.hdf5_files:
    with h5py.File(hdf5_file, 'r') as f:
        unique_labels.update(int(lbl) for lbl in f['labels'][:])
num_classes = len(unique_labels)

# Pretrained ResNet-18 with its final layer resized to the class count.
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

# Run on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss and optimiser.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
def train_model(model, dataloader, criterion, optimizer):model.train()running_loss = 0.0for inputs, labels in tqdm(dataloader, desc="Training"): # 使用 tqdm 包裝 dataloaderinputs, labels = inputs.to(device), labels.to(device) # 將數據移動到 GPU 上optimizer.zero_grad()outputs = model(inputs)loss = criterion(outputs, labels)loss.backward()optimizer.step()running_loss += loss.item() * inputs.size(0)epoch_loss = running_loss / len(dataloader.dataset)print(f'Training Loss: {epoch_loss:.4f}')# 測試函數
def test_model(model, dataloader, criterion):model.eval()correct = 0total = 0running_loss = 0.0with torch.no_grad():for inputs, labels in tqdm(dataloader, desc="Validation"): # 使用 tqdm 包裝 dataloaderinputs, labels = inputs.to(device), labels.to(device) # 將數據移動到 GPU 上outputs = model(inputs)loss = criterion(outputs, labels)running_loss += loss.item() * inputs.size(0)_, predicted = torch.max(outputs, 1)total += labels.size(0)correct += (predicted == labels).sum().item()accuracy = correct / totalepoch_loss = running_loss / len(dataloader.dataset)print(f'Test Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.4f}')# 訓練和驗證循環
# Train for a fixed number of epochs, validating after each one and
# checkpointing every 5 epochs.
epochs = 25
model_save_path = 'model_checkpoint.pth'
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_model(model, train_dataloader, criterion, optimizer)
    print("Validation:")
    test_model(model, val_dataloader, criterion)
    if (t + 1) % 5 == 0:
        # BUG FIX: the original deleted the previous checkpoint *before*
        # saving the new one, so a crash mid-save lost both. Save to a
        # temp file and replace atomically instead.
        tmp_path = model_save_path + '.tmp'
        torch.save(model.state_dict(), tmp_path)
        os.replace(tmp_path, model_save_path)
        print(f"Model saved at epoch {t+1}")
print("Done!")
最后總結
我的電腦環境 i5 12400+4090+32G內存+固態。
但我的磁盤讀取速度才幾十MB/s,所以即使換成機械盤應該也問題不大。
然后我訓練的時間最快能達到45分鐘一個epoch,使用3.3章節中的代碼。
提升訓練速度的小技巧
- 不要開任務管理器,雖然開著很爽,但確實比較占用CPU的資源
- 不要開瀏覽器,瀏覽器中不知道運行了些什么東西會影響速度
- 不要開很多vscode,只保留debug的一個,能加速10分鐘