多模態分類案例實現

以下是基於飛槳平臺實現的多模態分類詳細案例，結合圖像和文本信息進行分類任務。案例包含數據處理、模型構建、訓練和評估的完整流程，並提供詳細注釋：

一、多模態分類案例實現

import os
import json
import numpy as np
from PIL import Image
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import Dataset, DataLoader
from paddle.vision import models
import paddlenlp as ppnlp
from paddlenlp.transformers import ErnieTokenizer, ErnieModel# 設置隨機種子，確保結果可復現
# Fix the NumPy and Paddle random seeds so that runs are reproducible.
np.random.seed(42)
paddle.seed(42)


# ---------------------- 1. Dataset definition ----------------------
class MultiModalDataset(Dataset):
    """Image-text dataset for multimodal classification.

    The annotation file is a JSON document indexed by sample position; every
    record is read through the keys ``image`` (file name joined onto
    ``image_dir``), ``text`` and ``label`` (must be a key of ``label2id``).

    Args:
        data_path: Path of the JSON annotation file.
        image_dir: Directory that contains the image files.
        tokenizer: Text tokenizer. It is called with the keyword arguments
            ``max_seq_len`` / ``pad_to_max_seq_len`` and must return a mapping
            with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_seq_len: Maximum text length handed to the tokenizer.
        mode: ``'train'``, ``'val'`` or ``'test'``. Only stored on the
            instance; it does not change how samples are produced.
    """

    # ImageNet channel mean / std (RGB order). Kept as float32 so the
    # normalisation does not silently upcast the whole image to float64.
    _IMAGE_MEAN = np.array([0.485, 0.456, 0.406], dtype='float32')
    _IMAGE_STD = np.array([0.229, 0.224, 0.225], dtype='float32')
    _IMAGE_SIZE = (224, 224)

    def __init__(self, data_path, image_dir, tokenizer, max_seq_len=128, mode='train'):
        super().__init__()
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.mode = mode

        # Load all annotations into memory.
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        # Category name -> class id (adjust to the dataset at hand).
        self.label2id = {'科技': 0, '娛樂': 1, '體育': 2, '財經': 3, '教育': 4}
        self.id2label = {v: k for k, v in self.label2id.items()}

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Returns:
            dict with ``image`` (float32, CHW, 3x224x224), ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label`` (all int64).

        Raises:
            KeyError: if the record misses a required key or its label is not
                listed in ``label2id``.
        """
        item = self.data[idx]
        image_path = os.path.join(self.image_dir, item['image'])
        text = item['text']

        raw_label = item['label']
        try:
            label = self.label2id[raw_label]
        except KeyError:
            # Same exception type as before, but say which sample is broken.
            raise KeyError(
                f'Unknown label {raw_label!r} in sample {idx}; '
                f'expected one of {list(self.label2id)}'
            ) from None

        # Image.open is lazy and keeps the file open; the context manager
        # releases the handle as soon as the RGB copy has been made.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image = self._preprocess_image(image)

        # Tokenize the text, padding it to max_seq_len.
        encoded_inputs = self.tokenizer(
            text=text,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        input_ids = paddle.to_tensor(encoded_inputs['input_ids'], dtype='int64')
        attention_mask = paddle.to_tensor(encoded_inputs['attention_mask'], dtype='int64')
        token_type_ids = paddle.to_tensor(encoded_inputs['token_type_ids'], dtype='int64')
        label = paddle.to_tensor(label, dtype='int64')

        return {
            'image': image,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'label': label,
        }

    def _preprocess_image(self, image):
        """Resize, scale to [0, 1], normalise and convert an RGB image to CHW.

        Args:
            image: A PIL image in RGB mode.

        Returns:
            A float32 tensor of shape [3, 224, 224].
        """
        image = image.resize(self._IMAGE_SIZE, Image.BICUBIC)
        pixels = np.array(image).astype('float32') / 255.0
        # Per-channel standardisation with the ImageNet statistics.
        pixels = (pixels - self._IMAGE_MEAN) / self._IMAGE_STD
        # HWC -> CHW, the layout the image encoder expects.
        pixels = np.transpose(pixels, (2, 0, 1))
        return paddle.to_tensor(pixels, dtype='float32')


# ---------------------- 2. Multimodal classification model ----------------------
class MultiModalClassifier(nn.Layer):
    """Classifier that fuses image (ResNet50) and text (ERNIE) features.

    Both modalities are projected to 512 dimensions, concatenated and passed
    through a small MLP before the final linear classifier.

    Args:
        num_classes: Number of output classes.
        text_encoder: Name of the pretrained ERNIE model to load.
        pretrained: Whether to load pretrained ResNet50 weights.
    """

    def __init__(self, num_classes, text_encoder='ernie-1.0', pretrained=True):
        super().__init__()

        # Image branch: ResNet50 with its final fully connected layer
        # replaced by an identity, so it yields the 2048-d pooled features.
        self.image_encoder = models.resnet50(pretrained=pretrained)
        self.image_encoder.fc = nn.Identity()
        self.image_proj = nn.Linear(2048, 512)

        # Text branch: the projection input size follows the encoder that was
        # actually loaded instead of assuming 768, so other ERNIE variants
        # passed through ``text_encoder`` do not fail with a shape mismatch.
        self.text_encoder = ErnieModel.from_pretrained(text_encoder)
        text_hidden_size = self._text_hidden_size(self.text_encoder)
        self.text_proj = nn.Linear(text_hidden_size, 512)

        # Fusion MLP over the concatenated features (512 + 512 = 1024).
        self.fusion = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

        self.classifier = nn.Linear(256, num_classes)

    @staticmethod
    def _text_hidden_size(encoder, default=768):
        """Read ``hidden_size`` from the encoder config, else use ``default``.

        The config may be a plain dict or an object with attributes, so both
        forms are handled. NOTE(review): which form applies depends on the
        installed PaddleNLP version - confirm against the deployed one.
        """
        config = getattr(encoder, 'config', None)
        if isinstance(config, dict):
            hidden_size = config.get('hidden_size')
        else:
            hidden_size = getattr(config, 'hidden_size', None)
        return int(hidden_size) if hidden_size else default

    def forward(self, image, input_ids, attention_mask, token_type_ids=None):
        """Compute class logits for a batch.

        Args:
            image: Image batch fed to the ResNet50 encoder.
            input_ids: Token ids of the text batch.
            attention_mask: Attention mask of the text batch.
            token_type_ids: Optional segment ids of the text batch.

        Returns:
            Logits of shape [batch_size, num_classes].
        """
        image_features = self.image_encoder(image)        # [batch_size, 2048]
        image_features = self.image_proj(image_features)  # [batch_size, 512]

        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Second output of the encoder is the pooled [CLS] representation.
        text_features = text_outputs[1]
        text_features = self.text_proj(text_features)     # [batch_size, 512]

        fused_features = paddle.concat([image_features, text_features], axis=1)  # [batch_size, 1024]
        fused_features = self.fusion(fused_features)       # [batch_size, 256]

        logits = self.classifier(fused_features)           # [batch_size, num_classes]
        return logits


# ---------------------- 3. Model training ----------------------
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs, save_dir):
    """Train the multimodal classifier and keep the best checkpoint.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation accuracy improves, the weights are written to
    ``<save_dir>/best_model.pdparams``.

    Args:
        model: The model to train.
        train_loader: Iterable of batches (dicts with ``image``, ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label``).
        val_loader: Loader used for the per-epoch validation.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss function called as ``criterion(logits, label)``.
        epochs: Number of training epochs.
        save_dir: Directory that receives the best checkpoint.
    """
    os.makedirs(save_dir, exist_ok=True)
    best_path = os.path.join(save_dir, 'best_model.pdparams')
    # Start below any reachable accuracy so the first epoch always produces a
    # checkpoint, even when the validation accuracy is exactly 0.
    best_acc = float('-inf')

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0

        for batch in train_loader:
            image = batch['image']
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            token_type_ids = batch['token_type_ids']
            label = batch['label']

            # Forward pass.
            logits = model(image, input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, label)

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            # float()/int() work for both 0-D and shape-[1] tensors, whereas
            # ``.numpy()[0]`` raises IndexError on 0-D results.
            train_loss += float(loss)
            num_batches += 1

            # Flatten the labels so a [B, 1] label batch is not broadcast
            # against the [B] predictions into a [B, B] comparison.
            flat_label = paddle.reshape(label, [-1])
            total += flat_label.shape[0]
            pred = paddle.argmax(logits, axis=1)
            correct += int((pred == flat_label).sum())

        # Guard against an empty loader instead of dividing by zero.
        mean_loss = train_loss / num_batches if num_batches else 0.0
        train_acc = correct / total if total else 0.0
        print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {mean_loss:.4f}, Train Acc: {train_acc:.4f}')

        # Validation.
        val_acc = evaluate_model(model, val_loader)
        print(f'Epoch [{epoch+1}/{epochs}], Val Acc: {val_acc:.4f}')

        # Keep the best model seen so far.
        if val_acc > best_acc:
            best_acc = val_acc
            paddle.save(model.state_dict(), best_path)
            print(f'Model saved at acc: {best_acc:.4f}')


# ---------------------- 4. Model evaluation ----------------------
def evaluate_model(model, data_loader):
    """Return the classification accuracy of ``model`` on ``data_loader``.

    The model is switched to evaluation mode and left in it; callers that
    continue training must call ``model.train()`` again.

    Args:
        model: The model to evaluate.
        data_loader: Iterable of batches (dicts with ``image``, ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``label``).

    Returns:
        Fraction of correctly classified samples, or 0.0 when the loader
        yields no samples.
    """
    model.eval()
    correct = 0
    total = 0

    with paddle.no_grad():
        for batch in data_loader:
            logits = model(
                batch['image'],
                batch['input_ids'],
                batch['attention_mask'],
                batch['token_type_ids'],
            )
            pred = paddle.argmax(logits, axis=1)

            # Flatten the labels so a [B, 1] label batch is not broadcast
            # against the [B] predictions into a [B, B] comparison.
            label = paddle.reshape(batch['label'], [-1])
            total += label.shape[0]
            # int() works for both 0-D and shape-[1] tensors, whereas
            # ``.numpy()[0]`` raises IndexError on 0-D results.
            correct += int((pred == label).sum())

    # Guard against an empty loader instead of dividing by zero.
    return correct / total if total else 0.0


# ---------------------- 5. Entry point ----------------------
def main():
    """Build the datasets and model, train, then report the best accuracy.

    The final figure is computed on the validation loader, since no separate
    test split is configured here.
    """
    config = {
        'train_data_path': 'data/train.json',  # training annotations
        'val_data_path': 'data/val.json',      # validation annotations
        'image_dir': 'data/images',            # image directory
        'save_dir': 'checkpoints',             # checkpoint directory
        'text_encoder': 'ernie-1.0',           # shared by tokenizer and model
        'num_classes': 5,                      # number of classes
        'batch_size': 16,                      # batch size
        'epochs': 10,                          # training epochs
        'learning_rate': 1e-4,                 # learning rate
        'max_seq_len': 128,                    # maximum text length
    }

    os.makedirs(config['save_dir'], exist_ok=True)

    # Tokenizer and text encoder must come from the same pretrained model, so
    # both read the name from a single config entry.
    tokenizer = ErnieTokenizer.from_pretrained(config['text_encoder'])

    train_dataset = MultiModalDataset(
        config['train_data_path'],
        config['image_dir'],
        tokenizer,
        config['max_seq_len'],
        mode='train',
    )
    val_dataset = MultiModalDataset(
        config['val_data_path'],
        config['image_dir'],
        tokenizer,
        config['max_seq_len'],
        mode='val',
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        num_workers=4,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config['batch_size'],
        shuffle=False,
        num_workers=4,
    )

    model = MultiModalClassifier(config['num_classes'], text_encoder=config['text_encoder'])

    criterion = nn.CrossEntropyLoss()
    optimizer = paddle.optimizer.AdamW(
        learning_rate=config['learning_rate'],
        parameters=model.parameters(),
    )

    train_model(model, train_loader, val_loader, optimizer, criterion, config['epochs'], config['save_dir'])

    # Reload the best checkpoint if training produced one; otherwise fall
    # back to the final weights instead of crashing on a missing file.
    best_path = os.path.join(config['save_dir'], 'best_model.pdparams')
    if os.path.exists(best_path):
        model.set_state_dict(paddle.load(best_path))
    else:
        print(f'No checkpoint found at {best_path}; evaluating the final weights instead.')

    # NOTE: this is measured on the validation loader.
    test_acc = evaluate_model(model, val_loader)
    print(f'Final Test Accuracy: {test_acc:.4f}')


if __name__ == '__main__':
    main()

二、數據集格式說明

數據集採用JSON格式，每條數據包含圖像路徑、文本描述和類別標籤：

[
  {
    "image": "image_001.jpg",
    "text": "這款新手機的相機功能非常出色，拍照效果堪比專業相機",
    "label": "科技"
  },
  {
    "image": "image_002.jpg",
    "text": "這支足球隊在本賽季表現出色，有望奪得冠軍",
    "label": "體育"
  },
  ...
]