目錄
1. 速度 vs 精度分析
mvit:
r2plus1d_r50 推理代碼:
x3d_xs推理代碼:
R(2+1)D
X3D(輕量級,速度快)
I3D(經典 3D CNN)
替換分類層(適配你的任務)
https://github.com/facebookresearch/pytorchvideo/blob/main/docs/source/model_zoo.md
arch | depth | pretrain | frame length x sample rate | top 1 | top 5 | Flops (G) x views | Params (M) | Model |
---|---|---|---|---|---|---|---|---|
C2D | R50 | - | 8x8 | 71.46 | 89.68 | 25.89 x 3 x 10 | 24.33 | link |
I3D | R50 | - | 8x8 | 73.27 | 90.70 | 37.53 x 3 x 10 | 28.04 | link |
Slow | R50 | - | 4x16 | 72.40 | 90.18 | 27.55 x 3 x 10 | 32.45 | link |
Slow | R50 | - | 8x8 | 74.58 | 91.63 | 54.52 x 3 x 10 | 32.45 | link |
SlowFast | R50 | - | 4x16 | 75.34 | 91.89 | 36.69 x 3 x 10 | 34.48 | link |
SlowFast | R50 | - | 8x8 | 76.94 | 92.69 | 65.71 x 3 x 10 | 34.57 | link |
SlowFast | R101 | - | 8x8 | 77.90 | 93.27 | 127.20 x 3 x 10 | 62.83 | link |
SlowFast | R101 | - | 16x8 | 78.70 | 93.61 | 215.61 x 3 x 10 | 53.77 | link |
CSN | R101 | - | 32x2 | 77.00 | 92.90 | 75.62 x 3 x 10 | 22.21 | link |
R(2+1)D | R50 | - | 16x4 | 76.01 | 92.23 | 76.45 x 3 x 10 | 28.11 | link |
X3D | XS | - | 4x12 | 69.12 | 88.63 | 0.91 x 3 x 10 | 3.79 | link |
X3D | S | - | 13x6 | 73.33 | 91.27 | 2.96 x 3 x 10 | 3.79 | link |
X3D | M | - | 16x5 | 75.94 | 92.72 | 6.72 x 3 x 10 | 3.79 | link |
X3D | L | - | 16x5 | 77.44 | 93.31 | 26.64 x 3 x 10 | 6.15 | link |
MViT | B | - | 16x4 | 78.85 | 93.85 | 70.80 x 1 x 5 | 36.61 | link |
MViT | B | - | 32x3 | 80.30 | 94.69 | 170.37 x 1 x 5 | 36.61 | link |
1. 速度 vs 精度分析
模型 | 計算量/速度 | 精度 | 適合你需求的程度 |
---|---|---|---|
X3D | ⚡⚡⚡⚡⚡⚡ 最快 | ⭐⭐⭐⭐ 較高 | 🏆 最佳選擇 |
R(2+1)D | ⚡⚡⚡ 中等 | ⭐⭐⭐⭐⭐ 很高 | ⭐⭐⭐ 不錯但稍慢 |
I3D | ⚡⚡ 最慢 | ⭐⭐⭐⭐ 較高 | ⭐⭐ 不太適合 |
pip install pytorchvideo
import torch
from pytorchvideo.models import hub

# Option 1: build the R(2+1)D-R50 network through the hub module without
# loading pretrained weights.
backbone = hub.r2plus1d_r50(pretrained=False)

# Option 2: fetch the same entry point through torch.hub with pretrained
# weights. This assignment replaces the model built above.
backbone = torch.hub.load(
    "facebookresearch/pytorchvideo",
    model="r2plus1d_r50",
    pretrained=True,
)
mvit:
import torch
import time
from torch import nn# 如果你用的是官方實現,可以替換成 mvit_v2_s/mvit_v2_b 等
# pip install torchvision>=0.13
from torchvision.models.video import mvit_v2_s


def val_mvit_inference_speed(device="cuda", warmup=10, test_iter=50):
    """Measure the average forward-pass latency of MViT-v2-S on random input.

    A randomly initialised ``mvit_v2_s`` is run on one dummy clip of shape
    (N=1, C=3, T=16, H=224, W=224). The average time per forward pass and the
    derived throughput are printed; nothing is returned.

    Args:
        device: Device to benchmark on, e.g. ``"cuda"`` or ``"cpu"``. A
            ``torch.device`` is accepted as well.
        warmup: Number of untimed forward passes run first, so that one-time
            start-up costs do not distort the measurement.
        test_iter: Number of timed forward passes. Must be at least 1.

    Raises:
        ValueError: If ``test_iter`` is smaller than 1.
    """
    if test_iter < 1:
        raise ValueError("test_iter must be >= 1")

    # Works for both "cuda:0"-style strings and torch.device objects.
    on_cuda = torch.device(device).type == "cuda"

    # 1. Build the model. `weights=None` is the non-deprecated spelling of
    #    `pretrained=False`.
    model = mvit_v2_s(weights=None).to(device)
    model.eval()

    # 2. One random clip: 16 RGB frames at 224x224, laid out as (N, C, T, H, W).
    dummy_input = torch.randn(1, 3, 16, 224, 224).to(device)

    with torch.no_grad():
        # 3. Warm-up passes (not timed).
        for _ in range(warmup):
            model(dummy_input)

        # 4. Timed passes. CUDA kernels run asynchronously, so synchronise
        #    before reading the clock on either side of the loop.
        if on_cuda:
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(test_iter):
            model(dummy_input)
        if on_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

    avg_time = elapsed / test_iter
    fps = 1 / avg_time
    print(f"設備: {device}")
    print(f"平均推理時間: {avg_time * 1000:.2f} ms/iter")
    print(f"推理速度: {fps:.2f} FPS")


if __name__ == "__main__":
    val_mvit_inference_speed("cuda")  # GPU benchmark
    # val_mvit_inference_speed("cpu")  # CPU benchmark
r2plus1d_r50 推理代碼:
224*224 分類需要60ms
import time

import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"使用設備: {device}")


def load_x3d_xs_model():
    """Load the pretrained hub model used for this benchmark.

    Despite the function name, the entry point loaded here is
    ``r2plus1d_r50``. The X3D entry points ``x3d_xs`` and ``x3d_s`` were the
    earlier alternatives and can be loaded the same way by changing the
    ``model`` argument.

    Returns:
        The model moved to the module-level ``device`` and set to eval mode.
    """
    model = torch.hub.load(
        "facebookresearch/pytorchvideo",
        model="r2plus1d_r50",
        pretrained=True,
    )
    model = model.to(device)
    model.eval()
    return model


def preprocess_for_x3d_xs(video_frames, target_size=182, crop_size=72):
    """Turn a list of frames into a normalised clip tensor.

    Each frame is resized (shorter side to ``target_size``), centre-cropped to
    ``crop_size`` x ``crop_size``, converted to a tensor and normalised with
    mean 0.45 and std 0.225 per channel.

    Args:
        video_frames: Sequence of frames, each a PIL image or an HxWx3
            ``np.ndarray``. When more than 13 frames are given, 13 are
            sampled uniformly. Shorter inputs are passed through unchanged.
        target_size: Size passed to ``transforms.Resize``.
        crop_size: Side length of the centre crop.

    Returns:
        Tensor of shape [1, C, T, crop_size, crop_size].

    Raises:
        ValueError: If ``video_frames`` is empty.
    """
    if len(video_frames) == 0:
        raise ValueError("video_frames 為空,無法預處理")

    mean = [0.45, 0.45, 0.45]
    std = [0.225, 0.225, 0.225]
    num_frames = 13  # clip length fed to the model

    # Uniformly subsample clips that are longer than the target length.
    if len(video_frames) > num_frames:
        indices = np.linspace(0, len(video_frames) - 1, num_frames, dtype=int)
        video_frames = [video_frames[i] for i in indices]

    transform = transforms.Compose([
        transforms.Resize(target_size),
        transforms.CenterCrop(crop_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ])

    processed_frames = []
    for frame in video_frames:
        # The torchvision pipeline above starts from PIL images.
        if isinstance(frame, np.ndarray):
            frame = Image.fromarray(frame)
        processed_frames.append(transform(frame))

    # Stack to [T, C, H, W], reorder to [C, T, H, W], then add the batch axis.
    video_tensor = torch.stack(processed_frames).permute(1, 0, 2, 3)
    return video_tensor.unsqueeze(0)


# Usage example
model = load_x3d_xs_model()
# `device` may have fallen back to the CPU; only synchronise when CUDA is used.
use_cuda = device.type == "cuda"

for _ in range(10):
    # 13 random 200x200 RGB frames stand in for a real clip.
    dummy_frames = [
        np.random.randint(0, 255, (200, 200, 3), dtype=np.uint8)
        for _ in range(13)
    ]
    input_tensor = preprocess_for_x3d_xs(dummy_frames)
    print(input_tensor.shape)
    input_tensor = input_tensor.to(device)

    start = time.perf_counter()
    with torch.no_grad():
        output = model(input_tensor)
    if use_cuda:
        # CUDA kernels are asynchronous; wait for them before stopping the clock.
        torch.cuda.synchronize()
    print(f"輸出形狀: {output.shape}", time.perf_counter() - start)
x3d_xs推理代碼:
import os

import torch
from torch import nn

# Set before pytorchvideo is imported, as in the original statement order.
# NOTE(review): presumably this must be in place at import time - confirm.
os.environ["FFCV_DISABLE_IOPATH"] = "1"

import torch.nn.functional as F  # noqa: E402
from pytorchvideo.models.x3d import create_x3d  # noqa: E402
import torchvision.transforms as T
from torchvision.io import read_video
import numpy as np
from PIL import Image
import cv2
import os


class VideoNormalize(nn.Module):
    """Channel-wise normalisation for a video tensor laid out as [C, T, H, W]."""

    def __init__(self, mean, std):
        """Store per-channel ``mean`` and ``std`` as buffers.

        Args:
            mean: Sequence with one mean value per channel.
            std: Sequence with one standard deviation per channel.
        """
        super().__init__()
        # Reshaped to [C, 1, 1, 1] so the statistics broadcast over T, H and W.
        self.register_buffer("mean", torch.tensor(mean).view(-1, 1, 1, 1))
        self.register_buffer("std", torch.tensor(std).view(-1, 1, 1, 1))

    def forward(self, x):
        """Return ``(x - mean) / std`` for ``x`` of shape [C, T, H, W]."""
        return (x - self.mean) / self.std


class X3DVideoClassifier:
    """Video classifier built on a pretrained PyTorchVideo X3D hub model.

    The final projection layer is replaced by a new ``nn.Linear`` with
    ``num_classes`` outputs. That layer is freshly initialised, so the
    predictions are only meaningful after it has been trained.
    """

    # Hub entry points accepted as ``model_type``.
    SUPPORTED_MODELS = ("x3d_xs", "x3d_s", "x3d_m", "x3d_l")

    def __init__(self, model_type='x3d_xs', num_classes=2, device='auto'):
        """Load the model and set up preprocessing.

        Args:
            model_type: Hub entry point name, one of ``SUPPORTED_MODELS``.
            num_classes: Number of output classes of the new head.
            device: ``'cuda'``, ``'cpu'`` or ``'auto'`` (CUDA when available).

        Raises:
            ValueError: If ``model_type`` is not supported.
        """
        self.model_type = model_type
        self.num_classes = num_classes
        self.device = device
        if device == 'auto':
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self._load_model()
        self._setup_transforms()

    def _load_model(self):
        """Load the pretrained backbone named by ``model_type`` and swap its head."""
        if self.model_type not in self.SUPPORTED_MODELS:
            raise ValueError(f"不支持的模型類型: {self.model_type}")

        # Load the variant that was actually requested, so the log line below
        # describes the model that is in use.
        self.model = torch.hub.load(
            "facebookresearch/pytorchvideo", self.model_type, pretrained=True
        )

        # Replace the final classification layer with one sized for this task.
        in_features = self.model.blocks[-1].proj.in_features
        self.model.blocks[-1].proj = torch.nn.Linear(in_features, self.num_classes)

        self.model.to(self.device)
        self.model.eval()
        print(f"已加載 {self.model_type} 模型到 {self.device}")

    def _setup_transforms(self):
        """Build the preprocessing pipeline for a float [T, H, W, C] tensor.

        NOTE(review): the pretrained X3D models are built for a specific crop
        size; confirm that 72x72 input is large enough for the head's pooling.
        """
        self.transform = T.Compose([
            T.Lambda(lambda x: x / 255.0),  # scale pixel values to [0, 1]
            T.Lambda(lambda x: x.permute(3, 0, 1, 2)),  # [T, H, W, C] -> [C, T, H, W]
            T.Resize((72, 72)),  # resizes the trailing H and W axes
            VideoNormalize(mean=[0.45, 0.45, 0.45], std=[0.225, 0.225, 0.225]),
        ])

    def load_video(self, video_path, max_frames=16):
        """Read the first ``max_frames`` frames of a video as RGB.

        Args:
            video_path: Path of the video file.
            max_frames: Maximum number of frames to read.

        Returns:
            Float tensor of shape [T, H, W, C].

        Raises:
            FileNotFoundError: If ``video_path`` does not exist.
            ValueError: If no frame could be read.
        """
        if not os.path.exists(video_path):
            raise FileNotFoundError(f"視頻文件不存在: {video_path}")

        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while len(frames) < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break
                # OpenCV decodes to BGR; the model pipeline expects RGB.
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        finally:
            # Release the capture handle even if decoding or conversion fails.
            cap.release()

        if len(frames) == 0:
            raise ValueError("無法從視頻中讀取幀")

        return torch.from_numpy(np.array(frames)).float()

    def preprocess_video(self, video_tensor):
        """Apply the transform pipeline and add the batch axis.

        Args:
            video_tensor: Float tensor of shape [T, H, W, C].

        Returns:
            Tensor of shape [1, C, T, H, W] on ``self.device``.
        """
        processed = self.transform(video_tensor)
        processed = processed.unsqueeze(0)
        return processed.to(self.device)

    def _classify(self, input_tensor, class_names=None):
        """Run the model on one preprocessed clip and package the result.

        Args:
            input_tensor: Batch of exactly one clip, as returned by
                ``preprocess_video``.
            class_names: Optional list of labels indexed by class id.

        Returns:
            Dict with keys ``predicted_class``, ``confidence``,
            ``class_index`` and ``probabilities`` (1-D numpy array).
        """
        with torch.no_grad():
            outputs = self.model(input_tensor)
            probabilities = F.softmax(outputs, dim=1)
            confidence, predicted_idx = torch.max(probabilities, 1)

        confidence = confidence.item()
        predicted_idx = predicted_idx.item()

        # Fall back to a generic label when no name is known for the index.
        if class_names and len(class_names) > predicted_idx:
            predicted_class = class_names[predicted_idx]
        else:
            predicted_class = f"Class {predicted_idx}"

        return {
            'predicted_class': predicted_class,
            'confidence': confidence,
            'class_index': predicted_idx,
            'probabilities': probabilities.cpu().numpy()[0],
        }

    def predict(self, video_path, class_names=None):
        """Classify the video stored at ``video_path``.

        Args:
            video_path: Path of the video file.
            class_names: Optional list of labels indexed by class id.

        Returns:
            Result dict with the keys ``predicted_class``, ``confidence``,
            ``class_index`` and ``probabilities``.
        """
        print(f"正在加載視頻: {video_path}")
        video_tensor = self.load_video(video_path)
        print(f"視頻幀數: {video_tensor.shape[0]}")

        input_tensor = self.preprocess_video(video_tensor)
        print(f"輸入張量形狀: {input_tensor.shape}")

        return self._classify(input_tensor, class_names)

    def predict_from_tensor(self, video_tensor, class_names=None):
        """Classify a clip that is already in memory as a [T, H, W, C] tensor.

        Returns the same result dict as ``predict``.
        """
        input_tensor = self.preprocess_video(video_tensor)
        return self._classify(input_tensor, class_names)


# Usage example
def main():
    """Demo: classify a video file, falling back to a random clip on any error."""
    # Two-class classifier on the X3D-S variant; the device is picked automatically.
    classifier = X3DVideoClassifier(
        model_type='x3d_s',
        num_classes=2,
        device='auto',
    )

    # Class labels (adapt to your task).
    class_names = ["類別A", "類別B"]

    def label_for(index):
        # Generic label when the index has no configured name.
        if index < len(class_names):
            return class_names[index]
        return f"Class {index}"

    # Example 1: predict from a file on disk.
    video_path = r"C:\Users\Administrator\Videos\xiaoxia.mp4"
    separator = "=" * 50
    try:
        result = classifier.predict(video_path, class_names)
        print("\n" + separator)
        print("視頻分類結果:")
        print(f"預測類別: {result['predicted_class']}")
        print(f"置信度: {result['confidence']:.4f}")
        print(f"類別索引: {result['class_index']}")
        print("各類別概率:")
        for i, prob in enumerate(result['probabilities']):
            class_name = label_for(i)
            print(f" {class_name}: {prob:.4f}")
        print(separator)
    except Exception as e:
        # Deliberate catch-all so the demo still runs without the video file.
        print(f"錯誤: {e}")
        print("使用隨機張量進行演示...")

        # Example 2: run the same pipeline on a random [T, H, W, C] clip.
        random_video = torch.randn(16, 72, 72, 3)
        result = classifier.predict_from_tensor(random_video, class_names)
        print("\n隨機張量演示結果:")
        print(f"預測類別: {result['predicted_class']}")
        print(f"置信度: {result['confidence']:.4f}")


if __name__ == "__main__":
    main()
R(2+1)D
R(2+1)D 將 3D 卷積分解為空間 2D 卷積和時間 1D 卷積,在性能和效率上取得了很好的平衡。
import torch
from pytorchvideo.models import resnet  # kept from the original snippet; unused below

# R(2+1)D with an R50 backbone and pretrained weights.
# The model builders in pytorchvideo.models take no `pretrained` argument;
# pretrained checkpoints are exposed through the torch.hub entry points.
# The model zoo table lists R(2+1)D only with the R50 backbone.
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    model="r2plus1d_r50",
    pretrained=True,
)
X3D(輕量級,速度快)
import torch
from pytorchvideo.models import x3d  # kept from the original snippet; unused below

# X3D-M with pretrained weights. The variant is selected by the hub entry
# point name: "x3d_xs", "x3d_s", "x3d_m" or "x3d_l".
# `x3d.create_x3d` takes no `pretrained`, `model_depth` or `model_variant`
# argument, so the pretrained model is loaded through torch.hub instead.
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    model="x3d_m",
    pretrained=True,
)
I3D(經典 3D CNN)
import torch

# I3D with an R50 backbone and pretrained weights, loaded through the
# "i3d_r50" torch.hub entry point.
# NOTE(review): the original snippet imported `pytorchvideo.models.i3d` and
# called `create_kinetics_resnet50`; confirm against the PyTorchVideo docs
# that neither exists before relying on this replacement.
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    model="i3d_r50",
    pretrained=True,
)
替換分類層(適配你的任務)
假設你的視頻只有 num_classes=5 個類別:
num_classes = 5

# Swap the final classification layer for one with `num_classes` outputs.
# Models that expose a `blocks` list keep the projection in the last block;
# the X3D classifier earlier in these notes replaces `blocks[-1].proj` too.
# Anything else is assumed to expose it as `head.proj`.
head = model.blocks[-1] if hasattr(model, "blocks") else model.head
head.proj = torch.nn.Linear(head.proj.in_features, num_classes)