我想開發一個基于深度學習的分類小軟件,逐漸了解到了TensorRT在模型推理速度上的優勢,經過一下午資料的查找實現了將onnx模型轉為TensorRT格式模型的推理及測試過程。將實現過程記錄下來方便日后查看。
本文實驗設備是MX350顯卡 2G顯存
一 、安裝TensorRT
點擊TensorRT下載鏈接,選擇合適的TensorRT版本下載,讀者選擇使用TensorRT進行推理,默認已經配置好cuda和cudnn環境,如果沒配置好請移步這篇博客Windows配置深度學習環境(從查詢合適的torch版本開始)——torch+CUDA+cuDNN
TensorRT與cuda版本對應方式查看如下:
-
點擊TensorRT版本
-
點擊同意
-
點擊版本號
-
查看cuda版本是否符合你設備,點擊下載即可
二、環境配置
- 下載后得到文件結構如下所示
- 添加環境變量,右鍵此電腦點擊屬性,根據圖中序號依次點擊并添加環境變量
我的環境變量如下所示
D:\Software\TensorRT-8.6.1.6\lib
D:\Software\TensorRT-8.6.1.6\bin
三、模型轉換
打開命令行窗口,切換到D:\Software\TensorRT-8.6.1.6\bin目錄,執行如下命令
trtexec --onnx=mymodel.onnx --saveEngine=model.trt --fp16
這里的–fp16應該也可以改成int8,但是精度損失會有點大,我沒有實驗
這個mymodel.onnx需要你自己的onnx文件名,這個model.trt 就隨便起名字了
如下圖所示為轉換成功
四、TensorRT與ONNX推理速度與精度測試
推理時間測試
- TensorRT推理時間測試代碼
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import os
import numpy as np
import time
from typing import Tupledef softmax(x: np.ndarray) -> np.ndarray:"""計算softmax"""e_x = np.exp(x - np.max(x)) # 防止數值溢出return e_x / e_x.sum(axis=0)class TensorRTPredictor:def __init__(self, engine_path: str):"""初始化TensorRT預測器"""self.logger = trt.Logger(trt.Logger.WARNING)self.engine = self._load_engine(engine_path)self.context = self.engine.create_execution_context()self.input_shape = tuple(self.engine.get_tensor_shape(self.engine.get_tensor_name(0)))self.output_shape = tuple(self.engine.get_tensor_shape(self.engine.get_tensor_name(1)))self.is_warmed_up = Falsedef _load_engine(self, engine_path: str) -> trt.ICudaEngine:"""加載TensorRT引擎"""load_start_time = time.time()with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:engine = runtime.deserialize_cuda_engine(f.read())load_end_time = time.time()load_time = (load_end_time - load_start_time) * 1000print(f"加載引擎時間: {load_time:.2f} ms")return enginedef preprocess_image(self, image_path: str) -> np.ndarray:"""圖像預處理"""preprocess_start_time = time.time()if not os.path.exists(image_path):raise FileNotFoundError(f"圖像文件不存在: {os.path.abspath(image_path)}")image = cv2.imread(image_path)if image is None:raise ValueError("無法讀取圖像,請檢查文件格式和完整性")try:image = cv2.resize(image, (224, 224))image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)image = np.ascontiguousarray(image.transpose(2, 0, 1).astype(np.float32) / 255.0)mean = np.array([0.362, 0.279, 0.258]).reshape(3, 1, 1)std = np.array([0.222, 0.191, 0.185]).reshape(3, 1, 1)image = (image - mean) / stdexcept Exception as e:raise RuntimeError(f"圖像預處理失敗: {str(e)}")preprocess_end_time = time.time()preprocess_time = (preprocess_end_time - preprocess_start_time) * 1000print(f" 預處理時間: {preprocess_time:.2f} ms")return imagedef warmup(self, iterations: int = 10):"""模型預熱"""if self.is_warmed_up:print("模型已經預熱,跳過預熱步驟")returnwarmup_start_time = time.time()input_size = int(np.prod(self.input_shape)) * np.float32().itemsizeoutput_size = int(np.prod(self.output_shape)) * np.float32().itemsized_input = cuda.mem_alloc(input_size)d_output = cuda.mem_alloc(output_size)stream = cuda.Stream()dummy_input = np.random.rand(*self.input_shape).astype(np.float32)for _ in range(iterations):cuda.memcpy_htod_async(d_input, dummy_input, stream)self.context.execute_async_v2(bindings=[int(d_input), int(d_output)],stream_handle=stream.handle)stream.synchronize()d_input.free()d_output.free()warmup_end_time = time.time()warmup_time = (warmup_end_time - warmup_start_time) * 1000print(f" 預熱時間: {warmup_time:.2f} ms")self.is_warmed_up = Truedef infer(self, image: np.ndarray) -> Tuple[float, np.ndarray]:"""執行TensorRT推理"""if not self.is_warmed_up:print("警告:模型尚未預熱,推理性能可能受影響")input_size = int(np.prod(self.input_shape)) * np.float32().itemsizeoutput_size = int(np.prod(self.output_shape)) * np.float32().itemsized_input = cuda.mem_alloc(input_size)d_output = cuda.mem_alloc(output_size)stream = cuda.Stream()input_data = np.ascontiguousarray(np.expand_dims(image, axis=0), dtype=np.float32)# 正式推理infer_start_time = time.time()cuda.memcpy_htod_async(d_input, input_data, stream)self.context.execute_async_v2(bindings=[int(d_input), int(d_output)],stream_handle=stream.handle)stream.synchronize()infer_end_time = time.time()infer_time = (infer_end_time - infer_start_time) * 1000print(f" TensorRT 推理時間: {infer_time:.2f} ms")# 獲取輸出output_data = np.empty(self.output_shape, dtype=np.float32)output_start_time = time.time()cuda.memcpy_dtoh_async(output_data, d_output, stream)stream.synchronize()output_end_time = time.time()output_time = (output_end_time - output_start_time) * 1000print(f" 獲取輸出時間: {output_time:.2f} ms")# 置信度confidence = softmax(output_data[0])d_input.free()d_output.free()return infer_time, output_data, confidenceif __name__ == "__main__":# 配置路徑PATHS = {"image_folder": "D:/Desktop/DATA/balance_bei_liao_hu/temp", # 圖片文件夾路徑"engine": "mnv4.engine" # TensorRT引擎文件路徑}# 驗證文件夾和文件存在if not os.path.exists(PATHS["image_folder"]):print(f"錯誤: 圖片文件夾不存在 -> {os.path.abspath(PATHS['image_folder'])}")exit(1)if not os.path.exists(PATHS["engine"]):print(f"錯誤: 引擎文件不存在 -> {os.path.abspath(PATHS['engine'])}")exit(1)# 獲取文件夾中所有圖片文件(包括子文件夾)image_files = []for root, _, files in os.walk(PATHS["image_folder"]):for file in files:if file.endswith(('.jpg', '.png', '.bmp', '.jpeg')):image_files.append(os.path.join(root, file))if not image_files:print(f"錯誤: 文件夾中沒有圖片文件 -> {PATHS['image_folder']}")exit(1)# 初始化預測器predictor = TensorRTPredictor(PATHS["engine"])predictor.warmup(iterations=10) # 預熱模型total_time = 0total_preprocess_time = 0for image_path in image_files:try:print(f"處理圖片: {image_path}")img = predictor.preprocess_image(image_path)trt_time, trt_out, trt_confidence = predictor.infer(img)print(f" TensorRT 輸出: {np.argmax(trt_out)} (置信度: {np.max(trt_confidence):.4f})")total_time += trt_timeexcept Exception as e:print(f"處理圖片時出錯: {image_path} -> {str(e)}")avg_time = total_time / len(image_files)print(f"\n平均推理時間: {avg_time:.2f} ms")
這里TensorRT推理150張224×224圖片平均速度為5.50ms
- ONNX推理時間測試代碼
import onnxruntime as ort
import numpy as np
from PIL import Image
from torchvision import transforms
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
import time
import osdef softmax(x: np.ndarray) -> np.ndarray:"""計算softmax"""e_x = np.exp(x - np.max(x)) # 防止數值溢出return e_x / e_x.sum(axis=0)class ONNXPredictor:def __init__(self, model_path="mobilenetv4_hybrid_medium.onnx", size=224):# 自動檢測可用providerself.providers = self._get_available_providers()print(f"可用推理后端: {self.providers}")# 初始化ONNX Runtime會話self.session = ort.InferenceSession(model_path, providers=self.providers)# 獲取當前使用的provider信息current_provider = self.session.get_providers()print(f"實際使用的推理后端: {current_provider}")# 獲取輸入輸出名稱self.input_name = self.session.get_inputs()[0].nameself.output_name = self.session.get_outputs()[0].name# 預處理變換self.transform = self.build_transform(size)# 預熱標志self.is_warmed_up = Falsedef _get_available_providers(self):"""獲取可用的推理后端,優先使用CUDA且僅使用CUDA(如果可用)"""available_providers = ort.get_available_providers()# 優先使用CUDA且僅使用CUDAif 'CUDAExecutionProvider' in available_providers:return ['CUDAExecutionProvider'] # 僅返回CUDA# 如果沒有CUDA,則回退到CPUelif 'CPUExecutionProvider' in available_providers:return ['CPUExecutionProvider']else:raise RuntimeError("沒有可用的執行提供程序(既沒有CUDA也沒有CPU)")def build_transform(self, size: int):"""構建圖像預處理流水線"""return transforms.Compose([transforms.Resize(size, interpolation=transforms.InterpolationMode.BICUBIC),transforms.CenterCrop(size),transforms.ToTensor(),transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)])def preprocess(self, image):"""預處理圖像"""# 如果輸入是文件路徑,先加載圖像if isinstance(image, str):image = Image.open(image).convert('RGB')# 應用變換并添加batch維度return self.transform(image).unsqueeze(0).numpy()def warmup(self, iterations=10):"""預熱模型"""dummy_input = np.random.rand(1, 3, 224, 224).astype(np.float32)for _ in range(iterations):self.session.run([self.output_name], {self.input_name: dummy_input})self.is_warmed_up = Trueprint(f"模型已預熱 {iterations} 次")def predict(self, image):"""執行預測"""# 預處理input_data = self.preprocess(image)# 運行模型outputs = self.session.run([self.output_name], {self.input_name: input_data})[0]confidence = softmax(outputs[0])return outputs, confidenceif __name__ == "__main__":# 配置路徑PATHS = {"image_folder": "D:/Desktop/DATA/balance_bei_liao_hu/temp", # 圖片文件夾路徑"model_path": "mobilenetv4_hybrid_medium.onnx" # ONNX模型文件路徑}# 驗證文件夾和文件存在if not os.path.exists(PATHS["image_folder"]):print(f"錯誤: 圖片文件夾不存在 -> {os.path.abspath(PATHS['image_folder'])}")exit(1)if not os.path.exists(PATHS["model_path"]):print(f"錯誤: 模型文件不存在 -> {os.path.abspath(PATHS['model_path'])}")exit(1)# 獲取文件夾中所有圖片文件(包括子文件夾)image_files = []for root, _, files in os.walk(PATHS["image_folder"]):for file in files:if file.endswith(('.jpg', '.png', '.bmp', '.jpeg')):image_files.append(os.path.join(root, file))if not image_files:print(f"錯誤: 文件夾中沒有圖片文件 -> {PATHS['image_folder']}")exit(1)# 初始化預測器predictor = ONNXPredictor(model_path=PATHS["model_path"], size=224)predictor.warmup(iterations=10) # 預熱模型total_time = 0for image_path in image_files:try:print(f"處理圖片: {image_path}")start_time = time.time()predictions, confidence = predictor.predict(image_path)end_time = time.time()inference_time = (end_time - start_time) * 1000 # 轉換為毫秒print(f" ONNX 推理時間: {inference_time:.2f} ms")print(f" ONNX 輸出: {np.argmax(predictions)} (置信度: {np.max(confidence):.4f})")total_time += inference_timeexcept Exception as e:print(f"處理圖片時出錯: {image_path} -> {str(e)}")avg_time = total_time / len(image_files)print(f"\n平均推理時間: {avg_time:.2f} ms")
兩種格式的模型分別預測了150張尺寸為224×224的三類圖片,每一類有50張,調用TensorRT平均每張圖片需要5.17ms,而onnx平均每張圖片需要11.11ms,TensorRT模型的推理速度縮短為onnx的二分之一,根據查找的資料顯示,轉換后的模型推理時間的縮短可能與設備有關。
精度測試
- TensorRT推理代碼
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import os
import numpy as np
import time
from typing import Tuple
from sklearn.metrics import classification_report, accuracy_score, f1_score
from collections import Counterclass TensorRTPredictor:def __init__(self, engine_path: str):"""初始化TensorRT預測器"""self.logger = trt.Logger(trt.Logger.WARNING)self.engine = self._load_engine(engine_path)self.context = self.engine.create_execution_context()self.input_shape = tuple(self.engine.get_tensor_shape(self.engine.get_tensor_name(0)))self.output_shape = tuple(self.engine.get_tensor_shape(self.engine.get_tensor_name(1)))self.is_warmed_up = Falseself.warmup(iterations=10) # 在初始化時進行預熱def _load_engine(self, engine_path: str) -> trt.ICudaEngine:"""加載TensorRT引擎"""load_start_time = time.time()with open(engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:engine = runtime.deserialize_cuda_engine(f.read())load_end_time = time.time()load_time = (load_end_time - load_start_time) * 1000print(f"加載引擎時間: {load_time:.2f} ms")return enginedef preprocess_image(self, image_path: str) -> np.ndarray:"""圖像預處理"""preprocess_start_time = time.time()if not os.path.exists(image_path):raise FileNotFoundError(f"圖像文件不存在: {os.path.abspath(image_path)}")image = cv2.imread(image_path)if image is None:raise ValueError("無法讀取圖像,請檢查文件格式和完整性")try:image = cv2.resize(image, (224, 224))image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)image = np.ascontiguousarray(image.transpose(2, 0, 1).astype(np.float32) / 255.0)mean = np.array([0.362, 0.279, 0.258]).reshape(3, 1, 1)std = np.array([0.222, 0.191, 0.185]).reshape(3, 1, 1)image = (image - mean) / stdexcept Exception as e:raise RuntimeError(f"圖像預處理失敗: {str(e)}")preprocess_end_time = time.time()preprocess_time = (preprocess_end_time - preprocess_start_time) * 1000print(f" 預處理時間: {preprocess_time:.2f} ms")return imagedef warmup(self, iterations: int = 10):"""模型預熱"""if self.is_warmed_up:print("模型已經預熱,跳過預熱步驟")returnwarmup_start_time = time.time()input_size = int(np.prod(self.input_shape)) * np.float32().itemsizeoutput_size = int(np.prod(self.output_shape)) * np.float32().itemsized_input = cuda.mem_alloc(input_size)d_output = cuda.mem_alloc(output_size)stream = cuda.Stream()dummy_input = np.random.rand(*self.input_shape).astype(np.float32)for _ in range(iterations):cuda.memcpy_htod_async(d_input, dummy_input, stream)self.context.execute_async_v2(bindings=[int(d_input), int(d_output)],stream_handle=stream.handle)stream.synchronize()d_input.free()d_output.free()warmup_end_time = time.time()warmup_time = (warmup_end_time - warmup_start_time) * 1000print(f" 預熱時間: {warmup_time:.2f} ms")self.is_warmed_up = Truedef infer(self, image: np.ndarray) -> Tuple[float, np.ndarray]:"""執行TensorRT推理"""if not self.is_warmed_up:print("警告:模型尚未預熱,推理性能可能受影響")input_size = int(np.prod(self.input_shape)) * np.float32().itemsizeoutput_size = int(np.prod(self.output_shape)) * np.float32().itemsized_input = cuda.mem_alloc(input_size)d_output = cuda.mem_alloc(output_size)stream = cuda.Stream()input_data = np.ascontiguousarray(np.expand_dims(image, axis=0), dtype=np.float32)# 正式推理infer_start_time = time.time()cuda.memcpy_htod_async(d_input, input_data, stream)self.context.execute_async_v2(bindings=[int(d_input), int(d_output)],stream_handle=stream.handle)stream.synchronize()infer_end_time = time.time()infer_time = (infer_end_time - infer_start_time) * 1000print(f" TensorRT 推理時間: {infer_time:.2f} ms")# 獲取輸出output_data = np.empty(self.output_shape, dtype=np.float32)output_start_time = time.time()cuda.memcpy_dtoh_async(output_data, d_output, stream)stream.synchronize()output_end_time = time.time()output_time = (output_end_time - output_start_time) * 1000print(f" 獲取輸出時間: {output_time:.2f} ms")d_input.free()d_output.free()return infer_time, output_dataif __name__ == "__main__":# 配置路徑PATHS = {"image_folder": "D:/Desktop/DATA/balance_bei_liao_hu/temp", # 圖片文件夾路徑"engine": "mnv4.engine" # TensorRT引擎文件路徑}# 驗證文件夾和文件存在if not os.path.exists(PATHS["image_folder"]):print(f"錯誤: 圖片文件夾不存在 -> {os.path.abspath(PATHS['image_folder'])}")exit(1)if not os.path.exists(PATHS["engine"]):print(f"錯誤: 引擎文件不存在 -> {os.path.abspath(PATHS['engine'])}")exit(1)# 獲取文件夾中所有圖片文件(包括子文件夾)image_files = []for root, _, files in os.walk(PATHS["image_folder"]):for file in files:if file.endswith(('.jpg', '.png', '.bmp', '.jpeg')):image_files.append(os.path.join(root, file))if not image_files:print(f"錯誤: 文件夾中沒有圖片文件 -> {PATHS['image_folder']}")exit(1)# 初始化預測器predictor = TensorRTPredictor(PATHS["engine"])# 初始化分類結果統計true_labels = []predicted_labels = []label_mapping = {0: "B", 1: "D", 2: "E"}total_time = 0for image_path in image_files:try:print(f"處理圖片: {image_path}")img = predictor.preprocess_image(image_path)trt_time, trt_out = predictor.infer(img)print(f" TensorRT 推理時間: {trt_time:.2f} ms")predicted_label = np.argmax(trt_out)predicted_labels.append(predicted_label)# 從文件路徑中提取真實標簽true_label = os.path.basename(os.path.dirname(image_path))true_labels.append(true_label)total_time += trt_timeexcept Exception as e:print(f"處理圖片時出錯: {image_path} -> {str(e)}")avg_time = total_time / len(image_files)print(f"\n平均推理時間: {avg_time:.2f} ms")# 計算分類結果true_labels = [label for label in true_labels]predicted_labels = [label_mapping[label] for label in predicted_labels]print("\n分類結果統計:")print(f"圖片總數: {len(image_files)}")print(f"分類結果: {Counter(predicted_labels)}")# 計算準確率和 F1 分數accuracy = accuracy_score(true_labels, predicted_labels)f1 = f1_score(true_labels, predicted_labels, average='weighted')print(f"準確率: {accuracy:.4f}") # 保留四位小數print(f"F1 分數: {f1:.4f}") # 保留四位小數# 輸出詳細的分類報告print("\n分類報告:")print(classification_report(true_labels, predicted_labels, digits=4)) # 保留四位小數
- onnx推理代碼
from datasets.split_data import read_split_data
from datasets.mydataset import MyDataset
from torchvision import transforms
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
import torch
from estimate_model import Predictor, Plot_ROC
from timm.models import create_model
import os, cv2, json, random
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as pltdef read_test_data(root, plot_image=False):filepaths = []labels = []bad_images = []random.seed(0)assert os.path.exists(root), 'Your root does not exists!!!'classes = [cla for cla in os.listdir(root) if os.path.isdir(os.path.join(root, cla))]classes.sort()class_indices = {k: v for v, k in enumerate(classes)}json_str = json.dumps({v: k for k, v in class_indices.items()}, indent=4)with open('output/classes_indices.json', 'w') as json_file:json_file.write(json_str)every_class_num = []supported = ['.jpg', '.png', '.jpeg', '.PNG', '.JPG', '.JPEG', '.bmp']for klass in classes:classpath = os.path.join(root, klass)images = [os.path.join(root, klass, i) for i in os.listdir(classpath) if os.path.splitext(i)[-1] in supported]every_class_num.append(len(images))flist = sorted(os.listdir(classpath))desc = f'{klass:23s}'for f in tqdm(flist, ncols=110, desc=desc, unit='file', colour='blue'):fpath = os.path.join(classpath, f)fl = f.lower()index = fl.rfind('.')ext = fl[index:]if ext in supported:try:img = cv2.imread(fpath)filepaths.append(fpath)labels.append(klass)except:bad_images.append(fpath)print('defective image file: ', fpath)else:bad_images.append(fpath)Fseries = pd.Series(filepaths, name='filepaths')Lseries = pd.Series(labels, name='labels')df = pd.concat([Fseries, Lseries], axis=1)print(f'{len(df.labels.unique())} kind of images were found in the dataset')test_image_path = df['filepaths'].tolist()test_image_label = [class_indices[i] for i in df['labels'].tolist()]sample_df = df.sample(n=50, replace=False)ht, wt, count = 0, 0, 0for i in range(len(sample_df)):fpath = sample_df['filepaths'].iloc[i]try:img = cv2.imread(fpath)h = img.shape[0]w = img.shape[1]ht += hwt += wcount += 1except:passhave = int(ht / count)wave = int(wt / count)aspect_ratio = have / waveprint('{} images were found in the dataset.\n{} for test'.format(sum(every_class_num), len(test_image_path)))print('average image height= ', have, ' average image width= ', wave, ' aspect ratio h/w= ', aspect_ratio)if plot_image:plt.bar(range(len(classes)), every_class_num, align='center')plt.xticks(range(len(classes)), classes)for i, v in enumerate(every_class_num):plt.text(x=i, y=v + 5, s=str(v), ha='center')plt.xlabel('image class')plt.ylabel('number of images')plt.title('class distribution')plt.show()return test_image_path, test_image_labeltest_image_path, test_image_label = read_test_data('D:/Desktop/DATA/balance_bei_liao_hu/temp', False)def build_transform(img_size):t = []t.append(# to maintain same ratio w.r.t. 224 imagestransforms.Resize(img_size, interpolation=3),)t.append(transforms.CenterCrop(img_size))t.append(transforms.ToTensor())t.append(transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD))return transforms.Compose(t)test_transform = build_transform(224)test_set = MyDataset(test_image_path, test_image_label, test_transform)sampler_val = torch.utils.data.SequentialSampler(test_set)data_loader_val = torch.utils.data.DataLoader(test_set, sampler=sampler_val,batch_size=int(1.5 * 24),num_workers=0,pin_memory=True,drop_last=False
)
model_predict = create_model('mobilenetv4_hybrid_medium')model_predict.reset_classifier(num_classes=3)
model_predict.to('cuda')
device = torch.device('cuda')Predictor(model_predict, data_loader_val, f'./output/mobilenetv4_hybrid_medium_best_checkpoint.pth', device)
Plot_ROC(model_predict, data_loader_val, f'./output/mobilenetv4_hybrid_medium_best_checkpoint.pth', device)
- 結果:
-
TensorRT:
-
onnx:
可以觀察到在轉成TensorRT推理后模型精度下降明顯,宏平均Precision下降了約4%,宏平均召回下降了約10%,宏平均F1下降了約10%。