YOLOv5推理代碼解析

代碼如下

import cv2
import numpy as np
import onnxruntime as ort
import time
import random# 畫一個檢測框
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """Draw one rectangle, and optionally a text label, onto ``img``.

    Args:
        x: Box corners as ``[x1, y1, x2, y2]``; every value goes through ``int()``.
        img: Image to draw on. The OpenCV drawing calls modify it in place.
        color: Box colour. When falsy, three random values in 0..255 are used.
        label: Optional text. When given, it is written on a filled strip
            placed just above the top-left corner of the box.
        line_thickness: Line width. When falsy it is derived from the image
            size as ``round(0.002 * (rows + cols) / 2) + 1``.

    Returns:
        None. The drawing happens directly on ``img``.
    """
    if line_thickness:
        thickness = line_thickness
    else:
        thickness = round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1

    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(img, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)

    if not label:
        return

    # The font is one step thinner than the box outline, but never below 1.
    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]

    # Filled background strip that sits on top of the box's upper-left corner.
    strip_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(img, top_left, strip_corner, color, -1, cv2.LINE_AA)
    cv2.putText(
        img,
        label,
        (top_left[0], top_left[1] - 2),
        0,
        font_scale,
        [225, 255, 255],
        thickness=font_thickness,
        lineType=cv2.LINE_AA,
    )


# Build the grid coordinates
def _make_grid(nx, ny):"""description: 生成網格坐標，用于解碼預測框位置。param:nx, ny: 網格的行數和列數return: 返回網格坐標"""xv, yv = np.meshgrid(np.arange(ny), np.arange(nx))  # 生成網格坐標return np.stack((xv, yv), 2).reshape((-1, 2)).astype(np.float32)  # 轉換為需要的格式# 輸出解碼
def cal_outputs(outs, nl, na, model_w, model_h, anchor_grid, stride):
    """Decode the box columns of the raw network output, in place.

    For every output layer the first four columns of the matching rows are
    rewritten: columns 0:2 become ``(v * 2 - 0.5 + grid) * stride`` and
    columns 2:4 become ``(v * 2) ** 2 * anchor``. All other columns are left
    untouched.

    Args:
        outs: 2-D array with one row per candidate box. Rows are consumed
            layer by layer; within a layer the grid is tiled once per anchor,
            so rows are expected anchor-major, then row by row over the
            feature map. Modified in place.
        nl: Number of output layers.
        na: Number of anchors per layer.
        model_w: Network input width in pixels.
        model_h: Network input height in pixels.
        anchor_grid: Indexable by layer; ``anchor_grid[i]`` holds the
            ``(width, height)`` of each of the ``na`` anchors of layer ``i``.
        stride: Indexable by layer; down-sampling factor of each layer.

    Returns:
        The same ``outs`` array, after decoding.
    """
    row_ind = 0
    for i in range(nl):
        # Feature-map size of this layer. The previous code stored these in
        # swapped names (``h`` from model_w, ``w`` from model_h); the values
        # and the call below are numerically identical.
        grid_w = int(model_w / stride[i])
        grid_h = int(model_h / stride[i])
        cells = grid_h * grid_w
        length = int(na * cells)  # number of candidate rows in this layer

        # The former ``grid[i].shape[2:4] != (h, w)`` cache check compared an
        # empty tuple and was therefore always true, so the grid is simply
        # built on every call.
        grid = _make_grid(grid_h, grid_w)

        rows = slice(row_ind, row_ind + length)
        # Box centre: cell offset plus cell index, scaled to input pixels.
        outs[rows, 0:2] = (outs[rows, 0:2] * 2. - 0.5 + np.tile(grid, (na, 1))) * int(stride[i])
        # Box size: squared factor applied to the anchor of each row.
        outs[rows, 2:4] = (outs[rows, 2:4] * 2) ** 2 * np.repeat(anchor_grid[i], cells, axis=0)
        row_ind += length
    return outs


# Post-processing: compute the final detection boxes
def post_process_opencv(outputs, model_h, model_w, img_h, img_w, thred_nms, thred_cond):
    """Rescale decoded boxes to the original image and apply NMS.

    Args:
        outputs: 2-D array, one row per candidate: columns 0..3 are
            ``cx, cy, w, h`` in network-input pixels, column 4 is the
            confidence, columns 5.. are the per-class scores.
        model_h: Network input height.
        model_w: Network input width.
        img_h: Original image height.
        img_w: Original image width.
        thred_nms: Overlap threshold handed to ``cv2.dnn.NMSBoxes``.
        thred_cond: Score threshold handed to ``cv2.dnn.NMSBoxes``.

    Returns:
        ``(boxes, confs, class_ids)`` for the kept candidates, where ``boxes``
        is an ``(K, 4)`` float64 array of ``[x1, y1, x2, y2]`` in original
        image pixels. When nothing is kept, three empty lists are returned.
    """
    if len(outputs) == 0:
        return [], [], []

    # NOTE(review): only column 4 is used as the score; the class score is
    # not multiplied in. Kept as-is to preserve the existing thresholds.
    conf = outputs[:, 4].tolist()

    # Map centre/size from network-input pixels to original-image pixels.
    c_x = outputs[:, 0] / model_w * img_w
    c_y = outputs[:, 1] / model_h * img_h
    w = outputs[:, 2] / model_w * img_w
    h = outputs[:, 3] / model_h * img_h

    # outputs[:, 5:] is always 2-D, so no extra expand_dims is required.
    cls_id = np.argmax(outputs[:, 5:], axis=1)

    p_x1 = c_x - w / 2
    p_y1 = c_y - h / 2
    boxes_xyxy = np.stack((p_x1, p_y1, c_x + w / 2, c_y + h / 2), axis=-1).astype(np.float64)

    # cv2.dnn.NMSBoxes interprets every box as [x, y, width, height]. Passing
    # corner coordinates (as was done before) makes it compute overlaps on
    # wrong rectangles, so a separate xywh list is built for the NMS call.
    boxes_xywh = np.stack((p_x1, p_y1, w, h), axis=-1).astype(np.float64).tolist()

    keep = cv2.dnn.NMSBoxes(boxes_xywh, conf, thred_cond, thred_nms)
    # Older OpenCV builds return the indices as an (N, 1) array; flatten so
    # that indexing always yields (K, 4) boxes.
    keep = np.asarray(keep, dtype=np.int64).reshape(-1)
    if keep.size == 0:
        return [], [], []
    return boxes_xyxy[keep], np.asarray(conf)[keep], cls_id[keep]


# Image inference
def infer_img(img0, net, model_h, model_w, nl, na, stride, anchor_grid, thred_nms=0.4, thred_cond=0.5):
    """Run the detector on one frame and return the final detections.

    Args:
        img0: Original image as read by OpenCV (3-channel, BGR order).
        net: Loaded ONNX Runtime inference session.
        model_h: Network input height.
        model_w: Network input width.
        nl: Number of output layers.
        na: Number of anchors per layer.
        stride: Down-sampling factor of each output layer.
        anchor_grid: Anchor sizes of each output layer.
        thred_nms: NMS overlap threshold.
        thred_cond: Confidence threshold.

    Returns:
        ``(boxes, confs, class_ids)`` exactly as produced by
        ``post_process_opencv``.
    """
    # Pre-processing: resize to the network input, BGR -> RGB, scale to
    # [0, 1], then HWC -> CHW with a leading batch axis.
    img = cv2.resize(img0, (model_w, model_h), interpolation=cv2.INTER_AREA)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0
    blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0)

    # Inference; the batch axis of the first output is dropped.
    outs = net.run(None, {net.get_inputs()[0].name: blob})[0].squeeze(axis=0)

    # Decode box coordinates into network-input pixels.
    outs = cal_outputs(outs, nl, na, model_w, model_h, anchor_grid, stride)

    # Map boxes to the original image size and run NMS.
    img_h, img_w = img0.shape[:2]
    return post_process_opencv(outs, model_h, model_w, img_h, img_w, thred_nms, thred_cond)


if __name__ == "__main__":
    # Load the ONNX model.
    model_pb_path = "a.onnx"
    so = ort.SessionOptions()
    net = ort.InferenceSession(model_pb_path, so)

    # Class id -> label.
    dic_labels = {
        0: 'jn', 1: 'pill_bag', 2: 'pill_ban', 3: 'yg', 4: 'ys', 5: 'kfy',
        6: 'pw', 7: 'yanyao_1', 8: 'yanyao_2', 9: 'paper_cup', 10: 'musai',
        11: 'carrot', 12: 'potato', 13: 'potato_s', 14: 'potato_black',
        15: 'cizhuan', 16: 'eluanshi_guang', 17: 'stone', 18: 'zhuankuai_bai',
        19: 'zhuankuai_red', 20: 'empty',
    }

    # Model parameters.
    model_h = 320
    model_w = 320
    nl = 3
    na = 3
    stride = [8., 16., 32.]
    anchors = [[10, 13, 16, 30, 33, 23], [30, 61, 62, 45, 59, 119], [116, 90, 156, 198, 373, 326]]
    anchor_grid = np.asarray(anchors, dtype=np.float32).reshape(nl, -1, 2)

    # Camera device index. NOTE(review): OpenCV numbers devices from 0, so
    # index 1 is typically the second camera; adjust to your setup.
    video = 1
    cap = cv2.VideoCapture(video)
    flag_det = False  # detection on/off, toggled with the 's' key

    try:
        if not cap.isOpened():
            # Without this check a missing camera leads to a silent busy loop.
            raise SystemExit("Cannot open video source %r" % (video,))

        while True:
            success, img0 = cap.read()
            if success:
                if flag_det:
                    t1 = time.perf_counter()
                    det_boxes, scores, ids = infer_img(
                        img0, net, model_h, model_w, nl, na, stride, anchor_grid,
                        thred_nms=0.4, thred_cond=0.5)
                    t2 = time.perf_counter()

                    # Draw the detections and their labels.
                    for box, score, cls_idx in zip(det_boxes, scores, ids):
                        label = '%s:%.2f' % (dic_labels[int(cls_idx)], score)
                        plot_one_box(box.astype(np.int16), img0, color=(255, 0, 0),
                                     label=label, line_thickness=None)

                    # Show the inference rate of this frame.
                    delta_time = t2 - t1
                    if delta_time > 0:
                        str_FPS = "FPS: %.2f" % (1. / delta_time)
                    else:
                        str_FPS = "FPS: inf"
                    cv2.putText(img0, str_FPS, (50, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (0, 255, 0), 3)

                cv2.imshow("video", img0)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):  # quit
                break
            elif key == ord('s'):  # toggle detection
                flag_det = not flag_det
                print(flag_det)
    finally:
        # Always free the camera and close the preview window.
        cap.release()
        cv2.destroyAllWindows()

下面對代碼中與推理相關的函數做逐行詳細的中文注釋和解釋，幫助你從初學者的角度完全理解代碼。我們重點講解這幾個核心函數：

1. `infer_img()` 推理主函數

def infer_img(img0, net, model_h, model_w, nl, na, stride, anchor_grid, thred_nms=0.4, thred_cond=0.5):

這是圖像推理的主函數，完成從原始圖像到預測結果的所有操作。

第一步：圖像預處理

img = cv2.resize(img0, [model_w, model_h], interpolation=cv2.INTER_AREA)

將原始圖像 img0 縮放成模型輸入要求的大小（例如 320×320）。
cv2.INTER_AREA 是一種圖像插值方式，適合縮小圖像時使用。

img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

OpenCV 讀取圖像是 BGR 順序，而深度學習模型通常使用 RGB，因此這裡需要轉換顏色通道。

img = img.astype(np.float32) / 255.0

把圖像的數據類型轉為 float32，並將像素值從 [0, 255] 範圍歸一化到 [0, 1]，符合模型輸入要求。

blob = np.expand_dims(np.transpose(img, (2, 0, 1)), axis=0)

OpenCV圖像的格式是 (H, W, C)，而 PyTorch 模型（如YOLO）的輸入是 (B, C, H, W)
np.transpose(img, (2, 0, 1)) 把通道 C 移到第一個維度
np.expand_dims(..., axis=0) 增加 batch 維度：變成 (1, 3, 320, 320)

第二步：模型推理

outs = net.run(None, {net.get_inputs()[0].name: blob})[0].squeeze(axis=0)

用 ONNX Runtime 推理：輸入是 blob
net.get_inputs()[0].name 得到模型輸入的名字
squeeze(axis=0) 把 batch 維度去掉，形狀變成 (N, 5 + 類別數)，N 是預測框數量，每個框的信息依次是 x, y, w, h, conf 和各類別的分數。官方 COCO 模型有 80 類，所以是 (N, 85)；按本文 dic_labels 的 21 類計算則是 (N, 26)。

第三步：輸出坐標解碼

outs = cal_outputs(outs, nl, na, model_w, model_h, anchor_grid, stride)

YOLO 的輸出是相對 anchor + grid 編碼的，需要轉換為圖像上的真實位置
cal_outputs() 就是做這個解碼變換的函數（后面詳細講）

第四步：后處理，獲取檢測框信息

img_h, img_w, _ = np.shape(img0)
boxes, confs, ids = post_process_opencv(outs, model_h, model_w, img_h, img_w, thred_nms, thred_cond)

將模型輸出映射回原始圖像尺寸
使用置信度閾值和 NMS 非極大值抑制刪除重復框
得到最終的：
- boxes: 框坐標
- confs: 置信度
- ids: 類別編號

2. `cal_outputs()` 坐標解碼函數

def cal_outputs(outs, nl, na, model_w, model_h, anchor_grid, stride):

含義解釋：

outs: 模型輸出，形狀是 (N, 5 + 類別數)（COCO 80 類時為 (N, 85)），前4列是框的位置
nl: YOLO使用的輸出層數量（3個：大中小目標）
na: 每個特征層使用的 anchor 數（通常為 3）
anchor_grid: 每層 anchor 的寬高尺寸
stride: 每層特征圖相對于原圖的縮放倍數

grid = [np.zeros(1)] * nl

每一層都要生成網格坐標 grid，初始化為占位

for i in range(nl):h, w = int(model_w / stride[i]), int(model_h / stride[i])

計算第 i 層的特徵圖尺寸（如：320/8=40）。注意上面這行中 h 由 model_w 算出、w 由 model_h 算出，兩個變量名與實際的高、寬相反；輸入為正方形（如 320×320）時兩者相等。

    length = int(na * h * w)

該層有多少個預測框

    if grid[i].shape[2:4] != (h, w):grid[i] = _make_grid(w, h)

grid[i] 的初始占位值是 np.zeros(1)，它的 shape[2:4] 是空元組 ()，所以這個判斷總是成立，每次都會調用 _make_grid() 創建形狀為 (h*w, 2) 的網格點

    outs[row_ind:row_ind + length, 0:2] = ...outs[row_ind:row_ind + length, 2:4] = ...

對該層的所有框做位置矯正（中心點解碼 + 寬高縮放）
用 grid 和 anchor 反算出真實坐標

3. `post_process_opencv()` 后處理函數

def post_process_opencv(outputs, model_h, model_w, img_h, img_w, thred_nms, thred_cond):

功能：

將模型輸出映射回原始圖像尺寸
提取類別信息
使用 OpenCV 的 cv2.dnn.NMSBoxes() 進行非極大值抑制，保留重要框

步驟：

conf = outputs[:, 4].tolist()         # 提取每個框的置信度
c_x = outputs[:, 0] / model_w * img_w
c_y = outputs[:, 1] / model_h * img_h
w = outputs[:, 2] / model_w * img_w
h = outputs[:, 3] / model_h * img_h

將中心點和尺寸從模型尺寸映射回原始圖像尺寸

p_cls = outputs[:, 5:]
cls_id = np.argmax(p_cls, axis=1)

取得每個框類別分數最大的那一類的索引（即分類結果的類別編號）

p_x1 = c_x - w/2
p_y1 = c_y - h/2
p_x2 = c_x + w/2
p_y2 = c_y + h/2

把中心點轉為左上角和右下角坐標 [x1, y1, x2, y2]

areas = np.concatenate((p_x1, p_y1, p_x2, p_y2), axis=-1)
ids = cv2.dnn.NMSBoxes(areas, conf, thred_cond, thred_nms)

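用 NMS 去除重疊預測框。注意：cv2.dnn.NMSBoxes() 要求每個框的格式為 [x, y, w, h]（左上角坐標加寬高），而這裡的 areas 是 [x1, y1, x2, y2]，直接傳入會使重疊度的計算不準確；應另外構造一份 [x1, y1, w, h] 的列表傳給 NMS，再用保留下來的索引去取 [x1, y1, x2, y2] 的框。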

YOLOv5推理代碼解析

1. `infer_img()` 推理主函數

第一步：圖像預處理

第二步：模型推理

第三步：輸出坐標解碼

第四步：后處理，獲取檢測框信息

2. `cal_outputs()` 坐標解碼函數

含義解釋：

3. `post_process_opencv()` 后處理函數

功能：

步驟：

相關文章

CATIA高效工作指南——常規配置篇（二）

批量重命名bat

嵌入式軟件開發常見warning之 warning: implicit declaration of function

【人工智能-agent】--Dify中MCP工具存數據到MySQL

Golang 應用的 CI/CD 與 K8S 自動化部署全流程指南

網絡基礎1(應用層、傳輸層)

基于大模型預測的吉蘭 - 巴雷綜合征綜合診療方案研究報告大綱

【言語】刷題2

Blueprints - Gameplay Message Subsystem

火山云網站搭建

嵌入式開發學習（第二階段 C語言基礎）

Nginx +Nginx-http-flv-module 推流拉流

射頻ADRV9026驅動

使用本地部署的 LLaMA 3 模型進行中文對話生成

Oracle19c中的全局臨時表

Windows 安裝 Milvus

24、DeepSeek-V3論文筆記

1.8 梯度

274、H指數

【C++11】異常

YOLOv5推理代碼解析

1. infer_img() 推理主函數

第一步：圖像預處理

第二步：模型推理

第三步：輸出坐標解碼

第四步：后處理，獲取檢測框信息

2. cal_outputs() 坐標解碼函數

含義解釋：

3. post_process_opencv() 后處理函數

功能：

步驟：

相關文章

1. `infer_img()` 推理主函數

2. `cal_outputs()` 坐標解碼函數

3. `post_process_opencv()` 后處理函數