使用ffmpeg解碼后,處理多路解碼數據非常慢,而且這些數據還要交給AI使用;可行的加速方式是在顯存中處理數據。在視頻拼接融合產品的產品與架構設計中,提出了比較可靠的方式是使用cuda,那么沒有cuda的顯卡如何處理呢?
比較好的方式是使用opencl來提高數據傳輸效率。
核函數
在OpenCL中,將NV12格式轉換為BGR格式通常涉及到對UV分量的處理,nv12 是使用ffmpeg等解碼后的直接數據,注意linesize對齊
#define GROUP_SIZE 16

/*
 * OpenCL kernel: convert one NV12 frame to packed 8-bit BGR.
 *
 * One work-item handles one pixel; launch with a 2-D global size of at
 * least (width, height). Work-items outside the image do nothing.
 *
 * nv12   - input frame: width*height luma bytes followed by the interleaved
 *          U,V plane (one U,V pair per 2x2 block of pixels). The row stride
 *          is assumed to equal `width`; if the decoder pads rows (linesize >
 *          width), repack the planes before upload.
 * bgr    - output, 3 bytes per pixel in B,G,R order. Must hold at least
 *          width*height*3 bytes.
 * width  - frame width in pixels (expected to be even, as NV12 requires).
 * height - frame height in pixels.
 *
 * Uses the full-range BT.601 (JFIF) equations.
 * NOTE(review): limited-range video needs the 16/235 scaling applied first -
 * confirm the range of the decoder output.
 */
__kernel void nv12_to_bgr(__global const uchar *nv12,
                          __global uchar *bgr,
                          int width, int height)
{
    const int x = (int)get_global_id(0);
    const int y = (int)get_global_id(1);

    /* The global size may be rounded up past the image; skip the excess. */
    if (x >= width || y >= height) {
        return;
    }

    const int yIndex = y * width + x;
    /* '& ~1' selects the even column where the U,V pair of this pixel starts. */
    const int uvIndex = width * height + (y / 2) * width + (x & ~1);

    const float luma = (float)nv12[yIndex];
    const float u = (float)nv12[uvIndex] - 128.0f;
    const float v = (float)nv12[uvIndex + 1] - 128.0f;

    /* Float literals keep the kernel usable on devices without fp64. */
    const float blue  = luma + 1.772f * u;
    const float green = luma - 0.344136f * u - 0.714136f * v;
    const float red   = luma + 1.402f * v;

    /* Saturating conversion clamps to [0, 255] instead of wrapping around. */
    const int outIndex = yIndex * 3;
    bgr[outIndex + 0] = convert_uchar_sat_rte(blue);
    bgr[outIndex + 1] = convert_uchar_sat_rte(green);
    bgr[outIndex + 2] = convert_uchar_sat_rte(red);
}
cpu上繼續
注意錯誤處理
// 設置OpenCL內核參數
size_t global_work_size[2] = {width, height};
cl_kernel nv12_to_bgr_kernel = ...; // 獲取你編譯的內核// 設置內核參數
clSetKernelArg(nv12_to_bgr_kernel, 0, sizeof(cl_mem), &nv12_buffer);
clSetKernelArg(nv12_to_bgr_kernel, 1, sizeof(cl_mem), &bgr_buffer);
clSetKernelArg(nv12_to_bgr_kernel, 2, sizeof(int), &width);
clSetKernelArg(nv12_to_bgr_kernel, 3, sizeof(int), &height);// 執行內核
cl_event event;
clEnqueueNDRangeKernel(command_queue, nv12_to_bgr_kernel, 2, NULL, global_work_size, NULL, 0, NULL, &event);// 等待命令執行完畢
clWaitForEvents(1, &event);
針對arm,非顯存
用128位的寄存器進行處理。
vld1_u8 從內存中讀取8*8位(共64位)數據到寄存器
vld1q_u8 從內存中讀取16*8位(共128位)數據到寄存器
vld3q_u8 從內存中讀取3個16*8位數據到寄存器中(按3通道解交織)
vst3q_u8 將三個128位寄存器的數據寫到內存中
vld4_u8 從內存中讀取4個8*8位數據到寄存器中
vmull_u8 執行兩個8*8位無符號整數的乘法操作
vshrn_n_u16 16位無符號整數右移指定的位數,並將結果窄化為8位
vst1_u8 將64位寄存器中的8個8位無符號整數元素存儲到內存中
vshrq_n_s16 16位整數右移指定的位數
舉例
#include <stddef.h>
#include <stdint.h>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

/*
 * Convert packed 8-bit BGR pixels to packed RGB by swapping the first and
 * third byte of every pixel.
 *
 * bgr    - source, width*height pixels, 3 bytes each (B, G, R). No particular
 *          alignment is required.
 * rgb    - destination, same size and layout (R, G, B). May be the same
 *          buffer as `bgr` for an in-place swap.
 * width  - image width in pixels.
 * height - image height in pixels.
 *
 * Rows are assumed to be contiguous (no padding between them). The call is a
 * no-op when a pointer is NULL or a dimension is not positive.
 */
void bgr_to_rgb(uint8_t *bgr, uint8_t *rgb, int width, int height)
{
    if (bgr == NULL || rgb == NULL || width <= 0 || height <= 0) {
        return;
    }

    /* size_t arithmetic so large frames cannot overflow an int byte offset. */
    const size_t pixel_count = (size_t)width * (size_t)height;
    size_t pixel = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /*
     * 16 pixels per iteration. vld3q_u8 de-interleaves 48 bytes into three
     * 16-lane registers: val[0] = B, val[1] = G, val[2] = R. It works on
     * unaligned addresses, so the caller's pointers are used as they are.
     */
    for (; pixel + 16 <= pixel_count; pixel += 16) {
        const size_t offset = pixel * 3;
        uint8x16x3_t lanes = vld3q_u8(bgr + offset);
        const uint8x16_t blue = lanes.val[0];

        lanes.val[0] = lanes.val[2];
        lanes.val[2] = blue;
        vst3q_u8(rgb + offset, lanes);
    }
#endif

    /* Scalar path: the remaining (< 16) pixels, or everything without NEON. */
    for (; pixel < pixel_count; pixel++) {
        const size_t offset = pixel * 3;
        /* Read all three bytes first so that bgr == rgb also works. */
        const uint8_t blue = bgr[offset + 0];
        const uint8_t green = bgr[offset + 1];
        const uint8_t red = bgr[offset + 2];

        rgb[offset + 0] = red;
        rgb[offset + 1] = green;
        rgb[offset + 2] = blue;
    }
}
使用gstreamer
使用gstreamer pipeline技術寫好插件,直接操作顯存