基于OpenCL的mean filter性能

1. 對於一個標準的 3*3 均值濾波，kernel 代碼如下：

分別使用 buffer 與 image 緩衝對象：

/*
 * Mean (box) filter over a buffer of 4-channel 8-bit pixels.
 *
 * inputImage  - source pixels, addressed as inputImage[x + y * width]
 * outputImage - destination pixels, same addressing
 * N           - requested window width; the window actually used is
 *               (2k+1) x (2k+1) with k = (N-1)/2, i.e. an even N behaves
 *               like N-1
 *
 * width and height are taken from the global work size, so the kernel is
 * meant to be enqueued with exactly one work-item per pixel.
 * Pixels closer than k to any edge are copied through unfiltered.
 */
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int width = get_global_size(0);
    int height = get_global_size(1);

    /* Half-width of the window. */
    int k = (N - 1) / 2;

    /* Border pixels do not have a full window: pass them through. */
    if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
    {
        outputImage[x + y * width] = inputImage[x + y * width];
        return;
    }

    /*
     * Divide by the number of samples that are really summed below.
     * For odd N this equals N*N; for even N, N*N would be larger than the
     * sample count and darken the result.
     */
    uint side = 2u * (uint)k + 1u;
    uint n = side * side;

    /* Accumulate in 32 bits per channel so the sum cannot overflow a uchar. */
    uint4 finalcolor = (uint4)(0);
    for (int j = y - k; j <= y + k; j++)
    {
        for (int i = x - k; i <= x + k; i++)
        {
            finalcolor += convert_uint4(inputImage[i + j * width]);
        }
    }

    outputImage[x + y * width] = convert_uchar4(finalcolor / n);
}

/*
 * Mean (box) filter over a 2D image object, read and written as unsigned
 * integer texels.
 *
 * inputImage  - source image
 * outputImage - destination image
 * N           - requested window width; the window actually used is
 *               (2k+1) x (2k+1) with k = (N-1)/2, i.e. an even N behaves
 *               like N-1
 *
 * width and height are taken from the global work size, so the kernel is
 * meant to be enqueued with exactly one work-item per texel.
 * imageSampler is not declared in this block; it has to be a sampler
 * visible at program scope.
 * Texels closer than k to any edge are copied through unfiltered.
 */
__kernel void filterImg( image2d_t inputImage, __write_only image2d_t outputImage, uint N)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int width = get_global_size(0);
    int height = get_global_size(1);

    /* Half-width of the window. */
    int k = (N - 1) / 2;

    /* Border texels do not have a full window: pass them through. */
    if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
    {
        /* The centre texel is only needed here, so it is fetched here. */
        uint4 temp = read_imageui(inputImage, imageSampler, (int2)(x, y));
        write_imageui(outputImage, (int2)(x, y), temp);
        return;
    }

    /*
     * Divide by the number of samples that are really summed below.
     * For odd N this equals N*N; for even N, N*N would be larger than the
     * sample count and darken the result.
     */
    uint side = 2u * (uint)k + 1u;
    uint n = side * side;

    /* Sum the (2k+1) x (2k+1) neighbourhood. */
    uint4 finalcolor = (uint4)(0);
    for (int j = y - k; j <= y + k; j++)
    {
        for (int i = x - k; i <= x + k; i++)
        {
            finalcolor += read_imageui(inputImage, imageSampler, (int2)(i, j));
        }
    }

    finalcolor = finalcolor / n;
    write_imageui(outputImage, (int2)(x, y), finalcolor);
}

對一個2048*2048的圖像執行filter操作，

global work size = {2048, 2048, 1}，group work size = {16, 16}。一般來說，一個 work-group 內的 work-item 總數應該為 64 的倍數，因為對於 AMD 顯卡，wavefront（64 個 work-item）是基本的硬件線程調度單位。

使用了 6 個 GPRs，沒有使用 ScratchRegs。ScratchRegs 是指用 video memory 來模擬 GPR，但是線程執行的速度會大大降低，應盡量減少 ScratchRegs 的數量。

可以看到，使用image對象kernel執行時間要短，但奇怪的是各項性能參數都是buffer對象領先，除了alu busy和alu指令數目。

改為下面的kernel代碼，性能會有所提高

/*
 * 3x3 mean filter over a buffer of 4-channel 8-bit pixels, with the nine
 * taps written out explicitly instead of looping.
 *
 * inputImage  - source pixels, addressed as inputImage[x + y * width]
 * outputImage - destination pixels, same addressing
 * N           - not used by this variant; the window is fixed at 3x3
 *
 * width and height are taken from the global work size.
 * Pixels on the outermost row/column are copied through unfiltered.
 */
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
{
    const int col  = get_global_id(0);
    const int row  = get_global_id(1);
    const int cols = get_global_size(0);
    const int rows = get_global_size(1);

    /* Linear index of this work-item's own pixel. */
    const int here = col + row * cols;

    const bool interior = col >= 1 && row >= 1 && col <= cols - 2 && row <= rows - 2;
    if (!interior)
    {
        outputImage[here] = inputImage[here];
        return;
    }

    /* Linear indices of the pixels directly above and below. */
    const int above = here - cols;
    const int below = here + cols;

    /* 32-bit per-channel sum of the 3x3 neighbourhood, row by row. */
    uint4 sum = convert_uint4(inputImage[above - 1])
              + convert_uint4(inputImage[above])
              + convert_uint4(inputImage[above + 1])
              + convert_uint4(inputImage[here - 1])
              + convert_uint4(inputImage[here])
              + convert_uint4(inputImage[here + 1])
              + convert_uint4(inputImage[below - 1])
              + convert_uint4(inputImage[below])
              + convert_uint4(inputImage[below + 1]);

    outputImage[here] = convert_uchar4(sum / 9);
}
/*
 * Mean (box) filter over a buffer of 4-channel 8-bit pixels, using a
 * run-time window size.
 *
 * inputImage  - source pixels, addressed as inputImage[x + y * width]
 * outputImage - destination pixels, same addressing
 * N           - requested window width; the window actually used is
 *               (2k+1) x (2k+1) with k = (N-1)/2, i.e. an even N behaves
 *               like N-1
 *
 * width and height are taken from the global work size, so the kernel is
 * meant to be enqueued with exactly one work-item per pixel.
 * Pixels closer than k to any edge are copied through unfiltered.
 */
__kernel void filter1(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int width = get_global_size(0);
    int height = get_global_size(1);

    /* Half-width of the window. */
    int k = (N - 1) / 2;

    /* Border pixels do not have a full window: pass them through. */
    if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
    {
        outputImage[x + y * width] = inputImage[x + y * width];
        return;
    }

    /*
     * Divide by the number of samples that are really summed below.
     * For odd N this equals N*N; for even N, N*N would be larger than the
     * sample count and darken the result.
     */
    uint side = 2u * (uint)k + 1u;
    uint n = side * side;

    /* Accumulate in 32 bits per channel so the sum cannot overflow a uchar. */
    uint4 finalcolor = (uint4)(0);
    for (int j = y - k; j <= y + k; j++)
    {
        for (int i = x - k; i <= x + k; i++)
        {
            finalcolor += convert_uint4(inputImage[i + j * width]);
        }
    }

    outputImage[x + y * width] = convert_uchar4(finalcolor / n);
}
/*
 * 3x3 mean filter over a 2D image object, with the nine taps written out
 * explicitly instead of looping.
 *
 * inputImage  - source image, read as unsigned integer texels
 * outputImage - destination image
 * N           - not used by this variant; the window is fixed at 3x3
 *
 * width and height are taken from the global work size.
 * imageSampler is not declared in this block; it has to be a sampler
 * visible at program scope.
 * Texels on the outermost row/column are copied through unfiltered.
 */
__kernel void filterImg( image2d_t inputImage, __write_only image2d_t outputImage, uint N)
{
    const int col  = get_global_id(0);
    const int row  = get_global_id(1);
    const int cols = get_global_size(0);
    const int rows = get_global_size(1);

    const int2 here = (int2)(col, row);

    /* Centre texel, used as the pass-through value on the border. */
    const uint4 center = read_imageui(inputImage, imageSampler, here);

    const bool interior = col >= 1 && row >= 1 && col <= cols - 2 && row <= rows - 2;
    if (!interior)
    {
        write_imageui(outputImage, here, center);
        return;
    }

    /* Sum of the 3x3 neighbourhood, row by row. */
    uint4 sum = (uint4)(0);
    sum += read_imageui(inputImage, imageSampler, (int2)(col - 1, row - 1));
    sum += read_imageui(inputImage, imageSampler, (int2)(col,     row - 1));
    sum += read_imageui(inputImage, imageSampler, (int2)(col + 1, row - 1));
    sum += read_imageui(inputImage, imageSampler, (int2)(col - 1, row));
    sum += read_imageui(inputImage, imageSampler, (int2)(col,     row));
    sum += read_imageui(inputImage, imageSampler, (int2)(col + 1, row));
    sum += read_imageui(inputImage, imageSampler, (int2)(col - 1, row + 1));
    sum += read_imageui(inputImage, imageSampler, (int2)(col,     row + 1));
    sum += read_imageui(inputImage, imageSampler, (int2)(col + 1, row + 1));

    write_imageui(outputImage, here, sum / 9);
}

本文來自互聯網用戶投稿，該文觀點僅代表作者本人，不代表本站立場。本站僅提供信息存儲空間服務，不擁有所有權，不承擔相關法律責任。
如若轉載，請注明出處：http://www.pswp.cn/news/448820.shtml
繁體地址，請注明出處：http://hk.pswp.cn/news/448820.shtml
英文地址，請注明出處：http://en.pswp.cn/news/448820.shtml

如若內容造成侵權/違法違規/事實不符，請聯系多彩編程網進行投訴反饋email:809451989@qq.com，一經查實，立即刪除！