上一節我們寫了個一維向量相加的程序。這節我們來看一個4×4矩陣轉置程序。
4×4矩陣我們採用二維數組進行存儲。在程序設計上,我們讓每個工作項(work-item)負責轉置一個元素:第二維對應矩陣的4行,第一維對應每行的4個元素,相當於一次轉一行、4行同時進行。注意這裡的OpenCL的工作維數是二維。(當然用一維的方式也可以,只是在CL代碼中要用到循環,效率不高)
程序分兩部份:
(1)transposition.cl代碼
/*
 * Transpose a row-major integer matrix.
 *
 * Launch with a 2-D NDRange whose global size is (columns, rows) of A and
 * with no global offset. Each work-item copies exactly one element:
 *     B[col][row] = A[row][col]
 *
 * A : input matrix,  rows x columns, row-major (read only)
 * B : output matrix, columns x rows, row-major
 */
__kernel void transposition(__global const int *A,
                            __global int *B)
{
    /* Two indices are available because the NDRange is two-dimensional;
       in a 1-D launch the second one would always be 0. */
    const size_t col = get_global_id(0);
    const size_t row = get_global_id(1);

    /* Take the strides from the launch geometry instead of hard-coding 4,
       so the same kernel works for any matrix shape. */
    const size_t width  = get_global_size(0);  /* columns of A */
    const size_t height = get_global_size(1);  /* rows of A    */

    B[col * height + row] = A[row * width + col];
}
(2)main.cpp代碼
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | #include <iostream> #include <stdio.h> #include <string.h> #include <string> #include <CL/cl.h>//包含CL的頭文件 using namespace std; //4x4數組 #define dim_x 4 #define dim_y 4 //從外部文件獲取cl內核代碼 bool GetFileData( const char * fname,string& str) { ???? FILE * fp = fopen (fname, "r" ); ???? if (fp==NULL) ???? { ???????? printf ( "no found file\n" ); ???????? return false ; ???? } ???? int n=0; ???? while ( feof (fp)==0) ???? { ???????? str += fgetc (fp); ???? } ???? return true ; } int main() { ???? //先讀外部CL核心代碼,如果失敗則退出。 ???? //代碼存buf_code里面 ???? string code_file; ???? if ( false == GetFileData( "transposition.cl" ,code_file)) ???????? return 0; ???? char * buf_code = new char [code_file.size()]; ???? strcpy (buf_code,code_file.c_str()); ???? buf_code[code_file.size()-1] = NULL; ???? //聲明CL所需變量。 ???? cl_device_id device; ???? cl_platform_id platform_id = NULL; ???? cl_context context; ???? cl_command_queue cmdQueue; ???? cl_mem bufferA,bufferB,bufferC; ???? cl_program program; ???? cl_kernel kernel = NULL; ???? //我們使用的是二維向量 ???? //設定向量大小(維數) ???? size_t globalWorkSize[2]; ???? globalWorkSize[0] = dim_x ; ???? globalWorkSize[1] = dim_y; ???? cl_int err; ???? /* ???????? 定義輸入變量和輸出變量,并設定初值 ???? */ ???? int buf_A[dim_x][dim_y]; ???? int buf_B[dim_x][dim_y]; ???? size_t datasize = sizeof ( int ) * dim_x * dim_y; ???? int n=0; ???? int m=0; ???? for (n=0;n<dim_x;n++) ???? { ???????? for (m=0;m<dim_y;m++) ???????? 
{ ???????????? buf_A[m][n] = m + n*dim_x; ???????? } ???? } ???? //step 1:初始化OpenCL ???? err = clGetPlatformIDs(1,&platform_id,NULL); ???? if (err!=CL_SUCCESS) ???? { ???????? cout<< "clGetPlatformIDs error" <<endl; ???????? return 0; ???? } ???? //這次我們只用CPU來進行并行運算,當然你也可以該成GPU ???? clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device,NULL); ???? //step 2:創建上下文 ???? context = clCreateContext(NULL,1,&device,NULL,NULL,NULL); ???? //step 3:創建命令隊列 ???? cmdQueue = clCreateCommandQueue(context,device,0,NULL); ???? //step 4:創建數據緩沖區 ???? bufferA = clCreateBuffer(context, ????????????????????????????? CL_MEM_READ_ONLY, ????????????????????????????? datasize,NULL,NULL); ???? bufferB = clCreateBuffer(context, ????????????????????????????? CL_MEM_WRITE_ONLY, ????????????????????????????? datasize,NULL,NULL); ???? //step 5:將數據上傳到緩沖區 ???? clEnqueueWriteBuffer(cmdQueue, ????????????????????????? bufferA,CL_FALSE, ????????????????????????? 0,datasize, ????????????????????????? buf_A,0, ????????????????????????? NULL,NULL); ???? //step 6:加載編譯代碼,創建內核調用函數 ???? program = clCreateProgramWithSource(context,1, ???????????????????????????????????????? ( const char **)&buf_code, ???????????????????????????????????????? NULL,NULL); ???? clBuildProgram(program,1,&device,NULL,NULL,NULL); ???? kernel = clCreateKernel(program, "transposition" ,NULL); ???? //step 7:設置參數,執行內核 ???? clSetKernelArg(kernel,0, sizeof (cl_mem),&bufferA); ???? clSetKernelArg(kernel,1, sizeof (cl_mem),&bufferB); ???? //<span style="color: #ff0000;"><strong>注意這里第三個參數已經改成2,表示二維數據。</strong></span> ???? clEnqueueNDRangeKernel(cmdQueue,kernel, ??????????????????????????? 2,NULL, ??????????????????????????? globalWorkSize, ??????????????????????????? NULL,0,NULL,NULL); ???? //step 8:取回計算結果 ???? clEnqueueReadBuffer(cmdQueue,bufferB,CL_TRUE,0, ???????????????????????? datasize,buf_B,0,NULL,NULL); ???? //輸出計算結果 ???? for (n=0;n<dim_x;n++) ???? { ???????? for (m=0;m<dim_y;m++) ???????? { ???????????? 
cout<< buf_A[m][n] << "," ; ???????? } ???????? cout<<endl; ???? } ???? cout<<endl<< "====transposition====" <<endl<<endl; ???? for (n=0;n<dim_x;n++) ???? { ???????? for (m=0;m<dim_y;m++) ???????? { ???????????? cout<< buf_B[m][n] << "," ; ???????? } ???????? cout<<endl; ???? } ???? //釋放所有調用和內存 ???? clReleaseKernel(kernel); ???? clReleaseProgram(program); ???? clReleaseCommandQueue(cmdQueue); ???? clReleaseMemObject(bufferA); ???? clReleaseMemObject(bufferB); ???? clReleaseContext(context); ???? delete buf_code; ???? return 0; } |
運算結果:
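0,1,2,3,
4,5,6,7,
8,9,10,11,
12,13,14,15,

====transposition====

0,4,8,12,
1,5,9,13,
2,6,10,14,
3,7,11,15,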