Shared memory has read/write speed second only to registers and is much faster than global memory, so any scenario in which global-memory accesses can be replaced by shared-memory accesses is a candidate for this optimization.
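Before the full matrix-multiplication example, the following minimal sketch shows the general pattern (the kernel name ReverseTiles and the constant TILE are illustrative, not part of the programs below): each thread stages one element into a __shared__ array, the block synchronizes, and threads then exchange data through the fast on-chip copy instead of going back to global memory.

#define TILE 256

// Each block reverses its TILE-element segment of the input array.
// Thread i consumes the element loaded by thread TILE-1-i, so the data
// exchange between threads happens entirely in shared memory.
// Assumes the array length n is a multiple of TILE and the kernel is
// launched as ReverseTiles<<<n / TILE, TILE>>>(in, out).
__global__ void ReverseTiles(const float* in, float* out)
{
    __shared__ float tile[TILE];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = in[idx];              // one coalesced global load per thread
    __syncthreads();                          // wait until the whole tile is resident
    out[idx] = tile[TILE - 1 - threadIdx.x];  // read an element loaded by another thread
}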
Matrix multiplication without shared memory
A straightforward implementation of matrix multiplication that does not use shared memory: each thread reads one row of A and one column of B and computes the corresponding element of C, as shown in the figure.
Number of accesses:
A is read from global memory B.width times, and B is read A.height times.
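For the dimensions used in the program below (A is 80 × 2000 and B is 2000 × 80, so A.height = B.width = 80 and A.width = 2000), each element of A and of B is therefore fetched 80 times, for a total of 2 × 80 × 80 × 2000 = 25,600,000 global loads to produce only 80 × 80 output elements.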
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <malloc.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "error.cuh"

#define M 80
#define N 2000
// Thread block size
#define BLOCK_SIZE 16
// ----------- Matrix multiplication without shared memory -----------
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
typedef struct
{
    int width;
    int height;
    float* elements;
} Matrix;
// Matrix multiplication kernel called by MatMul():
// each thread computes one element of C by accumulating results into Cvalue
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}
// Matrix multiplication - host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    // Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

int main()
{
    Matrix matrix_1, matrix_2, matrix_out;
    int memsize = sizeof(float) * M * N;
    int memsize_out = sizeof(float) * M * M;
    // matrix_1 is M x N and matrix_2 is N x M,
    // so matrix_out = matrix_1 * matrix_2 is M x M
    matrix_1.height = matrix_2.width = M;
    matrix_1.width = matrix_2.height = N;
    matrix_out.width = matrix_out.height = M;
    cudaMallocHost((void**)&matrix_1.elements, memsize);
    cudaMallocHost((void**)&matrix_2.elements, memsize);
    cudaMallocHost((void**)&matrix_out.elements, memsize_out);
    for (int y = 0; y < M; y++)
        for (int x = 0; x < N; x++)
            matrix_1.elements[y * N + x] = (float)rand() / 1.0E5;
    for (int y = 0; y < N; y++)
        for (int x = 0; x < M; x++)
            matrix_2.elements[y * M + x] = (float)rand() / 1.0E5;
    cudaEvent_t start, stop_gpu;
    cudaEventCreate(&start);                 // create the timing events
    cudaEventCreate(&stop_gpu);
    cudaEventRecord(start, 0);               // record the start event
    MatMul(matrix_1, matrix_2, matrix_out);
    cudaEventRecord(stop_gpu, 0);            // record the stop event
    cudaEventSynchronize(stop_gpu);
    float time_gpu;
    cudaEventElapsedTime(&time_gpu, start, stop_gpu);   // elapsed time between the events
    printf("\n GPU time: %.4f ms \n", time_gpu);
    cudaEventDestroy(start);                 // destroy the events
    cudaEventDestroy(stop_gpu);
    for (int y = 0; y < M; y++)
    {
        printf("\n matrix_out[%d]:\n", y);
        for (int x = 0; x < M; x++)
            printf("%.2f ", matrix_out.elements[y * M + x]);
    }
    cudaFreeHost(matrix_1.elements);
    cudaFreeHost(matrix_2.elements);
    cudaFreeHost(matrix_out.elements);
    system("pause");
    return 0;
}
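Both programs include error.cuh, which is not reproduced here. As an assumption (the author's file is not shown), such a header conventionally provides a CHECK macro along the following lines; wrapping each runtime call, e.g. CHECK(cudaMalloc(&d_A.elements, size)), reports failures instead of silently ignoring them:

// A minimal sketch of what an error.cuh-style header typically contains
// (assumed, not the author's actual file)
#pragma once
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call)                                                   \
do {                                                                  \
    const cudaError_t err = (call);                                   \
    if (err != cudaSuccess)                                           \
    {                                                                 \
        printf("CUDA error: %s at %s:%d\n",                           \
               cudaGetErrorString(err), __FILE__, __LINE__);          \
        exit(1);                                                      \
    }                                                                 \
} while (0)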
Matrix multiplication using shared memory
Each thread block is responsible for computing one square sub-matrix Csub of C, and each thread within the block computes one element of Csub.
Csub is equal to the product of two rectangular matrices: the sub-matrix of A of dimension (A.width, block_size) that has the same row indices as Csub, and the sub-matrix of B of dimension (block_size, A.width) that has the same column indices as Csub.
To fit into the device's resources, these two rectangular matrices are divided into as many square matrices of dimension block_size as necessary, and Csub is computed as the sum of the products of these square matrices.
Each of these products is performed by first loading the two corresponding square matrices from global memory to shared memory, with one thread loading one element of each matrix, and then having each thread compute one element of the product. Each thread accumulates the result of each of these products into a register and, once done, writes the result to global memory.
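Concretely, with the constants used below (BLOCK_SIZE = 16 and A.width = N = 2000), each block loops over 2000 / 16 = 125 pairs of 16 × 16 tiles; every iteration loads one tile of A and one tile of B into shared memory, accumulates 16 multiply-adds per thread, and synchronizes twice.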
Number of accesses:
By blocking the computation this way, we take advantage of the fast shared memory and save a great deal of global-memory bandwidth: A is now read from global memory only (B.width / block_size) times, and B only (A.height / block_size) times.
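For the dimensions below this means each element of A is now fetched 80 / 16 = 5 times instead of 80, and likewise for B: a BLOCK_SIZE-fold (16×) reduction in global-memory traffic, from 25,600,000 loads in the naive version to 1,600,000.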
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <malloc.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "error.cuh"

#define M 80
#define N 2000
// Thread block size
#define BLOCK_SIZE 16
// ----------- Matrix multiplication with shared memory -----------
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
// stride is the row pitch of the full matrix that a sub-matrix view points into
typedef struct
{
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;
// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}
// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}
// Get the BLOCK_SIZE x BLOCK_SIZE sub-matrix Asub of A that is located
// col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    // The sub-matrix is a view into A, so it keeps A's row stride
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);
    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Block row and column
    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;
    // Each thread block computes one sub-matrix Csub of C
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);
    // Each thread computes one element of Csub by accumulating results into Cvalue
    float Cvalue = 0;
    // Thread row and column within Csub
    int row = threadIdx.y;
    int col = threadIdx.x;
    // Loop over all the sub-matrices of A and B required to compute Csub,
    // multiplying each pair of sub-matrices and accumulating the results
    for (int i = 0; i < (A.width / BLOCK_SIZE); ++i)
    {
        // Get sub-matrix Asub of A
        Matrix Asub = GetSubMatrix(A, blockRow, i);
        // Get sub-matrix Bsub of B
        Matrix Bsub = GetSubMatrix(B, i, blockCol);
        // Shared memory used to store Asub and Bsub respectively
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load Asub and Bsub from device memory to shared memory;
        // each thread loads one element of each sub-matrix
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);
        // Synchronize to make sure the sub-matrices are loaded before starting the computation
        __syncthreads();
        // Multiply Asub and Bsub together
        for (int j = 0; j < BLOCK_SIZE; ++j)
            Cvalue += As[row][j] * Bs[j][col];
        // Synchronize to make sure the preceding computation is done
        // before loading two new sub-matrices in the next iteration
        __syncthreads();
    }
    // Write Csub to device memory; each thread writes one element
    SetElement(Csub, row, col, Cvalue);
}

int main()
{
    Matrix matrix_1, matrix_2, matrix_out;
    int memsize = sizeof(float) * M * N;
    int memsize_out = sizeof(float) * M * M;
    // matrix_1 is M x N and matrix_2 is N x M,
    // so matrix_out = matrix_1 * matrix_2 is M x M
    matrix_1.height = matrix_2.width = M;
    matrix_1.width = matrix_2.height = N;
    matrix_out.width = matrix_out.height = M;
    cudaMallocHost((void**)&matrix_1.elements, memsize);
    cudaMallocHost((void**)&matrix_2.elements, memsize);
    cudaMallocHost((void**)&matrix_out.elements, memsize_out);
    for (int y = 0; y < M; y++)
        for (int x = 0; x < N; x++)
            matrix_1.elements[y * N + x] = (float)rand() / 1.0E5;
    for (int y = 0; y < N; y++)
        for (int x = 0; x < M; x++)
            matrix_2.elements[y * M + x] = (float)rand() / 1.0E5;
    cudaEvent_t start, stop_gpu;
    cudaEventCreate(&start);                 // create the timing events
    cudaEventCreate(&stop_gpu);
    cudaEventRecord(start, 0);               // record the start event
    MatMul(matrix_1, matrix_2, matrix_out);
    cudaEventRecord(stop_gpu, 0);            // record the stop event
    cudaEventSynchronize(stop_gpu);
    float time_gpu;
    cudaEventElapsedTime(&time_gpu, start, stop_gpu);   // elapsed time between the events
    printf("\n GPU time: %.4f ms \n", time_gpu);
    cudaEventDestroy(start);                 // destroy the events
    cudaEventDestroy(stop_gpu);
    for (int y = 0; y < M; y++)
    {
        printf("\n matrix_out[%d]:\n", y);
        for (int x = 0; x < M; x++)
            printf("%.2f ", matrix_out.elements[y * M + x]);
    }
    cudaFreeHost(matrix_1.elements);
    cudaFreeHost(matrix_2.elements);
    cudaFreeHost(matrix_out.elements);
    system("pause");
    return 0;
}
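Both versions build with nvcc in the usual way, e.g. nvcc matmul_shared.cu -o matmul_shared (the file name here is arbitrary); system("pause") is Windows-specific and can be removed on other platforms. Running the two programs and comparing the printed GPU time shows the effect of blocking through shared memory on this 80 × 2000 by 2000 × 80 product.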