共享內存(Shared Memory)
1.是一種低延遲、高帶寬的片上內存
2.由同一個Block內的所有線程共享
3.生命周期與Block相同
4.訪問延遲遠低於全局內存(經驗上約為未命中緩存的全局內存訪問延遲的1/100量級,具體數值因GPU架構而異)
Block(線程塊)
1.GPU執行的基本單位,包含一組線程
2.多個Block組成Grid(網格)
3.Block內的線程可以通過共享內存通信
4.Block之間是獨立執行的
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>

/*
 * CHECK(call): evaluate a CUDA runtime call and abort the program when it
 * does not return cudaSuccess. Prints the source location, the numeric error
 * code and the runtime's error string, then exits with status 1.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define CHECK(call)                                                         \
    do {                                                                    \
        const cudaError_t error = (call);                                   \
        if (error != cudaSuccess) {                                         \
            printf("Error: %s: %d\n", __FILE__, __LINE__);                  \
            printf("code :%d reason :%s\n", (int)error,                     \
                   cudaGetErrorString(error));                              \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

/*
 * Fill ip[0 .. size-1] with the values 0, 1, 2, ... so that every element
 * holds its own linear index.
 */
void initialInt(int *ip, int size)
{
    for (int i = 0; i < size; i++) {
        ip[i] = i;
    }
}

/*
 * Print a row-major nx-by-ny integer matrix to stdout, one row per line,
 * each value formatted with "%3d".
 *
 *   C  - pointer to nx * ny ints stored row by row
 *   nx - number of columns (elements per row)
 *   ny - number of rows
 */
void printMatrix(int *C, const int nx, const int ny)
{
    int *ic = C;
    printf("\n matrix : (%d, %d)\n", nx, ny);
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            printf("%3d", ic[ix]);
        }
        ic += nx;  /* advance to the start of the next row */
        printf("\n");
    }
    printf("\n");
}

/*
 * Demo kernel: every thread copies its element of A into a block-local
 * shared-memory tile, then prints its thread/block indices, its global
 * coordinate and linear index, the value it loaded, and smem[0][0] (the
 * value stored by thread (0,0) of the same block).
 *
 *   A  - device pointer to a row-major nx-by-ny int matrix
 *   nx - number of columns
 *   ny - number of rows
 *
 * Launch layout: 2D grid of 2D blocks. The shared tile is statically sized
 * BM x BN = 2 x 4 floats, so the intended block shape is dim3(4, 2). Threads
 * whose threadIdx falls outside the tile do not touch shared memory, and
 * threads whose global coordinate falls outside the matrix do not read A or
 * print.
 */
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    const int BM = 2;  /* tile rows    (matches blockDim.y of the demo launch) */
    const int BN = 4;  /* tile columns (matches blockDim.x of the demo launch) */
    __shared__ float smem[BM][BN];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    const int idx = iy * nx + ix;

    const bool inMatrix = (ix < nx) && (iy < ny);
    const bool inTile = (tx < BN) && (ty < BM);

    /* One load per thread: global memory -> shared tile. Out-of-matrix
     * threads store 0 so the tile never holds uninitialized data. */
    if (inTile) {
        smem[ty][tx] = inMatrix ? float(A[idx]) : 0.0f;
    }

    /* Barrier placed outside any branch so every thread of the block reaches
     * it. It makes the store to smem[0][0] by thread (0,0) visible to the
     * other threads before they read it below. */
    __syncthreads();

    if (inMatrix) {
        printf("threadidx: (%d ,%d) blockidx:(%d ,%d) coordinate: (%d ,%d) global index: (%2d ival %2d), smem val (%f) \n",
               tx, ty,
               (int)blockIdx.x, (int)blockIdx.y,
               ix, iy,
               idx, A[idx],
               /* smem[ty][tx] would print this thread's own element instead */
               smem[0][0]);
    }
}

/*
 * Host driver: builds an 8 x 6 int matrix holding 0..47, prints it, copies
 * it to the device, and launches printThreadIndex with 4 x 2 blocks on a
 * ceil-divided grid. Every CUDA runtime call is checked with CHECK.
 */
int main(int argc, char **argv)
{
    (void)argc;
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    /* set matrix dimensions */
    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    /* The buffers hold ints, so size them with sizeof(int). */
    size_t nBytes = (size_t)nxy * sizeof(int);

    /* allocate and initialize host memory */
    int *h_A = (int *)malloc(nBytes);
    if (h_A == NULL) {
        fprintf(stderr, "host malloc of %zu bytes failed\n", nBytes);
        return 1;
    }
    initialInt(h_A, nxy);
    printMatrix(h_A, nx, ny);

    /* allocate device memory and upload the matrix */
    int *d_MatA = NULL;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));

    /* Block shape must match the kernel's 2 x 4 shared tile (x = 4, y = 2). */
    dim3 block(4, 2);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    printf("execution config grid (%d, %d), block (%d, %d)\n",
           (int)grid.x, (int)grid.y, (int)block.x, (int)block.y);

    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    CHECK(cudaGetLastError());       /* launch-configuration errors */
    CHECK(cudaDeviceSynchronize());  /* execution errors; also flushes device printf */

    CHECK(cudaFree(d_MatA));
    free(h_A);
    CHECK(cudaDeviceReset());
    return 0;
}
示例代碼中,block的大小是(4,2),即x方向4個線程、y方向2個線程,所以在核函數中聲明一塊2行4列的SMEM(smem[BM][BN],BM=2、BN=4,共8個元素)。每個線程只需要做一次load操作,block內的8個線程就會把各自對應的數據load進這塊SMEM里。
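在做printf的時候,因為SMEM對同一個block內的所有線程可見,所以每個線程訪問SMEM[0][0]打印出來的都是該block里第一個線程(threadIdx為(0,0))load進去的數據。需要注意:嚴格來說,在讀取其他線程寫入的SMEM之前,應先調用__syncthreads()做一次塊內同步,否則寫入和讀取之間存在數據競爭,結果不保證正確。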
輸出如下:
 matrix : (8, 6)
  0  1  2  3  4  5  6  7
  8  9 10 11 12 13 14 15
 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39
 40 41 42 43 44 45 46 47

execution config grid (2, 3), block (4, 2)
threadidx: (0 ,0) blockidx:(0 ,1) coordinate: (0 ,2) global index: (16 ival 16), smem val (16.000000)
threadidx: (1 ,0) blockidx:(0 ,1) coordinate: (1 ,2) global index: (17 ival 17), smem val (16.000000)
threadidx: (2 ,0) blockidx:(0 ,1) coordinate: (2 ,2) global index: (18 ival 18), smem val (16.000000)
threadidx: (3 ,0) blockidx:(0 ,1) coordinate: (3 ,2) global index: (19 ival 19), smem val (16.000000)
threadidx: (0 ,1) blockidx:(0 ,1) coordinate: (0 ,3) global index: (24 ival 24), smem val (16.000000)
threadidx: (1 ,1) blockidx:(0 ,1) coordinate: (1 ,3) global index: (25 ival 25), smem val (16.000000)
threadidx: (2 ,1) blockidx:(0 ,1) coordinate: (2 ,3) global index: (26 ival 26), smem val (16.000000)
threadidx: (3 ,1) blockidx:(0 ,1) coordinate: (3 ,3) global index: (27 ival 27), smem val (16.000000)
threadidx: (0 ,0) blockidx:(1 ,1) coordinate: (4 ,2) global index: (20 ival 20), smem val (20.000000)
threadidx: (1 ,0) blockidx:(1 ,1) coordinate: (5 ,2) global index: (21 ival 21), smem val (20.000000)
threadidx: (2 ,0) blockidx:(1 ,1) coordinate: (6 ,2) global index: (22 ival 22), smem val (20.000000)
threadidx: (3 ,0) blockidx:(1 ,1) coordinate: (7 ,2) global index: (23 ival 23), smem val (20.000000)
threadidx: (0 ,1) blockidx:(1 ,1) coordinate: (4 ,3) global index: (28 ival 28), smem val (20.000000)
threadidx: (1 ,1) blockidx:(1 ,1) coordinate: (5 ,3) global index: (29 ival 29), smem val (20.000000)
threadidx: (2 ,1) blockidx:(1 ,1) coordinate: (6 ,3) global index: (30 ival 30), smem val (20.000000)
threadidx: (3 ,1) blockidx:(1 ,1) coordinate: (7 ,3) global index: (31 ival 31), smem val (20.000000)
threadidx: (0 ,0) blockidx:(1 ,0) coordinate: (4 ,0) global index: ( 4 ival 4), smem val (4.000000)
threadidx: (1 ,0) blockidx:(1 ,0) coordinate: (5 ,0) global index: ( 5 ival 5), smem val (4.000000)
threadidx: (2 ,0) blockidx:(1 ,0) coordinate: (6 ,0) global index: ( 6 ival 6), smem val (4.000000)
threadidx: (3 ,0) blockidx:(1 ,0) coordinate: (7 ,0) global index: ( 7 ival 7), smem val (4.000000)
threadidx: (0 ,1) blockidx:(1 ,0) coordinate: (4 ,1) global index: (12 ival 12), smem val (4.000000)
threadidx: (1 ,1) blockidx:(1 ,0) coordinate: (5 ,1) global index: (13 ival 13), smem val (4.000000)
threadidx: (2 ,1) blockidx:(1 ,0) coordinate: (6 ,1) global index: (14 ival 14), smem val (4.000000)
threadidx: (3 ,1) blockidx:(1 ,0) coordinate: (7 ,1) global index: (15 ival 15), smem val (4.000000)
threadidx: (0 ,0) blockidx:(0 ,2) coordinate: (0 ,4) global index: (32 ival 32), smem val (32.000000)
threadidx: (1 ,0) blockidx:(0 ,2) coordinate: (1 ,4) global index: (33 ival 33), smem val (32.000000)
threadidx: (2 ,0) blockidx:(0 ,2) coordinate: (2 ,4) global index: (34 ival 34), smem val (32.000000)
threadidx: (3 ,0) blockidx:(0 ,2) coordinate: (3 ,4) global index: (35 ival 35), smem val (32.000000)
threadidx: (0 ,1) blockidx:(0 ,2) coordinate: (0 ,5) global index: (40 ival 40), smem val (32.000000)
threadidx: (1 ,1) blockidx:(0 ,2) coordinate: (1 ,5) global index: (41 ival 41), smem val (32.000000)
threadidx: (2 ,1) blockidx:(0 ,2) coordinate: (2 ,5) global index: (42 ival 42), smem val (32.000000)
threadidx: (3 ,1) blockidx:(0 ,2) coordinate: (3 ,5) global index: (43 ival 43), smem val (32.000000)
threadidx: (0 ,0) blockidx:(0 ,0) coordinate: (0 ,0) global index: ( 0 ival 0), smem val (0.000000)
threadidx: (1 ,0) blockidx:(0 ,0) coordinate: (1 ,0) global index: ( 1 ival 1), smem val (0.000000)
threadidx: (2 ,0) blockidx:(0 ,0) coordinate: (2 ,0) global index: ( 2 ival 2), smem val (0.000000)
threadidx: (3 ,0) blockidx:(0 ,0) coordinate: (3 ,0) global index: ( 3 ival 3), smem val (0.000000)
threadidx: (0 ,1) blockidx:(0 ,0) coordinate: (0 ,1) global index: ( 8 ival 8), smem val (0.000000)
threadidx: (1 ,1) blockidx:(0 ,0) coordinate: (1 ,1) global index: ( 9 ival 9), smem val (0.000000)
threadidx: (2 ,1) blockidx:(0 ,0) coordinate: (2 ,1) global index: (10 ival 10), smem val (0.000000)
threadidx: (3 ,1) blockidx:(0 ,0) coordinate: (3 ,1) global index: (11 ival 11), smem val (0.000000)
threadidx: (0 ,0) blockidx:(1 ,2) coordinate: (4 ,4) global index: (36 ival 36), smem val (36.000000)
threadidx: (1 ,0) blockidx:(1 ,2) coordinate: (5 ,4) global index: (37 ival 37), smem val (36.000000)
threadidx: (2 ,0) blockidx:(1 ,2) coordinate: (6 ,4) global index: (38 ival 38), smem val (36.000000)
threadidx: (3 ,0) blockidx:(1 ,2) coordinate: (7 ,4) global index: (39 ival 39), smem val (36.000000)
threadidx: (0 ,1) blockidx:(1 ,2) coordinate: (4 ,5) global index: (44 ival 44), smem val (36.000000)
threadidx: (1 ,1) blockidx:(1 ,2) coordinate: (5 ,5) global index: (45 ival 45), smem val (36.000000)
threadidx: (2 ,1) blockidx:(1 ,2) coordinate: (6 ,5) global index: (46 ival 46), smem val (36.000000)
threadidx: (3 ,1) blockidx:(1 ,2) coordinate: (7 ,5) global index: (47 ival 47), smem val (36.000000)