cuda編程筆記（13）--使用CUB庫實現基本功能

CUB 是 NVIDIA 提供的 高性能 CUDA 基礎庫，包含常用的并行原語（Reduction、Scan、Histogram 等），可以極大簡化代碼，并且比手寫版本更優化。

CUB無需鏈接，只用包含<cub/cub.cuh>頭文件即可

需要先臨時獲取空間

CUB 內部需要額外的緩沖區來做并行歸約、掃描等操作，而這個緩沖區的大小依賴于 輸入數據量、算法、GPU 結構，編譯期無法確定。

通用函數接口-Device-level（設備級）

運行在整個設備（grid）范圍，需要全局內存臨時空間。

cub::DeviceReduce::Reduce

template <typename InputIteratorT,typename OutputIteratorT,typename ReductionOp,typename T>
static cudaError_t Reduce(void *d_temp_storage,        // 臨時存儲區指針size_t &temp_storage_bytes,  // 存儲區大小InputIteratorT d_in,         // 輸入迭代器（GPU 內存）OutputIteratorT d_out,       // 輸出迭代器（GPU 內存）int num_items,               // 輸入元素個數ReductionOp reduction_op,    // 歸約操作符（如加法、最大值）T init,                      // 歸約初始值cudaStream_t stream = 0);    // CUDA 流

d_temp_storage 和 temp_storage_bytes：兩階段調用機制（先獲取大小再分配）
d_in / d_out：輸入和輸出數組指針（在 GPU 上）
num_items：元素數量
reduction_op：歸約操作（例如 cub::Sum(), cub::Max()，或自定義 Lambda）
init：歸約起始值
stream：運行在哪個 CUDA stream 上

求和

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublas_v2.h>
#include <cufft.h>
#include<cub/cub.cuh>
#include <iostream>
#include<cstdio>
#include <vector>void error_handling(cudaError_t res) {if (res !=cudaSuccess) {std::cout << "error!" << std::endl;}
}
int main() {const int N = 8;float h_in[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };float* d_in, * d_out;cudaMalloc(&d_in, N * sizeof(float));cudaMalloc(&d_out,  sizeof(float));cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);void* d_temp_storage = nullptr;size_t temp_storage_bytes = 0;cub::DeviceReduce::Reduce(d_temp_storage,temp_storage_bytes,d_in,d_out,N,cub::Sum(),0.0f);// 分配臨時空間cudaMalloc(&d_temp_storage, temp_storage_bytes);// 第二次調用：執行歸約cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,d_in, d_out, N,cub::Sum(), 0.0f);float h_out;cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);std::cout << "Sum = " << h_out << std::endl;cudaFree(d_in);cudaFree(d_out);cudaFree(d_temp_storage);return 0;
}

當然，Reduce是通用版本，也有Sum的特化版本

  template <typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>cudaError_tstatic Sum(void* d_temp_storage,size_t& temp_storage_bytes,InputIteratorT d_in,OutputIteratorT d_out,NumItemsT num_items,cudaStream_t stream = 0);

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublas_v2.h>
#include <cufft.h>
#include<cub/cub.cuh>
#include <iostream>
#include<cstdio>
#include <vector>void error_handling(cudaError_t res) {if (res !=cudaSuccess) {std::cout << "error!" << std::endl;}
}
int main() {const int N = 8;float h_in[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };float* d_in, * d_out;cudaMalloc(&d_in, N * sizeof(float));cudaMalloc(&d_out,  sizeof(float));cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);void* d_temp_storage = nullptr;size_t temp_storage_bytes = 0;// 先獲取臨時空間大小cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, N);cudaMalloc(&d_temp_storage, temp_storage_bytes);// 執行 Reducecub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, N);float h_out;cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);std::cout << "Sum = " << h_out << std::endl;cudaFree(d_in);cudaFree(d_out);cudaFree(d_temp_storage);return 0;
}

自定義乘積

可以求數組所有元素的乘積；乘積Reduce沒有提供接口，可以自己寫一個可執行對象（仿函數類，lambda表達式等都可以）

這里使用lambda表達式

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublas_v2.h>
#include <cufft.h>
#include<cub/cub.cuh>
#include <iostream>
#include<cstdio>
#include <vector>void error_handling(cudaError_t res) {if (res !=cudaSuccess) {std::cout << "error!" << std::endl;}
}
int main() {const int N = 8;float h_in[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };float* d_in, * d_out;cudaMalloc(&d_in, N * sizeof(float));cudaMalloc(&d_out, sizeof(float));cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);void* d_temp_storage = nullptr;size_t temp_storage_bytes = 0;cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, N, []__device__(float a, float b) ->float{ return a * b; }, 1.0f);// 分配臨時空間cudaMalloc(&d_temp_storage, temp_storage_bytes);// 第二次調用：執行乘積cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, N, []__device__(float a, float b) ->float{ return a * b; }, 1.0f);float h_out;cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);std::cout << "mul = " << h_out << std::endl;cudaFree(d_in);cudaFree(d_out);cudaFree(d_temp_storage);return 0;
}

如果想要使用在設備使用lambda表達式，需要編譯時加上：

nvcc main.cu -o main --extended-lambda

如果用VS，打開項目屬性，在這里加：

（用仿函數類就不用開啟這個，如此即可）

struct MultiplyOp {__device__ float operator()(float a, float b) const {return a * b;}
};// 調用
cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,d_in, d_out, N,MultiplyOp(), 1.0f);

前綴和

前綴和有專門的函數

  template <typename InputIteratorT, typename OutputIteratorT>static cudaError_t ExclusiveSum(void* d_temp_storage,// 臨時存儲指針size_t& temp_storage_bytes,// 臨時存儲大小InputIteratorT d_in,// 輸入迭代器（指針）OutputIteratorT d_out,// 輸出迭代器（指針）int num_items,// 元素數量cudaStream_t stream = 0)// CUDA 流（可選）
;

使用起來沒有任何差別

#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cublas_v2.h>
#include <cufft.h>
#include<cub/cub.cuh>
#include <iostream>
#include<cstdio>
#include <vector>void error_handling(cudaError_t res) {if (res !=cudaSuccess) {std::cout << "error!" << std::endl;}
}
int main() {const int N = 8;float h_in[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };float* d_in, * d_out;cudaMalloc(&d_in, N * sizeof(float));cudaMalloc(&d_out, N*sizeof(float));cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);void* d_temp_storage = nullptr;size_t temp_storage_bytes = 0;// 獲取臨時空間大小cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, N);cudaMalloc(&d_temp_storage, temp_storage_bytes);// 執行 Exclusive Scancub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, N);float h_out[N];cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);std::cout << "Exclusive Scan: ";for (int i = 0; i < N; i++) std::cout << h_out[i] << " ";std::cout << std::endl;cudaFree(d_in);cudaFree(d_out);cudaFree(d_temp_storage);return 0;
}

ExclusiveScan 是前綴和，不包括當前元素：

輸入:   [1, 2, 3, 4, 5, 6, 7, 8]
輸出:   [0, 1, 3, 6,10,15,21,28]

Block-level （線程塊級）

作用

用于一個 Block 內的線程協作，通常替代手寫的 __shared__ + 手寫 reduce/scan。
比 Warp-level 更大范圍（整個 block），但不涉及 grid 級同步。
用途：塊內歸約、塊內前綴和、塊內排序。

線程塊級類的調用套路是一樣的

定義類型（typedef）
申請共享內存（TempStorage）
調用對象的方法

BlockReduce模板類

namespace cub {template <typename T,                // 數據類型，例如 float, intint BLOCK_DIM_X,           // 線程塊大小cub::BlockReduceAlgorithm ALGORITHM = cub::BLOCK_REDUCE_WARP_REDUCTIONS // 可選//還有一些其他模板參數，一般都可以忽略
>
class BlockReduce {
public:// 內部類型：臨時存儲struct TempStorage;// 構造函數：傳入共享內存__device__ __forceinline__ BlockReduce(TempStorage& temp_storage);// 常用方法：__device__ T Sum(T input); // 塊內所有線程求和template <typename ReductionOp>__device__ T Reduce(T input, ReductionOp reduction_op); // 自定義規約操作__device__ T Sum(T input, T identity); // 帶初始值的求和// 返回最大值和索引struct ArgMax { T value; int index; };__device__ ArgMax Reduce(T input, ReductionOp reduction_op, ArgMax identity);
};} // namespace cub

特點

不需要手寫循環/__syncthreads()，CUB 自動優化 bank conflict。
自定義操作用 .Reduce(val, binary_op)。

塊內歸約


__global__ void block_reduce_sum(float* d_in, float* d_out) {// 定義 BlockReduce 類型：數據類型 float，block 大小 256typedef cub::BlockReduce<float, 256> BlockReduceT;// 共享內存（臨時存儲）__shared__ typename BlockReduceT::TempStorage temp_storage;int tid = threadIdx.x + blockIdx.x * blockDim.x;float val = d_in[tid];// 每個 block 歸約，返回該 block 的總和float block_sum = BlockReduceT(temp_storage).Sum(val);if (threadIdx.x == 0) {d_out[blockIdx.x] = block_sum;  // 每個 block 的結果}
}

BlockReduce::Sum(T input) 里的 input 參數 是 當前線程貢獻的單個值，也就是參與規約的元素。

BlockReduce::Sum() 會在整個 線程塊（block）內，把所有線程的 input 值加起來，返回 塊內的總和。

自定義規約

 __device__ T Reduce(T input, ReductionOp reduction_op); // 自定義規約操作

該成員函數可以讓我們實現自定義的規約操作

參數	含義
`input`	每個線程的本地值，類型為 `T`。
`binary_op`	一個二元操作函數對象，類型為 `BinaryOp`，定義了如何將兩個 `T` 類型的值合并。例如：加法、乘法、最大值等。

比如可以用cub庫提供的可調用對象類

float block_sum = BlockReduceT(temp_storage).Reduce(val, cub::Sum());//規約加法
float block_prod = BlockReduceT(temp_storage).Reduce(val, cub::Multiply());//規約乘法

也可以自己實現仿函數類或lambda表達式；具體操作與上文Device級別的自定義乘積類似；

BlockScan

namespace cub {template <typename T,            // 數據類型int BLOCK_DIM_X,       // 線程塊大小cub::BlockScanAlgorithm ALGORITHM = cub::BLOCK_SCAN_WARP_SCANS // 可選
>
class BlockScan {
public:struct TempStorage;__device__ __forceinline__ BlockScan(TempStorage& temp_storage);// 前綴和（不包含自己）template <typename ScanOp>__device__ void ExclusiveScan(T input, T &output, ScanOp scan_op, T identity);// 前綴和（包含自己）template <typename ScanOp>__device__ void InclusiveScan(T input, T &output, ScanOp scan_op);// 常用簡化版本__device__ void ExclusiveSum(T input, T &output, T identity = 0);__device__ void InclusiveSum(T input, T &output);
};} // namespace cub

使用

typedef cub::BlockScan<int, 256> BlockScanT;
__shared__ typename BlockScanT::TempStorage temp_storage;
int output;
BlockScanT(temp_storage).ExclusiveSum(input, output);

BlockRadixSort

namespace cub {template <typename KeyT,           // 鍵類型int BLOCK_DIM_X,         // 線程塊大小typename ValueT = void,  // 可選，值類型int ITEMS_PER_THREAD = 1 // 每線程處理的元素個數
>
class BlockRadixSort {
public:struct TempStorage;__device__ __forceinline__ BlockRadixSort(TempStorage& temp_storage);// 對鍵排序（升序），排序后把屬于該線程的key更新__device__ void Sort(KeyT &key);// 降序__device__ void SortDescending(KeyT &key);// 鍵值對排序__device__ void Sort(KeyT &key, ValueT &value);__device__ void SortDescending(KeyT &key, ValueT &value);
};} // namespace cub

使用模式

#include <cub/cub.cuh>__global__ void block_sort(int *d_keys) {typedef cub::BlockRadixSort<int, 256> BlockRadixSortT;__shared__ typename BlockRadixSortT::TempStorage temp_storage;int thread_id = threadIdx.x + blockIdx.x * blockDim.x;int key = d_keys[thread_id];// 在 block 內排序（升序）BlockRadixSortT(temp_storage).Sort(key);// 寫回排序后的值d_keys[thread_id] = key;
}