源碼安裝 HIPIFY 和應用示例,將cuda生態源碼轉化成HIP生態源碼

1，源碼下載

GitHub - ROCm/HIPIFY: HIPIFY: Convert CUDA to Portable C++ CodeHIPIFY: Convert CUDA to Portable C++ Code. Contribute to ROCm/HIPIFY development by creating an account on GitHub.https://github.com/ROCm/HIPIFY.git

git clone --recursive https://github.com/ROCm/HIPIFY.git

sudo apt install clang-dev

?2，編譯并安裝

2.1 通常方式

hipify-clang 文檔：

https://github.com/ROCm/HIPIFY/blob/amd-staging/docs/hipify-clang.md

編譯命令：

cmake -DCMAKE_INSTALL_PREFIX=../dist -DCMAKE_BUILD_TYPE=Release  ..

make -j install

此時 hipify-clang 會被安裝到 HIPIFY/dist/bin 中，

測試：

cd  ../dist/bin
hipify --help

如果系統中存在多個llvm版本，在執行翻譯命令時，比如hipify-clang ./vectorAdd.cu? --cuda-path=/usr/local/cuda-12.1可能會發生錯誤，如下提示：

CommandLine Error: Option 'static-func-full-module-prefix' registered more than once!
LLVM ERROR: inconsistency in registered CommandLine option

這時需要使用自制的LLVM，如下2.2節所示。

2.2 自制LLVM的方式

2.2.1? 下載llvm源碼

wget https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-17.0.6.tar.gz

解壓，tar zxf llvmorg.....

2.2.2? 配置編譯LLVM

cd llvmorg.....mkdir -p build ../dist/localcd buildcmake -G "Unix Makefiles" ../llvm      \
-DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;compiler-rt"           \
-DLLVM_BUILD_EXAMPLES=ON           -DLLVM_TARGETS_TO_BUILD="host"      \
-DCMAKE_BUILD_TYPE=Release           -DLLVM_ENABLE_ASSERTIONS=ON       \
-DLLVM_ENABLE_RUNTIMES=all             -DLLVM_BUILD_LLVM_DYLIB=ON      \
-DCMAKE_INSTALL_PREFIX=../../dist/local

make -j

make -j install

測試時，llvm 被install在如下文件夾：

/home/hipper/ex_dock_hipify/dist/local

ls /home/hipper/ex_dock_hipify/dist/local 如圖：

2.2.3 配置編譯HIPIFY

指定 LLVM 安裝目錄的配置方法：

-DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local


cmake \-DCMAKE_BUILD_TYPE=Release \-DCMAKE_INSTALL_PREFIX=../dist \-DCMAKE_PREFIX_PATH=/home/hipper/ex_dock_hipify/dist/local \-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-12.1  ..

make -j install

3. 示例

3.1翻譯 .cu 文件到 .hip 文件

命令：

/home/hipper/ex_dock_hipify/HIPIFY/dist/bin/hipify-clang ./vectorAdd.cu  --cuda-path=/usr/local/cuda-12.1

會在 ./ 目錄中生成 vectoreAdd.cu.hip 的文件。

其中，hipify-clang 并不檢查輸入文件的擴展名，比如這里的.cu，它只檢查文件內部的內容，將cuda生態的關鍵字有機地翻譯成 hip生態的關鍵字，輸出文件會在原文件名的基礎上加上 .hip 后綴；

源代碼分別如下。

使用 cuda samples中的vectoradd.cu為例，源碼如下：

vectorAdd.cu

#include <stdio.h>
#include <cuda_runtime.h>__global__ void vectorAdd(const float *A, const float *B, float *C,int numElements) {int i = blockDim.x * blockIdx.x + threadIdx.x;if (i < numElements) {C[i] = A[i] + B[i] + 0.0f;}
}int main(void) {cudaError_t err = cudaSuccess;int numElements = 50000;size_t size = numElements * sizeof(float);printf("[Vector addition of %d elements]\n", numElements);float *h_A = (float *)malloc(size);float *h_B = (float *)malloc(size);float *h_C = (float *)malloc(size);if (h_A == NULL || h_B == NULL || h_C == NULL) {fprintf(stderr, "Failed to allocate host vectors!\n");exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {h_A[i] = rand() / (float)RAND_MAX;h_B[i] = rand() / (float)RAND_MAX;}float *d_A = NULL;err = cudaMalloc((void **)&d_A, size);if (err != cudaSuccess) {fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}float *d_B = NULL;err = cudaMalloc((void **)&d_B, size);if (err != cudaSuccess) {fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}float *d_C = NULL;err = cudaMalloc((void **)&d_C, size);if (err != cudaSuccess) {fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy input data from the host memory to the CUDA device\n");err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);if (err != cudaSuccess) {fprintf(stderr,"Failed to copy vector A from host to device (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);if (err != cudaSuccess) {fprintf(stderr,"Failed to copy vector B from host to device (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}int threadsPerBlock = 256;int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,threadsPerBlock);vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);err = cudaGetLastError();if (err != cudaSuccess) {fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy output data from the CUDA device to the host memory\n");err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);if (err != cudaSuccess) {fprintf(stderr,"Failed to copy vector C from device to host (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n");err = cudaFree(d_A);if (err != cudaSuccess) {fprintf(stderr, "Failed to free device vector A (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}err = cudaFree(d_B);if (err != cudaSuccess) {fprintf(stderr, "Failed to free device vector B (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}err = cudaFree(d_C);if (err != cudaSuccess) {fprintf(stderr, "Failed to free device vector C (error code %s)!\n",cudaGetErrorString(err));exit(EXIT_FAILURE);}free(h_A);free(h_B);free(h_C);printf("Done\n");return 0;
}

生成的 vectorAdd.cu.hip :

#include <stdio.h>
#include <hip/hip_runtime.h>__global__ void vectorAdd(const float *A, const float *B, float *C,int numElements) {int i = blockDim.x * blockIdx.x + threadIdx.x;if (i < numElements) {C[i] = A[i] + B[i] + 0.0f;}
}int main(void) {hipError_t err = hipSuccess;int numElements = 50000;size_t size = numElements * sizeof(float);printf("[Vector addition of %d elements]\n", numElements);float *h_A = (float *)malloc(size);float *h_B = (float *)malloc(size);float *h_C = (float *)malloc(size);if (h_A == NULL || h_B == NULL || h_C == NULL) {fprintf(stderr, "Failed to allocate host vectors!\n");exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {h_A[i] = rand() / (float)RAND_MAX;h_B[i] = rand() / (float)RAND_MAX;}float *d_A = NULL;err = hipMalloc((void **)&d_A, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_B = NULL;err = hipMalloc((void **)&d_B, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}float *d_C = NULL;err = hipMalloc((void **)&d_C, size);if (err != hipSuccess) {fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy input data from the host memory to the CUDA device\n");err = hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector A from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector B from host to device (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}int threadsPerBlock = 256;int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,threadsPerBlock);vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);err = hipGetLastError();if (err != hipSuccess) {fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}printf("Copy output data from the CUDA device to the host memory\n");err = hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost);if (err != hipSuccess) {fprintf(stderr,"Failed to copy vector C from device to host (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}for (int i = 0; i < numElements; ++i) {if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n");err = hipFree(d_A);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector A (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_B);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector B (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}err = hipFree(d_C);if (err != hipSuccess) {fprintf(stderr, "Failed to free device vector C (error code %s)!\n",hipGetErrorString(err));exit(EXIT_FAILURE);}free(h_A);free(h_B);free(h_C);printf("Done\n");return 0;
}

3.2 編譯運行 vectorAdd.cu.hip

編譯：

$ /opt/rocm/bin/hipcc ./vectorAdd.cu.hip -o vectorAdd

運行效果如下圖：