MPI Code for Ghost Data Exchange in 3D Domain Decomposition with Multi-GPUs

Here’s an MPI + CUDA example that demonstrates ghost (halo) data exchange for a 3D domain decomposition across multiple GPUs. The implementation assumes a CUDA-aware MPI library, so device pointers can be passed directly to MPI calls for efficient GPU-to-GPU communication.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

// Define the dimensions of the global domain
#define NX_GLOBAL 256
#define NY_GLOBAL 256
#define NZ_GLOBAL 256

// Define the ghost layer width
#define GHOST_WIDTH 2

// Macro for checking CUDA errors
#define CUDA_CHECK(cmd) do {                                             \
    cudaError_t error = (cmd);                                           \
    if (error != cudaSuccess) {                                          \
        printf("<%s>:%i [CUDA] Error: %s\n",                             \
               __FILE__, __LINE__, cudaGetErrorString(error));           \
        exit(EXIT_FAILURE);                                              \
    }                                                                    \
} while (0)

// Structure to hold domain decomposition information
typedef struct {
    int rank;                   // MPI rank
    int size;                   // Number of MPI processes
    int dims[3];                // Process grid dimensions
    int coords[3];              // Process coordinates in grid
    MPI_Comm cart_comm;         // Cartesian communicator

    // Local domain sizes (excluding ghost layers)
    int nx, ny, nz;

    // Starting indices in global domain
    int start_x, start_y, start_z;

    // Device pointers for data and ghost buffers
    double *d_data;             // Main data array on device
    double *d_send_buffers[6];  // Send buffers for each face
    double *d_recv_buffers[6];  // Receive buffers for each face
} DomainInfo;

// Initialize domain decomposition
void init_domain_decomposition(DomainInfo *domain, int argc, char **argv) {
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &domain->rank);
    MPI_Comm_size(MPI_COMM_WORLD, &domain->size);

    // Create a 3D Cartesian topology
    int periods[3] = {1, 1, 1}; // Periodic boundaries
    domain->dims[0] = domain->dims[1] = domain->dims[2] = 0;
    MPI_Dims_create(domain->size, 3, domain->dims);
    MPI_Cart_create(MPI_COMM_WORLD, 3, domain->dims, periods, 1, &domain->cart_comm);
    MPI_Cart_coords(domain->cart_comm, domain->rank, 3, domain->coords);

    // Calculate local domain sizes (excluding ghost layers)
    domain->nx = NX_GLOBAL / domain->dims[0];
    domain->ny = NY_GLOBAL / domain->dims[1];
    domain->nz = NZ_GLOBAL / domain->dims[2];

    // Calculate starting indices in global domain
    domain->start_x = domain->coords[0] * domain->nx;
    domain->start_y = domain->coords[1] * domain->ny;
    domain->start_z = domain->coords[2] * domain->nz;

    // Adjust for non-divisible cases: the last process along each axis takes the remainder
    if (domain->coords[0] == domain->dims[0] - 1) domain->nx = NX_GLOBAL - domain->start_x;
    if (domain->coords[1] == domain->dims[1] - 1) domain->ny = NY_GLOBAL - domain->start_y;
    if (domain->coords[2] == domain->dims[2] - 1) domain->nz = NZ_GLOBAL - domain->start_z;

    // Allocate device memory for the main data array (including ghost layers)
    size_t data_size = (size_t)(domain->nx + 2 * GHOST_WIDTH) *
                       (domain->ny + 2 * GHOST_WIDTH) *
                       (domain->nz + 2 * GHOST_WIDTH) * sizeof(double);
    CUDA_CHECK(cudaMalloc(&domain->d_data, data_size));
    CUDA_CHECK(cudaMemset(domain->d_data, 0, data_size));

    // Allocate send and receive buffers for each face
    for (int i = 0; i < 6; i++) {
        size_t buffer_size = 0;
        if (i < 2) {        // X faces
            buffer_size = (size_t)GHOST_WIDTH * (domain->ny + 2 * GHOST_WIDTH) *
                          (domain->nz + 2 * GHOST_WIDTH) * sizeof(double);
        } else if (i < 4) { // Y faces
            buffer_size = (size_t)(domain->nx + 2 * GHOST_WIDTH) * GHOST_WIDTH *
                          (domain->nz + 2 * GHOST_WIDTH) * sizeof(double);
        } else {            // Z faces
            buffer_size = (size_t)(domain->nx + 2 * GHOST_WIDTH) *
                          (domain->ny + 2 * GHOST_WIDTH) * GHOST_WIDTH * sizeof(double);
        }
        CUDA_CHECK(cudaMalloc(&domain->d_send_buffers[i], buffer_size));
        CUDA_CHECK(cudaMalloc(&domain->d_recv_buffers[i], buffer_size));
    }
}

// Pack data into send buffers
__global__ void pack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2 * ghost_width;
    int nz_tot = nz + 2 * ghost_width;
    if (j < ny_tot && k < nz_tot) {
        // Copy the ghost_width interior planes adjacent to the boundary:
        // face 0 (left) sends i = ghost_width .. 2*ghost_width-1,
        // face 1 (right) sends i = nx .. nx+ghost_width-1
        for (int g = 0; g < ghost_width; g++) {
            int i = (face == 0) ? (ghost_width + g) : (nx + g);
            int src_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}

// Similar pack kernels for y and z faces would be defined here...

// Unpack data from receive buffers
__global__ void unpack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                              int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2 * ghost_width;
    int nz_tot = nz + 2 * ghost_width;
    if (j < ny_tot && k < nz_tot) {
        // Fill the ghost_width ghost planes on the given side:
        // face 0 (left) fills i = 0 .. ghost_width-1,
        // face 1 (right) fills i = nx+ghost_width .. nx+2*ghost_width-1
        for (int g = 0; g < ghost_width; g++) {
            int i = (face == 0) ? g : (nx + ghost_width + g);
            int dest_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}

// Similar unpack kernels for y and z faces would be defined here...

// Exchange ghost data with neighbors
void exchange_ghost_data(DomainInfo *domain) {
    MPI_Request requests[12]; // up to 6 sends + 6 receives
    MPI_Status statuses[12];
    int request_count = 0;

    // Determine neighbor ranks
    int neighbors[6];
    MPI_Cart_shift(domain->cart_comm, 0, 1, &neighbors[0], &neighbors[1]); // X direction
    MPI_Cart_shift(domain->cart_comm, 1, 1, &neighbors[2], &neighbors[3]); // Y direction
    MPI_Cart_shift(domain->cart_comm, 2, 1, &neighbors[4], &neighbors[5]); // Z direction

    // Number of elements exchanged across each face
    size_t face_count[6];
    face_count[0] = face_count[1] = (size_t)GHOST_WIDTH *
        (domain->ny + 2 * GHOST_WIDTH) * (domain->nz + 2 * GHOST_WIDTH);
    face_count[2] = face_count[3] = (size_t)(domain->nx + 2 * GHOST_WIDTH) *
        GHOST_WIDTH * (domain->nz + 2 * GHOST_WIDTH);
    face_count[4] = face_count[5] = (size_t)(domain->nx + 2 * GHOST_WIDTH) *
        (domain->ny + 2 * GHOST_WIDTH) * GHOST_WIDTH;

    // Post receives first. The message arriving on face f was sent from the
    // neighbor's opposite face, so the matching tag is f ^ 1 (0<->1, 2<->3, 4<->5).
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            MPI_Irecv(domain->d_recv_buffers[face], (int)face_count[face], MPI_DOUBLE,
                      neighbors[face], face ^ 1, domain->cart_comm,
                      &requests[request_count++]);
        }
    }

    // Pack and send data
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate pack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2 * GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2 * GHOST_WIDTH + block.y - 1) / block.y;
                pack_x_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
                                             domain->nx, domain->ny, domain->nz,
                                             GHOST_WIDTH, face);
            } // Similar launches for y and z faces would go here...
            CUDA_CHECK(cudaDeviceSynchronize());

            MPI_Isend(domain->d_send_buffers[face], (int)face_count[face], MPI_DOUBLE,
                      neighbors[face], face, domain->cart_comm,
                      &requests[request_count++]);
        }
    }

    // Wait for all communications to complete
    MPI_Waitall(request_count, requests, statuses);

    // Unpack received data
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate unpack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2 * GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2 * GHOST_WIDTH + block.y - 1) / block.y;
                unpack_x_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
                                               domain->nx, domain->ny, domain->nz,
                                               GHOST_WIDTH, face);
            } // Similar launches for y and z faces would go here...
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }
}

// Clean up resources
void cleanup(DomainInfo *domain) {
    CUDA_CHECK(cudaFree(domain->d_data));
    for (int i = 0; i < 6; i++) {
        CUDA_CHECK(cudaFree(domain->d_send_buffers[i]));
        CUDA_CHECK(cudaFree(domain->d_recv_buffers[i]));
    }
    MPI_Finalize();
}

int main(int argc, char **argv) {
    DomainInfo domain;
    init_domain_decomposition(&domain, argc, argv);

    // Main simulation loop would go here
    for (int step = 0; step < 10; step++) {
        // Perform computation on local domain

        // Exchange ghost data
        exchange_ghost_data(&domain);

        // Continue computation
    }

    cleanup(&domain);
    return 0;
}
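
One detail the listing leaves implicit is how each rank selects its GPU; as written, every rank on a node would use device 0. A common pattern is to split the world communicator by shared-memory node and use the node-local rank as the device index. The helper below is a minimal sketch (select_gpu_for_rank is an illustrative name, not part of the listing); it would be called right after MPI_Init, before any CUDA allocations.

// Sketch: bind each MPI rank to one GPU on its node.
// Call right after MPI_Init and before any cudaMalloc.
void select_gpu_for_rank(void) {
    MPI_Comm node_comm;
    int local_rank, num_devices;

    // Group the ranks that share a node; the node-local rank becomes the GPU index
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);

    CUDA_CHECK(cudaGetDeviceCount(&num_devices));
    CUDA_CHECK(cudaSetDevice(local_rank % num_devices)); // round-robin if ranks > GPUs

    MPI_Comm_free(&node_comm);
}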

Key Features of This Implementation:

  1. 3D Domain Decomposition:

    • Uses MPI Cartesian topology for logical process arrangement
    • Handles both divisible and non-divisible domain sizes
  2. GPU-Aware MPI:

    • Directly passes GPU pointers to MPI calls (requires CUDA-aware MPI; see the run-time check sketched after this list)
    • Avoids host-staging for better performance
  3. Efficient Communication:

    • Non-blocking sends and receives for overlap opportunities
    • Separate buffers for each face to prevent contention
  4. Kernel-Based Packing/Unpacking:

    • CUDA kernels for efficient data movement between main array and buffers
    • Parallel packing/unpacking operations
  5. Flexible Ghost Layer Width:

    • Configurable ghost layer size via GHOST_WIDTH macro
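
Because point 2 relies on passing device pointers straight into MPI calls, it can be worth verifying at run time that the MPI library was actually built with CUDA support. The check below is a sketch that only covers Open MPI (mpi-ext.h and MPIX_Query_cuda_support are Open MPI extensions); other implementations document their own mechanisms.

#if defined(OPEN_MPI)
#include <mpi-ext.h> // Open MPI extension header; defines MPIX_CUDA_AWARE_SUPPORT
#endif

// Returns 1 if the MPI library reports CUDA-aware support at run time, 0 otherwise.
// Open MPI-specific sketch; other MPI libraries expose this differently.
static int mpi_is_cuda_aware(void) {
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    return MPIX_Query_cuda_support();
#else
    return 0; // no compile-time CUDA support detected
#endif
}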

Requirements:

  1. MPI implementation with CUDA-aware support (OpenMPI, MVAPICH2, etc.)
  2. CUDA toolkit
  3. Compilation with nvcc and MPI compiler wrappers
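
One possible way to build and launch the example, assuming Open MPI's mpicxx wrapper is on the PATH and the source file is saved as ghost_exchange.cu (the file name and rank count are placeholders):

# Compile device code with nvcc, using the MPI wrapper as the host compiler
nvcc -O3 -ccbin=mpicxx -o ghost_exchange ghost_exchange.cu

# Launch one rank per GPU, e.g. 8 ranks for a 2x2x2 process grid
mpirun -np 8 ./ghost_exchange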

Usage Notes:

  1. You’ll need to complete the pack/unpack kernels for Y and Z faces (omitted for brevity; a sketch of the Y-face pack kernel appears after these notes)
  2. The code assumes periodic boundaries (adjust periods array if needed)
  3. For optimal performance, tune the block/grid dimensions in the kernels
  4. Add error checking for MPI operations in production code
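
For reference, here is a sketch of what the Y-face pack kernel mentioned in note 1 could look like, using the same memory layout as pack_x_face (x slowest, z fastest). The matching unpack kernel and the Z-face pair follow the same pattern, and the launch grid should cover (nx + 2*GHOST_WIDTH) x (nz + 2*GHOST_WIDTH) threads.

// Sketch only: packs ghost_width interior planes adjacent to the low-Y (face 2)
// or high-Y (face 3) boundary into that face's send buffer.
__global__ void pack_y_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int nx_tot = nx + 2 * ghost_width;
    int ny_tot = ny + 2 * ghost_width;
    int nz_tot = nz + 2 * ghost_width;
    if (i < nx_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            int j = (face == 2) ? (ghost_width + g) : (ny + g);
            int src_idx = i * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = i * ghost_width * nz_tot + g * nz_tot + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}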

This implementation provides a solid foundation for multi-GPU simulations with 3D domain decomposition and ghost data exchange.
