Parallel Reading and Writing of HDF5 Files with MPI-IO
HDF5 supports parallel reading and writing through MPI-IO, which matters for large-scale scientific computing applications. Below are example programs in C++ and Fortran that show how to read and write an HDF5 file in parallel with MPI-IO.
Prerequisites
Before using HDF5 with MPI-IO, make sure that:
- the HDF5 library was built with MPI (parallel) support enabled (a quick way to check this is sketched after this list)
- the program is linked against the parallel build of the HDF5 libraries
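If you are unsure whether your installation has parallel support, a minimal compile-time check is sketched below. It relies on the H5_HAVE_PARALLEL macro that HDF5 defines in its configuration header when built with MPI; verify against your installation's H5pubconf.h.

#include <hdf5.h>
#ifndef H5_HAVE_PARALLEL
#error "This HDF5 installation was built without MPI (parallel) support"
#endif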
C++ Example
#include <hdf5.h>
#include <mpi.h>
#include <iostream>
#include <vector>

#define FILE_NAME "parallel.h5"
#define DATASET_NAME "IntArray"
#define DIM0 100   // global dimensions
#define DIM1 100

int main(int argc, char** argv) {
    // Initialize MPI
    MPI_Init(&argc, &argv);
    int mpi_rank, mpi_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

    // File access property list using the MPI-IO driver
    hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL);

    // Create the file collectively (truncate it if it already exists)
    hid_t file_id = H5Fcreate(FILE_NAME, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id);
    H5Pclose(plist_id);

    // Global dataset dimensions
    hsize_t dims[2] = {DIM0, DIM1};

    // Create the file dataspace and the dataset
    hid_t filespace = H5Screate_simple(2, dims, NULL);
    hid_t dset_id = H5Dcreate(file_id, DATASET_NAME, H5T_NATIVE_INT, filespace,
                              H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
    H5Sclose(filespace);

    // Each process writes a contiguous block of rows
    // (assumes DIM0 is divisible by the number of processes)
    hsize_t count[2] = {DIM0 / mpi_size, DIM1};
    hsize_t offset[2] = {mpi_rank * count[0], 0};

    // Select this process's hyperslab in the file dataspace
    filespace = H5Dget_space(dset_id);
    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);

    // Memory dataspace describing the local buffer
    hid_t memspace = H5Screate_simple(2, count, NULL);

    // Prepare the data - each process fills its own block
    std::vector<int> data(count[0] * count[1]);
    for (size_t i = 0; i < count[0]; ++i) {
        for (size_t j = 0; j < count[1]; ++j) {
            data[i * count[1] + j] = mpi_rank * 1000 + i * count[1] + j;
        }
    }

    // Transfer property list requesting collective I/O
    plist_id = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);

    // Collective parallel write
    herr_t status = H5Dwrite(dset_id, H5T_NATIVE_INT, memspace, filespace,
                             plist_id, data.data());

    // Release resources from the write phase
    H5Dclose(dset_id);
    H5Sclose(filespace);
    H5Sclose(memspace);
    H5Pclose(plist_id);
    H5Fclose(file_id);

    // Read back the data - mirrors the write phase
    if (mpi_rank == 0) {
        std::cout << "Write finished, reading back for verification..." << std::endl;
    }

    // Re-open the file with a fresh MPI-IO file access property list
    hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
    file_id = H5Fopen(FILE_NAME, H5F_ACC_RDONLY, fapl_id);
    H5Pclose(fapl_id);
    dset_id = H5Dopen(file_id, DATASET_NAME, H5P_DEFAULT);

    // Read buffer
    std::vector<int> read_data(count[0] * count[1]);

    // Transfer property list requesting collective I/O for the read
    plist_id = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);

    // Select the same hyperslab and recreate the memory dataspace
    filespace = H5Dget_space(dset_id);
    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, offset, NULL, count, NULL);
    memspace = H5Screate_simple(2, count, NULL);

    // Collective parallel read
    status = H5Dread(dset_id, H5T_NATIVE_INT, memspace, filespace,
                     plist_id, read_data.data());

    // Verify the data
    bool error = false;
    for (size_t i = 0; i < count[0] * count[1]; ++i) {
        if (read_data[i] != data[i]) {
            error = true;
            break;
        }
    }
    if (error) {
        std::cerr << "Rank " << mpi_rank << ": data verification failed!" << std::endl;
    } else if (mpi_rank == 0) {
        std::cout << "Data verification succeeded!" << std::endl;
    }

    // Release resources
    H5Dclose(dset_id);
    H5Sclose(filespace);
    H5Sclose(memspace);
    H5Pclose(plist_id);
    H5Fclose(file_id);

    // Finalize MPI
    MPI_Finalize();
    return 0;
}
Fortran Example
program parallel_hdf5_mpi
    use hdf5
    use mpi
    implicit none

    integer :: ierr, mpi_rank, mpi_size
    integer(hid_t) :: file_id, dset_id, filespace, memspace, plist_id
    integer(hsize_t), dimension(2) :: dims = (/100, 100/)   ! global dimensions
    integer(hsize_t), dimension(2) :: count, offset
    integer, allocatable :: data(:, :)
    integer :: i, j
    character(len=*), parameter :: file_name = "parallel.h5"
    character(len=*), parameter :: dset_name = "IntArray"

    ! Initialize MPI
    call MPI_Init(ierr)
    call MPI_Comm_rank(MPI_COMM_WORLD, mpi_rank, ierr)
    call MPI_Comm_size(MPI_COMM_WORLD, mpi_size, ierr)

    ! Initialize the HDF5 Fortran interface
    call h5open_f(ierr)

    ! File access property list using the MPI-IO driver
    call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, ierr)
    call h5pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL, ierr)

    ! Create the file collectively (truncate it if it already exists)
    call h5fcreate_f(file_name, H5F_ACC_TRUNC_F, file_id, ierr, access_prp=plist_id)
    call h5pclose_f(plist_id, ierr)

    ! Create the file dataspace and the dataset
    call h5screate_simple_f(2, dims, filespace, ierr)
    call h5dcreate_f(file_id, dset_name, H5T_NATIVE_INTEGER, filespace, &
                     dset_id, ierr)
    call h5sclose_f(filespace, ierr)

    ! Each process writes a contiguous block along the first dimension
    ! (assumes dims(1) is divisible by the number of processes)
    count(1) = dims(1) / mpi_size
    count(2) = dims(2)
    offset(1) = mpi_rank * count(1)
    offset(2) = 0

    ! Select this process's hyperslab in the file dataspace
    call h5dget_space_f(dset_id, filespace, ierr)
    call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, ierr)

    ! Memory dataspace describing the local buffer
    call h5screate_simple_f(2, count, memspace, ierr)

    ! Prepare the data - each process fills its own block
    allocate(data(count(1), count(2)))
    do i = 1, int(count(1))
        do j = 1, int(count(2))
            data(i, j) = mpi_rank * 1000 + (i-1)*int(count(2)) + j
        end do
    end do

    ! Transfer property list requesting collective I/O
    call h5pcreate_f(H5P_DATASET_XFER_F, plist_id, ierr)
    call h5pset_dxpl_mpio_f(plist_id, H5FD_MPIO_COLLECTIVE_F, ierr)

    ! Collective parallel write
    call h5dwrite_f(dset_id, H5T_NATIVE_INTEGER, data, count, ierr, &
                    file_space_id=filespace, mem_space_id=memspace, &
                    xfer_prp=plist_id)

    ! Release resources from the write phase
    deallocate(data)
    call h5dclose_f(dset_id, ierr)
    call h5sclose_f(filespace, ierr)
    call h5sclose_f(memspace, ierr)
    call h5pclose_f(plist_id, ierr)
    call h5fclose_f(file_id, ierr)

    ! Read back the data - mirrors the write phase
    if (mpi_rank == 0) then
        print *, "Write finished, reading back for verification..."
    endif

    ! Re-open the file with a fresh MPI-IO file access property list
    call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, ierr)
    call h5pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL, ierr)
    call h5fopen_f(file_name, H5F_ACC_RDONLY_F, file_id, ierr, access_prp=plist_id)
    call h5pclose_f(plist_id, ierr)
    call h5dopen_f(file_id, dset_name, dset_id, ierr)

    ! Read buffer
    allocate(data(count(1), count(2)))

    ! Transfer property list requesting collective I/O for the read
    call h5pcreate_f(H5P_DATASET_XFER_F, plist_id, ierr)
    call h5pset_dxpl_mpio_f(plist_id, H5FD_MPIO_COLLECTIVE_F, ierr)

    ! Select the same hyperslab and recreate the memory dataspace
    call h5dget_space_f(dset_id, filespace, ierr)
    call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, offset, count, ierr)
    call h5screate_simple_f(2, count, memspace, ierr)

    ! Collective parallel read
    call h5dread_f(dset_id, H5T_NATIVE_INTEGER, data, count, ierr, &
                   file_space_id=filespace, mem_space_id=memspace, &
                   xfer_prp=plist_id)

    ! Verify the data (simplified: check the first element on each rank)
    if (data(1,1) /= mpi_rank * 1000 + 1) then
        print *, "Rank ", mpi_rank, ": data verification failed!"
    else if (mpi_rank == 0) then
        print *, "Data verification succeeded!"
    endif

    ! Release resources
    deallocate(data)
    call h5dclose_f(dset_id, ierr)
    call h5sclose_f(filespace, ierr)
    call h5sclose_f(memspace, ierr)
    call h5pclose_f(plist_id, ierr)
    call h5fclose_f(file_id, ierr)

    ! Close the HDF5 Fortran interface
    call h5close_f(ierr)

    ! Finalize MPI
    call MPI_Finalize(ierr)
end program parallel_hdf5_mpi
Compiling and Running
Depending on your installation, you may need to add -I/-L flags pointing to the HDF5 headers and libraries; parallel HDF5 installations typically also provide the h5pcc and h5pfc compiler wrappers, which add these flags automatically.
For the C++ program:
mpicxx -o parallel_hdf5 parallel_hdf5.cpp -lhdf5 -lz
mpiexec -n 4 ./parallel_hdf5
For the Fortran program:
mpif90 -o parallel_hdf5 parallel_hdf5.f90 -lhdf5_fortran -lhdf5 -lz
mpiexec -n 4 ./parallel_hdf5
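After running, the resulting file can be inspected with the h5dump utility that ships with HDF5 (assuming it is on your PATH):
h5dump -d /IntArray parallel.h5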
Key Points
- MPI initialization: the MPI environment must be initialized first
- HDF5 MPI property: use H5Pset_fapl_mpio to set the MPI-IO file access property list
- Data partitioning: each process is responsible for a different part of the dataset
- Hyperslab selection: use H5Sselect_hyperslab to select the region to read or write
- Collective operations: use H5Pset_dxpl_mpio to select the collective I/O mode (see the sketch after this list for the independent alternative)
- Parallel consistency: all processes must participate in collective operations
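On the last two points: collective transfers are usually the better choice on parallel file systems, but HDF5 also supports independent transfers. A minimal sketch of selecting the transfer mode, assuming the standard H5FD_MPIO_COLLECTIVE / H5FD_MPIO_INDEPENDENT constants provided by parallel HDF5:

// Dataset transfer property list: choose collective or independent MPI-IO
hid_t xfer_plist = H5Pcreate(H5P_DATASET_XFER);
// Collective: all ranks cooperate in each H5Dwrite/H5Dread call
H5Pset_dxpl_mpio(xfer_plist, H5FD_MPIO_COLLECTIVE);
// Independent: each rank performs its own I/O (also valid, often slower)
// H5Pset_dxpl_mpio(xfer_plist, H5FD_MPIO_INDEPENDENT);
// ... pass xfer_plist to H5Dwrite / H5Dread, then H5Pclose(xfer_plist);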
These examples demonstrate basic parallel read and write operations; real applications may need more complex data partitioning and access patterns.
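For example, when the first dimension is not evenly divisible by the number of processes, each rank's count and offset must be computed explicitly. The helper below is a hypothetical sketch (block_partition is not part of the examples above) that spreads the remainder rows across the lowest-numbered ranks:

#include <hdf5.h>
#include <algorithm>

// Hypothetical helper: distribute n rows over 'size' ranks as contiguous
// blocks, giving one extra row to each of the first (n % size) ranks.
void block_partition(hsize_t n, int rank, int size,
                     hsize_t& count, hsize_t& offset) {
    hsize_t base = n / size;
    hsize_t rem  = n % size;
    count  = base + (static_cast<hsize_t>(rank) < rem ? 1 : 0);
    offset = static_cast<hsize_t>(rank) * base + std::min<hsize_t>(rank, rem);
}

Each rank would then pass its own count and offset to H5Sselect_hyperslab just as in the examples above; a rank that ends up with no rows must still take part in the collective calls, for example with an empty selection (H5Sselect_none).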