parallel-processing - 如何使用并行 HDF5 按多个等级分块写入-6ren

parallel-processing - 如何使用并行 HDF5 按多个等级分块写入

转载作者：行者123 更新时间：2023-12-03 09:20:56

我有一个 20x20 的数据集。我想用 4 个 MPI 进程(rank)以 2x2 的 block 为单位并行写入。我正在使用并行 HDF5。这样每个进程有 25 个 block 要写入。我不知道该如何编写代码，因为当我使用普通分块(chunking)时，所有进程都会写入整个 20x20 数据集；而当我使用 hyperslab 时，我不知道如何为每个进程要写入的多个 block 进行设置。有人能给我指点吗？我真的被困住了。

最佳答案

我不太确定自己是否完全理解了你的要求。下面是我对你问题的理解。

全局域为 20x20
4 个 MPI 进程(rank)
分块(chunk)大小为 2x2

您不必设置分块，事实上我通常不会。

我会这样做。

MPI 中的域分解。
生成局部矩阵。
创建内存 hyperslab(基于局部矩阵)。
创建一个文件 hyperslab(基于全局矩阵)。
创建数据集分块属性。
写入数据集。

它看起来是这样的。 HDF5_Parallel_Chunked_Writing

! Program to use MPI_Cart and Parallel HDF5
!
! Decomposes a global g_N x g_N array over a 2-D MPI Cartesian process grid.
! Every rank fills its own tile with its rank number and all ranks write
! their tiles collectively into one chunked dataset ("/data") of a single
! HDF5 file.
!
! Usage: mpiexec -np <nprocs> ./hdf_pwrite <output-file>
!
program hdf_pwrite

        use, intrinsic :: iso_fortran_env, only : error_unit
        use mpi
        use hdf5
        use kinds, only : r_dp

        implicit none

        integer, parameter :: g_N     = 20  ! Global extent in each direction
        integer, parameter :: ndims   = 2   ! Rank of the data / process grid
        integer, parameter :: halo    = 0   ! Halo width on each side of a tile
        integer, parameter :: chunk_N = 2   ! HDF5 chunk extent in each direction
        integer, parameter :: root    = 0   ! Rank that parses the command line

        integer :: argc         ! Number of command line arguments
        integer :: ierr         ! Error status
        integer :: id           ! My rank/ID in the cartesian communicator
        integer :: np           ! Number of processors
        integer :: n(ndims)     ! Local N for i and j directions (no halo)
        integer :: total(ndims) ! Local total dimension size (with halo)

        ! MPI IO/Lustre file striping
        integer :: lcount       ! Lustre count size
        integer :: lsize        ! Lustre stripe size
        character(len=32) :: clcount, clsize ! Strings of LFS

        integer :: info                 ! MPI IO Info
        integer :: m_dims(ndims)        ! MPI cart dims
        integer :: coords(ndims)        ! Co-ords of procs in the grid
        logical :: is_periodic(ndims)   ! Periodic boundary conditions
        logical :: reorder              ! Reorder the MPI structure
        integer :: comm_2d              ! Cartesian communicator

        character(len=1024) :: filename
        integer(kind=hid_t) :: p_id, f_id, x_id, d_id, c_id
        integer(kind=hid_t) :: memspace, filespace
        ! Chunk sizes
        integer(kind=hsize_t) :: c_size(ndims)
        ! Local hyper slab info
        integer(kind=hsize_t) :: d_size(ndims), s_size(ndims), h_size(ndims), &
                                 stride(ndims), blk(ndims)
        ! Global hyper slab info
        integer(kind=hsize_t) :: g_size(ndims), g_start(ndims)

        ! Local data array
        real(kind=r_dp), allocatable :: ld(:,:)

        argc = 0
        ierr = 0
        m_dims      = 0            ! 0 lets mpi_dims_create choose every extent
        is_periodic = .false.      ! Non-periodic
        reorder     = .false.      ! Not allowed to reorder

        call mpi_init(ierr)

        ! Set up the MPI cartesian topology
        call mpi_comm_size(MPI_COMM_WORLD, np, ierr)
        call mpi_dims_create(np, ndims, m_dims, ierr)

        call mpi_cart_create(MPI_COMM_WORLD, ndims, m_dims, is_periodic, &
                             reorder, comm_2d, ierr)
        call mpi_comm_rank(comm_2d, id, ierr)
        call mpi_cart_coords(comm_2d, id, ndims, coords, ierr)

        if (id .eq. root) then
                ! The tiles are g_N / m_dims(i) wide, so it is the process
                ! grid extents (not the total process count) that must
                ! divide the global size.
                if (any(mod(g_N, m_dims) .ne. 0)) then
                        write(error_unit,*) 'Must use divisible number of procs; ', &
                                            'process grid is ', m_dims
                        call mpi_abort(MPI_COMM_WORLD, 1, ierr)
                endif

                ! get the filename
                argc = command_argument_count()
                if (argc .lt. 1) then
                        write(error_unit,*) 'Must supply a filename'
                        ! Abort the whole job: a plain exit on this rank would
                        ! leave the others waiting in the broadcast below.
                        call mpi_abort(MPI_COMM_WORLD, 1, ierr)
                endif
                call get_command_argument(1, filename)
        endif

        ! Broadcast the filename from the rank that parsed it
        call mpi_bcast(filename, len(filename), MPI_CHARACTER, root, &
                       comm_2d, ierr)

        ! Init the HDF5 library
        call h5open_f(ierr)

        ! Set a stripe count of 4 and a stripe size of 4MB
        lcount = 4
        lsize  = 4 * 1024 * 1024
        write(clcount, '(I0)') lcount
        write(clsize, '(I0)') lsize

        call mpi_info_create(info, ierr)
        call mpi_info_set(info, "striping_factor", trim(clcount), ierr)
        call mpi_info_set(info, "striping_unit", trim(clsize), ierr)

        ! Set up the access properties
        call h5pcreate_f(H5P_FILE_ACCESS_F, p_id, ierr)
        call h5pset_fapl_mpio_f(p_id, comm_2d, info, ierr)

        ! Open the file
        call h5fcreate_f(filename, H5F_ACC_TRUNC_F, f_id, ierr, &
                         access_prp = p_id)
        if (ierr .ne. 0) then
                write(error_unit,*) 'Unable to open: ', trim(filename), ': ', ierr
                call mpi_abort(MPI_COMM_WORLD, 1, ierr)
        endif

        ! Generate our local matrix
        n     = g_N / m_dims
        total = n + (2 * halo)

        ! Index the interior as 1:n for every halo width; the halo cells sit
        ! at 1-halo:0 and n+1:n+halo. The interior then starts at zero-based
        ! offset "halo", which is what the memory hyperslab below selects.
        allocate(ld(1-halo:n(1)+halo, 1-halo:n(2)+halo), stat=ierr)
        if (ierr .ne. 0) then
                write(error_unit,*) 'Unable to allocate local data array: ', ierr
                call mpi_abort(MPI_COMM_WORLD, 1, ierr)
        end if

        ! Sentinel everywhere (only halo cells keep it), rank id in the interior
        ld = -99.99_r_dp
        ld(1:n(1), 1:n(2)) = real(id, kind=r_dp)

        ! Create the local memory space and hyperslab
        d_size = int(total, kind=hsize_t)       ! Full local buffer extents
        s_size = int(n, kind=hsize_t)           ! Interior (selected) extents
        h_size = int(halo, kind=hsize_t)        ! Offset of interior in buffer
        stride = 1
        blk    = 1

        call h5screate_simple_f(ndims, d_size, memspace, ierr)
        call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, &
                                   h_size, s_size, ierr,       &
                                   stride, blk)

        ! Create the global file space and hyperslab
        g_size  = g_N
        g_start = int(n * coords, kind=hsize_t) ! Tile origin in global array

        call h5screate_simple_f(ndims, g_size, filespace, ierr)
        call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, &
                                   g_start, s_size, ierr,       &
                                   stride, blk)

        ! Create a data chunking property
        c_size = chunk_N
        call h5pcreate_f(H5P_DATASET_CREATE_F, c_id, ierr)
        call h5pset_chunk_f(c_id, ndims, c_size, ierr)
        ! Create the dataset id
        call h5dcreate_f(f_id, "/data", H5T_IEEE_F64LE, filespace, d_id, &
                         ierr, dcpl_id=c_id)

        ! Create a data transfer property
        call h5pcreate_f(H5P_DATASET_XFER_F, x_id, ierr)
        call h5pset_dxpl_mpio_f(x_id, H5FD_MPIO_COLLECTIVE_F, ierr)
        ! Write the data. The memory type describes the buffer ld (native
        ! double); the on-disk type stays H5T_IEEE_F64LE as set on the
        ! dataset. The dims argument describes the whole buffer.
        call h5dwrite_f(d_id, H5T_NATIVE_DOUBLE, ld, d_size, ierr,      &
                        file_space_id=filespace, mem_space_id=memspace, &
                        xfer_prp=x_id)
        if (ierr .ne. 0) then
                write(error_unit,*) 'Unable to write dataset /data: ', ierr
                call mpi_abort(MPI_COMM_WORLD, 1, ierr)
        endif

        if (allocated(ld)) then
                deallocate(ld)
        endif

        ! Close everything and exit
        call h5dclose_f(d_id, ierr)
        call h5sclose_f(filespace, ierr)
        call h5sclose_f(memspace, ierr)
        call h5pclose_f(c_id, ierr)
        call h5pclose_f(x_id, ierr)
        call h5pclose_f(p_id, ierr)
        call h5fclose_f(f_id, ierr)
        call h5close_f(ierr)

        ! Release the MPI objects created above
        call mpi_info_free(info, ierr)
        call mpi_comm_free(comm_2d, ierr)

        call mpi_finalize(ierr)
end program hdf_pwrite

为了完整起见，这里是 kinds 模块的定义。

module kinds

        ! Short project-wide names for the integer and real kind constants
        ! supplied by the intrinsic iso_fortran_env module.
        use, intrinsic :: iso_fortran_env, only : INT32, INT64,          &
                                                  REAL32, REAL64, REAL128

        implicit none

        ! Only the kind parameters declared public below are exported.
        private

        ! Integer kinds: single (32-bit) and double (64-bit)
        integer, parameter, public :: i_sp = INT32
        integer, parameter, public :: i_dp = INT64

        ! Real kinds: single (32-bit), double (64-bit) and quad (128-bit)
        integer, parameter, public :: r_sp = REAL32
        integer, parameter, public :: r_dp = REAL64
        integer, parameter, public :: r_qp = REAL128

end module kinds

然后编译、运行并查看输出文件:

$ make 
rm -f kinds.o kinds.mod
h5pfc -c -O3  -o kinds.o kinds.f90
rm -f hdf_pwrite.o hdf_pwrite.mod
h5pfc -c -O3  -o hdf_pwrite.o hdf_pwrite.f90
h5pfc -O3  -o hdf_pwrite kinds.o hdf_pwrite.o

$ mpiexec -np 4 ./hdf_pwrite test.h5

$ h5dump test.h5
HDF5 "test.h5" {
GROUP "/" {
   DATASET "data" {
      DATATYPE  H5T_IEEE_F64LE
      DATASPACE  SIMPLE { ( 20, 20 ) / ( 20, 20 ) }
      DATA {
      (0,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (1,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (2,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (3,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (4,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (5,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (6,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (7,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (8,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (9,0): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      (10,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (11,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (12,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (13,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (14,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (15,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (16,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (17,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (18,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      (19,0): 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
      }
   }
}
}

希望这有帮助。

编辑：当然，您应该使用更好的算法来进行域分解，例如 MPE_Decomp1d。

关于parallel-processing - 如何使用并行 HDF5 按多个等级分块写入，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/31283594/

文章推荐： apache-spark - Spark 数据帧缓存/持久未按预期工作

文章推荐： dashing - 如何在 dashing.io 项目中显示图像

oracle - 在 Oracle 中，PARALLEL 被广泛使用。 PARALLEL、PARALLEL(8)、PARALLEL(a,8)有什么区别？
在 Oracle 中，PARALLEL 被广泛使用。提示 PARALLEL、PARALLEL(8) 和 PARALLEL(a,8) 有什么区别。如何选择最佳的查询提示？ SELECT /*+ PARA
parallel-processing - OMP : What is the difference between OMP PARALLEL DO and OMP DO (Without parallel directive at all)
好的，我希望以前没有问过这个问题，因为在搜索中很难找到。我查看了 F95 手册，但仍然觉得这很模糊: For the simple case of: DO i=0,99 END DO 我正
parallel-processing - GNU parallel 有两个参数
我有一个 C-shell 脚本，其中有一个名为 $hosts_string 的变量，格式为: host1,host2,...,hostN 我还有一个名为 $chrs_string 的变量，其形式为:
parallel-processing - Gnu平行: nested parallelism
是否可以从由gnu parallel产生的脚本的多次运行中调用gnu parallel？我有一个python脚本，可以运行100个顺序顺序迭代，并且在每次迭代中的某处，并行计算4个值(使用gnu p
gnu-parallel - GNU Parallel - 多个命令
我想在几个输入上运行几个长时间运行的进程。例如。: solver_a problem_1 solver_b problem_1 ... solver_b problem_18 solver_c pro
delphi - Parallel.For 和 Parallel.For 之间有区别吗？
TParallel.&For 和 TParallel.For 之间有区别吗？两者都可以在 Delphi 10 Seattle 中编译。那么我应该坚持哪一个呢？最佳答案 TParallel.&For
parallel-processing - Julia Parallel 宏似乎不起作用
我第一次使用 julia 进行并行计算.我有点头疼。所以假设我开始 julia如下:julia -p 4 .然后我为所有处理器声明 a 函数，然后将它与 pmap 一起使用还有@parallel fo
parallel-processing - "embarrassingly parallel"短语的来源
关闭。这个问题是off-topic .它目前不接受答案。想改善这个问题吗？ Update the question所以它是 on-topic对于堆栈溢出。 10年前关闭。 Improve this
c# - Parallel.For 与 Parallel.Invoke
我有一堆相互排斥的方法，因此可以并行运行。有这样做的好方法吗？到目前为止，我有以下两种实现方式，但我不确定是否应该选择其中一种。使用 Parallel.For : Parallel.For(0, 2
parallel-processing - 使用 GNU parallel 并行化具有各种参数的脚本
我对并行运行脚本很感兴趣，并且我已经开始查看 GNU 并行工具，但是我遇到了一些麻烦。我的脚本 doSomething 有 3 个参数，我想在参数的不同值上并行运行脚本。我该怎么做？我试过:para
parallel-processing - 使用 GNU parallel 在多核上运行并行作业
我需要在多核(和多线程)机器上运行多个作业。我正在使用 GNU Parallel utility跨核心分配作业以加速任务。要执行的命令在名为“命令”的文件中可用。我使用以下命令运行 GNU Paral
parallel-processing - 如何使用 gnu-parallel 处理具有两个输入的脚本？
我正在尝试使用如下两个输入运行 Python 脚本。我得到了大约 300 个这两个输入，所以我想知道是否有人可以建议如何并行运行它们。单次运行看起来像: python stable.py KOG_1
gnu-parallel - 如何使用 "GNU parallel"在多个目录中执行一个命令？
每天我都必须更新一堆存储库，并在其中一些中执行另一个命令(来自 CARTON，Perl 模块依赖管理器)。我总是使用循环来执行此操作，但我想与并行执行GNU 并行如果可能，但我不太了解它的tuto
parallel-processing - @parallel 和 pmap 到底有什么区别？
正如标题所说:@parallel 之间究竟有什么区别？和 pmap ?我的意思不是明显的一个是循环的宏，另一个适用于函数，我的意思是它们的实现究竟有什么不同，我应该如何使用这些知识在它们之间进行选择？
parallel-processing - Windows Azure : Parallelization of the code
我有一些矩阵乘法运算。我想通过多个处理器并行执行这些操作。这可以使用 MPI(消息传递接口(interface))在高性能计算集群上完成。同样，我可以使用多个辅助角色在云中进行一些并行化吗？有什么办
python - 为什么joblib.Parallel()比非并行计算花费更多的时间？ Parallel()的运行速度是否应该比非并行计算快？
joblib模块提供了一个简单的帮助程序类，以使用多处理并行编写循环的循环。这段代码使用列表推导来完成这项工作： import time from math import sqrt from job
c openmp parallel for inside a parallel region
我的问题是这样的one .但我想做一些不同的事情... 例如，在我的并行区域内，我想在 4 个线程上运行我的代码。当每个线程进入 for 循环时，我想在 8 个线程上运行我的代码。像 #pramga
parallel-processing - ipython 笔记本 : how to parallelize external script
我正在尝试使用 ipython 并行库中的并行计算。但是我对此知之甚少，而且我发现很难从对并行计算一无所知的人那里阅读该文档。有趣的是，我发现的所有教程都只是重复使用文档中的示例，并使用相同的解释，
parallel-processing - Gradle : Run subproject's tasks in parallel
我的项目结构看起来像 Root + subproj1 + subproj2 在每个子项目中定义了自己的任务 run(){}。我想要做的是从 Root 项目的运行任务并行运行 :subpro
parallel-processing - Parallel.ForEach 应该在 DB 调用中使用吗？
我有一个 Foo ID 的列表。我需要为每个 ID 调用一个存储过程。例如 Guid[] siteIds = ...; // typically contains 100 to 300 elemen

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

parallel-processing - 如何使用并行 HDF5 按多个等级分块写入