gpt4 book ai didi

fortran - 带有 BLAS 的 OpenMP

转载 作者:行者123 更新时间:2023-12-04 08:04:29 25 4
gpt4 key购买 nike

related question
我试图在上述链接的答案中扩展代码,以包括交叉检查和 openmp。

Program reshape_for_blas

Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64

Implicit None

Real( wp ), Dimension( :, : ), Allocatable :: a
Real( wp ), Dimension( :, :, : ), Allocatable :: b
Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
Real( wp ), Dimension( :, : ), Allocatable :: d
Real( wp ), Dimension( :, : ), Allocatable :: e

Integer :: na, nb, nc, nd, ne
Integer :: la, lb, lc, ld
Integer( li ) :: start, finish, rate, numthreads

numthreads = 2

call omp_set_num_threads(numthreads)


Write( *, * ) 'na, nb, nc, nd ?'
Read( *, * ) na, nb, nc, nd
ne = nc * nd
Allocate( a ( 1:na, 1:nb ) )
Allocate( b ( 1:nb, 1:nc, 1:nd ) )
Allocate( c1( 1:na, 1:nc, 1:nd ) )
Allocate( c2( 1:na, 1:nc, 1:nd ) )
Allocate( c3( 1:na, 1:nc, 1:nd ) )
Allocate( c4( 1:na, 1:nc, 1:nd ) )
Allocate( c5( 1:na, 1:nc, 1:nd ) )
Allocate( d ( 1:nb, 1:ne ) )
Allocate( e ( 1:na, 1:ne ) )

! Set up some data
Call Random_number( a )
Call Random_number( b )

! With reshapes
Call System_clock( start, rate )

!write (*,*) 'clock', start, rate


d = Reshape( b, Shape( d ) )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
d, Size( d, Dim = 1 ), &
0.0_wp, e, Size( e, Dim = 1 ) )
c1 = Reshape( e, Shape( c1 ) )
Call System_clock( finish, rate )

!write (*,*) 'clock', finish, rate



Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )


! Direct
Call System_clock( start, rate )
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c2, Size( c2, Dim = 1 ) )
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method ', Real( finish - start, wp ) / rate



Call System_clock( start, rate )

!$omp parallel
! Direct
Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
b , Size( b , Dim = 1 ), &
0.0_wp, c4, Size( c4, Dim = 1 ) )
!$omp end parallel
Call System_clock( finish, rate )
Write( *, * ) 'Time for straight method omp', Real( finish - start, wp ) / rate



!naive
Call System_clock( start, rate )

do la = 1, na
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = 0.0_wp
enddo
enddo
enddo

do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c3(la,lc,ld) = c3(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo


Call System_clock( finish, rate )
Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate



!naive omp
Call System_clock( start, rate )
!$omp parallel

do la = 1, na
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = 0.0_wp
enddo
enddo
enddo

!$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
do la = 1, na
do lb = 1, nb
do lc = 1, nc
do ld = 1, nd
c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
enddo
enddo
enddo
enddo
!$omp end do
!$omp end parallel


Call System_clock( finish, rate )
Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate





do la = 1, na
do lc = 1, nc
do ld = 1, nd

if ( dabs(c3(la,lc,ld) - c1(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
endif


if ( dabs(c3(la,lc,ld) - c2(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
endif

if ( dabs(c3(la,lc,ld) - c4(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
endif

if ( dabs(c3(la,lc,ld) - c5(la,lc,ld)) > 1.e-6 ) then
write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
endif
enddo
enddo
enddo

End Program reshape_for_blas
我有两个问题:
  • BLAS 或幼稚循环都没有显着的加速。例如,通过 gfortran -std=f2008 -Wall -Wextra -fcheck=all reshape.f90 -lblas -fopenmp , 并输入 30 30 30 60 , 我得到了
  • 30 30 30 60
    Time for reshaping method 2.9443999999999998E-003
    Difference between result matrices 12.380937791257775
    Time for straight method 1.0016000000000001E-003
    Time for straight method omp 2.4878000000000001E-003
    Time for loop 6.6072500000000006E-002
    Time for loop omp 0.100242600000000002
  • 当维度变大时,例如 60 60 60 60在输入中,openmp BLAS 结果可以得到与天真的循环不同的值,似乎我错过了一些控制选项。

  • OpenMP 在这里会出现什么问题?
    编辑
    我在 c5 的初始化中添加了 omp 行。部分并注释掉两个打印行,

    Program reshape_for_blas

    Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64

    Implicit None

    Real( wp ), Dimension( :, : ), Allocatable :: a
    Real( wp ), Dimension( :, :, : ), Allocatable :: b
    Real( wp ), Dimension( :, :, : ), Allocatable :: c1, c2, c3, c4, c5
    Real( wp ), Dimension( :, : ), Allocatable :: d
    Real( wp ), Dimension( :, : ), Allocatable :: e

    Integer :: na, nb, nc, nd, ne
    Integer :: la, lb, lc, ld
    Integer( li ) :: start, finish, rate, numthreads

    numthreads = 2

    call omp_set_num_threads(numthreads)


    Write( *, * ) 'na, nb, nc, nd ?'
    Read( *, * ) na, nb, nc, nd
    ne = nc * nd
    Allocate( a ( 1:na, 1:nb ) )
    Allocate( b ( 1:nb, 1:nc, 1:nd ) )
    Allocate( c1( 1:na, 1:nc, 1:nd ) )
    Allocate( c2( 1:na, 1:nc, 1:nd ) )
    Allocate( c3( 1:na, 1:nc, 1:nd ) )
    Allocate( c4( 1:na, 1:nc, 1:nd ) )
    Allocate( c5( 1:na, 1:nc, 1:nd ) )
    Allocate( d ( 1:nb, 1:ne ) )
    Allocate( e ( 1:na, 1:ne ) )

    ! Set up some data
    Call Random_number( a )
    Call Random_number( b )

    ! With reshapes
    Call System_clock( start, rate )

    !write (*,*) 'clock', start, rate


    d = Reshape( b, Shape( d ) )
    Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a, Size( a, Dim = 1 ), &
    d, Size( d, Dim = 1 ), &
    0.0_wp, e, Size( e, Dim = 1 ) )
    c1 = Reshape( e, Shape( c1 ) )
    Call System_clock( finish, rate )

    !write (*,*) 'clock', finish, rate



    Write( *, * ) 'Time for reshaping method ', Real( finish - start, wp ) / rate
    Write( *, * ) 'Difference between result matrices ', Maxval( Abs( c1 - c2 ) )


    ! Direct
    Call System_clock( start, rate )
    Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
    b , Size( b , Dim = 1 ), &
    0.0_wp, c2, Size( c2, Dim = 1 ) )
    Call System_clock( finish, rate )
    Write( *, * ) 'Time for straight method ', Real( finish - start, wp ) / rate



    !naive loop
    Call System_clock( start, rate )

    do la = 1, na
    do lc = 1, nc
    do ld = 1, nd
    c3(la,lc,ld) = 0.0_wp
    enddo
    enddo
    enddo

    do la = 1, na
    do lb = 1, nb
    do lc = 1, nc
    do ld = 1, nd
    c3(la,lc,ld) = c3(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
    enddo
    enddo
    enddo
    enddo


    Call System_clock( finish, rate )
    Write( *, * ) 'Time for loop', Real( finish - start, wp ) / rate




    !dgemm omp
    Call System_clock( start, rate )

    !$omp parallel
    ! Direct
    Call dgemm( 'N', 'N', na, ne, nb, 1.0_wp, a , Size( a , Dim = 1 ), &
    b , Size( b , Dim = 1 ), &
    0.0_wp, c4, Size( c4, Dim = 1 ) )
    !$omp end parallel
    Call System_clock( finish, rate )
    Write( *, * ) 'Time for straight method omp', Real( finish - start, wp ) / rate




    !loop omp
    Call System_clock( start, rate )
    !$omp parallel

    do la = 1, na
    do lc = 1, nc
    do ld = 1, nd
    c5(la,lc,ld) = 0.0_wp
    enddo
    enddo
    enddo

    !$omp do private(la, lb, lc, ld) schedule(dynamic) reduction(+: c5)
    do la = 1, na
    do lb = 1, nb
    do lc = 1, nc
    do ld = 1, nd
    c5(la,lc,ld) = c5(la,lc,ld) + a(la,lb) * b(lb, lc, ld)
    enddo
    enddo
    enddo
    enddo
    !$omp end do
    !$omp end parallel


    Call System_clock( finish, rate )
    Write( *, * ) 'Time for loop omp', Real( finish - start, wp ) / rate




    !single core: c1 c2 c3
    ! c1 reshape blas
    ! c2 blas
    ! c3 naive loop (reference)
    ! parallel: c4 c5
    ! c4 dgemm parallel
    ! c5 naive loop parallel


    do la = 1, na
    do lc = 1, nc
    do ld = 1, nd

    if ( dabs(c3(la,lc,ld) - c1(la,lc,ld)) > 1.e-6 ) then
    write (*,*) '!!! c1', c3(la,lc,ld) - c1(la,lc,ld)
    endif


    if ( dabs(c3(la,lc,ld) - c2(la,lc,ld)) > 1.e-6 ) then
    write (*,*) '!!! c2', c3(la,lc,ld) - c2(la,lc,ld)
    endif

    if ( dabs(c3(la,lc,ld) - c4(la,lc,ld)) > 1.e-6 ) then
    write (*,*) '!!! c4', la,lc,ld, c3(la,lc,ld) - c4(la,lc,ld)
    endif

    if ( dabs(c3(la,lc,ld) - c5(la,lc,ld)) > 1.e-6 ) then
    write (*,*) '!!! c5', la,lc,ld, c3(la,lc,ld) - c5(la,lc,ld)
    endif
    enddo
    enddo
    enddo

    End Program reshape_for_blas

    然后 gfortran reshape.f90 -lblas -fopenmp , 和 30 30 30 30输入导致
     Time for reshaping method    1.3519000000000001E-003
    Difference between result matrices 12.380937791257775
    Time for straight method 6.2549999999999997E-004
    Time for straight method omp 1.2600000000000001E-003
    Time for naive loop 1.0008599999999999E-002
    Time for naive loop omp 1.6678999999999999E-002
    虽然速度不好。

    最佳答案

    您调用DGEMM并行使用相同的变量集(因为并行区域中的变量在 Fortran 中默认共享)。由于数据竞争,这不起作用并产生奇怪的结果。你有两个选择:

  • 找到一个并行的 BLAS 实现,其中 DGEMM已经线程了。英特尔 MKL 和 OpenBLAS 是主要候选者。英特尔 MKL 使用 OpenMP,更具体地说,它是使用英特尔 OpenMP 运行时构建的,因此它可能无法很好地处理使用 GCC 编译的 OpenMP 代码,但它可以完美地处理非线程代码。
  • 调用 DGEMM并行但不具有相同的参数集。相反,执行一个或两个张量的 block 分解,并让每个线程为单独的 block 进行收缩。由于 Fortran 使用列优先存储,分解第二个张量可能是合适的:
    C[i,k,l=1..L] = A[i,j] * B[j,k,l=1..L]
    变成两个线程:
    thread 0: C[i,k,l=1..L/2] = A[i,j] * B[j,k,l=1..L/2]
    thread 1: C[i,k,l=L/2+1..L] = A[i,j] * B[j,k,l=L/2+1..L]
    对于任意数量的线程,归结为计算 l 的开始和结束值。每个线程中的索引并调整 DGEMM 的参数因此。

  • 就个人而言,我会选择并行的 BLAS 实现。使用 Intel MKL,您只需要链接并行驱动程序,它就会自动使用所有可用的 CPU。
    下面是 block 分解的示例实现。仅显示对原始代码的添加和更改:
      ! ADD: Use the OpenMP module
    Use :: omp_lib

    ! ADD: Variables used for the decomposition
    Integer :: ithr, istart, iend

    ! CHANGE: OpenMP with block decomposition
    !$omp parallel private(ithr, istart, iend)
    ithr = omp_get_thread_num()

    ! First index (plane) in B for the current thread
    istart = ithr * nd / omp_get_num_threads()
    ! First index (plane) in B for the next thread
    iend = (ithr + 1) * nd / opm_get_num_threads()

    Call dgemm('N', 'N', na, nc * (iend - istart), nb, 1.0_wp, a, nd, &
    b(1, 1, 1 + istart), Size(b, Dim = 1), &
    0.0_wp, c4(1, 1, 1 + istart), Size(c4, Dim = 1))
    !$omp end parallel
    istartB 的第一个平面的索引每个单独的线程都在工作。 iend是下一个线程的第一个平面,所以 iend - istart是当前线程的平面数。 b(1, 1, 1 + istart)是 B 中的平面 block 开始的地方。 c4(1, 1, 1 + istart)是结果张量中的 block 开始的地方。
    确保你做其中之一,但不要同时做。即,如果您的 BLAS 实现是线程化的,但您决定使用 block 分解,请禁用 BLAS 库中的线程化。相反,如果您在 BLAS 实现中使用线程,请不要在代码中执行 block 分解。

    关于fortran - 带有 BLAS 的 OpenMP,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/66296334/

    25 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com