gpt4 book ai didi

fortran - 如何提高这个巨大嵌套循环的性能? (Fortran 90)

转载 作者:行者123 更新时间:2023-12-04 04:41:51 25 4
gpt4 key购买 nike

我将在这里发布整个代码段,但唯一的问题确实是最后的嵌套循环。所有读入矩阵的尺寸为 180x180,循环慢得难以忍受。我没有看到简化计算的简单方法,因为由于索引的三重出现,获得矩阵“AnaInt”的索引乘法不是简单的矩阵乘积。有什么想法吗?谢谢!

program AC
implicit none
integer, parameter :: dp = selected_real_kind(15, 307)
integer :: n, ndim, k, j, i, o, l, m, steps
real(dp) :: emax, omega, pi, EFermi, auev
complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone

complex(dp), allocatable :: GammaL(:,:)
complex(dp), allocatable :: GammaL_EB(:,:)
complex(dp), allocatable :: GammaR(:,:)
complex(dp), allocatable :: R(:,:)
complex(dp), allocatable :: Yc(:,:)
complex(dp), allocatable :: Yd(:,:)
complex(dp), allocatable :: AnaInt(:,:)
complex(dp), allocatable :: H(:,:)
complex(dp), allocatable :: HamEff(:,:)
complex(dp), allocatable :: EigVec(:,:)
complex(dp), allocatable :: InvEigVec(:,:)
complex(dp), allocatable :: EigVal(:)
complex(dp), allocatable :: ctemp(:,:)
complex(dp), allocatable :: ctemp2(:,:)
complex(dp), allocatable :: S(:,:)
complex(dp), allocatable :: SelfL(:,:)
complex(dp), allocatable :: SelfR(:,:)
complex(dp), allocatable :: SHalf(:,:)
complex(dp), allocatable :: InvSHalf(:,:)
complex(dp), allocatable :: HEB(:,:)
complex(dp), allocatable :: Integrand(:,:)


!Lapack arrays and variables
integer :: info, lwork
complex(dp), allocatable :: work(:)
real(dp), allocatable :: rwork(:)
integer,allocatable :: ipiv(:)

!########################################################################

!Constants
auev = 27.211385
pi = 3.14159265359
cunit = (0,1)
czero = (0,0)
cone = (1,0)
tinyc = (0.0, 0.000000000001)


!System and calculation parameters
open(unit=123, file="ForAC.dat", action='read', form='formatted')
read(123,*) ndim, EFermi
lwork = ndim*ndim

emax = 5.0/auev
steps = 1000


allocate(HEB(ndim,ndim))
allocate(H(ndim,ndim))
allocate(Yc(ndim,ndim))
allocate(Yd(ndim,ndim))
allocate(S(ndim,ndim))
allocate(SelfL(ndim,ndim))
allocate(SelfR(ndim,ndim))
allocate(HamEff(ndim,ndim))
allocate(GammaR(ndim,ndim))
allocate(GammaL(ndim,ndim))
allocate(AnaInt(ndim,ndim))
allocate(EigVec(ndim,ndim))
allocate(EigVal(ndim))
allocate(InvEigVec(ndim,ndim))
allocate(R(ndim,ndim))
allocate(GammaL_EB(ndim,ndim))
allocate(Integrand(ndim,ndim))

!################################################



read(123,*) H, S, SelfL, SelfR
close(unit=123)

HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))



allocate(SHalf(ndim, ndim))
allocate(InvSHalf(ndim,ndim))
SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))

call zpotrf('l', ndim, SHalf, ndim, info)
InvSHalf(:,:) = SHalf(:,:)
call ztrtri('l', 'n', ndim, InvSHalf, ndim, info)

call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim)
call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)

deallocate(SHalf)
deallocate(InvSHalf)




!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal

allocate(ctemp(ndim,ndim))
ctemp(:,:) = HamEff(:,:)
allocate(work(lwork),rwork(2*ndim))
call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim, EigVec, ndim, work, lwork, rwork, info)
if(info/=0)write(*,*) "Warning: zgeev info=", info
deallocate(work,rwork)
deallocate(ctemp)

InvEigVec(:,:)=EigVec(:,:)
lwork = 3*ndim
allocate(ipiv(ndim))
allocate(work(lwork))
call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
if(info/=0)write(*,*) "Warning: zgetrf info=", info ! LU decomposition
call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU decomposition (Building of InvEigVec)
deallocate(work)
deallocate(ipiv)


R(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
do k=1,ndim
do l=1,ndim
R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
end do
end do
end do
end do





!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000

open(unit=125,file="ACCond.dat")

!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)
AnaInt(:,:) = 0.0_dp
do i=1,ndim
do n=1,ndim
do j=1,ndim
do m=1,ndim
Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m) * Integrand(j,m) * conjg(EigVec(n,m))
end do
end do
end do
end do

Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do



!#############################################

deallocate(Integrand)
deallocate(HEB)
deallocate(Yc)
deallocate(Yd)
deallocate(HamEff)
deallocate(GammaR)
deallocate(GammaL)
deallocate(AnaInt)
deallocate(EigVec)
deallocate(EigVal)
deallocate(InvEigVec)
deallocate(H)
deallocate(S)
deallocate(SelfL)
deallocate(SelfR)
deallocate(R)
deallocate(GammaL_EB)
end program AC

所以,这是根据建议的第一个改编:
HermEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermEigVec(i,j) = conjg(EigVec(j,i))
end do
end do

HermInvEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermInvEigVec(i,j) = conjg(InvEigVec(j,i))
end do
end do


R(:,:) = 0.0_dp

R = matmul(InvEigVec,matmul(GammaR,HermInvEigVec))


open(unit=125,file="ACCond.dat")

!Looping over omega
do o=1,steps
omega=real(o,dp)*emax/real(steps,dp)

AnaInt(:,:) = 0.0_dp
do j=1,ndim
do m=1,ndim
Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
Integrand(j,m) = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
T(j,m) = R(j,m) * Integrand(j,m)
end do
end do
AnaInt = matmul(EigVec,matmul(T,HermEigVec))


Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

ACCond = czero
do k=1,ndim
ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
end do
write(125,*) omega, real(ACCond,dp), aimag(ACCond)
end do

最佳答案

您的代码中有几个问题。
让我们从你强调的那个循环之前的循环开始(它更容易理解,但下面的大循环或多或少有相同的问题)。

所以我们有一个关于 i, j, k, l 的循环。

您可以考虑对循环重新排序,以便更好地访问缓存。您最内部的循环在 l 上,它仅显示为列索引。与 column-major Fortran 中的数组,您可以预期性能会很差。 j 上的内部循环可能会更好。

更糟糕的是,您的整个循环是三个矩阵 (InvEigVec * GammaR * InvEigVec^H) 乘积的矩阵更新,但您在 O(ndim^4) 中进行。每个矩阵乘积都是 O(n^3) (或者,如果您使用 ZGEMM 调用优化 Strassen algorithm ,则可能更少)。因此,通过存储矩阵乘积,两个乘积应该是 O(n^3),而不是 O(n^4)。
也就是说,您可以先进行矩阵乘积,然后再进行矩阵乘积更新。

现在,你的大循环:在 i、n、j、m 上步进多次。

如果我读得好,你就写

Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

其中右侧的所有变量都是标量,但 Integrand 是一个 ndim*ndim 矩阵。在多个地方复制一个值需要做很多工作。
但是然后您在被积函数上循环,您可以在其中简单地使用标量。或者也许这是一个错误,你应该在左侧有 Integrand(j, m) 或类似的东西?

然后,您的四个内部循环就像前面的评论一样,更新了
AnaInt 与数组乘积 EigVec * (R .* Integrand) * EigVec^H,与 .* 数组的(逐项)标量乘积(如果 Integrand 只是标量,则只是 EigVec * R * EigVec^H)。

同样,尝试用 ZGEMM 编写它可能会很好,从而大大降低复杂性。

关于fortran - 如何提高这个巨大嵌套循环的性能? (Fortran 90),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/18770483/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com