gpt4 book ai didi

python-3.x - Python : Running nested loop, 2D 移动窗口,并行

转载 作者:行者123 更新时间:2023-12-04 00:27:34 27 4
gpt4 key购买 nike

我使用地形数据。对于一个特定问题,我用 Python 编写了一个函数,它使用特定大小的移动窗口来压缩矩阵(高程网格)。然后我必须在这个窗口上执行分析并将窗口中心的单元格设置为结果值。

我的最终输出是一个与我的原始矩阵大小相同的矩阵,该矩阵已根据我的分析进行了更改。这个问题在一个小区域上运行需要 11 个小时,所以我认为并行化内部循环会加速事情。或者,也可能有一个聪明的矢量化解决方案......

看我下面的函数,DEM是一个二维 numpy 数组,w是窗口的大小。

def RMSH_det(DEM, w):
import numpy as np
from scipy import signal
[nrows, ncols] = np.shape(DEM)

#create an empty array to store result
rms = DEM*np.nan

# nw=(w*2)**2
# x = np.arange(0,nw)

for i in np.arange(w+1,nrows-w):


for j in np.arange(w+1,ncols-w):

d1 = np.int64(np.arange(i-w,i+w))
d2 = np.int64(np.arange(j-w,j+w))

win = DEM[d1[0]:d1[-1],d2[0]:d2[-1]]

if np.max(np.isnan(win)) == 1:
rms[i,j] = np.nan

else:
win = signal.detrend(win, type = 'linear')
z = np.reshape(win,-1)
nz = np.size(z)
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms


return(rms)

我已经在 SO/SE 中搜索了我的问题的解决方案,并遇到了许多嵌套 for 循环并尝试并行运行它们的示例。我一直在努力调整我的代码以匹配示例,并希望得到一些帮助。这个问题的解决方案将帮助我使用我拥有的其他几个移动窗口函数。

到目前为止,我已经将内循环移动到它自己的函数中,该函数可以从外循环内调用:
def inLoop(i, w, DEM,rms,ncols):
for j in np.arange(w+1,ncols-w):

d1 = np.int64(np.arange(i-w,i+w))
d2 = np.int64(np.arange(j-w,j+w))

win = DEM[d1[0]:d1[-1],d2[0]:d2[-1]]

if np.max(np.isnan(win)) == 1:
rms[i,j] = np.nan

else:
win = signal.detrend(win, type = 'linear')
z = np.reshape(win,-1)
nz = np.size(z)
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms


return(rms)

但是我不确定使用需要输入到内部循环中的必要变量来编码对 Pool 的调用的正确方法。请参阅下面的外循环:
 for i in np.arange(w+1,nrows-w):
number_of_workers = 8

with Pool(number_of_workers) as p:
#call the pool
p.starmap(inLoop, [i, w, DEM, rms, ncols])



剩余问题:
  • 甚至可以通过并行化来优化这段代码吗?
  • 如何成功存储并行化嵌套 for 循环的结果?
  • 最佳答案

    使用 Numba 的解决方案

    在某些情况下,如果您使用的所有功能都受支持,这很容易做到。在您的代码中 win = signal.detrend(win, type = 'linear')是您必须在 Numba 中实现的部分,因为不支持此功能。

    在 Numba 中实现去趋势

    如果你看 source-code去趋势,并为您的问题提取相关部分,它可能如下所示:

    @nb.njit()
    def detrend(w):
    Npts=w.shape[0]
    A=np.empty((Npts,2),dtype=w.dtype)
    for i in range(Npts):
    A[i,0]=1.*(i+1) / Npts
    A[i,1]=1.

    coef, resids, rank, s = np.linalg.lstsq(A, w.T)
    out=w.T- np.dot(A, coef)
    return out.T

    我还为 np.max(np.isnan(win)) == 1 实现了一个更快的解决方案
    @nb.njit()
    def isnan(win):
    for i in range(win.shape[0]):
    for j in range(win.shape[1]):
    if np.isnan(win[i,j]):
    return True
    return False

    主要功能

    因为我在这里使用了 Numba,所以并行化非常简单,只是外循环上的一个 prange 和
    import numpy as np
    import numba as nb

    @nb.njit(parallel=True)
    def RMSH_det_nb(DEM, w):
    [nrows, ncols] = np.shape(DEM)

    #create an empty array to store result
    rms = DEM*np.nan

    for i in nb.prange(w+1,nrows-w):
    for j in range(w+1,ncols-w):
    win = DEM[i-w:i+w-1,j-w:j+w-1]

    if isnan(win):
    rms[i,j] = np.nan
    else:
    win = detrend(win)
    z = win.flatten()
    nz = z.size
    rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
    rms[i,j] = rootms

    return rms

    时间(小例子)
    w = 10
    DEM=np.random.rand(100, 100).astype(np.float32)

    res1=RMSH_det(DEM, w)
    res2=RMSH_det_nb(DEM, w)
    print(np.allclose(res1,res2,equal_nan=True))
    #True

    %timeit res1=RMSH_det(DEM, w)
    #1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    %timeit res2=RMSH_det_nb(DEM, w) #approx. 55 times faster
    #29 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

    较大阵列的计时
    w = 10
    DEM=np.random.rand(1355, 1165).astype(np.float32)
    %timeit res2=RMSH_det_nb(DEM, w)
    #6.63 s ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    [编辑] 使用正规方程实现

    Overdetermined system

    这个方法有一个 较低的数值精度 .虽然这个解决方案要快得多。
    @nb.njit()
    def isnan(win):
    for i in range(win.shape[0]):
    for j in range(win.shape[1]):
    if win[i,j]==np.nan:
    return True
    return False

    @nb.njit()
    def detrend(w):
    Npts=w.shape[0]
    A=np.empty((Npts,2),dtype=w.dtype)
    for i in range(Npts):
    A[i,0]=1.*(i+1) / Npts
    A[i,1]=1.

    coef, resids, rank, s = np.linalg.lstsq(A, w.T)
    out=w.T- np.dot(A, coef)
    return out.T

    @nb.njit()
    def detrend_2(w,T1,A):
    T2=np.dot(A.T,w.T)
    coef=np.linalg.solve(T1,T2)

    out=w.T- np.dot(A, coef)

    return out.T

    @nb.njit(parallel=True)
    def RMSH_det_nb_normal_eq(DEM,w):
    [nrows, ncols] = np.shape(DEM)

    #create an empty array to store result
    rms = DEM*np.nan

    Npts=w*2-1
    A=np.empty((Npts,2),dtype=DEM.dtype)
    for i in range(Npts):
    A[i,0]=1.*(i+1) / Npts
    A[i,1]=1.

    T1=np.dot(A.T,A)

    nz = Npts**2
    for i in nb.prange(w+1,nrows-w):
    for j in range(w+1,ncols-w):
    win = DEM[i-w:i+w-1,j-w:j+w-1]

    if isnan(win):
    rms[i,j] = np.nan
    else:
    win = detrend_2(win,T1,A)
    rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
    rms[i,j] = rootms

    return rms

    计时
    w = 10
    DEM=np.random.rand(100, 100).astype(np.float32)

    res1=RMSH_det(DEM, w)
    res2=RMSH_det_nb(DEM, w)
    print(np.allclose(res1,res2,equal_nan=True))
    #True

    %timeit res1=RMSH_det(DEM, w)
    #1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    %timeit res2=RMSH_det_nb_normal_eq(DEM,w)
    #7.97 ms ± 89.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

    使用正规方程的优化解

    临时数组被重用以避免昂贵的内存分配,并使用矩阵乘法的自定义实现。这仅适用于非常小的矩阵,在大多数其他情况下 np.dot (sgeemm) 会快得多。
    @nb.njit()
    def matmult_2(A,B,out):
    for j in range(B.shape[1]):
    acc1=nb.float32(0)
    acc2=nb.float32(0)
    for k in range(B.shape[0]):
    acc1+=A[0,k]*B[k,j]
    acc2+=A[1,k]*B[k,j]
    out[0,j]=acc1
    out[1,j]=acc2
    return out

    @nb.njit(fastmath=True)
    def matmult_mod(A,B,w,out):
    for j in range(B.shape[1]):
    for i in range(A.shape[0]):
    acc=nb.float32(0)
    acc+=A[i,0]*B[0,j]+A[i,1]*B[1,j]
    out[j,i]=acc-w[j,i]
    return out

    @nb.njit()
    def detrend_2_opt(w,T1,A,Tempvar_1,Tempvar_2):
    T2=matmult_2(A.T,w.T,Tempvar_1)
    coef=np.linalg.solve(T1,T2)
    return matmult_mod(A, coef,w,Tempvar_2)

    @nb.njit(parallel=True)
    def RMSH_det_nb_normal_eq_opt(DEM,w):
    [nrows, ncols] = np.shape(DEM)

    #create an empty array to store result
    rms = DEM*np.nan

    Npts=w*2-1
    A=np.empty((Npts,2),dtype=DEM.dtype)
    for i in range(Npts):
    A[i,0]=1.*(i+1) / Npts
    A[i,1]=1.

    T1=np.dot(A.T,A)

    nz = Npts**2
    for i in nb.prange(w+1,nrows-w):
    Tempvar_1=np.empty((2,Npts),dtype=DEM.dtype)
    Tempvar_2=np.empty((Npts,Npts),dtype=DEM.dtype)
    for j in range(w+1,ncols-w):
    win = DEM[i-w:i+w-1,j-w:j+w-1]

    if isnan(win):
    rms[i,j] = np.nan
    else:
    win = detrend_2_opt(win,T1,A,Tempvar_1,Tempvar_2)
    rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
    rms[i,j] = rootms

    return rms

    计时
    w = 10
    DEM=np.random.rand(100, 100).astype(np.float32)

    res1=RMSH_det(DEM, w)
    res2=RMSH_det_nb_normal_eq_opt(DEM, w)
    print(np.allclose(res1,res2,equal_nan=True))
    #True

    %timeit res1=RMSH_det(DEM, w)
    #1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    %timeit res2=RMSH_det_nb_normal_eq_opt(DEM,w)
    #4.66 ms ± 87.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

    isnan 的时间安排

    这个函数是一个完全不同的实现。如果 NaN 位于数组的开头,它会快得多,但无论如何即使没有也有一些加速。我用小数组(大约窗口大小)和@user3666197 建议的大大小对其进行了基准测试。
    case_1=np.full((20,20),np.nan)
    case_2=np.full((20,20),0.)
    case_2[10,10]=np.nan
    case_3=np.full((20,20),0.)

    case_4 = np.full( ( int( 1E4 ), int( 1E4 ) ),np.nan)
    case_5 = np.ones( ( int( 1E4 ), int( 1E4 ) ) )

    %timeit np.any(np.isnan(case_1))
    %timeit np.any(np.isnan(case_2))
    %timeit np.any(np.isnan(case_3))
    %timeit np.any(np.isnan(case_4))
    %timeit np.any(np.isnan(case_5))
    #2.75 µs ± 73.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
    #2.75 µs ± 46.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
    #2.76 µs ± 32.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
    #81.3 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
    #86.7 ms ± 2.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

    %timeit isnan(case_1)
    %timeit isnan(case_2)
    %timeit isnan(case_3)
    %timeit isnan(case_4)
    %timeit isnan(case_5)
    #244 ns ± 5.02 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
    #357 ns ± 1.07 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
    #475 ns ± 9.28 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
    #235 ns ± 0.933 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
    #58.8 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

    关于python-3.x - Python : Running nested loop, 2D 移动窗口,并行,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/58507565/

    27 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com