gpt4 book ai didi

python - Cython 函数比纯 python 花费更多时间

转载 作者:太空宇宙 更新时间:2023-11-03 13:28:59 25 4
gpt4 key购买 nike

我正在尝试加速我的代码,但它的这一部分给我带来了问题,

我尝试使用 Cython,然后遵循给出的建议 here但是我的纯 python 函数比 cython 和 cython_optimized 函数表现得更好

cython代码如下:

import numpy as np
cimport numpy as np

DTYPE = np.float
ctypedef np.float_t DTYPE_t

cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)

def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6

delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile


def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray DensityProfile):

assert u.dtype == DTYPE
assert PorosityProfile.dtype == DTYPE
assert DensityIceProfile.dtype == DTYPE
assert DensityDustProfile.dtype == DTYPE
assert DensityProfile.dtype == DTYPE

cdef float DustJ = 250.0
cdef float DustF = 633.0
cdef float DustG = 2.513
cdef float DustH = -2.2e-3
cdef float DustI = -2.8e-6
cdef float IceI = 273.16
cdef float IceC = 1.843e5
cdef float IceD = 1.6357e8
cdef float IceE = 3.5519e9
cdef float IceF = 1.6670e2
cdef float IceG = 6.4650e4
cdef float IceH = 1.6935e6

cdef np.ndarray[DTYPE_t, ndim=1] delta = u-DustJ
cdef np.ndarray[DTYPE_t, ndim=1] result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

cdef np.ndarray[DTYPE_t, ndim=1] x= u/IceI;
cdef np.ndarray[DTYPE_t, ndim=1] result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

然后我运行以下命令:

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):

DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6

delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

import sublimation
import numpy as np

%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

结果如下:

对于纯 python:每个循环 68.9 µs ± 851 ns(7 次运行的平均值 ± 标准偏差,每次 10000 次循环)

对于未优化的 cython:每个循环 68.2 µs ± 685 ns(7 次运行的平均值 ± 标准偏差,每次 10000 次循环)

对于优化的一个:72.7 µs ± 416 ns 每个循环(7 次运行的平均值 ± std.dev,每次 10000 次循环)

我做错了什么?

谢谢你的帮助,

最佳答案

使用 Numba 的解决方案

CodeSurgeon 已经使用 Cython 给出了很好的答案。在这个答案中,我不想展示使用 Numba 的替代方法。

我已经创建了三个版本。在 naive_numba 中,我只添加了一个函数装饰器。在 improved_Numba 中,我手动组合了循环(每个矢量化命令实际上是一个循环)。在 improved_Numba_p 中,我已将函数并行化。请注意,在使用并行加速器时,显然存在不允许定义常量值的错误。还注意到并行化版本仅对较大的输入数组有益。但您也可以添加一个小包装器,根据输入数组大小调用单线程或并行版本。

代码dtype=float64

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6

delta = u-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

x= u/IceI;
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

return (DensityIceProfile*result_ice+DensityDustProfile*result_dust)/DensityProfile

#error_model='numpy' sets divison by 0 to NaN instead of throwing a exception, this allows vectorization
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6
res=np.empty(u.shape[0],dtype=u.dtype)

for i in range(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

return res

#there is obviously a bug in Numba (declaring const values in the function)
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
res=np.empty((u.shape[0]),dtype=u.dtype)

for i in nb.prange(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3);

x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(1+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

return res

u=np.array(np.random.rand(1000000),dtype=np.float32)
PorosityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityIceProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityDustProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DensityProfile=np.array(np.random.rand(1000000),dtype=np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH = 273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2, 6.4650e4, 1.6935e6

#don't measure compilation overhead on first call
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)
for i in range(1000):
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

print(time.time()-t1)
print(time.time()-t1)

性能

Arraysize np.random.rand(100)
Numpy 46.8µs
naive Numba 3.1µs
improved Numba: 1.62µs
improved_Numba_p: 17.45µs


#Arraysize np.random.rand(1000000)
Numpy 255.8ms
naive Numba 18.6ms
improved Numba: 6.13ms
improved_Numba_p: 3.54ms

代码dtype=np.float32

如果 np.float32 就足够了,您必须将函数中的所有常量值显式声明为 float32。否则 Numba 将使用 float64。

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)
res=np.empty(u.shape[0],dtype=u.dtype)

for i in range(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

return res

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
res=np.empty((u.shape[0]),dtype=u.dtype)
DustJ, DustF, DustG, DustH, DustI = nb.float32(250.0), nb.float32(633.0), nb.float32(2.513), nb.float32(-2.2e-3), nb.float32(-2.8e-6)
IceI, IceC, IceD, IceE, IceF, IceG, IceH = nb.float32(273.16), nb.float32(1.843e5), nb.float32(1.6357e8), nb.float32(3.5519e9), nb.float32(1.6670e2), nb.float32(6.4650e4), nb.float32(1.6935e6)

for i in nb.prange(u.shape[0]):
delta = u[i]-DustJ
result_dust = DustF+DustG*delta+DustH*delta**2+DustI*(delta**3)

x= u[i]/IceI
result_ice = (x**3)*(IceC+IceD*(x**2)+IceE*(x**6))/(nb.float32(1)+IceF*(x**2)+IceG*(x**4)+IceH*(x**8))

res[i]=(DensityIceProfile[i]*result_ice+DensityDustProfile[i]*result_dust)/DensityProfile[i]

return res

性能

Arraysize np.random.rand(100).astype(np.float32)
Numpy 29.3µs
improved Numba: 1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy 117ms
improved Numba: 2.46ms
improved_Numba_p: 1.56ms

与@CodeSurgeon 提供的 Cython 版本的比较并不公平,因为他没有使用启用的 AVX2 和 FMA3 指令编译函数。 Numba 默认使用 -march=native 进行编译,这会在我的 Core i7-4xxx 上启用 AVX2 和 FMA3 指令。

但是,如果您不想分发代码的已编译 Cython 版本,这是有道理的,因为如果启用了优化,它不会在 Haswell 之前的处理器(或所有奔腾和赛扬)上默认运行。编译多个代码路径应该是可能的,但依赖于编译器并且需要更多工作。

关于python - Cython 函数比纯 python 花费更多时间,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/50380692/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com