python - Cython 函数比纯 python 花费更多时间-6ren

python - Cython 函数比纯 python 花费更多时间

转载作者：太空宇宙更新时间：2023-11-03 13:28:59

我正在尝试加速我的代码，但它的这一部分给我带来了问题，

我尝试使用 Cython，并遵循了这里(here)给出的建议，但是我的纯 python 函数仍然比 cython 和 cython_optimized 函数表现得更好。

cython代码如下:

import numpy as np
cimport numpy as np

# np.float was merely an alias of the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24. Name the 64-bit type explicitly so the
# Python-level dtype and the C-level element type visibly agree.
DTYPE = np.float64
ctypedef np.float64_t DTYPE_t

cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def compute_cython(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Return the density-weighted combination of the ice and dust terms.

    None of the arguments is typed, so every statement below is an ordinary
    whole-array NumPy operation, exactly as in the pure-Python version.
    ``PorosityProfile`` is accepted but never read.
    """
    # Coefficients of the dust polynomial, expanded around u = DustJ.
    DustJ = 250.0
    DustF = 633.0
    DustG = 2.513
    DustH = -2.2e-3
    DustI = -2.8e-6

    # Coefficients of the ice rational function in x = u / IceI.
    IceI = 273.16
    IceC = 1.843e5
    IceD = 1.6357e8
    IceE = 3.5519e9
    IceF = 1.6670e2
    IceG = 6.4650e4
    IceH = 1.6935e6

    shifted = u - DustJ
    dust_term = DustF + DustG * shifted + DustH * shifted ** 2 + DustI * (shifted ** 3)

    ratio = u / IceI
    numerator = (ratio ** 3) * (IceC + IceD * (ratio ** 2) + IceE * (ratio ** 6))
    denominator = 1 + IceF * (ratio ** 2) + IceG * (ratio ** 4) + IceH * (ratio ** 8)
    ice_term = numerator / denominator

    weighted_sum = DensityIceProfile * ice_term + DensityDustProfile * dust_term
    return weighted_sum / DensityProfile


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def compute_cythonOptimized(np.ndarray[DTYPE_t, ndim=1] u, np.ndarray[DTYPE_t, ndim=1] PorosityProfile, np.ndarray[DTYPE_t, ndim=1] DensityIceProfile, np.ndarray[DTYPE_t, ndim=1] DensityDustProfile, np.ndarray DensityProfile):
    """Return the density-weighted combination of the ice and dust terms.

    The whole formula is evaluated in one typed C loop. Typing the arrays
    alone gives no speed-up as long as the body consists of whole-array
    NumPy expressions, because each of those is still a Python-level call
    that allocates a temporary array.

    Parameters
    ----------
    u, DensityIceProfile, DensityDustProfile, DensityProfile
        1-D arrays of element type ``DTYPE_t`` with identical length.
    PorosityProfile
        Accepted for interface compatibility; never read.

    Returns
    -------
    numpy.ndarray
        New 1-D array with one value per element of ``u``.

    Raises
    ------
    ValueError
        If an array has the wrong dtype or dimensionality (raised by the
        buffer acquisition) or if the array lengths differ.
    """
    # double, not float: a C float is single precision and would silently
    # round these coefficients before they meet the double-precision data.
    cdef double DustJ = 250.0
    cdef double DustF = 633.0
    cdef double DustG = 2.513
    cdef double DustH = -2.2e-3
    cdef double DustI = -2.8e-6
    cdef double IceI = 273.16
    cdef double IceC = 1.843e5
    cdef double IceD = 1.6357e8
    cdef double IceE = 3.5519e9
    cdef double IceF = 1.6670e2
    cdef double IceG = 6.4650e4
    cdef double IceH = 1.6935e6

    cdef Py_ssize_t i
    cdef Py_ssize_t n = u.shape[0]
    cdef double delta, result_dust, x, x2, x3, x4, x6, x8, result_ice

    # Binding to a typed buffer validates dtype and ndim of DensityProfile
    # the same way the typed parameters are validated on entry.
    cdef np.ndarray[DTYPE_t, ndim=1] density = DensityProfile
    cdef np.ndarray[DTYPE_t, ndim=1] res = np.empty(n, dtype=DTYPE)

    # Bounds checking is switched off for the loop, so lengths must be
    # verified up front.
    if (DensityIceProfile.shape[0] != n or DensityDustProfile.shape[0] != n
            or density.shape[0] != n):
        raise ValueError("all input arrays must have the same length")

    for i in range(n):
        delta = u[i] - DustJ
        result_dust = DustF + DustG*delta + DustH*delta*delta + DustI*delta*delta*delta

        # Build the powers by repeated multiplication instead of pow().
        x = u[i] / IceI
        x2 = x*x
        x3 = x2*x
        x4 = x2*x2
        x6 = x3*x3
        x8 = x4*x4
        result_ice = x3*(IceC + IceD*x2 + IceE*x6)/(1.0 + IceF*x2 + IceG*x4 + IceH*x8)

        # cdivision(True): a zero density yields inf/nan as in NumPy
        # rather than raising ZeroDivisionError.
        res[i] = (DensityIceProfile[i]*result_ice + DensityDustProfile[i]*result_dust)/density[i]

    return res

然后我运行以下命令:

def compute_python(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
    """Return the density-weighted combination of the ice and dust terms.

    Every statement is a whole-array NumPy operation, so the result has one
    value per element of ``u``. ``PorosityProfile`` is accepted but never
    read.
    """
    # Coefficients of the dust polynomial, expanded around u = DustJ.
    DustJ = 250.0
    DustF = 633.0
    DustG = 2.513
    DustH = -2.2e-3
    DustI = -2.8e-6

    # Coefficients of the ice rational function in x = u / IceI.
    IceI = 273.16
    IceC = 1.843e5
    IceD = 1.6357e8
    IceE = 3.5519e9
    IceF = 1.6670e2
    IceG = 6.4650e4
    IceH = 1.6935e6

    shifted = u - DustJ
    dust_term = DustF + DustG * shifted + DustH * shifted ** 2 + DustI * (shifted ** 3)

    ratio = u / IceI
    numerator = (ratio ** 3) * (IceC + IceD * (ratio ** 2) + IceE * (ratio ** 6))
    denominator = 1 + IceF * (ratio ** 2) + IceG * (ratio ** 4) + IceH * (ratio ** 8)
    ice_term = numerator / denominator

    weighted_sum = DensityIceProfile * ice_term + DensityDustProfile * dust_term
    return weighted_sum / DensityProfile

import sublimation
import numpy as np

%timeit compute_python(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cython(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

%timeit compute_cythonOptimized(np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100),np.random.rand(100))

结果如下:

对于纯 python:每个循环 68.9 µs ± 851 ns(7 次运行的平均值 ± 标准偏差，每次 10000 次循环)

对于未优化的 cython:每个循环 68.2 µs ± 685 ns(7 次运行的平均值 ± 标准偏差，每次 10000 次循环)

对于优化的 cython:每个循环 72.7 µs ± 416 ns(7 次运行的平均值 ± 标准偏差，每次 10000 次循环)

我做错了什么？

谢谢你的帮助，

最佳答案

使用 Numba 的解决方案

CodeSurgeon 已经使用 Cython 给出了很好的答案。在这个答案中，我想展示使用 Numba 的替代方法。

我创建了三个版本。在 naive_Numba 中，我只添加了一个函数装饰器。在 improved_Numba 中，我手动合并了循环(每个矢量化命令实际上都是一个循环)。在 improved_Numba_p 中，我将函数并行化。请注意，使用并行加速(parallel=True)时，Numba 显然存在一个 bug，导致无法在函数内部定义常量值。另请注意，并行化版本仅对较大的输入数组有益；不过您也可以添加一个小包装器，根据输入数组大小调用单线程或并行版本。

代码dtype=float64

import numba as nb
import numpy as np
import time



@nb.njit(fastmath=True)
def naive_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  """Return the density-weighted combination of the ice and dust terms.

  The formula is written with whole-array expressions; the only Numba
  specific part is the decorator. ``PorosityProfile`` is accepted but
  never read.
  """
  # Coefficients of the dust polynomial, expanded around u = DustJ.
  DustJ = 250.0
  DustF = 633.0
  DustG = 2.513
  DustH = -2.2e-3
  DustI = -2.8e-6

  # Coefficients of the ice rational function in x = u / IceI.
  IceI = 273.16
  IceC = 1.843e5
  IceD = 1.6357e8
  IceE = 3.5519e9
  IceF = 1.6670e2
  IceG = 6.4650e4
  IceH = 1.6935e6

  shifted = u - DustJ
  dust_term = DustF + DustG*shifted + DustH*shifted**2 + DustI*(shifted**3)

  ratio = u/IceI
  numerator = (ratio**3)*(IceC + IceD*(ratio**2) + IceE*(ratio**6))
  denominator = 1 + IceF*(ratio**2) + IceG*(ratio**4) + IceH*(ratio**8)
  ice_term = numerator/denominator

  return (DensityIceProfile*ice_term + DensityDustProfile*dust_term)/DensityProfile

# error_model='numpy' makes division by zero follow NumPy semantics instead of
# raising an exception, which allows the loop to be vectorized
@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  """Return the density-weighted combination of the ice and dust terms.

  All array expressions are fused into a single explicit loop over the
  elements of ``u``; the result is a new array of the same length and
  dtype as ``u``. ``PorosityProfile`` is accepted but never read.
  """
  # Coefficients of the dust polynomial, expanded around u = DustJ.
  DustJ = 250.0
  DustF = 633.0
  DustG = 2.513
  DustH = -2.2e-3
  DustI = -2.8e-6

  # Coefficients of the ice rational function in x = u / IceI.
  IceI = 273.16
  IceC = 1.843e5
  IceD = 1.6357e8
  IceE = 3.5519e9
  IceF = 1.6670e2
  IceG = 6.4650e4
  IceH = 1.6935e6

  n = u.shape[0]
  out = np.empty(n, dtype=u.dtype)

  for idx in range(n):
    value = u[idx]

    shifted = value - DustJ
    dust_term = DustF + DustG*shifted + DustH*shifted**2 + DustI*(shifted**3)

    ratio = value/IceI
    numerator = (ratio**3)*(IceC + IceD*(ratio**2) + IceE*(ratio**6))
    denominator = 1 + IceF*(ratio**2) + IceG*(ratio**4) + IceH*(ratio**8)
    ice_term = numerator/denominator

    out[idx] = (DensityIceProfile[idx]*ice_term + DensityDustProfile[idx]*dust_term)/DensityProfile[idx]

  return out

# There is obviously a bug in Numba (declaring const values in the function),
# so this variant receives every coefficient as an argument.
@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH):
  """Parallel element-wise evaluation of the ice/dust formula.

  The iterations of the ``nb.prange`` loop are independent: each one reads
  element ``idx`` of the inputs and writes element ``idx`` of the result.
  ``PorosityProfile`` is accepted but never read.
  """
  n = u.shape[0]
  out = np.empty(n, dtype=u.dtype)

  for idx in nb.prange(n):
    value = u[idx]

    shifted = value - DustJ
    dust_term = DustF + DustG*shifted + DustH*shifted**2 + DustI*(shifted**3)

    ratio = value/IceI
    numerator = (ratio**3)*(IceC + IceD*(ratio**2) + IceE*(ratio**6))
    denominator = 1 + IceF*(ratio**2) + IceG*(ratio**4) + IceH*(ratio**8)
    ice_term = numerator/denominator

    out[idx] = (DensityIceProfile[idx]*ice_term + DensityDustProfile[idx]*dust_term)/DensityProfile[idx]

  return out

# NOTE(review): the arrays below are float32 although this section is
# labelled dtype=float64 -- confirm which dtype the reported timings use.
N = 1000000
u = np.random.rand(N).astype(np.float32)
PorosityProfile = np.random.rand(N).astype(np.float32)
DensityIceProfile = np.random.rand(N).astype(np.float32)
DensityDustProfile = np.random.rand(N).astype(np.float32)
DensityProfile = np.random.rand(N).astype(np.float32)
DustJ, DustF, DustG, DustH, DustI = 250.0, 633.0, 2.513, -2.2e-3, -2.8e-6
IceI, IceC, IceD, IceE, IceF, IceG, IceH =  273.16, 1.843e5, 1.6357e8, 3.5519e9, 1.6670e2,  6.4650e4, 1.6935e6

# Warm-up call: don't measure compilation overhead on the first call.
res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

# Start the clock only after the warm-up; without this assignment the
# print below fails with NameError because t1 is never defined.
t1=time.time()
for i in range(1000):
  res=improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile,DustJ, DustF, DustG, DustH, DustI,IceI, IceC, IceD, IceE, IceF, IceG, IceH)

# Total wall-clock seconds for the 1000 timed calls.
print(time.time()-t1)

性能

Arraysize np.random.rand(100)
Numpy             46.8µs
naive Numba       3.1µs
improved Numba:   1.62µs
improved_Numba_p: 17.45µs


#Arraysize np.random.rand(1000000)
Numpy             255.8ms
naive Numba       18.6ms
improved Numba:   6.13ms
improved_Numba_p: 3.54ms

代码dtype=np.float32

如果 np.float32 就足够了，您必须将函数中的所有常量值显式声明为 float32。否则 Numba 将使用 float64。

@nb.njit(fastmath=True,error_model='numpy')
def improved_Numba(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  """Single-loop evaluation of the ice/dust formula with float32 constants.

  Every literal is wrapped in ``nb.float32`` so that the coefficients are
  not typed as float64. The result is a new array with the length and
  dtype of ``u``. ``PorosityProfile`` is accepted but never read.
  """
  # Coefficients of the dust polynomial, expanded around u = DustJ.
  DustJ = nb.float32(250.0)
  DustF = nb.float32(633.0)
  DustG = nb.float32(2.513)
  DustH = nb.float32(-2.2e-3)
  DustI = nb.float32(-2.8e-6)

  # Coefficients of the ice rational function in x = u / IceI.
  IceI = nb.float32(273.16)
  IceC = nb.float32(1.843e5)
  IceD = nb.float32(1.6357e8)
  IceE = nb.float32(3.5519e9)
  IceF = nb.float32(1.6670e2)
  IceG = nb.float32(6.4650e4)
  IceH = nb.float32(1.6935e6)

  one = nb.float32(1)
  n = u.shape[0]
  out = np.empty(n, dtype=u.dtype)

  for idx in range(n):
    value = u[idx]

    shifted = value - DustJ
    dust_term = DustF + DustG*shifted + DustH*shifted**2 + DustI*(shifted**3)

    ratio = value/IceI
    numerator = (ratio**3)*(IceC + IceD*(ratio**2) + IceE*(ratio**6))
    denominator = one + IceF*(ratio**2) + IceG*(ratio**4) + IceH*(ratio**8)
    ice_term = numerator/denominator

    out[idx] = (DensityIceProfile[idx]*ice_term + DensityDustProfile[idx]*dust_term)/DensityProfile[idx]

  return out

@nb.njit(fastmath=True,parallel=True,error_model='numpy')
def improved_Numba_p(u, PorosityProfile, DensityIceProfile, DensityDustProfile, DensityProfile):
  """Parallel evaluation of the ice/dust formula with float32 constants.

  Every literal is wrapped in ``nb.float32`` so that the coefficients are
  not typed as float64. The iterations of the ``nb.prange`` loop are
  independent: each one reads element ``idx`` of the inputs and writes
  element ``idx`` of the result. ``PorosityProfile`` is accepted but never
  read.
  """
  n = u.shape[0]
  out = np.empty(n, dtype=u.dtype)

  # Coefficients of the dust polynomial, expanded around u = DustJ.
  DustJ = nb.float32(250.0)
  DustF = nb.float32(633.0)
  DustG = nb.float32(2.513)
  DustH = nb.float32(-2.2e-3)
  DustI = nb.float32(-2.8e-6)

  # Coefficients of the ice rational function in x = u / IceI.
  IceI = nb.float32(273.16)
  IceC = nb.float32(1.843e5)
  IceD = nb.float32(1.6357e8)
  IceE = nb.float32(3.5519e9)
  IceF = nb.float32(1.6670e2)
  IceG = nb.float32(6.4650e4)
  IceH = nb.float32(1.6935e6)

  one = nb.float32(1)

  for idx in nb.prange(n):
    value = u[idx]

    shifted = value - DustJ
    dust_term = DustF + DustG*shifted + DustH*shifted**2 + DustI*(shifted**3)

    ratio = value/IceI
    numerator = (ratio**3)*(IceC + IceD*(ratio**2) + IceE*(ratio**6))
    denominator = one + IceF*(ratio**2) + IceG*(ratio**4) + IceH*(ratio**8)
    ice_term = numerator/denominator

    out[idx] = (DensityIceProfile[idx]*ice_term + DensityDustProfile[idx]*dust_term)/DensityProfile[idx]

  return out

性能

Arraysize np.random.rand(100).astype(np.float32)
Numpy             29.3µs
improved Numba:   1.33µs
improved_Numba_p: 18µs


Arraysize np.random.rand(1000000).astype(np.float32)
Numpy             117ms
improved Numba:   2.46ms
improved_Numba_p: 1.56ms

与 @CodeSurgeon 提供的 Cython 版本的比较并不公平，因为他编译函数时没有启用 AVX2 和 FMA3 指令。Numba 默认使用 -march=native 进行编译，这会在我的 Core i7-4xxx 上启用 AVX2 和 FMA3 指令。

不过，如果您想分发代码的已编译 Cython 版本，不启用这些指令是有道理的，因为一旦启用了这些优化，编译结果默认无法在 Haswell 之前的处理器(或所有奔腾和赛扬)上运行。编译多个代码路径应该是可行的，但这依赖于编译器，并且需要更多工作。

关于python - Cython 函数比纯 python 花费更多时间，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/50380692/

文章推荐： python - 根据名称 pandas python 对某些列进行乘法和求和

文章推荐： c# - Azure Servicebus 上的 ConnectivityMode.Http 是否支持 HTTPS

文章推荐： PHP AMQ 库通过 WAN 加密凭据

文章推荐： python - 从 Pandas 文件名中提取文件扩展名

cython - cython 何时以及如何进行边界检查？
c 不做边界检查。那么cython是如何检查是否编译成c的呢？ %%cython --annotate cimport cython @cython.boundscheck(True) cpdef m
cython - Cython 中的
可以直接声明用于 Cython 构造函数？据我了解，这是可能的: # Cython cdef int[3] li = [1, 2, 3] # C++ int[3] li = {1, 2, 3} 但
cython - 在 Cython 中将结构自动转换为字典
所以，如果你有一个头文件。 %%file test.h struct mystruct{ int i; int j; }; 然后你将它包装在 Cython 中: cdef extern fr
cython - 如何在定义 cython 扩展之前识别编译器？
我正在构建一个独立于平台的 cython 项目，我想根据正在使用的编译器传递编译器参数。我可以猜测基于平台的编译器，或者假设它与用于 Python 的编译器相同，但不能保证匹配。通常我注入(injec
cython - 诗歌+狮身人面像+Cython
我使用诗歌构建我的 cython 包。我在所有函数和类中都有 NumPy 风格的文档字符串。我现在要做的是添加 Sphinx 自动文档并发布在 Read the Docs。我已阅读此主题 How d
cython - 将自定义比较器传递给 Cython 中的优先级队列
赛通 libcpp模块包含 priority_queue 的模板，这很好，除了一件事:我不能通过自定义比较器(或者，至少，我不知道如何)。我需要这个，因为我需要 priority_queue做一个a
cython - 如何在文档中显示 Cython 函数的参数？
以下代码定义了一个简单的 Cython 函数(为方便起见，使用 Ipython 魔法)。 %load_ext cython %%cython def f(float x, float y=2):
cython - 使用 Cython 进行复值计算
我正在尝试使用 cython 进行复数计算。在示例代码中，我想计算复数的复指数函数。问题是我不知道如何将我的整数乘以虚数单位。python的虚数单位1.0j乘以cython执行时报错。这是我的代码:
cython - 在 Cython 中定义字符串数组
在这里停留在一些基本的 Cython 上 - 在 Cython 中定义字符串数组的规范且有效的方法是什么？具体来说，我想定义一个定长常量数组char . (请注意，此时我不想引入 NumPy。) 在
cython - 在 Cython 中在编译时获取整数的大小
是否有可能，如果是，如何确定 Cython 中整数数据类型的大小(以位为单位)？我正在尝试做这样的事情，以获得整数大小: cdef WORD_BITS = 0 IF sizeof(unsigned
cython - 打印 cython 变量的地址
我只是想打印 cython 变量的地址，但我无法绕过错误消息: cdef int myvar print &myvar 抛出 Cannot convert 'int *' to Python obje
cython - 如何在 Cython 中扩展宏
我有一个 C 头文件，它在宏中定义了一个函数。我需要从 Cython 调用它。有没有办法在 Cython 中使用宏并使其完全扩展？我已经有了 C 类型的参数。我尝试像使用函数一样使用 cdef，我认
cython - 在 Cython 中获取结构元素
令人惊讶的是，我似乎找不到通过名称获取结构体元素的单个示例(无论是在网络上还是在 cython 示例中)。所以我收到了一个指向 C 函数结构体的指针，并且想要一一访问这些元素并将它们重新打包到 py
cython - 我的 Cython 有什么问题？
我尝试围绕 C++ 库编写一个 Cython 包装器 http://primesieve.org/ 它包装了一个函数count。到目前为止，它可以正确安装 python setup.py instal
python - Cython:ImportError:没有名为 'myModule' 的模块:如何将包含 cimport 的 cython 模块调用到另一个 cython 结节？
我正在尝试将 cython 模块 data.pyx 导入另一个 cython 模块 user.pyx。一切都编译得很好，但是当我尝试在 python 模块中调用 user.pyx 时，我收到错误“Im
cython - Bakeoff 第 1 部分 Python vs Cython vs Cython 类型化内存 View : LDA by Gibbs Sampling
更新:内存 View 获胜。Cython 使用类型化内存 View :0.0253449 特别感谢 lothario，他指出了几个关键的变化。荒谬。当然现在的问题是，似乎不能对它们做太多算术(加法和
cython - 为什么不能腌制 cython 内存 View ？
我有一个使用 memoryview 数组的 cython 模块，即... double[:,:] foo 我想使用多处理并行运行这个模块。但是我得到了错误: PicklingError: Can't
cython - mypy 不喜欢别名 Cython 类型
我正在尝试使用 Cython 加速 PEP 484 类型的 python 脚本。我想保持一些语义和可读性。之前，我有一个 Flags = int def difference(f1: Flags,
cython - 使用 cython 为一组文件制作一个 pyd
这个问题已经有答案了: Collapse multiple submodules to one Cython extension (5 个回答) 已关闭 3 年前。我在一个包中有多个 .py 文件
cython - 如何分发从 cython 生成的 .so 文件
我已经能够在我的 .pyx 脚本上使用 cython 在 linux 上创建一个 .so 文件。我也可以成功地在我的 python 解释器上进行导入。我的问题是如何在不使用 cython 的情况下将

太空宇宙

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

python - Cython 函数比纯 python 花费更多时间

使用 Numba 的解决方案