python - Cython 没有加速-6ren

python - Cython 没有加速

转载作者：行者123 更新时间：2023-12-04 12:04:39

我一直在尝试测试使用 Cython 与基本 Python 代码相比的加速潜力。为此，我编写了两个脚本“linearAdvec_mat.py”和“linearAdvec_mat.pyx”，如下所示:
linearAdvec_mat.py:

import numpy as np

def Adv_mat(N):
    """Return the N x N periodic advection stencil matrix.

    Each row ``r`` receives -1.0 in the column to the left of the diagonal,
    0.0 on the diagonal and 1.0 in the column to the right.  Row 0 takes
    column ``N - 1`` as its left neighbour and the last row takes column 0
    as its right neighbour.  The three entries are written in that order,
    so a later write wins when two of the columns coincide (N == 2).

    Note: for N == 1 the right-hand column of row 0 is index 1, which is out
    of range, so an IndexError is raised.
    """
    A = np.zeros((N, N))
    last = N - 1
    for row in range(N):
        # Row 0 is classified first, so it only wraps on its left side.
        left = last if row == 0 else row - 1
        right = 0 if (row != 0 and row == last) else row + 1
        A[row, left] = -1.0
        A[row, row] = 0.0
        A[row, right] = 1.0
    return A

def Diff_mat(N):
    """Return the N x N periodic diffusion stencil matrix.

    Each row ``r`` receives 1.0 in the column to the left of the diagonal,
    -2.0 on the diagonal and 1.0 in the column to the right.  Row 0 takes
    column ``N - 1`` as its left neighbour and the last row takes column 0
    as its right neighbour.

    Note: for N == 1 the right-hand column of row 0 is index 1, which is out
    of range, so an IndexError is raised.
    """
    D = np.zeros((N, N))
    last = N - 1
    for row in range(N):
        # Row 0 is classified first, so it only wraps on its left side.
        left = last if row == 0 else row - 1
        right = 0 if (row != 0 and row == last) else row + 1
        D[row, left] = 1.0
        D[row, row] = -2.0
        D[row, right] = 1.0
    return D

def Compute_eigVals(N, alpha, kdt):
    """Return a length-N complex array computed from the first row of ADt.

    ADt is ``A*(-alpha/2.0) + D*kdt`` with ``A = Adv_mat(N)`` and
    ``D = Diff_mat(N)``.  Entry ``m`` of the result is the sum over ``j`` of
    ``ADt[0, j] * exp(1j*2*pi*j*m/N)``, accumulated with explicit Python
    loops (N*N calls to ``np.exp`` on scalars).
    """
    A = Adv_mat(N);
    D = Diff_mat(N);
    ADt = A*(-alpha/2.0) + D*kdt;
    ldt = np.zeros(N, 'complex');
    # beta is filled in below but is neither returned nor read by the
    # accumulation into ldt.
    beta = np.zeros(N);
    for m in range(N):
        beta[m] = 2*np.pi*m/N;
        # Values above pi are reflected to 2*pi - beta[m].
        if beta[m] > np.pi:
            beta[m] = 2*np.pi - beta[m];
        # Only row 0 of ADt is read here.
        for j in range(N):
            ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
    return ldt;

和linearAdvec_mat.pyx:

import numpy as np
cimport numpy as np

DTYPE = np.float64;
DTYPE_c = np.complex128;
ctypedef np.float64_t DTYPE_t;

# Build the N x N float64 matrix A. Each row i gets -1.0 in the column before
# the diagonal, 0.0 on the diagonal and 1.0 in the column after it. Row 0 uses
# column N - 1 as its "before" column; row N - 1 uses column 0 as its "after"
# column. Declared with cdef, so it is callable only from Cython code.
cdef np.ndarray[DTYPE_t, ndim = 2] Adv_mat(int N):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = np.zeros((N, N), dtype = DTYPE);
    # i is declared as a C int so the loop and the buffer indexing use a typed index.
    cdef int i;
    for i in range(N):
        if i == 0:
            # First row: the "before" column wraps to the last column.
            A[i, N - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
        elif i == N - 1:
            # Last row: the "after" column wraps to column 0.
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, 0] = 1.0;
        else:
            A[i, i - 1] = -1.0;
            A[i, i] = 0.0;
            A[i, i + 1] = 1.0;
    return A;

# Build the N x N float64 matrix D. Each row i gets 1.0 in the column before
# the diagonal, -2.0 on the diagonal and 1.0 in the column after it. Row 0 uses
# column N - 1 as its "before" column; row N - 1 uses column 0 as its "after"
# column. Declared with cdef, so it is callable only from Cython code.
cdef np.ndarray[DTYPE_t, ndim = 2] Diff_mat(int N):
    cdef np.ndarray[DTYPE_t, ndim = 2] D = np.zeros((N, N), dtype = DTYPE);
    # i is declared as a C int so the loop and the buffer indexing use a typed index.
    cdef int i;
    for i in range(N):
        if i == 0:
            # First row: the "before" column wraps to the last column.
            D[i, N - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
        elif i == N - 1:
            # Last row: the "after" column wraps to column 0.
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, 0] = 1.0;
        else:
            D[i, i - 1] = 1.0;
            D[i, i] = -2.0;
            D[i, i + 1] = 1.0;
    return D;

# Return a length-N complex128 array whose entry m is the sum over k of
# ADt[0, k] * exp(1j*2*pi*k*m/N), where ADt = A*(-alpha/2.0) + D*kdt.
# Declared with cpdef, so it can be called from both Python and Cython.
cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N);
    cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N);
    # Adding 0j makes the result complex, matching the complex128 buffer type declared for ADt.
    cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j;
    cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c);
    # beta is filled in below but is neither returned nor read by the accumulation into ldt.
    cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE);
    cdef int m, k;
    for m in range(N):
        beta[m] = 2*np.pi*m/N;
        if beta[m] > np.pi:
            beta[m] = 2*np.pi - beta[m];
        for k in range(N):
            # np.pi and np.exp are attributes of the Python-level numpy module, so
            # this statement still goes through Python objects on every iteration.
            ldt[m] = ldt[m] + ADt[0, k]*np.exp(1j*2.0*np.pi*k*m/N);
    return ldt;

当我从基础 python 和编译后的 .so 文件调用 'Compute_eigVals' 函数时，如下所示，我没有从 cython 脚本中获得任何显着的加速。

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'
from libs.linearAdvec_mat import Compute_eigVals as Compute_eigVals_cy
from linearAdvec_mat import Compute_eigVals as Compute_eigVals_py
import time
#%% ------------------ Inputs ---------------------
# Problem size and coefficients; the same values are passed to both implementations.
N = 1000
alpha = 0.8
kdt = 0.05

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations.  time.time() is wall-clock time: it can be
# adjusted by the system and may have coarse resolution, which distorts
# benchmarks like this one.
st = time.perf_counter()
eigs = Compute_eigVals_cy(N, alpha, kdt)
t_cy = time.perf_counter() - st
print('Cython time : %0.8fs\n'%(t_cy))

st = time.perf_counter()
eigs = Compute_eigVals_py(N, alpha, kdt)
t_py = time.perf_counter() - st
print('Python time : %0.8fs\n'%(t_py))
# Ratio > 1 means the compiled version finished faster than the pure-Python one.
print('Cython is %0.5f times faster'%(t_py/t_cy))

我试图通过运行来检查 python 交互的数量

cython -a linearAdvec_mat.pyx

在终端中，但我无法从中看出任何问题。有人可以提供一些关于为什么我在使用 cython 时没有获得大量加速的见解吗？我的第一个猜测是，我的基本 python 脚本严重依赖于 numpy，而 numpy 已经处于优化状态，但我并不完全确定，并渴望弄清楚实际发生了什么。

最佳答案

Cython 解决方案:
让我们先对您的 Python 函数计时，作为基准参考:

In [3]: %timeit Compute_eigVals(N, alpha, kdt)
3.85 s ± 22.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

在 jupyter notebook 中对您的 Python 代码进行逐行性能分析，可以得到:

In [4]: %lprun -f Compute_eigVals Compute_eigVals(N, alpha, kdt)
Timer unit: 1e-06 s

Total time: 4.35475 s
File: <ipython-input-1-61dba133ade4>
Function: Compute_eigVals at line 37

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    37                                           def Compute_eigVals(N, alpha, kdt):
    38         1       2491.0   2491.0      0.1      A = Adv_mat(N);
    39         1       2295.0   2295.0      0.1      D = Diff_mat(N);
    40         1       8582.0   8582.0      0.2      ADt = A*(-alpha/2.0) + D*kdt;
    41         1         11.0     11.0      0.0      ldt = np.zeros(N, 'complex');
    42         1          2.0      2.0      0.0      beta = np.zeros(N);
    43      1001        357.0      0.4      0.0      for m in range(N):
    44      1000        713.0      0.7      0.0          beta[m] = 2*np.pi*m/N;
    45      1000        720.0      0.7      0.0          if beta[m] > np.pi:
    46       499        356.0      0.7      0.0              beta[m] = 2*np.pi - beta[m];
    47   1001000     390717.0      0.4      9.0          for j in range(N):
    48   1000000    3948510.0      3.9     90.7              ldt[m] += ADt[0, j]*np.exp(1j*2.0*np.pi*j*m/N);
    49         1          1.0      1.0      0.0      return ldt;

我们可以观察到时间关键部分是最里面的循环。那么让我们来看看你的 cython 代码:

为了减少python开销，这里有几个关键点:

访问常量 np.pi 有明显的 python 开销。您可以改用 libc.math 中的 C 常量 pi。另外可以缓存 2.0*pi 和 1j*2.0*pi 的结果，因为这两者都被多次使用。

同样，函数 np.exp 也有 python 开销，对标量参数调用它并不值得付出调用 python 函数的开销。您可以改用 C 的 cexp 函数。

最后，您可以使用 Cython 编译器指令 (Compiler directives) 进一步加速您的代码。在这里，我们启用 C 整数除法 (cdivision)，禁用索引边界检查 (boundscheck)，并禁用负索引 (wraparound)。

在代码中:

cimport cython

from libc.math cimport pi
cdef extern from "complex.h":
    double complex cexp(double complex)

# Adv_mat and Diff_mat are the same as above

# Optimised variant: the inner loop uses only C-level names (pi from libc.math,
# cexp declared from complex.h) instead of np.pi and np.exp.
# Directives: cdivision(True) selects C division semantics, boundscheck(False)
# turns off index bounds checking, wraparound(False) turns off negative indexing.
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[np.complex128_t, ndim = 1] Compute_eigVals(int N, double alpha, double kdt):
    cdef np.ndarray[DTYPE_t, ndim = 2] A = Adv_mat(N)
    cdef np.ndarray[DTYPE_t, ndim = 2] D = Diff_mat(N)
    # Adding 0j makes the result complex, matching the complex128 buffer type declared for ADt.
    cdef np.ndarray[np.complex128_t, ndim = 2] ADt = A*(-alpha/2.0) + D*kdt + 0j
    cdef np.ndarray[np.complex128_t, ndim = 1] ldt = np.zeros(N, dtype = DTYPE_c)
    # beta is filled in below but is neither returned nor read by the accumulation into ldt.
    cdef np.ndarray[DTYPE_t, ndim = 1] beta = np.zeros(N, dtype = DTYPE)
    cdef int m, k
    # Loop-invariant constants computed once, outside both loops.
    cdef double two_pi = 2*pi
    cdef double complex factor = 1j*2.0*pi+0
    for m in range(N):
        beta[m] = two_pi*m / N;
        if beta[m] > pi:
            beta[m] = two_pi - beta[m];
        for k in range(N):
            # factor is complex, so factor*k*m / N is evaluated in complex arithmetic.
            ldt[m] = ldt[m] + ADt[0, k]*cexp(factor*k*m / N);
    return ldt;

这消除了循环内的所有 python 交互。在我的机器上计时给出:

In [6]: %timeit Compute_eigVals(N, alpha, kdt)
45.8 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

改进的 Python 版本:
另请注意，实际上不需要 Cython，因为您可以用矢量化 numpy 操作替换 python 循环:

def Compute_eigVals2(N, alpha, kdt):
    """Vectorised equivalent of the double loop in ``Compute_eigVals``.

    Builds ``ADt = A*(-alpha/2.0) + D*kdt`` from ``Adv_mat(N)`` and
    ``Diff_mat(N)``, then returns the length-N complex array whose entry
    ``m`` is ``sum_j ADt[0, j] * exp(1j*2*pi*j*m/N)``, evaluated with a
    single broadcast expression instead of Python loops.
    """
    adv = Adv_mat(N)
    diff = Diff_mat(N)
    ADt = adv * (-alpha / 2.0) + diff * kdt

    idx = np.arange(N)

    # beta mirrors the loop version; it does not feed into the returned value.
    beta = 2 * np.pi * idx / N
    above_pi = beta > np.pi
    beta[above_pi] = 2 * np.pi - beta[above_pi]

    # JM[m, j] == j * m: a column vector times a row vector broadcasts to N x N.
    JM = idx * idx[:, None]
    phases = np.exp(1j * 2.0 * np.pi * JM / N)

    # Row 0 of ADt broadcasts across every row m; summing the last axis sums over j.
    return np.sum(ADt[0, :] * phases, axis=-1)

In [7]: %timeit Compute_eigVals2(N, alpha, kdt)
35.8 ms ± 655 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

关于python - Cython 没有加速，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/68594551/

文章推荐： traits - 特质是否仅适用于对象？

文章推荐： exception - 我什么时候想要恢复 Perl 6 异常？

文章推荐： spring-boot - Spring Cloud 配置服务器

文章推荐： c - ARM 皮质 M0+ : How to use "Branch if Carry" instructions in C-code?

iphone - <加速/加速.h> "file not found"
我想在我的 iPhone 应用程序中加入线性回归。经过一些搜索，我发现 Accelerate Framework 中的 LAPACK 和 BLAS 是正确的库。但是我很难将加速框架添加到我的 XCod
Javascript 加速？
有什么方法可以加速 JS 脚本(我指的是一些复杂的 DOM 操作，比如游戏或动画)？最佳答案真的没有办法真正加快速度。您可以压缩它，但不会快很多。关于Javascript 加速？，我们在Stac
MySQL加载数据infile - 加速？
有时，我必须为一个项目重新导入数据，从而将大约 360 万行读入 MySQL 表(目前是 InnoDB，但我实际上并不局限于这个引擎)。 “加载数据文件...”已被证明是最快的解决方案，但它有一个权衡
performance - 如何计算执行时间(加速)
在尝试计算加速时，我被卡住了。所以给出的问题是: 问题 1 如果程序的 50% 增强了 2 倍，其余 50% 增强了 4 倍，那么由于增强而导致的整体加速是多少？ Hints:考虑增强前(未增强)机器
python - 加速 Matplotlib
目前我正在处理实时绘图，但可视化非常慢。我想知道你可以做些什么来加速 Matplotlib 中的事情: 后端如何影响性能？是否有后端实时绘图比其他人更好吗？我可以降低分辨率以提高 FPS 吗？如
haskell - 加速 runhaskell
我有一个小型测试框架。它执行一个循环，执行以下操作: 生成一个小的 Haskell 源文件。使用 runhaskell 执行此操作.该程序生成各种磁盘文件。处理刚刚生成的磁盘文件。这种情况发生了
javascript - 加速 swfobject
这是我的网站:Instant-YouTube 如您所见，加载需要很长时间。在 IE8 及以下甚至有时会导致浏览器崩溃。我不确定是什么原因造成的。可能是 Clicksor 广告，但我认为是 swfobj
ios - 加速 SKSpriteNode
是否可以加速 SKSpriteNode？我知道可以使用 node.physicsBody.velocity 轻松设置速度但是设置它的加速度有多难？最佳答案从牛顿第二定律倒推运动:F = m.a您
javascript - 加速 FCKEditor
有没有人有加速 FCKEditor 的技术？是否有一些关键的 JavaScript 文件可以缩小或删除？最佳答案在最新版本 (3.0.1) 中，FCKEditor 已重命名为 CKEditor .
MySQL查询优化-加速|索引使用
我有以下 MySQL 查询，需要一天多的时间才能执行: SELECT SN,NUMBER FROM a WHERE SN IN (SELECT LOWER_SN FROM b WHER
ios - 加速、移动元素
我现在正在开发一款使用加速来玩的游戏。我找到了如何让我的元素移动，但不改变它的“原点”，或者更准确地说，改变加速度计算的原点: 事实上，我的图像是移动的，它的中心是这样定义的: imageView.c
mysql - 加速 ORDER BY
我有一个 mysql 表，其中存储有 4 列的成员消息: message_id(主键，自增) sender_id( key ) receiver_id( key ) 消息内容我做了很多 SELECT
用于简单计算的 CUDA 加速
我在 cuda_computation.cu 中有以下代码 #include #include #include #include void checkCUDAError(const char
python - 加速 BeautifulSoup
我正在使用 BeautifulSoup 在 for 循环中解析数千个网站。这是我的代码片段: def parse_decision(link): t1 = time.time() de
c++ - 加速 OpenCV
我正在使用 OpenCV 2.4 (C++) 在灰度图像上进行寻线。这涉及一些基本的图像处理步骤，如模糊、阈值、Canny 边缘检测器、梯度滤波器或霍夫变换。我必须在数千张图像上应用寻线算法。考虑到
java - 加速 jasperreports
当我试图连续生成四次相同的报告时，我刚刚分析了我的报告应用程序。第一个用了 1859 毫秒，而后面的只用了 400 到 600 毫秒。对此的解释是什么？我能以某种方式使用它来使我的应用程序更快吗？报告
ios - 加速 Storyboard打开
当我打开 Storyboard文件时，由于其中包含的 VC 数量，打开它需要 1-2 分钟。加快速度的最佳做法是什么？我们应该将一些 VC 移动到不同的 Storyboard文件中吗？我们是否应该使用
iphone - 加速 UIPageViewController
我有一个包含多个页面的 UIPageViewController。每个页面都是相同的 View Controller ，但会跟踪页码并显示 PDF 的正确页面。问题是每个 PDF 页面都需要在 cur
java - 加速 Java
这实际上是两个问题，但它们非常相似，为了简单起见，我想将它们放在一起: 首先:给定一个已建立的 Java 项目，除了简单的代码内优化之外，还有哪些不错的方法可以加快它的速度？其次:在用Java从头写
java - 加速 xpath
我有一个包含 1000 个条目的文档，其格式类似于:

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

python - Cython 没有加速