gpt4 book ai didi

python - Cython 使用 prange/parallel 没有性能提升

转载 作者:太空宇宙 更新时间:2023-11-03 13:09:27 28 4
gpt4 key购买 nike

我正在使用 Cython 0.27.3 编译下面这个简单的素数测试模块的源代码,该模块包含同一算法的 Python 和 Cython 两种实现。当我把 threads 参数设置为不同的值时,尽管 GIL 已经释放,却看不到任何性能提升。是不是有什么东西阻止了它并行运行?

有问题的函数是 cdef void _getprimes,它接受一个 memoryview 切片作为参数,并且应该将该切片中的所有非质数设置为 0。

primes.pyx

#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math

# =====================
# Python implementation
# =====================

def pyisprime(n):
    """Return True if the integer *n* is prime, False otherwise.

    Pure-Python reference implementation based on trial division.
    Values below 2 (including negatives) are reported as not prime.
    """
    if n < 2 or n & 1 == 0:
        # 2 is the only even prime; everything else reaching here is
        # either below 2 or even, hence not prime.
        return n == 2
    # n is odd from here on, so only odd divisors can divide it.  The bound
    # is checked with integer arithmetic (i * i <= n) instead of
    # math.sqrt(), which loses precision for large ints and raises
    # OverflowError once n no longer fits in a float.
    i = 3
    while i * i <= n:
        if n % i == 0:
            return False
        i += 2
    return True

def pygetprimes(nums):
    """Return a list of the primes in *nums*, in their original order.

    Each element is tested with pyisprime(); duplicates are kept.
    """
    return [num for num in nums if pyisprime(num)]


# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Return 1 if *n* is prime and 0 otherwise, using trial division.

    Declared nogil so it can be called from prange/parallel sections.
    """
    cdef unsigned long long upper
    cdef unsigned long long i = 3
    if n < 2 or n & 1 == 0:
        # 2 is the only even prime; 0, 1 and every other even value are not.
        if n == 2:
            return 1
        return 0
    # Rounding the square root up keeps the bound inclusive for perfect
    # squares such as 9 or 25.
    upper = <unsigned long long>ceil(sqrt(<double>n))
    while i <= upper:
        if n % i == 0:
            return 0
        # n is odd at this point, so even divisors can never divide it.
        i += 2
    return 1

def isprime(unsigned long long n):
    """Python-callable wrapper around the C-level _isprime().

    Returns 1 if *n* is prime and 0 otherwise.  The GIL is released while
    the primality check runs.
    """
    cdef int result
    with nogil:
        result = _isprime(n)
    return result

cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    """Overwrite every non-prime entry of *nums* with 0, in place.

    The loop is expressed with prange inside a parallel block that requests
    *threads* threads.  Each iteration touches only nums[i], so iterations
    are independent of one another.
    NOTE(review): the requested threads are only used when the extension is
    compiled and linked with OpenMP enabled -- confirm the build flags.
    """
    # Py_ssize_t matches the type of nums.shape[0], so the index cannot be
    # narrower than the length it iterates over.
    cdef Py_ssize_t i = 0
    with parallel(num_threads=threads):
        for i in prange(nums.shape[0], schedule="dynamic"):
            if _isprime(nums[i]) == 0:
                nums[i] = 0

def getprimes(nums, int threads = 1):
    """Return a list of the primes in *nums*, in their original order.

    Python-callable wrapper around _getprimes().

    nums    -- iterable of non-negative integers; it is copied, so the
               caller's object is not modified.
    threads -- thread count forwarded to _getprimes() (default 1).
    """
    cdef unsigned long long num
    # Copy the input into an array of unsigned long long ("Q") so it can be
    # handed to the C-level code as a typed memoryview.
    cdef unsigned long long[:] primes = array.array("Q", nums)

    with nogil:
        _getprimes(primes, threads)

    # _getprimes() zeroed every non-prime entry; 0 itself is never prime,
    # so dropping zeros leaves exactly the primes.
    return [num for num in primes if num != 0]

setup.py

#!/usr/bin/env python3
"""Build script for the ``primes`` Cython extension module."""
from distutils.core import setup
from Cython.Build import cythonize

# Turn the .pyx source into the extension module list that setup() expects.
extensions = cythonize('primes.pyx')

setup(name="primes", ext_modules=extensions)

test.py

#!/usr/bin/env python3
import functools
import random
import time
import primes

def timed(func):
    """Wrap *func* so that every call prints its name and elapsed seconds.

    The wrapper forwards all positional and keyword arguments and returns
    the wrapped function's result unchanged.
    """
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval is not disturbed by system clock adjustments.
        start = time.perf_counter()
        val = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(func.__name__, elapsed)
        return val
    return wrapped


def main():
    """Time the Python and Cython prime filters on the same random input.

    Runs the Cython version with 1 and with 4 threads and checks that all
    three runs return identical results.
    """
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]

    pyfoo = timed(primes.pygetprimes)
    cyfoo = timed(primes.getprimes)

    x = pyfoo(nums)
    y = cyfoo(nums, 1)
    z = cyfoo(nums, 4)
    # Sanity check: every implementation must agree on the result.
    assert x == y == z

# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

当我运行 cyfoo 时,我预计将线程数从 1 增加到 4 会带来一定的速度提升,但事实并非如此:

[aarcher@Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py 
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295

最佳答案

看来您需要为编译器启用 OpenMP 标志,并行语句才会真正生效。

在此处查看 cython 文档 http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling

# setup.py
# ... omitted ...

# The OpenMP flag is passed to both the compile and the link step.
# NOTE(review): '-fopenmp' is the GCC-style spelling; other compilers may
# need a different flag -- confirm for the target toolchain.
ext_modules = [
    Extension("hello", ["hello.pyx"],
              extra_compile_args=["-fopenmp"],
              extra_link_args=["-fopenmp"]),
]

关于python - Cython 使用 prange/parallel 没有性能提升,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/47293153/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com