gpt4 book ai didi

python - Numba CUDA `vectorize` 和 `reduce` 装饰器比预期慢

转载 作者:行者123 更新时间:2023-11-28 17:20:11 24 4
gpt4 key购买 nike

我一直在使用 Numba 包测试一些基本的 CUDA 功能。我的主要目标是在 GPU 上实现 Richardson-Lucy 反卷积算法,从而加速该算法。该算法的主要步骤之一可以总结为以下虚拟函数:

def dummy(arr1, arr2, eps=1e-12):
    """Return sum(arr1 * arr2) / (sum(arr2 ** 2) + eps).

    This is the normalisation step of the Richardson-Lucy update
    described in the surrounding text.

    Parameters
    ----------
    arr1, arr2 : numpy.ndarray
        Input arrays of matching (broadcastable) shape.
    eps : float, optional
        Small regulariser guarding against division by zero when
        ``arr2`` is all zeros.  The original snippet read an undefined
        global ``eps``; it is an explicit parameter here so the
        function is self-contained — pass the old global's value to
        reproduce the prior behaviour exactly.
    """
    return (arr1 * arr2).sum() / ((arr2 ** 2).sum() + eps)

此函数在 CPU 上运行得相当快,但我想将所有内容都保留在 GPU 上以避免主机 <---> 设备复制。

为了比较不同计算的速度,我编写了一组简短的函数:

import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time


# define our functions
# Element-wise addition as a CUDA ufunc.  target="cuda" makes Numba
# compile one GPU kernel per listed signature; the call on the host
# side handles the launch (and any implicit host<->device copies).
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
    return a + b

# Element-wise multiplication as a CUDA ufunc (same setup as add_gpu).
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
    return a * b

# Binary combine function turned into a full GPU reduction by
# @cuda.reduce: sum_gpu(arr) returns the sum of all elements of arr.
@cuda.reduce
def sum_gpu(a, b):
    return a + b

# Hand-written 1D addition kernel: one thread per output element.
@cuda.jit
def add_gpu_1d(a, b, c):
    # cuda.grid(1) is this thread's global linear index; the bounds
    # check guards against the launch grid exceeding the array size.
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] + b[x]

# Hand-written 1D multiplication kernel: one thread per output element.
@cuda.jit
def mult_gpu_1d(a, b, c):
    # Global linear index with an out-of-range guard (see add_gpu_1d).
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] * b[x]

# Hand-written 2D multiplication kernel: one thread per output element.
@cuda.jit
def mult_gpu_2d(a, b, c):
    # (x, y) is this thread's global 2D index; both axes are bounds-checked.
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] * b[x, y]

# Hand-written 2D addition kernel: one thread per output element.
@cuda.jit
def add_gpu_2d(a, b, c):
    # (x, y) is this thread's global 2D index; both axes are bounds-checked.
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] + b[x, y]

和一些定时器函数:

def avg_t(t, num):
    """Average the repeat timings *t* and normalise by *num* calls per repeat."""
    mean_repeat = np.mean(t)
    return mean_repeat / num

def format_t(t):
    """Render a duration *t* (in seconds) with a human-friendly unit."""
    if t >= 1:
        return "{:.1f} s".format(t)
    if t >= 1e-3:
        return "{:.1f} ms".format(t * 1e3)
    return "{:.1f} us".format(t * 1e6)

def test_1d_times(data_len, dtype=np.float32):
    """Benchmark element-wise multiply, add and reduction on 1D data.

    Compares plain NumPy on the CPU against the CUDA-`vectorize` ufuncs,
    the hand-written @cuda.jit kernels and the GPU reductions, printing
    the average time per call for each.

    Parameters
    ----------
    data_len : int
        Number of elements in the test arrays.
    dtype : numpy dtype, optional
        Element type of the test data (default float32).
    """
    num_times = 10  # repetitions per timeit measurement

    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    # Allocate host arrays (uninitialised — contents do not matter for
    # timing) and mirror them on the device so the GPU timings below do
    # not include host<->device transfer of the operands.
    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    # NOTE(review): the CUDA calls below are asynchronous launches; with
    # no cuda.synchronize() in the timed lambda, these timings may only
    # reflect dispatch overhead rather than completed kernel time — confirm.
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    # NOTE(review): the @cuda.jit kernel is invoked without an explicit
    # [blocks, threads] launch configuration — verify this is valid for
    # the Numba version in use.
    t= timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t= timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    # Reduction built from the add_gpu ufunc via its .reduce method.
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    # Reduction using the dedicated @cuda.reduce kernel.
    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

def test_2d_times(data_len, dtype=np.float32):
    """Benchmark element-wise multiply, add and reduction on 2D data.

    Same comparison as test_1d_times but on (data_len, data_len)
    arrays, using the 2D @cuda.jit kernels; the device arrays are
    flattened with .ravel() for the 1D reductions.

    Parameters
    ----------
    data_len : int
        Side length of the square test arrays.
    dtype : numpy dtype, optional
        Element type of the test data (default float32).
    """
    num_times = 10  # repetitions per timeit measurement

    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    # Uninitialised host data plus device mirrors (see test_1d_times).
    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    # NOTE(review): inconsistent with test_1d_times, which formats this
    # elapsed time via format_t.
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    # NOTE(review): asynchronous launches with no cuda.synchronize() in
    # the timed lambdas — timings may not include kernel completion.
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    # NOTE(review): @cuda.jit kernel invoked without an explicit
    # [blocks, threads] launch configuration — confirm for this Numba version.
    t= timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t= timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    # Reductions consume a flattened view of the 2D device array.
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

运行测试函数

# Report the CUDA devices Numba can see, then run the benchmarks:
# 2**24 elements in 1D and a 4096x4096 (= 2**24 element) 2D case,
# each in both float32 and float64.
numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)

给出以下输出:

Found 1 CUDA devices
id 0 b'GeForce GTX TITAN X' [SUPPORTED]
compute capability: 5.2
pci device id: 0
pci bus id: 3
Summary:
1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size = 16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us

Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us

Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size = 16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us

Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us

Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms

============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size = 16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us

Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us

Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size = 16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us

Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us

Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms

似乎 @cuda.vectorize 修饰函数的执行速度比 CPU 和自定义编写的 @cuda.jit 函数慢。而 @cuda.jit 函数提供了预期的数量级加速和几乎恒定的时间性能(结果未显示)。

另一方面,@cuda.reduce 函数的运行速度明显低于 @cuda.vectorize 函数或 CPU 函数。

@cuda.vectorize 和 @cuda.reduce 函数性能不佳是否有原因?是否可以仅使用 Numba 编写 CUDA 缩减内核?

编辑:

看起来这是 Numba 中的一个合法错误:https://github.com/numba/numba/issues/2266 , https://github.com/numba/numba/issues/2268

最佳答案

我无法解释 @cuda.vectorize 和 @cuda.reduce 的行为。有时结果对我来说看起来有点奇怪。例如这里 Negative Speed Gain Using Numba Vectorize target='cuda' @cuda.vectorize 会减慢计算速度,而使用 @cuda.jit 会加快计算速度。在这里我建议尝试 PyCUDA ( https://documen.tician.de/pycuda/ )。我测试了点积 ( https://documen.tician.de/pycuda/array.html ) 的性能。

import numpy as np
from pycuda.curandom import rand as curand
import pycuda.gpuarray as gpuarray
import pycuda.driver as pycu
import pycuda.autoinit
from pycuda.reduction import ReductionKernel
import numba.cuda as cuda
from time import time

# Fused map+reduce GPU kernel computing dot(x, y) = sum_i x[i] * y[i]:
# map_expr produces the per-element products, reduce_expr sums them.
dot = ReductionKernel(dtype_out=np.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="float *x, float *y")
n = 2**24

# Random device-resident operands for the GPU path...
x = curand((n), dtype=np.float32)
y = curand((n), dtype=np.float32)

# ...and independent host-resident operands for the NumPy baseline.
x_cpu = np.random.random((n))
y_cpu = np.random.random((n))

# Time the GPU dot product.  .get() copies the scalar result back to
# the host, which also forces the kernel to finish before the clock stops.
st = time()
x_dot_y = dot(x, y).get()
gpu_time = (time() - st)
# Fixed: the original snippet used Python 2 `print` statements, which
# are syntax errors under Python 3 (the rest of this file targets Python 3).
print("GPU: ", gpu_time)

st = time()
x_dot_y_cpu = np.dot(x_cpu, y_cpu)
cpu_time = (time() - st)
print("CPU: ", cpu_time)
print("speedup: ", cpu_time/gpu_time)

在我的电脑上 CPU:Intel Core2 Quad 3GHz,GPU:NVIDIA GeForce GTX 580。我得到了以下结果:

GPU:  0.00191593170166
CPU: 0.0518710613251
speedup: 27.0735440518

需要注意的是,上面的代码没有考虑内核初始化和预编译所需的时间。然而,这一次可能意义重大。考虑到这段时间,我获得了:

GPU:  0.316560029984
CPU: 0.0511090755463
speedup: 0.161451449031

因此,在这种情况下,GPU 代码比 CPU 代码慢。同时,对于大多数应用程序,您只需初始化一次内核,然后多次使用它。在这种情况下,使用 PyCUDA 缩减内核看起来是合理的。

之前,我通过二维扩散方程的计算测试了@cuda.jit、PyCUDA和CUDA-C代码的性能。我发现 PyCUDA 允许获得与 CUDA-C 几乎相同的性能,而 Numba 表现出更差的性能。下图展示了这些结果。 enter image description here

关于python - Numba CUDA `vectorize` 和 `reduce` 装饰器比预期慢,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/42009827/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com