gpt4 book ai didi

c++ - 为什么在 for 循环中重复内核会使 CUDA 代码显着变慢?

转载 作者:搜寻专家 更新时间:2023-10-31 01:32:49 25 4
gpt4 key购买 nike

假设我们有四个 float 数组用于主机端,还有四个对应的数组用于设备端:

float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];

现在假设我们有一个非常简单的内核,取自 NVIDIA 博客中的一个示例:

__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
}
}

这样的内核将由主机端在 for 循环中调用,如下所示:

for (int r = 0; r < LOOP_N; r++)
{
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}

然后我将这种循环的执行时间与其纯 CPU 版本的执行时间进行比较:

for (int r = 0; r < LOOP_N; r++)
{
for (int i = 0; i < ARRAYS_SIZE; i++) {
y[i] = 2.0f*x[i] + y[i];
y2[i] = 2.0f*x2[i] + y2[i];
}
}

现在,我不明白的是以下内容。例如,对于 ARRAYS_SIZE = 1000000LOOP_N = 1000,当我在上面显示的版本中运行两个循环时,我得到 CPU 版本和 CUDA 的执行时间之间的比率版本大约是 6。也就是说,CUDA 版本大约快 6 倍。

但是,如果我注释掉 CUDA 版本循环内对 saxpy 的调用之一和循环 CPU 版本内的计算之一,则 CPU 和 CUDA 之间的比率变成大约 210。也就是说,CUDA 版本大约快 210 倍。

如果没有向设备传输内存或从设备传输内存,仅重复调用内核时造成这种性能损失的技术原因是什么?对此有任何解决方法吗?

下面是一个(希望)完全可重现的代码示例:

#include <algorithm>
#include <chrono>
#include <iostream>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


// Typedef and constant variables
typedef std::chrono::high_resolution_clock::time_point timers;
const int LOOP_N = 1000;
const int ARRAYS_SIZE = 1000000;

//Pretty simple kernel, from the example in Nvidia's blog
__global__
void saxpy(int n, float a, float *x, float *y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
y[i] = a*x[i] + y[i];
}
}

// Main loop
int main(void)
{
timers t0, t1, t2;
timers tfinal0, tfinal1, tfinal2;

float *x, *x2, *y, *y2;
float *d_x, *d_x2, *d_y, *d_y2;
x = new float[ARRAYS_SIZE];
x2 = new float[ARRAYS_SIZE];
y = new float[ARRAYS_SIZE];
y2 = new float[ARRAYS_SIZE];

//Initializing arrays at the host side:
for (int i = 0; i < ARRAYS_SIZE; i++) {
x[i] = 1.0f;
x2[i] = 1.0f;
y[i] = 2.0f;
y2[i] = 2.0f;
}

// GPU memory allocation:
cudaMalloc(&d_x, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_x2, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_y, ARRAYS_SIZE * sizeof(float));
cudaMalloc(&d_y2, ARRAYS_SIZE * sizeof(float));

// Transfering arrays from host to device:
cudaMemcpy(d_x, x, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, ARRAYS_SIZE * sizeof(float), cudaMemcpyHostToDevice);


//////////////////
// CPU run //
//////////////////
t0 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
for (int i = 0; i < ARRAYS_SIZE; i++) {
//comment one of the following out to see the point of my question:
y[i] = 2.0f*x[i] + y[i];
y2[i] = 2.0f*x2[i] + y2[i];
}
}
tfinal0 = std::chrono::high_resolution_clock::now();
auto time0 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal0 - t0).count();
std::cout << "CPU: " << (float)time0 << " microseconds" << std::endl;


//////////////////
// GPU-CUDA run //
//////////////////

// Perform SAXPY kernel on ARRAYS_SIZE elements, for LOOP_N times
t1 = std::chrono::high_resolution_clock::now();
for (int r = 0; r < LOOP_N; r++)
{
//comment one of the following out to see the point of my question:
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x, d_y);
saxpy <<<(ARRAYS_SIZE + 255) / 256, 256 >>> (ARRAYS_SIZE, 2.0f, d_x2, d_y2);
}
tfinal1 = std::chrono::high_resolution_clock::now();
auto time1 = std::chrono::duration_cast<std::chrono::microseconds>(tfinal1 - t1).count();
std::cout << "CUDA: " << (float)time1 << " microseconds" << std::endl;

//Display performance ratio CPU / GPU-CUDA
std::cout << "Ratio CPU/CUDA: " << (float)time0 / (float)time1 << std::endl;

//Freeing memory used by arrays:
cudaFree(d_x);
cudaFree(d_x2);
cudaFree(d_y);
cudaFree(d_y2);
free(x);
free(x2);
free(y);
free(y2);

return 0;
}

最佳答案

您不是在等待内核完成。一如既往kernel launches are asynchronous ,您需要在停止计时器之前显式调用 cudaDeviceSynchronize()

您观察到的当前代码变体的差异可能源于这样一个事实,即启动内核的队列是有限的,因此在某些时候您的代码将开始等待部分您的内核无论如何。 在 Windows 上,内核批处理也会影响到这一点,直到某个数字(或超时),驱动程序甚至不会开始启动内核。

关于c++ - 为什么在 for 循环中重复内核会使 CUDA 代码显着变慢?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/42519157/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com