gpt4 book ai didi

c++ - CUDA 动态并行,性能差

转载 作者:塔克拉玛干 更新时间:2023-11-02 23:33:30 24 4
gpt4 key购买 nike

我们在使用 CUDA 动态并行时遇到了性能问题。目前,CDP 的执行速度至少比传统方法慢 3 倍。我们做了最简单的可重现代码来展示这个问题,就是把一个数组的所有元素的值都增加+1。即,

a[0,0,0,0,0,0,0,.....,0] --> kernel +1 --> a[1,1,1,1,1,1,1,1,1]

这个简单示例的目的只是为了查看 CDP 是否可以像其他的一样执行,或者是否存在严重的开销。

代码在这里:

#include <stdio.h>
#include <cuda.h>
#define BLOCKSIZE 512

__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);


// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
cudaStream_t s1, s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);

int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);

kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
}

a[tid] += 1;
}


__global__ void kernel_simple(int *a, int n, int N, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int pos = tid + offset;
if(pos < N){
a[pos] += 1;
}
}

int main(int argc, char **argv){
if(argc != 3){
fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
exit(EXIT_FAILURE);
}
int N = atoi(argv[1])*3;
int method = atoi(argv[2]);
// init array as 0
int *ah, *ad;
printf("genarray of 3*N = %i.......", N); fflush(stdout);
ah = (int*)malloc(sizeof(int)*N);
for(int i=0; i<N; ++i){
ah[i] = 0;
}
printf("done\n"); fflush(stdout);

// malloc and copy array to gpu
printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
cudaMalloc(&ad, sizeof(int)*N);
cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
printf("done\n"); fflush(stdout);

// kernel launch (timed)
cudaStream_t s1, s2, s3;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Kernel...........................", N); fflush(stdout);
if(method == 0){
// CLASSIC KERNEL LAUNCH
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block >>> (ad, N, N, 0);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else if(method == 1){
// DYNAMIC PARALLELISM
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else{
// THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
printf("done\n"); fflush(stdout);


printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
printf("done\n"); fflush(stdout);

printf("checking result.................."); fflush(stdout);
for(int i=0; i<N; ++i){
if(ah[i] != 1){
fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
exit(EXIT_FAILURE);
}
}
printf("done\n"); fflush(stdout);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("rtime: %f ms\n", rtime); fflush(stdout);
return EXIT_SUCCESS;
}

可以编译

nvcc -arch=sm_35 -rdc=true -lineinfo -lcudadevrt -use_fast_math main.cu -o prog

这个例子可以用3种方法计算结果:

  1. 简单内核:只有一个经典内核 +1 遍历数组。
  2. 动态并行:从 main() 调用一个在范围 [0,N/3) 上执行 +1 的父内核,还调用两个子内核。第一个 child 在 [N/3, 2*N/3) 范围内 +1,第二个 child 在 [2*N/3,N) 范围内。 Childs 使用不同的流启动,因此它们可以并发。
  3. 来自主机的三个流:这个只是从 main() 启动三个非阻塞流,一个用于数组的三分之一。

我得到以下方法 0(简单内核)的配置文件: Simple Kernel方法 1(动态并行)的以下内容: Dynamic Parallelism以下是方法 2(来自主机的三个流) enter image description here运行时间是这样的:

➜  simple-cdp git:(master) ✗ ./prog 16777216 0
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.140928 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 5.790048 ms
➜ simple-cdp git:(master) ✗ ./prog 16777216 2
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 1.011936 ms

从图片中可以看出,主要问题是在动态并行方法中,父内核在两个子内核完成后花费过多时间关闭,这导致它需要 3X 或4 倍以上。即使考虑到最坏的情况,如果所有三个内核(父内核和两个子内核)都串行运行,也应该少得多。也就是说,每个内核有 N/3 的工作,所以整个父内核应该占用大约 3 个子内核的长度,这要少得多。 有办法解决这个问题吗?

编辑:Robert Crovella 在评论中解释了子内核以及方法 2 的序列化现象(非常感谢)。内核确实以串行方式运行的事实并不能使粗体文本中描述的问题无效(至少现在不是)。

最佳答案

调用设备运行时是“昂贵的”,就像调用主机运行时是昂贵的一样。在这种情况下,您似乎正在调用设备运行时来为每个线程创建流,即使此代码仅需要线程 0 的流。

通过修改您的代码以仅请求线程 0 的流创建,我们可以在我们为子内核启动使用单独的流的情况和我们没有为子内核使用单独的流的情况之间产生时间奇偶校验内核启动:

$ cat t370.cu
#include <stdio.h>
#define BLOCKSIZE 512

__global__ void kernel_parent(int *a, int n, int N);
__global__ void kernel_simple(int *a, int n, int N, int offset);


// N is the total array size
// n is the worksize for a kernel (one third of N)
__global__ void kernel_parent(int *a, int n, int N){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if(tid == 0){
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (n + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
#ifdef USE_STREAMS
cudaStream_t s1, s2;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
kernel_simple<<< grid, block, 0, s1 >>> (a, n, N, n);
kernel_simple<<< grid, block, 0, s2 >>> (a, n, N, 2*n);
#else
kernel_simple<<< grid, block >>> (a, n, N, n);
kernel_simple<<< grid, block >>> (a, n, N, 2*n);
#endif
// these next 2 lines add noticeably to the overall timing
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("oops1: %d\n", (int)err);
}

a[tid] += 1;
}


__global__ void kernel_simple(int *a, int n, int N, int offset){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int pos = tid + offset;
if(pos < N){
a[pos] += 1;
}
}

int main(int argc, char **argv){
if(argc != 3){
fprintf(stderr, "run as ./prog n method\nn multiple of 32 eg: 1024, 1048576 (1024^2), 4194304 (2048^2), 16777216 (4096^2)\nmethod:\n0 (traditional) \n1 (dynamic parallelism)\n2 (three kernels using unique streams)\n");
exit(EXIT_FAILURE);
}
int N = atoi(argv[1])*3;
int method = atoi(argv[2]);
// init array as 0
int *ah, *ad;
printf("genarray of 3*N = %i.......", N); fflush(stdout);
ah = (int*)malloc(sizeof(int)*N);
for(int i=0; i<N; ++i){
ah[i] = 0;
}
printf("done\n"); fflush(stdout);

// malloc and copy array to gpu
printf("cudaMemcpy:Host->Device..........", N); fflush(stdout);
cudaMalloc(&ad, sizeof(int)*N);
cudaMemcpy(ad, ah, sizeof(int)*N, cudaMemcpyHostToDevice);
printf("done\n"); fflush(stdout);

// kernel launch (timed)
cudaStream_t s1, s2, s3;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s2, cudaStreamNonBlocking);
cudaStreamCreateWithFlags(&s3, cudaStreamNonBlocking);
cudaEvent_t start, stop;
float rtime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
printf("Kernel...........................", N); fflush(stdout);
if(method == 0){
// CLASSIC KERNEL LAUNCH
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block >>> (ad, N, N, 0);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else if(method == 1){
// DYNAMIC PARALLELISM
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_parent<<< grid, block, 0, s1 >>> (ad, N/3, N);
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
else{
// THREE CONCURRENT KERNEL LAUNCHES USING STREAMS
dim3 block(BLOCKSIZE, 1, 1);
dim3 grid( (N/3 + BLOCKSIZE - 1)/BLOCKSIZE, 1, 1);
cudaEventRecord(start, 0);
kernel_simple<<< grid, block, 0, s1 >>> (ad, N/3, N, 0);
kernel_simple<<< grid, block, 0, s2 >>> (ad, N/3, N, N/3);
kernel_simple<<< grid, block, 0, s3 >>> (ad, N/3, N, 2*(N/3));
cudaDeviceSynchronize();
cudaEventRecord(stop, 0);
}
printf("done\n"); fflush(stdout);


printf("cudaMemcpy:Device->Host..........", N); fflush(stdout);
cudaMemcpy(ah, ad, sizeof(int)*N, cudaMemcpyDeviceToHost);
printf("done\n"); fflush(stdout);

printf("checking result.................."); fflush(stdout);
for(int i=0; i<N; ++i){
if(ah[i] != 1){
fprintf(stderr, "bad element: a[%i] = %i\n", i, ah[i]);
exit(EXIT_FAILURE);
}
}
printf("done\n"); fflush(stdout);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&rtime, start, stop);
printf("rtime: %f ms\n", rtime); fflush(stdout);
return EXIT_SUCCESS;
}
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.925632 ms
$ nvcc -arch=sm_52 -rdc=true -lcudadevrt -o t370 t370.cu -DUSE_STREAMS
$ ./t370 16777216 1
genarray of 3*N = 50331648.......done
cudaMemcpy:Host->Device..........done
Kernel...........................done
cudaMemcpy:Device->Host..........done
checking result..................done
rtime: 6.673568 ms
$

虽然没有包含在上面的测试输出中,但根据我的测试,这也使 CUDA 动态并行 (CDP) 案例 (1) 与非 CDP 案例 ( 02)。请注意,我们可以通过放弃调用父内核(我添加到您的代码中)中的 cudaGetLastError() 来将上述时间缩短大约 1 毫秒 (!)。

关于c++ - CUDA 动态并行,性能差,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/45201062/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com