gpt4 book ai didi

CUDA C - 使用 Clock() 和 cudaEvent 的 CPU 和 GPU 执行时间,是否正确?

转载 作者:行者123 更新时间:2023-12-02 11:24:31 25 4
gpt4 key购买 nike

我编写了一个程序来添加两个二维数组来检查CPU和GPU的性能。我使用clock()函数来测量CPU执行情况,使用cudaEvent来测量GPU中的内核执行时间。由于我是在 Udacity 下学习 CUDA 的,所以我尝试在他们的服务器上执行该程序,发现结果为,

 Output:
GPU: 0.001984 ms
CPU : 30.000000 ms

现在回到我真正的问题,我发现这些结果在 GPU 上的速度惊人地快,现在我有点怀疑这些结果是否准确或者我在程序中是否犯了任何错误?

这是我的程序:

 #include "stdio.h"
#include<time.h>
#define COLUMNS 900
#define ROWS 900
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS];
__global__ void add(long *a, long *b, long *c,long *d)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
a[i]=d[i];
}

int main()
{
long *dev_a, *dev_b, *dev_c,*dev_d;
float ms;
clock_t startc, end;
double cpu_time_used;
cudaEvent_t start,stop;


cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));

startc = clock();
for (long y = 0; y < ROWS; y++) // Fill Arrays
for (long x = 0; x < COLUMNS; x++)
{
a[y][x] = x;
b[y][x] = y;
d[y][x]=rand()%4;
c[y][x]=a[y][x]+b[y][x];
}
end = clock();

cpu_time_used = ((double) (end - startc)) / CLOCKS_PER_SEC;
cpu_time_used*=1000;


cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);


cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaEventRecord(stop, 0);


add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);

cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);




printf("GPU: %f ms",ms);
printf("\n CPU : %f ms",cpu_time_used);

return 0;
}

感谢大家对我的查询提供的答案,以下是我对代码所做的更改和更新的结果,

更新的代码:

#include "stdio.h"
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#define COLUMNS 500
#define ROWS 500
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS];



__global__ void add(long *a, long *b, long *c,long *d)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
a[i]=d[i];
}
int main()
{
long *dev_a, *dev_b, *dev_c,*dev_d;
struct timeval startc, end;
float ms;
long mtime, seconds, useconds;
// clock_t startc, end;
// double cpu_time_used;
long ns;
cudaEvent_t start,stop;


cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));

gettimeofday(&startc, NULL);
for (long y = 0; y < ROWS; y++) // Fill Arrays
for (long x = 0; x < COLUMNS; x++)
{
a[y][x] = x;
b[y][x] = y;
d[y][x]=rand()%4;
c[y][x]=a[y][x]+b[y][x];
}
gettimeofday(&end, NULL);

seconds = end.tv_sec - startc.tv_sec;
useconds = end.tv_usec - startc.tv_usec;
mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;


for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
{
for (long x = COLUMNS-1; x < COLUMNS; x++)
{
// printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
// printf("[%d][%d]=%d ",y,x,d[y][x]);
}
printf("\n");
}



cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);


cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);



add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);

cudaThreadSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);

cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);



//cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("GPU: %f ms",ms);
printf("\n CPU : %ld ms",mtime);
for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
{
for (long x = COLUMNS-1; x < COLUMNS; x++)
{
// printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
// printf("[%d][%d]=%d ",y,x,d[y][x]);
}
printf("\n");
}
return 0;
}

输出:

GPU: 0.011040 ms
CPU : 9 ms

现在我可以安全地判断它是否正确吗?

最佳答案

您认为加速太多是正确的,CPU 的计时太长。使用此方法对CPU进行计时C++ obtaining milliseconds time on Linux -- clock() doesn't seem to work properly您还可能需要将 cudaEventRecord(stop, 0); 移至内核之后。

我在你的内核中看到了 5 个读写操作。占用 5*4Bytes*500*500/(1024^3*0.009) 您的内存中大约会占用 0.517 GB/s,这只是一小部分可用的。我想说你的CPU版本需要一些改进。相比之下,您的 GPU 的 5*4Bytes*500*500/(1024^3*0.01104e-3) 约为 421GB/s。我想说你还没有完全做到这一点。

所以,这么多错误......

#include "stdio.h"
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define COLUMNS 500
#define ROWS 500
long a[ROWS*COLUMNS], b[ROWS*COLUMNS], c[ROWS*COLUMNS],d[ROWS*COLUMNS];



__global__ void add(long *a, long *b, long *c,long *d)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
a[i]=d[i];
}
int main()
{
long *dev_a, *dev_b, *dev_c,*dev_d;
struct timeval startc, end;
float ms;
long seconds, useconds;
double mtime;
cudaEvent_t start,stop;


for(int i=0; i<ROWS*COLUMNS; i++)
d[i]=rand()%4;

for(int i=0; i<ROWS; i++){
for(int j=0; j<COLUMNS; j++){
a[i*COLUMNS+j]=j;
b[i*COLUMNS+j]=i;
}
}

cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));



gettimeofday(&startc, NULL);
for (long i = 0; i < ROWS*COLUMNS; i++){ // Fill Arrays
c[i]=a[i]+b[i];
a[i]=d[i];
}
gettimeofday(&end, NULL);

seconds = end.tv_sec - startc.tv_sec;
useconds = end.tv_usec - startc.tv_usec;
mtime = useconds;
mtime/=1000;
mtime+=seconds*1000;

for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
{
for (long x = COLUMNS-1; x < COLUMNS; x++)
{
// printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
// printf("[%d][%d]=%d ",y,x,d[y][x]);
}
printf("\n");
}



cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);


cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);



add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);



cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);

cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);

printf("GPUassert: %s\n", cudaGetErrorString(cudaGetLastError()));

//cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
double memXFers=5*4*COLUMNS*ROWS;
memXFers/=1024*1024*1024;


printf("GPU: %f ms bandwidth %g GB/s",ms, memXFers/(ms/1000.0));
printf("\n CPU : %g ms bandwidth %g GB/s",mtime, memXFers/(mtime/1000.0));
for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
{
for (long x = COLUMNS-1; x < COLUMNS; x++)
{
// printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
// printf("[%d][%d]=%d ",y,x,d[y][x]);
}
printf("\n");
}

return 0;
}

顺便说一句,我当前的结果(显然不正确)...

GPU: 0.001792 ms bandwidth 2598.56 GB/s
CPU : 0.567 ms bandwidth 8.21272 GB/s

关于CUDA C - 使用 Clock() 和 cudaEvent 的 CPU 和 GPU 执行时间,是否正确?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28701788/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com