
concurrency - CUDA streams and concurrent kernel execution


I would like to use streams to parallelize the execution of kernels that work on separate device data arrays. The data are allocated on the device and filled by previous kernels.

I have written the following program, which shows that I cannot reach my goal so far: the kernels on the two non-default streams execute sequentially in their respective streams.

The same behavior is observed on two Intel machines running the latest Debian Linux. One has a Tesla C2075 with CUDA 4.2, the other a GeForce 460GT with CUDA 5.0. The Visual Profiler shows sequential execution with both the 4.2 and the 5.0 CUDA versions.
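(As a quick sanity check, not part of the original question: both cards are compute capability 2.x devices and should support concurrent kernel execution, which can be confirmed by querying the device properties, e.g. with a minimal sketch like this:)

#include <cstdio>
#include <cuda_runtime.h>

// Minimal sketch: report whether device 0 supports concurrent kernels.
int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("%s: concurrentKernels = %d, multiProcessorCount = %d\n",
           prop.name, prop.concurrentKernels, prop.multiProcessorCount);
    return 0;
}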

Here is the code:

#include <iostream>
#include <stdio.h>
#include <ctime>

#include <curand.h>

using namespace std;

// compile and run this way:
// nvcc cuStreamsBasics.cu -arch=sm_20 -o testCuStream -lcuda -lcufft -lcurand
// testCuStream 1024 512 512


/* -------------------------------------------------------------------------- */
// "useful" macros
/* -------------------------------------------------------------------------- */


#define MSG_ASSERT( CONDITION, MSG ) \
    if (! (CONDITION)) \
    { \
        std::cerr << std::endl << "Dynamic assertion `" #CONDITION "` failed in " << __FILE__ \
                  << " line " << __LINE__ << ": <" << MSG << ">" << std::endl; \
        exit( 1 ); \
    }



#define ASSERT( CONDITION ) \
    MSG_ASSERT( CONDITION, " " )



// allocate data on the GPU memory, unpinned
#define CUDALLOC_GPU( _TAB, _DIM, _DATATYPE ) \
    MSG_ASSERT( \
        cudaMalloc( (void**) &_TAB, _DIM * sizeof( _DATATYPE ) ) \
        == cudaSuccess , "failed CUDALLOC" );



/* -------------------------------------------------------------------------- */
// the CUDA kernels
/* -------------------------------------------------------------------------- */


// finds index in 1D array from sequential blocks
#define CUDAINDEX_1D \
    blockIdx.y * ( gridDim.x * blockDim.x ) + \
    blockIdx.x * blockDim.x + \
    threadIdx.x;
// note: the macro supplies its own trailing semicolon, hence "int i = CUDAINDEX_1D" below



__global__ void
kernel_diva(float* data, float value, int array_size)
{
    int i = CUDAINDEX_1D
    if (i < array_size)
        data[i] /= value;
}


__global__ void
kernel_jokea(float* data, float value, int array_size)
{
    int i = CUDAINDEX_1D
    if (i < array_size)
        data[i] *= value + sin( double(i) ) * 1 / cos( double(i) );
}


/* -------------------------------------------------------------------------- */
// usage
/* -------------------------------------------------------------------------- */


static void
usage(int argc, char **argv)
{
    if ((argc - 1) != 3)
    {
        printf("Usage: %s <dimx> <dimy> <dimz>\n", argv[0]);
        printf("do stuff\n");

        exit(1);
    }
}


/* -------------------------------------------------------------------------- */
// main program, finally!
/* -------------------------------------------------------------------------- */


int
main(int argc, char** argv)
{
    usage(argc, argv);
    size_t x_dim = atoi( argv[1] );
    size_t y_dim = atoi( argv[2] );
    size_t z_dim = atoi( argv[3] );


    cudaStream_t stream1, stream2;
    ASSERT( cudaStreamCreate( &stream1 ) == cudaSuccess );
    ASSERT( cudaStreamCreate( &stream2 ) == cudaSuccess );


    size_t size = x_dim * y_dim * z_dim;
    float *data1, *data2;
    CUDALLOC_GPU( data1, size, float );
    CUDALLOC_GPU( data2, size, float );


    curandGenerator_t gen;
    curandCreateGenerator( &gen, CURAND_RNG_PSEUDO_DEFAULT );
    /* Set seed */
    curandSetPseudoRandomGeneratorSeed( gen, 1234ULL );
    /* Generate n floats on device */
    curandGenerateUniform( gen, data1, size );
    curandGenerateUniform( gen, data2, size );


    dim3 dimBlock( z_dim, 1, 1 );
    dim3 dimGrid( x_dim, y_dim, 1 );

    clock_t start;
    double diff;

    // --- all four kernels in the default stream: expected to run sequentially
    cudaDeviceSynchronize();
    start = clock();
    kernel_diva <<< dimGrid, dimBlock >>>( data1, 5.55f, size );
    kernel_jokea<<< dimGrid, dimBlock >>>( data1, 5.55f, size );
    kernel_diva <<< dimGrid, dimBlock >>>( data2, 5.55f, size );
    kernel_jokea<<< dimGrid, dimBlock >>>( data2, 5.55f, size );
    cudaDeviceSynchronize();
    diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;

    cout << endl << "sequential: " << diff;

    // --- one array per stream: hoped-for concurrent execution
    cudaDeviceSynchronize();
    start = clock();
    kernel_diva <<< dimGrid, dimBlock, 0, stream1 >>>( data1, 5.55f, size );
    kernel_diva <<< dimGrid, dimBlock, 0, stream2 >>>( data2, 5.55f, size );
    kernel_jokea<<< dimGrid, dimBlock, 0, stream1 >>>( data1, 5.55f, size );
    kernel_jokea<<< dimGrid, dimBlock, 0, stream2 >>>( data2, 5.55f, size );
    cudaDeviceSynchronize();
    diff = ( std::clock() - start ) / (double)CLOCKS_PER_SEC;

    cout << endl << "parallel: " << diff;


    cudaStreamDestroy( stream1 );
    cudaStreamDestroy( stream2 );

    return 0;
}

Usually, the dimension of the arrays is 512^3 single-precision floats. I usually divide the arrays into blocks of (512,1,1) threads, which I place on a grid of size (1<<15, (rest), 1).
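(A side note, not part of the original question: with grids this large it is often more robust to use a grid-stride loop, which decouples the grid size from the array size; the commented-out kernels in the answer below follow this pattern. A minimal sketch for kernel_diva, under the assumption that the semantics should stay the same:)

// Hypothetical grid-stride variant of kernel_diva: any grid size covers all
// array_size elements, because each thread strides through the array.
__global__ void
kernel_diva_strided(float* data, float value, int array_size)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < array_size;
         i += blockDim.x * gridDim.x)
        data[i] /= value;
}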

Thanks in advance for any hints or comments.

Best regards.

Best Answer

I will try to explain why you do not see overlap between the executions of your two kernels. To this end, I built the code reported below, which uses your two kernels and monitors which Streaming Multiprocessor (SM) each block runs on. I am using CUDA 6.5 (Release Candidate) on a GT540M card, which has only 2 SMs, so it provides a simple playground. The blockSize choice is delegated to the new CUDA 6.5 cudaOccupancyMaxPotentialBlockSize facility.
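(For reference, a short sketch of how that occupancy API is invoked; the full listing below uses it in the same way:)

int minGridSize, blockSize;
// Suggested block size maximizing occupancy for kernel_1;
// the 0 means no dynamic shared memory, N caps the block size.
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel_1, 0, N);
int gridSize = (N + blockSize - 1) / blockSize;   // enough blocks to cover N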

The code:

#include <stdio.h>
#include <time.h>
#include <math.h>   // sin/cos are also used on the host for the reference results

//#define DEBUG_MODE

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**************************************************/
/* STREAMING MULTIPROCESSOR IDENTIFICATION NUMBER */
/**************************************************/
__device__ unsigned int get_smid(void) {
    // reads the %smid special register: the ID of the SM this thread runs on
    unsigned int ret;
    asm("mov.u32 %0, %%smid;" : "=r"(ret) );
    return ret;
}

/************/
/* KERNEL 1 */
/************/
__global__ void kernel_1(float * __restrict__ data, const float value, int *sm, int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    if (i < N) {
        data[i] = data[i] / value;
        // the first thread of each block records which SM the block ran on
        if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
    }
}

//__global__ void kernel_1(float* data, float value, int N)
//{
//    int start = blockIdx.x * blockDim.x + threadIdx.x;
//    for (int i = start; i < N; i += blockDim.x * gridDim.x)
//    {
//        data[i] = data[i] / value;
//    }
//}

/************/
/* KERNEL 2 */
/************/
__global__ void kernel_2(float * __restrict__ data, const float value, int *sm, int N)
{
    int i = threadIdx.x + blockIdx.x*blockDim.x;

    if (i < N) {
        data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
        if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
    }
}

//__global__ void kernel_2(float* data, float value, int N)
//{
//    int start = blockIdx.x * blockDim.x + threadIdx.x;
//    for (int i = start; i < N; i += blockDim.x * gridDim.x)
//    {
//        data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
//    }
//}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 10000;

    const float value = 5.55f;

    const int rep_num = 20;

    // --- CPU memory allocations
    float *h_data1     = (float*) malloc(N*sizeof(float));
    float *h_data2     = (float*) malloc(N*sizeof(float));
    float *h_data1_ref = (float*) malloc(N*sizeof(float));
    float *h_data2_ref = (float*) malloc(N*sizeof(float));

    // --- CPU data initializations
    srand(time(NULL));
    for (int i=0; i<N; i++) {
        // the cast avoids integer division, which would always yield 0
        h_data1[i] = rand() / (float)RAND_MAX;
        h_data2[i] = rand() / (float)RAND_MAX;
    }

    // --- GPU memory allocations
    float *d_data1, *d_data2;
    gpuErrchk(cudaMalloc((void**)&d_data1, N*sizeof(float)));
    gpuErrchk(cudaMalloc((void**)&d_data2, N*sizeof(float)));

    // --- CPU -> GPU memory transfers
    gpuErrchk(cudaMemcpy(d_data1, h_data1, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));

    // --- CPU reference results (used by the DEBUG_MODE checks below)
    for (int i=0; i<N; i++) {
        h_data1_ref[i] = h_data1[i] / value;
        h_data2_ref[i] = h_data2[i] * (value + sin(double(i)) * 1./cos(double(i)));
    }

    // --- Stream creations
    cudaStream_t stream1, stream2;
    gpuErrchk(cudaStreamCreate(&stream1));
    gpuErrchk(cudaStreamCreate(&stream2));

    // --- Launch parameters configuration
    int blockSize1, blockSize2, minGridSize1, minGridSize2, gridSize1, gridSize2;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize1, &blockSize1, kernel_1, 0, N);
    cudaOccupancyMaxPotentialBlockSize(&minGridSize2, &blockSize2, kernel_2, 0, N);

    gridSize1 = (N + blockSize1 - 1) / blockSize1;
    gridSize2 = (N + blockSize2 - 1) / blockSize2;

    // --- Allocating space for SM IDs
    int *h_sm_11 = (int*) malloc(gridSize1*sizeof(int));
    int *h_sm_12 = (int*) malloc(gridSize1*sizeof(int));
    int *h_sm_21 = (int*) malloc(gridSize2*sizeof(int));
    int *h_sm_22 = (int*) malloc(gridSize2*sizeof(int));
    int *d_sm_11, *d_sm_12, *d_sm_21, *d_sm_22;
    gpuErrchk(cudaMalloc((void**)&d_sm_11, gridSize1*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_sm_12, gridSize1*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_sm_21, gridSize2*sizeof(int)));
    gpuErrchk(cudaMalloc((void**)&d_sm_22, gridSize2*sizeof(int)));

    // --- Timing individual kernels
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    for (int i=0; i<rep_num; i++) kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel 1 - elapsed time: %3.3f ms \n", time/rep_num);

    cudaEventRecord(start, 0);

    for (int i=0; i<rep_num; i++) kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Kernel 2 - elapsed time: %3.3f ms \n", time/rep_num);

    // --- No stream case
    cudaEventRecord(start, 0);

    kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_data1, d_data1, N*sizeof(float), cudaMemcpyDeviceToHost));
    // --- Results check
    for (int i=0; i<N; i++) {
        if (h_data1[i] != h_data1_ref[i]) {
            printf("Kernel1 - Error at i = %i; Host = %f; Device = %f\n", i, h_data1_ref[i], h_data1[i]);
            return 1;
        }
    }
#endif
    kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    kernel_1<<<gridSize1, blockSize1>>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    // --- Restore the original data2 so the kernel_2 check below is meaningful
    gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));
#endif
    kernel_2<<<gridSize2, blockSize2>>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(h_data2, d_data2, N*sizeof(float), cudaMemcpyDeviceToHost));
    // --- Results check
    for (int i=0; i<N; i++) {
        if (h_data2[i] != h_data2_ref[i]) {
            printf("Kernel2 - Error at i = %i; Host = %f; Device = %f\n", i, h_data2_ref[i], h_data2[i]);
            return 1;
        }
    }
#endif

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("No stream - elapsed time: %3.3f ms \n", time);

    // --- Stream case
    cudaEventRecord(start, 0);

    kernel_1<<<gridSize1, blockSize1, 0, stream1 >>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    kernel_1<<<gridSize1, blockSize1, 0, stream2 >>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    kernel_2<<<gridSize2, blockSize2, 0, stream1 >>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif
    kernel_2<<<gridSize2, blockSize2, 0, stream2 >>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
#endif

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Stream - elapsed time: %3.3f ms \n", time);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);

    printf("Test passed!\n");

    // --- Copy the recorded SM IDs back to the host and print them
    gpuErrchk(cudaMemcpy(h_sm_11, d_sm_11, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_sm_12, d_sm_12, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_sm_21, d_sm_21, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_sm_22, d_sm_22, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));

    printf("Kernel 1: gridSize = %i; blockSize = %i\n", gridSize1, blockSize1);
    printf("Kernel 2: gridSize = %i; blockSize = %i\n", gridSize2, blockSize2);
    for (int i=0; i<gridSize1; i++) {
        printf("Kernel 1 - Data 1: blockNumber = %i; SMID = %d\n", i, h_sm_11[i]);
        printf("Kernel 1 - Data 2: blockNumber = %i; SMID = %d\n", i, h_sm_12[i]);
    }
    for (int i=0; i<gridSize2; i++) {
        printf("Kernel 2 - Data 1: blockNumber = %i; SMID = %d\n", i, h_sm_21[i]);
        printf("Kernel 2 - Data 2: blockNumber = %i; SMID = %d\n", i, h_sm_22[i]);
    }
    cudaDeviceReset();

    return 0;
}

Kernel timings for N = 100 and N = 10000

N = 100
kernel_1 0.003ms
kernel_2 0.005ms

N = 10000
kernel_1 0.011ms
kernel_2 0.053ms

Hence, kernel_2 is computationally more expensive than kernel_1.

Results for N = 100

Kernel 1: gridSize = 1; blockSize = 100
Kernel 2: gridSize = 1; blockSize = 100
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 1

In this case, each kernel is launched with a single block, and this is the timeline.

[Profiler timeline for N = 100: the kernels in the two streams overlap]

As you can see, overlap occurs. Looking at the above results, the scheduler delivers the single block of each of the two kernel_1 calls to the two available SMs in parallel, and then does the same for kernel_2. This appears to be the main reason why overlap occurs.

Results for N = 10000

Kernel 1: gridSize = 14; blockSize = 768
Kernel 2: gridSize = 10; blockSize = 1024
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 1 - Data 1: blockNumber = 1; SMID = 1
Kernel 1 - Data 2: blockNumber = 1; SMID = 0
Kernel 1 - Data 1: blockNumber = 2; SMID = 0
Kernel 1 - Data 2: blockNumber = 2; SMID = 1
Kernel 1 - Data 1: blockNumber = 3; SMID = 1
Kernel 1 - Data 2: blockNumber = 3; SMID = 0
Kernel 1 - Data 1: blockNumber = 4; SMID = 0
Kernel 1 - Data 2: blockNumber = 4; SMID = 1
Kernel 1 - Data 1: blockNumber = 5; SMID = 1
Kernel 1 - Data 2: blockNumber = 5; SMID = 0
Kernel 1 - Data 1: blockNumber = 6; SMID = 0
Kernel 1 - Data 2: blockNumber = 6; SMID = 0
Kernel 1 - Data 1: blockNumber = 7; SMID = 1
Kernel 1 - Data 2: blockNumber = 7; SMID = 1
Kernel 1 - Data 1: blockNumber = 8; SMID = 0
Kernel 1 - Data 2: blockNumber = 8; SMID = 1
Kernel 1 - Data 1: blockNumber = 9; SMID = 1
Kernel 1 - Data 2: blockNumber = 9; SMID = 0
Kernel 1 - Data 1: blockNumber = 10; SMID = 0
Kernel 1 - Data 2: blockNumber = 10; SMID = 0
Kernel 1 - Data 1: blockNumber = 11; SMID = 1
Kernel 1 - Data 2: blockNumber = 11; SMID = 1
Kernel 1 - Data 1: blockNumber = 12; SMID = 0
Kernel 1 - Data 2: blockNumber = 12; SMID = 1
Kernel 1 - Data 1: blockNumber = 13; SMID = 1
Kernel 1 - Data 2: blockNumber = 13; SMID = 0
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 0
Kernel 2 - Data 1: blockNumber = 1; SMID = 1
Kernel 2 - Data 2: blockNumber = 1; SMID = 1
Kernel 2 - Data 1: blockNumber = 2; SMID = 1
Kernel 2 - Data 2: blockNumber = 2; SMID = 0
Kernel 2 - Data 1: blockNumber = 3; SMID = 0
Kernel 2 - Data 2: blockNumber = 3; SMID = 1
Kernel 2 - Data 1: blockNumber = 4; SMID = 1
Kernel 2 - Data 2: blockNumber = 4; SMID = 0
Kernel 2 - Data 1: blockNumber = 5; SMID = 0
Kernel 2 - Data 2: blockNumber = 5; SMID = 1
Kernel 2 - Data 1: blockNumber = 6; SMID = 1
Kernel 2 - Data 2: blockNumber = 6; SMID = 0
Kernel 2 - Data 1: blockNumber = 7; SMID = 0
Kernel 2 - Data 2: blockNumber = 7; SMID = 1
Kernel 2 - Data 1: blockNumber = 8; SMID = 1
Kernel 2 - Data 2: blockNumber = 8; SMID = 0
Kernel 2 - Data 1: blockNumber = 9; SMID = 0
Kernel 2 - Data 2: blockNumber = 9; SMID = 1

And this is the timeline:

[Profiler timeline for N = 10000: no overlap between the two streams]

In this case, no overlap occurs. According to the above results, this does not mean that the two SMs are not exploited simultaneously, but rather (I believe) that, due to the larger number of blocks to be launched, assigning blocks of two different kernels or blocks of the same kernel makes little difference in terms of performance, so the scheduler chooses the second option.

I have tested that the behavior stays the same when more work is done per thread.
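(Following this reasoning, one way to give the scheduler a chance to overlap the two kernels, sketched here as an untested assumption rather than something from the original answer, is to combine the grid-stride kernels shown commented out above with deliberately small grids, so that neither kernel alone fills the device:)

// Hypothetical launch: one block per SM for each kernel, relying on the
// grid-stride loops (3-argument kernel variants above) to still cover all N elements.
int numSMs;
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
kernel_1<<<numSMs, 256, 0, stream1>>>(d_data1, value, N);  // grid-stride version
kernel_2<<<numSMs, 256, 0, stream2>>>(d_data2, value, N);  // grid-stride version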

A similar question about concurrency - CUDA streams and concurrent kernel execution can be found on Stack Overflow: https://stackoverflow.com/questions/17201473/
