gpt4 book ai didi

cuda - 费米架构的错误依赖问题

转载 作者:行者123 更新时间:2023-12-01 16:43:32 25 4
gpt4 key购买 nike

我正在尝试使用 3 个流实现“3 方向重叠”,如 CUDA streams and concurrency webinar 中的示例所示。但我无法实现它。

我拥有 Geforce GT 550M(带有一个复制引擎的 Fermi 架构),并且使用的是 Windows 7(64 位)。

这是我编写的代码。

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

// Squares every element of `in` into `out` using a grid-stride loop, so a
// launch of any grid size covers all `dataSize` elements.
__global__ void kernel(const int *in, int *out, int dataSize)
{
    int idx  = blockIdx.x * blockDim.x + threadIdx.x;
    int step = blockDim.x * gridDim.x;

    for (int k = idx; k < dataSize; k += step)
        out[k] = in[k] * in[k];
}

// Breadth-first stream-overlap experiment: split the data over NUM_STREAMS
// streams, issue all H2D copies, then all kernels, then all D2H copies, and
// compare the GPU result against a CPU reference.
int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    // NOTE: assumes DATA_SIZE is divisible by NUM_STREAMS, and streamSize by 2
    // (the two half-kernels below); otherwise the tail elements are skipped.
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start, 0));

    // Breadth-first issue order: all H2D copies first, one chunk per stream.
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        checkCudaErrors(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]));
    }

    // Then all kernel launches: each stream's chunk is processed by two
    // half-sized launches so copies of other streams can interleave.
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    }
    // Kernel launches return no status directly; surface launch errors here.
    checkCudaErrors(cudaGetLastError());

    // Finally all D2H copies.
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        checkCudaErrors(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]));
    }

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    checkCudaErrors(cudaEventRecord(stop, 0));

    checkCudaErrors(cudaStreamSynchronize(0));

    // Guarantees the stop event has completed before reading the elapsed time.
    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    checkCudaErrors(cudaDeviceReset());

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;

    // GPU output check: the sum of per-element differences is 0 on success.
    int sum = 0;
    for(int i = 0; i < dataSize; i++)
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}

使用 Nsight 进行分析,我得到以下结果:

enter image description here

这似乎是正确的,但为什么流 #1 中的 D2H 传输仅在流 #2 的最后一次内核启动时才开始,而不是更早?我还尝试使用 8 个流(只需将 NUM_STREAMS 更改为 8)来实现这样的“3 方向重叠”,结果如下:

enter image description here

有趣的是,当我使用 8 流时,计算和内存传输之间的重叠似乎要好得多。

这个问题的原因是什么?是WDDM驱动的问题还是我的程序有问题?

最佳答案

从上面的评论来看,OP的问题似乎是一个错误依赖问题,受到Fermi架构的困扰,并通过Kepler架构的Hyper-Q功能解决。

总而言之,OP 强调了这样一个事实:第一个 D2H 传输(流 #1)并不会在最后一个 H2D 传输(流 #3)完成后立即开始,尽管原则上可以。该时间差距在下图中用红色圆圈突出显示(此后,除非另有说明,所有测试均在 Fermi 系列的 GeForce GT540M 上进行):

enter image description here

OP 的方法是一种广度优先方法,它根据以下方案运行:

for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]);

for(int i = 0; i < NUM_STREAMS; i++)
{
kernel_launch_1<<<..., 0, streams[i]>>>(...);
kernel_launch_2<<<..., 0, streams[i]>>>(...);
}

for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]);

采用深度优先方法,按照以下方案进行操作

for(int i = 0; i < NUM_STREAMS; i++)
{
cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);

kernel_launch_1<<<...., 0, streams[i]>>>(....);
kernel_launch_2<<<...., 0, streams[i]>>>(....);

cudaMemcpyAsync(...., cudaMemcpyDeviceToHost, streams[i]);
}

根据以下时间线,情况似乎没有改善(深度优先代码在答案的底部报告),但似乎显示出更糟糕的重叠:

enter image description here

在广度优先方法下,注释掉第二个内核启动后,第一个 D2H 拷贝会立即启动,如以下时间线所示:

enter image description here

最后在Kepler K20c上运行代码,问题没有出现,如下图所示:

enter image description here

这是深度优先方法的代码:

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

// Squares each element of `in` into `out`. Uses a grid-stride loop so the
// launch configuration does not need to match dataSize exactly.
__global__ void kernel(const int *in, int *out, int dataSize)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int end = dataSize;
// Each thread strides by the total number of launched threads.
for (int i = start; i < end; i += blockDim.x * gridDim.x)
{
out[i] = in[i] * in[i];
}
}

// Depth-first stream-overlap experiment: for each stream in turn, issue its
// H2D copy, its two kernel launches, and its D2H copy before moving on to the
// next stream. Results are checked against a CPU reference.
int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    // NOTE: assumes DATA_SIZE is divisible by NUM_STREAMS, and streamSize by 2
    // (the two half-kernels below); otherwise the tail elements are skipped.
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start, 0));

    // Depth-first issue order: the whole copy/compute/copy chain for one
    // stream is queued before touching the next stream.
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;

        checkCudaErrors(cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]));

        // Each stream's chunk is processed by two half-sized launches.
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);

        checkCudaErrors(cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]));
    }
    // Kernel launches return no status directly; surface launch errors here.
    checkCudaErrors(cudaGetLastError());

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    checkCudaErrors(cudaEventRecord(stop, 0));

    checkCudaErrors(cudaStreamSynchronize(0));

    // Guarantees the stop event has completed before reading the elapsed time.
    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    checkCudaErrors(cudaDeviceReset());

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;

    // GPU output check: the sum of per-element differences is 0 on success.
    int sum = 0;
    for(int i = 0; i < dataSize; i++)
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}

关于cuda - 费米架构的错误依赖问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24754878/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com