
c++ - Why are cudaMemcpyAsync (host-to-device) and a CUDA kernel not concurrent?


I load an image of size 1080 x 1920 (8-bit, unsigned char). For testing purposes, I process the same image 4 times in a for loop and then generate its timeline profile.

Strategy: I split the image into 3 parts and created three streams to process the whole image.

I provide a minimal working example below. Sorry that it needs an image loaded with OpenCV; I don't know how to simulate the same situation without loading an image through OpenCV.

Problem: The timeline profile shows that the first stream has finished its data transfer, yet the kernel assigned to it has still not started. The kernel of the first stream and the data transfer of the third stream do run in parallel. So my question is: why doesn't the kernel of the first stream start in parallel with the data transfer of the second stream?

GPU: NVIDIA Quadro K2000, compute capability 3.0

Timeline profile: each stream is shown in a different color.

[Timeline profile image]

My code:

#include <iostream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>

using namespace cv;

__global__ void multiStream_ColorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // Calculate our pixel's location
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Operate only if we are within the boundaries of one third of the image
    if (x >= 0 && x < iw && y >= 0 && y < ih / 3)
    {
        ptr_dst[numChannels * (iw * y + x) + 0] = ptr_source[numChannels * (iw * y + x) + 0];
        ptr_dst[numChannels * (iw * y + x) + 1] = ptr_source[numChannels * (iw * y + x) + 1];
        ptr_dst[numChannels * (iw * y + x) + 2] = ptr_source[numChannels * (iw * y + x) + 2];
    }
}

void callMultiStreamingCudaKernel(unsigned char *dev_src, unsigned char *dev_dst, int numChannels, int iw, int ih, cudaStream_t *ptr_stream)
{
    dim3 numOfBlocks(iw / 20, ih / 20); // DON'T multiply by 3 because we have 1/3 of the image data
    dim3 numOfThreadsPerBlocks(20, 20);
    multiStream_ColorTransformation_kernel<<<numOfBlocks, numOfThreadsPerBlocks, 0, *ptr_stream>>>(numChannels, iw, ih, dev_src, dev_dst);
}

int main()
{
    cudaStream_t stream_one;
    cudaStream_t stream_two;
    cudaStream_t stream_three;

    cudaStreamCreate(&stream_one);
    cudaStreamCreate(&stream_two);
    cudaStreamCreate(&stream_three);

    Mat image = imread("DijSDK_test_image.jpg", 1);
    //Mat image(1080, 1920, CV_8UC3, Scalar(0, 0, 255));
    size_t numBytes = image.rows * image.cols * 3;
    int numChannels = 3;
    int iw = image.cols; // image width  = number of columns (the original had rows/cols swapped)
    int ih = image.rows; // image height = number of rows
    size_t totalMemSize = numBytes * sizeof(unsigned char);
    size_t oneThirdMemSize = totalMemSize / 3;

    unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;

    //Allocate memory on the device for SOURCE and DESTINATION and get the pointers
    cudaMalloc((void**)&dev_src_1, totalMemSize / 3);
    cudaMalloc((void**)&dev_src_2, totalMemSize / 3);
    cudaMalloc((void**)&dev_src_3, totalMemSize / 3);
    cudaMalloc((void**)&dev_dst_1, totalMemSize / 3);
    cudaMalloc((void**)&dev_dst_2, totalMemSize / 3);
    cudaMalloc((void**)&dev_dst_3, totalMemSize / 3);

    //Destination image for the processed result
    Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
    h_dst = org_dijSDK_img.data;

    //while (1)
    for (int i = 0; i < 3; i++)
    {
        std::cout << "\nLoop: " << i;

        //point the host pointer at the image data
        h_src = image.data;

        //Copy the first third of the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_1, h_src, totalMemSize / 3, cudaMemcpyHostToDevice, stream_one);
        //KERNEL--stream-1
        callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);

        //Copy the second third of the source image to the device
        cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, totalMemSize / 3, cudaMemcpyHostToDevice, stream_two);
        //KERNEL--stream-2
        callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);

        //Copy the last third of the source image to the device
        cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), totalMemSize / 3, cudaMemcpyHostToDevice, stream_three);
        //KERNEL--stream-3
        callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);

        //RESULT copy: GPU to CPU
        cudaMemcpyAsync(h_dst, dev_dst_1, totalMemSize / 3, cudaMemcpyDeviceToHost, stream_one);
        cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, totalMemSize / 3, cudaMemcpyDeviceToHost, stream_two);
        cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, totalMemSize / 3, cudaMemcpyDeviceToHost, stream_three);

        // wait for results
        cudaStreamSynchronize(stream_one);
        cudaStreamSynchronize(stream_two);
        cudaStreamSynchronize(stream_three);

        //Assign the processed data to the display image (h_dst already points at org_dijSDK_img.data)
        org_dijSDK_img.data = h_dst;
        //DISPLAY PROCESSED IMAGE
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }

    cudaDeviceReset();
    return 0;
}

UPDATE-1: If I remove the kernel call of the first stream, then the second kernel and the H2D copy of the third stream overlap somehow (not completely), as shown below.

[Timeline profile image after removing the first stream's kernel call]

UPDATE-2: I even tried with 10 streams, but the result stays the same. The kernel processing of the first stream starts only after the H2D copy of the tenth stream's data has finished.

[Timeline profile image with 10 streams]

Best Answer

As the commenters have already pointed out, the host memory must be page-locked (pinned): an asynchronous copy from pageable host memory is staged through an internal buffer and will not overlap with kernel execution.

There is no need to allocate additional host memory with cudaHostAlloc; you can use cudaHostRegister on your existing OpenCV image:

cudaHostRegister(image.data, totalMemSize, cudaHostRegisterPortable);
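
For the asynchronous device-to-host copies to overlap as well, the destination buffer can be pinned the same way. A minimal sketch, assuming the image and org_dijSDK_img Mats from the code above (error checking omitted):

    // Pin the existing OpenCV buffers once, before the streamed processing loop.
    cudaHostRegister(image.data, totalMemSize, cudaHostRegisterPortable);          // H2D source
    cudaHostRegister(org_dijSDK_img.data, totalMemSize, cudaHostRegisterPortable); // D2H destination

    // ... run the cudaMemcpyAsync / kernel / cudaStreamSynchronize loop from main() ...

    // Unpin before the Mats release their memory.
    cudaHostUnregister(image.data);
    cudaHostUnregister(org_dijSDK_img.data);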

The original question, "c++ - Why are cudaMemcpyAsync (host to device) and a CUDA kernel not concurrent?", can be found on Stack Overflow: https://stackoverflow.com/questions/31450020/
