gpt4 book ai didi

concurrency - cuda了解并发内核执行

转载 作者:行者123 更新时间:2023-12-04 18:11:47 25 4
gpt4 key购买 nike

我试图了解并行内核执行的工作方式。我写了一个简单的程序来尝试理解它。内核将使用2个流填充2D数组。当有1个流,没有并发时,我得到正确的结果。当我尝试使用2个流进行尝试并发尝试时,我得到了错误的结果。我相信这与内存传输有关,因为我不确定我是否正确,或者我设置内核的方式。编程指南对我的解释不够好。
就我的目的而言,我需要Matlab来调用内核。

据我了解,主程序将:

  • 在主机
  • 上分配固定的内存
  • 在GPU上分配单个流所需的内存(2个流=主机总内存的一半)
  • 创建流
  • 通过流
  • 循环
  • 使用cudaMemcpyAsync()从主机到设备将单个流的内存复制到设备
  • 为流
  • 执行内核
  • 将流的内存复制回主机cudaMemcpyAsync()
  • 我相信我正在做正确的事情,通过使用基于每个流的数据大小和流号的偏移量从每个流所需的位置引用内存。
  • 破坏流
  • 释放内存

  • 这是我尝试使用的代码。

    parallelKernel.cpp
    __global__ void concurrentKernel(int const width, 
    int const streamIdx,
    double *array)
    {
    int thread = (blockIdx.x * blockDim.x) + threadIdx.x;;

    for (int i = 0; i < width; i ++)
    {
    array[thread*width+i] = thread+i*width+1;
    // array[thread*width+i+streamIdx] = thread+i*width+streamIdx*width/2;
    }

    }

    parallelMexFunction.cu
    #include <stdio.h>
    #include <math.h>
    #include "mex.h"

    /* Kernel function */
    #include "concurrentKernel.cpp"


    void mexFunction(int nlhs,
    mxArray *plhs[],
    int nrhs,
    mxArray *prhs[])
    {

    int const numberOfStreams = 2; // set number of streams to use here.
    cudaError_t cudaError;
    int offset;

    int width, height, fullSize, streamSize;
    width = 512;
    height = 512;
    fullSize = height*width;
    streamSize = (int)(fullSize/numberOfStreams);
    mexPrintf("fullSize: %d, streamSize: %d\n",fullSize, streamSize);

    /* Return the populated array */
    double *returnedArray;
    plhs[0] = mxCreateDoubleMatrix(height, width, mxREAL);
    returnedArray = mxGetPr(plhs[0]);

    cudaStream_t stream[numberOfStreams];
    for (int i = 0; i < numberOfStreams; i++)
    {
    cudaStreamCreate(&stream[i]);
    }

    /* host memory */
    double *hostArray;
    cudaError = cudaMallocHost(&hostArray,sizeof(double)*fullSize); // full size of array.
    if (cudaError != cudaSuccess) {mexPrintf("hostArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    for (int i = 0; i < height; i++)
    {
    for (int j = 0; j < width; j++)
    {
    hostArray[i*width+j] = -1.0;
    }
    }

    /* device memory */
    double *deviceArray;
    cudaError = cudaMalloc( (void **)&deviceArray,sizeof(double)*streamSize); // size of array for each stream.
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }


    for (int i = 0; i < numberOfStreams; i++)
    {
    offset = i;//*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);

    cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
    }


    for (int i = 0; i < numberOfStreams; i++)
    {
    cudaStreamDestroy(stream[i]);
    }

    cudaFree(hostArray);
    cudaFree(deviceArray);

    }

    当有2个流时,结果是一个零数组,这使我认为它对内存做错了。
    谁能解释我在做什么错?
    如果有人需要帮助从Matlab编译和运行它们,我可以提供命令来完成。

    更新:
    for (int i = 0; i < numberOfStreams; i++)
    {
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);


    }
    cudaDeviceSynchronize();


    for (int i = 0; i < numberOfStreams; i++)
    {
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);

    cudaStreamDestroy(stream[i]);
    }

    最佳答案

    您需要记住,与流一起使用的API是完全异步的,因此控制权将立即返回给调用宿主线程。如果您没有在运行异步操作的GPU和主机之间插入某种类型的同步点,则无法保证流中已排队的操作实际上已经完成。在您的示例中,这意味着需要执行以下操作:

    for (int i = 0; i < numberOfStreams; i++) 
    {
    offset = i;//*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize,
    cudaMemcpyHostToDevice, stream[i]);

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);

    cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
    cudaMemcpyDeviceToHost, stream[i]);
    }

    // Host thread waits here until both kernels and copies are finished
    cudaDeviceSynchronize();

    for (int i = 0; i < numberOfStreams; i++)
    {
    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
    cudaStreamDestroy(stream[i]);
    }

    这里的关键是,在尝试检查主机内存中的结果之前,您需要确保两个内存传输均已完成。您的原始代码或更新都不会执行此操作。

    关于concurrency - cuda了解并发内核执行,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/12344223/

    25 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com