CUDA pinned memory flushing from the device


CUDA 5, compute capability 3.5, VS 2012, 64-bit Windows Server 2012.

There is no shared memory access between threads; every thread is standalone.

I am using pinned memory with zero-copy. From the host, I can only read the pinned memory the device has written once I issue cudaDeviceSynchronize on the host.

I want to be able to:

  1. Flush into the pinned memory as soon as the device has updated it.
  2. Not block the device thread (maybe by copying asynchronously)

I tried calling __threadfence_system and __threadfence after each device write, but nothing was flushed.

Below is a full sample CUDA code that demonstrates my question:

#include <conio.h>
#include <cstdio>
#include <cstdlib>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    printf("Kernel %u: Before Writing in Kernel\n", tid);
    hResult[tid] = tid + 1;
    __threadfence_system();
    // expecting that the data is getting flushed to host here!
    printf("Kernel %u: After Writing in Kernel\n", tid);
    // time waster for-loop (sleep)
    for (int timeWater = 0; timeWater < 100000000; timeWater++);
}

int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    int filledElementsCounter = 0;
    // naive polling loop that could be implemented in
    // another host thread
    while (filledElementsCounter < blocks)
    {
        // blocks until the value changes; this moves sequentially
        // while threads have no order (fine for this sample).
        while (hResult[filledElementsCounter] == 0);
        printf("%f\n", hResult[filledElementsCounter]);
        filledElementsCounter++;
    }
    cudaFreeHost((void*)hResult);
    system("pause");
    return 0;
}

Currently this sample waits indefinitely, because nothing is read back from the device unless I issue cudaDeviceSynchronize. The sample below works, but it is not what I want, since it defeats the purpose of asynchronous copying:

int main()
{
    size_t blocks = 2;
    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);
    Kernel<<<1, blocks>>>(hResult);
    cudaError_t error = cudaDeviceSynchronize();
    if (error != cudaSuccess) { throw; }
    for (int i = 0; i < blocks; i++)
    {
        printf("%f\n", hResult[i]);
    }
    cudaFreeHost((void*)hResult);
    system("pause");
    return 0;
}
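
For reference, the CUDA documentation asks for mapped (zero-copy) pinned memory to be enabled with cudaSetDeviceFlags(cudaDeviceMapHost) before the context is created, and the device-side pointer can be obtained explicitly with cudaHostGetDevicePointer (on 64-bit UVA platforms passing the host pointer directly, as above, usually works as well). A minimal setup sketch along those lines, mirroring the sample above with error checking omitted:

// Setup sketch for mapped (zero-copy) pinned memory.
// "Kernel" is the __global__ function from the sample above; error checks omitted.
int main()
{
    size_t blocks = 2;

    // Must run before the CUDA context is created (i.e. before the first
    // runtime call that touches the device).
    cudaSetDeviceFlags(cudaDeviceMapHost);

    volatile float* hResult;
    cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped);

    // Query the device-side alias of the pinned allocation explicitly
    // instead of relying on unified virtual addressing.
    float* dResult;
    cudaHostGetDevicePointer((void**)&dResult, (void*)hResult, 0);

    Kernel<<<1, blocks>>>(dResult);

    // ... same host-side polling (or synchronization) as in the samples above ...

    cudaFreeHost((void*)hResult);
    return 0;
}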

Best answer

I played with your code on CentOS 6.2 with CUDA 5.5 and a Tesla M2090 and can conclude the following:

That it does not work on your system must be a driver problem, and I suggest that you get the TCC drivers.

I attached my code, which runs fine and does what you want. The values show up on the host side before the kernel has finished. As you can see, I added some compute code to prevent the for loop from being removed by compiler optimization, and I added a stream and a callback that is executed after all work in the stream has finished. The program prints 1 2 and then does nothing for a long time, until stream finished... is printed to the console.

#include <iostream>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define SEC_CUDA_CALL(val) checkCall ( (val), #val, __FILE__, __LINE__ )

bool checkCall(cudaError_t result, char const* const func, const char* const file, int const line)
{
    if (result != cudaSuccess)
    {
        std::cout << "CUDA (runtime api) error: " << func << " failed! "
                  << cudaGetErrorString(result) << " (" << result << ") "
                  << file << ":" << line << std::endl;
    }
    return result != cudaSuccess;
}

class Callback
{
public:
    static void CUDART_CB dispatch(cudaStream_t stream, cudaError_t status, void* userData);

private:
    void call();
};

void CUDART_CB Callback::dispatch(cudaStream_t stream, cudaError_t status, void* userData)
{
    Callback* cb = (Callback*) userData;
    cb->call();
}

void Callback::call()
{
    std::cout << "stream finished..." << std::endl;
}

__global__ void Kernel(volatile float* hResult)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    hResult[tid] = tid + 1;
    __threadfence_system();
    float A = 0;
    for (int timeWater = 0; timeWater < 100000000; timeWater++)
    {
        A = sin(cos(log(hResult[0] * hResult[1]))) + A;
        A = sqrt(A);
    }
}

int main(int argc, char* argv[])
{
    size_t blocks = 2;
    volatile float* hResult;
    SEC_CUDA_CALL(cudaHostAlloc((void**)&hResult, blocks * sizeof(float), cudaHostAllocMapped));

    cudaStream_t stream;
    SEC_CUDA_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    Callback obj;
    Kernel<<<1, blocks, 0, stream>>>(hResult);
    SEC_CUDA_CALL(cudaStreamAddCallback(stream, Callback::dispatch, &obj, 0));

    int filledElementsCounter = 0;

    while (filledElementsCounter < blocks)
    {
        while (hResult[filledElementsCounter] == 0);
        std::cout << hResult[filledElementsCounter] << std::endl;
        filledElementsCounter++;
    }

    SEC_CUDA_CALL(cudaStreamDestroy(stream));
    SEC_CUDA_CALL(cudaFreeHost((void*)hResult));
    return 0;
}

No call returned an error and cuda-memcheck found no problems. This works as expected. You should really try the TCC driver.
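
If you are unsure which driver model a Windows device is currently using, the runtime reports it in cudaDeviceProp::tccDriver; a small sketch that prints it (assuming device 0):

#include <cstdio>
#include "cuda_runtime.h"

// Sketch: report whether the current device runs the TCC or the WDDM driver.
int main()
{
    int device = 0;
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device) != cudaSuccess)
    {
        printf("cudaGetDeviceProperties failed\n");
        return 1;
    }
    // tccDriver is 1 for a Tesla-class device running the TCC driver,
    // 0 otherwise (e.g. the WDDM driver on Windows).
    printf("%s: %s driver model\n", prop.name, prop.tccDriver ? "TCC" : "WDDM");
    return 0;
}

Switching between WDDM and TCC is typically done with nvidia-smi (for example nvidia-smi -i 0 -dm 1, run as administrator), though the exact option syntax depends on the driver version.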

Regarding flushing CUDA pinned memory from the device, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/16417346/
