gpt4 book ai didi

c++ - cudaMemcpy 上的 cudaErrorIllegalAddress

转载 作者:太空宇宙 更新时间:2023-11-04 02:36:32 24 4
gpt4 key购买 nike

我是 cuda 的新手,正在尝试编写一些应该在球体上生成随机点的代码。这是代码。

    // Kernel: initialises one cuRAND MRG32k3a generator state per thread.
    // Each thread derives a flat index `id` from the x components of its
    // thread/block coordinates and calls curand_init with seed 0,
    // subsequence `id` and offset 0 on state[id].
    //
    // NOTE(review): there is no bounds check on `id`, so every launched thread
    // writes state[id]; `state` therefore has to hold at least
    // gridDim.x * blockDim.x entries for this kernel to stay in bounds.
    __global__ 
void setup_kernel(curandStateMRG32k3a *state)
{
// Flat global thread index (only the x dimension is used).
int id = threadIdx.x + blockIdx.x * blockDim.x;
// Same seed for every thread; the per-thread subsequence number is `id`.
curand_init(0, id, 0, &state[id]);
}

// Kernel: for each index i < numberOfElements, draws a pair (a, b) with
// a*a + b*b <= 1 and writes the derived triple to x[i], y[i], z[i].
// With s = a*a + b*b <= 1 the written values satisfy x^2 + y^2 + z^2 = 1,
// i.e. the triple lies on the unit sphere.
//
// Parameters:
//   x, y, z          - output arrays, written at index i only when
//                      i < numberOfElements
//   numberOfElements - number of triples to produce
//   state            - per-thread cuRAND MRG32k3a states, read at index i
//
// NOTE(review): state[i] is read BEFORE the `i < numberOfElements` check, so
// threads with i >= numberOfElements still read state[i].
// NOTE(review): localState is never copied back to state[i], so the advanced
// generator state is discarded when the kernel returns.
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
// Flat global thread index (only the x dimension is used).
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
// Work on a local copy of this thread's generator state.
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
// NOTE(review): the first pair is taken straight from curand_uniform without
// the `* 2.0f - 1.0f` rescaling applied inside the loop - TODO confirm this
// is intended.
a = curand_uniform(&localState);
b = curand_uniform(&localState);
// Rejection loop: redraw until the pair lies inside the unit disc.
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
// Map the accepted pair (a, b) to the three output coordinates.
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}

// Fills h_x, h_y and h_z with the per-element output of computeRandomVectors.
// The function allocates zeroed host staging buffers and matching device
// buffers, allocates and initialises one RNG state per element, runs the
// kernel, copies the results back, stores them into the three vectors and
// finally releases everything and resets the device.
//
// Parameters:
//   h_x, h_y, h_z - host vectors that receive the results; h_x.size()
//                   determines how many elements are produced.
//
// Errors are reported on std::cout. Allocation failures return early; copy
// failures are only printed and execution continues.
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
// NOTE(review): because of the `&&`, this returns only when h_x differs in
// size from BOTH h_y and h_z; a mismatch with just one of them passes.
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}

// `size` is a BYTE count (element count times sizeof(float)).
size_t size = h_x.size() * sizeof(float);

// Zero-initialised host staging buffers, one per component.
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
// NOTE(review): buffers that were allocated successfully are not freed here.
std::cout << "Host memory allocation failure" << std::endl;
return;
}

float* d_p_x;
float* d_p_y;
float* d_p_z;

// Device buffers of `size` bytes each.
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
// NOTE(review): the host buffers and any device buffers already allocated
// are not released on this early return.
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
// One generator state per element: h_x.size() entries.
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}

int threads = 256;
// NOTE(review): the block count is computed from `size`, which is in bytes,
// not from the element count. devStates holds only h_x.size() states and
// setup_kernel performs no bounds check on its index.
dim3 grid = size / threads;
// NOTE(review): neither kernel launch in this function is followed by
// cudaGetLastError()/cudaDeviceSynchronize(), so a fault inside a kernel
// would only be reported by a later call such as the cudaMemcpy below.
setup_kernel<<<grid,threads>>>(devStates);

// Upload the zeroed staging buffers; a failure is printed but not fatal.
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}

// size / sizeof(float) is the element count, i.e. h_x.size().
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);

// Download the results; a failure is printed but not fatal.
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
// Copy the staging buffers element by element into the caller's vectors.
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}

// Release host and device resources, then reset the device.
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}

如果 vector 中的元素数量不超过 4000(我尝试了 1K、2K、3K 和 4K),这段代码可以正常运行。超过这个数量后,第一个 cudaMemcpy 就会返回 cudaErrorIllegalAddress。我认为显存并没有耗尽,因为我使用的是 GTX 980(4GB 全局内存)。知道如何解决这个问题吗?


编辑:建议修改后的代码如下:

// Kernel: initialises the cuRAND MRG32k3a state of every thread whose flat
// index is below numberOfElements. Each such thread calls curand_init with
// seed 0, its own index as the subsequence number and offset 0.
//
// Parameters:
//   state            - array of generator states, written at the thread index
//   numberOfElements - number of states to initialise; threads at or beyond
//                      this index do nothing
__global__
void setup_kernel(curandStateMRG32k3a *state, unsigned int numberOfElements)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads of the last block have no state to initialise.
    if (tid >= numberOfElements)
    {
        return;
    }

    curand_init(0, tid, 0, &state[tid]);
}

// Kernel: for each index i < numberOfElements, draws a pair (a, b) with
// a*a + b*b <= 1 and writes the derived triple to x[i], y[i], z[i].
// With s = a*a + b*b <= 1 the written values satisfy x^2 + y^2 + z^2 = 1,
// i.e. the triple lies on the unit sphere.
//
// Parameters:
//   x, y, z          - output arrays, written at index i only when
//                      i < numberOfElements
//   numberOfElements - number of triples to produce
//   state            - per-thread cuRAND MRG32k3a states, read at index i
//
// NOTE(review): state[i] is read BEFORE the `i < numberOfElements` check, so
// threads with i >= numberOfElements still read state[i]. If `state` holds
// only numberOfElements entries and the launch is rounded up to whole blocks,
// those surplus threads read past the end of the array.
// NOTE(review): localState is never copied back to state[i], so the advanced
// generator state is discarded when the kernel returns.
__global__
void computeRandomVectors(float* x, float* y, float* z, unsigned int numberOfElements,curandStateMRG32k3a *state)
{
float a,b;
// Flat global thread index (only the x dimension is used).
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
// Work on a local copy of this thread's generator state (unguarded read).
curandStateMRG32k3a localState = state[i];
if(i < numberOfElements)
{
// NOTE(review): the first pair is taken straight from curand_uniform without
// the `* 2.0f - 1.0f` rescaling applied inside the loop - TODO confirm this
// is intended.
a = curand_uniform(&localState);
b = curand_uniform(&localState);
// Rejection loop: redraw until the pair lies inside the unit disc.
while(a * a + b * b > 1.0f)
{
a = curand_uniform(&localState) * 2.0f - 1.0f;
b = curand_uniform(&localState) * 2.0f - 1.0f;
}
// Map the accepted pair (a, b) to the three output coordinates.
x[i] = 2.0f * a * sqrtf(1.0f - a * a - b * b);
y[i] = 2.0f * b * sqrtf(1.0f - a * a - b * b);
z[i] = 1.0f - 2.0f * (a * a + b * b);
}
}

// Fills h_x, h_y and h_z with the per-element output of computeRandomVectors.
// The function allocates zeroed host staging buffers and matching device
// buffers, allocates one RNG state per element, uploads the staging buffers,
// launches setup_kernel and computeRandomVectors, copies the results back,
// stores them into the three vectors and finally releases everything and
// resets the device.
//
// Parameters:
//   h_x, h_y, h_z - host vectors that receive the results; h_x.size()
//                   determines how many elements are produced.
//
// Errors are reported on std::cout. Allocation failures return early; copy
// failures are only printed and execution continues.
void generatePointsOnASphere(thrust::host_vector<float>& h_x, thrust::host_vector<float>& h_y, thrust::host_vector<float>& h_z)
{
// NOTE(review): because of the `&&`, this returns only when h_x differs in
// size from BOTH h_y and h_z; a mismatch with just one of them passes.
if(h_x.size() != h_y.size() && h_x.size() != h_z.size())
{
std::cout << "The three component vectors have unmatching size()" << std::endl;
return;
}

// `size` is a BYTE count (element count times sizeof(float)).
size_t size = h_x.size() * sizeof(float);

// Zero-initialised host staging buffers, one per component.
float* h_p_x = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_y = (float*) calloc(h_x.size(),sizeof(float));
float* h_p_z = (float*) calloc(h_x.size(),sizeof(float));
if(h_p_x==NULL || h_p_y==NULL || h_p_z==NULL)
{
// NOTE(review): buffers that were allocated successfully are not freed here.
std::cout << "Host memory allocation failure" << std::endl;
return;
}

float* d_p_x;
float* d_p_y;
float* d_p_z;

// Device buffers of `size` bytes each.
if(cudaMalloc((void **)&d_p_x,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_y,size) != cudaSuccess ||
cudaMalloc((void **)&d_p_z,size) != cudaSuccess)
{
// NOTE(review): the host buffers and any device buffers already allocated
// are not released on this early return.
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device memory allocation failure" << std::endl;
return;
}
// One generator state per element: h_x.size() entries.
curandStateMRG32k3a *devStates;
if(cudaMalloc((void **)&devStates, h_x.size() * sizeof(curandStateMRG32k3a)) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Random generator states memory allocation failure" << std::endl;
return;
}

// Upload the zeroed staging buffers; a failure is printed but not fatal.
if(cudaMemcpy(d_p_x,h_p_x,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_y,h_p_y,size,cudaMemcpyHostToDevice) != cudaSuccess ||
cudaMemcpy(d_p_z,h_p_z,size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Host to Device memory copy failure" << std::endl;
}

int threads = 512;
// Block count rounded up so that grid * threads >= h_x.size(); when
// h_x.size() is not a multiple of `threads`, the last block contains
// threads whose index is >= h_x.size().
dim3 grid = (h_x.size() + threads - 1) / threads;
// size / sizeof(float) is the element count, i.e. h_x.size().
setup_kernel<<<grid,threads>>>(devStates, size / sizeof(float));
// NOTE(review): computeRandomVectors reads state[i] before its bounds check,
// and devStates holds only h_x.size() entries, so the surplus threads of the
// last block read beyond the allocation.
computeRandomVectors<<< grid, threads >>>(d_p_x,d_p_y,d_p_z,size / sizeof(float), devStates);
// NOTE(review): the return value of cudaDeviceSynchronize() is ignored and
// the launches are not followed by cudaGetLastError(), so a kernel fault is
// first reported by the cudaMemcpy calls below.
cudaDeviceSynchronize();
// Download the results; a failure is printed but not fatal.
if(cudaMemcpy(h_p_x,d_p_x,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_y,d_p_y,size,cudaMemcpyDeviceToHost) != cudaSuccess ||
cudaMemcpy(h_p_z,d_p_z,size,cudaMemcpyDeviceToHost) != cudaSuccess)
{
std::string errorString(cudaGetErrorName(cudaGetLastError()));
std::cout << errorString << std::endl;
std::cout << "Device to Host memory copy failure" << std::endl;
}
// Copy the staging buffers element by element into the caller's vectors.
for(size_t i = 0; i < h_x.size(); ++i)
{
h_x[i] = h_p_x[i];
h_y[i] = h_p_y[i];
h_z[i] = h_p_z[i];
}

// Release host and device resources, then reset the device.
free (h_p_x);
free (h_p_y);
free (h_p_z);
cudaFree (devStates);
cudaFree (d_p_x);
cudaFree (d_p_y);
cudaFree (d_p_z);
cudaDeviceReset();
}

很抱歉继续在这里追问,但我认为弄清楚现在的错误有助于我更好地理解 CUDA。现在,当 h_x.size() 为 20k 时,我在设备到主机(device->host)的 cudaMemcpy 上收到 cudaErrorIllegalAddress。我仍然不明白,为什么这段代码对较小的元素数量有效,而对较大的数量无效。

最佳答案

问题出在这里:

  // `size` is a byte count: the element count multiplied by sizeof(float).
  size_t size = h_x.size() * sizeof(float);

...
int threads = 256;
// The block count is derived from the byte count `size`, not from the
// element count h_x.size().
dim3 grid = size / threads;

您的 size 变量是以字节数为单位的,因此它不是用来计算网格大小的正确变量。您应该像这样按元素个数计算网格大小:

  // Block count from the element count; the integer division truncates, so
  // a trailing partial block is not launched.
  dim3 grid = h_x.size() / threads;

或类似的写法。另请注意,除非 vector 长度(h_x.size())可以被 threads(即 256)整除,否则这种写法不会正确初始化所有 curand 状态。解决办法是在您的 setup_kernel 中加入线程检查,类似于您另一个内核中的线程检查:

// Kernel: initialises the cuRAND MRG32k3a state of every thread whose flat
// index is below `size`, using seed 0, the thread index as the subsequence
// number and offset 0.
//
// Parameters:
//   state - array of generator states, written at the thread index
//   size  - number of states to initialise; threads with an index >= size
//           return without touching `state`
__global__
void setup_kernel(curandStateMRG32k3a *state, int size)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads beyond the requested range have nothing to do.
    if (!(tid < size))
    {
        return;
    }

    curand_init(0, tid, 0, &state[tid]);
}

并启动足够多的线程来覆盖 vector 大小:

  // Rounds the block count up so that grid * threads >= h_x.size().
  dim3 grid = (h_x.size()+threads-1) / threads;

关于c++ - cudaMemcpy 上的 cudaErrorIllegalAddress,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36508747/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com