gpt4 book ai didi

c++ - cudaGetLastError 返回 "unknown error"

转载 作者:行者123 更新时间:2023-11-30 02:04:44 24 4
gpt4 key购买 nike

我是 CUDA C 的新手。我正在编写一个简单的数组添加和减少,当它运行错误检查以从设备复制回主机时,我收到“未知错误”。我不确定错误检查器是否有问题并且没有返回正确的 cudaError 但我无法找出问题所在......

using namespace std;


#include <iostream>

void CudaAddReduce(int *input, int *output, size_t size);

__global__ void Fill(int *fillItem);

__global__ void Add(int *input1, int *result);

__global__ void Reduce(int *intputArray, int *outputArray);



main(int argc, char *argv[])
{
const int N = 100;
int inp[N];
int outp[N];
size_t size = (N * sizeof(int));

CudaAddReduce(inp,outp,size);

cout << outp[N] << endl;

}

void CudaAddReduce(int *input, int *output, size_t size)
{

// allocate buffers to device

//input
int *d_input;
if (cudaMalloc(&d_input,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
cout << "Input allocation to device" << endl;
exit(1);
}
////////////////////////////
//output
int *d_output;
if (cudaMalloc(&d_output,size) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) <<endl;
cout << "output allocation to device" << endl;
exit(1);
}

//////////////////////////////////
//copy buffers to device from host
//////////////////////////////////
//input
if (cudaMemcpy(d_input, input, size, cudaMemcpyHostToDevice) != cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
cout << "Input Copy from host to device" << endl;
exit(1);
}

/////////////////////////////////
//execute device kernals
/////////////////////////////////
int numThreads = 256;
int numBlocks = 1;

//Fill Kernal
Fill<<<numBlocks,numThreads>>>(d_input);



// Add Kernal
Add<<<numBlocks,numThreads>>>(d_input,d_output);


//execute Reduce Kernal
Reduce<<<numBlocks,numThreads>>>(d_output,d_input);

cudaThreadSynchronize();

/////////////////////////////////
//copy result from device to host
/////////////////////////////////
//output


if (cudaMemcpy(output,d_output,size,cudaMemcpyDeviceToHost)!= cudaSuccess){
cerr << cudaGetErrorString(cudaGetLastError()) << endl;
cout << "Output Copy from device to host" << endl;
exit(1);
}

//clear device buffers
cudaFree(d_input);
cudaFree(d_output);
}

__global__ void Fill(int *fillItem)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
fillItem[id] = 1;
}
__global__ void Add (int *input1, int* result)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
result[id] = input1[id] + input1[id];
}
__global__ void Reduce(int *inputArray, int *outputArray)
{
extern __shared__ int sdata[];

// each thread loads one element from global to shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = inputArray[i];
__syncthreads();

// do reduction in shared mem
for(unsigned int s=1; s < blockDim.x; s *= 2)
{
if(tid % (2*s) == 0)
{
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}


// write result for this block to global mem
if(tid == 0) outputArray[blockIdx.x] = sdata[0];
}

谢谢

最佳答案

编辑:在您的内核中,您试图越界访问数组元素!您的数组大小为 100,但您使用的线程维度为 256,每个线程都试图写入该维度!您需要使用一致的尺寸。

您在什么时候收到错误?这两个 malloc 函数看起来应该可以正常运行,而且 cudaGetErrorString 不太可能出错。我对未知错误的典型体验是,您正试图从不应该复制的地方复制或复制到不应该复制的地方,或者复制大小错误。

为什么要将未分配的数组复制到内存中?您从未填写过 main 中的数组。

此外,您不需要使用<<<>>>声明内核函数。这些只有在您使用该功能时才需要。

关于c++ - cudaGetLastError 返回 "unknown error",我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/10271747/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com