gpt4 book ai didi

c++ - CUDA 结果使用非常大的数组返回垃圾,但没有报告错误

转载 作者:塔克拉玛干 更新时间:2023-11-03 00:21:03 26 4
gpt4 key购买 nike

我正在编写一个测试程序:它创建一个大小为 n 的设备数组和一个大小为 n 的主机数组,然后启动一个内核,创建 n 个线程,每个线程把常量值 0.95f 写入设备数组中的一个位置。完成后,将设备数组复制到主机数组,对所有元素求和,并显示最终的总和。

下面的程序在数组大小约为 6000 万个 float 时似乎工作正常,并且很快返回正确的结果;但当大小达到 7000 万时,程序似乎会挂起一段时间,最终返回的总和是 NaN。以 6000 万运行后检查主机数组,可以看到它被正确地填充了 0.95f;而以 7000 万运行后检查,则发现它被填充了 NaN。据我所知,没有任何 CUDA 调用返回错误。

我使用的是 2GB GT640m(Compute 3.0),最大块大小为 1024,最大网格尺寸为 2147483647。

我确信有更好的方法可以实现类似的目标,我想听听建议。但我也想了解这里出了什么问题,以便我可以从中吸取教训。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <fstream>

// Reports a failed CUDA runtime call.
// status: the cudaError_t result of the call being checked.
// For any value other than cudaSuccess the fixed text "Error" is printed to
// stdout; the specific error is not identified and execution continues.
void cudaErrorHandler(cudaError_t status)
{
// The CUDA call returned an error; for now only a generic message is printed
if(status != cudaSuccess)
{
printf("Error");
}
}

// Kernel: each thread writes the constant 0.95f to one element of _Results.
// The element index is built from blockIdx.x, threadIdx.x and threadIdx.y only,
// so it matches a launch with a 1D grid of 2D blocks.
// _Results: destination array, indexed up to _TotalCombinations - 1.
// _TotalCombinations: number of elements to write; threads whose index is at
// or beyond this value write nothing.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
// Flat thread id: (threads per block * block index) + (row offset inside the block) + column
unsigned int Id = (blockDim.x * blockDim.y * blockIdx.x) + (blockDim.x * threadIdx.y) + threadIdx.x;

// Only write when the id is inside the requested range
// NOTE(review): Id is unsigned and _TotalCombinations is int, so the comparison
// is carried out in unsigned arithmetic; a negative count would pass this guard.
if(Id < _TotalCombinations)
{
_Results[Id] = 0.95f;
}
}

// Thread-block shape used for the kernel launch: 32 x 32 threads.
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// NOTE(review): the expansion is not parenthesised, so an expression such as
// x / BLOCK_SIZE expands to x / 32 * 32.
#define BLOCK_SIZE BLOCK_DIM_X * BLOCK_DIM_Y // Static block size of 32*32 (1024)
// Passes the cudaError_t produced by x to cudaErrorHandler.
#define CUDA_CALL(x) cudaErrorHandler(x)

// Test driver: fills a device array of totalCombinations floats with 0.95f via
// addKernel, copies it back to the host, sums it in double precision and prints
// the total. Always returns 0.
int main()
{
// The number of simulations to run
unsigned int totalCombinations = 45000000;

int gridsize = 1;

// Work out how many blocks of size 1024 are required to perform all of totalCombinations
// (the loop has an empty body: gridsize is incremented until gridsize * BLOCK_SIZE
// is no longer smaller than totalCombinations)
for(unsigned int totalsize = gridsize * BLOCK_SIZE; totalsize < totalCombinations;
gridsize++, totalsize = gridsize * BLOCK_SIZE)
;

// Allocate host memory and zero it
float* host_results = new float[totalCombinations];
memset(host_results, 0, sizeof(float) * totalCombinations);
float *dev_results = 0;

// NOTE(review): the return value of cudaSetDevice is not passed to CUDA_CALL
cudaSetDevice(0);

// Allocate device memory
// NOTE(review): no matching cudaFree for dev_results appears in this function
CUDA_CALL(cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float)));

dim3 grid, block;

// 2D block of BLOCK_DIM_X x BLOCK_DIM_Y threads
block = dim3(BLOCK_DIM_X, BLOCK_DIM_Y);

// NOTE(review): grid is assigned here but the launch below passes gridsize instead
grid = dim3(gridsize);

// Launch kernel
// NOTE(review): a kernel launch returns no status itself and nothing here calls
// cudaGetLastError() after it, so a rejected launch configuration is never
// handed to cudaErrorHandler
addKernel<<<gridsize, block>>>(dev_results, totalCombinations);

// Wait for the device to finish
CUDA_CALL(cudaDeviceSynchronize());

// Copy device data back to host
CUDA_CALL(cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost));

// Accumulate in double so the sum of many floats keeps precision
double total = 0.0;

// Total the results in the host array
for(unsigned int i = 0; i < totalCombinations; i++)
total+=host_results[i];

// Print results to screen
printf("Total %f\n", total);

delete[] host_results;

return 0;
}

最佳答案

如您所见,您的错误处理方法并没有起作用。下面我粘贴了您的代码的一个版本,其中使用了我常用的错误检查方法。程序在该规模下失败的原因是:您的网格大小(您启动的是一维网格)超过了 X 维度上的最大网格大小(默认编译设置下为 65535,这是计算能力 2.x 及以下的限制)。如果您想使用更大的网格尺寸(计算能力 3.0 的上限是 2^31 - 1),需要使用 -arch=sm_30 开关进行编译。

以下是您的代码的一个版本,仅供参考,其中展示了我常用的错误检查方法。

#include <stdio.h>
#include <fstream>


// Checks the CUDA runtime's last-error state and terminates the program on
// failure. Intended to be placed directly after a runtime API call or a kernel
// launch.
//   msg: C string describing the operation that was just attempted; it is
//        printed together with the CUDA error string, file and line.
// cudaGetLastError() both reads and clears the recorded error, so each use of
// this macro reports only errors raised since the previous check.
// The local variable deliberately avoids a double-underscore name, which is
// reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cudaCheckErrors_err_ = cudaGetLastError(); \
        if (cudaCheckErrors_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    (msg), cudaGetErrorString(cudaCheckErrors_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Kernel: fills the first _TotalCombinations elements of _Results with 0.95f.
//
// Launch layout: 1D grid (only blockIdx.x / gridDim.x are used) of 2D blocks
// (threadIdx.x, threadIdx.y). No shared memory is used.
//   _Results:           destination array with at least _TotalCombinations
//                       elements.
//   _TotalCombinations: number of elements to write; zero or negative values
//                       result in no writes.
//
// Each thread starts at its flat global index and then advances by the total
// number of launched threads, so every element is written even when the grid
// holds fewer threads than there are elements. When the grid covers the whole
// array, each thread writes at most one element, as before.
__global__ void addKernel(float* _Results, int _TotalCombinations)
{
    // A negative count must not be converted to a huge unsigned bound.
    if (_TotalCombinations <= 0)
    {
        return;
    }

    // 64-bit index arithmetic so that large grids cannot wrap around.
    const size_t count = (size_t)_TotalCombinations;
    const size_t threadsPerBlock = (size_t)blockDim.x * blockDim.y;
    const size_t stride = threadsPerBlock * gridDim.x;

    // Flat thread id: block offset + row offset inside the block + column.
    size_t Id = threadsPerBlock * blockIdx.x
              + (size_t)blockDim.x * threadIdx.y
              + threadIdx.x;

    for (; Id < count; Id += stride)
    {
        _Results[Id] = 0.95f;
    }
}

// Thread-block shape used for the kernel launch: 32 x 32 threads.
#define BLOCK_DIM_X 32
#define BLOCK_DIM_Y 32
// Threads per block (32*32 = 1024). The expansion is parenthesised so the macro
// behaves as a single value inside larger expressions such as x / BLOCK_SIZE.
#define BLOCK_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y)

// Test driver: fills a device array of totalCombinations floats with 0.95f via
// addKernel, copies it back to the host, sums it in double precision and prints
// the total. Every CUDA runtime call and the kernel launch are followed by
// cudaCheckErrors, which prints a diagnostic and exits on failure.
// Returns 0 on success, 1 if the element count cannot be passed to the kernel.
int main()
{
    // The number of simulations to run
    const unsigned int totalCombinations = 65000000;

    // The kernel receives the count as int; reject values that would not fit.
    // (~0u >> 1 is the largest int value when int and unsigned int have the
    // same width.)
    const unsigned int maxKernelCount = ~0u >> 1;
    if (totalCombinations > maxKernelCount)
    {
        fprintf(stderr, "totalCombinations %u does not fit the kernel's int parameter\n",
                totalCombinations);
        return 1;
    }

    // Copy BLOCK_SIZE into a variable before doing arithmetic with it, so the
    // result does not depend on how the macro's expansion is parenthesised.
    const unsigned int threadsPerBlock = BLOCK_SIZE;

    // Number of blocks needed to cover totalCombinations (rounded up), computed
    // in unsigned arithmetic; at least one block is always launched.
    unsigned int blocksNeeded = totalCombinations / threadsPerBlock;
    if (totalCombinations % threadsPerBlock != 0)
    {
        blocksNeeded++;
    }
    if (blocksNeeded == 0)
    {
        blocksNeeded = 1;
    }
    const int gridsize = (int)blocksNeeded;

    printf("gridsize = %d, blocksize = %d\n", gridsize, BLOCK_SIZE);

    // Allocate host memory; the trailing () value-initialises every float to 0
    float* host_results = new float[totalCombinations]();
    float* dev_results = 0;

    cudaSetDevice(0);
    cudaCheckErrors("cudaSetDevice fail");

    // Allocate device memory
    cudaMalloc((void**)&dev_results, totalCombinations * sizeof(float));
    cudaCheckErrors("cudaMalloc fail");

    // 1D grid of 2D blocks, matching the index computation inside addKernel
    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(gridsize);

    // Launch kernel; a launch returns no status itself, so the check right
    // after it is what catches an invalid launch configuration
    addKernel<<<grid, block>>>(dev_results, (int)totalCombinations);
    cudaCheckErrors("kernel fail");

    // Wait for the kernel to finish so execution errors surface here
    cudaDeviceSynchronize();
    cudaCheckErrors("sync fail");

    // Copy device data back to host
    cudaMemcpy(host_results, dev_results, totalCombinations * sizeof(float), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Accumulate in double so the sum of many floats keeps precision
    double total = 0.0;

    // Total the results in the host array
    for (unsigned int i = 0; i < totalCombinations; i++)
    {
        total += host_results[i];
    }

    // Print results to screen
    printf("Total %f\n", total);

    // Release the device and host buffers
    cudaFree(dev_results);
    cudaCheckErrors("cudaFree fail");
    delete[] host_results;

    return 0;
}

关于c++ - CUDA 结果使用非常大的数组返回垃圾,但没有报告错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/13532089/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com