c - GPU (CUDA) and CPU computation results differ

I want to create a program that generates fractals on my GPU. First I built a working project in plain C, then I tried to convert it to CUDA/C.

Unfortunately, after doing so I noticed a difference between the CPU and GPU results.

I have spent several hours wondering what I did wrong, and it remains a mystery to me.

IMO: it seems that the values computed inside the while loop differ, so the loop ends earlier than in the normal CPU function.

Question: can this really happen? And if so, what can I do to avoid this kind of computation error?

Here is my full code:

// C libs
#include <stdint.h>
#include <stdio.h>
#include <iostream>

// Help libs
#include <windows.h>
#include <math.h>

// CUDA libs
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void calulateFractal(unsigned char *a, int N, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if(i < N)
    {
        // Map the linear index to pixel coordinates
        int x = i % width;
        int y = i / width;

        // Map the pixel to a point in the complex plane
        double p_im = y * ratioY + minY;
        double p_re = x * ratioX + minX;

        double z_re = p_re;
        double z_im = p_im;

        int iteration = 0;

        // Iterate z = z^2 + c until |z|^2 >= 4 or the iteration cap is hit
        while ((z_re * z_re + z_im * z_im) < 4 && iteration < maxLevel)
        {
            double tmp_re = z_re * z_re - z_im * z_im + c_re;
            double tmp_im = 2 * z_re * z_im + c_im;
            z_re = tmp_re;
            z_im = tmp_im;
            iteration++;
        }

        a[i] = iteration;
    }
}
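
For reference, the loop above implements the standard escape-time iteration for a Julia set: each pixel is mapped to a starting point z_0 = p_re + p_im·i in the complex plane, and then

    z_{n+1} = z_n^2 + c

is repeated until |z_n|^2 >= 4 (the point escapes) or n reaches maxLevel; the final iteration count becomes the pixel's level.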

void calulateFractalCPU(unsigned char *a, int i, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
    int x = i % width;
    int y = i / width;

    double p_im = y * ratioY + minY;
    double p_re = x * ratioX + minX;

    double z_re = p_re;
    double z_im = p_im;

    int iteration = 0;

    // note: this loop caps at 99 iterations, while the kernel uses maxLevel (100 in main)
    while ((z_re * z_re + z_im * z_im) < 4 && iteration < 99)
    {
        double tmp_re = z_re * z_re - z_im * z_im + c_re;
        double tmp_im = 2 * z_re * z_im + c_im;
        z_re = tmp_re;
        z_im = tmp_im;
        iteration++;
    }

    a[i] = iteration;
}

int saveFractalToBitmap(unsigned char **colorsArray, unsigned char *bitmap, int width, int height, char *filename)
{
    // Bitmap structures to be written to file
    BITMAPFILEHEADER bfh;
    BITMAPINFOHEADER bih;

    // Fill BITMAPFILEHEADER structure
    memcpy((char *)&bfh.bfType, "BM", 2);
    bfh.bfSize = sizeof(bfh) + sizeof(bih) + 3*height*width;
    bfh.bfReserved1 = 0;
    bfh.bfReserved2 = 0;
    bfh.bfOffBits = sizeof(bfh) + sizeof(bih);

    // Fill BITMAPINFOHEADER structure
    bih.biSize = sizeof(bih);
    bih.biWidth = width;
    bih.biHeight = height;
    bih.biPlanes = 1;
    bih.biBitCount = 24;
    bih.biCompression = BI_RGB; // uncompressed 24-bit RGB
    bih.biSizeImage = 0; // can be zero for BI_RGB bitmaps
    bih.biXPelsPerMeter = 3780; // 96dpi equivalent
    bih.biYPelsPerMeter = 3780;
    bih.biClrUsed = 0;
    bih.biClrImportant = 0;

    // Open bitmap file (binary mode)
    FILE *f;
    f = fopen(filename, "wb");

    if(f == NULL)
        return -1;

    // Write bitmap file header
    fwrite(&bfh, 1, sizeof(bfh), f);
    fwrite(&bih, 1, sizeof(bih), f);

    // Write bitmap pixel data starting with the
    // bottom line of pixels, left hand side
    // (note: BMP rows must be padded to a multiple of 4 bytes;
    // this loop assumes 3*width is already such a multiple)
    for (int i = 0; i < width * height; i++)
    {
        // Write pixel components in BGR order
        fputc(colorsArray[bitmap[i]][2], f);
        fputc(colorsArray[bitmap[i]][1], f);
        fputc(colorsArray[bitmap[i]][0], f);
    }

    // Close bitmap file
    fclose(f);

    return 0;
}

int main()
{
    unsigned char **colorsArray;
    unsigned char *fractalLevelsCPU;
    unsigned char *fractalLevelsGPU;

    double minX = -1.7;
    double maxX = 1.7;
    double minY = -1.5;
    double maxY = 1.5;

    double input_re = -0.79;
    double input_im = 0.1463;

    int width = 10;
    int height = 5;
    int N = width * height;
    int maxLevel = 100;
    size_t levelsArraySize = N * sizeof(unsigned char);

    double ratioX = (maxX - minX) / (double) width;
    double ratioY = (maxY - minY) / (double) height;

    bool gpu = true;

    // Allocate memory
    colorsArray = (unsigned char**) malloc((maxLevel+1) * sizeof(unsigned char*));
    for(int i=0; i<=maxLevel; i++)
    {
        colorsArray[i] = (unsigned char *) malloc(3 * sizeof(unsigned char));
        colorsArray[i][0] = (int) (255.0 * i / maxLevel);
        colorsArray[i][1] = (int) (255.0 * i / maxLevel);
        colorsArray[i][2] = (int) (255.0 * log((double) i) / log((double) maxLevel)); // note: log(0) at i = 0 is -infinity, so this cast is undefined
    }

    fractalLevelsCPU = (unsigned char*) malloc(levelsArraySize);
    cudaMalloc((unsigned char **) &fractalLevelsGPU, levelsArraySize);

    cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyHostToDevice);

    if(gpu)
    {
        // Run GPU method
        calulateFractal <<< 1, N >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);

        // Copy data from GPU to CPU array
        cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyDeviceToHost);
    }
    else
    {
        // Iterate every element in array and compute level of fractal
        for(int i=0; i<N; i++)
        {
            calulateFractalCPU(fractalLevelsCPU, i, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
        }
    }

    // Show results
    for(int i=0; i<N; i++)
    {
        if((i % width) == 0)
            printf("\n");

        printf("%d\t", fractalLevelsCPU[i]);
    }
    //saveFractalToBitmap(colorsArray, fractalLevelsCPU, width, height, "frac.bmp");

    // Free memory
    for(int i=0; i<=maxLevel; i++)
    {
        free(colorsArray[i]);
    }
    free(colorsArray);

    free(fractalLevelsCPU);
    cudaFree(fractalLevelsGPU);

    return 0;
}
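
For completeness: assuming the file is saved as, say, fractal.cu, it can be compiled with nvcc fractal.cu -o fractal (on Windows, since windows.h is required for the bitmap structures).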

Best answer

I have found the solution to my problem.

First of all, the number of threads per block should be a power of two. I also realized that my GPU has limits on the number of threads per block and on the number of blocks. The NVIDIA utilities showed me that I can use at most 65536 blocks, with 512 threads per block.

The solution:

int threadsPerBlock = 512;
int blocksNumber = N / threadsPerBlock + (N % threadsPerBlock == 0 ? 0 : 1);

if(blocksNumber > 65536)
    return -1;

calulateFractal <<< blocksNumber, threadsPerBlock >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
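
Rather than hard-coding 512 and 65536, a minimal sketch (assuming device 0 is the GPU in use) can query the real limits at runtime with cudaGetDeviceProperties, and make a failed launch visible with cudaGetLastError:

// Query the device's actual launch limits instead of hard-coding them
// (a sketch, assuming device 0 is the GPU being used)
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

int threadsPerBlock = prop.maxThreadsPerBlock; // e.g. 512 on older GPUs, 1024 on most newer ones
int blocksNumber = (N + threadsPerBlock - 1) / threadsPerBlock;

if(blocksNumber > prop.maxGridSize[0]) // limit on blocks in the grid's x dimension
    return -1;

calulateFractal <<< blocksNumber, threadsPerBlock >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);

cudaError_t err = cudaGetLastError(); // a launch with an invalid configuration fails silently without this check
if(err != cudaSuccess)
    printf("Kernel launch failed: %s\n", cudaGetErrorString(err));

If a launch configuration exceeds the device limits, the kernel never runs, so without the error check the output buffer silently keeps whatever it held before.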

Regarding "c - GPU (CUDA) and CPU computation results differ", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/15249453/
