gpt4 book ai didi

c - Opencl Reduction 不符合预期

转载 作者:太空宇宙 更新时间:2023-11-03 23:49:08 25 4
gpt4 key购买 nike

我是 opencl 的新手。我试过“获取数组中每个元素的所有立方体的总和”。这是我的内核代码:

kernel void cubeSum(global float *input,
local float *prods,
global float *output )
{
int gid = get_global_id( 0 );
int tnum = get_local_id( 0 ); // thread number
int wgNum = get_group_id( 0 ); // work-group number
int numItems = get_local_size( 0 );
prods[ tnum ] = input[ gid ] * input[ gid ] * input[gid]; // cube

for (int offset = 1; offset < numItems; offset *= 2) {
int mask = 2 * offset - 1;
barrier(CLK_LOCAL_MEM_FENCE);
if ( (tnum & mask) == 0 ) {
prods[tnum] += prods[tnum + offset];
}
}
barrier(CLK_LOCAL_MEM_FENCE);

if ( tnum == 0 )
output[wgNum] = prods[0];
}

我不明白为什么我的结果与顺序结果不一样。当数组从0到511时,我的结果是顺序结果-2048;当数组从0到1023时,我的结果是顺序结果加上16384。

在等待您的答复时,我会尝试自己弄清楚。

另一个问题是我发现很难调试内核代码,因为数据集非常大并且同时运行。有什么调试建议吗?

感谢所有的建议=)。

顺便说一句,这是我的主机代码:

#include <stdio.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>

#define NUM_ELEMENTS (512)
#define LOCAL_SIZE (512)
#define MAX_SOURCE_SIZE (0x100000)

int main(int argc, const char * argv[])
{
float data[NUM_ELEMENTS]; //hA
float sum;
float sumTest;

size_t global;
size_t local;
size_t numWorkGroups;
size_t dataSize;
size_t resultsSize;

cl_device_id device;
cl_context context;
cl_command_queue cmdQueue;
cl_program program;
cl_kernel kernel;

cl_mem input;
cl_mem output;

FILE *fp;
//failed to use relative path here. permission problem?
char fileName[] = "/Users/sure/USC/590/cubeSum/cubeSum/cubeSum.cl";
char *source_str;
size_t source_size;

/* カーネルを含むソースコードをロード */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );

//allocate the host memory buffers:
int i = 0;
unsigned int count = NUM_ELEMENTS;
for (i = 0; i < count; i++) {
data[i] = i;
}

//array size in bytes (will need this later):
dataSize = NUM_ELEMENTS * sizeof(float);

//opencl function status
cl_int status;

// Connect to a compute device
//
int gpu = 1;

status = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}

//create an Opencl context
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);

//create a command queue
cmdQueue = clCreateCommandQueue( context, device, 0, &status );

//allocate memory buffers on the device
input = clCreateBuffer( context, CL_MEM_READ_ONLY, dataSize, NULL, &status ); //dA

//TODO: at this line, I don't have the value of local which is calculated by clGetKernelWorkGroupInfo
//need to figure out a way to avoid hardcode it.
output = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(float) * NUM_ELEMENTS / LOCAL_SIZE, NULL, &status ); //dC

// enqueue the 2 commands to write data into the device buffers:
status = clEnqueueWriteBuffer( cmdQueue, input, CL_FALSE, 0, dataSize, data, 0, NULL, NULL );

// create the kernel program on the device:
program = clCreateProgramWithSource(context, 1, (const char **) & source_str, (const size_t *)&source_size, &status);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}


// Build the program executable
//
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (status != CL_SUCCESS)
{
size_t len;
char buffer[2048];

printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}

//create compute kernel
kernel = clCreateKernel( program, "cubeSum", &status );

// Get the maximum work group size for executing the kernel on the device
//
status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (status != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", status);
exit(1);
}

global = count;

numWorkGroups = global / local;
float results[numWorkGroups]; //hC
resultsSize = numWorkGroups * sizeof(float);

//set kernel parameter
status = clSetKernelArg( kernel, 0, sizeof(cl_mem), &input );
status = clSetKernelArg( kernel, 1, sizeof(float), NULL );
status = clSetKernelArg( kernel, 2, sizeof(cl_mem), &output );

// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (status)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}

clFinish(cmdQueue);
status = clEnqueueReadBuffer( cmdQueue, output, CL_TRUE, 0, resultsSize, results, 0, NULL, NULL );

// Validate our results
//
sum = 0;

for (int i=0; i<numWorkGroups; i++) {
sum += results[i];
}

sumTest = 0;
for(i = 0; i < count; i++)
{
sumTest += data[i] * data[i] * data[i];
}

// Print a brief summary detailing the results
//
printf("Computed '%f/%f'!\n", sum, sumTest);

// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmdQueue);
clReleaseContext(context);

return 0;

}

编辑:刚发现另一件事。如果我只是对没有立方体/正方形的所有元素求和,我的代码是正确的。因此,我要弄清楚立方体对我的程序有何影响。

最佳答案

您似乎只分配了 4 个字节的本地内存:

status = clSetKernelArg( kernel, 1, sizeof(float), NULL );

这应该是整个工作组为该参数所需的本地内存总量。对于您的内核,这是 (work-group-size * sizeof(float))

所以,你应该有这样的东西:

status = clSetKernelArg( kernel, 1, local*sizeof(float), NULL );

您看到的差异可能来自 float 的限制,因为您要对一些非常大的数字求和。如果您使用较小的数字(例如 data[i] = i*0.01;)初始化您的输入,您应该得到与您的顺序实现相同的结果(我已经在我自己的系统上验证了这一点)。这就是为什么在删除多维数据集时看不到错误的原因。

关于c - Opencl Reduction 不符合预期,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/25510691/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com