gpt4 book ai didi

c++ - OpenCL 内核重新编译的问题会减慢程序速度,并因此可能出现内存问题

转载 作者:行者123 更新时间:2023-11-28 08:25:07 50 4
gpt4 key购买 nike

我是 OpenCL 的新手,我运行的是 OS X 10.6 和 Nvidia 330 显卡。我正在用 C++ 进行布料模拟,我已经设法编写了一个内核来编译和运行。问题是它的运行速度比在没有 OpenCL 的 cpu 上慢。我相信这是因为每次我调用 update() 方法进行一些计算时,我都会设置上下文和设备,然后从源代码重新编译内核。

为了解决这个问题,我尝试将我需要的各种 OpenCL 类型封装到布料模拟类中以尝试将它们存储在那里,然后创建一个 initCL() 来设置这些值。然后我创建了一个 runCL() 来执行内核。奇怪的是,当我将 OpenCL 的东西分成两种方法时,这只会给我带来内存问题。如果 initCL() 和 runCL() 都组合成一个方法,它工作正常,但这就是我有点卡住的原因。

程序可以编译并运行,但随后我会在 runCL() 代码中标记的位置收到 SIGABRT 或 EXC_BAD_ACCESS。收到 SIGABRT 时,错误码是 CL_INVALID_COMMAND_QUEUE,但我怎么也想不明白为什么只有在把这两个方法拆开时才会出现这种情况。断言失败时收到 SIGABRT 是意料之中的;但另一些时候,我只是在尝试写入缓冲区时直接收到了非法内存访问错误。

此外,如果有人能告诉我更好的方法/正确的做法,或者如果 JIT 重新编译不是减慢我的代码速度的原因,那么我将非常感激,因为我已经盯着这个看太久了!

谢谢,

乔恩

OpenCL变量的初始化代码:

/// One-time OpenCL setup for the cloth simulation.
///
/// Picks a compute device (GPU preferred, CPU as fallback), then creates the
/// context, the command queue, the program built from "clothSimKernel.cl" and
/// the "clothSimulation" kernel. The resulting handles are stored in the
/// members `device`, `context`, `cmd_queue`, `program[0]` and `kernel[0]` so
/// that they can be reused by later calls instead of being rebuilt every time.
///
/// @return CL_SUCCESS when everything was created, EXIT_FAILURE when the
///         program object could not be created, otherwise the OpenCL error
///         code of the step that failed (debug builds assert first).
int VPESimulationCloth::initCL(){
    // Find the GPU CL device, this is what we really want.
    // If no GPU device is CL capable, fall back to the CPU.
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        cout<<getErrorDesc(err)<<endl;
        assert(err == CL_SUCCESS);
        return err;
    }
    assert(device);

    // Get some information about the returned device
    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name),
                          vendor_name, &returned_size);
    err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
                           device_name, &returned_size);
    assert(err == CL_SUCCESS);
    //printf("Connecting to %s %s...\n", vendor_name, device_name);

    // Now create a context to perform our calculation with the
    // specified device
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        cout<<getErrorDesc(err)<<endl;
        assert(err == CL_SUCCESS);
        return err;
    }

    // And also a command queue for the context. The error code is requested
    // here so that a failed queue creation is caught at the source rather
    // than surfacing later as CL_INVALID_COMMAND_QUEUE.
    cmd_queue = clCreateCommandQueue(context, device, 0, &err);
    if (err != CL_SUCCESS) {
        cout<<getErrorDesc(err)<<endl;
        assert(err == CL_SUCCESS);
        return err;
    }

    // Load the program source from disk
    // The kernel/program should be in the resource directory
    const char * filename = "clothSimKernel.cl";
    // NOTE(review): ownership of the returned buffer is not visible here; if
    // load_program_source() allocates it, it should be released once the
    // program has been created - confirm against its definition.
    char *program_source = load_program_source(filename);

    program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
                                           NULL, &err);
    if (!program[0])
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    assert(err == CL_SUCCESS);

    err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        // Zero-initialised so that a failed log query prints an empty log
        // instead of uninitialised memory.
        char build[2048] = {0};
        clGetProgramBuildInfo(program[0], device, CL_PROGRAM_BUILD_LOG, sizeof(build), build, NULL);
        build[sizeof(build) - 1] = '\0';
        printf("Build Log:\n%s\n",build);
        if (err == CL_BUILD_PROGRAM_FAILURE) {
            printf("CL_BUILD_PROGRAM_FAILURE\n");
        }
        cout<<getErrorDesc(err)<<endl;
        assert(err == CL_SUCCESS);
        return err;
    }
    //writeBinaries();

    // Now create the kernel "objects" that we want to use in the example file
    kernel[0] = clCreateKernel(program[0], "clothSimulation", &err);
    if (err != CL_SUCCESS) {
        cout<<getErrorDesc(err)<<endl;
        assert(err == CL_SUCCESS);
        return err;
    }

    // The original fell off the end of a non-void function here.
    return err;
}

执行内核的方法代码:

int VPESimulationCloth::runCL(){

// Find the GPU CL device, this is what we really want
// If there is no GPU device is CL capable, fall back to CPU
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
assert(device);

// Get some information about the returned device
cl_char vendor_name[1024] = {0};
cl_char device_name[1024] = {0};
err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name),
vendor_name, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name),
device_name, &returned_size);
assert(err == CL_SUCCESS);
//printf("Connecting to %s %s...\n", vendor_name, device_name);

// Now create a context to perform our calculation with the
// specified device

//cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
//memory allocation
cl_mem nowPos_mem, prevPos_mem, rForce_mem, mass_mem, passive_mem, canMove_mem,numPart_mem, theForces_mem, numForces_mem, drag_mem, answerPos_mem;

// Allocate memory on the device to hold our data and store the results into
buffer_size = sizeof(float4) * numParts;

// Input arrays
//------------------------------------
// This is where the error occurs
nowPos_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, nowPos_mem, CL_TRUE, 0, buffer_size,
(void*)nowPos, 0, NULL, NULL);
if (err != CL_SUCCESS) {
cout<<getErrorDesc(err)<<endl;
}
assert(err == CL_SUCCESS);
//------------------------------------
prevPos_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, prevPos_mem, CL_TRUE, 0, buffer_size,
(void*)prevPos, 0, NULL, NULL);
assert(err == CL_SUCCESS);
rForce_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, rForce_mem, CL_TRUE, 0, buffer_size,
(void*)rForce, 0, NULL, NULL);
assert(err == CL_SUCCESS);
mass_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, mass_mem, CL_TRUE, 0, buffer_size,
(void*)mass, 0, NULL, NULL);
assert(err == CL_SUCCESS);
answerPos_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
//uint buffer
buffer_size = sizeof(uint) * numParts;
passive_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, passive_mem, CL_TRUE, 0, buffer_size,
(void*)passive, 0, NULL, NULL);
assert(err == CL_SUCCESS);
canMove_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, canMove_mem, CL_TRUE, 0, buffer_size,
(void*)canMove, 0, NULL, NULL);
assert(err == CL_SUCCESS);

buffer_size = sizeof(float4) * numForces;
theForces_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, theForces_mem, CL_TRUE, 0, buffer_size,
(void*)theForces, 0, NULL, NULL);
assert(err == CL_SUCCESS);

//drag float
buffer_size = sizeof(float);
drag_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, drag_mem, CL_TRUE, 0, buffer_size,
(void*)drag, 0, NULL, NULL);
assert(err == CL_SUCCESS);

// Now setup the arguments to our kernel
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &nowPos_mem);
err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &prevPos_mem);
err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &rForce_mem);
err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), &mass_mem);
err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), &passive_mem);
err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), &canMove_mem);
err |= clSetKernelArg(kernel[0], 6, sizeof(cl_mem), &numParts);
err |= clSetKernelArg(kernel[0], 7, sizeof(cl_mem), &theForces_mem);
err |= clSetKernelArg(kernel[0], 8, sizeof(cl_mem), &numForces);
err |= clSetKernelArg(kernel[0], 9, sizeof(cl_mem), &drag_mem);
err |= clSetKernelArg(kernel[0], 10, sizeof(cl_mem), &answerPos_mem);
if (err != CL_SUCCESS) {
cout<<getErrorDesc(err)<<endl;
}
assert(err == CL_SUCCESS);
// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
size_t global_work_size = numParts;
size_t local_work_size = global_work_size/8;
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL,
&global_work_size, &local_work_size, 0, NULL, NULL);
if (err != CL_SUCCESS) {
cout<<getErrorDesc(err)<<endl;
}

assert(err == CL_SUCCESS);
//clFinish(cmd_queue);

// Once finished read back the results from the answer
// array into the results array
//reset the buffer first
buffer_size = sizeof(float4) * numParts;
err = clEnqueueReadBuffer(cmd_queue, answerPos_mem, CL_TRUE, 0, buffer_size,
answerPos, 0, NULL, NULL);
if (err != CL_SUCCESS) {
cout<<getErrorDesc(err)<<endl;
}


//cl mem
clReleaseMemObject(nowPos_mem);
clReleaseMemObject(prevPos_mem);
clReleaseMemObject(rForce_mem);
clReleaseMemObject(mass_mem);
clReleaseMemObject(passive_mem);
clReleaseMemObject(canMove_mem);
clReleaseMemObject(theForces_mem);
clReleaseMemObject(drag_mem);
clReleaseMemObject(answerPos_mem);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
assert(err == CL_SUCCESS);
return err;

}

最佳答案

问题解决了!在 runCL() 方法的末尾,我“释放”了所有的 cl 对象。我原以为自己只是释放了那些 cl_mem 缓冲区,但仔细检查后发现我还释放了命令队列和上下文,因此下一次调用 runCL() 时它们已经失效了。一如既往,又是一个明显而恼人的错误 :)。

感谢 Khronos 论坛上的 andrew.brownsword 发现了这个。

关于c++ - OpenCL 内核重新编译的问题会减慢程序速度,并因此可能出现内存问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/4323849/

50 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com