gpt4 book ai didi

c - C 中的 OpenCL : can't free gpu memory

转载 作者:太空宇宙 更新时间:2023-11-04 03:14:05 24 4
gpt4 key购买 nike

我正在编写一个需要能够快速乘以矩阵的算法。我尝试使用线程,然后考虑使用 GPU。

我最初想使用 CUDA,但无法让它工作,所以我使用了 OpenCL。

我使用了在 Internet 上找到的代码并对其进行了更改以使其适用于我的程序。

但是,GPU 内存不断增加,直到没有剩余内存为止。代码似乎正确地释放了内存。

你知道哪里出了问题吗?

这是我用来加载 OpenCL 和矩阵相乘的代码:

/***************
Copyright (c) 2015, MedicineYeh
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************/

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <CL/cl.h>
#include "matrix.h"

#define checkErr(fun, statement) err = fun;\
if (err != CL_SUCCESS) {statement}
#define checkExit(value, message) if (value == 0) {printf(message); goto release;}

//define function

double get_event_exec_time(cl_event event)
{
cl_ulong start_time, end_time;
/*Get start device counter for the event*/
clGetEventProfilingInfo (event,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
&start_time,
NULL);
/*Get end device counter for the event*/
clGetEventProfilingInfo (event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),
&end_time,
NULL);
/*Convert the counter values to milli seconds*/
double total_time = (end_time - start_time) * 1e-6;
return total_time;
}

cl_program load_program(cl_context context, cl_device_id device, const char* filename)
{
FILE *fp = fopen(filename, "rt");
size_t length;
char *data;
char *build_log;
size_t ret_val_size;
cl_program program = 0;
cl_int status = 0;
if(!fp) return 0;

// get file length
fseek(fp, 0, SEEK_END);
length = ftell(fp);
fseek(fp, 0, SEEK_SET);

// read program source
data = (char *)malloc(length + 1);
fread(data, sizeof(char), length, fp);
data[length] = '\0';

// create and build program
program = clCreateProgramWithSource(context, 1, (const char **)&data, 0, 0);
if (program == 0) return 0;

status = clBuildProgram(program, 0, 0, 0, 0, 0);
if (status != CL_SUCCESS) {
printf("Error: Building Program from file %s\n", filename);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
build_log = (char *)malloc(ret_val_size + 1);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
printf("Building Log:\n%s", build_log);
return 0;
}

return program;
}

void gpu_mul(matrix m1, matrix m2, matrix r)
{
cl_int err = 0;
cl_uint num = 0;
cl_platform_id *platforms = NULL;
cl_context_properties prop[3] = {0};
cl_context context = 0;
cl_device_id *devices = NULL;
cl_command_queue queue = 0;
cl_program program = 0;
cl_mem cl_a = 0, cl_b = 0, cl_res = 0;
cl_kernel adder = 0;
cl_event event;
unsigned int num_total_devices = 0;
char devname[16][256] = {{0}};
size_t cb, work_size;
unsigned int i;

int m = m1.rows;
int n = m1.columns;
int p = m2.columns;
double *a = m1.value;
double *b = m2.value;
double *res = r.value;

checkErr(clGetPlatformIDs(0, 0, &num),
printf("Unable to get platforms\n");
return;
);

platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num);
checkErr(clGetPlatformIDs(num, platforms, NULL),
printf("Unable to get platform ID\n");
return;
);

checkErr(clGetPlatformIDs(0, 0, &num),
printf("Unable to get platforms\n");
return;
);

//printf("Found %d platforms:\n", num);
for (i = 0; i < num; i++) {
char str[1024];
clGetPlatformInfo (platforms[i], CL_PLATFORM_NAME, 1024, str, NULL);
//printf("\t%d: %s\n", i, str);
}

prop[0] = CL_CONTEXT_PLATFORM;
prop[1] = (cl_context_properties)platforms[0];
prop[2] = 0;
context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, NULL);
checkExit(context, "Can't create OpenCL context\n");

clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
devices = (cl_device_id *)malloc(cb);
clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, 0);
checkExit(cb, "Can't get devices\n");
num_total_devices = cb / sizeof(cl_device_id);

//printf("Found %d devices:\n", num_total_devices);
for (i = 0; i < num_total_devices; i++) {
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, devname[i], 0);
//printf("\t%d: %s", i, devname[i]);
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &cb, 0);
//printf(" - %d\n", (int)cb);
}

//Specify the queue to be profile-able
queue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, 0);
checkExit(queue, "Can't create command queue\n");

program = load_program(context, devices[0], "matrixmul_kernel.cl");
checkExit(program, "Fail to build program\n");

cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * m * n, a, NULL);
cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * n * p, b, NULL);
cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double) * m * p , res, NULL);
if (cl_a == 0 ||
cl_b == 0 ||
cl_res == 0) {
if(cl_a==0)
printf("Can't create OpenCL buffer (cl_a) \n");
if(cl_b==0)
printf("Can't create OpenCL buffer (cl_b) \n");
if(cl_res==0)
printf("Can't create OpenCL buffer (cl_res) \n");
goto release;
}
// adder = clCreateKernel(program, "adder", &err);
adder = clCreateKernel(program, "test", &err);
if (err == CL_INVALID_KERNEL_NAME) printf("CL_INVALID_KERNEL_NAME\n");
checkExit(adder, "Can't load kernel\n");

clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);
clSetKernelArg(adder, 3, sizeof(cl_int), &m);
clSetKernelArg(adder, 4, sizeof(cl_int), &n);
clSetKernelArg(adder, 5, sizeof(cl_int), &p);
work_size = m * p;

checkErr(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event),
printf("Can't enqueue kernel\n");
);
checkErr(clEnqueueReadBuffer(queue, cl_res, CL_TRUE, 0, sizeof(double) * work_size, res, 0, 0, 0),
printf("Can't enqueue read buffer\n");
);
clWaitForEvents(1, &event);
//printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event));

//Make sure everything is done before we do anything
clFinish(queue);

release:
clReleaseKernel(adder);
clReleaseProgram(program);
clReleaseMemObject(cl_a);
clReleaseMemObject(cl_b);
clReleaseMemObject(cl_res);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}

最佳答案

您每次迭代泄漏了多少字节?这会让你知道你在泄漏什么。如果它很小,我注意到你在 clEnqueueNDRangeKernel 中取了一个 cl_event 对象。 (你用 clWaitForEvents 调用的那个),但你永远不会用 clReleaseEvent 释放它然后。对于其他项目,您可以检查引用计数以查看是否有您不知道的引用。

关于c - C 中的 OpenCL : can't free gpu memory,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/53575846/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com