gpt4 book ai didi

c++ - 提高opencl的性能

转载 作者:行者123 更新时间:2023-12-01 14:53:49 28 4
gpt4 key购买 nike

我正在尝试使用opencl实现一些图像处理算法。但是正如我看到的那样,当我使用opencl时,它需要大约0.5毫秒才能完成一个过程,即一帧。除了我仅使用类对象声明一次初始化opencl参数,还是仅调用运行主内核的函数,没有其他方法吗?我尝试通过创建类来进行这种尝试,但是当我找到上下文时,无法单独声明和使用设备,因此每次都需要创建它。

#include <CL/cl.hpp>
#include <chrono>
#include <iostream>
using namespace std::chrono;
using namespace std;
namespace Color {
enum Code {
FG_RED = 31,
FG_GREEN = 32,
FG_BLUE = 34,
FG_DEFAULT = 39,
BG_RED = 41,
BG_GREEN = 42,
BG_BLUE = 44,
BG_DEFAULT = 49
};
class Modifier {
Code code;

public:
Modifier(Code pCode) : code(pCode) {}
friend std::ostream& operator<<(std::ostream& os, const Modifier& mod) {
return os << "\033[" << mod.code << "m";
}
};
} // namespace Color
class useOpenCL {
public:
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
cl::Platform default_platform;
cl::Device default_device;
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;
cl::Buffer buffer_A;

useOpenCL();
~useOpenCL() {}
void backgroundSub();
};

useOpenCL::useOpenCL() {
Color::Modifier green(Color::FG_GREEN);
Color::Modifier red(Color::FG_RED);
Color::Modifier def(Color::FG_DEFAULT);
// get all platforms (drivers)
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << red << " No platforms found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_platform = all_platforms[0];
std::cout << green << "Using platform: " << def
<< default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;

// get default device of the default platform
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << red << " No devices found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_device = all_devices[0];
std::cout << green << "Using device: " << def
<< default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

// kernel calculates for each element C=A+B
kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" "
" } "
" ";
sources.push_back({kernel_code.c_str(), kernel_code.length()});
}

void useOpenCL::backgroundSub() {
int A[size], B[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}
auto start1 = high_resolution_clock::now();

cl::Context context({default_device});

cl::Program program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}
// create buffers on the device
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * size);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * size);

// create queue to which we will push commands for the device.
cl::CommandQueue queue(context, default_device);

// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);

// run the kernel
/*cl::KernelFunctor
simple_add(cl::Kernel(program,"simple_add"),queue,cl::NullRange,cl::NDRange(10),cl::NullRange);
simple_add(buffer_A,buffer_B,buffer_C);*/

// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);
queue.finish();

int C[size];
// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);
/*std::cout<<" result: \n";
for(int i=0;i<size;i++){
std::cout<<C[i]<<"\t";
}*/
auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}

int main() {
useOpenCL img;
while (true) {
img.backgroundSub();
}
return 0;
}

它给了我下面的结果:
Segmentation FPS=13.2557    Execution Time(sec)=0.075439
Segmentation FPS=15.7602 Execution Time(sec)=0.063451
Segmentation FPS=14.3872 Execution Time(sec)=0.069506
Segmentation FPS=12.7525 Execution Time(sec)=0.078416

这不好,因为fps仅为12、13 fps。那么我怎样才能使该程序更快呢?

最佳答案

将只需要调用一次的初始化部分放在构造函数的开头。此初始化应包含所有内存分配,OpenCL C代码编译以及从主机到设备的任何初始内存传输:

useOpenCL::useOpenCL() {
Color::Modifier green(Color::FG_GREEN);
Color::Modifier red(Color::FG_RED);
Color::Modifier def(Color::FG_DEFAULT);
// get all platforms (drivers)
cl::Platform::get(&all_platforms);
if (all_platforms.size() == 0) {
std::cout << red << " No platforms found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_platform = all_platforms[0];
std::cout << green << "Using platform: " << def
<< default_platform.getInfo<CL_PLATFORM_NAME>() << std::endl;

// get default device of the default platform
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if (all_devices.size() == 0) {
std::cout << red << " No devices found. Check OpenCL installation!" << def
<< endl;
exit(1);
}
default_device = all_devices[0];
std::cout << green << "Using device: " << def
<< default_device.getInfo<CL_DEVICE_NAME>() << std::endl;

// kernel calculates for each element C=A+B
kernel_code =
" void kernel simple_add(global const int* A, global const int* B, "
"global int* C){ "
" C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; "
" "
" } "
" ";
sources.push_back({kernel_code.c_str(), kernel_code.length()});

context = cl::Context({default_device});

program = cl::Program(context, sources);
if (program.build({default_device}) != CL_SUCCESS) {
std::cout << " Error building: "
<< program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(default_device)
<< "\n";
exit(1);
}

// create queue to which we will push commands for the device.
queue = cl::CommandQueue(context, default_device);

// create buffers on host
int A[size], B[size];
int C[size];
for (int i = 0; i < size; i++) {
A[i] = i;
B[i] = i + 1;
}

// create buffers on the device
buffer_A = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
buffer_B = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);
buffer_C = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(int) * size);

// write arrays A and B to the device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);

// alternative way to run the kernel
kernel_add.setArg(0, buffer_A);
kernel_add.setArg(1, buffer_B);
kernel_add.setArg(2, buffer_C);
}

因此,请将 contextprogramqueuebuffer_Abuffer_Bbuffer_C成员变量设为类 useOpenCL。尤其是内存分配和编译需要很长时间,因此只需执行一次并重用缓冲区。

class useOpenCL {
public:
int size = 294400;
std::vector<cl::Platform> all_platforms;
std::vector<cl::Device> all_devices;
cl::Platform default_platform;
cl::Device default_device;
cl::Program::Sources sources;
std::string kernel_code;
cl::Kernel kernel_add;

cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_C;

cl::Context context;
cl::Program program;
cl::CommandQueue queue;

useOpenCL();
~useOpenCL() {}
void backgroundSub();
};

然后,对于每个帧计算,仅保留内核调用和最终的内存传输主机<->设备:

void useOpenCL::backgroundSub() {
auto start1 = high_resolution_clock::now();

// write arrays A and B to the device (ONLY IF NECESSARY FOR EVERY FRAME)
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int) * size, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(int) * size, B);

// run the kernel
queue.enqueueNDRangeKernel(kernel_add, cl::NullRange, cl::NDRange(size),
cl::NullRange);

// read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(int) * size, C);

queue.finish();

auto stop1 = high_resolution_clock::now();
auto duration1 = duration_cast<microseconds>(stop1 - start1);
auto FPS = 1000000.0 / duration1.count();
cout << "Segmentation FPS=" << FPS << "\t"
<< "Execution Time(sec)=" << duration1.count() / 1000000.0 << endl;
}


后者的代码可以一遍又一遍地调用,并且比一遍又一遍地重新初始化所有内容要快得多。还要确保 size足够大,否则GPU可能无法充分发挥其潜力,并且主机设备存储传输的等待时间将使每一帧的速度变得异常缓慢。

关于c++ - 提高opencl的性能,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59651484/

28 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com