gpt4 book ai didi

c++ - 是什么导致我的矢量化内核以如此荒谬的幅度减速?

转载 作者:搜寻专家 更新时间:2023-10-31 01:32:49 24 4
gpt4 key购买 nike

我编写了一个程序,旨在测试在操作系统中注册的每个计算设备的性能。显卡是 AMD Radeon HD 5450。特别是在我的电脑上,这些是找到的四个设备:

  • 平台:AMD 加速并行处理
    • 名称:Cedar(雪松,该显卡 GPU 核心的代号)
    • 类型:GPU
  • 平台:AMD 加速并行处理
    • 名称:Cedar(雪松)
    • 类型:GPU
    • (显然,显卡本身列出了两次?我不知道,我没有构建这个东西...)
  • 平台:AMD 加速并行处理
    • 名称:Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
    • 类型:CPU
  • 平台:实验性 OpenCL 2.0 CPU 专用平台
    • 名称:Intel(R) Core(TM) i7-2600 CPU @ 3.40GHz
    • 类型:CPU

在设备 0、1 和 3 上,当执行下面列出的内核时,它们在整体速度方面都有很大差异,但名义上都在直接比较方面的预期差异幅度内完成。我已将英特尔 CPU 在英特尔平台上的结果放在 Pastebin 上.

但是对于设备 2,不仅执行速度比其他设备慢得多(我可能会补充说,仅对于整数类型!)它的速度慢了荒谬的数量级,尽管事实上它应该使用相同的设备(英特尔 CPU),因为英特尔平台正在使用,没有此类问题。 See this pastebin .

显著的异常耗时(也就是大幅减速)只出现在我代码的矢量化版本中,而且仅当内核不是一元(单操作数)时才会发生。换言之,AMD 平台在驱动这颗 Intel CPU 时的表现极不正常。

有人知道发生了什么事吗?我在下面包含了我的完整代码,以防它与潜在问题有关。

执行器.hpp

#pragma once
#define CL_HPP_ENABLE_EXCEPTIONS
#pragma warning(disable : 4996)
#include<chrono>
#include<string>
#include<vector>
#include<CL\cl2.hpp>

// Owns the OpenCL buffers shared by every kernel of one element type:
// three read-only inputs (a, b, c) and one write-only output buffer.
// Each buffer holds size * 16 elements so the widest (vector16) kernels
// can process `size` work items of 16 elements each.
class buffers {
    cl::Buffer a, b, c;   // read-only input buffers
    cl::Buffer output;    // write-only result buffer
    size_t size;          // global work size (work items per launch), NOT element count

    // Private: construct through make_buffer<T>(). The T argument exists
    // only to carry the element type into the template.
    template<typename T>
    buffers(cl::Context const& context, size_t size, T t) :
        size(size) {
        std::vector<T> values;
        values.resize(size * 16);
        // BUG FIX: initialize ALL size * 16 elements. The original loop only
        // filled the first `size` entries, leaving the remaining 15/16 of
        // each buffer value-initialized to zero; since the vector kernels
        // read the whole buffer, the `a / b`-style transforms divided by
        // zero for almost every element (undefined for the integer types).
        for (size_t i = 0; i < size * 16; i++)
            values[i] = T(i);
        a = cl::Buffer( context, values.begin(), values.end(), true );
        // b holds 3 * i (modulo wrap-around for the narrow integer types).
        for (auto & val : values)
            val *= 3;
        b = cl::Buffer( context, values.begin(), values.end(), true );
        // c holds (3 * i) / 10 — integer division for the integer types.
        for (auto & val : values)
            val /= 10;
        c = cl::Buffer( context, values.begin(), values.end(), true );
        output = cl::Buffer( context, CL_MEM_WRITE_ONLY, size * 16 * sizeof(T) );
    }

public:
    // Factory: builds the full buffer set for element type T.
    template<typename T>
    static buffers make_buffer(cl::Context const& context, size_t size) {
        return buffers(context, size, T(0));
    }

    cl::Buffer get_a() const {
        return a;
    }

    cl::Buffer get_b() const {
        return b;
    }

    cl::Buffer get_c() const {
        return c;
    }

    cl::Buffer get_output() const {
        return output;
    }

    // Work-item count per launch; element count is size * 16.
    size_t get_size() const {
        return size;
    }
};
// Binds one compiled kernel to a buffer set and provides launch and
// profiling helpers.
class task {
    cl::CommandQueue queue;   // queue the kernel is launched on
    cl::Kernel kernel;        // kernel with its buffer arguments bound
    buffers * b;              // non-owning pointer to the shared buffer set
    std::string type_name;    // human-readable element type, e.g. "int"
public:
    // Binds the buffer set to the kernel's arguments by argument NAME
    // ("a", "b", "c", "output"); arguments with any other name are left
    // untouched. Requires the program to retain argument-name info.
    task(cl::CommandQueue queue, cl::Kernel kernel, buffers * b, std::string const& type_name) :
        queue(queue),
        kernel(kernel),
        b(b),
        type_name(type_name) {
        int const arg_count = kernel.getInfo<CL_KERNEL_NUM_ARGS>();
        for (int arg = 0; arg < arg_count; ++arg) {
            std::string const arg_name = kernel.getArgInfo<CL_KERNEL_ARG_NAME>(arg);
            if (arg_name == "a") {
                kernel.setArg(arg, b->get_a());
            } else if (arg_name == "b") {
                kernel.setArg(arg, b->get_b());
            } else if (arg_name == "c") {
                kernel.setArg(arg, b->get_c());
            } else if (arg_name == "output") {
                kernel.setArg(arg, b->get_output());
            }
        }
    }

    // Launches the kernel over a 1-D range of get_size() work items and
    // returns the completion event for waiting / profiling.
    cl::Event enqueue() {
        cl::Event completion;
        queue.enqueueNDRangeKernel(kernel, {}, cl::NDRange{ b->get_size() }, {}, nullptr, &completion);
        return completion;
    }

    cl::Kernel get_kernel() const {
        return kernel;
    }

    std::string get_type_name() const {
        return type_name;
    }

    // Blocks until the event completes, then returns the device-side
    // execution time (COMMAND_END - COMMAND_START) in nanoseconds.
    // Requires the queue to have been created with CL_QUEUE_PROFILING_ENABLE.
    static std::chrono::nanoseconds time_event(cl::Event event) {
        event.wait();
        auto const started = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        auto const finished = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        return std::chrono::nanoseconds{ finished - started };
    }
};

任务生成器.hpp

#pragma once
#include "Executor.hpp"
#include<iostream>
#include<fstream>

// Builds OpenCL program source strings and compiles them into kernels.
// Source-assembly trick: each #include below pulls in a file whose entire
// content is a C++ raw string literal (R"D(...)D"); adjacent string
// literals concatenate at compile time, so `"" #include ... #include ... ;`
// yields one OpenCL source string.
class kernel_generator {
public:
// Arithmetic-only transform source; valid for every element type,
// including the integer ones.
static std::string get_primary_kernels() {
return ""
#include "Primary Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
;
}
// sin/cos/tan transform source; only meaningful for float/double.
static std::string get_trigonometric_kernels() {
return ""
#include "Trigonometry Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
;
}
// log/hypot/tgamma transform source; only meaningful for float/double.
static std::string get_utility_kernels() {
return ""
#include "Utility Transform.cl"
#include "Transformations.cl"
#include "Transform Kernels.cl"
;
}

private:
// Compiles `src` for the given context/device and returns every kernel in
// the resulting program. On a build failure the build log is written to
// err_log and an empty vector is returned, so callers simply end up with
// no kernels to run for that source.
static std::vector<cl::Kernel> get_kernels(std::string src, cl::Context context, cl::Device device, std::ostream & err_log) {
try {
cl::Program program{ context, src, false };
program.build();
std::vector<cl::Kernel> kernels;
program.createKernels(&kernels);
return kernels;
}
catch (cl::BuildError const& e) {
std::cerr << "Unable to build kernels for " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
err_log << "Build Log:\n";
auto log = e.getBuildLog();
// One log entry per device the program was built for.
for (auto const& log_p : log) {
err_log << log_p.second << "\n";
}
return{};
}
}
public:
// The integer element types only get the primary (arithmetic) kernels,
// since the trigonometric/utility transforms use floating-point builtins.
static std::vector<cl::Kernel> get_char_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "char_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
static std::vector<cl::Kernel> get_short_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "short_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
static std::vector<cl::Kernel> get_int_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "int_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
static std::vector<cl::Kernel> get_long_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "long_defines.cl"
+ get_primary_kernels();
return get_kernels(src, context, device, err_log);
}
// Floating-point types additionally get the utility and trigonometric
// kernel sets; each set is compiled as a separate program (the three
// Transform .cl files redefine the same macros, so they cannot share one
// translation unit) and the kernels are merged into one list.
static std::vector<cl::Kernel> get_float_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "float_defines.cl"
+ get_primary_kernels();
std::vector<cl::Kernel> primary_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "float_defines.cl"
+ get_utility_kernels();
std::vector<cl::Kernel> utility_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "float_defines.cl"
+ get_trigonometric_kernels();
std::vector<cl::Kernel> trig_kernels = get_kernels(src, context, device, err_log);
std::vector<cl::Kernel> final_kernels;
final_kernels.insert(final_kernels.end(), primary_kernels.begin(), primary_kernels.end());
final_kernels.insert(final_kernels.end(), utility_kernels.begin(), utility_kernels.end());
final_kernels.insert(final_kernels.end(), trig_kernels.begin(), trig_kernels.end());
return final_kernels;
}
static std::vector<cl::Kernel> get_double_kernels(cl::Context context, cl::Device device, std::ostream & err_log) {
std::string src = ""
#include "double_defines.cl"
+ get_primary_kernels();
std::vector<cl::Kernel> primary_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "double_defines.cl"
+ get_utility_kernels();
std::vector<cl::Kernel> utility_kernels = get_kernels(src, context, device, err_log);
src = ""
#include "double_defines.cl"
+ get_trigonometric_kernels();
std::vector<cl::Kernel> trig_kernels = get_kernels(src, context, device, err_log);
std::vector<cl::Kernel> final_kernels;
final_kernels.insert(final_kernels.end(), primary_kernels.begin(), primary_kernels.end());
final_kernels.insert(final_kernels.end(), utility_kernels.begin(), utility_kernels.end());
final_kernels.insert(final_kernels.end(), trig_kernels.begin(), trig_kernels.end());
return final_kernels;
}
};

内核测试.cpp(主)

#define CL_HPP_ENABLE_EXCEPTIONS
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#pragma warning(disable : 4996)
#include<CL\cl2.hpp>
#include<iostream>
#include<iomanip>
#include<chrono>
#include<fstream>
#include<sstream>
#include<filesystem>

#include "TaskGenerator.hpp"

namespace filesystem = std::experimental::filesystem;

// Writes a short, aligned, human-readable summary of one OpenCL device
// (platform name, device name, global memory size, and device type).
void print_device_info(std::ostream & out, cl::Platform platform, cl::Device device) {
    // Emits a left-aligned, 20-column label and returns the stream so the
    // caller can append the value.
    auto const label = [&out](char const* text) -> std::ostream& {
        return out << std::setw(20) << std::left << text;
    };
    label("Platform: ") << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
    label("Name: ") << device.getInfo<CL_DEVICE_NAME>() << "\n";
    label("Memory Size: ") << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << "\n";
    bool const is_gpu = device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU;
    label("Device Type: ") << (is_gpu ? "GPU" : "CPU") << "\n";
}

// Compiles and runs every kernel for each element type on one device, and
// writes the device description, any build logs, and per-kernel profiling
// results to output_file_path.
void test_device(cl::Platform platform, cl::Device device, filesystem::path output_file_path) {
    std::ofstream out(output_file_path);
    print_device_info(out, platform, device);
    cl::Context context(device);
    // Profiling must be enabled on the queue or the
    // CL_PROFILING_COMMAND_START/END queries in task::time_event will fail.
    cl::CommandQueue queue{ context, CL_QUEUE_PROFILING_ENABLE };
    size_t size = 100'000;
    buffers char_buffers = buffers::make_buffer<cl_char>(context, size);
    buffers short_buffers = buffers::make_buffer<cl_short>(context, size);
    buffers int_buffers = buffers::make_buffer<cl_int>(context, size);
    buffers long_buffers = buffers::make_buffer<cl_long>(context, size);
    buffers float_buffers = buffers::make_buffer<cl_float>(context, size);
    buffers double_buffers = buffers::make_buffer<cl_double>(context, size);
    auto char_kernels = kernel_generator::get_char_kernels(context, device, out);
    auto short_kernels = kernel_generator::get_short_kernels(context, device, out);
    auto int_kernels = kernel_generator::get_int_kernels(context, device, out);
    auto long_kernels = kernel_generator::get_long_kernels(context, device, out);
    auto float_kernels = kernel_generator::get_float_kernels(context, device, out);
    auto double_kernels = kernel_generator::get_double_kernels(context, device, out);
    // Pair every kernel with the buffer set of its element type.
    std::vector<task> tasks;
    for (auto & kernel : char_kernels) {
        tasks.emplace_back(queue, kernel, &char_buffers, "char");
    }
    for (auto & kernel : short_kernels) {
        tasks.emplace_back(queue, kernel, &short_buffers, "short");
    }
    for (auto & kernel : int_kernels) {
        tasks.emplace_back(queue, kernel, &int_buffers, "int");
    }
    for (auto & kernel : long_kernels) {
        tasks.emplace_back(queue, kernel, &long_buffers, "long");
    }
    for (auto & kernel : float_kernels) {
        tasks.emplace_back(queue, kernel, &float_buffers, "float");
    }
    for (auto & kernel : double_kernels) {
        tasks.emplace_back(queue, kernel, &double_buffers, "double");
    }

    std::vector<cl::Event> events;
    for (auto & task : tasks) {
        std::cout << "Enqueueing " << task.get_kernel().getInfo<CL_KERNEL_FUNCTION_NAME>() << "(" << task.get_type_name() << ")" << "\n";
        // BUG FIX: the original called task.enqueue() a second time here and
        // discarded the event, so every kernel ran twice; the unmeasured
        // duplicate launches competed with the profiled ones and skewed the
        // reported timings.
        events.emplace_back(task.enqueue());
    }

    out << "==========================================\n\nProfiling Results:\n\n";

    for (size_t i = 0; i < events.size(); i++) {
        // time_event() waits on the event itself, so no separate wait() here.
        auto duration = task::time_event(events[i]);
        std::cout << "Task " << (i + 1) << " of " << events.size() << " complete.\r";
        std::string task_name = tasks[i].get_kernel().getInfo<CL_KERNEL_FUNCTION_NAME>();
        task_name.append("(" + tasks[i].get_type_name() + ")");
        out << " " << std::setw(40) << std::right << task_name;
        out << ": " << std::setw(12) << std::right << duration.count() << "ns\n";
    }
}

// Enumerates every device on every OpenCL platform and benchmarks each
// one, writing the results to "Device N.txt" (N counted across platforms).
int main() {
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    int device_index = 0;
    for (auto & platform : platforms) {
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        for (auto & device : devices) {
            std::string const file_name = "Device " + std::to_string(device_index++) + ".txt";
            test_device(platform, device, file_name);
        }
    }

    // Keep the console window open (Windows-specific).
    system("pause");
    return 0;
}

char_defines.cl, short_defines.cl, int_defines.cl, long_defines.cl, float_defines.cl, double_defines.cl

// char_defines.cl — maps the generic Scalar/VectorN aliases (which the
// shared transform sources are written against) to one concrete OpenCL
// element type; this file uses char/charN. The file is #include'd inside a
// C++ function, so the raw string below becomes part of the OpenCL program
// source handed to the driver.
R"D(
typedef char Scalar;
typedef char2 Vector2;
typedef char4 Vector4;
typedef char8 Vector8;
typedef char16 Vector16;
)D"
//All the other defines files are identical, but with their respective types swapped in.
//Only exception is double_defines.cl, which also has '#pragma OPENCL EXTENSION cl_khr_fp64 : enable' added.

Primary Transform.cl

// Primary Transform.cl — arithmetic-only transform bodies (multiply, add,
// divide), usable with every element type including the integer ones.
// NOTE(review): the macro parameters are not parenthesized; this is safe
// here only because Transformations.cl always passes plain identifiers.
R"D(
#define UNARY_TRANSFORM_PROCESS(a) a * a + a / a
#define BINARY_TRANSFORM_PROCESS(a, b) a * b + a / b
#define TERNARY_TRANSFORM_PROCESS(a, b, c) a * b + c / a
)D"

三角变换.cl

// Trigonometry Transform.cl — sin/cos/tan transform bodies; these OpenCL
// builtins only exist for floating-point types, so this source is only
// compiled for the float and double kernel sets.
R"D(
#define UNARY_TRANSFORM_PROCESS(a) sin(a) + cos(a) + tan(a)
#define BINARY_TRANSFORM_PROCESS(a, b) sin(a) + cos(b) + tan(a)
#define TERNARY_TRANSFORM_PROCESS(a, b, c) sin(a) + cos(b) + tan(c)
)D"

实用程序 Transform.cl

// Utility Transform.cl — log/hypot/tgamma transform bodies; floating-point
// builtins, so this source is only compiled for the float and double
// kernel sets.
R"D(
#define UNARY_TRANSFORM_PROCESS(a) log(a) + hypot(a, a) + tgamma(a)
#define BINARY_TRANSFORM_PROCESS(a, b) log(a) + hypot(b, a) + tgamma(b)
#define TERNARY_TRANSFORM_PROCESS(a, b, c) log(a) + hypot(b, c) + tgamma(a)
)D"

Transformations.cl

// Transformations.cl — thin wrappers that instantiate whichever
// *_TRANSFORM_PROCESS macros were #define'd by the Transform source
// concatenated before this file, once per arity (unary/binary/ternary)
// and per vector width (scalar, 2, 4, 8, 16).
R"D(
Scalar Unary_Transform1(Scalar a) {
return UNARY_TRANSFORM_PROCESS(a);
}

Vector2 Unary_Transform2(Vector2 a) {
return UNARY_TRANSFORM_PROCESS(a);
}

Vector4 Unary_Transform4(Vector4 a) {
return UNARY_TRANSFORM_PROCESS(a);
}

Vector8 Unary_Transform8(Vector8 a) {
return UNARY_TRANSFORM_PROCESS(a);
}

Vector16 Unary_Transform16(Vector16 a) {
return UNARY_TRANSFORM_PROCESS(a);
}

Scalar Binary_Transform1(Scalar a, Scalar b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}

Vector2 Binary_Transform2(Vector2 a, Vector2 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}

Vector4 Binary_Transform4(Vector4 a, Vector4 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}

Vector8 Binary_Transform8(Vector8 a, Vector8 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}

Vector16 Binary_Transform16(Vector16 a, Vector16 b) {
return BINARY_TRANSFORM_PROCESS(a, b);
}

Scalar Ternary_Transform1(Scalar a, Scalar b, Scalar c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}

Vector2 Ternary_Transform2(Vector2 a, Vector2 b, Vector2 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}

Vector4 Ternary_Transform4(Vector4 a, Vector4 b, Vector4 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}

Vector8 Ternary_Transform8(Vector8 a, Vector8 b, Vector8 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}

Vector16 Ternary_Transform16(Vector16 a, Vector16 b, Vector16 c) {
return TERNARY_TRANSFORM_PROCESS(a, b, c);
}
)D"

转换 Kernels.cl

// Transform Kernels.cl — the kernel entry points. Each kernel maps one
// work item to one element (of its vector width) and writes the transform
// of its input(s) into output. Buffers must therefore hold at least
// global_size elements of the kernel's vector type — which is why the
// host allocates size * 16 scalars per buffer.
R"D(
kernel void unary_transform_scalar(global Scalar * a, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform1(a[id]);
}

kernel void binary_transform_scalar(global Scalar * a, global Scalar * b, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Binary_Transform1(a[id], b[id]);
}

kernel void ternary_transform_scalar(global Scalar * a, global Scalar * b, global Scalar * c, global Scalar * output) {
size_t id = get_global_id(0);
output[id] = Ternary_Transform1(a[id], b[id], c[id]);
}

kernel void unary_transform_vector2(global Vector2 * a, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform2(a[id]);
}

kernel void binary_transform_vector2(global Vector2 * a, global Vector2 * b, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Binary_Transform2(a[id], b[id]);
}

kernel void ternary_transform_vector2(global Vector2 * a, global Vector2 * b, global Vector2 * c, global Vector2 * output) {
size_t id = get_global_id(0);
output[id] = Ternary_Transform2(a[id], b[id], c[id]);
}

kernel void unary_transform_vector4(global Vector4 * a, global Vector4 * output) {
size_t id = get_global_id(0);
output[id] = Unary_Transform4(a[id]);
}
/* For the sake of brevity, I've cut the rest. It should be pretty clear what the
rest look like.*/

)D"

最佳答案

据我所知,AMD Radeon HD 5450 不是双 GPU,并且可能被列出两次,因为安装了 2 个不同版本的 AMD OpenCL 平台。我记得有这样一个案例,当时有 OpenCL 1.2 和实验性 OpenCL 2.0。检查平台版本。

谈到 CPU,我认为 Experimental OpenCL 2.0 CPU Only Platform 是针对 Intel CPU 做过充分优化的 Intel 官方实现;而 AMD OpenCL SDK 并没有针对在 Intel CPU 上运行做过优化,性能很差——我过去也遇到过类似的问题。

总而言之 - 您不必在所有设备上使用所有可用的 OpenCL 平台。通常适用于 GPU 的最新 OpenCL 平台版本可提供不错的性能,并且始终在 Intel CPU 上使用 Intel OpenCL。

关于c++ - 是什么导致我的矢量化内核以如此荒谬的幅度减速?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/42513485/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com