gpt4 book ai didi

c++ - OpenCL 中的共享内存

转载 作者:搜寻专家 更新时间:2023-10-31 02:09:53 25 4
gpt4 key购买 nike

我打算实现向量(vector)运算,正在用一个包含向量加法和乘法的小型示例程序做尝试。但由于我对共享内存(OpenCL 中称为 local memory)了解有限,代码无法运行。网上能找到的资料讲的都是二维矩阵运算,我不知道如何把它们套用到我的向量问题上。我是 OpenCL 新手,请帮忙解释我哪里出错了。代码如下:

主机代码:

// Host-side driver: picks the first OpenCL platform, creates a context and a
// command queue, builds the program from kernel_source, uploads the input
// vectors a and b, launches the kernel and reads back sum and prod.
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Context context;
cl::CommandQueue queue;
cl::Program program;
// NOTE(review): kernel is only default-constructed in this snippet; no
// cl::Kernel is created from program before the setArg calls below.
// Presumably something like cl::Kernel(program, "KernelAddMul") is missing - confirm.
cl::Kernel kernel;

cl::Platform::get(&platforms);

// NOTE(review): deviceUsed is assigned without a visible declaration;
// presumably it is declared elsewhere - confirm.
deviceUsed = 0;

// Zero-terminated property list binding the context to platforms[0].
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(),0 };

context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
devices = context.getInfo<CL_CONTEXT_DEVICES>();

queue = cl::CommandQueue(context, devices[deviceUsed]);
cl::Program::Sources source( 1, std::make_pair(kernel_source.c_str(), kernel_source.size()));
program = cl::Program(context, source);
// NOTE(review): the result of build() is not inspected here, so a kernel
// compile failure would go unnoticed at this point.
program.build(devices);

// Host-side input (a, b) and output (sum, prod) vectors.
std::vector < float > a;
std::vector < float > b;
std::vector < float > sum;
std::vector < float > prod;

// Total number of elements and the intended work-group size.
int globalSize = 128;
int localSize = 16;

a.resize(globalSize);
b.resize(globalSize);
sum.resize(globalSize);
prod.resize(globalSize);

// Test data: a[i] = i, b[i] = 5 * i.
for (int i = 0; i < globalSize ; i++)
{
a[i] = 1.0f * i;
b[i] = 5.0f * i;
}
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_sum;
cl::Buffer buffer_prod;

buffer_A = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
buffer_B = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);

// Copy the inputs to the device; CL_TRUE is passed as the blocking flag.
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * globalSize , &a[0]);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * globalSize , &b[0]);

buffer_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
buffer_prod = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);

// Argument order matches the kernel signature: a, b, sum, prod.
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_sum);
kernel.setArg(3, buffer_prod);

// NOTE(review): the third argument (global range) is globalSize/localSize = 8
// and the fourth (local range) is N, which is not declared anywhere in this
// host snippet. Presumably the intent was a global range of globalSize and a
// local range of localSize - confirm.
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize/localSize), cl::NDRange(N), NULL);
queue.finish();
// Read the results back into the host vectors.
queue.enqueueReadBuffer(buffer_sum, CL_TRUE, 0, sizeof(float) * globalSize, &sum[0]);
queue.enqueueReadBuffer(buffer_prod, CL_TRUE, 0, sizeof(float) * globalSize, &prod[0]);

内核:

// STRINGI stringifies its argument, so the OpenCL C kernel written below is
// embedded as text in kernel_source and compiled at run time by the host code.
#define STRINGI(ker) #ker
std::string kernel_source = STRINGI(

// Element-wise add and multiply of a and b, written to sum and prod.
// The kernel stages values through __local arrays before computing.
__kernel void KernelAddMul(__global float* a, __global float* b, __global float* sum, __global float* prod)
{
unsigned int j = get_local_id(0);
int N = get_local_size(0);
// NOTE(review): the index scales get_global_id(0) by the work-group size and
// then adds the local id. Presumably get_group_id(0) was intended here, or
// get_global_id(0) alone - confirm. As written the index may run past the buffers.
unsigned int i = N * get_global_id(0) + j;

// NOTE(review): N is a run-time value. The four arrays below are sized by it,
// which the accompanying explanation identifies as invalid: the size must be
// a constant, so the kernel does not compile.
float locSum[N];
float locProd[N];

__local float Asub[N];
__local float Bsub[N];

// i does not depend on k, so each iteration stores the same a[i] and b[i].
// NOTE(review): every work-item writes slot k of the __local arrays in the
// same iteration, which looks like a race. Presumably each work-item was
// meant to fill only its own slot j - confirm.
for(int k = 0; k < N; k++){

Asub[k] = a[i];
Bsub[k] = b[i];
barrier(CLK_LOCAL_MEM_FENCE);

locSum[k] = Asub[k] + Bsub[k];
locProd[k] = Asub[k] * Bsub[k];
barrier(CLK_LOCAL_MEM_FENCE);

// The same output element i is rewritten on every iteration.
sum[i] = locSum[k];
prod[i] = locProd[k];
}

}

);

最佳答案

我怀疑你的代码之所以无法运行,是因为内核根本没有编译通过。

以下行无效:

// N is only known at run time: it is read from the work-group size.
int N = get_local_size(0);

// Private arrays sized by the run-time value N.
float locSum[N];
float locProd[N];

// __local arrays sized by the run-time value N.
__local float Asub[N];
__local float Bsub[N];

N 必须是编译期常量,你不能用 get_local_size(0) 这样的运行时值来指定数组大小。可以改用宏常量(例如 #define N 16),或者把 __local 缓冲区声明为内核参数,由主机端在设置参数时指定其大小。

我强烈建议你使用独立的编译器来编译内核:CodeXL 非常好,Intel SDK for OpenCL 也不错。无论用哪一种,都比在应用程序里调试内核要好!

关于c++ - OpenCL 中的共享内存,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/46056042/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com