gpt4 book ai didi

c - 如何创建OpenCL命令队列?

转载 作者:行者123 更新时间:2023-11-30 16:36:51 31 4
gpt4 key购买 nike

我正在尝试学习 OpenCL,但我甚至无法让一个简单的内核正常工作。

下面的代码来自《OpenCL 示例编程》一书,我修改、修改、修改......但我仍然不知道问题出在哪里。

每次我在 PC(AMD Athlon 5350 APU 和 Radeon R3)中执行该程序时,它都会将结果打印为“0.0000”。如果我在另一台带有 NVIDIA 1080 TI 的计算机(该 HD 的克隆,因此一切都相同)中运行相同的可执行文件,程序将输出“3.000”作为结果。

我注意到编译器输出中出现警告,因此我将过时的 clCreateCommandQueue 调用更改为 clCreateCommandQueueWithProperties()。

现在......它只是段错误(通过 printf() 测试,我知道它在 clCreateCommandQueueWithProperties 期间/之后出现段错误)。

在配备 NVIDIA GPU 的系统上,它就可以正常工作。

我错过了什么?

#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <malloc.h>

#include <CL/cl.h>

#define VECTOR_NS 4096
#define VECTOR_SIZE (VECTOR_NS*sizeof(float))

/*
 * OpenCL C source of the kernel, handed to the runtime as a single string.
 * The kernel takes three global float buffers; each work-item reads its global
 * id into `index` (not used afterwards) and stores the value 3 into C[0].
 */
static const char *saxpy_kernel =
    "__kernel void saxpy_kernel("
    "__global float *A, __global float *B, __global float *C)\n"
    "{\n"
    "int index = get_global_id(0);\n"
    "C[0] = 3;\n"
    "}\n";

int main(void)
{
int i;
float total;
float* A;
float* B;
float* C;
float* Cmapped;
cl_mem Acl;
cl_mem Bcl;
cl_mem Ccl;
cl_context context;
cl_platform_id* platforms;
cl_uint num_platforms;
cl_uint num_devices;
cl_command_queue queue;
cl_kernel kernel;
cl_int clStatus;

// Get platform and device information
platforms = NULL;
//Set up the Platform
clStatus = clGetPlatformIDs(0, NULL, &num_platforms);
platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id)*num_platforms);
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
//Get the devices list and choose the device you want to run on
cl_device_id* device_list = NULL;

clStatus = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);

clStatus = clGetDeviceIDs( platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
// Create one OpenCL context for each device in the platform
context = clCreateContext( NULL, num_devices, device_list, NULL, NULL, &clStatus);

/* Create the command queue */
//queue = clCreateCommandQueue(context, device_list[0], 0, &clStatus);
queue = clCreateCommandQueueWithProperties(context, device_list[0], NULL, &clStatus);

if(clStatus != CL_SUCCESS){
fprintf(stderr, "ERROR: failed to execute the kernel: %d.\n", clStatus);
exit(1);
}

/* */
if((A = aligned_alloc(sysconf(_SC_PAGESIZE), VECTOR_SIZE)) == NULL){
fprintf(stderr, "ERROR: failed to allocate memory.\n");
exit(1);
}
if((B = aligned_alloc(sysconf(_SC_PAGESIZE), VECTOR_SIZE)) == NULL){
fprintf(stderr, "ERROR: failed to allocate memory.\n");
exit(1);
}
if((C = aligned_alloc(sysconf(_SC_PAGESIZE), VECTOR_SIZE)) == NULL){
fprintf(stderr, "ERROR: failed to allocate memory.\n");
exit(1);
}

/* Initialize it */
i = 0;
do {
A[i] = 1;
B[i] = 2;
C[i] = 0;
} while(++i != VECTOR_NS);

Acl = clCreateBuffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, VECTOR_SIZE, A, &clStatus); // CL_MEM_READ_ONLY
Bcl = clCreateBuffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, VECTOR_SIZE, B, &clStatus); // CL_MEM_READ_ONLY
Ccl = clCreateBuffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, VECTOR_SIZE, C, &clStatus); // CL_MEM_WRITE_ONLY

// Create a program from the kernel source
// Build the program
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&saxpy_kernel, NULL, &clStatus);

if(clStatus != CL_SUCCESS){
fprintf(stderr, "ERROR: failed to compile the OpenCL code.\n");
exit(1);
}

clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);

if(clStatus != CL_SUCCESS){
fprintf(stderr, "ERROR: failed to compile the OpenCL code.\n");
exit(1);
}

// Create the OpenCL kernel
kernel = clCreateKernel(program, "saxpy_kernel", &clStatus);
// Set the arguments of the kernel
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&Acl);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&Bcl);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&Ccl);

// Execute the OpenCL kernel on the list
size_t global_size = VECTOR_NS; // Process the entire lists
size_t local_size = 1;

// Process one item at a time
clStatus = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);

if(clStatus != CL_SUCCESS){
fprintf(stderr, "ERROR: failed to execute the kernel: %d.\n", clStatus);
exit(1);
}

//* Clean up and wait for all the comands to complete. */
clFlush(queue);

/* Display the result to the screen */
Cmapped = (float*) clEnqueueMapBuffer(queue, Ccl, CL_TRUE, CL_MAP_READ, 0, VECTOR_SIZE, 0, NULL, NULL, &clStatus); // CL_MEM_USE_HOST_PTR

if(clStatus != CL_SUCCESS){
fprintf(stderr, "ERROR: failed to execute the kernel: %d.\n", clStatus);
exit(1);
}

total = 0;
for(i = 0; i < VECTOR_NS; i++)
total += C[i];

printf("TOTAL: %f\n", total);

/* Clean up and wait for all the comands to complete. */
clFlush(queue);
clFinish(queue);

/* Finally release all OpenCL allocated objects and host buffers. */
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(Acl);
clReleaseMemObject(Bcl);
clReleaseMemObject(Ccl);
clReleaseCommandQueue(queue);
clReleaseContext(context);

free(platforms);
free(device_list);

return 0;
}

最佳答案

虽然您的代码在我的机器上可以按原样运行,但我认为其中有一处可能正是导致您遇到问题的原因。下面这个调用

Cmapped = (float*)clEnqueueMapBuffer(queue, Ccl, CL_TRUE, CL_MAP_READ, 0, VECTOR_SIZE, 0, NULL, NULL, &clStatus);

把映射后的地址赋给了 Cmapped,但您却尝试从原始的 C 缓冲区中读取:

total = 0;
for(i = 0; i < VECTOR_NS; i++)
total += C[i];

这可能应该是这样的

total = 0;
for(i = 0; i < VECTOR_NS; i++)
total += Cmapped[i];

不过,由于您是使用 CL_MEM_USE_HOST_PTR 标志创建缓冲区的,如果改用 clEnqueueReadBuffer 并以原始指针作为目标地址,您的 OpenCL 驱动程序应该也能够优化掉这次复制操作:

clEnqueueReadBuffer(queue, Ccl, CL_TRUE, 0, VECTOR_SIZE, C, 0, NULL, NULL);

如果 OpenCL 实现尚未将数据缓存到设备内存,则这应该是无操作。

关于c - 如何创建OpenCL命令队列?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/48271444/

31 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com