- android - 多次调用 OnPrimaryClipChangedListener
- android - 无法更新 RecyclerView 中的 TextView 字段
- android.database.CursorIndexOutOfBoundsException : Index 0 requested, 光标大小为 0
- android - 使用 AppCompat 时,我们是否需要明确指定其 UI 组件(Spinner、EditText)颜色
您好,我正在尝试在 7 周内执行 7 Concurreny Models 一书中的示例代码。作者使用的是 macbook,而我使用的是带有 windows 10 的 dell xps。
我的程序崩溃了,因为在我调用函数 clEnqueueNDRangeKernel()
之后 timing_event
仍然为 null。
cl_event timing_event;
size_t work_units = NUM_ELEMENTS;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
NULL, 0, NULL,&timing_event);
docs声明以下有关事件参数的信息
event
Returns an event object that identifies this particular kernel execution instance. Event objects are unique and can be used to identify a particular kernel execution instance later on. If event is NULL, no event will be created for this kernel execution instance and therefore it will not be possible for the application to query or queue a wait for this particular kernel execution instance.
谁能解释一下为什么这种情况发生在我的戴尔而不是作者的 macbook 上?
最佳答案
我找到了解决方案。问题并非来自 clEnqueueNDRangeKernel()
,而是发生在 clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
中。我使用 clGetProgramBuildInfo()
检索了构建信息。问题是我的 multiply_arrays.cl 文件不是 utf 8 编码
对于所有刚接触 opencl 的人。每个 opencl 函数都会返回一个映射到特定错误代码的状态整数。如果函数确实返回但不返回状态代码,则可以将指针传递给该函数。请参阅我在下面链接的示例函数。这对调试您的程序非常有帮助。
main.cpp
/***
* Excerpted from "Seven Concurrency Models in Seven Weeks",
* published by The Pragmatic Bookshelf.
* Copyrights apply to this code. It may not be used to create training material,
* courses, books, articles, and the like. Contact us if you are in doubt.
* We make no guarantees that this code is fit for any purpose.
* Visit http://www.pragmaticprogrammer.com/titles/pb7con for more book information.
***/
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <mach/mach_time.h>
#else
#include <CL/cl.h>
#include <Windows.h>
#endif
#include <stdio.h>
#include<iostream>
#include <inttypes.h>
#include <chrono>
#define NUM_ELEMENTS (100000)
char* read_source(const char* filename) {
FILE *h = fopen(filename, "r");
fseek(h, 0, SEEK_END);
size_t s = ftell(h);
rewind(h);
char* program = (char*)malloc(s + 1);
fread(program, sizeof(char), s, h);
program[s] = '\0';
fclose(h);
return program;
}
void random_fill(cl_float array[], size_t size) {
for (int i = 0; i < size; ++i)
array[i] = (cl_float)rand() / RAND_MAX;
}
int main() {
//Status for Errorhandling
cl_int status;
//Identify Platform
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
//Get Id of GPU
cl_device_id device;
cl_uint num_devices = 0;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
// Create Context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
//Use context to create Command Queue
//Que enables us to send commands to the gpu device
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
//Load Kernel
char* source = read_source("multiply_arrays.cl");
cl_program program = clCreateProgramWithSource(context, 1,
(const char**)&source, NULL, &status);
free(source);
// Build Program
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
size_t len;
char *buffer;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
buffer = (char *) malloc(len);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
printf("%s\n", buffer);
//Create Kernel
cl_kernel kernel = clCreateKernel(program, "multiply_arrays", &status);
// Create Arrays with random Numbers
cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
random_fill(a, NUM_ELEMENTS);
random_fill(b, NUM_ELEMENTS);
//uint64_t startGPU = mach_absolute_time();
auto start = std::chrono::high_resolution_clock::now();
//Create Readonly input Buffers with value from a and b
cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
//Create Output buffer write Only
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
//set Kernel Arguments
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
cl_event timing_event;
size_t work_units = NUM_ELEMENTS;
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
NULL, 0, NULL,&timing_event);
cl_float results[NUM_ELEMENTS];
//Calculate Results and copy from output buffer to results
clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
results, 0, NULL, NULL);
//uint64_t endGPU = mach_absolute_time();
auto finish = std::chrono::high_resolution_clock::now();
//printf("Total (GPU): %lu ns\n\n", (unsigned long)(endGPU - startGPU));
std::cout << "Total(GPU) :"<< std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
cl_ulong starttime;
clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &starttime, NULL);
cl_ulong endtime;
clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endtime, NULL);
printf("Elapsed (GPU): %lu ns\n\n", (unsigned long)(endtime - starttime));
clReleaseEvent(timing_event);
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//uint64_t startCPU = mach_absolute_time();
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_ELEMENTS; ++i)
results[i] = a[i] * b[i];
//uint64_t endCPU = mach_absolute_time();
finish = std::chrono::high_resolution_clock::now();
//printf("Elapsed (CPU): %lu ns\n\n", (unsigned long)(endCPU - startCPU));
std::cout << "Elapsed (CPU) :" << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
return 0;
}
multiply_arrays.cl
__kernel void multiply_arrays(__global const float* inputA,
__global const float* inputB,
__global float* output) {
int i = get_global_id(0);
output[i] = inputA[i] * inputB[i];
}
//ö
关于c - clEnqueueNDRangeKernel 何时或为何将 Null 作为事件返回?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/41211233/
现在问我另一个问题。当我的内核被分解为多个调用时,我一直在尝试分析我的内核与其执行并行的结果。然而,虽然 clEnqueueReadBuffer 有一个 bool 值来确定它是否阻塞,但 clEnqu
我目前正在尝试编写一个 OpenCL 应用程序来执行一些内存密集型计算。为了跟踪所有计算的进度,我创建了一个 for 循环,它创建了不同的内核组。不幸的是,计算填满了我的整个内存。我的猜测是内核是在添
我正在编写代码来添加两个维度为 1024*1024 的矩阵。所以我的工作维度必须是 2,全局工作大小应该是 1024*1024。我想设置每个工作组的大小为64*64。我该如何实现? 所以我的代码应该是
我在二维数组上执行 5 点模板操作,直到在该二维数组上计算收敛。所以我有多次迭代(直到收敛),对于每次迭代,我调用 clEnqueueNDRangeKernel 函数来计算 2D 输入数组的新值。 实
您好,我正在尝试在 7 周内执行 7 Concurreny Models 一书中的示例代码。作者使用的是 macbook,而我使用的是带有 windows 10 的 dell xps。 我的程序崩溃了
我正在为 OpenCL 使用 C++ 绑定(bind),当我的一个内核入队时,我得到一个 cl::Error,它说 -38 (CL_INVALID_MEM_OBJECT) for clEnqueueN
我使用的是 clEnqueueNDRangeKernel 函数,它比 clEnqueueTask 快得多。尽管如此,我还是不能让它快于 16 毫秒,即使添加更多的 global_item_size 也
我是一名优秀的程序员,十分优秀!