gpt4 book ai didi

opencl - 内核随机崩溃,错误36和5

转载 作者:行者123 更新时间:2023-12-01 09:26:33 26 4
gpt4 key购买 nike

我有一个问题,就是我正在开发的内核随机崩溃。这意味着它每运行10次就会崩溃。我认为我的内核可能太复杂了,但是降低复杂性并没有真正的帮助。

当它崩溃时,clEnqueueNDRangeKernel不返回任何错误,但是以下clFinish返回一个-36(CL_INVALID_COMMAND_QUEUE),以下clEnqueueReadBuffer返回一个-5错误(CL_OUT_OF_RESOURCES)。

所以我的问题是:

  • 错误消息和崩溃的原因可能是什么?
  • 我的内核已经太复杂了吗?你有什么经验?
  • 是否有办法在不等待崩溃发生的情况下，判断内核是否过于复杂？

  • 如果您认为缺少信息对您有所帮助,请发表评论。

    我试图将内核减少到一个最小的示例,仍然显示错误。看起来像这样:
    /* NOTE(review): OpenCL kernel source embedded as a C string array (one
     * array element per kernel source line).  This is the minimal repro from
     * the question; several defects are visible in the kernel text itself:
     *  - the parameter list ends with "int groupsize," followed by ")" --
     *    the trailing comma is an OpenCL C compile error;
     *  - braces are unbalanced: the kernel's final closing '}' never appears;
     *  - z[1] and b1[2] are read-modify-written by every work-item with no
     *    synchronization or atomics -- a data race, results are undefined;
     *  - the __local write y[l1+local_id*groupsize] can run past the end of
     *    the y buffer (allocated as nhidden*groupsize floats on the host)
     *    whenever sizeb > groupsize -- an out-of-bounds write like this is
     *    the classic cause of the random CL_OUT_OF_RESOURCES /
     *    CL_INVALID_COMMAND_QUEUE crash pattern described above;
     *  - target[cls] is unchecked: any cls >= 20 read from classes[] would
     *    overflow the private target[20] array;
     *  - l1 is declared twice (int l1 = 0; then for(int l1 ...)): harmless
     *    shadowing, but the outer l1 is unused. */
    "    __kernel void myKernel1(",
    " __local float* x,",
    " __local float* y,",
    " __global float* z,",
    " __global float* b1,",
    " __global float* data,",
    " __global float* classes,",
    " int nsamples,",
    " int nfeatures,",
    " int sizeb,",
    " int nclasses,",
    " int groupsize,",
    /* BUG: the comma on the line above leaves ", )" -- CL compile error. */
    " )",
    " {",
    " int local_id = get_local_id(0); ",
    " int j = get_global_id(0); ",
    " int cls = classes[j];",
    " for(int k = 0; k<nfeatures; k++) x[k] = data[j*nfeatures+k];",
    " float target[20];",
    " for(int k = 0; k<nclasses; k++) target[k] = 0;",
    /* BUG: unsynchronized read-modify-write of z[1]/b1[2] by every work-item. */
    " z[1]=z[1]+(float)get_local_id(0);",
    " b1[2]=b1[2]+(float)local_id+get_group_id(0)*groupsize;",
    " target[cls] =1;",
    " int l1 = 0;",
    " for(int l1 = 0; l1<sizeb ; l1++) {",
    /* BUG: index reaches (sizeb-1)+(groupsize-1)*groupsize -- out of bounds
     * for a y buffer of nhidden*groupsize floats when sizeb > groupsize. */
    " y[l1+local_id*groupsize]=b1[l1];",
    " for(int l2 = 0; l2<nfeatures; l2++){",
    " }",
    " }",
    /* BUG: the kernel's closing '}' is missing here -- braces unbalanced. */

    缓冲区:
    /* Device buffers, each initialized from a host array via
     * CL_MEM_COPY_HOST_PTR.  NOTE(review): _err is captured on every call
     * but never checked -- a failed allocation (e.g. CL_MEM_OBJECT_ALLOCATION_FAILURE)
     * would go unnoticed until the kernel launch fails later. */
    /* z: niters * nclasses floats (accumulator written by the kernel). */
    cl_mem z_cl = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, niters * nclasses * sizeof(*z), z, &_err);            
    /* b1: sizeb floats, read (and element [2] written) by the kernel. */
    cl_mem b1_cl = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeb*sizeof(*b1), b1, &_err);
    /* data: nsamples rows of nfeatures floats, read-only in the kernel. */
    cl_mem gpu_data_cl = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nsamples * nfeatures * sizeof(*gpu_data), gpu_data, &_err);
    /* classes: one label per sample; kernel indexes target[] with it. */
    cl_mem gpu_classes_cl = clCreateBuffer(GPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nsamples * sizeof(*gpu_classes), gpu_classes, &_err);




    /* Bind kernel arguments in declaration order (x, y, z, b1, data,
     * classes, then the five int scalars).  Args 0 and 1 pass NULL with a
     * size to allocate __local memory.
     * NOTE(review): _err is overwritten by each call and never checked;
     * a single failed clSetKernelArg silently produces a broken launch.
     * NOTE(review): y (arg 1) is sized nhidden*groupsize floats, but the
     * kernel indexes y up to (sizeb-1)+(groupsize-1)*groupsize -- if sizeb
     * exceeds what that allocation covers, the kernel writes out of bounds;
     * verify nhidden vs. sizeb against the kernel's access pattern. */
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(float)*nfeatures, NULL);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(float)*nhidden*groupsize,NULL);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(cl_mem),(void*)&z_cl);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(cl_mem),(void*)&b1_cl);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(cl_mem),(void*)&gpu_data_cl);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(cl_mem),(void*)&gpu_classes_cl);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(int), (void *) &nsamples);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(int), (void *) &nfeatures);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(int), (void *) &sizeb);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(int), (void *) &nclasses);
    _err = clSetKernelArg(myKernel1, ArgCounter++, sizeof(int), (void *) &groupsize);

    带有:
  • niters〜= 10000
  • nclasses〜= 10
  • sizeb〜= 80
  • nsamples〜= 50000
  • nfeatures〜= 10

  • 我在Ubuntu 10.10 64Bit下使用驱动程序版本为260.19.21的Quadro FX 580。

    感谢您抽出宝贵的时间阅读本文!!!

    [更新]

    像oclBandwidthTest这样的SDK示例可以正常工作,我在每个cl命令之后检查错误,比较我的命令队列创建和启动:
    /*
     * init_opencl - set up the OpenCL runtime for this program.
     *
     * Selects the first platform and its first GPU device, then fills in the
     * caller's context, command queue, program (built from the global
     * OpenCLSource string array) and "myKernel1" kernel handle.
     *
     * Returns a malloc'd array of the context's device IDs (caller frees),
     * or NULL if a host allocation fails.  Each CL call's status is printed
     * when it fails or when VERBOSE is set.
     *
     * Fixes vs. the original:
     *  - log label said "clCreateContextFromType" but clCreateContext is called;
     *  - size_t values were printed with %i (undefined behavior on 64-bit);
     *  - CL_DEVICE_MAX_WORK_GROUP_SIZE is a single size_t, not a 3-array;
     *  - build_log was leaked; malloc results were used unchecked;
     *  - string literal was bound to a non-const char *.
     */
    cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_myKernel1,cl_program *OpenCLProgram){
        cl_int _err = 0;
        cl_platform_id cpPlatform;  /* OpenCL platform */
        cl_device_id cdDevice;      /* OpenCL device */

        /* Get an OpenCL platform. */
        _err = clGetPlatformIDs(1, &cpPlatform, NULL);
        if (_err || VERBOSE) printf("clGetPlatformIDs:%i\n", _err);

        /* Get the first GPU device on that platform. */
        _err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
        if (_err || VERBOSE) printf("clGetDeviceIDs:%i\n", _err);

        /* Create a context on the device. */
        *GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &_err);
        /* FIX: label matched the wrong API name before. */
        if (_err || VERBOSE) printf("clCreateContext:%i\n", _err);

        /* Query the device list attached to the context (size first, then data). */
        size_t ParmDataBytes;
        _err = clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
        if (_err || VERBOSE) printf("clGetContextInfo:%i\n", _err);
        cl_device_id *GPUDevices = (cl_device_id *)malloc(ParmDataBytes);
        if (GPUDevices == NULL) {  /* FIX: result was dereferenced unchecked. */
            printf("init_opencl: malloc(%zu) failed\n", ParmDataBytes);
            return NULL;
        }
        _err = clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
        if (_err || VERBOSE) printf("clGetContextInfo:%i\n", _err);

        /* Create a command-queue on the first GPU device. */
        *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
        if (_err || VERBOSE) printf("clCreateCommandQueue:%i\n", _err);

        /* Create the program from the global OpenCLSource string array. */
        *OpenCLProgram = clCreateProgramWithSource(*GPUContext, sizeof(OpenCLSource)/sizeof(char *), OpenCLSource, NULL, &_err);
        if (_err || VERBOSE) printf("CreateProgramWithSource:%i\n", _err);

        /* Build the program; on failure dump the device build log. */
        const char *buildoptions = "-Werror";  /* FIX: const-correct binding. */
        _err = clBuildProgram(*OpenCLProgram, 0, NULL, buildoptions, NULL, NULL);
        if (_err != CL_SUCCESS) {
            printf("clBuildProgram:%i\n", _err);
            cl_build_status build_status;
            _err = clGetProgramBuildInfo(*OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
            size_t ret_val_size;
            _err = clGetProgramBuildInfo(*OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
            char *build_log = (char *)malloc(ret_val_size + 1);
            if (build_log != NULL) {  /* FIX: result was dereferenced unchecked. */
                _err = clGetProgramBuildInfo(*OpenCLProgram, GPUDevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
                build_log[ret_val_size] = '\0';
                printf("BUILD LOG: \n %s", build_log);
                free(build_log);  /* FIX: log buffer was leaked. */
            }
        }

        /* Create the kernel object (proceeds even after a failed build, as
         * the original did -- clCreateKernel will report the error). */
        *cl_myKernel1 = clCreateKernel(*OpenCLProgram, "myKernel1", &_err);
        if (_err || VERBOSE) printf("clCreateKernel:%i\n", _err);

        /* Optionally report device/kernel limits. */
        if (VERBOSE) {
            size_t workgroupsize;
            cl_uint devicedata;
            size_t maxitems[3];
            size_t maxgroup;
            clGetKernelWorkGroupInfo(*cl_myKernel1, GPUDevices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroupsize, NULL);
            /* FIX: size_t printed with %zu instead of %i throughout. */
            printf("CL_KERNEL_WORK_GROUP_SIZE:%zu (recommended workgroupsize for the used kernel)\n", workgroupsize);
            clGetDeviceInfo(GPUDevices[0], CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &devicedata, NULL);
            printf("CL_DEVICE_ADDRESS_BITS:%u\n", devicedata);
            clGetDeviceInfo(GPUDevices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &devicedata, NULL);
            printf("CL_DEVICE_MAX_COMPUTE_UNITS:%u\n", devicedata);
            _err = clGetDeviceInfo(GPUDevices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(maxitems), maxitems, NULL);
            printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:%zu,%zu,%zu error=%i\n", maxitems[0], maxitems[1], maxitems[2], _err);
            /* FIX: CL_DEVICE_MAX_WORK_GROUP_SIZE returns one size_t, not three. */
            _err = clGetDeviceInfo(GPUDevices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxgroup), &maxgroup, NULL);
            printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:%zu error=%i\n", maxgroup, _err);
            printf("Lines of CL code: %zu\n", sizeof(OpenCLSource)/sizeof(char *));
            getchar();
        }

        return GPUDevices;
    }

    发射:
    clEnqueueNDRangeKernel(GPUCommandQueue, cl_myKernel1, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    if(_err!=CL_SUCCESS)printf("\nclEnqueueNDRangeKernel:%i\n",_err);
    _err = clFinish(GPUCommandQueue);
    if(_err!=CL_SUCCESS)printf("\nclFinish GPUCommandQueue:%i\n",_err);

    最佳答案

    NVIDIA将CL_OUT_OF_RESOURCES视为一种“通用错误”，通常意味着内核在读取或写入缓冲区时发生了越界访问。您应该逐一检查内核中对全局和局部缓冲区的访问是否越界。

    我知道这是一个模糊的答案,但这就是此错误的含义(或用于的意思)。

    关于opencl - 内核随机崩溃,错误36和5,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/5173302/

    26 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com