gpt4 book ai didi

c++ - 带有 float 的自定义内核 GpuMat

转载 作者:塔克拉玛干 更新时间:2023-11-02 23:52:00 24 4
gpt4 key购买 nike

我正在尝试使用 GpuMat 数据编写自定义内核,计算图像每个像素的反余弦。当 GpuMat 保存 CV_8UC1 数据时,我可以上传、下载并在内核中修改像素值,但 unsigned char 无法用于计算反余弦。然而,当我把 GpuMat 改为 CV_32FC1 类型(float)时,在下载阶段遇到了非法内存访问错误。这是我的代码:

//.cu code 
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <stdio.h>
// Per-pixel kernel over a pitched, single-channel float image.
//
// Each thread handles one element (rowInd, colInd) of a 2D launch; threads
// that fall outside cols x rows exit immediately.
//
//   srcptr, dstptr   : device pointers to the first element of each image
//   srcstep, dststep : distance between consecutive rows, in BYTES. The host
//                      wrappers pass cv::gpu::GpuMat::step here, which OpenCV
//                      defines as a byte pitch, not an element count.
//   cols, rows       : image width / height in elements
//
// Output: -1 where the truncated input value is a multiple of 90, otherwise
// acosf(input).
// NOTE(review): because of the integer truncation every |val| < 1 also maps
// to -1, so acosf is only evaluated for |val| >= 1 — confirm this is intended.
__global__ void funcKernel(const float* srcptr, float* dstptr, size_t srcstep, const size_t dststep, int cols, int rows){
    const int rowInd = blockIdx.y * blockDim.y + threadIdx.y;
    const int colInd = blockIdx.x * blockDim.x + threadIdx.x;
    if (rowInd >= rows || colInd >= cols)
        return;

    // The steps are byte pitches, so the row offset has to be applied to a
    // byte pointer. Adding them to a float* (as before) scaled the offset by
    // sizeof(float) and walked far past the end of the allocation.
    const float* rowsrcptr = reinterpret_cast<const float*>(
        reinterpret_cast<const char*>(srcptr) + rowInd * srcstep);
    float* rowdstPtr = reinterpret_cast<float*>(
        reinterpret_cast<char*>(dstptr) + rowInd * dststep);

    const float val = rowsrcptr[colInd];
    if (static_cast<int>(val) % 90 == 0) {
        rowdstPtr[colInd] = -1.0f;
    } else {
        // Single-precision arccosine; inputs outside [-1, 1] produce NaN.
        rowdstPtr[colInd] = acosf(val);
    }
}

// Integer division of a by b, rounded up for positive operands
// (used to size a launch grid so it covers every element).
int divUp(int a, int b){
    const int padded = a + b - 1;
    return padded / b;
}

extern "C"
{
// C-linkage host entry point that launches funcKernel over a cols x rows image
// and blocks until the device has finished.
//
//   srcptr, dstptr   : device pointers, forwarded unchanged to funcKernel
//   srcstep, dststep : row steps, forwarded unchanged to funcKernel
//   cols, rows       : image size in elements; nothing is launched when
//                      either is not positive
//
// Launch and execution failures are reported on stderr (the function returns
// void, so there is no status to hand back to the caller).
void func(const float* srcptr, float* dstptr, size_t srcstep, const size_t dststep, int cols, int rows){
    // A zero-sized grid is an invalid launch configuration; treat an empty
    // image as a no-op instead.
    if (cols <= 0 || rows <= 0)
        return;

    const dim3 blDim(32, 8);
    const dim3 grDim(divUp(cols, blDim.x), divUp(rows, blDim.y));

    std::cout << "calling kernel from func\n";
    funcKernel<<<grDim,blDim>>>(srcptr,dstptr,srcstep,dststep,cols,rows);

    // Launches do not return a status; configuration errors surface here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "funcKernel launch failed: " << cudaGetErrorString(err) << "\n";
        return;
    }
    std::cout << "done with kernel call\n";

    // Faults raised while the kernel runs are only reported by a
    // synchronizing call, so check its result rather than discarding it.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "funcKernel execution failed: " << cudaGetErrorString(err) << "\n";
    }
}
} // extern "C"

//.cpp code
// Host-side bridge: reinterprets the data pointers of src and dst as float
// and forwards them, together with both steps and the source dimensions,
// to func().
void callKernel(const GpuMat &src, GpuMat &dst){
    const float* srcData = reinterpret_cast<const float*>(src.data);
    float* dstData = reinterpret_cast<float*>(dst.data);
    func(srcData, dstData, src.step, dst.step, src.cols, src.rows);
}

// Reads cat.jpg (imread flag 0), converts it to CV_32FC1, uploads it to the
// GPU, runs callKernel into a freshly created CV_32FC1 output and copies the
// result back into a host Mat.
// Returns 0 on success, 1 when the image could not be read.
int main(){
    Mat input = imread("cat.jpg",0);
    // imread signals failure by returning an empty matrix rather than
    // throwing; stop here instead of pushing an empty image through the GPU.
    if (input.empty()) {
        std::cerr << "could not read cat.jpg\n";
        return 1;
    }

    Mat float_input;
    // NOTE(review): the conversion applies no scale factor, so the float
    // values keep the range of the source pixels — confirm that is what the
    // kernel expects.
    input.convertTo(float_input,CV_32FC1);

    GpuMat d_frame,d_output;
    Size size = float_input.size();
    d_frame.upload(float_input);
    d_output.create(size,CV_32FC1);
    callKernel(d_frame,d_output);

    // Constructing a Mat from the GpuMat copies the result back to the host.
    Mat output(d_output);
    return 0;
}

当我运行程序时,程序在运行时报出如下错误:

OpenCV Error: Gpu API call (an illegal memory access was encountered) in copy, file /home/mobile/opencv-2.4.9/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp, line 882 terminate called after throwing an instance of 'cv::Exception' what(): /home/mobile/opencv-2.4.9/modules/dynamicuda/include/opencv2/dynamicuda/dynamicuda.hpp:882: error: (-217) an illegal memory access was encountered in function copy

最佳答案

您可以使用 cv::cuda::PtrStep<> 或 cv::cuda::PtrStepSz<> 编写您自己的内核(这样就不必为 GpuMat 单独传递 step 参数,代码会稍微简化 :D):

内核:

    // Element-wise copy kernel: every thread of a 2D launch maps to one
    // (x, y) position and copies input(y, x) to output(y, x). Threads whose
    // position lies outside input.cols x input.rows do nothing.
    __global__ void myKernel(const cv::cuda::PtrStepSzf input,
                             cv::cuda::PtrStepSzf output)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Guard clause: leave early for any position outside the image.
        const bool outside = x < 0 || y < 0 || x >= input.cols || y >= input.rows;
        if (outside)
            return;

        output(y, x) = input(y, x);
    }

注意:
cv::cuda::PtrStep<> : 不包含尺寸信息
cv::cuda::PtrStepSz<> : 包含尺寸信息
cv::cuda::PtrStepSzb : 用于 unsigned char 类型的矩阵 (CV_8U)
cv::cuda::PtrStepSzf : 用于 float 类型的矩阵 (CV_32F)
cv::cuda::PtrStep<cv::Point2f> : 其他类型的例子

内核调用:

    // Host wrapper that launches myKernel on the given stream.
    //
    //   _input  : array holding a GpuMat; read through getGpuMat()
    //   _output : (re)created with the size and type of the input, then
    //             written by the kernel
    //   _stream : stream on which the kernel is enqueued
    //
    // The launch is asynchronous; only the launch status is checked here
    // (through cudaSafeCall), no synchronization is performed.
    // NOTE(review): myKernel takes PtrStepSzf, so the input is presumably
    // expected to be CV_32FC1 — this wrapper does not verify it.
    void callKernel(cv::InputArray _input,
                    cv::OutputArray _output,
                    cv::cuda::Stream _stream)
    {
        const cv::cuda::GpuMat input = _input.getGpuMat();

        _output.create(input.size(), input.type());
        cv::cuda::GpuMat output = _output.getGpuMat();

        // A zero-sized grid is an invalid launch configuration; nothing to do
        // for an empty image.
        if (input.empty())
            return;

        // One thread per pixel; the grid is rounded up so partial tiles at
        // the right and bottom edges are covered. (The original referred to
        // an undeclared `input1` here.)
        const dim3 cthreads(16, 16);
        const dim3 cblocks(
            static_cast<int>(std::ceil(input.size().width /
                static_cast<double>(cthreads.x))),
            static_cast<int>(std::ceil(input.size().height /
                static_cast<double>(cthreads.y))));

        cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
        myKernel<<<cblocks, cthreads, 0, stream>>>(input, output);

        cudaSafeCall(cudaGetLastError());
    }

您可以使用 cv::cuda::GpuMat 调用此函数:

   // Example call: runs the wrapper on two GpuMat objects, passing a
   // default-constructed cv::cuda::Stream as the stream argument.
   callKernel(d_frame, d_output, cv::cuda::Stream());

关于c++ - 带有 float 的自定义内核 GpuMat,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/24613637/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com