
filter - 2D median filtering in CUDA: how to efficiently copy global memory to shared memory


I am trying to implement a median filter with an x*y window, where x and y are odd and are parameters of the program.

My idea is to first determine how many threads I can run in a single block and how much shared memory is available, like this:

void cudaInit(int imgX, int imgY, int kx, int ky, int* cudaVars){
    int device;
    int deviceCount;
    cudaDeviceProp deviceProp;

    cudaGetDevice(&device);
    cudaGetDeviceProperties(&deviceProp, device);
    int kxMed = kx/2;
    int kyMed = ky/2;
    int n = deviceProp.maxThreadsPerBlock;
    while(f(n,kxMed,kyMed)>deviceProp.sharedMemPerBlock){
        n = n/2;
    }

    cudaVars[0] = n;
    cudaVars[1] = imgX/cudaVars[0];
    cudaVars[2] = imgY/cudaVars[0];
}
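The helper f(n, kxMed, kyMed) is not shown in the question. Judging from fillSmem below, it presumably returns the number of bytes of shared memory needed for an n-by-n pixel tile extended by the filter half-widths, with one int per pixel. A minimal sketch under that assumption (the formula is a guess, not the asker's actual code):

// Hypothetical reconstruction of the missing helper: shared-memory bytes for an
// n x n tile plus a kxMed/kyMed halo on each side, storing one int per pixel.
size_t f(int n, int kxMed, int kyMed){
    return (size_t)(n + 2*kxMed) * (size_t)(n + 2*kyMed) * sizeof(int);
}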



void mediaFilter_gpuB(uchar4* h_img, int width, int height, int kx, int ky){

    assert(h_img!=NULL && width!=0 && height!=0);
    int dev=0;
    cudaDeviceProp deviceProp;
    //DEVICE
    uchar4* d_img;
    uchar4* d_buf;

    int cudaVars[3]={0};
    cudaInit(width,height,kx,ky,cudaVars);
    checkCudaErrors(cudaMalloc((void**) &(d_img), width*height*sizeof(unsigned char)*4));
    checkCudaErrors(cudaMalloc((void**) &(d_buf), width*height*sizeof(unsigned char)*4));

    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&deviceProp,dev);
    checkCudaErrors(cudaMemcpy(d_img, h_img, width*height*sizeof(uchar4), cudaMemcpyHostToDevice));

    dim3 dimGrid(cudaVars[1],cudaVars[2],1);
    dim3 threads(cudaVars[0],1,1);
    mediaFilterB<<<dimGrid,threads,f(cudaVars[0],kx/2,ky/2)>>>(d_buf,d_img,width,height,kx,ky,cudaVars[0]);

    checkCudaErrors(cudaMemcpy(h_img, d_buf, width*height*sizeof(uchar4), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_img));
    checkCudaErrors(cudaFree(d_buf));
}
__device__ void fillSmem(int* sMem, uchar4* buf, int width, int height, int kx, int ky){
    int kyMed = ky/2;
    int kxMed = kx/2;
    int sWidth  = 2*kxMed + gridDim.x;
    int sHeight = 2*kyMed + gridDim.x;
    int X = blockIdx.x*gridDim.x + threadIdx.x;
    int Y = blockIdx.y*gridDim.y;
    int j = 0;
    while(threadIdx.x + j < sHeight){
        for(int i = 0; i < sWidth; i++){
            sMem[threadIdx.x*gridDim.x + gridDim.x*j + i] = buf[X + i + (threadIdx.x + Y)*width + j*width].x;
        }
        j++;
    }
}

Currently, in the function mediaFilterB, I only copy global memory to shared memory, but this alone already takes a lot of time: about 5 seconds for an 8000*8000 pixel image. By contrast, the sequential algorithm without CUDA takes 23 seconds to compute the median filter of the same image.

I know I am doing something wrong when copying global memory to shared memory, and that my algorithm is very inefficient, but I do not know how to correct it.

Best answer

I am providing an answer to this question to remove it from the unanswered list.

A classic example of how to improve CUDA median filtering by using shared memory is the code developed by Accelereyes, downloadable from the following post:

Median Filtering: CUDA tips and tricks

The idea is to allocate shared memory of size (BLOCK_WIDTH+2)x(BLOCK_HEIGHT+2). As a first step, the outer elements are zeroed. They are then filled with values from global memory only when they correspond to real image elements; otherwise they stay zero and act as padding.
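In the code below, the convention is that each thread's own pixel sits at an offset of +1 in both indices, so the outermost rows and columns of the tile hold the halo. Roughly, as taken from the kernel that follows:

__shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
// Thread (tx_l, ty_l) owns smem[tx_l+1][ty_l+1]; rows/columns 0 and
// BLOCK_*+1 are the halo, zeroed first and filled only for in-image pixels.
smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g];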

For convenience, the complete working code is reported below.

#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>

using namespace std;

#define BLOCK_WIDTH 16
#define BLOCK_HEIGHT 16

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/**********************************************/
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
/**********************************************/
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
{
    const int tx_l = threadIdx.x;                       // --- Local thread x index
    const int ty_l = threadIdx.y;                       // --- Local thread y index

    const int tx_g = blockIdx.x * blockDim.x + tx_l;    // --- Global thread x index
    const int ty_g = blockIdx.y * blockDim.y + ty_l;    // --- Global thread y index

    __shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];

    // --- Fill the shared memory border with zeros
    if (tx_l == 0)                      smem[tx_l]  [ty_l+1] = 0;    // --- left border
    else if (tx_l == BLOCK_WIDTH-1)     smem[tx_l+2][ty_l+1] = 0;    // --- right border
    if (ty_l == 0) {                    smem[tx_l+1][ty_l]   = 0;    // --- upper border
        if (tx_l == 0)                  smem[tx_l]  [ty_l]   = 0;    // --- top-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l]   = 0;    // --- top-right corner
    } else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0;    // --- bottom border
        if (tx_l == 0)                  smem[tx_l]  [ty_l+2] = 0;    // --- bottom-left corner
        else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0;    // --- bottom-right corner
    }

    // --- Fill shared memory
    smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g];                                                                        // --- center
    if ((tx_l == 0)&&((tx_g > 0)))                                    smem[tx_l]  [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1];      // --- left border
    else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))       smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1];      // --- right border
    if ((ty_l == 0)&&(ty_g > 0)) {                                    smem[tx_l+1][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g];    // --- upper border
        if ((tx_l == 0)&&((tx_g > 0)))                                smem[tx_l]  [ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g-1];  // --- top-left corner
        else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))   smem[tx_l+2][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g+1];  // --- top-right corner
    } else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g];    // --- bottom border
        if ((tx_l == 0)&&((tx_g > 0)))                                smem[tx_l]  [ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g-1];  // --- bottom-left corner
        else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))   smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1];  // --- bottom-right corner
    }
    __syncthreads();

    // --- Pull the 3x3 window into a local array
    unsigned short v[9] = { smem[tx_l][ty_l],   smem[tx_l+1][ty_l],   smem[tx_l+2][ty_l],
                            smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1],
                            smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };

    // --- Partial bubble-sort: only the first five positions need to reach their
    //     final place for v[4] to hold the median of the nine values
    for (int i = 0; i < 5; i++) {
        for (int j = i + 1; j < 9; j++) {
            if (v[i] > v[j]) { // swap?
                unsigned short tmp = v[i];
                v[i] = v[j];
                v[j] = tmp;
            }
        }
    }

    // --- Pick the middle one
    Output_Image[ty_g*Image_Width + tx_g] = v[4];
}

/********/
/* MAIN */
/********/
int main()
{
    const int Image_Width  = 1580;
    const int Image_Height = 1050;

    // --- Open data file
    ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );

    // --- Get file length
    is.seekg(0, ios::end);
    int dataLength = is.tellg();
    is.seekg(0, ios::beg);

    // --- Read data from file and close file
    unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
    is.read((char*)Input_Image_Host, dataLength);
    is.close();

    // --- CUDA warm up
    unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
    gpuErrchk(cudaFree(forFirstCudaMalloc));

    // --- Allocate host and device memory spaces
    unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
    unsigned short *Input_Image;  gpuErrchk(cudaMalloc((void**)&Input_Image,  dataLength * sizeof(unsigned short)));
    unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short)));

    // --- Copy data from host to device
    gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice)); // copying host data to device memory for filtering

    // --- Grid and block sizes
    const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1);

    /**********************************************/
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
    /**********************************************/

    cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
    Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // --- Copy results back to the host
    gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));

    // --- Open results file, write results and close the file
    ofstream of2; of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary);
    of2.write((char*)Output_Image_Host, dataLength);
    of2.close();

    cout << "\n Press Any Key To Exit..!!";
    gpuErrchk(cudaFree(Input_Image));
    gpuErrchk(cudaFree(Output_Image));

    delete[] Input_Image_Host;
    free(Output_Image_Host);

    return 0;
}
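The code above is hard-wired to a 3x3 window. For the asker's general odd kx-by-ky window, the same tiling idea carries over with dynamically sized shared memory. The sketch below is not part of the Accelereyes code or the post above; it is an illustration under the assumptions that the window fits in a fixed local array (here at most 7x7) and that border pixels are clamped (replicated) rather than zero-padded; all names are illustrative.

#define MAX_WIN 49   // this sketch supports windows up to 7x7

__global__ void medianFilterGeneric(const unsigned short *in, unsigned short *out,
                                    int width, int height, int kx, int ky)
{
    extern __shared__ unsigned short tile[];   // (blockDim.x + kx - 1) * (blockDim.y + ky - 1) elements

    const int rx = kx / 2, ry = ky / 2;
    const int tileW = blockDim.x + 2 * rx;
    const int tileH = blockDim.y + 2 * ry;

    const int x0 = blockIdx.x * blockDim.x;    // global origin of the block's tile (halo excluded)
    const int y0 = blockIdx.y * blockDim.y;

    // --- All threads cooperatively load tile + halo row by row (coalesced),
    //     clamping coordinates at the image border (replication padding)
    for (int idx = threadIdx.y * blockDim.x + threadIdx.x;
         idx < tileW * tileH;
         idx += blockDim.x * blockDim.y) {
        int tx = idx % tileW, ty = idx / tileW;
        int sx = min(max(x0 + tx - rx, 0), width  - 1);
        int sy = min(max(y0 + ty - ry, 0), height - 1);
        tile[idx] = in[sy * width + sx];
    }
    __syncthreads();

    const int gx = x0 + threadIdx.x, gy = y0 + threadIdx.y;
    if (gx >= width || gy >= height || kx * ky > MAX_WIN) return;

    // --- Gather this thread's kx*ky window from the shared tile
    unsigned short v[MAX_WIN];
    int n = 0;
    for (int dy = 0; dy < ky; dy++)
        for (int dx = 0; dx < kx; dx++)
            v[n++] = tile[(threadIdx.y + dy) * tileW + (threadIdx.x + dx)];

    // --- Partial selection sort: finalizing positions 0..n/2 is enough to expose the median
    for (int i = 0; i <= n / 2; i++)
        for (int j = i + 1; j < n; j++)
            if (v[j] < v[i]) { unsigned short t = v[i]; v[i] = v[j]; v[j] = t; }

    out[gy * width + gx] = v[n / 2];
}

A launch would pass the tile size as the dynamic shared-memory argument, for example medianFilterGeneric<<<grid, block, (block.x + kx - 1) * (block.y + ky - 1) * sizeof(unsigned short)>>>(d_in, d_out, width, height, kx, ky);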

Regarding "filter - 2D median filtering in CUDA: how to efficiently copy global memory to shared memory", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/16876480/
