
c++ - CUDA optimization


I have implemented pincushion distortion correction with CUDA, aiming at real time: more than 40 fps for a 3680*2456 image sequence (i.e. a budget of under 25 ms per frame).

But with CUDA on an nVIDIA GeForce GT 610 (2 GB DDR3), it takes 130 ms.

With the CPU and OpenMP on a quad-core Core i7 at 3.4 GHz, it takes only 60 ms.

Please tell me what I can do to speed this up. Thank you.

The full source code can be downloaded here:
https://drive.google.com/file/d/0B9SEJgsu0G6QX2FpMnRja0o5STA/view?usp=sharing
https://drive.google.com/file/d/0B9SEJgsu0G6QOGNPMmVQLWpSb2c/view?usp=sharing

The code is below.

// Includes needed by the excerpts below (the full project is linked above).
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cassert>
#include <ctime>
#include <tchar.h>

__global__
void undistort(int N, float k, int width, int height, int depth, int pitch, float R, float L, unsigned char* in_bits, unsigned char* out_bits)
{
    // Get the array index from the GPU grid/block/thread indices and dimensions.
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;

    // Out of the array: nothing to do.
    if (i >= height || j >= width)
        return;

    // Calculate the undistortion equation.
    // On the CPU we used fast approximations of atan and sqrt - about 2x faster.
    // On the GPU the built-in functions are fast enough, so no approximation is needed.

    // 0.5f, not 0.5: a double literal promotes the arithmetic to double
    // precision, which is very slow on this class of GPU.
    int cx = (int)(width * 0.5f);
    int cy = (int)(height * 0.5f);

    int xt = j - cx;
    int yt = i - cy;

    float distance = sqrtf((float)(xt*xt + yt*yt));
    float r = distance*k / R;

    float theta = 1.0f;
    if (r != 0.0f)
        theta = atanf(r)/r;

    theta = theta*L;

    float tx = theta*xt + cx;
    float ty = theta*yt + cy;

    // The corrected frame is larger than the original, so crop it.
    if (tx < 0)       tx = 0;
    if (tx >= width)  tx = width - 1;
    if (ty < 0)       ty = 0;
    if (ty >= height) ty = height - 1;

    // Integer part selects the top-left source pixel; the fractional part
    // gives the bilinear weights.
    int ux = (int)(tx);
    int uy = (int)(ty);
    // Clamp so the +1 neighbours below stay inside the image
    // (the original read past the last row/column at the borders).
    if (ux > width - 2)  ux = width - 2;
    if (uy > height - 2) uy = height - 2;

    tx = tx - ux;
    ty = ty - uy;

    unsigned char *p   = out_bits + i*pitch + j*depth;
    unsigned char *q00 = in_bits + uy*pitch + ux*depth;
    unsigned char *q01 = q00 + depth;
    unsigned char *q10 = q00 + pitch;
    unsigned char *q11 = q10 + depth;

    // Bilinear interpolation per channel. The loop variable is renamed to 'c'
    // so it no longer shadows the distortion parameter 'k'; the intermediate
    // array and byte-wise memcpy are replaced by a direct store.
    for (int c = 0; c < depth; c++)
    {
        p[c] = (unsigned char)(q00[c]*(1-tx)*(1-ty) + q01[c]*tx*(1-ty)
                             + q10[c]*(1-tx)*ty     + q11[c]*tx*ty);
    }
}

void wideframe_correction(char* bits, int width, int height, int depth)
{
    // Find and initialize the nVIDIA device.
    cudaSetDevice(0);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);

    // Start profiling for the GPU time.
    cudaProfilerStart();

    // Measure the total (wall-clock) time.
    long int dwTime = clock();

    // Set the distortion parameters.
    // Note that multiplying by 0.5 is faster than dividing by 2.
    int cx = (int)(width * 0.5f);
    int cy = (int)(height * 0.5f);
    float k = -0.73f;
    float R = sqrt((float)(cx*cx + cy*cy));

    // Set the radius of the result.
    float L = (float)(width < height ? width : height);
    L = L/2.0f;
    L = L/R;
    L = L*L*L*0.3333f;
    L = 1.0f/(1-L);

    // Create the GPU memory pointers.
    unsigned char* d_img_in = NULL;
    unsigned char* d_img_out = NULL;

    // Allocate pitched 2D GPU memory for better performance.
    size_t pitch;
    cudaMallocPitch( (void**) &d_img_in, &pitch, width*depth, height );
    cudaMallocPitch( (void**) &d_img_out, &pitch, width*depth, height );
    _tprintf(_T("\nPitch : %d\n"), (int)pitch);

    // Copy RAM data to VRAM.
    cudaMemcpy2D( d_img_in, pitch,
                  bits, width*depth, width*depth, height,
                  cudaMemcpyHostToDevice );
    cudaMemcpy2D( d_img_out, pitch,
                  bits, width*depth, width*depth, height,
                  cudaMemcpyHostToDevice );

    // Create events for timing.
    cudaEvent_t startEvent, stopEvent;
    cudaError_t err = cudaEventCreate(&startEvent, 0);
    assert( err == cudaSuccess );
    err = cudaEventCreate(&stopEvent, 0);
    assert( err == cudaSuccess );

    // Execute the version using global memory.
    float elapsedTime;
    cudaEventRecord(startEvent);

    // Process the image. BLOCK_WIDTH and BLOCK_HEIGHT are compile-time
    // block dimensions defined elsewhere in the project.
    dim3 dGrid(width / BLOCK_WIDTH + 1, height / BLOCK_HEIGHT + 1);
    dim3 dBlock(BLOCK_WIDTH, BLOCK_HEIGHT);

    undistort<<< dGrid, dBlock >>>(width*height, k, width, height, depth, pitch, R, L, d_img_in, d_img_out);

    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // current equivalent.
    cudaDeviceSynchronize();
    cudaEventRecord(stopEvent);
    cudaEventSynchronize( stopEvent );

    // Read back the GPU time.
    cudaEventElapsedTime( &elapsedTime, startEvent, stopEvent );

    // Calculate the total time.
    dwTime = clock() - dwTime;

    // Copy the result from VRAM back to RAM.
    cudaMemcpy2D( bits, width*depth,
                  d_img_out, pitch, width*depth, height,
                  cudaMemcpyDeviceToHost );

    _tprintf(_T("GPU Processing Time(ms) : %d\n"), (int)elapsedTime);
    _tprintf(_T("VRAM Memory Read/Write Time(ms) : %d\n"), (int)(dwTime - (int)elapsedTime));
    _tprintf(_T("Total Time(ms) : %d\n"), (int)dwTime );

    // Free GPU memory.
    cudaFree(d_img_in);
    cudaFree(d_img_out);
    cudaProfilerStop();
    cudaDeviceReset();
}
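
Note that the "VRAM Memory Read/Write Time" printed above is inferred by subtraction from the wall-clock total, which also folds in device setup and allocation. A minimal sketch (the helper name is illustrative, not from the original source) that times one copy directly with the same event API:

// Hypothetical helper: time one host-to-device 2D copy with CUDA events.
float time_copy_h2d(unsigned char* dst, size_t dpitch,
                    const unsigned char* src, size_t spitch,
                    size_t widthBytes, size_t rows)
{
    cudaEvent_t t0, t1;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);

    cudaEventRecord(t0);
    cudaMemcpy2D(dst, dpitch, src, spitch, widthBytes, rows,
                 cudaMemcpyHostToDevice);
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1);
    cudaEventDestroy(t0);
    cudaEventDestroy(t1);
    return ms;
}

If the copies turn out to dominate the 130 ms, allocating the host buffer with cudaMallocHost (pinned memory) typically raises PCIe throughput; also, the second upload of bits into d_img_out looks redundant, since the kernel overwrites every output pixel.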

Best Answer

I haven't looked at the source code, but there are some limits you cannot get around.

Your GPU performs almost the same as your CPU:

Adjust the following figures to your actual GPU/CPU models.

Specification | GPU        | CPU
--------------|------------|----------
Bandwidth     | 14.4 GB/s  | 25.6 GB/s
GFLOPS        | 155 (FMA)  | 135

We can conclude that for a memory-bound kernel, your GPU will never be faster than your CPU.
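
For scale, a rough back-of-the-envelope (the post does not state the pixel depth; 3 bytes per pixel is assumed here):

3680 x 2456 x 3 B         ≈ 27 MB per frame
read + write traffic      ≈ 54 MB per frame
at 14.4 GB/s (GPU peak)   ≈ 3.8 ms minimum
at 25.6 GB/s (CPU peak)   ≈ 2.1 ms minimum

Both ideals are far below the measured 60-130 ms, so the achieved fraction of peak (the scattered bilinear reads coalesce poorly) and the PCIe copies dominate in practice. Even at peak, though, the bandwidth ratio alone caps this GPU at about 25.6/14.4 ≈ 1.8x slower than the CPU on a purely streaming kernel, which is in line with the measured 130/60 ≈ 2.2x.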

GPU figures taken from: http://www.nvidia.fr/object/geforce-gt-610-fr.html#pdpContent=2

CPU figures taken from: http://ark.intel.com/products/75123/Intel-Core-i7-4770K-Processor-8M-Cache-up-to-3_90-GHz?q=Intel%20Core%20i7%204770K

and from: http://www.ocaholic.ch/modules/smartsection/item.php?page=6&itemid=1005
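
Not from the original answer, but worth noting: the kernel's scattered bilinear reads are the usual optimization target here. On GPUs of compute capability 3.0 or newer (the GT 610 is 2.1, so this does not apply to it), the texture unit performs the bilinear blend in hardware. A single-channel sketch, assuming a texture object created with cudaFilterModeLinear, cudaReadModeNormalizedFloat, and cudaAddressModeClamp (the clamp replaces the manual cropping); setup code omitted:

__global__ void undistort_tex(cudaTextureObject_t tex, int width, int height,
                              float k, float R, float L,
                              unsigned char* out, int pitch)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= height || j >= width)
        return;

    float cx = width * 0.5f, cy = height * 0.5f;
    float xt = j - cx, yt = i - cy;
    float r = sqrtf(xt*xt + yt*yt) * k / R;
    float theta = (r == 0.0f) ? 1.0f : atanf(r) / r;
    theta *= L;

    // tex2D with linear filtering does the 4-tap bilinear blend in hardware;
    // +0.5f addresses the texel centre in unnormalized coordinates.
    float v = tex2D<float>(tex, theta*xt + cx + 0.5f, theta*yt + cy + 0.5f);
    out[i*pitch + j] = (unsigned char)(v * 255.0f);
}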

Regarding c++ - CUDA optimization, a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/35006924/
