gpt4 book ai didi

cuda - CUDA 中的一维 fftshift

转载 作者:行者123 更新时间:2023-12-02 05:19:00 26 4
gpt4 key购买 nike

我正在 CUDA 中设置一维 fftshift。我的代码如下

__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

double2 temp;

if(i< N/2)
{
temp.x = u_d[i].x;
temp.y = u_d[i].y;

u_d[i].x =u_d[i+N/2].x;
u_d[i].y =u_d[i+N/2].y;

u_d[i+N/2].x = temp.x;
u_d[i+N/2].y = temp.y;
}
}

有没有比上面显示的更聪明的方法来在 CUDA 中执行 fftshift?

提前致谢。

或许更好的解决方案

我发现也许下面的解决方案是一个不错的选择

__global__ void fftshift(double2 *u_d, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if(i < N)
{
double a = pow(-1.0,i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}

它包括将要转换的向量乘以 1-1 的序列,这相当于乘以 exp(-j npi) 从而在共轭域中移动。

你必须在应用CUFFT之前和之后调用这个内核。

一个优点是避免了内存移动/交换,这个想法可以立即扩展到 2D 情况,参见 CUDA Device To Device transfer expensive .

关于对称数据

这个解决方案似乎并不局限于对称数据。例如尝试以下 Matlab 代码,将想法应用于完全复杂的随机矩阵(高斯振幅和均匀相位)。

N1=512;
N2=256;

Phase=(rand(N1,N2)-0.5)*2*pi;
Magnitude=randn(N1,N2);

Im=Magnitude.*exp(j*Phase);

Transform=fftshift(fft2(ifftshift(Im)));

n1=0:(N1-1);
n2=0:(N2-1);
[N2,N1]=meshgrid(n2,n1);
Im2=Im.*(-1).^(N1+N2);
Im3=fft2(Im2);
Im4=Im3.*(-1).^(N1+N2);

100*sqrt(sum(abs(Im4-Transform).^2)/sum(abs(Transform).^2))

返回的归一化均方根误差将为 0,确认 Transform=Im4

提高速度

按照在 NVIDIA Forum 收到的建议, 可以通过改变指令来提高速度

double a = pow(-1.0,i&1);

double a = 1-2*(i&1);

避免使用慢程序 pow

最佳答案

经过很多时间和 cuFFT 回调功能的介绍,我可以为我自己的问题提供一个有意义的答案。

上面我提出了一个“也许更好的解决方案”。经过一些测试,我意识到,如果不使用回调 cuFFT 功能,该解决方案速度较慢,因为它使用了 pow。然后,我探索了使用 pow 的两种替代方法,比如

float a     = (float)(1-2*((int)offset%2));

float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;

float2  out = ((float2*)d_in)[offset];

if ((int)offset&1) {

out.x = -out.x;
out.y = -out.y;

}

但是,对于标准的 cuFFT,上述所有解决方案都需要两个单独的内核调用,一个用于 fftshift,一个用于 cuFFT 执行调用。然而,借助新的 cuFFT 回调功能,上述替代解决方案可以作为 __device__ 函数嵌入到代码中。

所以,最后我得到了下面的比较代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <assert.h>

#include <cufft.h>
#include <cufftXt.h>

//#define DEBUG

#define BLOCKSIZE 256

/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/
// See http://stackoverflow.com/questions/16267149/cufft-error-handling
#ifdef _CUFFT_H_
// cuFFT API errors
static const char *_cudaGetErrorEnum(cufftResult error)
{
switch (error)
{
case CUFFT_SUCCESS:
return "CUFFT_SUCCESS";

case CUFFT_INVALID_PLAN:
return "CUFFT_INVALID_PLAN";

case CUFFT_ALLOC_FAILED:
return "CUFFT_ALLOC_FAILED";

case CUFFT_INVALID_TYPE:
return "CUFFT_INVALID_TYPE";

case CUFFT_INVALID_VALUE:
return "CUFFT_INVALID_VALUE";

case CUFFT_INTERNAL_ERROR:
return "CUFFT_INTERNAL_ERROR";

case CUFFT_EXEC_FAILED:
return "CUFFT_EXEC_FAILED";

case CUFFT_SETUP_FAILED:
return "CUFFT_SETUP_FAILED";

case CUFFT_INVALID_SIZE:
return "CUFFT_INVALID_SIZE";

case CUFFT_UNALIGNED_DATA:
return "CUFFT_UNALIGNED_DATA";
}

return "<unknown>";
}
#endif

#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{
if( CUFFT_SUCCESS != err) {
fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
_cudaGetErrorEnum(err)); \
cudaDeviceReset(); assert(0); \
}
}

/****************************************/
/* FFTSHIFT 1D INPLACE MEMORY MOVEMENTS */
/****************************************/
__global__ void fftshift_1D_inplace_memory_movements(float2 *d_inout, unsigned int N)
{
unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

if (tid < N/2)
{
float2 temp = d_inout[tid];
d_inout[tid] = d_inout[tid + (N / 2)];
d_inout[tid + (N / 2)] = temp;
}
}

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 1 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v1(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

float a = (float)(1-2*((int)offset%2));

float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v1_Ptr = fftshift_1D_chessboard_callback_v1;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 2 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v2(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

float a = pow(-1.,(double)(offset&1));

float2 out = ((float2*)d_in)[offset];
out.x = out.x * a;
out.y = out.y * a;
return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v2_Ptr = fftshift_1D_chessboard_callback_v2;

/**********************************************/
/* FFTSHIFT 1D INPLACE CHESSBOARD - VERSION 3 */
/**********************************************/
__device__ float2 fftshift_1D_chessboard_callback_v3(void *d_in, size_t offset, void *callerInfo, void *sharedPtr) {

float2 out = ((float2*)d_in)[offset];

if ((int)offset&1) {

out.x = -out.x;
out.y = -out.y;

}
return out;
}

__device__ cufftCallbackLoadC fftshift_1D_chessboard_callback_v3_Ptr = fftshift_1D_chessboard_callback_v3;

/********/
/* MAIN */
/********/
int main()
{
const int N = 131072;

printf("N = %d\n", N);

// --- Host side input array
float2 *h_vect = (float2 *)malloc(N*sizeof(float2));
for (int i=0; i<N; i++) {
h_vect[i].x = (float)rand() / (float)RAND_MAX;
h_vect[i].y = (float)rand() / (float)RAND_MAX;
}

// --- Host side output arrays
float2 *h_out1 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out2 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out3 = (float2 *)malloc(N*sizeof(float2));
float2 *h_out4 = (float2 *)malloc(N*sizeof(float2));

// --- Device side input arrays
float2 *d_vect1; gpuErrchk(cudaMalloc((void**)&d_vect1, N*sizeof(float2)));
float2 *d_vect2; gpuErrchk(cudaMalloc((void**)&d_vect2, N*sizeof(float2)));
float2 *d_vect3; gpuErrchk(cudaMalloc((void**)&d_vect3, N*sizeof(float2)));
float2 *d_vect4; gpuErrchk(cudaMalloc((void**)&d_vect4, N*sizeof(float2)));
gpuErrchk(cudaMemcpy(d_vect1, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect2, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect3, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_vect4, h_vect, N*sizeof(float2), cudaMemcpyHostToDevice));

// --- Device side output arrays
float2 *d_out1; gpuErrchk(cudaMalloc((void**)&d_out1, N*sizeof(float2)));
float2 *d_out2; gpuErrchk(cudaMalloc((void**)&d_out2, N*sizeof(float2)));
float2 *d_out3; gpuErrchk(cudaMalloc((void**)&d_out3, N*sizeof(float2)));
float2 *d_out4; gpuErrchk(cudaMalloc((void**)&d_out4, N*sizeof(float2)));

float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

/*******************************************/
/* cuFFT + MEMORY MOVEMENTS BASED FFTSHIFT */
/*******************************************/
cufftHandle planinverse; cufftSafeCall(cufftPlan1d(&planinverse, N, CUFFT_C2C, 1));
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse, d_vect1, d_vect1, CUFFT_INVERSE));
fftshift_1D_inplace_memory_movements<<<iDivUp(N/2, BLOCKSIZE), BLOCKSIZE>>>(d_vect1, N);
#ifdef DEBUG
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Memory movements elapsed time: %3.3f ms \n", time);
gpuErrchk(cudaMemcpy(h_out1, d_vect1, N*sizeof(float2), cudaMemcpyDeviceToHost));

/****************************************/
/* CHESSBOARD MULTIPLICATION V1 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v1_Ptr;

gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v1_Ptr, fftshift_1D_chessboard_callback_v1_Ptr, sizeof(hfftshift_1D_chessboard_callback_v1_Ptr)));
cufftHandle planinverse_v1; cufftSafeCall(cufftPlan1d(&planinverse_v1, N, CUFFT_C2C, 1));
cufftResult status = cufftXtSetCallback(planinverse_v1, (void **)&hfftshift_1D_chessboard_callback_v1_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v1, d_vect2, d_out2, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v1 elapsed time: %3.3f ms \n", time);

gpuErrchk(cudaMemcpy(h_out2, d_out2, N*sizeof(float2), cudaMemcpyDeviceToHost));

// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out2[i].x)||(h_out1[i].y != h_out2[i].y)) { printf("Chessboard v1 test failed!\n"); return 0; }

printf("Chessboard v1 test passed!\n");

/****************************************/
/* CHESSBOARD MULTIPLICATION V2 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v2_Ptr;

gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v2_Ptr, fftshift_1D_chessboard_callback_v2_Ptr, sizeof(hfftshift_1D_chessboard_callback_v2_Ptr)));
cufftHandle planinverse_v2; cufftSafeCall(cufftPlan1d(&planinverse_v2, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v2, (void **)&hfftshift_1D_chessboard_callback_v2_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v2, d_vect3, d_out3, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v2 elapsed time: %3.3f ms \n", time);

gpuErrchk(cudaMemcpy(h_out3, d_out3, N*sizeof(float2), cudaMemcpyDeviceToHost));

// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out3[i].x)||(h_out1[i].y != h_out3[i].y)) { printf("Chessboard v2 test failed!\n"); return 0; }

printf("Chessboard v2 test passed!\n");

/****************************************/
/* CHESSBOARD MULTIPLICATION V3 + cuFFT */
/****************************************/
cufftCallbackLoadC hfftshift_1D_chessboard_callback_v3_Ptr;

gpuErrchk(cudaMemcpyFromSymbol(&hfftshift_1D_chessboard_callback_v3_Ptr, fftshift_1D_chessboard_callback_v3_Ptr, sizeof(hfftshift_1D_chessboard_callback_v3_Ptr)));
cufftHandle planinverse_v3; cufftSafeCall(cufftPlan1d(&planinverse_v3, N, CUFFT_C2C, 1));
status = cufftXtSetCallback(planinverse_v3, (void **)&hfftshift_1D_chessboard_callback_v3_Ptr, CUFFT_CB_LD_COMPLEX, 0);
if (status == CUFFT_LICENSE_ERROR) {
printf("This sample requires a valid license file.\n");
printf("The file was either not found, out of date, or otherwise invalid.\n");
exit(EXIT_FAILURE);
} else {
cufftSafeCall(status);
}
cudaEventRecord(start, 0);
cufftSafeCall(cufftExecC2C(planinverse_v3, d_vect4, d_out4, CUFFT_INVERSE));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Chessboard v3 elapsed time: %3.3f ms \n", time);

gpuErrchk(cudaMemcpy(h_out4, d_out4, N*sizeof(float2), cudaMemcpyDeviceToHost));

// --- Checking the results
for (int i=0; i<N; i++) if ((h_out1[i].x != h_out4[i].x)||(h_out1[i].y != h_out4[i].y)) { printf("Chessboard v3 test failed!\n"); return 0; }

printf("Chessboard v3 test passed!\n");

return 0;
}

在 GTX 480 上的结果

N         Mem mov   v1      v2      v3 
131072 0.552 0.136 0.354 0.183
262144 0.536 0.175 0.451 0.237
524288 0.661 0.283 0.822 0.290
1048576 0.784 0.565 1.548 0.548
2097152 1.298 0.952 2.973 0.944

在 TESLA C2050 上的结果

N         Mem mov   v1      v2      v3 
131072 0.278 0.130 0.236 0.132
262144 0.344 0.202 0.374 0.206
524288 0.544 0.378 0.696 0.387
1048576 0.909 0.695 1.294 0.695
2097152 1.656 1.349 2.531 1.349

KEPLER K20c 的结果

N         Mem mov   v1      v2      v3 
131072 0.077 0.076 0.136 0.076
262144 0.142 0.128 0.202 0.127
524288 0.268 0.229 0.374 0.230
1048576 0.516 0.433 0.717 0.435
2097152 1.019 0.853 1.400 0.855

一些更多的细节最近出现在 The 1D fftshift in CUDA by chessboard multiplicationGitHub page .

关于cuda - CUDA 中的一维 fftshift,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14187481/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com