
cuda - Is 1D texture memory access faster than 1D global memory access?

Reposted · Author: 行者123 · Updated: 2023-12-02 10:13:09

I am measuring the difference between standard and 1D texture memory access. For that purpose, I created two kernels:

__global__ void texture1D(float* doarray, int size)
{
    int index;
    //calculate each thread global index
    index = blockIdx.x*blockDim.x + threadIdx.x;
    //fetch global memory through texture reference
    doarray[index] = tex1Dfetch(texreference, index);
    return;
}
__global__ void standard1D(float* diarray, float* doarray, int size)
{
    int index;
    //calculate each thread global index
    index = blockIdx.x*blockDim.x + threadIdx.x;
    //fetch global memory directly
    doarray[index] = diarray[index];
    return;
}
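The tex1Dfetch call above requires a texture reference declared at file scope; the question does not show it, so the following declaration is an assumption consistent with the kernel and the later cudaBindTexture call (note that texture references are deprecated in recent CUDA releases in favour of texture objects):

//assumed declaration, not shown in the question: 1D texture reference for element-type float reads
texture<float, cudaTextureType1D, cudaReadModeElementType> texreference;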

Then I call each kernel and measure the time it takes:

//copy array from host to device memory
cudaMemcpy(diarray,harray,sizeof(float)*size,cudaMemcpyHostToDevice);

cudaEvent_t startEvent, stopEvent;
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventRecord(startEvent, 0) );

//bind texture reference with linear memory
cudaBindTexture(0,texreference,diarray,sizeof(float)*size);

//execute device kernel
texture1D<<<(int)ceil((float)size/threadSize),threadSize>>>(doarray,size);

//unbind texture reference to free resource
cudaUnbindTexture(texreference);

checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );

//copy result array from device to host memory
cudaMemcpy(horray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost);

//check result
checkResult(horray, harray, size);

cudaEvent_t startEvent2, stopEvent2;
checkCuda( cudaEventCreate(&startEvent2) );
checkCuda( cudaEventCreate(&stopEvent2) );
checkCuda( cudaEventRecord(startEvent2, 0) );
standard1D<<<(int)ceil((float)size/threadSize),threadSize>>>(diarray,doarray,size);
checkCuda( cudaEventRecord(stopEvent2, 0) );
checkCuda( cudaEventSynchronize(stopEvent2) );

//copy back to CPU
cudaMemcpy(horray,doarray,sizeof(float)*size,cudaMemcpyDeviceToHost);

and print the results:

float time, time2;
checkCuda( cudaEventElapsedTime(&time, startEvent, stopEvent) );
checkCuda( cudaEventElapsedTime(&time2, startEvent2, stopEvent2) );
printf("Texture bandwidth (GB/s): %f\n", bytes * 1e-6 / time);
printf("Standard bandwidth (GB/s): %f\n", bytes * 1e-6 / time2);

It turns out that, no matter what array size (size) I allocate, the standard bandwidth is always much higher. Is that how it is supposed to be, or am I messing something up somewhere? My understanding of texture memory access is that it can speed up global memory access.

Best Answer

I compared global memory and texture memory (used purely as a cache, not for filtering) for the interpolation of a 1D complex-valued function.

The kernels I am comparing are 2 using global memory and 2 using texture memory. They differ in the way the complex values are accessed (as 1 float2 or as 2 floats) and are reported below. I will post the full Visual Studio 2010 project somewhere, in case anyone wants to criticize it or run their own tests.

__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (j < N)
    {
        float reg_x_out = x_out_d[j/2] + M/2;
        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);
        float dk = data_d[2*k+(j&1)];
        float dkp1 = data_d[2*k+2+(j&1)];
        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}
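For reference, the expression a * dkp1 + (-dk * a + dk) is just the standard linear interpolation dk + a * (dkp1 - dk), written so that each term maps onto a multiply-add; an equivalent, purely illustrative formulation:

//equivalent interpolation step written with explicit fused multiply-adds (illustration only)
float t      = fmaf(-dk, a, dk);    // dk - a*dk
float result = fmaf(a, dkp1, t);    // a*dkp1 + (dk - a*dk)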

__global__ void linear_interpolation_kernel_function_GPU_alternative(float2* __restrict__ result_d, const float2* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (j < N)
    {
        float reg_x_out = x_out_d[j] + M/2;
        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);
        float2 dk = data_d[k];
        float2 dkp1 = data_d[k+1];
        result_d[j].x = a * dkp1.x + (-dk.x * a + dk.x);
        result_d[j].y = a * dkp1.y + (-dk.y * a + dk.y);
    }
}

__global__ void linear_interpolation_kernel_function_GPU_texture(float2* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (j < N)
    {
        float reg_x_out = x_out_d[j] + M/2;
        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);
        float2 dk = tex1Dfetch(data_d_texture, k);
        float2 dkp1 = tex1Dfetch(data_d_texture, k+1);
        result_d[j].x = a * dkp1.x + (-dk.x * a + dk.x);
        result_d[j].y = a * dkp1.y + (-dk.y * a + dk.y);
    }
}

__global__ void linear_interpolation_kernel_function_GPU_texture_alternative(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (j < N)
    {
        float reg_x_out = x_out_d[j/2] + M/4;
        int k = __float2int_rz(reg_x_out);
        float a = reg_x_out - __int2float_rz(k);
        float dk = tex1Dfetch(data_d_texture2, 2*k+(j&1));
        float dkp1 = tex1Dfetch(data_d_texture2, 2*k+2+(j&1));
        result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}
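The two texture kernels rely on texture references data_d_texture (float2 fetches) and data_d_texture2 (float fetches) that are not shown above. A minimal sketch of how they might be declared and bound, assuming the data occupy M complex samples in one linear device buffer (everything apart from the names taken from the kernels is an assumption):

//assumed file-scope texture references matching the tex1Dfetch calls above
texture<float2, cudaTextureType1D, cudaReadModeElementType> data_d_texture;    //float2 accesses
texture<float,  cudaTextureType1D, cudaReadModeElementType> data_d_texture2;   //float accesses

//assumed host-side binding of the same linear device array before the kernel launches
cudaBindTexture(NULL, data_d_texture,  data_d, M*sizeof(float2));
cudaBindTexture(NULL, data_d_texture2, data_d, 2*M*sizeof(float));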

I considered 4 different GPUs, namely a GeForce GT540M (cc 2.1), a Tesla C2050 (cc 2.0), a Kepler K20c (cc 3.5) and a GT210 (cc 1.2). The results are reported in the figures below. As can be seen, using texture as a cache gives an improvement over global memory on the older compute capabilities, while the two solutions are essentially equivalent on the most recent architecture.

Of course, this example is not exhaustive, and in practice there can be other cases in which the former or the latter should be preferred for a particular application.

P.S. The processing times are in [ms], not in [s] as indicated in the figure labels.

[Figures: processing-time results for the GT210, Tesla C2050, GeForce GT540M and Kepler K20c]

Regarding cuda - Is 1D texture memory access faster than 1D global memory access?, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/19860094/
