gpt4 book ai didi

CUDA C - 如何使用Texture2D实现 double 浮点

转载 作者:行者123 更新时间:2023-11-30 20:05:05 26 4
gpt4 key购买 nike

我想使用纹理 2D 内存来实现 double 。我想从纹理读取到共享内存并将 int2 转换为 double,然后传输回主机内存但我只根据需要获取第一行,所有其他行的值为 2.00000000。

#include<stdio.h>
#include<cuda.h>
#define Xdim 8
#define Ydim 8
texture<int2,2>me_texture;

static __inline__ __device__ double fetch_double(int2 p){
return __hiloint2double(p.y, p.x);
}

__global__ void kern(double *o, int pitch){
__shared__ double A[Xdim][Ydim];
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
int2 jj;

if(i<Xdim && j<Ydim){

jj = tex2D(me_texture, i, j);

A[threadIdx.x][threadIdx.y] = fetch_double(jj);
}
__syncthreads();

if(i<Xdim && j<Ydim){
o[j*Xdim + i] = A[threadIdx.x][threadIdx.y];
}
}

int main(int argc, char *argv[]){
double hbuf[Xdim][Ydim];
double hout[Xdim][Ydim];
double *dob;
double *dbuf;
size_t pitch_bytes;

cudaMallocPitch((void**)&dbuf, &pitch_bytes, sizeof(double)*Xdim, Ydim);
cudaMallocPitch((void**)&dob, &pitch_bytes, sizeof(double)*Xdim, Ydim);


hbuf[0][0] = 1.234567891234567;
hbuf[0][1] = 12.34567891234567;
hbuf[0][2] = 123.4567891234567;
hbuf[0][3] = 1234.567891234567;
hbuf[0][4] = 12345.67891234567;
hbuf[0][5] = 123456.7891234567;
hbuf[0][6] = 1234567.891234567;
hbuf[0][7] = 12345678.91234567;
hbuf[1][0] = 123456789.1234567;
hbuf[1][1] = 1234567891.234567;
hbuf[1][2] = 12345678912.34567;
hbuf[1][3] = 123456789123.4567;
hbuf[1][4] = 1234567891234.567;
hbuf[1][5] = 12345678912345.67;
hbuf[1][6] = 123456789123456.7;
hbuf[1][7] = 1234567891234567;
hbuf[2][0] = 123456789.7654321;
hbuf[2][1] = 1234567897.654321;
hbuf[2][2] = 12345678976.54321;
hbuf[2][3] = 123456789765.4321;
hbuf[2][4] = 1234567897654.321;
hbuf[2][5] = 12345678976543.21;
hbuf[2][6] = 123456789765432.1;
hbuf[2][7] = 1234567897654321;
hbuf[3][0] = 9.876543211234567;
hbuf[3][1] = 98.76543211234567;
hbuf[3][2] = 987.6543211234567;
hbuf[3][3] = 9876.543211234567;
hbuf[3][4] = 98765.43211234567;
hbuf[3][5] = 987654.3211234567;
hbuf[3][6] = 9876543.211234567;
hbuf[3][7] = 98765432.11234567;
hbuf[4][0] = 987654321.1234567;
hbuf[4][1] = 9876543211.234567;
hbuf[4][2] = 98765432112.34567;
hbuf[4][3] = 987654321123.4567;
hbuf[4][4] = 9876543211234.567;
hbuf[4][5] = 98765432112345.67;
hbuf[4][6] = 987654321123456.7;
hbuf[4][7] = 9876543211234567;
hbuf[5][0] = 987654321.7654321;
hbuf[5][1] = 9876543217.654321;
hbuf[5][2] = 98765432176.54321;
hbuf[5][3] = 987654321765.4321;
hbuf[5][4] = 9876543217654.321;
hbuf[5][5] = 98765432176543.21;
hbuf[5][6] = 987654321765432.1;
hbuf[5][7] = 9876543217654321;
hbuf[6][0] = 1234567891234567;
hbuf[6][1] = 123456789123456.7;
hbuf[6][2] = 12345678912345.67;
hbuf[6][3] = 1234567891234.567;
hbuf[6][4] = 123456789123.4567;
hbuf[6][5] = 12345678912.34567;
hbuf[6][6] = 1234567891.234567;
hbuf[6][7] = 123456789.1234567;
hbuf[7][0] = 12345678.91234567;
hbuf[7][1] = 1234567.891234567;
hbuf[7][2] = 123456.7891234567;
hbuf[7][3] = 12345.67891234567;
hbuf[7][4] = 1234.567891234567;
hbuf[7][5] = 123.4567891234567;
hbuf[7][6] = 12.34567891234567;
hbuf[7][7] = 1.234567891234567;
for (int i=0; i<Xdim; i++){
for(int j=0; j<Ydim; j++){

printf("%.16f\t", hbuf[i][j]);
}
printf("\n");
}

cudaMemcpy2D(dbuf, pitch_bytes, hbuf, Xdim*sizeof(double), Xdim*sizeof(double), Ydim, cudaMemcpyHostToDevice);

me_texture.addressMode[0] = cudaAddressModeClamp;
me_texture.addressMode[1] = cudaAddressModeClamp;
me_texture.filterMode = cudaFilterModeLinear;
me_texture.normalized = false;

cudaBindTexture2D(0, me_texture, dbuf, cudaCreateChannelDesc(32,32,0,0, cudaChannelFormatKindSigned), Xdim, Ydim, pitch_bytes );

int pitch = pitch_bytes/sizeof(double);

kern<<<1, 64>>>(dob, pitch);

cudaMemcpy2D(hout,Xdim*sizeof(double), dob, pitch_bytes, Xdim*sizeof(double),Ydim, cudaMemcpyDeviceToHost);

printf("\nI am Fine\n");

for(int i = 0 ; i < Xdim ; i++){
for(int j=0; j<Ydim; j++){
printf("%.16f\t", hout[i][j]);
}
printf("\n");
}
cudaUnbindTexture(me_texture);
cudaFree(dbuf);
cudaFree(dob);
return 0;
}

最佳答案

如果您更改以下内容,上面的代码可以正常工作。替换

kern<<<1, 64>>>(..., ..)

dim3 blockPerGrid(1, 1)
dim3 threadPerBlock(8, 8)
kern<<<blockPerGrid, threadPerBlock>>>(....)

此处将 Xdim 更改为 pitch

 o[j*pitch + i] = A[threadIdx.x][threadIdx.y]; 

并将 cudaFilterModeLinear 更改为 cudaFilterModePoint 。编译时需要指定计算能力,假设你的计算能力是3.0,那么就是

 nvcc -arch=sm_30 file.cu 

关于CUDA C - 如何使用Texture2D实现 double 浮点,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/33739373/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com