gpt4 book ai didi

来自 3D 数组的 CUDA 2D 分层纹理( float 与整数)

转载 作者:行者123 更新时间:2023-12-01 16:03:28 25 4
gpt4 key购买 nike

我得到了以下程序,它几乎是 SDK 示例“简单分层纹理”。

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA SDK samples

#define EXIT_WAIVED 2

static char *sSDKname = "simpleLayeredTexture";

// includes, kernels
// declare texture reference for layered 2D float texture
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType1D, etc.

typedef int TYPE;

texture<TYPE, cudaTextureType2DLayered> tex;

////////////////////////////////////////////////////////////////////////////////
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
// calculate this thread's data point
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

// 0.5f offset and division are necessary to access the original data points
// in the texture (such that bilinear interpolation will not be activated).
// For details, see also CUDA Programming Guide, Appendix D
float u = (x+0.5f) / (float) width;
float v = (y+0.5f) / (float) height;

// read from texture, do expected transformation and write to global memory
TYPE sample = tex2DLayered(tex, u, v, layer);
g_odata[layer*width*height + y*width + x] = sample;

printf("Sample %d\n", sample);
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
printf("[%s] - Starting...\n", sSDKname);

// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);

bool bResult = true;

// get number of SMs on this GPU
cudaDeviceProp deviceProps;

checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

if (deviceProps.major < 2)
{
printf("%s requires SM >= 2.0 to support Texture Arrays. Test will be waived... \n", sSDKname);
cudaDeviceReset();
exit(EXIT_SUCCESS);
}

// generate input data for layered texture
unsigned int width=16, height=16, num_layers = 5;
unsigned int size = width * height * num_layers * sizeof(TYPE);
TYPE *h_data = (TYPE *) malloc(size);

for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++)
{
h_data[layer*width*height + i] = 15;//(float)i;
}

// this is the expected transformation of the input data (the expected output)
TYPE *h_data_ref = (TYPE *) malloc(size);

for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++)
{
h_data_ref[layer*width*height + i] = h_data[layer*width*height + i];
}

// allocate device memory for result
TYPE *d_data = NULL;
checkCudaErrors(cudaMalloc((void **) &d_data, size));

// allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<TYPE>();
cudaArray *cu_3darray;
checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
cudaMemcpy3DParms myparms = {0};
myparms.srcPos = make_cudaPos(0,0,0);
myparms.dstPos = make_cudaPos(0,0,0);
myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
myparms.dstArray = cu_3darray;
myparms.extent = make_cudaExtent(width, height, num_layers);
myparms.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&myparms));

// set texture parameters
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
// tex.filterMode = cudaFilterModeLinear;
tex.filterMode = cudaFilterModePoint;
tex.normalized = true; // access with normalized texture coordinates

// Bind the array to the texture
checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
width, height, dimGrid.x, dimGrid.y);

transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0); // warmup (for better timing)

// check if kernel execution generated an error
getLastCudaError("warmup Kernel execution failed");

checkCudaErrors(cudaDeviceSynchronize());

StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);

// execute the kernel
for (unsigned int layer = 0; layer < num_layers; layer++)
transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);

// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");

checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer);
printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
printf("%.2f Mtexlookups/sec\n", (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
sdkDeleteTimer(&timer);

// allocate mem for the result on host side
TYPE *h_odata = (TYPE *) malloc(size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
bResult = compareData(h_odata, h_data_ref, width*height*num_layers, MIN_EPSILON_ERROR, 0.0f);

printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);

// cleanup memory
free(h_data);
free(h_data_ref);
free(h_odata);

checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFreeArray(cu_3darray));

cudaDeviceReset();

if (bResult)
printf("Success!");
else
printf("Failure!");

exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

如果我使用 int(或 uint)作为 TYPE,输出是正确的。对于 float,它会产生错误的结果,即始终为 0(尽管 SDK compareData 函数说一切都很好!?)。我开始相信 CUDA 中存在错误。我在 Kepler K20 上使用 5.0 版。

感谢任何建议和测试结果。代码应该可以按原样运行。

提前致谢,本

编辑:操作系统是 Linux (Ubuntu 12.04.2 LTS) x86_64 3.2.0-38-generic

最佳答案

这里的问题是如果你只改变这个:

typedef int TYPE;

为此:

typedef float TYPE;

那么内核中的这一行就不再正确了:

printf("Sample %d\n", sample);
^^

因为 printf 格式说明符 %d 对于 float 类型不正确。如果将该说明符更改为 %f,您将获得预期的输出:

$ cat t1519.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA SDK samples

#define EXIT_WAIVED 2

static char *sSDKname = "simpleLayeredTexture";

// includes, kernels
// declare texture reference for layered 2D float texture
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType1D, etc.

typedef float TYPE;

texture<TYPE, cudaTextureType2DLayered> tex;

////////////////////////////////////////////////////////////////////////////////
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
// calculate this thread's data point
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

// 0.5f offset and division are necessary to access the original data points
// in the texture (such that bilinear interpolation will not be activated).
// For details, see also CUDA Programming Guide, Appendix D
float u = (x+0.5f) / (float) width;
float v = (y+0.5f) / (float) height;

// read from texture, do expected transformation and write to global memory
TYPE sample = tex2DLayered(tex, u, v, layer);
g_odata[layer*width*height + y*width + x] = sample;

printf("Sample %f\n", sample);
}


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
printf("[%s] - Starting...\n", sSDKname);

// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);

bool bResult = true;

// get number of SMs on this GPU
cudaDeviceProp deviceProps;

checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

if (deviceProps.major < 2)
{
printf("%s requires SM >= 2.0 to support Texture Arrays. Test will be waived... \n", sSDKname);
cudaDeviceReset();
exit(EXIT_SUCCESS);
}

// generate input data for layered texture
unsigned int width=16, height=16, num_layers = 5;
unsigned int size = width * height * num_layers * sizeof(TYPE);
TYPE *h_data = (TYPE *) malloc(size);

for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++)
{
h_data[layer*width*height + i] = 15;//(float)i;
}

// this is the expected transformation of the input data (the expected output)
TYPE *h_data_ref = (TYPE *) malloc(size);

for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++)
{
h_data_ref[layer*width*height + i] = h_data[layer*width*height + i];
}

// allocate device memory for result
TYPE *d_data = NULL;
checkCudaErrors(cudaMalloc((void **) &d_data, size));

// allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<TYPE>();
cudaArray *cu_3darray;
checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
cudaMemcpy3DParms myparms = {0};
myparms.srcPos = make_cudaPos(0,0,0);
myparms.dstPos = make_cudaPos(0,0,0);
myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
myparms.dstArray = cu_3darray;
myparms.extent = make_cudaExtent(width, height, num_layers);
myparms.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&myparms));

// set texture parameters
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
// tex.filterMode = cudaFilterModeLinear;
tex.filterMode = cudaFilterModePoint;
tex.normalized = true; // access with normalized texture coordinates

// Bind the array to the texture
checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
width, height, dimGrid.x, dimGrid.y);

transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0); // warmup (for better timing)

// check if kernel execution generated an error
getLastCudaError("warmup Kernel execution failed");

checkCudaErrors(cudaDeviceSynchronize());

StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);

// execute the kernel
for (unsigned int layer = 0; layer < num_layers; layer++)
transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);

// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");

checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer);
printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
printf("%.2f Mtexlookups/sec\n", (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
sdkDeleteTimer(&timer);

// allocate mem for the result on host side
TYPE *h_odata = (TYPE *) malloc(size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
bResult = compareData(h_odata, h_data_ref, width*height*num_layers, MIN_EPSILON_ERROR, 0.0f);

printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);

// cleanup memory
free(h_data);
free(h_data_ref);
free(h_odata);

checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFreeArray(cu_3darray));

cudaDeviceReset();

if (bResult)
printf("Success!");
else
printf("Failure!");

exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1519.cu -o t1519
t1519.cu(15): warning: conversion from a string literal to "char *" is deprecated

t1519.cu(15): warning: conversion from a string literal to "char *" is deprecated

[user2@dc10 misc]$ cuda-memcheck ./t1519
========= CUDA-MEMCHECK
[simpleLayeredTexture] - Starting...
GPU Device 0: "Tesla V100-PCIE-32GB" with compute capability 7.0

CUDA device [Tesla V100-PCIE-32GB] has 80 Multi-Processors SM 7.0
Covering 2D data array of 16 x 16: Grid size is 2 x 2, each block has 8 x 8 threads
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
...
Sample 15.000000
Sample 15.000000
Sample 15.000000
Processing time: 13.991 msec
0.09 Mtexlookups/sec
Comparing kernel output to expected data
Host sample: 8964432 == 1
Success!========= ERROR SUMMARY: 0 errors
$

请注意,最终输出行仍然不正确,因为我没有修改那里不正确的 printf 格式说明符:

printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);

关于来自 3D 数组的 CUDA 2D 分层纹理( float 与整数),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/17107422/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com