- html - 出于某种原因,IE8 对我的 Sass 文件中继承的 html5 CSS 不友好?
- JMeter 在响应断言中使用 span 标签的问题
- html - 在 :hover 和 :active 上具有不同效果的 CSS 动画?
- html - 相对于居中的 html 内容固定的 CSS 重复背景?
我得到了以下程序,它几乎是 SDK 示例“简单分层纹理”。
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, kernels
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA SDK samples
// Exit code reserved for a waived (skipped) test run.
#define EXIT_WAIVED 2

// Sample name used in status messages. Declared const: string literals are
// immutable, and assigning one to a plain char* is a deprecated conversion
// (the compiler warns about it, as seen in the build log for this sample).
static const char *sSDKname = "simpleLayeredTexture";

// Element type stored in the layered texture. With int, the %d printf
// specifiers used elsewhere in this file are correct; changing TYPE to
// float requires changing those specifiers to %f as well.
typedef int TYPE;

// Legacy texture-reference for a layered 2D texture.
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType2DLayered.
texture<TYPE, cudaTextureType2DLayered> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform one layer of a layered 2D texture using texture lookups.
//! The transform is the identity: each texel is read from the texture
//! reference 'tex' and stored unchanged into global memory.
//!
//! Expected launch layout: 2D grid of 2D blocks, one thread per texel of a
//! width x height layer.
//!
//! @param g_odata  output buffer in global memory, indexed as
//!                 layer*width*height + y*width + x
//! @param width    layer width in texels
//! @param height   layer height in texels
//! @param layer    index of the layer to read and write
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
    // Calculate this thread's data point.
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: grids rarely divide data evenly in general; without this,
    // out-of-range threads would write past the end of g_odata.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // The 0.5f offset and division are necessary to access the original data
    // points in the texture (such that bilinear interpolation will not be
    // activated). For details, see the CUDA Programming Guide appendix on
    // texture fetching.
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;

    // Read from texture, do the expected transformation, write to global memory.
    TYPE sample = tex2DLayered(tex, u, v, layer);
    g_odata[layer * width * height + y * width + x] = sample;

    // Debug output only (device-side printf serializes and is very slow).
    // %d matches TYPE == int; it must change if TYPE changes.
    printf("Sample %d\n", sample);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Builds a small layered 2D texture (num_layers layers of width x height TYPE
// elements, all set to 15), binds it to the texture reference 'tex', launches
// transformKernel once per layer, and compares the device output against a
// host-side reference copy (the kernel is an identity transform).
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
printf("[%s] - Starting...\n", sSDKname);
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
bool bResult = true;
// get number of SMs on this GPU
cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
// Layered textures (texture arrays) require compute capability >= 2.0.
if (deviceProps.major < 2)
{
printf("%s requires SM >= 2.0 to support Texture Arrays. Test will be waived... \n", sSDKname);
cudaDeviceReset();
// NOTE(review): EXIT_WAIVED is defined above but EXIT_SUCCESS is used here;
// confirm which exit code downstream scripts expect for a waived run.
exit(EXIT_SUCCESS);
}
// generate input data for layered texture
unsigned int width=16, height=16, num_layers = 5;
unsigned int size = width * height * num_layers * sizeof(TYPE);
TYPE *h_data = (TYPE *) malloc(size);
// Every element of every layer is set to the constant 15.
for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++)
{
h_data[layer*width*height + i] = 15;//(float)i;
}
// this is the expected transformation of the input data (the expected output)
// -- an exact copy, since the kernel is an identity transform.
TYPE *h_data_ref = (TYPE *) malloc(size);
for (unsigned int layer = 0; layer < num_layers; layer++)
for (int i = 0; i < (int)(width * height); i++)
{
h_data_ref[layer*width*height + i] = h_data[layer*width*height + i];
}
// allocate device memory for result
TYPE *d_data = NULL;
checkCudaErrors(cudaMalloc((void **) &d_data, size));
// allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<TYPE>();
cudaArray *cu_3darray;
// cudaArrayLayered makes this a layered (texture-array) allocation rather
// than a plain 3D array.
checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
// Describe a single host->array 3D copy covering all layers at once.
cudaMemcpy3DParms myparms = {0};
myparms.srcPos = make_cudaPos(0,0,0);
myparms.dstPos = make_cudaPos(0,0,0);
myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
myparms.dstArray = cu_3darray;
myparms.extent = make_cudaExtent(width, height, num_layers);
myparms.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&myparms));
// set texture parameters
tex.addressMode[0] = cudaAddressModeWrap;
tex.addressMode[1] = cudaAddressModeWrap;
// tex.filterMode = cudaFilterModeLinear;
// Point filtering returns the exact stored texel values (no interpolation).
tex.filterMode = cudaFilterModePoint;
tex.normalized = true; // access with normalized texture coordinates
// Bind the array to the texture
checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));
// One thread per texel: 8x8 blocks exactly tiling the 16x16 layer.
dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
width, height, dimGrid.x, dimGrid.y);
transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0); // warmup (for better timing)
// check if kernel execution generated an error
getLastCudaError("warmup Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
// execute the kernel: one launch per texture layer
for (unsigned int layer = 0; layer < num_layers; layer++)
transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer);
printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
printf("%.2f Mtexlookups/sec\n", (width *height *num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
sdkDeleteTimer(&timer);
// allocate mem for the result on host side
TYPE *h_odata = (TYPE *) malloc(size);
// copy result from device to host (blocking cudaMemcpy; no extra sync needed)
checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
printf("Comparing kernel output to expected data\n");
#define MIN_EPSILON_ERROR 5e-3f
bResult = compareData(h_odata, h_data_ref, width*height*num_layers, MIN_EPSILON_ERROR, 0.0f);
// %d matches TYPE == int here; it must change to %f if TYPE becomes float.
printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);
// cleanup memory
free(h_data);
free(h_data_ref);
free(h_odata);
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFreeArray(cu_3darray));
cudaDeviceReset();
if (bResult)
printf("Success!");
else
printf("Failure!");
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
如果我使用 int(或 uint)作为 TYPE,输出是正确的。对于 float,它会产生错误的结果,即始终为 0(尽管 SDK compareData 函数说一切都很好!?)。我开始相信 CUDA 中存在错误。我在 Kepler K20 上使用 5.0 版。
感谢任何建议和测试结果。代码应该可以按原样运行。
提前致谢,本
编辑:操作系统是 Linux (Ubuntu 12.04.2 LTS) x86_64 3.2.0-38-generic
最佳答案
这里的问题是如果你只改变这个:
typedef int TYPE;
为此:
typedef float TYPE;
那么内核中的这一行就不再正确了:
printf("Sample %d\n", sample);
^^
因为 printf
格式说明符 %d
对于 float
类型不正确。如果将该说明符更改为 %f
,您将获得预期的输出:
$ cat t1519.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, kernels
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA SDK samples
// Exit code reserved for a waived (skipped) test run.
#define EXIT_WAIVED 2

// Sample name used in status messages. Declared const: string literals are
// immutable, and assigning one to a plain char* is a deprecated conversion
// (this is exactly the warning shown in the compile log below).
static const char *sSDKname = "simpleLayeredTexture";

// Element type stored in the layered texture. With float, every printf of a
// TYPE value must use %f, not %d.
typedef float TYPE;

// Legacy texture-reference for a layered 2D texture.
// Note: The "dim" field in the texture reference template is now deprecated.
// Instead, please use a texture type macro such as cudaTextureType2DLayered.
texture<TYPE, cudaTextureType2DLayered> tex;
////////////////////////////////////////////////////////////////////////////////
//! Transform one layer of a layered 2D texture using texture lookups.
//! The transform is the identity: each texel is read from the texture
//! reference 'tex' and stored unchanged into global memory.
//!
//! Expected launch layout: 2D grid of 2D blocks, one thread per texel of a
//! width x height layer.
//!
//! @param g_odata  output buffer in global memory, indexed as
//!                 layer*width*height + y*width + x
//! @param width    layer width in texels
//! @param height   layer height in texels
//! @param layer    index of the layer to read and write
////////////////////////////////////////////////////////////////////////////////
__global__ void
transformKernel(TYPE *g_odata, int width, int height, int layer)
{
    // Calculate this thread's data point.
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: grids rarely divide data evenly in general; without this,
    // out-of-range threads would write past the end of g_odata.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    // The 0.5f offset and division are necessary to access the original data
    // points in the texture (such that bilinear interpolation will not be
    // activated). For details, see the CUDA Programming Guide appendix on
    // texture fetching.
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;

    // Read from texture, do the expected transformation, write to global memory.
    TYPE sample = tex2DLayered(tex, u, v, layer);
    g_odata[layer * width * height + y * width + x] = sample;

    // Debug output only (device-side printf serializes and is very slow).
    // %f is the correct specifier for TYPE == float (promoted to double).
    printf("Sample %f\n", sample);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Builds a small layered 2D texture (num_layers layers of width x height
// float elements, all set to 15), binds it to the texture reference 'tex',
// launches transformKernel once per layer, and compares the device output
// against a host-side reference copy (the kernel is an identity transform).
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // Use the command-line specified CUDA device, otherwise the device with
    // the highest Gflops/s.
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // Report device name, SM count and compute capability.
    cudaDeviceProp deviceProps;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    // Layered textures (texture arrays) require compute capability >= 2.0.
    if (deviceProps.major < 2)
    {
        printf("%s requires SM >= 2.0 to support Texture Arrays. Test will be waived... \n", sSDKname);
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }

    // Generate input data for the layered texture: every element is 15.
    unsigned int width = 16, height = 16, num_layers = 5;
    unsigned int size = width * height * num_layers * sizeof(TYPE);
    TYPE *h_data = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data[layer * width * height + i] = 15;
        }

    // Expected output: an exact copy of the input, since the kernel is an
    // identity transform.
    TYPE *h_data_ref = (TYPE *) malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++)
        {
            h_data_ref[layer * width * height + i] = h_data[layer * width * height + i];
        }

    // Device buffer for the kernel output.
    TYPE *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **) &d_data, size));

    // Allocate a layered CUDA array and copy all layers in a single 3D copy.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<TYPE>();
    cudaArray *cu_3darray;
    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));

    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos = make_cudaPos(0, 0, 0);
    myparms.dstPos = make_cudaPos(0, 0, 0);
    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(TYPE), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent = make_cudaExtent(width, height, num_layers);
    myparms.kind = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    // Texture parameters: point filtering with normalized coordinates, so the
    // kernel reads back the exact stored texel values (no interpolation).
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = true; // access with normalized texture coordinates

    // Bind the layered array to the texture reference.
    checkCudaErrors(cudaBindTextureToArray(tex, cu_3darray, channelDesc));

    // One thread per texel: 8x8 blocks exactly tiling the 16x16 layer.
    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has 8 x 8 threads\n",
           width, height, dimGrid.x, dimGrid.y);

    // Warmup launch (for better timing), then check for launch/exec errors.
    transformKernel<<< dimGrid, dimBlock >>>(d_data, width, height, 0);
    getLastCudaError("warmup Kernel execution failed");
    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // Execute the kernel: one launch per texture layer.
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<< dimGrid, dimBlock, 0 >>>(d_data, width, height, layer);

    // Check if kernel execution generated an error.
    getLastCudaError("Kernel execution failed");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // Copy the result back to the host (blocking cudaMemcpy; no extra sync).
    TYPE *h_odata = (TYPE *) malloc(size);
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    printf("Comparing kernel output to expected data\n");
#define MIN_EPSILON_ERROR 5e-3f
    bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);

    // BUG FIX: TYPE is float, so %f (not %d) is the correct format specifier;
    // with %d this printed garbage ("Host sample: 8964432 == 1" in the log).
    // float varargs are promoted to double, which %f prints correctly.
    printf("Host sample: %f == %f\n", h_data_ref[0], h_odata[0]);

    // Cleanup memory on host and device.
    free(h_data);
    free(h_data_ref);
    free(h_odata);
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));
    cudaDeviceReset();

    if (bResult)
        printf("Success!");
    else
        printf("Failure!");

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
$ nvcc -I/usr/local/cuda/samples/common/inc t1519.cu -o t1519
t1519.cu(15): warning: conversion from a string literal to "char *" is deprecated
t1519.cu(15): warning: conversion from a string literal to "char *" is deprecated
[user2@dc10 misc]$ cuda-memcheck ./t1519
========= CUDA-MEMCHECK
[simpleLayeredTexture] - Starting...
GPU Device 0: "Tesla V100-PCIE-32GB" with compute capability 7.0
CUDA device [Tesla V100-PCIE-32GB] has 80 Multi-Processors SM 7.0
Covering 2D data array of 16 x 16: Grid size is 2 x 2, each block has 8 x 8 threads
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
Sample 15.000000
...
Sample 15.000000
Sample 15.000000
Sample 15.000000
Processing time: 13.991 msec
0.09 Mtexlookups/sec
Comparing kernel output to expected data
Host sample: 8964432 == 1
Success!========= ERROR SUMMARY: 0 errors
$
请注意,最终输出行仍然不正确,因为我没有修改那里不正确的 printf
格式说明符:
printf("Host sample: %d == %d\n", h_data_ref[0], h_odata[0]);
关于来自 3D 数组的 CUDA 2D 分层纹理(float 与整数),我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/17107422/
我正在尝试创建一个包含 int[][] 项的数组 即 int version0Indexes[][4] = { {1,2,3,4}, {5,6,7,8} }; int version1Indexes[
我有一个整数数组: private int array[]; 如果我还有一个名为 add 的方法,那么以下有什么区别: public void add(int value) { array[va
当您尝试在 JavaScript 中将一个数组添加到另一个数组时,它会将其转换为一个字符串。通常,当以另一种语言执行此操作时,列表会合并。 JavaScript [1, 2] + [3, 4] = "
根据我正在阅读的教程,如果您想创建一个包含 5 列和 3 行的表格来表示这样的数据... 45 4 34 99 56 3 23 99 43 2 1 1 0 43 67 ...它说你可以使用下
我通常使用 python 编写脚本/程序,但最近开始使用 JavaScript 进行编程,并且在使用数组时遇到了一些问题。 在 python 中,当我创建一个数组并使用 for x in y 时,我得
我有一个这样的数组: temp = [ 'data1', ['data1_a','data1_b'], ['data2_a','data2_b','data2_c'] ]; // 我想使用 toStr
rent_property (table name) id fullName propertyName 1 A House Name1 2 B
这个问题在这里已经有了答案: 关闭13年前。 Possible Duplicate: In C arrays why is this true? a[5] == 5[a] array[index] 和
使用 Excel 2013。经过多年的寻找和适应,我的第一篇文章。 我正在尝试将当前 App 用户(即“John Smith”)与他的电子邮件地址“jsmith@work.com”进行匹配。 使用两个
当仅在一个边距上操作时,apply 似乎不会重新组装 3D 数组。考虑: arr 1),但对我来说仍然很奇怪,如果一个函数返回一个具有尺寸的对象,那么它们基本上会被忽略。 最佳答案 这是一个不太理
我有一个包含 GPS 坐标的 MySQL 数据库。这是我检索坐标的部分 PHP 代码; $sql = "SELECT lat, lon FROM gps_data"; $stmt=$db->query
我需要找到一种方法来执行这个操作,我有一个形状数组 [批量大小, 150, 1] 代表 batch_size 整数序列,每个序列有 150 个元素长,但在每个序列中都有很多添加的零,以使所有序列具有相
我必须通过 url 中的 json 获取文本。 层次结构如下: 对象>数组>对象>数组>对象。 我想用这段代码获取文本。但是我收到错误 :org.json.JSONException: No valu
enter code here- (void)viewDidLoad { NSMutableArray *imageViewArray= [[NSMutableArray alloc] init];
知道如何对二维字符串数组执行修剪操作,例如使用 Java 流 API 进行 3x3 并将其收集回相同维度的 3x3 数组? 重点是避免使用显式的 for 循环。 当前的解决方案只是简单地执行一个 fo
已关闭。此问题需要 debugging details 。目前不接受答案。 编辑问题以包含 desired behavior, a specific problem or error, and the
我有来自 ASP.NET Web 服务的以下 XML 输出: 1710 1711 1712 1713
如果我有一个对象todo作为您状态的一部分,并且该对象包含数组列表,则列表内部有对象,在这些对象内部还有另一个数组listItems。如何更新数组 listItems 中 id 为“poi098”的对
我想将最大长度为 8 的 bool 数组打包成一个字节,通过网络发送它,然后将其解压回 bool 数组。已经在这里尝试了一些解决方案,但没有用。我正在使用单声道。 我制作了 BitArray,然后尝试
我们的数据库中有这个字段指示一周中的每一天的真/假标志,如下所示:'1111110' 我需要将此值转换为 boolean 数组。 为此,我编写了以下代码: char[] freqs = weekday
我是一名优秀的程序员,十分优秀!