gpt4 book ai didi

struct - 将结构传递到CUDA内核

转载 作者:行者123 更新时间:2023-12-03 21:27:01 36 4
gpt4 key购买 nike

我是 CUDA C 的新手,正在尝试把一个用 typedef 定义的结构体传递给内核。当结构体只包含 int 时,我的方法运行良好;但换成 float 后,得到的结果却是毫无意义的数字。我原以为这与对齐有关,于是尝试在类型声明中加入 __align__,但没有效果。有人能给我一个示例,或者提供替代方法吗?我希望做到只需修改结构体和内核,就能轻松添加或删除字段,而无需改动其他任何地方。我的代码:

// Point record holding two float fields, a and b.
// __align__(8) requests 8-byte alignment for the type; the accepted answer
// further down this page reports that the specifier was not needed.
typedef struct __align__(8)
{
float a, b;
} point;

// Kernel: each thread stores the constants 1.1 and 2.2 into the point at its
// flat global index (blockIdx.x * blockDim.x + threadIdx.x).
// There is no bounds check, so the launch must not create more threads than
// p has elements.
__global__ void testKernel(point *p)
{
// flat 1D global thread index
int i = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): 1.1 and 2.2 are double literals converted to float on store.
p[i].a = 1.1;
p[i].b = 2.2;
}

// Question's original driver, kept exactly as posted: allocates host and
// device buffers of 16 points, launches testKernel over 4 blocks of 4 threads,
// copies the result back and prints it.
// NOTE(review): this is the failing version discussed in the question; the
// defects are marked inline and fixed in the answer's code further down.
int main(void)
{
// set number of points
int numPoints = 16,
gpuBlockSize = 4,
pointSize = sizeof(point),
numBytes = numPoints * pointSize,
gpuGridSize = numPoints / gpuBlockSize;

// allocate memory
// NOTE(review): both new[] results are overwritten by the malloc/cudaMalloc
// calls right below, so these two host arrays are never freed.
point *cpuPointArray = new point[numPoints],
*gpuPointArray = new point[numPoints];
cpuPointArray = (point*)malloc(numBytes);
cudaMalloc((void**)&gpuPointArray, numBytes);

// launch kernel
// 4 blocks x 4 threads = 16 threads, one per point (the kernel has no guard)
testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);

// retrieve the results
cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
printf("testKernel results:\n");
for(int i = 0; i < numPoints; ++i)
{
// NOTE(review): %d expects int but the arguments are float values; this
// mismatch is what the accepted answer identifies as the source of the
// nonsense output (the answer uses %f).
printf("point.a: %d, point.b: %d\n",cpuPointArray[i].a,cpuPointArray[i].b);
}

// deallocate memory
free(cpuPointArray);
cudaFree(gpuPointArray);

return 0;
}

最佳答案

由于似乎找不到关于这种做法的像样文档,我决定在这里贴出最终修订后的代码。事实证明,__align__ 并不是必需的;真正的问题在于打印浮点数时在 printf 中使用了 %d,而浮点数应使用 %f。

#include <stdlib.h>
#include <stdio.h>

/* Plain two-component point record shared by host and device code. */
typedef struct
{
    float a;  /* first component  */
    float b;  /* second component */
} point;

/*
 * Fills an array of points with fixed test values.
 *
 * Each thread writes a = 1.1f and b = 2.2f into the element at its flat
 * global index (blockIdx.x * blockDim.x + threadIdx.x).
 *
 * p: device pointer to the point array to fill.
 *
 * Precondition: the kernel performs no bounds check, so the launch must
 * create no more threads than p has elements (gridDim.x * blockDim.x must
 * not exceed the array length).
 */
__global__ void testKernel(point *p)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // float literals (f suffix) match the float fields; a bare 1.1 is a double
    p[i].a = 1.1f;
    p[i].b = 2.2f;
}

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * status: value returned by the CUDA runtime call.
 * what:   short label for the call, used in the message.
 *
 * Returns 0 when status is cudaSuccess, 1 otherwise.
 */
static int cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Allocates a host and a device array of points, runs testKernel with one
 * thread per point, copies the result back to the host and prints it.
 *
 * Returns 0 on success and EXIT_FAILURE if an allocation, the kernel launch
 * or the copy fails; results are printed only when every step succeeded.
 */
int main(void)
{
    // set number of points
    // testKernel has no bounds check, so the grid must cover exactly
    // numPoints threads: keep numPoints a multiple of gpuBlockSize.
    const int numPoints = 16,
              gpuBlockSize = 4,
              gpuGridSize = numPoints / gpuBlockSize;
    const size_t numBytes = numPoints * sizeof(point);

    // allocate memory
    point *cpuPointArray = (point*)malloc(numBytes),
          *gpuPointArray = NULL;
    if (cpuPointArray == NULL)
    {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)numBytes);
        return EXIT_FAILURE;
    }
    int failed = cudaFailed(cudaMalloc((void**)&gpuPointArray, numBytes),
                            "cudaMalloc");

    // launch kernel
    if (!failed)
    {
        testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
        // a launch does not return a status; query it explicitly
        failed = cudaFailed(cudaGetLastError(), "testKernel launch");
    }

    // retrieve the results
    if (!failed)
    {
        // blocking copy: also surfaces errors raised while the kernel ran
        failed = cudaFailed(cudaMemcpy(cpuPointArray, gpuPointArray, numBytes,
                                       cudaMemcpyDeviceToHost),
                            "cudaMemcpy");
    }
    if (!failed)
    {
        printf("testKernel results:\n");
        for (int i = 0; i < numPoints; ++i)
        {
            printf("point.a: %f, point.b: %f\n",cpuPointArray[i].a,cpuPointArray[i].b);
        }
    }

    // deallocate memory (cudaFree on a null pointer performs no operation)
    free(cpuPointArray);
    cudaFree(gpuPointArray);

    return failed ? EXIT_FAILURE : 0;
}

关于struct - 将结构传递到CUDA内核,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/4176762/

36 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com