gpt4 book ai didi

c++ - cudaMemcpy 结构设备主机不工作

转载 作者:太空宇宙 更新时间:2023-11-04 13:06:16 26 4
gpt4 key购买 nike

当我尝试将节点数组从设备复制回主机时,我在 Node.m[...] 中得到的是零而不是值,即使当我在内核中打印节点时它显示值设置正确。不幸的是,我无法自己发现任何错误,所以我恳请您提供帮助。我使用 visual studio 编译器和计算能力 3 编译代码。来自 this 的代码答案对我有用。

我粘贴了整个代码,但只有有意义的部分是

__global__ void divideLeft(Node * nodes,float * leftSide){...}

divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <conio.h>
#include <new>
#include <cmath>

#define ERRCHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (wait) getch();
if (abort) exit(code);
}
}

#define MSIZE 36
#define INPUT_SIZE(N) N*5 - 3*2
#define PARENT(i) (i-1)/2
#define LEFT(i) 2*i + 1
#define RIGHT(i) 2*i + 2
#define BOTTOM_HEAP_NODES_COUNT(N) (N-2)/3 //size of input must be 2+3n,n>1
#define HEAP_SIZE(N) 2*BOTTOM_HEAP_NODES_COUNT(N)-1
#define FIRST_LEVEL_SIZE 19
#define ROW_LENGTH 5
#define FIRST_LVL_MAT_SIZE 5
#define XY(x,y) x*6+y

__constant__ int dHigherTreeLevelThreshold;
__constant__ int dNodesCount;
__constant__ int dLeftSize;
__constant__ int dHeapSize;
__constant__ int dBottomNodes;
__constant__ int dRemainingNodes;
__constant__ int dRightCols;
__constant__ int dInputCount;

struct Node
{
float m[MSIZE];
float *x;
};

__device__ __host__ void printNode(Node node);
__global__ void divideLeft(Node * nodes,float * leftSide)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx>=dBottomNodes)
return;
int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes;
// printf("%d %d\n",idx,nodeIdx);
Node node = nodes[nodeIdx];
idx*=5*3;
node.m[XY(3,3)] = leftSide[idx+2]/3;
node.m[XY(3,2)] = leftSide[idx+3]/2;
node.m[XY(3,1)] = leftSide[idx+4];

node.m[XY(2,3)] = leftSide[idx+6]/2;
node.m[XY(2,2)] = leftSide[idx+7]*2/3;
node.m[XY(2,1)] = leftSide[idx+8];
node.m[XY(2,4)] = leftSide[idx+9];

node.m[XY(1,3)] = leftSide[idx+10];
node.m[XY(1,2)] = leftSide[idx+11];
node.m[XY(1,1)] = leftSide[idx+12];
node.m[XY(1,4)] = leftSide[idx+13];
node.m[XY(1,5)] = leftSide[idx+14];

node.m[XY(4,2)] = leftSide[idx+15];
node.m[XY(4,1)] = leftSide[idx+16];
node.m[XY(4,4)] = leftSide[idx+17]*2/3;
node.m[XY(4,5)] = leftSide[idx+18]/2;

node.m[XY(5,1)] = leftSide[idx+20];
node.m[XY(5,4)] = leftSide[idx+21]/2;
node.m[XY(5,5)] = leftSide[idx+22]/3;
printNode(node);
}

void leftSideInit(float * leftSide,int size)
{
for(int i = 0;i<size;i++)
{
leftSide[i] = 1;//(i+1)%26;
}
}

int main(){
ERRCHECK(cudaSetDevice(0));

int leftCount = 11;
int leftSize = leftCount*5;
int rightSize = 10;
int heapSize = HEAP_SIZE(leftCount);
int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount);
int greatestPowerOfTwo = pow(2,(int)log2(bottomNodes));
int remainingNodes = heapSize - greatestPowerOfTwo;

ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int)));

float * leftSide = new float[leftSize];
float * rightSide = new float[rightSize];
Node * nodes = new Node[heapSize];
Node * dNodes = nullptr;
float * dLeftSide =nullptr;
leftSideInit(leftSide,leftSize);

ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize));
ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize));
ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float)));
ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice));
divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);
delete [] nodes;
cudaFree(dNodes);

ERRCHECK(cudaDeviceReset());

getch();
return 0;
}

__device__ __host__ void printNode(Node node)
{
for(int i= 0;i<6;i++)
printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",node.m[XY(i,0)],node.m[XY(i,1)],node.m[XY(i,2)],node.m[XY(i,3)],node.m[XY(i,4)],node.m[XY(i,5)]);

}

最佳答案

在您的内核中,您制作了您正在处理的 Node 的本地拷贝:

Node node = nodes[nodeIdx];

内核的其余部分继续修改 node 的元素,您的本地拷贝。

但在所有修改完成后,您永远不会将本地拷贝复制回全局拷贝,因此全局拷贝保持不变。

要解决这个问题,一种可能是在内核末尾添加这一行:

nodes[nodeIdx] = node;

顺便说一句,我注意到您的 struct Node 包含一个指针变量:

struct Node
{
float m[MSIZE];
float *x;
};

您应该意识到使用带有嵌入式指针的结构数组可能会有些特殊的复杂性。您实际上还没有使用该变量 (x),所以我只是将其作为注释提及。您可能需要引用 cuda tag info page有关此概念的规范问题(“在 CUDA 中使用指针数组”)。

关于c++ - cudaMemcpy 结构设备主机不工作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/42180066/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com