gpt4 book ai didi

c++ - 在cuda中分配结构数组后变量丢失

转载 作者:行者123 更新时间:2023-11-30 03:22:08 24 4
gpt4 key购买 nike

我在 C 中有一个包含结构数组的结构,我需要在 GPU 中复制它。为此,我正在编写一个函数,使结构中的变量的一些 cudaMalloccudaMemcpy 从主机到设备。

该结构的一个简单版本(真正的版本内部有各种结构和变量/数组)是:

struct Node {

float* position;

};

struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};

我的问题是我一定是在结构的内存分配和复制中做错了什么。当我使用 Graph 复制变量时,我可以看到它们已正确复制(通过在内核中访问它,如下例所示)。例如,我可以检查 graph.nBoundary=3

但是,如果我不分配和复制Node *的内存,我只能看到这个。如果这样做,我将得到 -858993460 而不是 3。有趣的是,Node * 没有错误分配,因为我可以检查 say graph.node[0].pos[0] 的值并且它具有正确的值。

这只发生在 graph.nBoundary 中。所有其他变量都保持正确的数值,但这个变量在运行 Node*cudaMemcpy 时会“出错”。

我哪里做错了,为什么会这样?我该如何解决?

如果您需要更多信息,请告诉我。


MCVE:

#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

float* position;

};

struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

__global__ void testKernel(Graph* graph,unsigned int * d_res){
d_res[0] = graph->nBoundary;

};
int main()
{

// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++){
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}

// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));

//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

printf("%u\n", graph.nBoundary);
printf("%d", h_res[0]);

return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));

//copy constants
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));


// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));


//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));

// Crate auxiliary pointers to grab them from host and pass them to device
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));

for (int i = 0; i < inGraph->nNode; i++){

// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));

}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////

gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));



return outGraph;
}

最佳答案

问题出在函数 cudaGraphMalloc 中,您在其中尝试将设备内存分配给已经在设备上分配的 outGraph 成员。在这样做的过程中,您正在取消对主机上非法设备指针的引用。

要为设备上存在的struct 类型变量的成员分配设备内存,我们首先必须创建该struct 类型的临时主机变量,然后分配设备内存到其成员,然后将其复制到设备上存在的结构。

我回答过类似的问题here .请看一下。

固定代码可能是这样的:

#include <algorithm>
#include <cuda_runtime.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

float* position;

};

struct Graph {
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

__global__ void testKernel(Graph* graph, unsigned int * d_res) {
d_res[0] = graph->nBoundary;

};
int main()
{

// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++) {
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++) {
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}

// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));

//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

printf("%u\n", graph.nBoundary);
printf("%u\n", h_res[0]);

return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph)
{
//Create auxiliary Graph variable on host
Graph temp;

//copy constants
temp.nNode = inGraph->nNode;
temp.nBoundary = inGraph->nBoundary;

// copy boundary
gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));


//Create nodes
size_t nodeBytesTotal = temp.nNode * sizeof(Node);
gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));

for (int i = 0; i < temp.nNode; i++)
{
//Create auxiliary node on host
Node auxNodeHost;

//Allocate device memory to position member of auxillary node
size_t nodeBytes = 3 * sizeof(float);
gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));

//Copy auxillary host node to device
Node* dPtr = temp.node + i;
gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
}


Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));

return outGraph;
}

请注意您必须保留内部设备指针(即辅助主机变量)的主机拷贝。这是因为稍后您将不得不释放设备内存,并且由于您在主代码中只有 Graph 的设备拷贝,您将无法从主机访问其成员以调用cudaFree 在他们身上。在这种情况下,变量 Node auxNodeHost(在每次迭代中创建)和 Graph temp 就是这些变量。

上面的代码并没有这样做,只是为了演示目的。

在 Windows 10、Visual Studio 2015、CUDA 9.2、NVIDIA 驱动程序 397.44 上测试。

关于c++ - 在cuda中分配结构数组后变量丢失,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/51408172/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com