
cuda - Using cudaMemcpy3D to transfer a ***pointer


I am trying to use cudaMemcpy3D to transfer a dynamically allocated 3D matrix (tensor). The tensor is allocated as one contiguous block of memory (see the code below). I have tried various combinations of cudaExtent and cudaMemcpy3DParms, but the ordering of the elements gets mixed up. I created the following example to demonstrate the issue:

#include <stdio.h>

int ***alloc_tensor(int Nx, int Ny, int Nz) {
  int i, j;
  int ***tensor;

  tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
  tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
  tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));

  for(j = 1; j < Ny; j++)
    tensor[0][j] = tensor[0][j-1] + Nz;
  for(i = 1; i < Nx; i++) {
    tensor[i] = tensor[i - 1] + Ny;
    tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
    for(j = 1; j < Ny; j++)
      tensor[i][j] = tensor[i][j - 1] + Nz;
  }

  return tensor;
}

__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
  int i, j, k;
  char *tensorslice;
  int *tensorrow;

  for (i = 0; i < Nx; i++) {
    for (j = 0; j < Ny; j++) {
      for (k = 0; k < Nz; k++) {
        tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
        tensorrow = (int *)(tensorslice + i * tensor.pitch);
        printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
      }
    }
  }
}

int main() {
  int i, j, k, value = 0;
  int Nx = 2, Ny = 6, Nz = 4;

  int ***h_tensor;
  struct cudaPitchedPtr d_tensor;

  h_tensor = alloc_tensor(Nx, Ny, Nz);
  cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

  for(i = 0; i < Nx; i++) {
    for(j = 0; j < Ny; j++) {
      for(k = 0; k < Nz; k++) {
        h_tensor[i][j][k] = value++;
        printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
      }
    }
  }

  cudaMemcpy3DParms cpy = { 0 };
  cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
  cpy.dstPtr = d_tensor;
  cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
  cpy.kind = cudaMemcpyHostToDevice;

  cudaMemcpy3D(&cpy);

  kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);

  // ... clean-up
}

The output for the host variable (h_tensor) and the device variable (d_tensor) differs and looks like this:

h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...

d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...

What am I doing wrong? What is the correct way to use cudaMemcpy3D?

Best Answer

  1. Any time you are having trouble with CUDA code, it is good practice to do proper CUDA error checking. The code you have posted here, at least for me, did not run correctly - the cudaMemcpy3D line threw an error. This is due to item 2 below. (I suspect the code you used to generate the output is different from the code you have posted here, but that is just a guess.) A minimal error-checking sketch is shown after this list.
  2. Your usage of make_cudaPitchedPtr is not correct:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);

    Review the API documentation. Making a CUDA pitched pointer this way is no different between the 2D and 3D case, so it makes no sense to pass 3 different dimensions as you do. Instead do this:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
  3. The remaining issues I found come down to a misunderstanding of 3 dimensions in C. The last subscript of a multiply-subscripted array is the rapidly varying dimension, i.e. the dimension in which adjacent values in memory occupy adjacent index values. Your use of Z for the third dimension was therefore confusing: your host allocation used Nx in the first subscript position, but your device indexing did not match. There are obviously multiple ways to handle this. If you do not like my arrangement you can change it, but the host and device indexing must match.
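
For item 1, here is a minimal error-checking sketch. The gpuErrchk macro and the idea of wrapping every runtime call are illustrative choices, not part of the original answer; any equivalent check of the returned cudaError_t serves the same purpose.

#include <stdio.h>
#include <stdlib.h>

// Minimal CUDA error-checking helper (illustrative). It prints the error
// string and aborts if a runtime call does not return cudaSuccess.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

// Usage around the calls from the code in this question, e.g.:
//   gpuErrchk(cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz)));
//   gpuErrchk(cudaMemcpy3D(&cpy));
//   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
//   gpuErrchk(cudaGetLastError());       // reports kernel launch errors
//   gpuErrchk(cudaDeviceSynchronize());  // reports errors during kernel execution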

In any event, the following modified code works for me:

#include <stdio.h>

int ***alloc_tensor(int Nx, int Ny, int Nz) {
  int i, j;
  int ***tensor;

  tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
  tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
  tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));

  for(j = 1; j < Ny; j++)
    tensor[0][j] = tensor[0][j-1] + Nz;
  for(i = 1; i < Nx; i++) {
    tensor[i] = tensor[i - 1] + Ny;
    tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
    for(j = 1; j < Ny; j++)
      tensor[i][j] = tensor[i][j - 1] + Nz;
  }

  return tensor;
}

__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
  int i, j, k;
  char *tensorslice;
  int *tensorrow;

  for (i = 0; i < Nx; i++) {
    for (j = 0; j < Ny; j++) {
      for (k = 0; k < Nz; k++) {
        tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Ny;
        tensorrow = (int *)(tensorslice + j * tensor.pitch);
        printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[i]);
      }
    }
  }
}

int main() {
  int i, j, k, value = 0;
  int Nx = 2, Ny = 6, Nz = 4;

  int ***h_tensor;
  struct cudaPitchedPtr d_tensor;

  h_tensor = alloc_tensor(Nz, Ny, Nx);
  cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

  for(i = 0; i < Nx; i++) {
    for(j = 0; j < Ny; j++) {
      for(k = 0; k < Nz; k++) {
        h_tensor[k][j][i] = value++;
        //printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
      }
    }
  }
  for(i = 0; i < Nx; i++) {
    for(j = 0; j < Ny; j++) {
      for(k = 0; k < Nz; k++) {
        //h_tensor[i][j][k] = value++;
        printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);
      }
    }
  }

  cudaMemcpy3DParms cpy = { 0 };
  cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
  cpy.dstPtr = d_tensor;
  cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
  cpy.kind = cudaMemcpyHostToDevice;

  cudaMemcpy3D(&cpy);

  kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
  cudaDeviceSynchronize();
  // ... clean-up
}
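
As a further sanity check, which is not part of the original answer, the same cudaMemcpy3D parameters can be reversed (device as source, host as destination) to copy the pitched device allocation back and compare it against the original host tensor. A minimal sketch, assuming it is placed at the "// ... clean-up" point of main() above and reuses i, j, k, Nx, Ny, Nz, h_tensor, d_tensor and alloc_tensor from that code:

  // Copy the device tensor back into a second host tensor with the same layout.
  int ***h_check = alloc_tensor(Nz, Ny, Nx);

  cudaMemcpy3DParms back = { 0 };
  back.srcPtr = d_tensor;
  back.dstPtr = make_cudaPitchedPtr(h_check[0][0], Nx * sizeof(int), Nx, Ny);
  back.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
  back.kind = cudaMemcpyDeviceToHost;
  cudaMemcpy3D(&back);

  // Element-wise comparison; any mismatch means host and device indexing disagree.
  for (i = 0; i < Nx; i++)
    for (j = 0; j < Ny; j++)
      for (k = 0; k < Nz; k++)
        if (h_check[k][j][i] != h_tensor[k][j][i])
          printf("mismatch at [%d][%d][%d]\n", i, j, k);

  // Release resources: one cudaFree for the pitched device allocation and
  // three free calls per host tensor, mirroring the three mallocs in alloc_tensor.
  cudaFree(d_tensor.ptr);
  free(h_check[0][0]);  free(h_check[0]);  free(h_check);
  free(h_tensor[0][0]); free(h_tensor[0]); free(h_tensor);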

Regarding "cuda - Using cudaMemcpy3D to transfer a ***pointer", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/23310520/
