gpt4 book ai didi

c - 尝试写入使用 cudaMalloc3D 分配的二维数组时出现“非法内存访问”

转载 作者:太空宇宙 更新时间:2023-11-04 08:25:08 25 4
gpt4 key购买 nike

我正在尝试使用 cudaMalloc3D 将展平的二维数组的内存分配和复制到设备上,以测试 cudaMalloc3D 的性能。但是当我尝试从内核写入数组时,它会抛出“遇到非法内存访问”异常。如果我只是从数组中读取,程序运行良好,但当我尝试写入时,出现错误。对此的任何帮助将不胜感激。下面是我的代码和编译代码的语法。

编译使用

nvcc -O2 -arch sm_20 test.cu 

代码:test.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y

#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )

#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */


// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError();\
if( e != cudaSuccess ) {\
printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}

typedef double REAL;
typedef int INT;


void meshGrid ( REAL *x, REAL *y )
{

INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}


void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}

for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}


__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{

INT i,j;

i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;

if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {

char *unewPtr = (char *) unew.ptr;
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);

REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
}

}


INT main(INT argc, char *argv[])
{

INT nTimeSteps = 10;

// pointers for the host side
REAL *unew, *u, *uold, *uFinal, *x, *y;

// allocate memory on the host
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));


// pointer for the device side
size_t pitch = NX * sizeof(REAL);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);

// allocate 3D memory on the device
cudaMalloc3D( &d_u, myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();


// initialize grid and wave
meshGrid( x, y );
initWave( u, uold, x, y );


// copy host memory to 3D device memory
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.kind = cudaMemcpyHostToDevice;

// copying u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
cpy3D.dstPtr = d_u;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();

// copying uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
cpy3D.dstPtr = d_uold;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();


// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );

for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
cudaThreadSynchronize();
cudaCheckError();

d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}

// copy the memory back to host
cpy3D.kind = cudaMemcpyDeviceToHost;

// copying d_unew to uFinal
cpy3D.srcPtr = d_unew;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();

free(u); cudaFree(d_u.ptr);
free(unew); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_uold.ptr);

free(uFinal); free(x); free(y);

return EXIT_SUCCESS;
}

最佳答案

此行没有出现错误的原因:

REAL tmp = unew_row[j]; // no error on this line

是因为编译器正在优化该行。它没有做任何有用的事情,所以编译器完全消除了它。编译器警告:

xxx.cu(87): warning: variable "tmp" was declared but never referenced

是对这种效果的暗示。

您的代码几乎是正确的。问题在这里:

REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);

应该是:

REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);

内核中的 i 变量是 width(即 X)维度。j 变量是高度(即 Y)维度。

高度是指你在哪一行,因此行间距应该乘以高度参数,即 j,而不是 i

同样,虽然它不是特定维度的特定故障的根源,但这段代码也可能不是您想要的:

REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error

例如,如果您打算计算行的偏移量,然后索引到行中(例如,可能设置分配中的每个元素)那么我想您会想要使用 i 而不是 j 作为你的最终索引:

REAL tmp = unew_row[i]; // no error on this line
unew_row[i] = 1.2; // this is where I get the error

但是,对于这个特定示例,这并不是非法内存访问的实际来源。

关于c - 尝试写入使用 cudaMalloc3D 分配的二维数组时出现“非法内存访问”,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30898815/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com