gpt4 book ai didi

c - cuda程序内存泄漏

转载 作者:行者123 更新时间:2023-11-30 20:07:20 24 4
gpt4 key购买 nike

我写了一个CUDA程序来模拟John Conway's Game of Life ,但有时我会遇到内存泄漏(出现不应该存在的单元格)。

这是我的内核:

__global__ void gameOfLife(matrix pcuda_main,int lblock,int generations_to_run) {
/************************************
notice: first dimension is one section for read and one for
write(change purpose every simulation generation). 2 extra section rows
for first and last line not need to check their up and down position evry time
also edge lines ensure that blocks handles edge sectors are having edge lines less that other blocks
************************************/
__shared__ unsigned int section[2][SECTION_SIZE][CELLS_IN_LINE];
int i,j;
unsigned int sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b;
unsigned int left_top,left,left_bot,right_top,right,right_bot;
int read_section=0;
int write_section=1;
int bx = blockIdx.x;
int row = SECTION_ROWS*blockIdx.x+threadIdx.x;
int rowx = threadIdx.x+SECTION_ROWS;

// I am zeroeing the perimiters lines since they dont loaded with values and can be corrupt while more lines may be zeroes its done for avoiding ifs
section[0][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[0][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[0][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[0][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
section[1][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
__syncthreads(); // ensure no crashes between zeroeing perimiter lines and loading data
// since entire warp access the first and last cell together no extrag edges are needed however only first and last line access lines beyond the edges
for(i=0;i<CELLS_IN_LINE;i++) {
if ( bx > 0 ) {
section[0][rowx-SECTION_ROWS+1][i] = pcuda_main[((row-SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
}
section[0][rowx+1][i] = pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i];
if ( bx < lblock ) {
// not last block sector row
section[0][rowx+SECTION_ROWS+1][i] = pcuda_main[((row+SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
}
}
__syncthreads(); // ensure all data read
for ( i=0;i<generations_to_run;i++ ) {
for(j=0;j<CELLS_IN_LINE;j++) {
if ( bx > 0 ) {
if ( j > 0 ) {
left_top = section[read_section][rowx-SECTION_ROWS][j-1];
left = section[read_section][rowx-SECTION_ROWS+1][j-1];
left_bot = section[read_section][rowx-SECTION_ROWS+2][j-1];
} else {
left_top = 0;
left=0;
left_bot=0;
}
if ( j<CELLS_IN_LINE_RESIDUE ) {
right_top= section[read_section][rowx-SECTION_ROWS][j+1];
right= section[read_section][rowx-SECTION_ROWS+1][j+1];
right_bot= section[read_section][rowx-SECTION_ROWS+2][j+1];
} else {
right_top = 0;
right=0;
right_bot=0;
}

CELL32(section[write_section][rowx-SECTION_ROWS+1][j],
left_top,section[read_section][rowx-SECTION_ROWS][j],right_top,
left,section[read_section][rowx-SECTION_ROWS+1][j],right,
left_bot,section[read_section][rowx-SECTION_ROWS+2][j],right_bot,
sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
}
if ( j > 0 ) {
left_top = section[read_section][rowx][j-1];
left = section[read_section][rowx+1][j-1];
left_bot = section[read_section][rowx+2][j-1];
} else {
left_top = 0;
left=0;
left_bot=0;
}
if ( j<CELLS_IN_LINE_RESIDUE ) {
right_top= section[read_section][rowx][j+1];
right= section[read_section][rowx+1][j+1];
right_bot= section[read_section][rowx+2][j+1];
} else {
right_top = 0;
right=0;
right_bot=0;
}
CELL32(section[write_section][rowx+1][j],
left_top,section[read_section][rowx][j],right_top,
left,section[read_section][rowx+1][j],right,
left_bot,section[read_section][rowx+2][j],right_bot,
sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
if ( bx < lblock ) {
if ( j > 0 ) {
left_top = section[read_section][rowx+SECTION_ROWS][j-1];
left = section[read_section][rowx+SECTION_ROWS+1][j-1];
left_bot = section[read_section][rowx+SECTION_ROWS+2][j-1];
} else {
left_top = 0;
left=0;
left_bot=0;
}
if ( j<CELLS_IN_LINE_RESIDUE ) {
right_top= section[read_section][rowx+SECTION_ROWS][j+1];
right= section[read_section][rowx+SECTION_ROWS+1][j+1];
right_bot= section[read_section][rowx+SECTION_ROWS+2][j+1];
} else {
right_top = 0;
right=0;
right_bot=0;
}
CELL32(section[write_section][rowx+SECTION_ROWS+1][j],
left_top,section[read_section][rowx+SECTION_ROWS][j],right_top,
left,section[read_section][rowx+SECTION_ROWS+1][j],right,
left_bot,section[read_section][rowx+SECTION_ROWS+2][j],right_bot,
sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
}
}
read_section = read_section^1;
write_section = write_section^1;
//printf("passed %u generation for row: %u\n",i,row);
__syncthreads();
}

// now writing back to the global memory notice write section turns into read section after every generation so
// I write the read section
for(i=0;i<CELLS_IN_LINE;i++) {
pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i] = section[read_section][rowx+1][i];
}
__syncthreads();
}

这是单元格 32 的定义及其依赖项:

#define HALFADDER(s0,s1,a0,a1)do{s1=(a0)&(a1);s0=(a0)^(a1);}while(0)
#define FULLADDER(s0,s1,a0,a1,a2)do{s1=((a0)&(a1))|((a2)&((a0)^(a1)));s0 =(a2)^((a0)^(a1));}while(0)

#define CELL32(output,top_left,top,top_right,left,cur,right,bot_left,bot,bot_right,sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b)do{FULLADDER(sum_top_b0,sum_top_b1,(top_left<<31)|(top>>1),top,(top_right>>31)|(top<<1));HALFADDER(sum_cur_b0,sum_cur_b1,(left<<31)|(cur>>1),(right>>31)|(cur<<1));FULLADDER(sum_bot_b0,sum_bot_b1,(bot_left<<31)|(bot>>1),bot,((bot_right>>31)|(bot<<1)));FULLADDER(newone,newtwo,sum_bot_b0,sum_cur_b0,sum_top_b0);FULLADDER(newtwo,new4a,newtwo,sum_bot_b1,sum_top_b1);HALFADDER(newtwo,new4b,newtwo,sum_cur_b1);newone=newone|cur;output=newone&newtwo&(~new4a)&(~new4b);}while(0)

这是主循环和内存副本:

cudaMalloc((void **)(&pcuda),NUM_CELLS*sizeof(unsigned int));
//cudaMalloc((void **)(&pdata),ROWS*sizeof(int));
cudaMemcpy((void *)pcuda,(void *)p,sizeof(unsigned int)*NUM_CELLS,cudaMemcpyHostToDevice);
cudaDeviceSynchronize();

while ( generations > 0 ) {
if ( generations > SECTION_ROWS ) {
generations_run = SECTION_ROWS;
} else {
generations_run = generations;
}
generations -= generations_run;
printf("running params last_row:%u, generations_run:%u, generations left:%u,grid size:%d, array size in bytes: %u,last cell index: %d\n",
ROWS-SECTION_ROWS,generations_run,generations,dimGrid.x,NUM_CELLS*sizeof(unsigned int),((ROWS-1)<<LINE_NUMBER_BITS_OFFSET)+CELLS_IN_LINE_RESIDUE);
gameOfLife<<<dimGrid,dimBlock>>>(pcuda,dimGrid.x-1,generations_run);
error = cudaDeviceSynchronize();
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
lineid = __LINE__;
err = error;
}
else
{
printf("GPU Device %d:synchronized\n", devID);
}
}
cudaMemcpy((void *)p,(void *)pcuda,sizeof(int)*NUM_CELLS,cudaMemcpyDeviceToHost);

内存泄漏在哪里?

最佳答案

CUDA 工具包附带 cuda-memcheck,默认情况下,它将检查内核内的越界访问。它还具有其他模式,包括泄漏检查器。请注意,您需要在退出之前调用 cudaDeviceReset(),以便该工具知道查找未释放的设备内存。

关于c - cuda程序内存泄漏,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14536431/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com