c - cuda程序内存泄漏-6ren

c - cuda程序内存泄漏

转载作者：行者123 更新时间：2023-11-30 20:07:20

我写了一个CUDA程序来模拟John Conway's Game of Life ，但有时我会遇到内存泄漏(出现不应该存在的单元格)。

这是我的内核:

__global__ void gameOfLife(matrix pcuda_main,int lblock,int generations_to_run) {
/************************************
notice: first dimension is one section for read and one for 
write(change purpose every simulation generation). 2 extra section rows
for first and last line not need to check their up and down position evry time
also edge lines ensure that blocks handles edge sectors are having edge lines less that other blocks
************************************/
    __shared__ unsigned int section[2][SECTION_SIZE][CELLS_IN_LINE];
    int i,j;
    unsigned int sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b;
    unsigned int left_top,left,left_bot,right_top,right,right_bot;
    int read_section=0;
    int write_section=1;
    int bx = blockIdx.x;
    int row = SECTION_ROWS*blockIdx.x+threadIdx.x;
    int rowx = threadIdx.x+SECTION_ROWS;

    // I am zeroeing the perimiters lines since they dont loaded with values and can be corrupt while more lines may be zeroes its done for avoiding ifs
    section[0][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][rowx>>LINE_NUMBER_BITS_OFFSET][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][SECTION_ROWS-1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][3*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[0][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    section[1][2*SECTION_ROWS+1-(rowx>>LINE_NUMBER_BITS_OFFSET)][rowx&CELLS_IN_LINE_RESIDUE] = 0;
    __syncthreads(); // ensure no crashes between zeroeing perimiter lines and loading data
    // since entire warp access the first and last cell together no extrag edges are needed however only first and last line access lines beyond the edges
    for(i=0;i<CELLS_IN_LINE;i++) {
        if ( bx > 0 ) {
            section[0][rowx-SECTION_ROWS+1][i] = pcuda_main[((row-SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
        }
        section[0][rowx+1][i] = pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i];
        if ( bx < lblock ) {
            // not last block sector row
            section[0][rowx+SECTION_ROWS+1][i] = pcuda_main[((row+SECTION_ROWS)<<LINE_NUMBER_BITS_OFFSET)+i];
        }
    }
    __syncthreads(); // ensure all data read
    for ( i=0;i<generations_to_run;i++ ) {
        for(j=0;j<CELLS_IN_LINE;j++) {
            if ( bx > 0 ) {
                if ( j > 0 ) {
                    left_top = section[read_section][rowx-SECTION_ROWS][j-1];
                    left = section[read_section][rowx-SECTION_ROWS+1][j-1];
                    left_bot = section[read_section][rowx-SECTION_ROWS+2][j-1];
                } else {
                    left_top = 0;
                    left=0;
                    left_bot=0;
                }
                if ( j<CELLS_IN_LINE_RESIDUE ) {
                    right_top= section[read_section][rowx-SECTION_ROWS][j+1];
                    right= section[read_section][rowx-SECTION_ROWS+1][j+1];
                    right_bot= section[read_section][rowx-SECTION_ROWS+2][j+1];
                } else {
                    right_top = 0;
                    right=0;
                    right_bot=0;
                }

                CELL32(section[write_section][rowx-SECTION_ROWS+1][j],
                    left_top,section[read_section][rowx-SECTION_ROWS][j],right_top,
                    left,section[read_section][rowx-SECTION_ROWS+1][j],right,
                    left_bot,section[read_section][rowx-SECTION_ROWS+2][j],right_bot,
                    sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            }
            if ( j > 0 ) {
                left_top = section[read_section][rowx][j-1];
                left = section[read_section][rowx+1][j-1];
                left_bot = section[read_section][rowx+2][j-1];
            } else {
                left_top = 0;
                left=0;
                left_bot=0;
            }
            if ( j<CELLS_IN_LINE_RESIDUE ) {
                right_top= section[read_section][rowx][j+1];
                right= section[read_section][rowx+1][j+1];
                right_bot= section[read_section][rowx+2][j+1];
            } else {
                right_top = 0;
                right=0;
                right_bot=0;
            }
            CELL32(section[write_section][rowx+1][j],
                left_top,section[read_section][rowx][j],right_top,
                left,section[read_section][rowx+1][j],right,
                left_bot,section[read_section][rowx+2][j],right_bot,
                sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            if ( bx < lblock ) {
                if ( j > 0 ) {
                    left_top = section[read_section][rowx+SECTION_ROWS][j-1];
                    left = section[read_section][rowx+SECTION_ROWS+1][j-1];
                    left_bot = section[read_section][rowx+SECTION_ROWS+2][j-1];
                } else {
                    left_top = 0;
                    left=0;
                    left_bot=0;
                }
                if ( j<CELLS_IN_LINE_RESIDUE ) {
                    right_top= section[read_section][rowx+SECTION_ROWS][j+1];
                    right= section[read_section][rowx+SECTION_ROWS+1][j+1];
                    right_bot= section[read_section][rowx+SECTION_ROWS+2][j+1];
                } else {
                    right_top = 0;
                    right=0;
                    right_bot=0;
                }
                CELL32(section[write_section][rowx+SECTION_ROWS+1][j],
                    left_top,section[read_section][rowx+SECTION_ROWS][j],right_top,
                    left,section[read_section][rowx+SECTION_ROWS+1][j],right,
                    left_bot,section[read_section][rowx+SECTION_ROWS+2][j],right_bot,
                    sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b);
            }
        }
        read_section = read_section^1;
        write_section = write_section^1;
        //printf("passed %u generation for row: %u\n",i,row);
        __syncthreads();
    }

    // now writing back to the global memory notice write section turns into read section after every generation so 
    // I write the read section
    for(i=0;i<CELLS_IN_LINE;i++) {
        pcuda_main[((row)<<LINE_NUMBER_BITS_OFFSET)+i] = section[read_section][rowx+1][i];
    }
    __syncthreads();
}

这是单元格 32 的定义及其依赖项:

#define HALFADDER(s0,s1,a0,a1)do{s1=(a0)&(a1);s0=(a0)^(a1);}while(0)
#define FULLADDER(s0,s1,a0,a1,a2)do{s1=((a0)&(a1))|((a2)&((a0)^(a1)));s0 =(a2)^((a0)^(a1));}while(0)

#define CELL32(output,top_left,top,top_right,left,cur,right,bot_left,bot,bot_right,sum_top_b1,sum_top_b0,sum_cur_b1,sum_cur_b0,sum_bot_b1,sum_bot_b0,newone,newtwo,new4a,new4b)do{FULLADDER(sum_top_b0,sum_top_b1,(top_left<<31)|(top>>1),top,(top_right>>31)|(top<<1));HALFADDER(sum_cur_b0,sum_cur_b1,(left<<31)|(cur>>1),(right>>31)|(cur<<1));FULLADDER(sum_bot_b0,sum_bot_b1,(bot_left<<31)|(bot>>1),bot,((bot_right>>31)|(bot<<1)));FULLADDER(newone,newtwo,sum_bot_b0,sum_cur_b0,sum_top_b0);FULLADDER(newtwo,new4a,newtwo,sum_bot_b1,sum_top_b1);HALFADDER(newtwo,new4b,newtwo,sum_cur_b1);newone=newone|cur;output=newone&newtwo&(~new4a)&(~new4b);}while(0)

这是主循环和内存副本:

cudaMalloc((void **)(&pcuda),NUM_CELLS*sizeof(unsigned int));
    //cudaMalloc((void **)(&pdata),ROWS*sizeof(int));
    cudaMemcpy((void *)pcuda,(void *)p,sizeof(unsigned int)*NUM_CELLS,cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();

    while ( generations > 0 ) {
        if ( generations > SECTION_ROWS ) {
            generations_run = SECTION_ROWS;
        } else {
            generations_run = generations;
        }
        generations -= generations_run;
        printf("running params last_row:%u, generations_run:%u, generations left:%u,grid size:%d, array size in bytes: %u,last cell index: %d\n",
            ROWS-SECTION_ROWS,generations_run,generations,dimGrid.x,NUM_CELLS*sizeof(unsigned int),((ROWS-1)<<LINE_NUMBER_BITS_OFFSET)+CELLS_IN_LINE_RESIDUE);
        gameOfLife<<<dimGrid,dimBlock>>>(pcuda,dimGrid.x-1,generations_run);
        error = cudaDeviceSynchronize();
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
            lineid = __LINE__;
            err = error;
        }
        else
        {
            printf("GPU Device %d:synchronized\n", devID);
        }
    }
    cudaMemcpy((void *)p,(void *)pcuda,sizeof(int)*NUM_CELLS,cudaMemcpyDeviceToHost);

内存泄漏在哪里？

最佳答案

CUDA 工具包附带 cuda-memcheck，默认情况下，它将检查内核内的越界访问。它还具有其他模式，包括泄漏检查器。请注意，您需要在退出之前调用 cudaDeviceReset()，以便该工具知道查找未释放的设备内存。

关于c - cuda程序内存泄漏，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/14536431/

文章推荐： javascript - 一个奇怪的 moment.js 事情

文章推荐： c# - 如何使用正则表达式禁止重复字符

文章推荐： c# - 在 Linux 上导入和开发 WinForms 应用程序

文章推荐： javascript - OAuth2 : how to get token?

java - Android - IntentReceiver 泄漏/泄漏/泄漏
IntentReceiver 正在泄漏由于 onDetachedFromWindow 在某些情况下未被调用。 @Override protected void onDetachedFromWind
iPhone - UIImage 泄漏、CGBitmapContextCreateImage 泄漏
好吧，我很难追踪这个内存泄漏。运行此脚本时，我没有看到任何内存泄漏，但我的 objectalloc 正在攀升。 Instruments 指向 CGBitmapContextCreateImage >
iphone - 使用仪器检测 C 泄漏(泄漏)
我编写了一个测试代码来检查如何使用 Instrument(Leaks)。我创建了一个单一 View 应用程序，单击按钮后我加载了一个像这样的新 View ... - (IBAction)btn_clk
Python单调增加内存使用量(泄漏？)
我正在使用这个简单的代码并观察单调增加的内存使用量。我正在使用这个小模块将内容转储到磁盘。我观察到它发生在 unicode 字符串上而不是整数上，我做错了什么吗？当我这样做时: >>> from u
iphone - NSXMLParser 泄漏
我有以下泄漏的代码。 Instruments 表示，泄漏的是 rssParser 对象。我“刷新”了 XML 提要，它运行了该 block 并且发生了泄漏...... 文件.h @interface
iphone - NSFileManager 泄漏
我在我编写的以下代码片段中发现了内存泄漏 NSFileManager *fileManager=[[NSFileManager alloc] init]; fileList=[[fileManager
javascript - 一段时间后HTML5音频停止工作(泄漏)
因此，我正在开发HTML5 / javascript rts游戏。观察一直有几种声音在播放。因此，对我来说，是一段时间后声音听起来像是“崩溃”，并且此浏览器选项卡上的所有声音都停止了工作。我只能通过重
perl - 了解开发::泄漏
下面是我正在使用的一段代码及其输出。 my $handle; my $enterCount = Devel::Leak::NoteSV($handle); print "$date entry $en
Go goroutine 泄漏
在这篇关于 go-routines 泄漏的帖子之后，https://www.ardanlabs.com/blog/2018/11/goroutine-leaks-the-forgotten-sende
c - 执行后内存(泄漏)
我想知道为什么在执行 ./a.out 后随机得到以下结果。有什么想法我做错了吗？谢谢 http://img710.imageshack.us/img710/8708/trasht.png 最佳答案正
ios - 二维码扫描后内存峰值/泄漏
我正在 Swift 中开发一个应用程序，在呈现捕获我放在一起的二维码的自定义 ViewController 后，我注意到出现了巨大的内存跳跃。该代码本质上基于以下示例:http://www.appc
javascript - XMLHttpRequest 泄漏
下面是我的 javascript 代码片段。它没有按预期运行，请帮我解决这个问题。 function getCurrentLocation() { console.log("insi
java - 卡夫卡生产者巨大的内存使用(泄漏？)
我们在生产环境中部署了 3 个代理 Kafka 0.10.1.0。有些应用程序嵌入了 Kafka Producer，它们将应用程序日志发送到某个主题。该主题有 10 个分区，复制因子为 3。我们观察
iphone - appendFormat 泄漏
我正在使用仪器来检测一些泄漏，但有一些泄漏我无法解决； NSMutableString *textedetails = [[NSMutableString alloc] init];
ios - 性能工具 - 泄漏
如果我使用性能工具测试我的代码 - 泄漏，它没有检测到任何泄漏。这是否意味着代码没有泄漏任何内存？我有一个越狱的 iPhone，我可以监控可用内存。如果有人知道，那就是 SBSettings。我测试
ios - ABPersonCopyImageData 泄漏
我在从 AddressBook 中获取图像时遇到了很大的问题，下面我粘贴了我的代码。此 imageData 从未被释放，在我的 Allocations Instruments 上它看起来总是在内存中它
ios - class_copyIvarList 泄漏
- (NSMutableArray *)getArrayValue:(NSArray *)array{ NSMutableArray *valueArray = [NSMutableArra
读取字符串值时 iPhone 泄漏
Instruments 工具说这是一个泄漏，有什么想法吗？我在 for 循环结束时释放变量对象在上述方法的开头，这就是我设置变量对象的方式，即自动释放； NSMutableArray *varia
ios - 泄漏 CGImageMergeXMPPropsWhithLegacyProps
我正在跟踪我的 iOS 应用程序的内存泄漏，我有一个奇怪的泄漏导致我的应用程序崩溃......负责的框架是:CGImageMergeXMPPropsWhithLegacyProps。在某些时候，我的应
ios - NSOperationQueue 泄漏？
我正在尝试使用 NSOperationQueue 在后台线程中执行一个方法，如下所示: NSOperationQueue *queue = [NSOperationQueue new]; NS

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c - cuda程序内存泄漏