memory - 为什么 CudaFree 似乎没有释放内存？-6ren

memory - 为什么 CudaFree 似乎没有释放内存？

转载作者：IT王子更新时间：2023-10-28 23:35:27

我正在尝试分配设备内存，复制到它，在 GPU 上执行计算，将结果复制回来，然后释放我分配的设备内存。我想确保我没有超出限制，我想看看共享内存空间中是否有足够的内存来转储一些数组。

当我分配设备内存时，没有返回错误。当我使用 cudaMemGetInfo 检查分配的内存量时，看起来一个 cudaMalloc 没有分配任何内存。此外，当我尝试释放内存时，似乎只释放了一个指针。

我正在使用 matlab Mexfunction 接口(interface)来设置 GPU 内存并启动内核。在这一点上，我什至没有调用内核，只是返回一个单位矩阵作为结果。

cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);  

/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;

/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceScattDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not allocate memory to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}  
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedReal\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
    mexPrintf("could not allocate memory to deviceReceivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));

/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to devicePulseDelay\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceTarDistance\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);   
if (cudaErr != cudaSuccess)
{  
    mexPrintf("could not copy to deviceScattDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice); 
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could not copy to deviceScatterers\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));  

/* call the kernel */
// launchKernel<<<1,512>>>(........);   

/* retireve the output */  
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);   
if (cudaErr != cudaSuccess)
{   
    mexPrintf("could not copy to receivedReal\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost); 
if (cudaErr != cudaSuccess)
{ 
    mexPrintf("could not copy to receivedImag\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem); 
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));   

/* free the memory. */ 
mexPrintf("\nFree'ing memory.\n");   
cudaMemGetInfo(&freeMem, &totalMem);  
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);  
cudaErr = cudaFree(devicePulseDelay); 
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free devicePulseDelay\n");   
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceTarDistance);   
if (cudaErr != cudaSuccess) 
{
    mexPrintf("could free deviceTarDistance\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScattDistance);   
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScattDistance\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}   
cudaMemGetInfo(&allocMem, &totalMem);   
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceScatterers);  
if (cudaErr != cudaSuccess) 
{   
    mexPrintf("could free deviceScatterers\n");  
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));   
}   
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));  
cudaErr = cudaFree(deviceReceivedReal);  
if (cudaErr != cudaSuccess) 
{  
    mexPrintf("could free deviceReceivedReal\n"); 
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
} 
cudaMemGetInfo(&allocMem, &totalMem);  
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));   
cudaErr = cudaFree(deviceReceivedImag);   
if (cudaErr != cudaSuccess) 
{ 
    mexPrintf("could free deviceReceivedImag\n");
    mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));  
}   
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));

下面是这个的输出:

 Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576

我觉得我缺少一些明显的东西。任何人都可以帮助解释发生了什么吗？

编辑:平台是带有 Tesla C2050 GPU 卡的 windows 7。

最佳答案

malloc 在调用时直接从主机操作系统获取内存分配，而 free 在调用时直接将它们释放回主机操作，这是一个很常见的误解。但是它们几乎总是不会那样工作，而是标准库维护一个循环的 free'd 和 malloc'd 内存列表，通过与主机操作系统交互来机会主义地扩展和收缩(有关更多信息，请参阅 How do malloc() and free() work? 上的一些答案如果您有兴趣，请详细说明)。不管它是如何工作的，这都会导致一些不直观的结果，包括通常不可能像操作系统所说的那样分配尽可能多的内存是免费的，分配有时似乎不会改变可用内存的数量，并且 free 有时对操作系统所说的空闲内存量没有影响。

虽然我只有经验证据支持这一点，但我相信 CUDA 的工作方式完全相同。上下文维护自己的 malloc'd 和 free'd 内存列表，并将扩展和收缩该列表中保存的内存作为主机驱动程序/窗口管理器和 GPU 本身允许。所有硬件都有一个特有的 MMU 页面大小，有证据表明 NVIDIA GPU 上的页面大小相当大。这意味着 cudaMalloc 调用的粒度相当粗略，这意味着有时 malloc 似乎不会影响可用内存量或消耗比请求更多的内存，并且有时 free 调用似乎没有任何效果(如果您有兴趣，可以找到一个小工具来帮助说明 CUDA 驱动程序 here 的页面大小行为，尽管它是为早期版本的CUDA API，并且可能需要进行一些更改才能与现代版本一起编译)。我相信这是您观察到的行为最可能的解释。

顺便说一句，如果我使用 GT200 系列设备运行您在 MacOS 10.6 上发布的代码的简化版本:

#include <cstdio>

#define mexPrintf printf

inline void gpuAssert(cudaError_t code, char *file, int line, 
                 bool abort=true)
{
   if (code != cudaSuccess) 
   {
      mexPrintf("GPUassert: %s %s %d\n", cudaGetErrorString(code),
          file, line);
      if (abort) exit(code);
   }
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuMemReport(size_t * avail, size_t * total, 
        const char * title = 0, const size_t * free = 0, const bool sense = true) 
{
    char tstring[32] = { '\0' };
    gpuErrchk( cudaMemGetInfo(avail, total) );  

    if (free) {
        if (title) {
            strncpy(tstring, title, 31);
        }
        mexPrintf("%s Memory avaliable: Free: %zu, Total: %zu, %s: %zu\n",
                tstring, *avail, *total, (sense) ? "Allocated\0" : "Freed\0", 
                (sense) ? (*free - *avail) : (*avail - *free));
    } else {
        mexPrintf("Memory avaliable: Free: %zu, Total: %zu\n", *avail, *total);  
    }
}

int main()
{
    size_t freeMem = 0;
    size_t totalMem = 0;
    size_t allocMem = 0;

    gpuErrchk( cudaFree(0) );
    gpuMemReport(&freeMem, &totalMem);

    double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
    double *deviceReceivedReal, *deviceReceivedImag;

    mexPrintf("Allocating memory.\n");
    gpuErrchk( cudaMalloc( (void **) &devicePulseDelay, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceTarDistance, sizeof(double)*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceScattDistance, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceScatterers, sizeof(double)*999) );
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceReceivedReal, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem);

    gpuErrchk( cudaMalloc( (void **) &deviceReceivedImag, sizeof(double)*999*512) );
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem);

    mexPrintf("\nFree'ing memory.\n");   
    gpuMemReport(&freeMem, &totalMem);

    gpuErrchk( cudaFree(devicePulseDelay) ); 
    gpuMemReport(&allocMem, &totalMem, "devicePulseDelay:", &freeMem, false);

    gpuErrchk( cudaFree(deviceTarDistance) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceTarDistance:", &freeMem, false);

    gpuErrchk( cudaFree(deviceScattDistance) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceScattDistance:", &freeMem, false);

    gpuErrchk( cudaFree(deviceScatterers) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceScatterers:", &freeMem, false);

    gpuErrchk( cudaFree(deviceReceivedReal) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedReal:", &freeMem, false);

    gpuErrchk( cudaFree(deviceReceivedImag) ); 
    gpuMemReport(&allocMem, &totalMem, "deviceReceivedImag:", &freeMem, false);

    return 0;
}

我得到了不同的结果，但也显示了相同的现象:

Allocating memory.
devicePulseDelay: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceTarDistance: Memory avaliable: Free: 202870784, Total: 265027584, Allocated: 1048576
deviceScattDistance: Memory avaliable: Free: 198778880, Total: 265027584, Allocated: 5140480
deviceScatterers: Memory avaliable: Free: 197730304, Total: 265027584, Allocated: 6189056
deviceReceivedReal: Memory avaliable: Free: 193638400, Total: 265027584, Allocated: 10280960
deviceReceivedImag: Memory avaliable: Free: 189546496, Total: 265027584, Allocated: 14372864

Free'ing memory.
Memory avaliable: Free: 189546496, Total: 265027584
devicePulseDelay: Memory avaliable: Free: 189546496, Total: 265027584, Freed: 0
deviceTarDistance: Memory avaliable: Free: 190595072, Total: 265027584, Freed: 1048576
deviceScattDistance: Memory avaliable: Free: 194686976, Total: 265027584, Freed: 5140480
deviceScatterers: Memory avaliable: Free: 195735552, Total: 265027584, Freed: 6189056
deviceReceivedReal: Memory avaliable: Free: 199827456, Total: 265027584, Freed: 10280960
deviceReceivedImag: Memory avaliable: Free: 203919360, Total: 265027584, Freed: 14372864

这表明该行为也取决于硬件/主机操作系统。

关于memory - 为什么 CudaFree 似乎没有释放内存？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/10394755/

文章推荐： android - 使用 JUnit 对 SparseArray 进行单元测试(使用 JVM)

文章推荐： android - 在 Android Studio 中放置 credentials.json 的位置

文章推荐： android - 我可以将连接的设备设置为默认启动目标吗？

opengl - cudaFree - 无效的设备指针错误
我正在尝试释放设备内存 dev_inp我在我的 CUDA + OpenGL 互操作代码中分配的。经过错误检查，我收到了 Invalid Device Pointer错误，程序在 cudaFree(de
cuda - cudaFree()是异步的吗？
我的问题就是标题。实际上，我正在寻找一种异步释放设备内存的方法。谢谢! 最佳答案 cudaFree()不是异步的。从主机调用cudaFree()时，将在内部调用同步调用。我不明白您实际上需要异步释
memory - cudaFree 没有释放内存
下面的代码计算两个向量 a 和 b 的点积。正确的结果是 8192。当我第一次运行它时，结果是正确的。然后当我第二次运行它时，结果是之前的结果 + 8192 等等: 1st iteration: re
c++ - 如何在全局实例化变量上处理 cudaFree
我有一个用于实例化全局变量的类: class BitUnpackPtrs { public: ushort* d_dataIn; BitUnpackPtrs() : d_dataIn(
asynchronous - 异步调用后的 cudaFree 是否有效？
我想问一下在一些异步调用后调用cudaFree是否有效？例如 int* dev_a; // prepare dev_a... // launch a kernel to process dev_a (
c++ - cudaDeviceReset 诉 cudaFree
关于 cudaDeviceReset() 的正确使用存在各种问题，但我无法找到以下问题的答案。 cudaDeviceReset() 上的文档说它明确销毁并清除当前进程中与当前设备关联的所有资源。假设
memory - 为什么 CudaFree 似乎没有释放内存？
我正在尝试分配设备内存，复制到它，在 GPU 上执行计算，将结果复制回来，然后释放我分配的设备内存。我想确保我没有超出限制，我想看看共享内存空间中是否有足够的内存来转储一些数组。当我分配设备内存时，
memory-management - cudaFree() 之前是否需要 cudaDeviceSynchronize()？
CUDA 版本 10.1。帕斯卡GPU。所有命令都发布到默认流: void * ptr; cudaMalloc(&ptr, ...); launch_kernel>>(ptr); cudaDevice
c++ - cudaFree 和 cudaFreeHost 无法释放堆分配的内存
我编写了一个类，其中堆中的构造函数内存是使用 cudaMallocHost() 和 cudaMalloc() 分配的。如果我尝试释放内存 cudaFree() 或 cudaFreeHost()，GP
cuda - 如果未使用 cudaFree()，在使用它的应用程序退出后，GPU 上分配的内存会发生什么？
如果最后没有使用cudaFree()，使用它的应用程序/内核函数退出后，正在使用的内存是否会自动释放？最佳答案是的。当您的应用程序终止时(无论是否正常)，它的所有内存都会被操作系统回收，无论它是
c++ - 调用 CudaFree 时多线程 CPU CUDA 应用程序不是异步的
我有一个由多个 CPU 线程组成的应用程序，每个 CPU 线程在我的 GPU 上的同一个 cudaContext 中创建一个单独的 cudaStream。我有一辆特斯拉 K20c。我正在使用 Wi

IT王子

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

memory - 为什么 CudaFree 似乎没有释放内存？