python-3.x - GPU memory is not freed after evaluating data on a PyTorch model in parallel processes


For my optimization algorithm I need to evaluate several hundred images at every iteration. To speed this up, I want to make full use of my 3 GPUs.

My process:

  • Load an instance of my deep-learning model onto each of my GPUs
  • Split the workload into as many parts as I have GPUs
  • Pair each part of the workload, in a tuple, with the instance of the GPU-loaded model that should process it
  • Run starmap(_runDataThroughModel, sub_workload) to process all sub-workloads in parallel (a minimal sketch of this step follows the list)
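
A minimal sketch of that workflow, only to illustrate the pairing and the starmap call (_runDataThroughModel, evaluate_parallel and the argument names here are placeholders; the actual test script is in the update below):

import torch.multiprocessing as mp

def _runDataThroughModel(workload, model):
    # forward pass over one slice of the images (device handling omitted here)
    return [model(img) for img in workload]

def evaluate_parallel(images, models):
    # split the images into as many chunks as there are GPU-resident models
    chunk = len(images) // len(models)
    sub_workloads = [images[i*chunk:(i+1)*chunk] for i in range(len(models))]
    # pair each chunk with the model instance that should process it
    jobs = list(zip(sub_workloads, models))
    with mp.Pool(processes=len(models)) as pool:
        return pool.starmap(_runDataThroughModel, jobs)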

Running this once, finishing and quitting is not a problem. However, when I repeat it, the GPU memory fills up a little more on every iteration until I get "RuntimeError: CUDA error: out of memory".

My questions:

  • What is the correct way to do this?
  • Why is the GPU memory not being freed? Since I pre-instantiate the GPU models outside the starmap call and always pass the same instances, why is there any accumulation at all?

Update: Taking into account the issues raised in this thread, I rewrote the code. Instantiating Pool() outside any loop of the program did not solve the GPU memory overflow; however, it did stop the CPU memory from growing over time.

'''
Test GPU Memory Leak
Description: Tests how the memory doesn't get freed up when running multiprocessing with PyTorch Model forward pass
'''
import torch
import torch.multiprocessing as mp
import importlib
from PIL import Image
from skimage import io, transform
from skimage.color import rgb2gray
from skimage.io._plugins.pil_plugin import *
import torch.nn as nn

# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
    def __init__(self, num_classes=10, num_img_layers=1, img_res=128):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1,
            #                 padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')
            nn.Conv2d(num_img_layers, 64, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.LeakyReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(64, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc1 = nn.Linear(32*int(img_res/2)*int(img_res/2), 32*32)
        self.fc2 = nn.Linear(32*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

class NNEvaluator:
    def __init__(self, model_dict, GPU, img_res=128, num_img_layers=1, num_classes=None):
        # Load the model checkpoint onto the given GPU
        gpu_id = 'cuda:' + str(GPU)
        self.device = torch.device(gpu_id if torch.cuda.is_available() else 'cpu')
        self.model_state_dict = model_dict['model_state_dict']
        self.model = ConvNet(num_classes=num_classes, num_img_layers=num_img_layers, img_res=img_res).to(self.device)
        self.model.load_state_dict(self.model_state_dict)
        self.epsilon = torch.tensor(1e-12, dtype=torch.float)

    def evaluate(self, img):
        self.model.eval()
        with torch.no_grad():
            img = img.to(self.device)
            out = self.model(img)
            out = out.to('cpu')
        return out

def loadImage(filename):
    # load the image, convert to grayscale and shape it as a 1x1xHxW tensor
    im = Image.open(filename)
    im = io._plugins.pil_plugin.pil_to_ndarray(im)
    im = rgb2gray(im)
    image = im.transpose((0, 1))
    im = torch.from_numpy(image).float()
    im = torch.unsqueeze(im, 0)
    im = torch.unsqueeze(im, 1)
    return im

def _worker(workload, evaluator):
    results = []
    for img in workload:
        results.append(evaluator.evaluate(img))
    return results
def main():
    # load a model for each GPU
    model_dict = torch.load('model_dict.ckpt')
    GPUs = [0, 1, 2]  # available GPUs in the system
    evaluators = []
    for gpu_id in GPUs:
        evaluators.append(NNEvaluator(model_dict, gpu_id, num_classes=3))

    # instantiate the multiprocessing pool (outside the loop)
    mp.set_start_method('spawn')
    mypool = mp.Pool()

    # evaluate all datapoints 20 times
    im = loadImage('test.jpg')
    total_nr_iterations = 20
    for i in range(total_nr_iterations):
        # run a subset of the workload on each GPU in a separate process
        nr_datapoints = 99
        dp_per_evaluator = int(nr_datapoints/len(evaluators))
        workload = [im for i in range(dp_per_evaluator)]
        jobslist = [(workload, evaluator) for evaluator in evaluators]
        mypool.starmap(_worker, jobslist)
        print("Finished iteration {}".format(i))

if __name__ == '__main__':
    main()

Output when running the code:

Finished iteration 0
Finished iteration 1
Finished iteration 2
Process SpawnPoolWorker-10:
Process SpawnPoolWorker-12:
Traceback (most recent call last):
Traceback (most recent call last):
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
task = get()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
return _ForkingPickler.loads(res)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
task = get()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
event_sync_required)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
return _ForkingPickler.loads(res)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
event_sync_required)
RuntimeError: CUDA error: out of memory
RuntimeError: CUDA error: out of memory
Process SpawnPoolWorker-11:
Traceback (most recent call last):
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/pool.py", line 110, in worker
task = get()
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/multiprocessing/queues.py", line 354, in get
return _ForkingPickler.loads(res)
File "/home/ron/miniconda3/envs/PyTorchNN/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 119, in rebuild_cuda_tensor
event_sync_required)
RuntimeError: CUDA error: out of memory

Best Answer

I found this similar thread, where the memory leak occurs because Pool() is instantiated inside the loop instead of outside it.

The question in that thread also instantiated Pool() inside a function rather than using the with notation around the whole workload, which would ensure that all started processes are joined before moving on.

For example, the bad way:

def evaluation(workload):
    jobslist = [job for job in workload]
    with Pool() as mypool:
        mypool.starmap(_workerfunction, jobslist)

if __name__ == '__main__':
    # pseudo data
    workload = [[(100,200) for i in range(1000)] for i in range(50)]
    for i in range(100):
        evaluation(workload)

The correct way to do this is to instantiate the pool outside the loop and pass a reference to it into the function that does the processing, i.e.:

def evaluation(workload, mypool):
    jobslist = [job for job in workload]
    mypool.starmap(_workerfunction, jobslist)

if __name__ == '__main__':
    # pseudo data
    with Pool() as mypool:
        workload = [[(100,200) for i in range(1000)] for i in range(50)]
        for i in range(100):
            evaluation(workload, mypool)

I suspect the GPU memory leak comes from leftover references inside the parallel processes that have not yet been cleaned up.
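
Not part of the answer above, but given the traceback (the pool workers die inside rebuild_cuda_tensor while unpickling their arguments), one way to test that suspicion is to stop shipping the GPU-resident evaluators through starmap on every iteration and instead build them lazily inside the worker processes, so that only CPU tensors cross the process boundary. A rough sketch, assuming the NNEvaluator class and checkpoint file from the test script above (_worker_cached and the per-process cache are made-up names):

import torch

_evaluators = {}  # per-process cache: gpu_id -> NNEvaluator

def _worker_cached(workload, gpu_id, model_path='model_dict.ckpt'):
    # Build the GPU-resident evaluator inside the worker the first time this
    # GPU id is seen and reuse it afterwards; only the images and the CPU
    # results are pickled between the parent and the workers.
    if gpu_id not in _evaluators:
        model_dict = torch.load(model_path, map_location='cpu')
        _evaluators[gpu_id] = NNEvaluator(model_dict, gpu_id, num_classes=3)
    return [_evaluators[gpu_id].evaluate(img) for img in workload]

# in main(), the jobs would then carry only plain GPU ids:
#   jobslist = [(workload, gpu_id) for gpu_id in GPUs]
#   mypool.starmap(_worker_cached, jobslist)

With a plain Pool a worker is not pinned to one GPU, so over time a single worker may end up caching a model for more than one GPU id; the point of the sketch is only that nothing CUDA-resident gets re-pickled on every starmap call.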

Regarding python-3.x - GPU memory is not freed after evaluating data on a PyTorch model in parallel processes, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/59395680/
