gpt4 book ai didi

python - pytorch 耗尽 GPU 内存

转载 作者:行者123 更新时间:2023-12-02 06:06:14 26 4
gpt4 key购买 nike

我正在尝试在 pytorch 中实现 Yolo-v2。但是,我似乎只是通过网络传递数据而耗尽了内存。该模型很大,如下所示。但是,我觉得我在用我的网络做一些愚蠢的事情(比如不在某处释放内存)。网络在 cpu 上按预期工作。

测试代码(内存用完的地方)是:

# Smoke test: push one random batch of 32 RGB 416x416 images through the
# network on the GPU.
x = torch.rand(32, 3, 416, 416).cuda()
model = Yolov2().cuda()
# This is a forward-only check, so autograd is disabled: otherwise every
# intermediate activation is retained for a backward pass that never runs,
# which multiplies peak GPU memory. torch.rand already yields float32, so no
# extra .float() cast is needed.
with torch.no_grad():
    y = model(x)

问题

  1. 我的模型有什么明显的错误吗?
  2. 如何更高效地使用内存?
  3. 其他意见?

模型:

import torch
from torch import nn
import torch.nn.functional as F

class Yolov2(nn.Module):
    """YOLOv2-style fully convolutional network.

    The backbone is a stack of ``Conv2d -> BatchNorm2d -> leaky ReLU`` stages
    interleaved with five 2x2 max-pools. The output of stage 13 is also routed
    through :meth:`reorg_layer` (space-to-depth) and concatenated with the
    output of stage 20 before the two final convolutions.

    Input:  ``(N, 3, H, W)`` float tensor.
    Output: ``(N, 125, H / 32, W / 32)`` for ``H`` and ``W`` divisible by 32
    (e.g. ``416 x 416 -> 13 x 13``).
    """

    # Negative slope shared by every leaky-ReLU activation in the network.
    _NEGATIVE_SLOPE = 0.1

    def __init__(self):
        super(Yolov2, self).__init__()

        # Attribute names and assignment order are kept exactly as before so
        # that state_dict keys and parameter ordering remain compatible.
        self.conv1, self.batchnorm1 = self._conv_bn(3, 32, 3)

        self.conv2, self.batchnorm2 = self._conv_bn(32, 64, 3)

        self.conv3, self.batchnorm3 = self._conv_bn(64, 128, 3)
        self.conv4, self.batchnorm4 = self._conv_bn(128, 64, 1)
        self.conv5, self.batchnorm5 = self._conv_bn(64, 128, 3)

        self.conv6, self.batchnorm6 = self._conv_bn(128, 256, 3)
        self.conv7, self.batchnorm7 = self._conv_bn(256, 128, 1)
        self.conv8, self.batchnorm8 = self._conv_bn(128, 256, 3)

        self.conv9, self.batchnorm9 = self._conv_bn(256, 512, 3)
        self.conv10, self.batchnorm10 = self._conv_bn(512, 256, 1)
        self.conv11, self.batchnorm11 = self._conv_bn(256, 512, 3)
        self.conv12, self.batchnorm12 = self._conv_bn(512, 256, 1)
        self.conv13, self.batchnorm13 = self._conv_bn(256, 512, 3)

        self.conv14, self.batchnorm14 = self._conv_bn(512, 1024, 3)
        self.conv15, self.batchnorm15 = self._conv_bn(1024, 512, 1)
        self.conv16, self.batchnorm16 = self._conv_bn(512, 1024, 3)
        self.conv17, self.batchnorm17 = self._conv_bn(1024, 512, 1)
        self.conv18, self.batchnorm18 = self._conv_bn(512, 1024, 3)

        self.conv19, self.batchnorm19 = self._conv_bn(1024, 1024, 3)
        self.conv20, self.batchnorm20 = self._conv_bn(1024, 1024, 3)

        # 3072 = 1024 (stage 20) + 4 * 512 (stage 13 after space-to-depth).
        self.conv21, self.batchnorm21 = self._conv_bn(3072, 1024, 3)

        # Final 1x1 prediction layer: no batch norm, bias enabled (default).
        self.conv22 = nn.Conv2d(in_channels=1024, out_channels=125, kernel_size=1, stride=1, padding=0)

    @staticmethod
    def _conv_bn(in_channels, out_channels, kernel_size):
        """Build a bias-free, stride-1 convolution and its matching batch norm.

        Padding is ``kernel_size // 2`` (1 for 3x3, 0 for 1x1), so the spatial
        size is preserved.

        Returns:
            ``(conv, batchnorm)`` tuple of freshly constructed modules.
        """
        conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
            bias=False,
        )
        return conv, nn.BatchNorm2d(out_channels)

    def _conv_bn_act(self, conv, batchnorm, x):
        """Apply ``conv -> batchnorm -> leaky ReLU`` to ``x``."""
        # The batch-norm output is a fresh tensor that nothing else references,
        # so the activation can overwrite it in place instead of allocating
        # another activation-sized buffer for every layer.
        return F.leaky_relu(batchnorm(conv(x)), negative_slope=self._NEGATIVE_SLOPE, inplace=True)

    def reorg_layer(self, x):
        """Space-to-depth with stride 2.

        Each 2x2 spatial patch is folded into the channel dimension, mapping
        ``(N, C, H, W)`` to ``(N, 4 * C, H / 2, W / 2)``.

        Raises:
            ValueError: if ``H`` or ``W`` is not divisible by the stride.
        """
        stride = 2
        batch_size, channels, height, width = x.size()
        # Without this check an odd size can be silently absorbed into the
        # batch dimension by the reshape below and return scrambled data.
        if height % stride != 0 or width % stride != 0:
            raise ValueError(
                "reorg_layer expects height and width divisible by {}, got {}x{}".format(stride, height, width)
            )
        new_ht = height // stride
        new_wd = width // stride
        new_channels = channels * stride * stride

        # NCHW -> NHWC, then split H and W into (block index, offset in block).
        passthrough = x.permute(0, 2, 3, 1).contiguous()
        passthrough = passthrough.view(batch_size, new_ht, stride, new_wd, stride, channels)
        # Move both in-block offsets next to the channel axis and merge them.
        passthrough = passthrough.permute(0, 1, 3, 2, 4, 5).contiguous()
        passthrough = passthrough.view(batch_size, new_ht, new_wd, new_channels)
        # Back to NCHW.
        return passthrough.permute(0, 3, 1, 2)

    def forward(self, x):
        """Run the network on a batch of images and return the raw prediction map."""
        out = F.max_pool2d(self._conv_bn_act(self.conv1, self.batchnorm1, x), 2, stride=2)
        out = F.max_pool2d(self._conv_bn_act(self.conv2, self.batchnorm2, out), 2, stride=2)

        out = self._conv_bn_act(self.conv3, self.batchnorm3, out)
        out = self._conv_bn_act(self.conv4, self.batchnorm4, out)
        out = self._conv_bn_act(self.conv5, self.batchnorm5, out)
        out = F.max_pool2d(out, 2, stride=2)

        out = self._conv_bn_act(self.conv6, self.batchnorm6, out)
        out = self._conv_bn_act(self.conv7, self.batchnorm7, out)
        out = self._conv_bn_act(self.conv8, self.batchnorm8, out)
        out = F.max_pool2d(out, 2, stride=2)

        out = self._conv_bn_act(self.conv9, self.batchnorm9, out)
        out = self._conv_bn_act(self.conv10, self.batchnorm10, out)
        out = self._conv_bn_act(self.conv11, self.batchnorm11, out)
        out = self._conv_bn_act(self.conv12, self.batchnorm12, out)
        out = self._conv_bn_act(self.conv13, self.batchnorm13, out)
        # Keep a space-to-depth copy of the higher-resolution features so they
        # can be concatenated with the coarser features further down.
        passthrough = self.reorg_layer(out)
        out = F.max_pool2d(out, 2, stride=2)

        out = self._conv_bn_act(self.conv14, self.batchnorm14, out)
        out = self._conv_bn_act(self.conv15, self.batchnorm15, out)
        out = self._conv_bn_act(self.conv16, self.batchnorm16, out)
        out = self._conv_bn_act(self.conv17, self.batchnorm17, out)
        out = self._conv_bn_act(self.conv18, self.batchnorm18, out)

        out = self._conv_bn_act(self.conv19, self.batchnorm19, out)
        out = self._conv_bn_act(self.conv20, self.batchnorm20, out)

        out = torch.cat([passthrough, out], 1)
        out = self._conv_bn_act(self.conv21, self.batchnorm21, out)
        out = self.conv22(out)

        return out

附加信息:

  • Torch 版本是 '0.4.1.post2'
  • 在 aws p2.xlarge 上运行(限制 12GB GPU 内存)。
  • 此模型的参数数量为 67137565。这将占用 <500MB。
  • PyTorch 论坛上的这个帖子可能与此问题相关。

最佳答案

我会先尝试使用较小的批量大小:从 1 开始,然后逐步增大,看看显存能支持的最大批量是多少。你也可以尝试减小输入张量的尺寸。相对于你的 GPU 来说,你的网络并不算小。

关于python - pytorch 耗尽 GPU 内存,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/52621570/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com