
python - Pytorch with CUDA throws RuntimeError when using pack_padded_sequence


I am trying to train a BiLSTM-CRF with Pytorch to detect new NER entities.
To do so, I am using a snippet of code derived from the Pytorch Advanced tutorial. This snippet implements batch training.
I followed the README to present the data as required. Everything works fine on CPU, but when I try to move it to the GPU, I get the following error:

---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-23-794982510db6> in <module>
4 batch_input, batch_input_lens, batch_mask, batch_target = batch_info
5
----> 6 loss_train = model.neg_log_likelihood(batch_input, batch_input_lens, batch_mask, batch_target)
7 optimizer.zero_grad()
8 loss_train.backward()

<ipython-input-11-e44ffbf7d75f> in neg_log_likelihood(self, batch_input, batch_input_lens, batch_mask, batch_target)
185
186 def neg_log_likelihood(self, batch_input, batch_input_lens, batch_mask, batch_target):
--> 187 feats = self.bilstm(batch_input, batch_input_lens, batch_mask)
188 gold_score = self.CRF.score_sentence(feats, batch_target)
189 forward_score = self.CRF.score_z(feats, batch_input_lens)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-11-e44ffbf7d75f> in forward(self, batch_input, batch_input_lens, batch_mask)
46 batch_input = self.word_embeds(batch_input) # size: #batch * padding_length * embedding_dim
47 batch_input = rnn_utils.pack_padded_sequence(
---> 48 batch_input, batch_input_lens, batch_first=True)
49 batch_output, self.hidden = self.lstm(batch_input, self.hidden)
50 self.repackage_hidden(self.hidden)

/opt/conda/lib/python3.7/site-packages/torch/nn/utils/rnn.py in pack_padded_sequence(input, lengths, batch_first, enforce_sorted)
247
248 data, batch_sizes = \
--> 249 _VF._pack_padded_sequence(input, lengths, batch_first)
250 return _packed_sequence_init(data, batch_sizes, sorted_indices, None)
251

RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor

If I understand it correctly, pack_padded_sequence needs the tensor to be on the CPU rather than on the GPU. Unfortunately, my forward function calls pack_padded_sequence, and I don't see any way to do that without moving the whole training back to the CPU.
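For reference, here is a minimal sketch (with made-up shapes) that reproduces the same error outside of my model:

import torch
import torch.nn.utils.rnn as rnn_utils

x = torch.randn(2, 5, 8, device="cuda")        # a padded batch: 2 sequences, max length 5, 8 features
lengths = torch.tensor([5, 3], device="cuda")  # lengths end up on the GPU, as in my collate_fn
packed = rnn_utils.pack_padded_sequence(x, lengths, batch_first=True)  # raises the RuntimeError above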
Here is the full code.
The class definitions:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils


class BiLSTM(nn.Module):
    def __init__(self, vocab_size, tagset, embedding_dim, hidden_dim,
                 num_layers, bidirectional, dropout, pretrained=None):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.tagset_size = len(tagset)
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.word_embeds = nn.Embedding(vocab_size+2, embedding_dim)
        if pretrained is not None:
            self.word_embeds = nn.Embedding.from_pretrained(pretrained)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim // 2 if bidirectional else hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.hidden = None

    def init_hidden(self, batch_size, device):
        init_hidden_dim = self.hidden_dim // 2 if self.bidirectional else self.hidden_dim
        init_first_dim = self.num_layers * 2 if self.bidirectional else self.num_layers
        self.hidden = (
            torch.randn(init_first_dim, batch_size, init_hidden_dim).to(device),
            torch.randn(init_first_dim, batch_size, init_hidden_dim).to(device)
        )

    def repackage_hidden(self, hidden):
        """Wraps hidden states in new Tensors, to detach them from their history."""
        if isinstance(hidden, torch.Tensor):
            return hidden.detach_().to(device)
        else:
            return tuple(self.repackage_hidden(h) for h in hidden)

    def forward(self, batch_input, batch_input_lens, batch_mask):
        batch_size, padding_length = batch_input.size()
        batch_input = self.word_embeds(batch_input)  # size: #batch * padding_length * embedding_dim
        batch_input = rnn_utils.pack_padded_sequence(
            batch_input, batch_input_lens, batch_first=True)
        batch_output, self.hidden = self.lstm(batch_input, self.hidden)
        self.repackage_hidden(self.hidden)
        batch_output, _ = rnn_utils.pad_packed_sequence(batch_output, batch_first=True)
        batch_output = batch_output.contiguous().view(batch_size * padding_length, -1)
        batch_output = batch_output[batch_mask, ...]
        out = self.hidden2tag(batch_output)
        return out

    def neg_log_likelihood(self, batch_input, batch_input_lens, batch_mask, batch_target):
        loss = nn.CrossEntropyLoss(reduction='mean')
        feats = self(batch_input, batch_input_lens, batch_mask)
        batch_target = torch.cat(batch_target, 0).to(device)
        return loss(feats, batch_target)

    def predict(self, batch_input, batch_input_lens, batch_mask):
        feats = self(batch_input, batch_input_lens, batch_mask)
        val, pred = torch.max(feats, 1)
        return pred


class CRF(nn.Module):
    def __init__(self, tagset, start_tag, end_tag, device):
        super(CRF, self).__init__()
        self.tagset_size = len(tagset)
        self.START_TAG_IDX = tagset.index(start_tag)
        self.END_TAG_IDX = tagset.index(end_tag)
        self.START_TAG_TENSOR = torch.LongTensor([self.START_TAG_IDX]).to(device)
        self.END_TAG_TENSOR = torch.LongTensor([self.END_TAG_IDX]).to(device)
        # trans: (tagset_size, tagset_size) trans (i, j) means state_i -> state_j
        self.trans = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size)
        )
        # self.trans.data[...] = 1
        self.trans.data[:, self.START_TAG_IDX] = -10000
        self.trans.data[self.END_TAG_IDX, :] = -10000
        self.device = device

    def init_alpha(self, batch_size, tagset_size):
        return torch.full((batch_size, tagset_size, 1), -10000, dtype=torch.float, device=self.device)

    def init_path(self, size_shape):
        # Initialization Path - LongTensor + Device + Full_value=0
        return torch.full(size_shape, 0, dtype=torch.long, device=self.device)

    def _iter_legal_batch(self, batch_input_lens, reverse=False):
        index = torch.arange(0, batch_input_lens.sum(), dtype=torch.long)
        packed_index = rnn_utils.pack_sequence(
            torch.split(index, batch_input_lens.tolist())
        )
        batch_iter = torch.split(packed_index.data, packed_index.batch_sizes.tolist())
        batch_iter = reversed(batch_iter) if reverse else batch_iter
        for idx in batch_iter:
            yield idx, idx.size()[0]

    def score_z(self, feats, batch_input_lens):
        # simulate the pack/pad process
        tagset_size = feats.shape[1]
        batch_size = len(batch_input_lens)
        alpha = self.init_alpha(batch_size, tagset_size)
        alpha[:, self.START_TAG_IDX, :] = 0  # Initialization
        for legal_idx, legal_batch_size in self._iter_legal_batch(batch_input_lens):
            feat = feats[legal_idx, ].view(legal_batch_size, 1, tagset_size)
            # #batch * 1 * |tag| + #batch * |tag| * 1 + |tag| * |tag| = #batch * |tag| * |tag|
            legal_batch_score = feat + alpha[:legal_batch_size, ] + self.trans
            alpha_new = torch.logsumexp(legal_batch_score, 1).unsqueeze(2).to(device)
            alpha[:legal_batch_size, ] = alpha_new
        alpha = alpha + self.trans[:, self.END_TAG_IDX].unsqueeze(1)
        score = torch.logsumexp(alpha, 1).sum().to(device)
        return score

    def score_sentence(self, feats, batch_target):
        # CRF Batched Sentence Score
        # feats: (#batch_state(#words), tagset_size)
        # batch_target: list<torch.LongTensor> At least One LongTensor
        # Warning: words order = batch_target order
        def _add_start_tag(target):
            return torch.cat([self.START_TAG_TENSOR, target]).to(device)

        def _add_end_tag(target):
            return torch.cat([target, self.END_TAG_TENSOR]).to(device)

        from_state = [_add_start_tag(target) for target in batch_target]
        to_state = [_add_end_tag(target) for target in batch_target]
        from_state = torch.cat(from_state).to(device)
        to_state = torch.cat(to_state).to(device)
        trans_score = self.trans[from_state, to_state]

        gather_target = torch.cat(batch_target).view(-1, 1).to(device)
        emit_score = torch.gather(feats, 1, gather_target).to(device)

        return trans_score.sum() + emit_score.sum()

    def viterbi(self, feats, batch_input_lens):
        word_size, tagset_size = feats.shape
        batch_size = len(batch_input_lens)
        viterbi_path = self.init_path(feats.shape)  # use feats.shape to init path.shape
        alpha = self.init_alpha(batch_size, tagset_size)
        alpha[:, self.START_TAG_IDX, :] = 0  # Initialization
        for legal_idx, legal_batch_size in self._iter_legal_batch(batch_input_lens):
            feat = feats[legal_idx, :].view(legal_batch_size, 1, tagset_size)
            legal_batch_score = feat + alpha[:legal_batch_size, ] + self.trans
            alpha_new, best_tag = torch.max(legal_batch_score, 1).to(device)
            alpha[:legal_batch_size, ] = alpha_new.unsqueeze(2)
            viterbi_path[legal_idx, ] = best_tag
        alpha = alpha + self.trans[:, self.END_TAG_IDX].unsqueeze(1)
        path_score, best_tag = torch.max(alpha, 1).to(device)
        path_score = path_score.squeeze()  # path_score=#batch

        best_paths = self.init_path((word_size, 1))
        for legal_idx, legal_batch_size in self._iter_legal_batch(batch_input_lens, reverse=True):
            best_paths[legal_idx, ] = best_tag[:legal_batch_size, ]
            backword_path = viterbi_path[legal_idx, ]  # 1 * |Tag|
            this_tag = best_tag[:legal_batch_size, ]  # 1 * |legal_batch_size|
            backword_tag = torch.gather(backword_path, 1, this_tag).to(device)
            best_tag[:legal_batch_size, ] = backword_tag
        # never computing <START>

        # best_paths = #words
        return path_score.view(-1), best_paths.view(-1)


class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset, embedding_dim, hidden_dim,
                 num_layers, bidirectional, dropout, start_tag, end_tag, device, pretrained=None):
        super(BiLSTM_CRF, self).__init__()
        self.bilstm = BiLSTM(vocab_size, tagset, embedding_dim, hidden_dim,
                             num_layers, bidirectional, dropout, pretrained)
        self.CRF = CRF(tagset, start_tag, end_tag, device)

    def init_hidden(self, batch_size, device):
        self.bilstm.hidden = self.bilstm.init_hidden(batch_size, device)

    def forward(self, batch_input, batch_input_lens, batch_mask):
        feats = self.bilstm(batch_input, batch_input_lens, batch_mask)
        score, path = self.CRF.viterbi(feats, batch_input_lens)
        return path

    def neg_log_likelihood(self, batch_input, batch_input_lens, batch_mask, batch_target):
        feats = self.bilstm(batch_input, batch_input_lens, batch_mask)
        gold_score = self.CRF.score_sentence(feats, batch_target)
        forward_score = self.CRF.score_z(feats, batch_input_lens)
        return forward_score - gold_score

    def predict(self, batch_input, batch_input_lens, batch_mask):
        return self(batch_input, batch_input_lens, batch_mask)
The training cell:
def prepare_sequence(seq, to_ix, device):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long).to(device)

def prepare_labels(lab, tag_to_ix, device):
    idxs = [tag_to_ix[w] for w in lab]
    return torch.tensor(idxs, dtype=torch.long).to(device)


class PadSequence:
    def __call__(self, batch):
        device = torch.device('cuda')
        # Let's assume that each element in "batch" is a tuple (data, label).
        # Sort the batch in the descending order
        sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
        # Get each sequence and pad it
        sequences = [x[0] for x in sorted_batch]
        sentence_in = [prepare_sequence(x, word_to_ix, device) for x in sequences]
        sequences_padded = torch.nn.utils.rnn.pad_sequence(sentence_in, padding_value=len(word_to_ix) + 1, batch_first=True).to(device)

        lengths = torch.LongTensor([len(x) for x in sequences]).to(device)

        masks = [True if index_word != len(word_to_ix) + 1 else False for sentence in sequences_padded for index_word in sentence]

        labels = [x[1] for x in sorted_batch]
        labels_in = [prepare_sequence(x, tag_to_ix, device) for x in labels]
        return sequences_padded, lengths, masks, labels_in


{ .... code to get the data formatted...}


device = torch.device("cuda")
batch_size = 64


START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 200
HIDDEN_DIM = 20
NUM_LAYER = 3
BIDIRECTIONNAL = True
DROPOUT = 0.1

train_iter = DataLoader(dataset=training_data, collate_fn=PadSequence(), batch_size=64, shuffle=True)


model = BiLSTM_CRF(len(word_to_ix), tagset, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYER, BIDIRECTIONNAL, DROPOUT, START_TAG, STOP_TAG, device).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
model.init_hidden(batch_size, device)
with tqdm(total=len(train_iter)) as progress_bar:
    for batch_info in train_iter:
        batch_input, batch_input_lens, batch_mask, batch_target = batch_info

        loss_train = model.neg_log_likelihood(batch_input, batch_input_lens, batch_mask, batch_target)
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        progress_bar.update(1)  # update progress

Best Answer

In the PadSequence function (which acts as the collate_fn, gathering samples and making batches out of them), you explicitly cast everything to the cuda device, namely:

class PadSequence:
    def __call__(self, batch):
        device = torch.device('cuda')

        # Left rest of the code for brevity
        ...
        lengths = torch.LongTensor([len(x) for x in sequences]).to(device)
        ...
        return sequences_padded, lengths, masks, labels_in
You should not cast your data when creating the batch; we usually do that right before pushing the samples through the neural network.
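For example (just a sketch, assuming PadSequence is changed to return plain CPU tensors), the casting can go into the training loop instead; note that the lengths should stay on the CPU anyway, because pack_padded_sequence expects a 1D CPU int64 tensor:

for batch_info in train_iter:
    batch_input, batch_input_lens, batch_mask, batch_target = batch_info

    # cast right before the forward pass, not in the collate_fn
    batch_input = batch_input.to(device)
    batch_target = [t.to(device) for t in batch_target]
    # batch_input_lens stays on the CPU

    loss_train = model.neg_log_likelihood(batch_input, batch_input_lens, batch_mask, batch_target)
    optimizer.zero_grad()
    loss_train.backward()
    optimizer.step()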
Also, you should at least define the device like this:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
Or, even better, choose the device for you/the user in the part of the code where you set everything up.
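Putting it together, the collate_fn could be rewritten roughly like this (just a sketch that reuses the globals from your question, word_to_ix and tag_to_ix, and keeps every returned tensor on the CPU):

class PadSequence:
    def __call__(self, batch):
        cpu = torch.device("cpu")
        # sort the batch by sequence length, longest first
        sorted_batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
        sequences = [x[0] for x in sorted_batch]
        sentence_in = [prepare_sequence(x, word_to_ix, cpu) for x in sequences]
        sequences_padded = torch.nn.utils.rnn.pad_sequence(
            sentence_in, padding_value=len(word_to_ix) + 1, batch_first=True)

        # keep the lengths on the CPU: pack_padded_sequence requires a 1D CPU int64 tensor
        lengths = torch.LongTensor([len(x) for x in sequences])

        masks = [True if index_word != len(word_to_ix) + 1 else False
                 for sentence in sequences_padded for index_word in sentence]

        labels = [x[1] for x in sorted_batch]
        labels_in = [prepare_sequence(x, tag_to_ix, cpu) for x in labels]
        return sequences_padded, lengths, masks, labels_in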

About python - Pytorch with CUDA throws RuntimeError when using pack_padded_sequence, see the original question on Stack Overflow: https://stackoverflow.com/questions/68086528/
