I've been building a LipNet model trained with cross-entropy loss, but the model does a pretty bad job of learning to predict the correct words. So I switched to CTCLoss (the same loss used in the official LipNet paper); however, the model is not learning anything.
The model is a Conv-LSTM network: the convolutional front end takes an input video tensor of shape (batch_size, 1, 75, 46, 140) and outputs a tensor of shape (batch_size, 10), which is fed to the LSTM.
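For reference, here is my reading of the shape contract documented for nn.CTCLoss, written as a minimal standalone sketch (the dimensions T, N, C, S below are placeholder values I picked to mirror my shapes, not my real data):

import torch
import torch.nn as nn

T, N, C, S = 36, 4, 40, 35                                 # time steps, batch, classes including blank, target length
log_probs = torch.randn(T, N, C).log_softmax(2)            # (T, N, C) log-probabilities
targets = torch.randint(1, C, (N, S), dtype=torch.long)    # integer class indices; blank (index 0) never appears
input_lengths = torch.full((N,), T, dtype=torch.long)      # length of each prediction sequence
target_lengths = torch.full((N,), S, dtype=torch.long)     # length of each label sequence
loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)
print(loss)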
Here is the code.
class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        self.model = nn.LSTM(self.embed_size, self.hidden_size, num_layers=2, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        self.ln = nn.Linear(self.hidden_size * 2, self.vocab_size)
        self.conv = Conv()
        self.m = nn.Softmax(dim=0)

    def forward(self, features, sentence):
        features = self.conv(features)                  # (batch, 10)
        print(features.shape)
        x = torch.Tensor().to(device)
        sentences = torch.Tensor().to(device)
        states = None
        for i in range(36):
            out, states = self.model(features.unsqueeze(1), states)
            # print(out.shape)  size: (batch_size, 1, 10)
            out = self.ln(out)
            # print(out.shape)  size: (batch, 1, 40)
            # print(out)
            x = torch.cat((x, out), dim=1)
            pred = out.squeeze(1).argmax(1)
            features = self.embeddings(pred).to(device) # (batch, embed_size), unsqueezed to (batch, 1, embed_size) at the top of the loop
        # x shape after the loop: (batch_size, 36, 40)
        return x

    def sample(self, x, max_length):
        words = []
        with torch.no_grad():
            features = self.conv(x)
            print("df", features.shape)
            states = None
            # print(features.shape)
        return words
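For completeness, this is how I sanity-check the forward shapes of the class above on a dummy clip. The Conv class here is only a stand-in for my real 3D-conv front end so the snippet runs on its own, and the constructor arguments are guesses chosen to match the shapes in the comments (hidden_size*2 = 10), not my actual hyperparameters:

import torch
import torch.nn as nn

class Conv(nn.Module):
    # placeholder front end: it only needs to map (batch, 1, 75, 46, 140) to (batch, 10)
    def __init__(self):
        super(Conv, self).__init__()
        self.pool = nn.AdaptiveAvgPool3d(1)
        self.fc = nn.Linear(1, 10)

    def forward(self, x):
        return self.fc(self.pool(x).flatten(1))

device = "cuda" if torch.cuda.is_available() else "cpu"
model = LSTM(vocab_size=40, embed_size=10, hidden_size=5).to(device)
clip = torch.randn(2, 1, 75, 46, 140, device=device)
out = model(clip, None)    # the sentence argument is unused in forward
print(out.shape)           # expect torch.Size([2, 36, 40])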
Here is how I trained the model.
criterion = nn.CTCLoss()

for j in range(300):
    print(j, "EPOCHS")
    for m, (x, y) in enumerate(tqdm(train_data)):
        x = x.type(torch.cuda.FloatTensor)
        x = x.to(device)
        Y = np.array(y)
        y = np.array(y)
        y = torch.from_numpy(y)
        y = y.type(torch.cuda.FloatTensor)
        y = y.to(device)
        print(x.shape)

        sen = model(x, y)
        sentence = sen[:, 1:, :]   # the model returns (batch, 36, 40); dropping the first step gives (batch, 35, 40)

        word = []
        words = []
        for i in range(y.shape[0]):
            for n in range(35):
                max = torch.argmax(sentence[i][n])
                word.append(max.cpu().detach().numpy())
            words.append(word)
            word = []
        words = np.stack(words, axis=0)
        # print(words.shape)
        word = itos(words[0])
        words = torch.from_numpy(words)
        words = words.to(device)

        # print(y.shape, sentence.shape)  y: (batch, 35, 40), sentence: (batch, 35, 40)
        sentence = torch.permute(sentence, (1, 0, 2))
        # print(sentence.shape)  (35, batch, 40)
        lengths = torch.full(size=(x.shape[0],), fill_value=1, dtype=torch.int)
        loss = criterion(sentence, y, lengths, lengths)

        optimizer.zero_grad()
        loss.backward(loss)
        optimizer.step()

        print("Predicted word: ", word)
        print("Actual word: ", itos(Y[0]))
But the output is something like this:
0 EPOCHS
100%|██████████| 57/57 [00:11<00:00, 4.80it/s]
Predicted word: 3vvv3v3vv3v3vv3v3vv3v3vv3v3vv3v3vv3
Actual word: lay blue with r three soon ''''''''
And it stays like this for all the epochs.
Can you please explain what I'm doing wrong? Am I giving the wrong inputs to the criterion (CTCLoss), or is it something else?