python - 为什么我的 LSTM 在 tensorflow 中学习得这么慢而且很糟糕？-6ren

python - 为什么我的 LSTM 在 tensorflow 中学习得这么慢而且很糟糕？

转载作者：行者123 更新时间：2023-11-30 09:35:08

该程序读取文本文件 RNNtext.txt，为所有数据创建单热(one-hot)向量表示，用这些数据训练 LSTM，并时不时地显示一批采样字符。然而，即使只看 cost vs iterations 图，也能看出它的学习效率非常非常低。老实说，我的 LSTM 原始代码(numpy)做得更好。它不仅速度更快，而且能生成大部分有意义的单词。而这个 TensorFlow 版本只会产生乱码。我的错误在哪里？我真的没有主意了，我似乎找不到逻辑上错误的地方。

import numpy as np
import random
import tensorflow as tf
import os
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

# Read the whole training corpus from RNNtext.txt, which is expected to sit
# next to this script.
direc = os.path.dirname(os.path.realpath(__file__))
# A context manager closes the file handle deterministically (the bare
# open(...).read() left it to the garbage collector), and os.path.join builds
# the path with the platform's separator instead of a hard-coded "/".
with open(os.path.join(direc, "RNNtext.txt"), "r") as text_file:
    data = text_file.read()

# Vocabulary: the distinct characters of the corpus.
# Sorted so that the character <-> index mapping is the same on every run;
# the iteration order of a set of strings is not stable between interpreter
# runs, which made results impossible to reproduce or compare.
chars = sorted(set(data))

num_hidden = 80          # size of the LSTM hidden state
iterations = 1000        # number of training steps
display_iteration = 100  # Sample when iteration % display_iteration == 0
sample_size = 250        # number of characters generated per sample
batch_size = 120         # batch size or the number of time steps to unroll RNN
alpha = 0.01             # Learning rate

# Vocabulary and text file sizes
vocab_size = len(chars)
data_size = len(data)

# Bijection from a unique character to an index
char_to_ix = {ch: ix for ix, ch in enumerate(chars)}
# Bijection from an index to a unique character
ix_to_char = dict(enumerate(chars))

# Transforming all characters to indices
data_ix = [char_to_ix[ch] for ch in data]

# One (vocab_size, 1) one-hot column vector per character of the corpus,
# kept as a plain list so it can be sliced into training windows.
train_data = []
for ix in data_ix:
    hot1 = np.zeros((vocab_size, 1))
    hot1[ix] = 1
    train_data.append(hot1)



# ---- Graph construction (TensorFlow 1.x API) ----
# X holds the input one-hot vectors, one per example, each shaped
# (vocab_size, 1); target holds the matching one-hot labels as flat rows.
# NOTE(review): dynamic_rnn presumably treats axis 1 as the time axis, which
# would make every one-hot vector a vocab_size-step sequence of scalars
# rather than a single time step -- confirm this is what is intended.
X = tf.placeholder(tf.float32, [None, vocab_size, 1]) # Number of examples, number of inputs, dimension of each input
target = tf.placeholder(tf.float32, [None, vocab_size])


# Single LSTM cell with num_hidden units.
cell = tf.contrib.rnn.LSTMCell(num_hidden,state_is_tuple=True)
# The second value returned by dynamic_rnn is discarded and no initial_state
# is passed, so nothing recurrent is carried from one sess.run call to the next.
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
# Swap the first two axes so that output[-1] below selects the last index
# along what was axis 1 of the RNN output, for every example.
output = tf.transpose(output, [1, 0, 2])


# Output projection from the hidden state to one logit per vocabulary entry.
weight = tf.Variable(tf.random_normal([num_hidden, vocab_size]))
bias = tf.Variable(tf.constant(0.0, shape=[vocab_size]))

# Unnormalised logits; the softmax is applied inside the loss op (and by hand
# with NumPy when sampling).
prediction = tf.matmul(output[-1], weight) + bias
# Mean softmax cross-entropy between the logits and the one-hot targets.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=target))

# Training op: one optimizer step on `cost` with learning rate alpha.
optimizer = tf.train.ProximalGradientDescentOptimizer(alpha)
minimize = optimizer.minimize(cost)


# Create the session and initialise all variables before training starts.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)


# Candidate indices handed to np.random.choice() when sampling a character
# from the predicted distribution.
ARR = list(range(vocab_size))

ITER = []  # iteration numbers (x axis of the cost plot)
COST = []  # cost of each training batch (y axis of the cost plot)

p = 0 # start of the current window into train_data; advances by batch_size
for i in range(iterations):
    # Wrap around when the window, plus the one-character look-ahead needed
    # for the targets, would run past the end of the data.
    if p + batch_size >= data_size:
        p = 0

    # Inputs are batch_size consecutive one-hot vectors; the targets are the
    # same window shifted forward by one character.
    inp, out = train_data[p:p+batch_size], train_data[p+1:p+batch_size+1]
    out = np.reshape(out, [-1, vocab_size])

    # Fetch the cost in the same sess.run call that applies the update.
    # Running `cost` and `minimize` in two separate calls evaluated the whole
    # forward pass twice per iteration for no benefit.
    _, c = sess.run([minimize, cost], {X: inp, target: out})
    COST.append(c) # kept for plotting later
    ITER.append(i)

    # Display sample_size characters generated from a random seed character.
    # This only evaluates `prediction`, so it does not affect training.
    if i % display_iteration == 0:
        seed = np.random.randint(0, vocab_size)
        CHARS = []
        for j in range(sample_size):
            # Feed the previously sampled character back in as a one-hot
            # vector (a batch containing a single example).
            x = np.zeros((vocab_size, 1))
            x[seed] = 1
            x = [x]
            pred = sess.run(prediction, {X: x})[0]
            # Softmax over the logits to get a probability distribution.
            pred = np.exp(pred) / np.sum(np.exp(pred))
            pred = pred.ravel()

            seed = np.random.choice(ARR, 1, p = pred)[0]
            ch = ix_to_char[seed]
            CHARS.append(ch)
        TXT = ''.join(CHARS)

        print("-------------------------------------------------")
        print(TXT)
        print("Iteration: ", str(i))

    p += batch_size
sess.close()
# Cost versus iteration curve.
plt.plot(ITER, COST)
plt.show()

编辑:添加了 numpy 代码进行比较

import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('fivethirtyeight')
direc = os.path.dirname(os.path.realpath(__file__))

# Read the training corpus that sits next to this script.
# os.path.join replaces direc + "\RNNtext.txt": the hard-coded backslash only
# works as a separator on Windows, and "\R" is an invalid escape sequence that
# newer Python versions warn about. The context manager closes the file even
# if read() raises.
with open(os.path.join(direc, "RNNtext.txt"), 'r') as readFile:
    data = readFile.read()


# Vocabulary, sorted so the character <-> index mapping is reproducible
# (the iteration order of a set of strings varies between interpreter runs).
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print(chars)
print("Vocabulary size: " + str(vocab_size))

# Character -> index and index -> character lookup tables.
char_to_ix = {ch: ix for ix, ch in enumerate(chars)}
ix_to_char = dict(enumerate(chars))

hidden_size = 80          # number of hidden units
batch_size = 120          # number of time steps unrolled per training step
alpha = 0.1               # learning rate
sample_size = 250         # characters generated per sample
iterations = 1000         # number of training steps
display_iteration = 100   # sample every display_iteration steps


# Model parameters: small random weights, zero biases.
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias


def sample(hid, seed, weights, sample_size):
    """Generate ``sample_size`` characters from the current model.

    Starting from the character with index ``seed`` and hidden state ``hid``,
    repeatedly runs one tanh recurrent step, turns the output into a softmax
    distribution, draws the next character index from it and feeds that
    character back in as the next input.

    Args:
        hid: initial hidden state, a column vector compatible with ``Whh``.
        seed: vocabulary index of the first input character.
        weights: unused; kept so existing callers keep working. The function
            reads the module-level ``Wxh``, ``Whh``, ``Why``, ``bh`` and ``by``.
        sample_size: number of characters to generate.

    Returns:
        The generated text as a string (empty when ``sample_size`` is 0).
    """
    X = np.zeros((vocab_size, 1))
    X[seed] = 1
    CHARS = []
    ARR = list(range(vocab_size))  # candidate indices for np.random.choice

    for _ in range(sample_size):
        hid = np.tanh(np.dot(Wxh, X) + np.dot(Whh, hid) + bh)
        y = np.dot(Why, hid) + by
        prob = np.exp(y) / np.sum(np.exp(y))  # softmax over the vocabulary
        prob = prob.ravel()
        ix = np.random.choice(ARR, 1, p=prob)[0]
        CHARS.append(ix_to_char[ix])
        # The sampled character becomes the next one-hot input.
        X = np.zeros((vocab_size, 1))
        X[ix] = 1

    # Join once, after the loop: the original rebuilt the string on every
    # step and left TXT undefined when sample_size was 0.
    return ''.join(CHARS)

LOSS = []  # summed cross-entropy loss of each iteration (plotted at the end)
ITER = []  # matching iteration numbers
p = 0      # start of the current window into `data`; advances by batch_size

mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad

smooth_loss = -np.log(1.0/vocab_size)*batch_size # loss at iteration 0
hprev = np.zeros((hidden_size,1))

# The parameter arrays are only ever updated in place, so this list stays
# valid for the whole run. Defining it here (rather than inside the display
# branch) guarantees it exists for the final sample() call after the loop.
weights = [Wxh, Whh, Why, bh, by]


for i in range(iterations):

    # Fresh gradient accumulators for this window.
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)

    # Start (or restart) at the beginning of the corpus with a zero state.
    if p+batch_size >= len(data) or i == 0:
        hprev = np.zeros((hidden_size,1))
        p = 0

    # Inputs are batch_size consecutive characters; targets are the same
    # window shifted forward by one character.
    inputs = [char_to_ix[ch] for ch in data[p:p+batch_size]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+batch_size+1]]

    # Per-time-step caches, keyed by t. HID[-1] is the state *before* the
    # first step of this window (dict key -1, not "last element").
    HID = {}
    X = {}
    Y = {}
    P = {}
    HID[-1] = np.copy(hprev)

    loss = 0

    ##======FORWARD======##
    for t in range(len(inputs)):
        X[t] = np.zeros((vocab_size,1))
        X[t][inputs[t]] = 1

        HID[t] = np.tanh(np.dot(Wxh, X[t])  + np.dot(Whh, HID[t-1]) + bh)
        Y[t] = np.dot(Why, HID[t]) + by
        P[t] = np.exp(Y[t]) / np.sum(np.exp(Y[t]))  # softmax
        loss += -np.log(P[t][targets[t]][0])        # cross-entropy
    dhnext = np.zeros_like(HID[0])
    ##======BACKPROP======##
    for t in reversed(range(len(inputs))):

        # Gradient of softmax + cross-entropy w.r.t. the logits.
        dy = np.copy(P[t])
        dy[targets[t]] -= 1
        # Back through the tanh, including the gradient flowing in from t+1.
        dh = (np.dot(Why.T, dy) + dhnext)*(1-HID[t]*HID[t])

        dWhy += np.dot(dy, HID[t].T)
        dWhh += np.dot(dh, HID[t-1].T)
        dWxh += np.dot(dh, X[t].T)
        dby += dy
        dbh += dh

        dhnext = np.dot(Whh.T, dh)

    ##=====================##
    # Carry the hidden state of the LAST time step into the next window.
    # The original stored HID[-1], which is the state from before this window
    # (HID is a dict), so the recurrent state never advanced.
    hprev = HID[len(inputs) - 1]
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients


    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):

        mem += dparam * dparam
        param += -alpha * dparam / np.sqrt(mem + 1e-8) # Adagrad

    if i % display_iteration == 0:
        print(str(i))
        seed = inputs[np.random.randint(0,len(inputs))]
        # Sample from the most recent hidden state.
        TXT = sample(hprev, seed, weights, sample_size)
        print("-----------------------------------------------")
        print(TXT)
        print("-----------------------------------------------")
        # os.path.join instead of direc + "\RNNout.txt": the backslash is a
        # Windows-only separator and "\R" is an invalid escape sequence.
        with open(os.path.join(direc, "RNNout.txt"), 'w') as writeFile:
            writeFile.write(TXT)
    ITER.append(i)
    LOSS.append(loss)

    p += batch_size
best_text = sample(hprev, inputs[0], weights, sample_size)


plt.plot(ITER, LOSS, linewidth = 1)
plt.show()

# No explicit writeFile.close() is needed: the `with` block above already
# closed the file.

最佳答案

嗯，哦...看起来您没有重新使用状态!如果不维护状态，LSTM(状态机)如何正常工作？

对我来说，这看起来像是一个危险信号:

output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)

tf.nn.dynamic_rnn 的第二个输出是处理完给定序列后的最新状态。看起来您明确忽略了它，也没有在后续每次训练迭代的 sess.run(...) 中把它重新输入进去(相应地，您的 dynamic_rnn 调用也没有传入 initial_state 参数)。

我强烈建议您在进一步查看之前更改这部分代码。

此外，我不知道您的数据是什么样的，但您的馈送和批处理策略必须能够使整个状态传递练习有意义。否则，它会再次产生乱码。

关于python - 为什么我的 LSTM 在 tensorflow 中学习得这么慢而且很糟糕？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/44957023/

文章推荐： javascript - Angular2如何更新另一个组件中的图像

文章推荐： javascript - Openlayers 4 - 使图层在功能点击时不可见

文章推荐： javascript - Angular (v4) 表单数据对象不匹配 json 数据

lstm - LSTM 单元如何映射到层？
我无法准确理解 LSTM 单元的范围——它如何映射到网络层。来自格雷夫斯 (2014): 在我看来，在单层网络中，layer = lstm 单元。这实际上如何在多层 rnn 中工作？三层RNN LS
machine-learning - lstm(256) + lstm(256) 和 lstm(512) 有什么区别？
这是代码 model = Sequential() model.add(LSTM(256, input_shape=(None, 1), return_sequences=True)) model.a
lstm - Pytorch 隐藏状态 LSTM
为什么我们需要在 PyTorch 中初始化 LSTM 的隐藏状态 h0？h0 不是无论如何都会被计算并被覆盖吗？这是不是很像 int a; a = 0; a = 4：即使我们不做 a = 0，也应该没问题.. 最佳答案重点
lstm - Deeplearning4j LSTM 时间序列预测示例
我正在尝试使用 LSTM 在 Deeplearning4j 中进行一些简单的时间序列预测，但我很难让它工作。我有一个简单的文本文件，其中包含如下所示的数字列表，并希望网络学习预测下一个数字。有没有
keras - 对如何实现时间分布的 LSTM + LSTM 感到困惑
在大量阅读和绘制图表之后，我想我已经提出了一个模型，我可以将其用作更多测试我需要调整哪些参数和功能的基础。但是，我对如何实现以下测试用例感到困惑(所有数字都比最终模型小几个数量级，但我想从小处着手):
lstm - torch.nn.LSTM 运行时错误
我正在尝试实现“Livelinet:用于预测教育视频中的活力的多模式深度循环神经网络”中的结构。为了简单说明，我将 10 秒音频剪辑分成 10 个 1 秒音频剪辑，并从该 1 秒音频剪辑中获取频谱图
python - Tensorflow LSTM - LSTM 单元上的矩阵乘法
我正在 Tensorflow 中制作 LSTM 神经网络。输入张量大小为 92。 import tensorflow as tf from tensorflow.contrib import rnn
python - 在 LSTM 层之前具有嵌入层的 Keras LSTM
我正在尝试 keras IMDB 数据的示例，数据形状是这样的: x_train shape: (25000, 80) 我只是把keras例子的原始代码改成了这样的代码: model = Sequen
lstm - 如何正确地为 PyTorch 中的嵌入、LSTM 和线性层提供输入？
我需要了解如何使用 torch.nn 的不同组件正确准备批量训练的输入。模块。具体来说，我希望为 seq2seq 模型创建一个编码器-解码器网络。假设我有一个包含这三层的模块，按顺序: nn.Emb
tensorflow - Keras - 有状态 LSTM 与无状态 LSTM
我很难概念化 Keras 中有状态 LSTM 和无状态 LSTM 之间的区别。我的理解是，在每个批处理结束时，在无状态情况下“网络状态被重置”，而对于有状态情况，网络状态会为每个批处理保留，然后必须在
lstm - PyTorch LSTM - 使用词嵌入而不是 nn.Embedding()
nn.Embedding() 是学习 LSTM 所必需的吗？我在 PyTorch 中使用 LSTM 来预测 NER - 此处是类似任务的示例 - https://pytorch.org/tutori
python - 塑造 LSTM 的数据，并将密集层的输出馈送到 LSTM
我正在尝试找出适合我想要拟合的模型的正确语法。这是一个时间序列预测问题，我想在将时间序列输入 LSTM 之前使用一些密集层来改进时间序列的表示。这是我正在使用的虚拟系列: import pandas
deep-learning - 堆叠式 LSTM 网络中每个 LSTM 层的输入是什么？
我在理解堆叠式 LSTM 网络中各层的输入-输出流时遇到了一些困难。假设我已经创建了一个如下所示的堆叠式 LSTM 网络: # parameters time_steps = 10 features
lstm - 将 LSTM 中的 Tanh 激活更改为 ReLU
LSTM 类中的默认非线性激活函数是 tanh。我希望在我的项目中使用 ReLU。浏览文档和其他资源，我无法找到一种简单的方法来做到这一点。我能找到的唯一方法是定义我自己的自定义 LSTMCell，但
lstm - 是否可以在 PyTorch 中使用 LSTMCells 模块实现多层 LSTM？
在 PyTorch 中，有一个 LSTM 模块，除了输入序列、隐藏状态和单元状态之外，它还接受 num_layers 参数，该参数指定我们的 LSTM 有多少层。然而，还有另一个模块 LSTMCel
machine-learning - TensorFlow:在另一个 LSTM 之上的 LSTM
没什么好说的作为介绍:我想在 TensorFlow 中将 LSTM 堆叠在另一个 LSTM 上，但一直被错误阻止，我不太明白，更不用说单独解决了。代码如下: def RNN(_X, _istate,
machine-learning - 双向 LSTM 和 LSTM 有什么区别？
有人可以解释一下吗？我知道双向 LSTM 具有前向和反向传递，但是与单向 LSTM 相比，它有什么优势？它们各自更适合什么？最佳答案 LSTM 的核心是使用隐藏状态保留已经通过它的输入信息。单向
python - LSTM 内的 Tensorflow 序列到序列 LSTM(嵌套)
我想构建一个带有特殊词嵌入的 LSTM，但我对它的工作原理有一些疑问。您可能知道，一些 LSTM 对字符进行操作，因此它是字符输入，字符输出。我想做同样的事情，通过对单词的抽象来学习使用嵌套的 LS
Keras LSTM for continuous output and with EarlyStopping(用于连续输出和早期停止的KERAS LSTM)
我编写了一个LSTM回归模型。它是最后一个LSTM层的BATCH_SIZE=1和RETURN_Sequence=True的模型。我还设置了VERIFICATION_DATA和耐心进行培训。但似乎存在一
python - TensorFlow:为下一批记住 LSTM 状态(有状态 LSTM)
给定一个训练有素的 LSTM 模型，我想对单个时间步执行推理，即以下示例中的 seq_length = 1。在每个时间步之后，需要为下一个“批处理”记住内部 LSTM(内存和隐藏)状态。在推理的最开始

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

python - 为什么我的 LSTM 在 tensorflow 中学习得这么慢而且很糟糕？