gpt4 book ai didi

python - tf2.0 : Gradient Tape returns None gradient in RNN model

转载 作者:行者123 更新时间:2023-12-05 05:46:35 26 4
gpt4 key购买 nike

在具有嵌入层和 SimpleRNN 层的模型中,我想为每个步骤 t 计算偏导数 dh_t/dh_0。

我的模型结构,包括导入和数据预处理。
Toxic Comment 训练数据可在此获取:https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification/data?select=jigsaw-toxic-comment-train.csv
GloVe 6B 100d 嵌入可用:https://nlp.stanford.edu/projects/glove/

### 1. Imports 
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 2. Text data tokenisation and GloVe-100d embeddings:
def data_pp(csv_path='/Users/Toxic comment data/jigsaw-toxic-comment-train.csv'):
    """Load the toxic-comment CSV, split it, and tokenise/pad the comments.

    The CSV is read with pandas, the label columns other than ``toxic`` are
    dropped, and only the first 12000 rows are kept. The comments are split
    80/20 (stratified on ``toxic``, fixed seed), a Keras ``Tokenizer`` is
    fitted on both splits together, and every sequence is padded to the
    largest whitespace-split word count found in ``comment_text``.

    Args:
        csv_path: Location of ``jigsaw-toxic-comment-train.csv``. Defaults
            to the path used in the original post.

    Returns:
        Tuple ``(xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok)``:
        padded train/test sequences with their labels, the vocabulary size
        plus one, the padded sequence length, and the fitted tokenizer.
    """
    train = pd.read_csv(csv_path)
    train.drop(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
               axis=1, inplace=True)
    train = train.iloc[:12000, :]

    labels = train['toxic'].values
    xtr, xte, ytr, yte = train_test_split(
        train['comment_text'].values,
        labels,
        stratify=labels,
        random_state=42, test_size=0.2, shuffle=True)

    # Tokenise data
    tok = text.Tokenizer(num_words=None)
    tok.fit_on_texts(list(xtr) + list(xte))
    # One extra row so that the largest word index is a valid embedding row.
    input_dim = len(tok.word_index) + 1
    # Pad to the longest comment, measured in whitespace-separated words.
    input_length = train['comment_text'].apply(lambda x: len(str(x).split())).max()
    xtr_seq = tok.texts_to_sequences(xtr)
    xte_seq = tok.texts_to_sequences(xte)
    xtr_pad = sequence.pad_sequences(xtr_seq, maxlen=input_length)
    xte_pad = sequence.pad_sequences(xte_seq, maxlen=input_length)
    print('Shape of tokenised training input:', xtr_pad.shape)
    return xtr_pad, ytr, xte_pad, yte, input_dim, input_length, tok

# Run the preprocessing once and keep every piece at module level; the
# later snippets read these names directly.
(xtr_pad, ytr,
 xte_pad, yte,
 input_dim, input_length, tok) = data_pp()

# Word embeddings
def embed_mat(input_dim, output_dim, tok, glove_path='/Users/GloVe/glove.6B.100d.txt'):
    """Build an embedding weight matrix from a GloVe text file.

    Each non-empty line of the file is split on whitespace: the first token
    is the word and the remaining tokens are parsed as its float32 vector.
    Row ``i`` of the result holds the vector of the word that
    ``tok.word_index`` maps to ``i``; words missing from the file keep an
    all-zero row.

    Args:
        input_dim: Number of rows of the matrix (vocabulary size + 1).
        output_dim: Embedding width; use 100 for the GloVe 100d vectors.
        tok: Object exposing a ``word_index`` mapping of word -> row index.
        glove_path: Path of the GloVe text file. Defaults to the path used
            in the original post.

    Returns:
        Array of shape ``(input_dim, output_dim)``.
    """
    embedding_dict = {}
    # The context manager closes the file even if a line fails to parse;
    # the explicit encoding keeps decoding independent of the platform locale.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embedding_dict[values[0]] = asarray(values[1:], dtype='float32')

    Emat = zeros((input_dim, output_dim))
    for word, i in tok.word_index.items():
        embedding_vector = embedding_dict.get(word)
        if embedding_vector is not None:
            Emat[i] = embedding_vector
    print('Embedding weight matrix has shape:', Emat.shape)
    return Emat

# Width of the GloVe 100d vectors.
output_dim = 100
# Pass the tokenizer returned by data_pp() (`tok`); the posted snippet
# misspelled it as `took`, which raises NameError.
Emat = embed_mat(input_dim, output_dim, tok)

### 3. Define model and compute gradients:
# You can let it run for a few steps and stop the process. Then inspect the first step h_t, h_0 and the computed dh_t/dh_0.
# For the case in my comment, you can remove the for-loop over the steps t, comment out ht, and compute tape.gradient(states, h0) instead.

batch_size = 100
inp = Input(batch_shape=(batch_size, input_length), name='input')
# Frozen embedding layer initialised with the GloVe weight matrix.
emb_out = Embedding(input_dim, output_dim, input_length=input_length,
                    weights=[Emat], trainable=False, name='embedding')(inp)
rnn = SimpleRNN(200, return_sequences=True, return_state=False, stateful=True, name='simpleRNN')

# Random initial hidden state, held as a constant tensor.
h0 = tf.convert_to_tensor(np.random.uniform(size=(batch_size, 200)).astype(np.float32))
rnn_allstates = rnn(emb_out, initial_state=h0)
model_rnn = Model(inputs=inp, outputs=rnn_allstates, name='model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# A single batch of batch_size samples.
ds = tf.data.Dataset.from_tensor_slices((xtr_pad[:batch_size], ytr[:batch_size])).batch(batch_size)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]

# NOTE: this is the question's repro and is left behaving as posted --
# tape.gradient(ht, h0) comes back as None for every step t.
grads_allsteps = []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        with tf.GradientTape() as tape:
            # h0 is a plain tensor, so it has to be watched explicitly.
            tape.watch(h0)
            et = embedding_layer(x_batch_train)
            states = rnn_layer(et, initial_state=h0)  # (100, 1403, 200)
            ht = states[:, t, :]

        grad_t = tape.gradient(ht, h0)  # (100, 200)
        print('Computed gradient dht/dh0 at step ', t+1, 'in batch', b+1)
        grads_allsteps.append(grad_t)

在每个步骤 t,h_t 的形状为 (100,200),h_0 的形状为 (100,200)。但是 tape.gradient(ht, h0) 对每个 t 都返回 None。下面是第一步的结果:

# Same computation restricted to the first step (t = 0), printing h_t, h_0
# and the resulting gradient for inspection.
for t in range(1):
    with tf.GradientTape() as tape:
        tape.watch(h0)
        et = embedding_layer(x_batch_train)
        #tape.watch(et)
        states = rnn_layer(et, initial_state=h0)  # (100, 1403, 200)
        ht = states[:, t, :]
    print(ht)
    print(h0)
    # Taken after the tape context has closed, as in the full loop.
    grad_t = tape.gradient(ht, h0)
    tf.print(grad_t)

>>
# h_t:
tf.Tensor(
[[ 0.25634336 0.5259362 0.60045886 ... -0.4978792 0.62755316
0.09803997]
[ 0.58387524 0.26037565 0.5646103 ... 0.31233114 0.4853201
0.10877549]
[ 0.17190906 0.68681747 -0.32054633 ... -0.6139967 0.48944488
0.06301598]
...
[ 0.1985917 -0.11821499 -0.47709295 ... -0.05718012 0.16089934
0.20585683]
[ 0.73872745 0.503326 0.25224414 ... -0.5771631 0.03748894
0.09212588]
[-0.6597108 -0.43926442 -0.23546427 ... 0.26760277 0.28221437
-0.4039318 ]], shape=(100, 200), dtype=float32)

# h_0:
tf.Tensor(
[[0.51580787 0.51664346 0.70773274 ... 0.45973232 0.7760376 0.48297063]
[0.61048764 0.26038417 0.60392565 ... 0.7426153 0.15507504 0.57494944]
[0.11859739 0.33591187 0.68375146 ... 0.59409297 0.5302879 0.28876984]
...
[0.12401487 0.39376178 0.9850304 ... 0.21582918 0.9592233 0.5257605 ]
[0.9401199 0.2157638 0.6445949 ... 0.36316434 0.5799403 0.3749675 ]
[0.37230062 0.18162128 0.0739954 ... 0.21624395 0.66291 0.7807376 ]], shape=(100, 200), dtype=float32)

# dh_t/dh_0:
None

GradientTape 似乎难以观察(watch)这个 h_0 并据此计算梯度。我已经成功地用 GradientTape 观察 RNN 层的输入 e_t,并计算出了梯度 dh_t/de_t,但这并不能真正说明模型拟合的质量。

如何用 GradientTape 观察这个不随时间变化的量 h_0,从而计算梯度 dh_t/dh_0?在此先感谢您的帮助。


可重现的测试用例:

### 1. Imports 
from __future__ import print_function
import numpy as np
from numpy import array, asarray, zeros
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import Input, Model
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import BatchNormalization, PReLU
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.preprocessing import sequence, text
from keras import backend as k

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 2. Simulated data and gradient computation:
batch_size = 100
input_length = 5
# Random integer token ids in [0, 500) and random targets stand in for the
# real data.
xtr_pad = tf.random.uniform((batch_size, input_length), maxval=500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))


inp = Input(batch_shape=(batch_size, input_length), name='input')
emb_out = Embedding(500, 100, input_length=input_length, trainable=False, name='embedding')(inp)
rnn = SimpleRNN(200, return_sequences=True, return_state=False, stateful=True, name='simpleRNN')

# Random initial hidden state, held as a constant tensor.
h0 = tf.convert_to_tensor(np.random.uniform(size=(batch_size, 200)).astype(np.float32))

rnn_allstates = rnn(emb_out, initial_state=h0)
model_rnn = Model(inputs=inp, outputs=rnn_allstates, name='model_rnn')
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(batch_size)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]

# NOTE: left behaving as posted -- with stateful=True only the first step
# produces a gradient; the remaining entries of grads_allsteps are None.
grads_allsteps = []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        with tf.GradientTape() as tape:
            tape.watch(h0)
            states = model_rnn(x_batch_train)
            ht = states[:, t, :]

        grad_t = tape.gradient(ht, h0)
        print('Computed gradient dht/dh0 at step ', t+1, 'in batch', b+1)
        grads_allsteps.append(grad_t)


一个有趣的现象:第一步的梯度可以计算出来,而且看起来正常;其余各步的梯度均为 None。

grads_allsteps

>>
[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 1.2307187 , -1.0343404 , 0.52859926, ..., -0.09879799,
-1.1407609 , -0.7241671 ],
[ 1.142821 , -1.312029 , 0.37148148, ..., 0.2300478 ,
-1.1440411 , -0.36673146],
[ 1.2778691 , -1.2225235 , 0.69951147, ..., 0.17701946,
-1.2816343 , -0.52648413],
...,
[ 1.1717036 , -1.2444504 , 0.5874837 , ..., -0.13161334,
-1.3752006 , -0.376719 ],
[ 1.1333262 , -1.0013355 , 0.3363382 , ..., -0.22350994,
-1.299541 , -0.5073889 ],
[ 1.18489 , -0.90809333, 0.55045474, ..., -0.10550319,
-1.0866506 , -0.58325446]], dtype=float32)>, None, None, None, None]

最佳答案

您可以尝试使用 tf.gradients。还可以将 tf.Variable 用于 h0:

# Your imports
#-------
### 2. Simulated data and gradient computation:
batch_size = 100
input_length = 5
# Simulated inputs: random token ids below 500 and random targets.
xtr_pad = tf.random.uniform(
    (batch_size, input_length), maxval=500, dtype=tf.int32)
ytr = tf.random.normal((batch_size, input_length, 200))


inp = Input(batch_shape=(batch_size, input_length), name='input')
emb_out = Embedding(
    500, 100,
    input_length=input_length, trainable=False, name='embedding',
)(inp)
rnn = SimpleRNN(
    200,
    return_sequences=True, return_state=False, stateful=True,
    name='simpleRNN',
)

# The initial state is a tf.Variable here rather than a constant tensor.
h0 = tf.Variable(tf.random.uniform((batch_size, 200)))

rnn_allstates = rnn(emb_out, initial_state=h0)
model_rnn = Model(inputs=inp, outputs=rnn_allstates, name='model_rnn')
model_rnn.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

ds = tf.data.Dataset.from_tensor_slices((xtr_pad, ytr)).batch(100)
embedding_layer = model_rnn.layers[1]
rnn_layer = model_rnn.layers[2]


@tf.function
def calculate_t_gradients(t, x, h0):
    """Return the gradient of the step-``t`` RNN output with respect to ``h0``.

    Runs ``model_rnn`` on ``x``, slices out time step ``t`` and
    differentiates it with ``tf.gradients``. The ``@tf.function`` wrapper
    means the call is traced into a graph.

    Args:
        t: Index of the time step to slice from the model output.
        x: Batch of inputs passed to ``model_rnn``.
        h0: Initial hidden state to differentiate with respect to.

    Returns:
        The list produced by ``tf.gradients`` (one entry, for ``h0``).
    """
    return tf.gradients(model_rnn(x)[:, t, :], h0)

# Collect dh_t/dh_0 for every time step of every batch.
grads_allsteps = []
for b, (x_batch_train, y_batch_train) in enumerate(ds):
    for t in range(input_length):
        grads_allsteps.append(calculate_t_gradients(t, x_batch_train, h0))

print(grads_allsteps)
[[<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 1.2034059 , -0.46448404, 0.6272926 , ..., -0.40906236,
0.07618493, 0.6338958 ],
[ 1.2781916 , -0.20411322, 0.6174417 , ..., -0.31636393,
-0.23417974, 0.67499626],
[ 1.113218 , -0.65086263, 0.63425934, ..., -0.66614366,
-0.07726163, 0.53647137],
...,
[ 1.3399608 , -0.54088974, 0.6213518 , ..., 0.00831087,
-0.14397278, 0.2614633 ],
[ 1.213171 , -0.42787278, 0.60535026, ..., -0.56198204,
-0.09142771, 0.6212783 ],
[ 1.1901733 , -0.5743524 , 0.36872283, ..., -0.42522985,
-0.0861398 , 0.495057 ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.3487598 , 1.2738569 , -0.48500937, ..., 0.6011117 ,
-0.20381093, 0.45596513],
[ 0.37931004, 1.2778724 , -0.8682532 , ..., 0.8170228 ,
0.1456329 , 0.23715591],
[ 0.5984771 , 0.92434835, -0.8879645 , ..., 0.38756457,
-0.17436962, 0.47174054],
...,
[ 0.61081064, 0.99631476, -0.5104377 , ..., 0.5042721 ,
0.02844866, 0.34626445],
[ 0.7126102 , 1.0205276 , -0.60710275, ..., 0.49418694,
-0.16092762, 0.41363668],
[ 0.8581749 , 1.1259711 , -0.5824491 , ..., 0.45388597,
-0.16205123, 0.72434616]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 3.8507193e-01, 1.2925258e+00, 1.2027258e+00, ...,
3.2430276e-01, 2.2319333e-01, -2.5218868e-01],
[ 5.9262186e-01, 1.4497797e+00, 1.2479483e+00, ...,
4.6175608e-01, 2.5466472e-01, -2.4279505e-01],
[ 2.5734475e-01, 1.4562432e+00, 1.1020679e+00, ...,
6.6081107e-01, 1.9841105e-01, -2.5595558e-01],
...,
[ 5.1541841e-01, 1.6206543e+00, 9.6205616e-01, ...,
7.2725344e-01, 2.5501373e-01, -7.7709556e-04],
[ 4.4518453e-01, 1.6381552e+00, 1.0112666e+00, ...,
5.5238277e-01, 2.4137528e-01, -2.6242572e-01],
[ 6.6721851e-01, 1.5826726e+00, 1.1282607e+00, ...,
3.2301426e-01, 2.2295776e-01, 1.1724380e-01]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[ 0.14262576, 0.578709 , 0.1149607 , ..., 0.1229499 ,
-0.42344815, 0.8837458 ],
[-0.09711604, 0.04376438, -0.11737494, ..., 0.00389774,
0.01737173, 0.17246482],
[ 0.24414796, 0.30101255, -0.12234146, ..., -0.04850931,
-0.31790918, 0.21326394],
...,
[-0.20562285, 0.21999156, 0.02703794, ..., -0.03547464,
-0.59052145, 0.04695258],
[ 0.2087476 , 0.46558812, -0.18172565, ..., -0.01167884,
-0.20868361, 0.09055485],
[-0.22442941, 0.16119067, 0.10854454, ..., 0.14752978,
-0.32307786, 0.343314 ]], dtype=float32)>], [<tf.Tensor: shape=(100, 200), dtype=float32, numpy=
array([[-1.1414615 , 0.37376842, -1.0230722 , ..., 0.60619426,
0.22550163, -0.6948315 ],
[-1.0124328 , 0.27892357, -0.96915233, ..., 0.7048603 ,
-0.15284726, -0.6734605 ],
[-0.8542529 , 0.25970122, -0.90076745, ..., 0.8825682 ,
-0.02474228, -0.55014515],
...,
[-0.89430666, 0.68327624, -1.0109956 , ..., 0.31722566,
-0.23703958, -0.6766514 ],
[-0.8633691 , 0.28742114, -0.9896866 , ..., 0.98315084,
0.0115847 , -0.55474746],
[-0.7229766 , 0.62417865, -1.2342371 , ..., 0.85149145,
-0.04468453, -0.60606724]], dtype=float32)>]]

您需要确保 SimpleRNN 的 stateful 参数为 False,因为根据 docs :

If True, the last state for each sample at index i in a batch will be used as initial state for the sample of index i in the following batch.

因此,如果您将 stateful 设置为 False,您的代码还将为每个时间步计算梯度。

关于python - tf2.0 : Gradient Tape returns None gradient in RNN model,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/71153292/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com