gpt4 book ai didi

python - 我的 LSTM 学习,损失减少,但数值梯度与分析梯度不匹配

转载 作者:太空狗 更新时间:2023-10-29 22:28:25 38 4
gpt4 key购买 nike

以下是自包含,当您运行它时,它将:

1. 打印损失以验证它正在减少(学习 sin 波),

2.对照我的手推梯度函数检查数值梯度。

这两个梯度倾向于在 1e-1 到 1e-2 内匹配(这仍然很糟糕,但表明它正在尝试)并且偶尔会出现极端异常值。

我整个星期六都在退回到正常的 FFNN,让它工作(耶,梯度匹配!),现在星期天在这个 LSTM 上,好吧,我找不到我逻辑中的错误。哦,这在很大程度上取决于我的随机种子,有时很好,有时很糟糕。

我已经根据 LSTM 方程的手推导数(我做了微积分)和这 3 个博客/要点中的实现手工检查了我的实现:

并尝试了此处建议的(惊人的)调试方法:https://blog.slavv.com/37-reasons-why-your-neural-network-is-not-working-4020854bd607

你能帮忙看看我哪里做错了吗?

import numpy as np
np.set_printoptions(precision=3, suppress=True)

def check_grad(params, In, Target, f, df_analytical, delta=1e-5, tolerance=1e-7, num_checks=10):
"""
delta : how far on either side of the param value to go

tolerance : how far the analytical and numerical values can diverge
"""

h_n = params['Wf'].shape[1] # TODO: h & c should be passed in (?)
h = np.zeros(h_n)
c = np.zeros(h_n)

y, outputs, loss, h, c, caches = f(params, h, c, inputs, targets)
dparams = df_analytical(params, inputs, targets, outputs, caches)

passes = True
for _ in range(num_checks):
print()
for pname, p, dpname, dp in zip(params.keys(), params.values(), dparams.keys(), dparams.values()):

pix = np.random.randint(0, p.size)
old_val = p.flat[pix]

# d = delta * abs(old_val) if old_val != 0 else 1e-5
d = delta

p.flat[pix] = old_val + d
_, _, loss_plus, _, _, _ = f(params, h, c, In, Target) # note `_` is the cache
p.flat[pix] = old_val - d
_, _, loss_minus, _, _, _ = f(params, h, c, In, Target)
p.flat[pix] = old_val

grad_analytic = dp.flat[pix]
grad_numeric = (loss_plus - loss_minus) / (2 * d)

denom = abs(grad_numeric + grad_analytic) + 1e-12 # max((abs(grad_numeric), abs(grad_analytic)))
relative_error = abs(grad_analytic - grad_numeric) / denom

if relative_error > tolerance:
print(("fails: %s % 4d | r: % 3.4f, a: % 3.4f, n: % 3.4f, a/n: %0.2f") % (pname, pix, relative_error, grad_analytic, grad_numeric, grad_analytic/grad_numeric))
passes &= relative_error <= tolerance

return passes


# ----------

def lstm(params, inp, h_old, c_old):

Wf, Wi, Wg, Wo, Wy = params['Wf'], params['Wi'], params['Wg'], params['Wo'], params['Wy']
bf, bi, bg, bo, by = params['bf'], params['bi'], params['bg'], params['bo'], params['by']

xh = np.concatenate([inp, h_old])

f = np.dot(xh, Wf) + bf
f_sigm = 1 / (1 + np.exp(-f))

i = np.dot(xh, Wi) + bi
i_sigm = 1 / (1 + np.exp(-i))

g = np.dot(xh, Wg) + bg # C-tilde or C-bar in the literature
g_tanh = np.tanh(g)

o = np.dot(xh, Wo) + bo
o_sigm = 1 / (1 + np.exp(-o))

c = f_sigm * c_old + i_sigm * g_tanh

c_tanh = np.tanh(c)
h = o_sigm * c_tanh

y = np.dot(h, Wy) + by # NOTE: this is a dense layer bolted on after a normal LSTM
# TODO: should it have a nonlinearity after it? MSE would not work well with, for ex, a sigmoid

cache = (xh, f, f_sigm, i, i_sigm, g, g_tanh, o, o_sigm, c, c_tanh, c_old, h)
return y, h, c, cache


def dlstm(params, dy, dh_next, dc_next, cache):

Wf, Wi, Wg, Wo, Wy = params['Wf'], params['Wi'], params['Wg'], params['Wo'], params['Wy']
bf, bi, bg, bo, by = params['bf'], params['bi'], params['bg'], params['bo'], params['by']

xh, f, f_sigm, i, i_sigm, g, g_tanh, o, o_sigm, c, c_tanh, c_old, h = cache

dby = dy.copy()
dWy = np.outer(h, dy)
dh = np.dot(dy, Wy.T) + dh_next.copy()
do = c_tanh * dh * o_sigm * (1 - o_sigm)
dc = dc_next.copy() + o_sigm * dh * (1 - c_tanh ** 2) # TODO: copy?
dg = i_sigm * dc * (1 - g_tanh ** 2)
di = g_tanh * dc * i_sigm * (1 - i_sigm)
df = c_old * dc * f_sigm * (1 - f_sigm) # ERROR FIXED: ??? c_old -> c?, c->c_old?

dWo = np.outer(xh, do); dbo = do; dXo = np.dot(do, Wo.T)
dWg = np.outer(xh, dg); dbg = dg; dXg = np.dot(dg, Wg.T)
dWi = np.outer(xh, di); dbi = di; dXi = np.dot(di, Wi.T)
dWf = np.outer(xh, df); dbf = df; dXf = np.dot(df, Wf.T)

dX = dXo + dXg + dXi + dXf
dh_next = dX[-h.size:]
dc_next = f_sigm * dc

dparams = dict(Wf = dWf, Wi = dWi, Wg = dWg, Wo = dWo, Wy = dWy,
bf = dbf, bi = dbi, bg = dbg, bo = dbo, by = dby)

return dparams, dh_next, dc_next


def lstm_loss(params, h, c, inputs, targets):
loss = 0
outputs = []
caches = []
for inp, target in zip(inputs, targets):
y, h, c, cache = lstm(params, inp, h, c)
loss += np.mean((y - target) ** 2)
outputs.append(y)
caches.append(cache)
loss = loss # / inputs.shape[0]
return y, outputs, loss, h, c, caches

def dlstm_loss(params, inputs, targets, outputs, caches):
h_shape = caches[0][-1].shape
dparams = {k:np.zeros_like(v) for k, v in params.items()}
dh = np.zeros(h_shape)
dc = np.zeros(h_shape)

for inp, out, target, cache in reversed(list(zip(inputs, outputs, targets, caches))):
dy = 2 * (out - target)
dps, dh, dc = dlstm(params, dy, dh, dc, cache)
for dpk, dpv in dps.items():
dparams[dpk] += dpv
return dparams


# ----------
# setup

x_n = 1
h_n = 5
o_n = 1

params = dict(
Wf = np.random.normal(size=(x_n + h_n, h_n)),
Wi = np.random.normal(size=(x_n + h_n, h_n)),
Wg = np.random.normal(size=(x_n + h_n, h_n)),
Wo = np.random.normal(size=(x_n + h_n, h_n)),
Wy = np.random.normal(size=(h_n, o_n)),
bf = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
bi = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
bg = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
bo = np.zeros(h_n) + np.random.normal(size=h_n) * 0.1,
by = np.zeros(o_n) + np.random.normal(size=o_n) * 0.1,
)

for name in ['Wf', 'Wi', 'Wg', 'Wo', 'Wy']:
W = params[name]
W *= np.sqrt(2 / (W.shape[0] + W.shape[1])) # Xavier initialization
for name in params:
params[name] = params[name].astype('float64')


# ----------
# Sanity check, learn sin wave

def test_sin():
emaloss = 1 # EMA average
emak = 0.99

for t in range(5000):
data = np.sin(np.linspace(0, 3 * np.pi, 30))
start = np.random.randint(0, data.size // 4)
end = np.random.randint((data.size * 3) // 4, data.size)
inputs = data[start:end, None]
targets = np.roll(inputs, 1, axis=0)


h_n = params['Wf'].shape[1] # TODO: h & c should be passed in
h = np.random.normal(size=h_n)
c = np.random.normal(size=h_n)

y, outputs, loss, h, c, caches = lstm_loss(params, h, c, inputs, targets)
dparams = dlstm_loss(params, inputs, targets, outputs, caches)

for k in params.keys():
params[k] -= dparams[k] * 0.01


emaloss = emaloss * emak + loss * (1 - emak)
if t % 100 == 0:
print('%.4f' % emaloss)
test_sin()

# ----------
data = np.sin(np.linspace(0, 4 * np.pi, 90))
start = np.random.randint(0, data.size // 4)
end = np.random.randint((data.size * 3) // 4, data.size)
inputs = data[start:end, None]
targets = np.roll(inputs, 1, axis=0)

for inp, targ in zip(inputs, targets):
assert(check_grad(params, inputs, targets, lstm_loss, dlstm_loss, delta=1e-5, tolerance=1e-7, num_checks=10))
print('grads are ok') # <- i never reach here

最佳答案

解决了!在我的 check_grad 中,我需要构建用于 df_analyticalcaches,但在这样做时,我也覆盖了 h c 应该是 np.zeroes

y, outputs, loss, h, c, caches = f(params, h, c, inputs, targets)

_, _, loss_minus, _, _, _ = f(params, h, c, inputs, targets)
p.flat[pix] = old_val

因此,只需不覆盖 hc 即可修复它,并且 LSTM 代码正常。

_, outputs, loss, _, _, caches = f(params, h, c, inputs, targets)

关于python - 我的 LSTM 学习,损失减少,但数值梯度与分析梯度不匹配,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/54282206/

38 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com