
python - Cost "jumping" of a simple non-object-oriented neural network


I am building a sketch of a neural network in Python 3.4 using numpy and matrices to learn the simple XOR function. My notation is as follows:

a is the activity of a neuron

z is the input of a neuron

W is a weight matrix of size R^{# neurons in previous layer} x {# neurons in next layer}

B is a vector of bias values
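With this notation, each layer in the code below computes

    z = W^T * a_prev + B,    a = sigmoid(z) = 1 / (1 + exp(-z))

where a_prev denotes the activity of the previous layer; W enters transposed because of the R^{# previous layer} x {# next layer} shape chosen above.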

After implementing a very simple network in python, everything works fine when training on only a single input vector. However, when training on all four training examples of XOR, the error function shows very strange behavior (see figures) and the output of the network is always approximately 0.5. Changing the network size, the learning rate, or the number of training epochs does not seem to help.

[Figure: Cost J while training on only one training example]

[Figure: Cost J while training with all training examples]

Here is the code of the network:

import numpy as np
import time
import matplotlib.pyplot as plt


Js = []
start = time.time()
np.random.seed(2)


#Sigmoid
def activation(x, derivative = False):
    if(derivative):
        a = activation(x)
        return a * (1 - a)
    else:
        return 1/(1+np.exp(-x))

def cost(output, target):
    return (1/2) * np.sum((target - output)**2)


INPUTS = np.array([
    [0, 1],
    [1, 0],
    [0, 0],
    [1, 1],
])
TARGET = np.array([
    [1],
    [1],
    [0],
    [0],
])

"Hyper-Parameters"
# Layer Structure
LAYER = [2, 3, 1]
LEARNING_RATE = 0.1
ITERATIONS = int(1e3)

# Init Weights
W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])

# Init Biases
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)

for i in range(0, ITERATIONS):
    exampleIndex = i % len(INPUTS)
    #exampleIndex = 2
    "Forward Pass"
    # Layer One Activity (Input layer)
    A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])

    # Layer Two Activity (Hidden Layer)
    Z1 = np.dot(np.transpose(W1), A0) + B1
    A1 = activation(Z1)

    # Layer Three Activity (Output Layer)
    Z2 = np.dot(np.transpose(W2), A1) + B2
    A2 = activation(Z2)

    # Output
    O = A2

    # Cost J

    # Target Vector T
    T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
    J = cost(O, T)
    Js.append(J)

    print("J = {}".format(J))
    print("I = {}, O = {}".format(A0, O))

    "Backward Pass"

    # Calculate Delta of output layer
    D2 = (O - T) * activation(Z2, True)

    # Calculate Delta of hidden layer
    D1 = np.dot(W2, D2) * activation(Z1, True)

    # Calculate Derivatives w.r.t. W2
    DerW2 = np.dot(A1, np.transpose(D2))
    # Calculate Derivatives w.r.t. W1
    DerW1 = np.dot(A0, np.transpose(D1))

    # Calculate Derivatives w.r.t. B2
    DerB2 = D2
    # Calculate Derivatives w.r.t. B1
    DerB1 = D1

    "Update Weights and Biases"

    W1 -= LEARNING_RATE * DerW1
    B1 -= LEARNING_RATE * DerB1

    W2 -= LEARNING_RATE * DerW2
    B2 -= LEARNING_RATE * DerB2

# Show prediction

print("Time elapsed {}s".format(time.time() - start))
plt.plot(Js)
plt.ylabel("Cost J")
plt.xlabel("Iterations")
plt.show()

What could be the reason for this strange behavior in my implementation?

Best Answer

I think your cost function is jumping because you perform a weight update after each single sample. Your network is still learning the correct behavior, though:

479997
J = 4.7222501603409765e-05
I = [[1]
[0]], O = [[ 0.99028172]]
T = [[1]]
479998
J = 7.3205311398742e-05
I = [[0]
[0]], O = [[ 0.01210003]]
T = [[0]]
479999
J = 4.577485181547362e-05
I = [[1]
[1]], O = [[ 0.00956816]]
T = [[0]]
480000
J = 4.726257702199439e-05
I = [[0]
[1]], O = [[ 0.9902776]]
T = [[1]]

The cost function shows some interesting behavior: the training process reaches a point where the jumps in the cost function become very small. You can reproduce it with the code below (I made only minor changes; note that I trained for many more epochs):

import numpy as np
import time
import matplotlib.pyplot as plt


Js = []
start = time.time()
np.random.seed(2)


#Sigmoid
def activation(x, derivative = False):
    if(derivative):
        a = activation(x)
        return a * (1 - a)
    else:
        return 1/(1+np.exp(-x))

def cost(output, target):
    return (1/2) * np.sum((target - output)**2)


INPUTS = np.array([[0, 1],[1, 0],[0, 0],[1, 1]])
TARGET = np.array([[1],[1],[0],[0]])

"Hyper-Parameters"
# Layer Structure
LAYER = [2, 3, 1]
LEARNING_RATE = 0.1
ITERATIONS = int(5e5)

# Init Weights
W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])

# Init Biases
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)

for i in range(0, ITERATIONS):
    exampleIndex = i % len(INPUTS)
    # exampleIndex = 2
    "Forward Pass"
    # Layer One Activity (Input layer)
    A0 = np.transpose(INPUTS[exampleIndex:exampleIndex+1])

    # Layer Two Activity (Hidden Layer)
    Z1 = np.dot(np.transpose(W1), A0) + B1
    A1 = activation(Z1)

    # Layer Three Activity (Output Layer)
    Z2 = np.dot(np.transpose(W2), A1) + B2
    A2 = activation(Z2)

    # Output
    O = A2

    # Cost J

    # Target Vector T
    T = np.transpose(TARGET[exampleIndex:exampleIndex+1])
    J = cost(O, T)
    Js.append(J)

    # print("J = {}".format(J))
    # print("I = {}, O = {}".format(A0, O))
    # print("T = {}".format(T))
    if ((i+3) % 20000 == 0):
        print(i)
        print("J = {}".format(J))
        print("I = {}, O = {}".format(A0, O))
        print("T = {}".format(T))
    if ((i+2) % 20000 == 0):
        print(i)
        print("J = {}".format(J))
        print("I = {}, O = {}".format(A0, O))
        print("T = {}".format(T))
    if ((i+1) % 20000 == 0):
        print(i)
        print("J = {}".format(J))
        print("I = {}, O = {}".format(A0, O))
        print("T = {}".format(T))
    if (i % 20000 == 0):
        print(i)
        print("J = {}".format(J))
        print("I = {}, O = {}".format(A0, O))
        print("T = {}".format(T))

    "Backward Pass"

    # Calculate Delta of output layer
    D2 = (O - T) * activation(Z2, True)

    # Calculate Delta of hidden layer
    D1 = np.dot(W2, D2) * activation(Z1, True)

    # Calculate Derivatives w.r.t. W2
    DerW2 = np.dot(A1, np.transpose(D2))
    # Calculate Derivatives w.r.t. W1
    DerW1 = np.dot(A0, np.transpose(D1))

    # Calculate Derivatives w.r.t. B2
    DerB2 = D2
    # Calculate Derivatives w.r.t. B1
    DerB1 = D1

    "Update Weights and Biases"

    W1 -= LEARNING_RATE * DerW1
    B1 -= LEARNING_RATE * DerB1

    W2 -= LEARNING_RATE * DerW2
    B2 -= LEARNING_RATE * DerB2

# Show prediction

print("Time elapsed {}s".format(time.time() - start))
plt.plot(Js)
plt.ylabel("Cost J")
plt.xlabel("Iterations")
plt.savefig('cost.pdf')
plt.show()

To reduce the fluctuation of the cost function, one usually uses multiple data samples before performing an update (an averaged update), but I find this hard to do with a set that contains only four different training examples. So, to conclude this rather long answer: your cost function is jumping because it is computed for each single example and not for an average over multiple examples. However, the network output follows the distribution of the XOR function quite well, so you don't need to change anything.
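For illustration only (this sketch is not part of the original answer), here is roughly what such an averaged, full-batch update could look like for the four XOR examples: all inputs are pushed through the network at once as columns of one matrix, and the gradients are averaged over the batch before the weights are updated. With only four examples, full batch is effectively the only batch size larger than one. The names X, Y and m, the column-wise data layout, and the hyperparameters are my own choices and are not tuned; convergence still depends on the random seed and the learning rate.

import numpy as np

np.random.seed(2)

# Sigmoid activation and its derivative (same function as in the answer above)
def activation(x, derivative = False):
    if(derivative):
        a = activation(x)
        return a * (1 - a)
    else:
        return 1/(1+np.exp(-x))

# XOR data with examples as columns: X has shape (2, 4), Y has shape (1, 4)
X = np.array([[0, 1, 0, 1],
              [1, 0, 0, 1]])
Y = np.array([[1, 1, 0, 0]])

LAYER = [2, 3, 1]
LEARNING_RATE = 0.5      # illustrative value, not tuned
ITERATIONS = int(5e4)    # illustrative value, not tuned
m = X.shape[1]           # number of training examples (4)

W1 = np.random.rand(LAYER[0], LAYER[1])
W2 = np.random.rand(LAYER[1], LAYER[2])
B1 = np.random.rand(LAYER[1], 1)
B2 = np.random.rand(LAYER[2], 1)

Js = []
for i in range(ITERATIONS):
    # Forward pass for all examples at once; the biases broadcast over columns
    Z1 = np.dot(np.transpose(W1), X) + B1    # shape (3, 4)
    A1 = activation(Z1)
    Z2 = np.dot(np.transpose(W2), A1) + B2   # shape (1, 4)
    A2 = activation(Z2)

    # Mean cost over the whole batch
    J = (1 / (2 * m)) * np.sum((Y - A2) ** 2)
    Js.append(J)

    # Backward pass; every derivative is averaged over the batch
    D2 = (A2 - Y) * activation(Z2, True)          # shape (1, 4)
    D1 = np.dot(W2, D2) * activation(Z1, True)    # shape (3, 4)

    DerW2 = np.dot(A1, np.transpose(D2)) / m      # shape (3, 1)
    DerW1 = np.dot(X, np.transpose(D1)) / m       # shape (2, 3)
    DerB2 = np.mean(D2, axis=1, keepdims=True)    # shape (1, 1)
    DerB1 = np.mean(D1, axis=1, keepdims=True)    # shape (3, 1)

    W1 -= LEARNING_RATE * DerW1
    W2 -= LEARNING_RATE * DerW2
    B1 -= LEARNING_RATE * DerB1
    B2 -= LEARNING_RATE * DerB2

print("Final cost J = {}".format(J))
print("Outputs for the four inputs:\n{}".format(A2))

Because every recorded value of J is already an average over the whole training set, the resulting cost curve no longer jumps between examples; the trade-off is fewer (but larger) update steps per pass over the data.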

Regarding python - Cost "jumping" of a simple non-object-oriented neural network, a similar question was found on Stack Overflow: https://stackoverflow.com/questions/43457429/
