gpt4 book ai didi

python - CartPole 的 Deep Q 分数停留在 9 分

转载 作者:行者123 更新时间:2023-11-30 09:49:23 24 4
gpt4 key购买 nike

所以我正在使用使用tensorflow的deepQ实现来解决CartPole-v0,但是输出有时(所有运行的40%)停留在9。我尝试使用tf.set_random_seed修复种子,但这仍然没有不保证输出不会卡住。这是我的代码:

from collections import deque
import tensorflow as tf
import numpy as np
import random
import gym
import matplotlib.pyplot as plt
import pickle
from time import time
t = int(time())
class DQNAgent:

def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.memory = deque(maxlen = 2000)
self.gamma = 0.95
#self.epsilon = 1.0
#self.epsilon_min = 0.01
#self.epsilon_decay = 0.995
self.learning_rate = 0.001
self.model = self._build_model()

def _build_model(self):
graph = tf.Graph()
with graph.as_default():
inp = tf.placeholder(tf.float32, [None, self.state_size])
out = tf.placeholder(tf.float32, [None, self.action_size])
w1 = tf.Variable(tf.truncated_normal([self.state_size, 24]))
b1 = tf.Variable(tf.zeros([24]))

hidden = tf.nn.tanh(tf.matmul(inp, w1) + b1)

w2 = tf.Variable(tf.truncated_normal([24, 24]))
b2 = tf.Variable(tf.zeros([24]))

hidden1 = tf.nn.tanh(tf.matmul(hidden, w2) + b2)

w3 = tf.Variable(tf.truncated_normal([24, 24]))
b3 = tf.Variable(tf.zeros([24]))

hidden2 = tf.nn.tanh(tf.matmul(hidden1, w3) + b3)

wo = tf.Variable(tf.truncated_normal([24, self.action_size]))
bo = tf.Variable(tf.zeros([self.action_size]))

prediction = tf.matmul(hidden2, wo) + bo

loss = tf.losses.mean_squared_error(out, prediction)
train = tf.train.AdamOptimizer().minimize(loss)
init = tf.global_variables_initializer()

return graph, inp, out, prediction, train, init

def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done))

def act(self, state, sess):
act_values = sess.run(self.model[3], feed_dict = { self.model[1]: state})
return np.argmax(act_values[0])

def replay(self, batch_size, sess):
try:
minibatch = random.sample(self.memory, batch_size)
except ValueError:
minibatch = self.memory
for state, action, reward, next_state, done in minibatch:
target = reward
if not done:
target = reward + self.gamma * np.amax(sess.run(self.model[3], feed_dict = { self.model[1]: next_state}))
target_f = sess.run(self.model[3], feed_dict = { self.model[1]: state})
target_f[0][action] = target
#print(target_f)
sess.run(self.model[4], feed_dict = { self.model[1]: state, self.model[2]: target_f})

if __name__ == "__main__":
environment = 'CartPole-v0'
env = gym.make(environment)
avgs = deque(maxlen = 50)
rewardLA = []
agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)
sess = tf.Session(graph = agent.model[0])
sess.run(agent.model[5])
episodes = 10000
rewardL = []
for e in range(episodes):
state = env.reset()
state = np.reshape(state, [1, 4])
for time_t in range(500):
#env.render()
action = agent.act(state, sess)
next_state, reward, done, _ = env.step(action)
next_state = np.reshape(next_state, [1, 4])
agent.remember(state, action, reward, next_state, done)
state = next_state
if done:
break
avgs.append(time_t)
rewardLA.append(sum(avgs)/len(avgs))
print("episode: ", e, "score: ", time_t)
rewardL.append(time_t)
agent.replay(32, sess)
#pickle.dump(rewardL, open(environment + "_" + str(t) + "_rewardL.pickle", "wb"))
plt.plot(rewardLA)
plt.show()

我尝试将优化器更改为 GD、rmsProp,但没有任何作用,但如果我只是重新启动代码,它会工作得更好(在 200 个时期内达到 199)。为什么会发生这种情况?我该如何解决它。

最佳答案

看看你的代码,我看不出环境是如何被探索的。难道你不需要像 epsilon 贪婪之类的东西来确保探索的发生吗?例如,我尝试将 agent.act() 方法修改如下,似乎可以解决问题。

def act(self, state, sess, episode):
if random.random() < math.pow(2, -episode / 30):
return env.action_space.sample()

act_values = sess.run(self.model[3], feed_dict = { self.model[1]: state})
return np.argmax(act_values[0])

尝试使用 30,由于缺乏更好的术语,我将其称为“探索常数”。

无论如何,在我看来,如果没有像 epsilon 贪婪这样的东西(或者像上面那样随着时间的推移而衰减的东西),你就依赖神经网络输出来拥有足够的熵来引起足够的探索。有时可能是这样;其他时候则不然。

关于python - CartPole 的 Deep Q 分数停留在 9 分,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/47750291/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com