python-3.x - Eager Execution，tf.GradientTape 只返回 None-6ren

python-3.x - Eager Execution，tf.GradientTape 只返回 None

转载作者：行者123 更新时间：2023-12-04 17:38:36

我正在尝试使用 tf.GradientTape 计算梯度。当我把损失(loss)和 Model.trainable_weights(tf.keras.Model)作为输入传给 tape.gradient 时，返回的结果是一个全为 None 的列表。我究竟做错了什么？我使用的 TensorFlow 版本是 1.13.0。

实现的算法是一个 On-Policy DQN(不是通常的 DQN)，所以我不使用目标网络(在传统的 DQN 代码中用作行为网络)。因此，我想对误差求导(即计算其梯度)，该误差在下面的代码中被定义为 Y(即 R + gamma * max_a Q(s', a'))与 Q(s,a) 之间的小批量 MSE。

import gym
import numpy as np
import tensorflow as tf
from collections import deque

# ==== import below from my repo ====
from common.wrappers import MyWrapper   # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters    # params for training
from common.memory import ReplayBuffer  # Experience Replay Buffer

tf.enable_eager_execution()

class Model(tf.keras.Model):
    """Fully connected network: three 16-unit ReLU layers and a softmax head.

    Args:
        num_action: number of units in the output layer.
    """

    def __init__(self, num_action):
        super().__init__()
        hidden_units = 16
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense3 = tf.keras.layers.Dense(hidden_units, activation='relu')
        # NOTE(review): softmax makes the outputs sum to 1, yet the rest of the
        # script treats them as Q(s, a) -- confirm this activation is intended.
        self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

    def call(self, inputs):
        """Feed `inputs` through the hidden layers, then the output layer."""
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return self.pred(hidden)


class DQN:
    """On-policy DQN agent: one `Model` network plus an Adam optimizer."""

    def __init__(self, num_action):
        self.num_action = num_action
        self.model = Model(num_action)
        self.optimizer = tf.train.AdamOptimizer()

    def predict(self, state):
        """Return the network output for `state` as a NumPy array.

        A leading axis is added before the forward pass and stripped again
        from the result.
        """
        batched = tf.convert_to_tensor(state[None, :], dtype=tf.float32)
        outputs = self.model(batched)
        return outputs.numpy()[0]

    def update(self, state, action, target):
        """Return the mean squared error between `target` and Q(s, a).

        Args:
            state: observation(s) passed on to `predict`.
            action: action index/indices, one-hot encoded to select Q(s, a).
            target: R + gamma * Q(s', a').
        """
        # predict() returns a NumPy array (it calls .numpy()), so everything
        # computed below is detached from the model weights as far as a
        # tf.GradientTape is concerned.
        q_values = self.predict(state)
        action_mask = tf.one_hot(action, self.num_action, 1.0, 0.0)
        chosen_q = tf.reduce_sum(action_mask * q_values, reduction_indices=-1)

        # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
        squared_error = tf.squared_difference(target, chosen_q)
        return tf.reduce_mean(squared_error)


if __name__ == '__main__':
    # Set-up: environment, replay memory, hyper-parameters and the agent.
    env = MyWrapper(gym.make("CartPole-v0"))
    agent = DQN(env.action_space.n)
    replay_buffer = ReplayBuffer(5000)
    params = Parameters(mode="CartPole")
    reward_buffer = deque(maxlen=5)  # returns of the five most recent episodes

    for episode in range(2000):
        state = env.reset()
        total_reward = 0

        for step in range(210):
            # env.render()
            action = np.argmax(agent.predict(state))  # behave greedily
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)

            total_reward += reward
            state = next_state

            if not done:
                continue

            print("Episode {0} finished after {1} timesteps".format(episode, step + 1))

            # One update at the end of each episode, skipping the first eleven.
            if episode > 10:
                print("Update")
                with tf.GradientTape() as tape:
                    batch = replay_buffer.sample(params.batch_size)
                    states, actions, rewards, next_states, dones = batch
                    next_Q = agent.predict(next_states)
                    # Terminal transitions contribute no bootstrap term.
                    not_done = np.logical_not(dones)
                    Y = rewards + params.gamma * np.max(next_Q, axis=1) * not_done
                    loss = agent.update(states, actions, Y)
                    print(loss)

                weights = agent.model.trainable_weights
                grads = tape.gradient(loss, weights)

                # ==== THIS RETURNS ONLY NONE ====
                print(grads)
                agent.optimizer.apply_gradients(zip(grads, weights))
            break

        # store the episode reward
        reward_buffer.append(total_reward)

        # stop once the mean of the stored episode rewards exceeds 195
        if np.mean(reward_buffer) > 195:
            print("GAME OVER!!")
            break

    env.close()
import gym
import numpy as np
import tensorflow as tf
from collections import deque

# ==== import below from my repo ====
from common.wrappers import MyWrapper   # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters    # params for training
from common.memory import ReplayBuffer  # Experience Replay Buffer

tf.enable_eager_execution()

class Model(tf.keras.Model):
    """Fully connected network: three 16-unit ReLU layers and a softmax head.

    Args:
        num_action: number of units in the output layer.
    """

    def __init__(self, num_action):
        super().__init__()
        hidden_units = 16
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense3 = tf.keras.layers.Dense(hidden_units, activation='relu')
        # NOTE(review): softmax makes the outputs sum to 1, yet the rest of the
        # script treats them as Q(s, a) -- confirm this activation is intended.
        self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

    def call(self, inputs):
        """Feed `inputs` through the hidden layers, then the output layer."""
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return self.pred(hidden)


class DQN:
    """On-policy DQN agent: one `Model` network plus an Adam optimizer."""

    def __init__(self, num_action):
        self.num_action = num_action
        self.model = Model(num_action)
        self.optimizer = tf.train.AdamOptimizer()

    def predict(self, state):
        """Return the network output for `state` as a NumPy array.

        A leading axis is added before the forward pass and stripped again
        from the result.
        """
        batched = tf.convert_to_tensor(state[None, :], dtype=tf.float32)
        outputs = self.model(batched)
        return outputs.numpy()[0]

    def update(self, state, action, target):
        """Return the mean squared error between `target` and Q(s, a).

        Args:
            state: observation(s) passed on to `predict`.
            action: action index/indices, one-hot encoded to select Q(s, a).
            target: R + gamma * Q(s', a').
        """
        # predict() returns a NumPy array (it calls .numpy()), so everything
        # computed below is detached from the model weights as far as a
        # tf.GradientTape is concerned.
        q_values = self.predict(state)
        action_mask = tf.one_hot(action, self.num_action, 1.0, 0.0)
        chosen_q = tf.reduce_sum(action_mask * q_values, reduction_indices=-1)

        # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
        squared_error = tf.squared_difference(target, chosen_q)
        return tf.reduce_mean(squared_error)


if __name__ == '__main__':
    # Set-up: environment, replay memory, hyper-parameters and the agent.
    env = MyWrapper(gym.make("CartPole-v0"))
    agent = DQN(env.action_space.n)
    replay_buffer = ReplayBuffer(5000)
    params = Parameters(mode="CartPole")
    reward_buffer = deque(maxlen=5)  # returns of the five most recent episodes

    for episode in range(2000):
        state = env.reset()
        total_reward = 0

        for step in range(210):
            # env.render()
            action = np.argmax(agent.predict(state))  # behave greedily
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)

            total_reward += reward
            state = next_state

            if not done:
                continue

            print("Episode {0} finished after {1} timesteps".format(episode, step + 1))

            # One update at the end of each episode, skipping the first eleven.
            if episode > 10:
                print("Update")
                with tf.GradientTape() as tape:
                    batch = replay_buffer.sample(params.batch_size)
                    states, actions, rewards, next_states, dones = batch
                    next_Q = agent.predict(next_states)
                    # Terminal transitions contribute no bootstrap term.
                    not_done = np.logical_not(dones)
                    Y = rewards + params.gamma * np.max(next_Q, axis=1) * not_done
                    loss = agent.update(states, actions, Y)
                    print(loss)

                weights = agent.model.trainable_weights
                grads = tape.gradient(loss, weights)

                # ==== THIS RETURNS ONLY NONE ====
                print(grads)
                agent.optimizer.apply_gradients(zip(grads, weights))
            break

        # store the episode reward
        reward_buffer.append(total_reward)

        # stop once the mean of the stored episode rewards exceeds 195
        if np.mean(reward_buffer) > 195:
            print("GAME OVER!!")
            break

    env.close()
import gym
import numpy as np
import tensorflow as tf
from collections import deque

# ==== import below from my repo ====
from common.wrappers import MyWrapper   # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters    # params for training
from common.memory import ReplayBuffer  # Experience Replay Buffer

tf.enable_eager_execution()

class Model(tf.keras.Model):
    """Fully connected network: three 16-unit ReLU layers and a softmax head.

    Args:
        num_action: number of units in the output layer.
    """

    def __init__(self, num_action):
        super().__init__()
        hidden_units = 16
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense3 = tf.keras.layers.Dense(hidden_units, activation='relu')
        # NOTE(review): softmax makes the outputs sum to 1, yet the rest of the
        # script treats them as Q(s, a) -- confirm this activation is intended.
        self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

    def call(self, inputs):
        """Feed `inputs` through the hidden layers, then the output layer."""
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return self.pred(hidden)


class DQN:
    """On-policy DQN agent: one `Model` network plus an Adam optimizer."""

    def __init__(self, num_action):
        self.num_action = num_action
        self.model = Model(num_action)
        self.optimizer = tf.train.AdamOptimizer()

    def predict(self, state):
        """Return the network output for `state` as a NumPy array.

        A leading axis is added before the forward pass and stripped again
        from the result.
        """
        batched = tf.convert_to_tensor(state[None, :], dtype=tf.float32)
        outputs = self.model(batched)
        return outputs.numpy()[0]

    def update(self, state, action, target):
        """Return the mean squared error between `target` and Q(s, a).

        Args:
            state: observation(s) passed on to `predict`.
            action: action index/indices, one-hot encoded to select Q(s, a).
            target: R + gamma * Q(s', a').
        """
        # predict() returns a NumPy array (it calls .numpy()), so everything
        # computed below is detached from the model weights as far as a
        # tf.GradientTape is concerned.
        q_values = self.predict(state)
        action_mask = tf.one_hot(action, self.num_action, 1.0, 0.0)
        chosen_q = tf.reduce_sum(action_mask * q_values, reduction_indices=-1)

        # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
        squared_error = tf.squared_difference(target, chosen_q)
        return tf.reduce_mean(squared_error)


if __name__ == '__main__':
    # Set-up: environment, replay memory, hyper-parameters and the agent.
    env = MyWrapper(gym.make("CartPole-v0"))
    agent = DQN(env.action_space.n)
    replay_buffer = ReplayBuffer(5000)
    params = Parameters(mode="CartPole")
    reward_buffer = deque(maxlen=5)  # returns of the five most recent episodes

    for episode in range(2000):
        state = env.reset()
        total_reward = 0

        for step in range(210):
            # env.render()
            action = np.argmax(agent.predict(state))  # behave greedily
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)

            total_reward += reward
            state = next_state

            if not done:
                continue

            print("Episode {0} finished after {1} timesteps".format(episode, step + 1))

            # One update at the end of each episode, skipping the first eleven.
            if episode > 10:
                print("Update")
                with tf.GradientTape() as tape:
                    batch = replay_buffer.sample(params.batch_size)
                    states, actions, rewards, next_states, dones = batch
                    next_Q = agent.predict(next_states)
                    # Terminal transitions contribute no bootstrap term.
                    not_done = np.logical_not(dones)
                    Y = rewards + params.gamma * np.max(next_Q, axis=1) * not_done
                    loss = agent.update(states, actions, Y)
                    print(loss)

                weights = agent.model.trainable_weights
                grads = tape.gradient(loss, weights)

                # ==== THIS RETURNS ONLY NONE ====
                print(grads)
                agent.optimizer.apply_gradients(zip(grads, weights))
            break

        # store the episode reward
        reward_buffer.append(total_reward)

        # stop once the mean of the stored episode rewards exceeds 195
        if np.mean(reward_buffer) > 195:
            print("GAME OVER!!")
            break

    env.close()
import gym
import numpy as np
import tensorflow as tf
from collections import deque

# ==== import below from my repo ====
from common.wrappers import MyWrapper   # just a wrapper to set a reward at the terminal state -1
from common.params import Parameters    # params for training
from common.memory import ReplayBuffer  # Experience Replay Buffer

tf.enable_eager_execution()

class Model(tf.keras.Model):
    """Fully connected network: three 16-unit ReLU layers and a softmax head.

    Args:
        num_action: number of units in the output layer.
    """

    def __init__(self, num_action):
        super().__init__()
        hidden_units = 16
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense2 = tf.keras.layers.Dense(hidden_units, activation='relu')
        self.dense3 = tf.keras.layers.Dense(hidden_units, activation='relu')
        # NOTE(review): softmax makes the outputs sum to 1, yet the rest of the
        # script treats them as Q(s, a) -- confirm this activation is intended.
        self.pred = tf.keras.layers.Dense(num_action, activation='softmax')

    def call(self, inputs):
        """Feed `inputs` through the hidden layers, then the output layer."""
        hidden = inputs
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return self.pred(hidden)


class DQN:
    """On-policy DQN agent: one `Model` network plus an Adam optimizer."""

    def __init__(self, num_action):
        self.num_action = num_action
        self.model = Model(num_action)
        self.optimizer = tf.train.AdamOptimizer()

    def predict(self, state):
        """Return the network output for `state` as a NumPy array.

        A leading axis is added before the forward pass and stripped again
        from the result.
        """
        batched = tf.convert_to_tensor(state[None, :], dtype=tf.float32)
        outputs = self.model(batched)
        return outputs.numpy()[0]

    def update(self, state, action, target):
        """Return the mean squared error between `target` and Q(s, a).

        Args:
            state: observation(s) passed on to `predict`.
            action: action index/indices, one-hot encoded to select Q(s, a).
            target: R + gamma * Q(s', a').
        """
        # predict() returns a NumPy array (it calls .numpy()), so everything
        # computed below is detached from the model weights as far as a
        # tf.GradientTape is concerned.
        q_values = self.predict(state)
        action_mask = tf.one_hot(action, self.num_action, 1.0, 0.0)
        chosen_q = tf.reduce_sum(action_mask * q_values, reduction_indices=-1)

        # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
        squared_error = tf.squared_difference(target, chosen_q)
        return tf.reduce_mean(squared_error)


if __name__ == '__main__':
    # Set-up: environment, replay memory, hyper-parameters and the agent.
    env = MyWrapper(gym.make("CartPole-v0"))
    agent = DQN(env.action_space.n)
    replay_buffer = ReplayBuffer(5000)
    params = Parameters(mode="CartPole")
    reward_buffer = deque(maxlen=5)  # returns of the five most recent episodes

    for episode in range(2000):
        state = env.reset()
        total_reward = 0

        for step in range(210):
            # env.render()
            action = np.argmax(agent.predict(state))  # behave greedily
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add(state, action, reward, next_state, done)

            total_reward += reward
            state = next_state

            if not done:
                continue

            print("Episode {0} finished after {1} timesteps".format(episode, step + 1))

            # One update at the end of each episode, skipping the first eleven.
            if episode > 10:
                print("Update")
                with tf.GradientTape() as tape:
                    batch = replay_buffer.sample(params.batch_size)
                    states, actions, rewards, next_states, dones = batch
                    next_Q = agent.predict(next_states)
                    # Terminal transitions contribute no bootstrap term.
                    not_done = np.logical_not(dones)
                    Y = rewards + params.gamma * np.max(next_Q, axis=1) * not_done
                    loss = agent.update(states, actions, Y)
                    print(loss)

                weights = agent.model.trainable_weights
                grads = tape.gradient(loss, weights)

                # ==== THIS RETURNS ONLY NONE ====
                print(grads)
                agent.optimizer.apply_gradients(zip(grads, weights))
            break

        # store the episode reward
        reward_buffer.append(total_reward)

        # stop once the mean of the stored episode rewards exceeds 195
        if np.mean(reward_buffer) > 195:
            print("GAME OVER!!")
            break

    env.close()

最佳答案

尝试将您的 update 函数更改为:

def update(self, state, action, target):
    """Return the minibatch MSE between `target` and Q(s, a) as a tensor.

    The forward pass calls ``self.model`` directly rather than ``predict()``:
    ``predict()`` converts its result with ``.numpy()``, and a NumPy value is
    no longer connected to the model weights, so ``tf.GradientTape`` returns
    ``None`` for every weight.

    Args:
        state: NumPy array of observation(s); indexed as ``state[None, :]``.
        action: action index/indices, one-hot encoded to select Q(s, a).
        target: R + gamma * Q(s', a').

    Returns:
        The mean squared difference between `target` and the selected
        Q-values.
    """
    # calculate Q(s,a)
    inputs = tf.convert_to_tensor(state[None, :], dtype=tf.float32)
    # [0] strips the axis added by state[None, :], exactly as predict() does,
    # so q_values lines up with the one-hot mask without extra broadcasting.
    q_values = self.model(inputs)[0]
    actions_one_hot = tf.one_hot(action, self.num_action, 1.0, 0.0)
    # `axis` replaces the deprecated `reduction_indices` alias.
    q_taken = tf.reduce_sum(actions_one_hot * q_values, axis=-1)

    # Minibatch MSE => (1/batch_size) * (R + gamma * Q(s',a') - Q(s,a))^2
    loss = tf.reduce_mean(tf.squared_difference(target, q_taken))
    return loss

我认为由于 predict 函数中调用了 .numpy()，GradientTape(即代码中的 tape)失去了对权重的引用。(我没有测试我的答案)

关于python-3.x - Eager Execution，tf.GradientTape 只返回 None，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/55602633/

文章推荐： php - Laravel Collective 中多重选择的选定值

文章推荐： python - 如何限制opencv在Python中使用的线程数？

python - 列表理解返回值加上 [None, None, None]，为什么？
这个问题在这里已经有了答案: Why does the print function return None? (1 个回答) 关闭 6 年前。我正在学习理解。我得到了 print(x) 部分(我
python - 如何理解Python中 `None or False`、 `False or None`、 `None and False`、 `False and None`的结果？
我以为我理解了 Python 中的这两个单例值，直到我看到有人在代码中使用 return l1 or l2，其中 l1 和 l2 都是链表对象，并且(s)他想如果不为 None 则返回 l1，否则返回
python - IPython Notebook 中的列表理解返回 [None, None, None...]
我希望在 IPython Notebook 中使用列表理解生成枚举字符串列表。它有效，但给了我一个我不理解的奇怪输出。 cols = [] [cols.append('Value'+str(hour)
python - 为什么 `None is None is None` 返回 True？
这个问题在这里已经有了答案: Why does the expression 0 >> import dis >>> def a(): ... return None is None is N
python - 为什么 list(print(x.upper(), end =' ' ) for x in 'spam' ) 得到一个 [None, None, None, None] 列表？
《Learning Python 5th》第608页有示例代码: >>> list(print(x.upper(), end=' ') for x in 'spam') S P A M [None,
python - 为什么这个函数也返回 "None None"？
我对此进行了搜索并遇到了列表返回函数，但我仍然不明白。我试图理解为什么 Print 函数到另一个函数返回以下内容: 生日快乐生日快乐无无我的代码: def happy(): prin
python - "None not in"与 "not None in"
除非我疯了 if None not in x 和 if not None in x 是等价的。有首选版本吗？我想 None not in 更像英语，因此更像 pythonic，但 not None i
python - 获取类型错误 : '(slice(None, None, None), 0)' is an invalid key
尝试绘制 k-NN 分类器的决策边界但无法这样做得到 TypeError: '(slice(None, None, None), 0)' is an invalid key h = .01 # st
python - 如何在 Keras 中将 (None, 10) 维张量 reshape 为 (None, None, 10)？
我正在尝试将可变大小的序列输入 LSTM。因此我使用生成器且批量大小为 1。我有一个嵌入的 (sequence_length,)-input-tensor，并输出 (batch_size,equen
python - 区分参数值 `None` 和默认参数值 `None`
这个问题在这里已经有了答案: 关闭 10 年前。 Possible Duplicate: Is there any way to know if the value of an argument i
Python:字符串与 None 连接以返回 None？
我正在尝试根据环境变量的返回值进行条件赋值。 self._TBLFilePath = iTBLFilePath or os.environ.get("CDO_TBLPATH") + os.enviro
python - 强制加载 `None` 并在转储时跳过 `None`
我正在使用 marshmallow 2.0.0rc2 验证 HTTP 请求的输入数据，并在 HTTP 响应上将 SQLAlchemy 模型加载到 JSON。我偶然发现了两个问题: 首先，在通过 HTT
python - lxml 'None' 类型不是 None
我想将我设置为 None 的变量与 is 进行比较，但它失败了。当我使用 == 将此变量与 None 进行比较时，它起作用了。这就是我所说的变量: print type(xml.a) -> 因为
python - "is None"和 "== None"有什么区别
我最近遇到了这种语法，我不知道有什么区别。如果有人能告诉我其中的区别，我将不胜感激。最佳答案答案解释here . 引用: A class is free to implement compari
python - 获取类型错误 : '(slice(None, None, None), array([0, 1, 2, 3, 4]))' is an invalid key
尝试使用 BorutaPy 进行特征选择。但出现 TypeError: '(slice(None, None, None), array([0, 1, 2, 3, 4]))' 是无效键。 from s
tensorflow - 对于占位符的形状，[]、[None]、None 和 () 有什么区别？
我见过使用 [] 的代码片段, [None] , None或 ()作为 placeholder 的形状，那是 x = tf.placeholder(..., shape=[], ...) y = t
ansible - 为什么 `default( None )` 并不总是显示为 `None`
是否有逻辑推理可以解释为什么下面的 Ansible playbook 中的两个 debug 任务分别输出 "NONE" 和 "FALSE"并且不是两者都“NONE”？ - hosts: 'all'
python - 我应该使用 `==` 与 `(None, None)` 元组进行比较吗？
我有一个函数，它可以返回两个整数的元组或(None, None)的元组: (出于本问题的目的，我们假设此返回格式是执行此操作的唯一方法，并且无法更改) from typing import Tuple
python - 从嵌套字典中递归删除 None 值或 None 键
问题: 如何遍历字典并从中删除 None 键或值？这是我尝试过的: 代码: import copy def _ignore(data): copied_data = copy.deepcop
python - 简明地说 "none of the elements of an array are None"？
什么是简洁的 python 表达方式 if : # do a bunch of stuff once 最佳答案为什么不简单， None not in lst 关于python - 简明地说 "

行者123

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

python-3.x - Eager Execution，tf.GradientTape 只返回 None