gpt4 book ai didi

python-3.x - Stable Baselines3 的字典(Dict)观察空间不起作用

转载 作者:行者123 更新时间:2023-12-02 01:28:14 25 4
gpt4 key购买 nike

我在下面创建了一个最小的可重现示例,它可以轻松地在新的 Google Colab 笔记本中运行。第一次安装完成后,只需 Runtime > Restart and Run All 即可生效。

我在下面制作了一个简单的轮盘游戏环境用于测试。对于观察空间,我创建了一个 gym.spaces.Dict(),您将看到它(代码注释很好)。

它训练得很好,但是当它进入测试迭代时,我得到了错误:

ValueError                                Traceback (most recent call last)
<ipython-input-56-7c2cb900b44f> in <module>
6 obs = env.reset()
7 for i in range(1000):
----> 8 action, _state = model.predict(obs, deterministic=True)
9 obs, reward, done, info = env.step(action)
10 env.render()

ValueError: Error: Unexpected observation shape () for Box environment, please use (1,) or (n_env, 1) for the observation shape.

我在某处读到字典空间需要用 gym.wrappers.FlattenObservation 展平,所以我更改了这一行:

    action, _state = model.predict(obs, deterministic=True)

...到:

    action, _state = model.predict(FlattenObservation(obs), deterministic=True)

...导致此错误:

AttributeError                            Traceback (most recent call last)
<ipython-input-57-87824c61fc45> in <module>
6 obs = env.reset()
7 for i in range(1000):
----> 8 action, _state = model.predict(FlattenObservation(obs), deterministic=True)
9 obs, reward, done, info = env.step(action)
10 env.render()

AttributeError: 'collections.OrderedDict' object has no attribute 'observation_space'

我也试过这样做,结果和上一个一样的错误:

obs = env.reset()
obs = FlattenObservation(obs)

很明显我做错了什么,但我只是不知道它是什么,因为这将是我第一次使用 Dict 空间。

import os, sys
import importlib.util
import subprocess

# Ask the import system whether the package is available. Checking a fixed
# dist-packages directory only works for one specific Python version; on any
# other interpreter the directory never exists, so the guard would install
# and exit on every single run.
if importlib.util.find_spec('stable_baselines3') is None:
    # Install into the interpreter that is executing this code.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'stable_baselines3']
    )
    print("\n\n\n Stable Baselines3 has been installed, Restart and Run All now. DO NOT factory reset, or you'll have to start over\n")
    # Stop here: the message above asks for a restart before continuing.
    sys.exit(0)

from random import randint
from numpy import inf, float32, array, int32, int64
import gym
from gym.wrappers import FlattenObservation
from stable_baselines3 import A2C, DQN, PPO

"""Roulette environment class"""
class Roulette_Environment(gym.Env):
    """Roulette game as a Gym environment with a Dict observation space.

    The observation has one entry per pocket ("0" .. "36") counting how often
    that pocket has come up, plus "current_bankroll" and "max_table_limit".
    Every entry is declared as an integer Box of shape (1,).
    """

    metadata = {'render.modes': ['human', 'text']}

    def __init__(self):
        """Set the table constants and declare the action/observation spaces."""
        super().__init__()

        # Table constants
        self.max_table_limit = 1000
        self.initial_bankroll = 2000

        # One bet of 0-1000 units for each of the 37 pockets
        self.action_space = gym.spaces.Box(low=0, high=1000, shape=(37,))

        # Per-pocket hit counters first, then the two bookkeeping entries,
        # all collected into a single Dict space.
        tracked = {}
        for pocket in range(37):
            tracked[str(pocket)] = gym.spaces.Box(low=0, high=inf, shape=(1,), dtype=int)
        tracked["current_bankroll"] = gym.spaces.Box(low=-inf, high=inf, shape=(1,), dtype=int)
        tracked["max_table_limit"] = gym.spaces.Box(low=0, high=inf, shape=(1,), dtype=int)
        self.observation_space = gym.spaces.Dict(tracked)

    def reset(self):
        """Start a new episode and return the initial observation dict."""
        self.done = False
        self.current_bankroll = self.initial_bankroll

        # A sample of the space is used as the container; every one of its
        # values is overwritten below.
        state = self.observation_space.sample()

        # Zero all pocket counters.
        # NOTE: values are stored as plain scalars here, although each key is
        # declared as a Box with shape (1,).
        state.update((str(pocket), 0) for pocket in range(37))

        state['current_bankroll'] = self.current_bankroll
        state['max_table_limit'] = self.max_table_limit

        self.current_state = state
        return state

    def step(self, action):
        """Place the bets in `action`, spin once, and return (obs, reward, done, info)."""
        # Bets arrive as floats; truncate each one in place.
        # https://github.com/openai/gym/issues/3107
        for slot, bet in enumerate(action):
            action[slot] = int(bet)
        self.current_action = action

        # Total amount staked this round
        total_staked = sum(self.current_action)

        # Spin the wheel
        self.current_number = randint(0, 36)

        # The winning pocket pays 36x its bet; all stakes are deducted
        winning_bet = self.current_action[self.current_number]
        self.reward = 36 * winning_bet - total_staked
        self.current_bankroll += self.reward

        # Reflect the outcome in the observation
        self.current_state['current_bankroll'] = self.current_bankroll
        self.current_state[str(self.current_number)] += 1

        # Episode ends once the bankroll has doubled or is used up
        doubled = self.current_bankroll >= self.initial_bankroll * 2
        busted = self.current_bankroll <= 0
        if doubled or busted:
            self.done = True

        return self.current_state, self.reward, self.done, {}

    def render(self, mode='text'):
        """Print the last round's bets, drawn number, reward and bankroll."""
        # Only the text mode produces output
        if mode != "text":
            return
        print(f'Bets Placed: {self.current_action}')
        print(f'Number rolled: {self.current_number}')
        print(f'Reward: {self.reward}')
        print(f'New Bankroll: {self.current_bankroll}')

# Build the environment and train a PPO agent that accepts Dict observations.
env = Roulette_Environment()
model = PPO('MultiInputPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

# Test rollout: let the trained policy play 1000 rounds.
obs = env.reset()
# obs = FlattenObservation(obs)

for _ in range(1000):
    # action, _state = model.predict(FlattenObservation(obs), deterministic=True)
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    # Start a fresh episode as soon as the current one has ended
    if done:
        obs = env.reset()

最佳答案

不幸的是,stable-baselines3对观察形式非常挑剔。
我前几天遇到了同样的问题。
一些文档以及 example model帮我解决了问题:

可以使用 Dict-observations

然而,Box 空间的值必须是具有正确 dtype 的 numpy.ndarray。
而对于 Discrete 观察空间,观察值也可以直接以 int 值传递。不过,我不完全确定这是否同样适用于多维的 MultiDiscrete 空间。

一个非常简单的解决方案

针对您的示例,解决方法是:每次给 Dict 中的某个键重新赋值时,都把赋值代码替换为如下形式:
self.current_state[key] = np.array([value], dtype=int)

下面是针对您的问题的可运行实现(我的系统安装的是 Python 3.10,但它应该也适用于更低的版本)。

工作代码:

import os, sys

from random import randint
from numpy import inf, float32, array, int32, int64
import gym
from gym.wrappers import FlattenObservation
from stable_baselines3 import A2C, DQN, PPO
import numpy as np

"""Roulette environment class"""
class Roulette_Environment(gym.Env):
    """Roulette table exposed as a Gym environment with a Dict observation.

    Each step the agent places one bet on every pocket 0-36 and a pocket is
    drawn at random.  The reward is ``36 * bet_on_drawn_pocket`` minus the
    sum of all bets.  The episode is flagged done once the bankroll has at
    least doubled or has dropped to zero or below.

    Every observation value is stored as a one-element integer
    ``numpy.ndarray`` so that it matches the ``shape=(1,)`` Box declared for
    its key.  ``max_table_limit`` is exposed in the observation but is not
    enforced by ``step()``.
    """

    metadata = {'render.modes': ['human', 'text']}

    def __init__(self):
        """Set the table constants and declare the action/observation spaces."""
        super().__init__()

        # Table constants
        self.max_table_limit = 1000
        self.initial_bankroll = 2000

        # Each number on the roulette board can have 0-1000 units placed on it
        self.action_space = gym.spaces.Box(low=0, high=1000, shape=(37,))

        # Track how many times each number has shown up, plus the current
        # bankroll and the table limit.  The 37 identical counter spaces are
        # generated in a loop; keys and insertion order are "0".."36"
        # followed by the two bookkeeping entries.
        spaces = {
            str(number): gym.spaces.Box(low=0, high=inf, shape=(1,), dtype=int)
            for number in range(37)
        }
        spaces["current_bankroll"] = gym.spaces.Box(
            low=-inf, high=inf, shape=(1,), dtype=int
        )
        spaces["max_table_limit"] = gym.spaces.Box(
            low=0, high=inf, shape=(1,), dtype=int
        )
        self.observation_space = gym.spaces.Dict(spaces)

    def reset(self):
        """Start a new episode.

        Returns:
            The initial observation dict: all counters zero, bankroll at
            ``initial_bankroll``; every value is a one-element int array.
        """
        self.current_bankroll = self.initial_bankroll
        self.done = False

        # Nothing has been bet or spun yet.  Defining these here lets
        # render() be called before the first step() without raising
        # AttributeError.
        self.current_action = None
        self.current_number = None
        self.reward = None

        # A sample of the space is used as the container; every one of its
        # values is overwritten below.
        self.current_state = self.observation_space.sample()

        # Reset each number being tracked throughout gameplay to 0
        for number in range(37):
            self.current_state[str(number)] = np.array([0], dtype=int)

        self.current_state['current_bankroll'] = np.array(
            [self.current_bankroll], dtype=int
        )
        self.current_state['max_table_limit'] = np.array(
            [self.max_table_limit], dtype=int
        )

        return self.current_state

    def step(self, action):
        """Place bets, spin the wheel once and settle the round.

        Args:
            action: 37 bet sizes, one per pocket.  Fractional amounts are
                truncated toward zero.  The caller's array is not modified.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``info`` is an
            empty dict.
        """
        # Actions show up as floats even though bets are whole units
        # (https://github.com/openai/gym/issues/3107).  np.trunc returns a new
        # array, so the array owned by the caller is left untouched.
        self.current_action = np.trunc(np.asarray(action))

        # Total amount staked this round
        sum_of_bets = self.current_action.sum()

        # Spin the wheel
        self.current_number = randint(0, 36)

        # The winning pocket pays 36x its bet; all stakes are deducted
        self.reward = 36 * self.current_action[self.current_number] - sum_of_bets
        self.current_bankroll += self.reward

        # Update the current state, keeping every value a (1,) int array
        self.current_state['current_bankroll'] = np.array(
            [self.current_bankroll], dtype=int
        )
        self.current_state[str(self.current_number)] += 1

        # If we've doubled our money, or lost our money
        if self.current_bankroll >= self.initial_bankroll * 2 or self.current_bankroll <= 0:
            self.done = True

        return self.current_state, self.reward, self.done, {}

    def render(self, mode='text'):
        """Print the last round's bets, drawn number, reward and bankroll.

        Only ``mode == "text"`` produces output; other modes do nothing.
        """
        if mode == "text":
            print(f'Bets Placed: {self.current_action}')
            print(f'Number rolled: {self.current_number}')
            print(f'Reward: {self.reward}')
            print(f'New Bankroll: {self.current_bankroll}')

# Build the environment and train a PPO agent that accepts Dict observations.
env = Roulette_Environment()
model = PPO('MultiInputPolicy', env, verbose=1)
model.learn(total_timesteps=10)

# Test rollout: let the trained policy play 1000 rounds.
obs = env.reset()
# obs = FlattenObservation(obs)

for _ in range(1000):
    # action, _state = model.predict(FlattenObservation(obs), deterministic=True)
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    # Start a fresh episode as soon as the current one has ended
    if done:
        obs = env.reset()

关于python-3.x - Stable Baselines3 的字典(Dict)观察空间不起作用,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/73922332/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com