UPDATE: I solved the problem. See the solution at the bottom of this post.
I have created a custom py_environment using Python 3.10. When I tried to validate it with utils.validate_py_environment, I got a ValueError.
Other people had similar problems here and also here. If I use utils.validate_py_environment(env, episodes=1), I don't get an error; the error only appears when I validate more than one episode. Here are the specs:
action_spec: BoundedArraySpec(shape=(3,), dtype=dtype('int32'), name='action', minimum=[0 0 0], maximum=[1 1 1])
time_step_spec.observation: BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[0. 0. 0. 0.], maximum=[1. 1. 1. 1.])
time_step_spec.step_type: ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')
time_step_spec.discount: BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0)
time_step_spec.reward: ArraySpec(shape=(), dtype=dtype('float32'), name='reward')
This is the error I get when validating more than one episode:
  in validate_py_environment
    while episode_count < episodes:
      ...
      if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
        raise ValueError(
            'Given `time_step`: %r does not match expected '
            '`time_step_spec`: %r' % (time_step, batched_time_step_spec))
ValueError: Given `time_step`: TimeStep(
{'discount': array(0., dtype=float32),
 'observation': array([[0.9375, 0.9694037, 0.7618361, 0.0593321]], dtype=float32),
 'reward': array(-0.9964797, dtype=float32),
 'step_type': array(2)})
does not match expected `time_step_spec`: TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[0. 0. 0. 0.], maximum=[1. 1. 1. 1.]),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})
This is a simplified environment that throws the same error:
# Import all relevant libs
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
class SimpleTestEnv(py_environment.PyEnvironment):

    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(3,), dtype=np.int32, minimum=[0, 0, 0], maximum=[1, 1, 1], name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(4,), dtype=np.float32, minimum=[0, 0, 0, 0], maximum=[1, 1, 1, 1], name='observation')
        self._state = np.array([1, 1, 1, 0], dtype=np.float32)  # initial state
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def scale_value(self, normed_value, min_value, max_value):
        scaled_value = min_value + (normed_value * (max_value - min_value))
        return scaled_value

    def normalize_value(self, value, min_value, max_value):
        normalized_value = (value - min_value) / (max_value - min_value)
        return normalized_value

    # resets everything to the initial state
    def _reset(self):
        self._state = [1, 1, 1, 0]  # initial state, everything starts at 1
        self._episode_ended = False
        return ts.restart(np.array(self._state, dtype=np.float32))

    # gets new actions
    def _step(self, action):
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        # scale the current state to its real values
        current_state1 = self.scale_value(self._state[0], 0, 0.3)
        current_state2 = self.scale_value(self._state[1], 0, 0.3)
        current_state3 = self.scale_value(self._state[2], 20, 100)
        current_state4 = self._state[3]

        # calc the new state based on the actions
        new_state1 = 0.2 * current_state1 * action[0]
        new_state3 = current_state3 - action[1] * 20  # just some simple calcs
        new_state4 = current_state4 + 0.1
        if action[2] == 1:  # check if the material was rotated or not
            new_state2 = current_state1
        else:
            new_state2 = current_state2

        # norm the new state
        new_normed_state1 = self.normalize_value(new_state1, 0, 0.3)
        new_normed_state2 = self.normalize_value(new_state2, 0.1, 0.3)
        new_normed_state3 = self.normalize_value(new_state3, 20, 100)

        # calc the reward
        reward = new_state3**2 - 1  # just an example

        # return the new state and reward & if not termination then discount, too
        self._state = np.array([new_normed_state1, new_normed_state2, new_normed_state3, new_state4], dtype=np.float32)

        # Make sure episodes don't go on forever.
        if new_normed_state1 <= 0.15:
            self._episode_ended = True

        if self._episode_ended:
            return ts.termination(np.array([self._state], dtype=np.float32), reward)
        else:
            return ts.transition(
                np.array([self._state], dtype=np.float32), reward, discount=1.0)
environment = SimpleTestEnv()
print('action_spec:', environment.action_spec())
print('time_step_spec.observation:', environment.time_step_spec().observation)
print('time_step_spec.step_type:', environment.time_step_spec().step_type)
print('time_step_spec.discount:', environment.time_step_spec().discount)
print('time_step_spec.reward:', environment.time_step_spec().reward)
utils.validate_py_environment(environment, episodes=3)
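One way to make the mismatch visible is to print the observation shapes the environment actually returns (a small check of my own, using the SimpleTestEnv defined above): the reset step matches the declared (4,) spec, but the transition/termination steps carry an extra batch dimension.

env = SimpleTestEnv()
print(env.reset().observation.shape)  # (4,)  -> matches the (4,) observation spec
print(env.step(np.array([1, 1, 1], dtype=np.int32)).observation.shape)  # (1, 4) -> the mismatch that triggers the ValueError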
SOLUTION:
I did two things wrong.
First, the observation spec definition, to be more concrete its shape, which has to be (1, 4) instead of (4,):
self._observation_spec = array_spec.BoundedArraySpec(
    shape=(1, 4), dtype=np.float32, name='observation')
Second, the tf_agents.trajectories.time_step functions need a NumPy array, but self._state itself must be a list; it only gets wrapped with np.array([self._state], dtype=np.float32) when the TimeStep is built.
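In short, the observation spec shape and the array built around self._state in every returned TimeStep (restart, transition, termination) have to agree. A minimal sketch of that relationship, assuming the (1, 4) spec above:

# self._state stays a plain Python list; it is only wrapped when a TimeStep is built
state = [0, 0, 0, 0]
observation = np.array([state], dtype=np.float32)
print(observation.shape)  # (1, 4) -> same shape as the observation spec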
Here is a working simplified env:
class CardGameEnv(py_environment.PyEnvironment):

    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(3,), dtype=np.int32, minimum=[0, 0, 0], maximum=[1, 1, 1], name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1, 4), dtype=np.float32, minimum=0, name='observation')
        self._state = [0, 0, 0, 0]
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._state = [0, 0, 0, 0]
        self._episode_ended = False
        return ts.restart(np.array([self._state], dtype=np.float32))

    def _step(self, action):
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        # check action
        print('Action: ', action)
        print('Action data type: ', type(action))

        # calc state
        print('State before: ', self._state)
        print('State data type: ', type(self._state))
        self._state[0] = self._state[0] + action[0]
        self._state[1] = self._state[1] + action[1]
        self._state[2] = self._state[2] + action[2]
        self._state[3] = self._state[3] + action[2]
        print('State after: ', self._state)

        if self._state[3] >= 5.0:
            self._episode_ended = True
            reward = self._state[0] - 21
            return ts.termination(np.array([self._state], dtype=np.float32), reward)
        else:
            return ts.transition(
                np.array([self._state], dtype=np.float32), reward=0.0, discount=1.0)
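With these two changes every TimeStep the environment returns has the shape and dtype declared in the specs, so the validation call from the question should now run without a ValueError (a sketch, mirroring the call above):

environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=3)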