custom py_environment 'time_step' doesn't match 'time_step_spec' after the first episode

65 views Asked by At

UPDATE: I solved the problem. See the solution at the bottom of this post.

I have created a custom py_environment using Python 3.10. When I tried to validate it using utils.validate_py_environment, I got a ValueError.

Other people had similar problems here and also here. If I use utils.validate_py_environment(env, episodes=1), I don't get an error. The error appears only for episodes > 2. Here are the specs:

action_spec: BoundedArraySpec(shape=(3,), dtype=dtype('int32'), name='action', minimum=[0 0 0], maximum=[1 1 1])
time_step_spec.observation: BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[0. 0. 0. 0.], maximum=[1. 1. 1. 1.])
time_step_spec.step_type: ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')
time_step_spec.discount: BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0)
time_step_spec.reward: ArraySpec(shape=(), dtype=dtype('float32'), name='reward')

This is the error I get for episode > 2:

in validate_py_environment
    while episode_count < episodes:
      if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
        raise ValueError(
            'Given `time_step`: %r does not match expected '
            '`time_step_spec`: %r' % (time_step, batched_time_step_spec))


ValueError: Given `time_step`: TimeStep(
{'discount': array(0., dtype=float32),
 'observation': array([[0.9375, 0.9694037, 0.7618361, 0.0593321]],
      dtype=float32),
 'reward': array(-0.9964797, dtype=float32),
 'step_type': array(2)}) does not match expected `time_step_spec`: TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(4,), dtype=dtype('float32'), name='observation', minimum=[0. 0. 0. 0.], maximum=[1. 1. 1. 1.]),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})

Here is a simplified environment that throws the same error:

# Import all relevant libs
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

class SimpleTestEnv(py_environment. PyEnvironment):
   """Simplified environment that reproduces the time_step/time_step_spec error.

   The observation spec declares shape (4,). `_reset` builds its observation
   directly from the 4-element state, whereas `_step` wraps the state in an
   extra list (`np.array([self._state], ...)`), which adds a leading axis to
   the observation it returns.
   """
   def __init__(self):
      """Create the action/observation specs and the initial state."""
      # Three binary actions (each bounded to [0, 1]).
      self._action_spec=array_spec. BoundedArraySpec(
        shape=(3,), dtype=np.int32, minimum=[0,0,0], maximum=[1,1,1], name='action')
      # Four float observations, each bounded to [0, 1]; note the flat shape (4,).
      self._observation_spec = array_spec. BoundedArraySpec(
       shape=(4,), dtype=np.float32, minimum=[0,0,0,0], maximum=[1,1,1,1],  name='observation')
      self._state = np.array([1, 1, 1, 0], dtype=np.float32) # initial state (stored as a NumPy array here)
      self._episode_ended = False
   def action_spec(self):
      """Return the action spec created in `__init__`."""
      return self._action_spec
   def observation_spec(self):
      """Return the observation spec created in `__init__`."""
      return self._observation_spec

   def scale_value(self, normed_value, min_value, max_value):
      """Map `normed_value` linearly onto [min_value, max_value] (0 -> min, 1 -> max)."""
      scaled_value = min_value + (normed_value* (max_value - min_value)) 
      return scaled_value
   def normalize_value(self, value, min_value, max_value):
      """Inverse of `scale_value`: map `value` from [min_value, max_value] to [0, 1].

      No guard against `max_value == min_value` (division by zero).
      """
      normalized_value = (value - min_value) / (max_value - min_value) 
      return normalized_value
   # Resets everything to the initial state and returns the first time step.
   def _reset(self):
      """Restore the initial state and return a `ts.restart` time step."""
      self._state = [1, 1, 1, 0] # initial state as a plain list (not an array as in __init__)
      self._episode_ended = False
      # Observation is built from the state directly, so it has 4 elements in one axis.
      return ts.restart(np.array(self._state, dtype=np.float32))
   # Applies the given action and advances the environment by one step.
   def _step(self, action):
      """Apply `action` (indexed as action[0..2]) and return the next time step."""
      if self._episode_ended:
          # The last action ended the episode. Ignore the current action and
          # start a new episode.
          return self.reset()
      # scale the current state to its real values
      current_state1 = self.scale_value(self._state[0], 0, 0.3)
      current_state2 = self.scale_value(self._state[1], 0, 0.3)
      current_state3 = self.scale_value(self._state[2], 20, 100)
      current_state4 = self._state[3]

      # calc the new state based on the actions
      new_state1 = 0.2 * current_state1 * action[0]
      new_state3 = current_state3 - action[1] * 20  # just some simple calcs 
      new_state4 = current_state4 + 0.1
      if action[2] == 1: #  check if the material was rotated or not 
        new_state2 = current_state1
      else:
        new_state2 = current_state2

      # norm the new state
      # NOTE(review): state 2 is scaled with range (0, 0.3) above but normalized
      # with (0.1, 0.3) here -- confirm whether the differing lower bound is intended.
      new_normed_state1 = self.normalize_value(new_state1, 0, 0.3) 
      new_normed_state2 = self.normalize_value(new_state2, 0.1, 0.3)
      new_normed_state3 = self.normalize_value(new_state3, 20, 100)

      # calc the reward
      reward = new_state3**2 - 1 # just an example
      # Store the new state, then return it with the reward (transitions also carry a discount).
      self._state = np.array([new_normed_state1, new_normed_state2, new_normed_state3, new_state4],      dtype=np.float32)
      # Make sure episodes don't go on forever.
      if new_normed_state1 <= 0.15:
         self._episode_ended = True
      # In both branches the state is wrapped in a list before conversion, so the
      # returned observation gains a leading axis, unlike the (4,) observation spec.
      if self._episode_ended:
         return ts.termination (np.array([self._state], dtype=np.float32), reward)
      else:
         return ts.transition(
           np.array([self._state], dtype=np.float32), reward, discount=1.0)

# Build the environment, print every spec it exposes, then validate it.
environment = SimpleTestEnv()
time_step_spec = environment.time_step_spec()
print('action_spec:', environment.action_spec())
# One print per statement: several calls fused on a single line do not parse.
print('time_step_spec.observation:', time_step_spec.observation)
print('time_step_spec.step_type:', time_step_spec.step_type)
print('time_step_spec.discount:', time_step_spec.discount)
print('time_step_spec.reward:', time_step_spec.reward)
# Drive the environment for three episodes, checking each time step against the spec.
utils.validate_py_environment(environment, episodes=3)

SOLUTION:

I have done two things wrong.

  1. The observation spec definition; more concretely, its shape:

    self._observation_spec = array_spec.BoundedArraySpec( shape=(1,4), dtype=np.float32, name='observation')

  2. A NumPy array is needed for tf_agents.trajectories.time_step, but self._state itself must be a list.

Here is a working simplified env:

class CardGameEnv(py_environment.PyEnvironment):
  """Simplified environment whose returned observations match its spec.

  The state is kept as a plain 4-element list and wrapped in an outer list
  when converted to a NumPy array, so every observation has shape (1, 4),
  the same shape declared in the observation spec.
  """

  def __init__(self):
    """Create the action/observation specs and the initial state."""
    # Initialise the base class before setting up this environment's own state.
    super().__init__()
    # Three binary actions (each bounded to [0, 1]).
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(3,), dtype=np.int32, minimum=[0,0,0], maximum=[1,1,1], name='action')
    # Shape (1, 4) matches `np.array([self._state])` as returned below.
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(1,4), dtype=np.float32, minimum=0, name='observation')

    self._state = [0, 0, 0, 0]
    self._episode_ended = False

  def action_spec(self):
    """Return the action spec created in `__init__`."""
    return self._action_spec

  def observation_spec(self):
    """Return the observation spec created in `__init__`."""
    return self._observation_spec

  def _reset(self):
    """Restore the all-zero state and return a `ts.restart` time step."""
    self._state = [0, 0, 0, 0]
    self._episode_ended = False
    return ts.restart(np.array([self._state], dtype=np.float32))

  def _step(self, action):
    """Add the action components to the state and return the next time step.

    Args:
      action: indexable with three entries; action[2] is added to both
        state[2] and state[3].

    Returns:
      A termination time step (reward `state[0] - 21`) once state[3]
      reaches 5, otherwise a transition with reward 0.0 and discount 1.0.
    """
    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # Debug output: the incoming action and its type.
    print('Action: ', action)
    print('Action data type: ', type(action))

    # Debug output: the state before the update and its type. This label used
    # to read 'Action data type' although it prints the type of the state.
    print('State before: ', self._state)
    print('State data type: ', type(self._state))
    self._state[0] += action[0]
    self._state[1] += action[1]
    self._state[2] += action[2]
    # The fourth entry accumulates action[2] as well; it drives termination.
    self._state[3] += action[2]
    print('State after: ', self._state)

    if self._state[3] >= 5.0:
      self._episode_ended = True
      reward = self._state[0] - 21
      return ts.termination(np.array([self._state], dtype=np.float32), reward)

    return ts.transition(
        np.array([self._state], dtype=np.float32), reward=0.0, discount=1.0)
0

There are 0 answers