1) It's unclear how to make action masking just more complex in rllib than we can find in examples.
This mask works good from example action_mask_model.py with class TorchActionMaskModel(TorchModelV2, nn.Module)
self.observation_space = Dict({
"action_mask": Box(0, 1, shape=(self.actions,)),
"actual_obs": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
})
Now I want to make more complex:
self.observation_space = Dict({
"action_mask": Box(0, 1, shape=(self.actions,)),
"actual_obs": Dict({
"obs1": Discrete(10),
"obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
}),
})
It makes error:
prev_layer_size = int(np.product(obs_space.shape))
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Does anyone know how can I fix it? Full reproducible code python=3.8 rllib=1.12.0:
import numpy as np
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.registry import register_env
import gym
from gym.spaces import Box, Dict, Discrete
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MIN
torch, nn = try_import_torch()
# copy pasted from rllib/examples/models/action_mask_model.py
class TorchActionMaskModel(TorchModelV2, nn.Module):
"""PyTorch version of above ActionMaskingModel."""
def __init__(
self,
obs_space,
action_space,
num_outputs,
model_config,
name,
**kwargs,
):
orig_space = getattr(obs_space, "original_space", obs_space)
assert (
isinstance(orig_space, Dict)
and "action_mask" in orig_space.spaces
and "actual_obs" in orig_space.spaces
)
TorchModelV2.__init__(
self, obs_space, action_space, num_outputs, model_config, name, **kwargs
)
nn.Module.__init__(self)
self.internal_model = TorchFC(
orig_space["actual_obs"],
action_space,
num_outputs,
model_config,
name + "_internal",
)
def forward(self, input_dict, state, seq_lens):
# Extract the available actions tensor from the observation.
action_mask = input_dict["obs"]["action_mask"]
# Compute the unmasked logits.
logits, _ = self.internal_model({"obs": input_dict["obs"]["actual_obs"]})
# Convert action_mask into a [0.0 || -inf]-type mask.
inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
# Return masked logits.
return logits + inf_mask, state
def value_function(self):
return self.internal_model.value_function()
class MyEnv(gym.Env):
metadata = {"render.modes": ["human"]}
def __init__(self):
super(MyEnv, self).__init__()
self.actions = 4
self.action_space = Discrete(self.actions)
self.observation_space = Dict({
"action_mask": Box(0, 1, shape=(self.actions,)),
#"actual_obs": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
"actual_obs": Dict({
"obs1": Discrete(10),
"obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
}),
})
def reset(self):
return self._make_obs()
def step(self, action):
return self._make_obs(), 0, False, {}
def _make_obs(self):
return {
"action_mask": np.array([1.0] * self.actions),
#"actual_obs": np.zeros((10, 10), dtype=np.float32),
"actual_obs": {"obs1": 1, "obs2": np.zeros((10, 10), dtype=np.float32)},
}
def main ():
ray.init()
select_env = "env-v1"
register_env(select_env, lambda config: MyEnv())
config = ppo.DEFAULT_CONFIG.copy()
config.update({
"env": select_env,
"framework": 'torch',
"log_level": 'DEBUG',
"model": {
"custom_model": TorchActionMaskModel,
# "no_final_linear": False,
},
})
agent = ppo.PPOTrainer(config, env=select_env)
for _ in range(5):
agent.train()
if __name__ == "__main__":
main()
2) there is also parametric_actions_model.py class TorchParametricActionsModel(DQNTorchModel):
that also makes action masking, docs describes it here Variable-length / Parametric Action Spaces
class MyParamActionEnv(gym.Env):
def __init__(self, max_avail_actions):
self.action_space = Discrete(max_avail_actions)
self.observation_space = Dict({
"action_mask": Box(0, 1, shape=(max_avail_actions, )),
"avail_actions": Box(-1, 1, shape=(max_avail_actions, action_embedding_sz)),
"real_obs": ...,
})
Anyone knows what's the difference TorchParametricActionsModel(DQNTorchModel)
and class TorchActionMaskModel(TorchModelV2, nn.Module)
? What class should I use for simple action masking PPO algorithm?