How to properly do action masking in Ray (RLlib)?


1) It's unclear how to make action masking in RLlib any more complex than what the examples show. The following mask works fine, taken from the example action_mask_model.py with class TorchActionMaskModel(TorchModelV2, nn.Module):

self.observation_space = Dict({
    "action_mask": Box(0, 1, shape=(self.actions,)),
    "actual_obs": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
})

Now I want to make it more complex:

self.observation_space = Dict({
    "action_mask": Box(0, 1, shape=(self.actions,)),
    "actual_obs": Dict({
        "obs1": Discrete(10),
        "obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
    }),
})

This raises an error:

prev_layer_size = int(np.product(obs_space.shape))
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
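
As far as I can tell, the error comes from RLlib's FullyConnectedNetwork, which sizes its first layer from obs_space.shape. The nested Dict handed to it as "actual_obs" has no flat shape (its .shape is None), so int(np.product(None)) fails. A minimal check with plain gym, no RLlib needed:

import numpy as np
from gym.spaces import Box, Dict, Discrete

nested = Dict({
    "obs1": Discrete(10),
    "obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
})
print(nested.shape)  # None
# int(np.product(None)) -> TypeError: int() argument must be a string,
# a bytes-like object or a number, not 'NoneType'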

Does anyone know how I can fix it? Full reproducible code (python=3.8, rllib=1.12.0):

import numpy as np
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.registry import register_env
import gym
from gym.spaces import Box, Dict, Discrete

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MIN

torch, nn = try_import_torch()


# copy pasted from rllib/examples/models/action_mask_model.py
class TorchActionMaskModel(TorchModelV2, nn.Module):
    """PyTorch version of above ActionMaskingModel."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        **kwargs,
    ):
        orig_space = getattr(obs_space, "original_space", obs_space)
        assert (
            isinstance(orig_space, Dict)
            and "action_mask" in orig_space.spaces
            and "actual_obs" in orig_space.spaces
        )

        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kwargs
        )
        nn.Module.__init__(self)

        self.internal_model = TorchFC(
            orig_space["actual_obs"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["actual_obs"]})

        # Convert action_mask into a [0.0 || -inf]-type mask.
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)

        # Return masked logits.
        return logits + inf_mask, state

    def value_function(self):
        return self.internal_model.value_function()


class MyEnv(gym.Env):

    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(MyEnv, self).__init__()

        self.actions = 4

        self.action_space = Discrete(self.actions)
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(self.actions,)),
            #"actual_obs": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
            "actual_obs": Dict({
                "obs1": Discrete(10),
                "obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
            }),
        })
    
    def reset(self):
        return self._make_obs()
    
    def step(self, action):
        return self._make_obs(), 0, False, {}

    def _make_obs(self):
        return {
            "action_mask": np.array([1.0] * self.actions),
            #"actual_obs": np.zeros((10, 10), dtype=np.float32),
            "actual_obs": {"obs1": 1, "obs2": np.zeros((10, 10), dtype=np.float32)},
        }


def main():

    ray.init()

    select_env = "env-v1"
    register_env(select_env, lambda config: MyEnv())
    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        "env": select_env,
        "framework": 'torch',
        "log_level": 'DEBUG',
        "model": {
            "custom_model": TorchActionMaskModel,
            # "no_final_linear": False,
        },
    })

    agent = ppo.PPOTrainer(config, env=select_env)
    for _ in range(5):
        agent.train()


if __name__ == "__main__":
    main()
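
One direction that seems to sidestep the error (a sketch only, not verified against rllib=1.12.0): keep the nested Dict inside the env and expose "actual_obs" as a single flat Box via gym.spaces.flatten_space / gym.spaces.flatten, so the unmodified TorchActionMaskModel works again. (RLlib also ships a ComplexInputNetwork in ray.rllib.models.torch.complex_input_net that is meant for Dict/Tuple inputs and could replace TorchFC as the internal model, but the env-side change is smaller.) The MyFlatEnv name below is mine:

from gym.spaces import flatten_space, flatten

class MyFlatEnv(MyEnv):
    def __init__(self):
        super().__init__()
        # The "real" nested space, kept only to flatten samples against.
        self._nested_space = Dict({
            "obs1": Discrete(10),
            "obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
        })
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(self.actions,)),
            # flatten_space turns the nested Dict into one Box: 10 one-hot
            # slots for obs1 + 100 values for obs2 -> shape (110,).
            "actual_obs": flatten_space(self._nested_space),
        })

    def _make_obs(self):
        nested = {"obs1": 1, "obs2": np.zeros((10, 10), dtype=np.float32)}
        return {
            "action_mask": np.array([1.0] * self.actions),
            "actual_obs": flatten(self._nested_space, nested),
        }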

2) There is also parametric_actions_model.py with class TorchParametricActionsModel(DQNTorchModel), which also does action masking; the docs describe it under "Variable-length / Parametric Action Spaces":

class MyParamActionEnv(gym.Env):
    def __init__(self, max_avail_actions):
        self.action_space = Discrete(max_avail_actions)
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(max_avail_actions, )),
            "avail_actions": Box(-1, 1, shape=(max_avail_actions, action_embedding_sz)),
            "real_obs": ...,
        })
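
For context, the core of that model's forward() (paraphrased from the example; the real code reads the state from a "cart" key where I write "real_obs") embeds the observation and scores it against each available action's embedding, then applies the same log-mask trick:

def forward(self, input_dict, state, seq_lens):
    avail_actions = input_dict["obs"]["avail_actions"]  # (B, max_actions, embed_sz)
    action_mask = input_dict["obs"]["action_mask"]      # (B, max_actions)

    # Embed the real observation with an internal model.
    action_embed, _ = self.action_embed_model({"obs": input_dict["obs"]["real_obs"]})
    intent_vector = torch.unsqueeze(action_embed, 1)    # (B, 1, embed_sz)

    # One logit per action slot = dot(state embedding, action embedding).
    action_logits = torch.sum(avail_actions * intent_vector, dim=2)

    # Same [0.0 || -inf] masking as in TorchActionMaskModel.
    inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
    return action_logits + inf_mask, state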

Does anyone know the difference between TorchParametricActionsModel(DQNTorchModel) and TorchActionMaskModel(TorchModelV2, nn.Module)? Which one should I use for simple action masking with the PPO algorithm?
