Source code for omni.isaac.orbit.envs.rl_task_env

# Copyright (c) 2022-2024, The ORBIT Project Developers.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause

# needed to allow type hints such as: np.ndarray | None
from __future__ import annotations

import gymnasium as gym
import math
import numpy as np
import torch
from collections.abc import Sequence
from typing import Any, ClassVar

from omni.isaac.version import get_version

from omni.isaac.orbit.managers import CommandManager, CurriculumManager, RewardManager, TerminationManager

from .base_env import BaseEnv, VecEnvObs
from .rl_task_env_cfg import RLTaskEnvCfg

VecEnvStepReturn = tuple[VecEnvObs, torch.Tensor, torch.Tensor, torch.Tensor, dict]
"""The environment signals processed at the end of each step.

The tuple contains batched information for each sub-environment. The information is stored in the following order:

1. **Observations**: The observations from the environment.
2. **Rewards**: The rewards from the environment.
3. **Terminated Dones**: Whether the environment reached a terminal state, such as task success or the robot falling.
4. **Timeout Dones**: Whether the environment reached a timeout state, such as reaching the maximum episode length.
5. **Extras**: A dictionary containing additional information from the environment.
"""


class RLTaskEnv(BaseEnv, gym.Env):
    """The superclass for reinforcement learning-based environments.

    This class inherits from :class:`BaseEnv` and implements the core functionality for
    reinforcement learning-based environments. It is designed to be used with any RL library
    and is intended for vectorized environments, i.e., the environment is expected to be run
    in parallel with multiple sub-environments. The number of sub-environments is specified
    using ``num_envs``.

    Each observation from the environment is a batch of observations for each sub-environment.
    The method :meth:`step` is also expected to receive a batch of actions for each
    sub-environment.

    While the environment itself is implemented as a vectorized environment, we do not
    inherit from :class:`gym.vector.VectorEnv`. This is mainly because the class adds
    various methods (for wait and asynchronous updates) which are not required.
    Additionally, each RL library typically has its own definition for a vectorized
    environment. Thus, to reduce complexity, we directly use :class:`gym.Env` here and
    leave it up to library-defined wrappers to take care of wrapping this environment
    for their agents.

    Note:
        For vectorized environments, it is recommended to **only** call the :meth:`reset`
        method once before the first call to :meth:`step`, i.e. after the environment is
        created. After that, the :meth:`step` function handles the reset of terminated
        sub-environments. This is because the simulator does not support resetting
        individual sub-environments in a vectorized environment.

    """

    is_vector_env: ClassVar[bool] = True
    """Whether the environment is a vectorized environment."""

    metadata: ClassVar[dict[str, Any]] = {
        "render_modes": [None, "human", "rgb_array"],
        "isaac_sim_version": get_version(),
    }
    """Metadata for the environment."""

    cfg: RLTaskEnvCfg
    """Configuration for the environment."""
    def __init__(self, cfg: RLTaskEnvCfg, render_mode: str | None = None, **kwargs):
        """Initialize the environment.

        Args:
            cfg: The configuration for the environment.
            render_mode: The render mode for the environment. Defaults to None, which
                is similar to ``"human"``.
        """
        # initialize the base class to setup the scene.
        super().__init__(cfg=cfg)
        # store the render mode
        self.render_mode = render_mode

        # initialize data and constants
        # -- counter for curriculum
        self.common_step_counter = 0
        # -- init buffers
        self.episode_length_buf = torch.zeros(self.num_envs, device=self.device, dtype=torch.long)

        # setup the action and observation spaces for Gym
        self._configure_gym_env_spaces()

        # perform events at the start of the simulation
        if "startup" in self.event_manager.available_modes:
            self.event_manager.apply(mode="startup")

        # print the environment information
        print("[INFO]: Completed setting up the environment...")
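
    # A construction sketch, assuming a hypothetical user-defined configuration class
    # ``MyTaskEnvCfg`` that subclasses :class:`RLTaskEnvCfg`:
    #
    #   cfg = MyTaskEnvCfg()
    #   env = RLTaskEnv(cfg=cfg, render_mode="rgb_array")
    #   obs, extras = env.reset(seed=42)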
""" Properties. """ @property def max_episode_length_s(self) -> float: """Maximum episode length in seconds.""" return self.cfg.episode_length_s @property def max_episode_length(self) -> int: """Maximum episode length in environment steps.""" return math.ceil(self.max_episode_length_s / self.step_dt) """ Operations - Setup. """
    """
    Operations - Setup.
    """

    def load_managers(self):
        # note: this order is important since observation manager needs to know the command and action managers
        # and the reward manager needs to know the termination manager
        # -- command manager
        self.command_manager: CommandManager = CommandManager(self.cfg.commands, self)
        print("[INFO] Command Manager: ", self.command_manager)
        # call the parent class to load the managers for observations and actions.
        super().load_managers()
        # prepare the managers
        # -- termination manager
        self.termination_manager = TerminationManager(self.cfg.terminations, self)
        print("[INFO] Termination Manager: ", self.termination_manager)
        # -- reward manager
        self.reward_manager = RewardManager(self.cfg.rewards, self)
        print("[INFO] Reward Manager: ", self.reward_manager)
        # -- curriculum manager
        self.curriculum_manager = CurriculumManager(self.cfg.curriculum, self)
        print("[INFO] Curriculum Manager: ", self.curriculum_manager)
""" Operations - MDP """
    def step(self, action: torch.Tensor) -> VecEnvStepReturn:
        """Execute one time-step of the environment's dynamics and reset terminated environments.

        Unlike the :meth:`BaseEnv.step` method, this function performs the following operations:

        1. Process the actions.
        2. Perform physics stepping.
        3. Perform rendering if gui is enabled.
        4. Update the environment counters and compute the rewards and terminations.
        5. Reset the environments that terminated.
        6. Compute the observations.
        7. Return the observations, rewards, resets and extras.

        Args:
            action: The actions to apply on the environment. Shape is (num_envs, action_dim).

        Returns:
            A tuple containing the observations, rewards, resets (terminated and truncated) and extras.
        """
        # process actions
        self.action_manager.process_action(action)
        # perform physics stepping
        for _ in range(self.cfg.decimation):
            # set actions into buffers
            self.action_manager.apply_action()
            # set actions into simulator
            self.scene.write_data_to_sim()
            # simulate
            self.sim.step(render=False)
            # update buffers at sim dt
            self.scene.update(dt=self.physics_dt)
        # perform rendering if gui is enabled
        if self.sim.has_gui() or self.sim.has_rtx_sensors():
            self.sim.render()

        # post-step:
        # -- update env counters (used for curriculum generation)
        self.episode_length_buf += 1  # step in current episode (per env)
        self.common_step_counter += 1  # total step (common for all envs)
        # -- check terminations
        self.reset_buf = self.termination_manager.compute()
        self.reset_terminated = self.termination_manager.terminated
        self.reset_time_outs = self.termination_manager.time_outs
        # -- reward computation
        self.reward_buf = self.reward_manager.compute(dt=self.step_dt)

        # -- reset envs that terminated/timed-out and log the episode information
        reset_env_ids = self.reset_buf.nonzero(as_tuple=False).squeeze(-1)
        if len(reset_env_ids) > 0:
            self._reset_idx(reset_env_ids)

        # -- update command
        self.command_manager.compute(dt=self.step_dt)
        # -- step interval events
        if "interval" in self.event_manager.available_modes:
            self.event_manager.apply(mode="interval", dt=self.step_dt)
        # -- compute observations
        # note: done after reset to get the correct observations for reset envs
        self.obs_buf = self.observation_manager.compute()

        # return observations, rewards, resets and extras
        return self.obs_buf, self.reward_buf, self.reset_terminated, self.reset_time_outs, self.extras
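
    # A rollout sketch, assuming an already-constructed environment ``env``; zero
    # actions are used purely as a placeholder:
    #
    #   obs, extras = env.reset()
    #   for _ in range(1000):
    #       actions = torch.zeros(env.action_space.shape, device=env.device)
    #       obs, rew, terminated, truncated, extras = env.step(actions)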
    def render(self, recompute: bool = False) -> np.ndarray | None:
        """Run rendering without stepping through the physics.

        By convention, if mode is:

        - **human**: Render to the current display and return nothing. Usually for human consumption.
        - **rgb_array**: Return a numpy.ndarray with shape (x, y, 3), representing RGB values for an
          x-by-y pixel image, suitable for turning into a video.

        Args:
            recompute: Whether to force a render even if the simulator has already rendered the scene.
                Defaults to False.

        Returns:
            The rendered image as a numpy array if mode is "rgb_array". Otherwise, returns None.

        Raises:
            RuntimeError: If mode is set to "rgb_array" and simulation render mode does not support it.
                In this case, the simulation render mode must be set to ``RenderMode.PARTIAL_RENDERING``
                or ``RenderMode.FULL_RENDERING``.
            NotImplementedError: If an unsupported rendering mode is specified.
        """
        # run a rendering step of the simulator
        # if we have rtx sensors, we do not need to render again since the simulation has already rendered the scene
        if not self.sim.has_rtx_sensors() and not recompute:
            self.sim.render()
        # decide the rendering mode
        if self.render_mode == "human" or self.render_mode is None:
            return None
        elif self.render_mode == "rgb_array":
            # check if any render could have happened
            if self.sim.render_mode.value < self.sim.RenderMode.PARTIAL_RENDERING.value:
                raise RuntimeError(
                    f"Cannot render '{self.render_mode}' when the simulation render mode is"
                    f" '{self.sim.render_mode.name}'. Please set the simulation render mode to:"
                    f"'{self.sim.RenderMode.PARTIAL_RENDERING.name}' or '{self.sim.RenderMode.FULL_RENDERING.name}'."
                )
            # create the annotator if it does not exist
            if not hasattr(self, "_rgb_annotator"):
                import omni.replicator.core as rep

                # create render product
                self._render_product = rep.create.render_product(
                    self.cfg.viewer.cam_prim_path, self.cfg.viewer.resolution
                )
                # create rgb annotator -- used to read data from the render product
                self._rgb_annotator = rep.AnnotatorRegistry.get_annotator("rgb", device="cpu")
                self._rgb_annotator.attach([self._render_product])
            # obtain the rgb data
            rgb_data = self._rgb_annotator.get_data()
            # convert to numpy array
            rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape)
            # return the rgb data
            # note: initially the renderer is warming up and returns empty data
            if rgb_data.size == 0:
                return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8)
            else:
                return rgb_data[:, :, :3]
        else:
            raise NotImplementedError(
                f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}."
            )
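
    # A frame-capture sketch, assuming the environment was created with
    # ``render_mode="rgb_array"``:
    #
    #   frame = env.render()
    #   # -> np.ndarray of shape (height, width, 3) with dtype uint8, where
    #   #    (width, height) == env.cfg.viewer.resolution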
    def close(self):
        if not self._is_closed:
            # destructor is order-sensitive
            del self.command_manager
            del self.reward_manager
            del self.termination_manager
            del self.curriculum_manager
            # call the parent class to close the environment
            super().close()
""" Helper functions. """ def _configure_gym_env_spaces(self): """Configure the action and observation spaces for the Gym environment.""" # observation space (unbounded since we don't impose any limits) self.single_observation_space = gym.spaces.Dict() for group_name, group_term_names in self.observation_manager.active_terms.items(): # extract quantities about the group has_concatenated_obs = self.observation_manager.group_obs_concatenate[group_name] group_dim = self.observation_manager.group_obs_dim[group_name] group_term_dim = self.observation_manager.group_obs_term_dim[group_name] # check if group is concatenated or not # if not concatenated, then we need to add each term separately as a dictionary if has_concatenated_obs: self.single_observation_space[group_name] = gym.spaces.Box(low=-np.inf, high=np.inf, shape=group_dim) else: self.single_observation_space[group_name] = gym.spaces.Dict({ term_name: gym.spaces.Box(low=-np.inf, high=np.inf, shape=term_dim) for term_name, term_dim in zip(group_term_names, group_term_dim) }) # action space (unbounded since we don't impose any limits) action_dim = sum(self.action_manager.action_term_dim) self.single_action_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(action_dim,)) # batch the spaces for vectorized environments self.observation_space = gym.vector.utils.batch_space(self.single_observation_space, self.num_envs) self.action_space = gym.vector.utils.batch_space(self.single_action_space, self.num_envs) def _reset_idx(self, env_ids: Sequence[int]): """Reset environments based on specified indices. Args: env_ids: List of environment ids which must be reset """ # update the curriculum for environments that need a reset self.curriculum_manager.compute(env_ids=env_ids) # reset the internal buffers of the scene elements self.scene.reset(env_ids) # apply events such as randomizations for environments that need a reset if "reset" in self.event_manager.available_modes: self.event_manager.apply(env_ids=env_ids, mode="reset") # iterate over all managers and reset them # this returns a dictionary of information which is stored in the extras # note: This is order-sensitive! Certain things need be reset before others. self.extras["log"] = dict() # -- observation manager info = self.observation_manager.reset(env_ids) self.extras["log"].update(info) # -- action manager info = self.action_manager.reset(env_ids) self.extras["log"].update(info) # -- rewards manager info = self.reward_manager.reset(env_ids) self.extras["log"].update(info) # -- curriculum manager info = self.curriculum_manager.reset(env_ids) self.extras["log"].update(info) # -- command manager info = self.command_manager.reset(env_ids) self.extras["log"].update(info) # -- event manager info = self.event_manager.reset(env_ids) self.extras["log"].update(info) # -- termination manager info = self.termination_manager.reset(env_ids) self.extras["log"].update(info) # reset the episode length buffer self.episode_length_buf[env_ids] = 0