Commit 6983041

Observations as TensorDicts
Approved-by: Mayank Mittal
1 parent 491ca91 commit 6983041

25 files changed (+1394 / -735 lines)

README.md

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ The package supports the following logging frameworks which can be configured th
 * Weights & Biases: https://wandb.ai/site
 * Neptune: https://docs.neptune.ai/
 
-For a demo configuration of PPO, please check the [dummy_config.yaml](config/dummy_config.yaml) file.
+For a demo configuration of PPO, please check the [example_config.yaml](config/example_config.yaml) file.
 
 
 ## Contribution Guidelines

config/dummy_config.yaml

Lines changed: 0 additions & 95 deletions
This file was deleted.

config/example_config.yaml

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+runner:
+  class_name: OnPolicyRunner
+  # -- general
+  num_steps_per_env: 24  # number of steps per environment per iteration
+  max_iterations: 1500  # number of policy updates
+  seed: 1
+  # -- observations
+  obs_groups: {"policy": ["policy"], "critic": ["policy", "privileged"]}  # maps observation groups to types. See `vec_env.py` for more information
+  # -- logging parameters
+  save_interval: 50  # check for potential saves every `save_interval` iterations
+  experiment_name: walking_experiment
+  run_name: ""
+  # -- logging writer
+  logger: tensorboard  # tensorboard, neptune, wandb
+  neptune_project: legged_gym
+  wandb_project: legged_gym
+
+  # -- policy
+  policy:
+    class_name: ActorCritic
+    activation: elu
+    actor_obs_normalization: false
+    critic_obs_normalization: false
+    actor_hidden_dims: [256, 256, 256]
+    critic_hidden_dims: [256, 256, 256]
+    init_noise_std: 1.0
+    noise_std_type: "scalar"  # 'scalar' or 'log'
+
+  # -- algorithm
+  algorithm:
+    class_name: PPO
+    # -- training
+    learning_rate: 0.001
+    num_learning_epochs: 5
+    num_mini_batches: 4  # mini batch size = num_envs * num_steps / num_mini_batches
+    schedule: adaptive  # adaptive, fixed
+    # -- value function
+    value_loss_coef: 1.0
+    clip_param: 0.2
+    use_clipped_value_loss: true
+    # -- surrogate loss
+    desired_kl: 0.01
+    entropy_coef: 0.01
+    gamma: 0.99
+    lam: 0.95
+    max_grad_norm: 1.0
+    # -- miscellaneous
+    normalize_advantage_per_mini_batch: false
+
+    # -- random network distillation
+    rnd_cfg:
+      weight: 0.0  # initial weight of the RND reward
+      weight_schedule: null  # note: this is a dictionary with a required key called "mode". Please check the RND module for more information
+      reward_normalization: false  # whether to normalize RND reward
+      # -- learning parameters
+      learning_rate: 0.001  # learning rate for RND
+      # -- network parameters
+      num_outputs: 1  # number of outputs of RND network. Note: if -1, then the network will use dimensions of the observation
+      predictor_hidden_dims: [-1]  # hidden dimensions of predictor network
+      target_hidden_dims: [-1]  # hidden dimensions of target network
+
+    # -- symmetry augmentation
+    symmetry_cfg:
+      use_data_augmentation: true  # this adds symmetric trajectories to the batch
+      use_mirror_loss: false  # this adds symmetry loss term to the loss function
+      data_augmentation_func: null  # string containing the module and function name to import
+      # Example: "legged_gym.envs.locomotion.anymal_c.symmetry:get_symmetric_states"
+      #
+      # .. code-block:: python
+      #
+      #     @torch.no_grad()
+      #     def get_symmetric_states(
+      #         obs: Optional[torch.Tensor] = None, actions: Optional[torch.Tensor] = None, cfg: "BaseEnvCfg" = None, obs_type: str = "policy"
+      #     ) -> Tuple[torch.Tensor, torch.Tensor]:
+      #
+      mirror_loss_coeff: 0.0  # coefficient for symmetry loss term. If 0, no symmetry loss is used
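
The `obs_groups` entry above ties into the commit's theme of passing observations as TensorDicts: each observation group is a named tensor, and the mapping selects which groups the actor ("policy") and the critic see. Below is a minimal, hypothetical sketch of how such grouped observations could be assembled with `tensordict`; the feature sizes and the `concat_groups` helper are illustrative assumptions, not part of rsl_rl's API.

```python
# Hypothetical sketch: grouped observations as a TensorDict.
# The group names ("policy", "privileged") match the config above; the
# feature sizes and the concat_groups helper are illustrative only.
import torch
from tensordict import TensorDict

num_envs = 4
obs = TensorDict(
    {
        "policy": torch.randn(num_envs, 48),      # proprioceptive observations
        "privileged": torch.randn(num_envs, 12),  # extra state only the critic sees
    },
    batch_size=[num_envs],
)

obs_groups = {"policy": ["policy"], "critic": ["policy", "privileged"]}


def concat_groups(obs, groups):
    """Concatenate the requested observation groups along the feature dimension."""
    return torch.cat([obs[name] for name in groups], dim=-1)


actor_input = concat_groups(obs, obs_groups["policy"])   # shape (4, 48)
critic_input = concat_groups(obs, obs_groups["critic"])  # shape (4, 60)
```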

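The `data_augmentation_func` comment in the config only shows the expected call signature. The sketch below fills it in under stated assumptions: the mirror index and sign tensors are placeholders that a real environment would derive from its own observation and joint ordering, and `cfg` is left untyped because the environment config class is project specific.

```python
# Hypothetical symmetry function matching the signature shown in the config comment.
# OBS_MIRROR_IDX, ACT_MIRROR_IDX, and ACT_MIRROR_SIGN are placeholders.
from typing import Optional, Tuple

import torch

OBS_MIRROR_IDX = torch.tensor([1, 0, 3, 2])   # placeholder: swap left/right observation entries
ACT_MIRROR_IDX = torch.tensor([1, 0])         # placeholder: swap left/right actuators
ACT_MIRROR_SIGN = torch.tensor([1.0, -1.0])   # placeholder: flip lateral action components


@torch.no_grad()
def get_symmetric_states(
    obs: Optional[torch.Tensor] = None,
    actions: Optional[torch.Tensor] = None,
    cfg=None,
    obs_type: str = "policy",
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Return the original and mirrored samples stacked along the batch dimension."""
    sym_obs, sym_actions = None, None
    if obs is not None:
        mirrored_obs = obs[..., OBS_MIRROR_IDX]
        sym_obs = torch.cat([obs, mirrored_obs], dim=0)
    if actions is not None:
        mirrored_actions = actions[..., ACT_MIRROR_IDX] * ACT_MIRROR_SIGN
        sym_actions = torch.cat([actions, mirrored_actions], dim=0)
    return sym_obs, sym_actions
```
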
licenses/dependencies/tensordict.txt

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) Meta Platforms, Inc. and affiliates.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

pyproject.toml

Lines changed: 5 additions & 4 deletions
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "rsl-rl-lib"
-version = "2.3.3"
+version = "3.0.0"
 keywords = ["reinforcement-learning", "isaac", "leggedrobotics", "rl-pytorch"]
 maintainers = [
     { name="Clemens Schwarke", email="[email protected]" },
@@ -26,8 +26,9 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
-    "torch>=1.10.0",
+    "torch>=2.6.0",
     "torchvision>=0.5.0",
+    "tensordict>=0.7.0",
     "numpy>=1.16.4",
     "GitPython",
     "onnx",
@@ -46,7 +47,7 @@ include = ["rsl_rl*"]
 
 [tool.isort]
 
-py_version = 37
+py_version = 38
 line_length = 120
 group_by_package = true
 
@@ -79,7 +80,7 @@ known_first_party = "rsl_rl"
 include = ["rsl_rl"]
 
 typeCheckingMode = "basic"
-pythonVersion = "3.7"
+pythonVersion = "3.8"
 pythonPlatform = "Linux"
 enableTypeIgnoreComments = true
 

rsl_rl/algorithms/distillation.py

Lines changed: 23 additions & 22 deletions
@@ -3,14 +3,12 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 
-# torch
 import torch
 import torch.nn as nn
-import torch.optim as optim
 
-# rsl-rl
 from rsl_rl.modules import StudentTeacher, StudentTeacherRecurrent
 from rsl_rl.storage import RolloutStorage
+from rsl_rl.utils import resolve_optimizer
 
 
 class Distillation:
@@ -27,6 +25,7 @@ def __init__(
         learning_rate=1e-3,
         max_grad_norm=None,
         loss_type="mse",
+        optimizer="adam",
         device="cpu",
         # Distributed training parameters
         multi_gpu_cfg: dict | None = None,
@@ -42,13 +41,15 @@ def __init__(
         self.gpu_global_rank = 0
         self.gpu_world_size = 1
 
-        self.rnd = None  # TODO: remove when runner has a proper base class
-
         # distillation components
         self.policy = policy
         self.policy.to(self.device)
         self.storage = None  # initialized later
-        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)
+
+        # initialize the optimizer
+        self.optimizer = resolve_optimizer(optimizer)(self.policy.parameters(), lr=learning_rate)
+
+        # initialize the transition
         self.transition = RolloutStorage.Transition()
         self.last_hidden_states = None
 
@@ -59,40 +60,40 @@ def __init__(
         self.max_grad_norm = max_grad_norm
 
         # initialize the loss function
-        if loss_type == "mse":
-            self.loss_fn = nn.functional.mse_loss
-        elif loss_type == "huber":
-            self.loss_fn = nn.functional.huber_loss
+        loss_fn_dict = {
+            "mse": nn.functional.mse_loss,
+            "huber": nn.functional.huber_loss,
+        }
+        if loss_type in loss_fn_dict:
+            self.loss_fn = loss_fn_dict[loss_type]
         else:
-            raise ValueError(f"Unknown loss type: {loss_type}. Supported types are: mse, huber")
+            raise ValueError(f"Unknown loss type: {loss_type}. Supported types are: {list(loss_fn_dict.keys())}")
 
         self.num_updates = 0
 
-    def init_storage(
-        self, training_type, num_envs, num_transitions_per_env, student_obs_shape, teacher_obs_shape, actions_shape
-    ):
+    def init_storage(self, training_type, num_envs, num_transitions_per_env, obs, actions_shape):
         # create rollout storage
         self.storage = RolloutStorage(
             training_type,
             num_envs,
             num_transitions_per_env,
-            student_obs_shape,
-            teacher_obs_shape,
+            obs,
             actions_shape,
-            None,
             self.device,
         )
 
-    def act(self, obs, teacher_obs):
+    def act(self, obs):
         # compute the actions
         self.transition.actions = self.policy.act(obs).detach()
-        self.transition.privileged_actions = self.policy.evaluate(teacher_obs).detach()
+        self.transition.privileged_actions = self.policy.evaluate(obs).detach()
         # record the observations
         self.transition.observations = obs
-        self.transition.privileged_observations = teacher_obs
         return self.transition.actions
 
-    def process_env_step(self, rewards, dones, infos):
+    def process_env_step(self, obs, rewards, dones, extras):
+        # update the normalizers
+        self.policy.update_normalization(obs)
+
         # record the rewards and dones
         self.transition.rewards = rewards
         self.transition.dones = dones
@@ -110,7 +111,7 @@ def update(self):
         for epoch in range(self.num_learning_epochs):
             self.policy.reset(hidden_states=self.last_hidden_states)
             self.policy.detach_hidden_states()
-            for obs, _, _, privileged_actions, dones in self.storage.generator():
+            for obs, _, privileged_actions, dones in self.storage.generator():
 
                 # inference the student for gradient computation
                 actions = self.policy.act_inference(obs)
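
The constructor change above swaps the hard-coded `optim.Adam` for `resolve_optimizer(optimizer)`, so the optimizer class is now chosen by the `optimizer` string from the config. A minimal sketch of such a name-to-class lookup is shown below; the actual `rsl_rl.utils.resolve_optimizer` may differ in the names it accepts.

```python
# Minimal sketch of a string-to-optimizer lookup in the spirit of
# rsl_rl.utils.resolve_optimizer; the real implementation may differ.
import torch.optim as optim


def resolve_optimizer(name):
    """Map an optimizer name from the config to the corresponding torch.optim class."""
    optimizers = {
        "adam": optim.Adam,
        "adamw": optim.AdamW,
        "sgd": optim.SGD,
        "rmsprop": optim.RMSprop,
    }
    try:
        return optimizers[name.lower()]
    except KeyError:
        raise ValueError(f"Unknown optimizer: {name}. Supported optimizers are: {list(optimizers)}")
```

With this pattern, `resolve_optimizer("adam")(policy.parameters(), lr=1e-3)` reproduces the previous Adam behavior while letting the new `optimizer` config entry select a different optimizer.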
