Add uv index for source install. Add warp impl to training script.

btaba · copybara-github · commit 4981d0560f97 · 2025-08-15T10:04:56.000-07:00
PiperOrigin-RevId: 795515399
Change-Id: I4e3397893d5a417c1888dc2b10ff39eabfcfda26
diff --git a/README.md b/README.md
@@ -8,13 +8,16 @@ A comprehensive suite of GPU-accelerated environments for robot learning researc
 
 Features include:
 
-- Classic control environments from `dm_control` reimplemented in MJX.
+- Classic control environments from `dm_control`.
 - Quadruped and bipedal locomotion environments.
 - Non-prehensile and dexterous manipulation environments.
 - Vision-based support available via [Madrona-MJX](https://github.com/shacklettbp/madrona_mjx).
 
 For more details, check out the project [website](https://playground.mujoco.org/).
 
+> [!NOTE]
+> We now support training with both the MuJoCo MJX JAX implementation, as well as the [MuJoCo Warp](https://github.com/google-deepmind/mujoco_warp) implementation at HEAD. See MuJoCo 3.3.5 [release notes](https://mujoco.readthedocs.io/en/stable/changelog.html#version-3-3-5-august-8-2025) under `MJX` for more details.
+
 ## Installation
 
 You can install MuJoCo Playground directly from PyPI:
@@ -23,6 +26,14 @@ You can install MuJoCo Playground directly from PyPI:
 pip install playground
 ```
 
+> [!WARNING]
+> The `playground` release may depend on pre-release versions of `mujoco` and
+> `warp-lang`, in which case you can try `pip install playground
+> --extra-index-url=https://py.mujoco.org
+> --extra-index-url=https://pypi.nvidia.com/warp-lang/`.
+> If there are still version mismatches, please open a github issue, and install
+> from source.
+
 ### From Source
 
 > [!IMPORTANT]
diff --git a/learning/train_jax_ppo.py b/learning/train_jax_ppo.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Train a PPO agent using JAX on the specified environment."""
 
-from datetime import datetime
+import datetime
 import functools
 import json
 import os
@@ -28,22 +28,20 @@
 from brax.training.agents.ppo import networks_vision as ppo_networks_vision
 from brax.training.agents.ppo import train as ppo
 from etils import epath
-from flax.training import orbax_utils
 import jax
 import jax.numpy as jp
 import mediapy as media
 from ml_collections import config_dict
 import mujoco
-from orbax import checkpoint as ocp
-from tensorboardX import SummaryWriter
-import wandb
-
 import mujoco_playground
 from mujoco_playground import registry
 from mujoco_playground import wrapper
 from mujoco_playground.config import dm_control_suite_params
 from mujoco_playground.config import locomotion_params
 from mujoco_playground.config import manipulation_params
+import tensorboardX
+import wandb
+
 
 xla_flags = os.environ.get("XLA_FLAGS", "")
 xla_flags += " --xla_gpu_triton_gemm_any=True"
@@ -69,6 +67,7 @@
     "LeapCubeReorient",
     f"Name of the environment. One of {', '.join(registry.ALL_ENVS)}",
 )
+_IMPL = flags.DEFINE_enum("impl", "jax", ["jax", "warp"], "MJX implementation")
 _VISION = flags.DEFINE_boolean("vision", False, "Use vision input")
 _LOAD_CHECKPOINT_PATH = flags.DEFINE_string(
     "load_checkpoint_path", None, "Path to load checkpoint from"
@@ -92,6 +91,9 @@
 _NUM_TIMESTEPS = flags.DEFINE_integer(
     "num_timesteps", 1_000_000, "Number of timesteps"
 )
+_NUM_VIDEOS = flags.DEFINE_integer(
+    "num_videos", 1, "Number of videos to record after training."
+)
 _NUM_EVALS = flags.DEFINE_integer("num_evals", 5, "Number of evaluations")
 _REWARD_SCALING = flags.DEFINE_float("reward_scaling", 0.1, "Reward scaling")
 _EPISODE_LENGTH = flags.DEFINE_integer("episode_length", 1000, "Episode length")
@@ -269,7 +271,7 @@ def main(argv):
   print(f"PPO Training Parameters:\n{ppo_params}")
 
   # Generate unique experiment name
-  now = datetime.now()
+  now = datetime.datetime.now()
   timestamp = now.strftime("%Y%m%d-%H%M%S")
   exp_name = f"{_ENV_NAME.value}-{timestamp}"
   if _SUFFIX.value is not None:
@@ -289,7 +291,7 @@ def main(argv):
 
   # Initialize TensorBoard if required
   if _USE_TB.value and not _PLAY_ONLY.value:
-    writer = SummaryWriter(logdir)
+    writer = tensorboardX.SummaryWriter(logdir)
 
   # Handle checkpoint loading
   if _LOAD_CHECKPOINT_PATH.value is not None:
@@ -393,18 +395,26 @@ def progress(num_steps, metrics):
             f" reward={metrics['episode/sum_reward']:.3f}"
         )
 
-  # Load evaluation environment
-  eval_env = (
-      None if _VISION.value else registry.load(_ENV_NAME.value, config=env_cfg)
-  )
+  # Load evaluation environment.
+  config_overrides = {"impl": _IMPL.value}
+  eval_env = None
+  if not _VISION.value:
+    eval_env = registry.load(
+        _ENV_NAME.value, config=env_cfg, config_overrides=config_overrides
+    )
+  num_envs = 1
+  if _VISION.value:
+    num_envs = env_cfg.vision_config.render_batch_size
 
   policy_params_fn = lambda *args: None
   if _RSCOPE_ENVS.value:
     # Interactive visualisation of policy checkpoints
     from rscope import brax as rscope_utils
 
     if not _VISION.value:
-      rscope_env = registry.load(_ENV_NAME.value, config=env_cfg)
+      rscope_env = registry.load(
+          _ENV_NAME.value, config=env_cfg, config_overrides=config_overrides
+      )
       rscope_env = wrapper.wrap_for_brax_training(
           rscope_env,
           episode_length=ppo_params.episode_length,
@@ -433,7 +443,7 @@ def policy_params_fn(current_step, make_policy, params):  # pylint: disable=unus
       environment=env,
       progress_fn=progress,
       policy_params_fn=policy_params_fn,
-      eval_env=None if _VISION.value else eval_env,
+      eval_env=eval_env,
   )
 
   print("Done training.")
@@ -443,63 +453,69 @@ def policy_params_fn(current_step, make_policy, params):  # pylint: disable=unus
 
   print("Starting inference...")
 
-  # Create inference function
+  # Create inference function.
   inference_fn = make_inference_fn(params, deterministic=True)
   jit_inference_fn = jax.jit(inference_fn)
 
-  # Prepare for evaluation
-  eval_env = (
-      None if _VISION.value else registry.load(_ENV_NAME.value, config=env_cfg)
-  )
-  num_envs = 1
-  if _VISION.value:
-    eval_env = env
-    num_envs = env_cfg.vision_config.render_batch_size
-
-  jit_reset = jax.jit(eval_env.reset)
-  jit_step = jax.jit(eval_env.step)
-
-  rng = jax.random.PRNGKey(123)
-  rng, reset_rng = jax.random.split(rng)
-  if _VISION.value:
-    reset_rng = jp.asarray(jax.random.split(reset_rng, num_envs))
-  state = jit_reset(reset_rng)
-  state0 = (
-      jax.tree_util.tree_map(lambda x: x[0], state) if _VISION.value else state
-  )
-  rollout = [state0]
-
-  # Run evaluation rollout
-  for _ in range(env_cfg.episode_length):
-    act_rng, rng = jax.random.split(rng)
-    ctrl, _ = jit_inference_fn(state.obs, act_rng)
-    state = jit_step(state, ctrl)
-    state0 = (
-        jax.tree_util.tree_map(lambda x: x[0], state)
-        if _VISION.value
-        else state
+  # Run evaluation rollouts.
+  def do_rollout(rng, state):
+    empty_data = state.data.__class__(
+        **{k: None for k in state.data.__annotations__}
+    )  # pytype: disable=attribute-error
+    empty_traj = state.__class__(**{k: None for k in state.__annotations__})  # pytype: disable=attribute-error
+    empty_traj = empty_traj.replace(data=empty_data)
+
+    def step(carry, _):
+      state, rng = carry
+      rng, act_key = jax.random.split(rng)
+      act = jit_inference_fn(state.obs, act_key)[0]
+      state = eval_env.step(state, act)
+      traj_data = empty_traj.tree_replace({
+          "data.qpos": state.data.qpos,
+          "data.qvel": state.data.qvel,
+          "data.time": state.data.time,
+          "data.ctrl": state.data.ctrl,
+          "data.mocap_pos": state.data.mocap_pos,
+          "data.mocap_quat": state.data.mocap_quat,
+          "data.xfrc_applied": state.data.xfrc_applied,
+      })
+      if _VISION.value:
+        traj_data = jax.tree_util.tree_map(lambda x: x[0], traj_data)
+      return (state, rng), traj_data
+
+    _, traj = jax.lax.scan(
+        step, (state, rng), None, length=_EPISODE_LENGTH.value
     )
-    rollout.append(state0)
-    if state0.done:
-      break
+    return traj
 
-  # Render and save the rollout
+  rng = jax.random.split(jax.random.PRNGKey(_SEED.value), _NUM_VIDEOS.value)
+  reset_states = jax.jit(jax.vmap(eval_env.reset))(rng)
+  if _VISION.value:
+    reset_states = jax.tree_util.tree_map(lambda x: x[0], reset_states)
+  traj_stacked = jax.jit(jax.vmap(do_rollout))(rng, reset_states)
+  trajectories = [None] * _NUM_VIDEOS.value
+  for i in range(_NUM_VIDEOS.value):
+    t = jax.tree.map(lambda x, i=i: x[i], traj_stacked)
+    trajectories[i] = [
+        jax.tree.map(lambda x, j=j: x[j], t)
+        for j in range(_EPISODE_LENGTH.value)
+    ]
+
+  # Render and save the rollout.
   render_every = 2
   fps = 1.0 / eval_env.dt / render_every
   print(f"FPS for rendering: {fps}")
-
-  traj = rollout[::render_every]
-
   scene_option = mujoco.MjvOption()
   scene_option.flags[mujoco.mjtVisFlag.mjVIS_TRANSPARENT] = False
   scene_option.flags[mujoco.mjtVisFlag.mjVIS_PERTFORCE] = False
   scene_option.flags[mujoco.mjtVisFlag.mjVIS_CONTACTFORCE] = False
-
-  frames = eval_env.render(
-      traj, height=480, width=640, scene_option=scene_option
-  )
-  media.write_video("rollout.mp4", frames, fps=fps)
-  print("Rollout video saved as 'rollout.mp4'.")
+  for i, rollout in enumerate(trajectories):
+    traj = rollout[::render_every]
+    frames = eval_env.render(
+        traj, height=480, width=640, scene_option=scene_option
+    )
+    media.write_video(f"rollout{i}.mp4", frames, fps=fps)
+    print(f"Rollout video saved as 'rollout{i}.mp4'.")
 
 
 if __name__ == "__main__":
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,18 +23,37 @@ classifiers = [
     "Topic :: Scientific/Engineering",
 ]
 dependencies = [
-    "brax>=0.12.1",
+    "brax>=0.12.5",
     "etils",
     "flax",
     "jax",
     "lxml",
+    "mediapy",
     "ml_collections",
-    "mujoco-mjx>=3.2.7",
-    "mujoco>=3.2.7",
+    "mujoco-mjx>=3.3.6.dev",
+    "mujoco>=3.3.6.dev",
+    "orbax-checkpoint>=0.11.22",
     "tqdm",
+    "warp-lang>=1.9.0.dev",
+    "wandb",
 ]
 keywords = ["mjx", "mujoco", "sim2real", "reinforcement learning"]
 
+[[tool.uv.index]]
+name = "mujoco"
+url = "https://py.mujoco.org"
+explicit = true
+
+[[tool.uv.index]]
+name = "nvidia"
+url = "https://pypi.nvidia.com"
+explicit = true
+
+[tool.uv.sources]
+mujoco = { index = "mujoco" }
+warp-lang = { index = "nvidia" }
+mujoco-mjx = { git = "https://github.com/google-deepmind/mujoco", rev = "977f94e9dfba12e1861fe5cbc56c50bfb734a78e", subdirectory = "mjx" }
+
 [project.optional-dependencies]
 test = [
     "absl-py",
@@ -44,9 +63,7 @@ test = [
 ]
 notebooks = [
     "matplotlib",
-    "mediapy",
     "jupyter",
-    "wandb",
 ]
 dev = [
     "playground[test]",
@@ -63,6 +80,9 @@ all = [
     "playground[notebooks]",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true
+
 [tool.hatch.build.targets.wheel]
 packages = ["mujoco_playground"]