@@ -80,11 +80,11 @@ def _init_training_state(
     alpha_optimizer: optax.GradientTransformation,
     policy_optimizer: optax.GradientTransformation,
     q_optimizer: optax.GradientTransformation,
-    initial_alpha: float = 0.0,
+    initial_alpha: float = 1.0,
 ) -> TrainingState:
   """Inits the training state and replicates it over devices."""
   key_policy, key_q = jax.random.split(key)
-  log_alpha = jnp.asarray(initial_alpha, dtype=jnp.float32)
+  log_alpha = jnp.asarray(jnp.log(initial_alpha), dtype=jnp.float32)
   alpha_optimizer_state = alpha_optimizer.init(log_alpha)

   policy_params = sac_network.policy_network.init(key_policy)
@@ -162,10 +162,6 @@ def train(
     num_envs: the number of parallel environments to use for rollouts
       NOTE: `num_envs` must be divisible by the total number of chips since each
         chip gets `num_envs // total_number_of_chips` environments to roll out
-      NOTE: `batch_size * num_minibatches` must be divisible by `num_envs` since
-        data generated by `num_envs` parallel envs gets used for gradient
-        updates over `num_minibatches` of data, where each minibatch has a
-        leading dimension of `batch_size`
     num_eval_envs: the number of envs to use for evluation. Each env will run 1
       episode, and all envs run in parallel during eval.
     learning_rate: learning rate for SAC loss
@@ -178,7 +174,7 @@ def train(
     max_devices_per_host: maximum number of chips to use per host process
     reward_scaling: float scaling for reward
     tau: interpolation factor in polyak averaging for target networks
-    intial_alpha: initial value for the temperature parameter alpha
+    initial_alpha: initial value for the temperature parameter α
     min_replay_size: the minimum number of samples in the replay buffer before
       starting training. This is used to prefill the replay buffer with random
       samples before training starts