Commit cb4e41a

wang55 committed
count the expert usage as well for MoE
1 parent d791c85 commit cb4e41a

File tree (2 files changed: +32, -27 lines)

torchtitan/components/optimizer.py
torchtitan/models/moe.py

torchtitan/components/optimizer.py

Lines changed: 18 additions & 14 deletions
@@ -359,15 +359,12 @@ def _update_expert_bias(
                 if not transformer_block.moe_enabled:
                     continue
                 moe = transformer_block.moe
-                if moe.load_balance_coeff is None:
-                    continue
                 tokens_per_expert = transformer_block.moe.tokens_per_expert
-                if is_full_recompute(transformer_block.moe) or is_full_recompute(
-                    transformer_block
-                ):
+                if is_full_recompute(transformer_block):
                     # TODO: This is a hack; we assume that with full AC, tokens_per_expert is counted twice.
                     # This does not affect the expert choice, but it does affect the expert usage metrics.
                     # We divide by 2 to correct for this double-counting due to recomputation.
+                    # TODO: new API to help determine if AC is enabled https://github.com/pytorch/pytorch/pull/160888
                     tokens_per_expert = tokens_per_expert // 2
                 tokens_per_expert_list.append(tokens_per_expert)
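A rough illustration of the halving above (a standalone sketch with made-up numbers, not part of the commit): under full activation checkpointing the MoE forward runs once more during recomputation in the backward pass, so each entry of the buffer ends up roughly doubled by the time the optimizer hook reads it.

import torch

# Hypothetical counts: each of 4 experts actually received [100, 40, 60, 56] tokens,
# but the buffer was incremented in both the original and the recomputed forward.
tokens_per_expert = torch.tensor([200.0, 80.0, 120.0, 112.0])

# Floor division by 2 recovers the true counts while keeping dtype and shape.
corrected = tokens_per_expert // 2
print(corrected)  # tensor([100., 40., 60., 56.])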

@@ -390,30 +387,37 @@ def _update_expert_bias(
             tokens_per_expert_by_layer, group=pg, op=torch.distributed.ReduceOp.SUM
         )
 
-        layer_idx = 0
+        moe_layer_idx = 0
         with torch.no_grad():
             for model_part in model_parts:
-                for transformer_block in model_part.layers.values():
+                for layer_id, transformer_block in enumerate(
+                    model_part.layers.values()
+                ):
                     if not transformer_block.moe_enabled:
                         continue
                     moe = transformer_block.moe
 
+                    tokens_per_expert = tokens_per_expert_by_layer[
+                        moe_layer_idx
+                    ].float()
+                    moe_layer_idx += 1
+                    # uncomment to log expert usage once we fix https://github.com/pytorch/torchtitan/pull/1578
+                    # sum_tokens = tokens_per_expert.sum().clamp(min=1.0)
+                    # expert_usage_metrics = {
+                    #     f"moe_ep_usage/L-{layer_id}_EP-{ep_idx}": usage / sum_tokens
+                    #     for ep_idx, usage in enumerate(tokens_per_expert)
+                    # }
+
                     if moe.load_balance_coeff is None:
                         continue
-
-                    tokens_per_expert = tokens_per_expert_by_layer[layer_idx].float()
-                    layer_idx += 1
-
                     # update the expert bias
-                    # https://github.com/pytorch/torchtitan/issues/1506
                     # this is not exactly the same as https://arxiv.org/pdf/2408.15664 proposed
-                    expert_bias_delta = load_balance_coeff * torch.sign(
+                    expert_bias_delta = moe.load_balance_coeff * torch.sign(
                         tokens_per_expert.mean() - tokens_per_expert
                     )
                     expert_bias_delta = expert_bias_delta - expert_bias_delta.mean()
                     moe.expert_bias.add_(expert_bias_delta)
                     moe.tokens_per_expert.zero_()
-                    # placeholder to record and log the expert usage
 
     optimizers.register_step_pre_hook(
         lambda *args, **kwargs: _update_expert_bias(
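For reference, a self-contained sketch of what the hook computes per MoE layer (illustrative tensors and a made-up coefficient; the real code reads rows of the all-reduced tokens_per_expert_by_layer and writes into moe.expert_bias):

import torch

tokens_per_expert = torch.tensor([300.0, 120.0, 500.0, 104.0])  # hypothetical counts for one layer
load_balance_coeff = 1e-3                                        # assumed value of moe.load_balance_coeff

# Expert usage fractions, as in the commented-out metrics block above.
sum_tokens = tokens_per_expert.sum().clamp(min=1.0)
expert_usage = tokens_per_expert / sum_tokens

# Sign-based update from the diff: over-used experts get a negative delta,
# under-used ones a positive delta; subtracting the mean keeps the bias centered.
expert_bias_delta = load_balance_coeff * torch.sign(tokens_per_expert.mean() - tokens_per_expert)
expert_bias_delta = expert_bias_delta - expert_bias_delta.mean()

expert_bias = torch.zeros_like(tokens_per_expert)
expert_bias.add_(expert_bias_delta)   # in the real hook this is moe.expert_bias.add_(...)
tokens_per_expert.zero_()             # counts are reset after every bias update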

torchtitan/models/moe.py

Lines changed: 14 additions & 13 deletions
@@ -350,13 +350,14 @@ def __init__(self, moe_args: MoEArgs, dim: int, hidden_dim: int):
                 torch.zeros(num_experts, dtype=torch.float32),
                 persistent=True,
             )
-            self.register_buffer(
-                "tokens_per_expert",
-                torch.zeros(num_experts, dtype=torch.float32),
-                persistent=False,
-            )
         else:
             self.expert_bias = None
+        # We create the tokens_per_expert buffer regardless, to help count the expert usage.
+        self.register_buffer(
+            "tokens_per_expert",
+            torch.zeros(num_experts, dtype=torch.float32),
+            persistent=False,
+        )
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
@@ -378,12 +379,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         ) = self.router(x, self.expert_bias)
 
         # tokens_per_expert will be used to update the expert bias for load balancing.
+        # and also to count the expert usage
         # TODO: Activation Checkpointing has the side effect of double counting tokens_per_expert --
         # first in the forward pass, and then in the backward pass. However, this has no
         # effect on the expert bias update thanks to the torch.sign() operator.
-        if self.load_balance_coeff is not None:
-            with torch.no_grad():
-                self.tokens_per_expert.add_(num_tokens_per_expert)
+        with torch.no_grad():
+            self.tokens_per_expert.add_(num_tokens_per_expert)
 
         # top_scores and token_indices_experts_sorted shape (bs*slen*top_k,)
         # num_tokens_per_expert shape (num_experts,)
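The tokens_per_expert buffer that forward() accumulates into is registered with persistent=False in __init__ (see the first hunk of this file), which means it follows the module across devices but is excluded from checkpoints. A generic PyTorch sketch of that behavior, not torchtitan code:

import torch
import torch.nn as nn

class UsageCounter(nn.Module):
    def __init__(self, num_experts: int = 4):
        super().__init__()
        # persistent=True (the default): saved in and loaded from state_dict.
        self.register_buffer("expert_bias", torch.zeros(num_experts), persistent=True)
        # persistent=False: moves with .to()/.cuda(), but omitted from state_dict.
        self.register_buffer("tokens_per_expert", torch.zeros(num_experts), persistent=False)

m = UsageCounter()
print(sorted(m.state_dict().keys()))  # ['expert_bias'] -- the usage counter is never checkpointed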
@@ -444,11 +445,11 @@ def init_weights(
         if self.shared_experts is not None:
             self.shared_experts.init_weights(init_std)
 
-        if self.load_balance_coeff is not None:
-            with torch.device(buffer_device):
+        with torch.device(buffer_device):
+            self.tokens_per_expert = torch.zeros(
+                self.experts.num_experts, dtype=torch.float32
+            )
+            if self.load_balance_coeff is not None:
                 self.expert_bias = torch.zeros(
                     self.experts.num_experts, dtype=torch.float32
                 )
-                self.tokens_per_expert = torch.zeros(
-                    self.experts.num_experts, dtype=torch.float32
-                )
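A minimal illustration of the torch.device context manager used in init_weights above (generic PyTorch, with "cpu" standing in for the real buffer_device argument): factory calls such as torch.zeros made inside the block allocate on that device, which is typically how buffers get re-materialized on a real device after a meta-device initialization.

import torch

buffer_device = torch.device("cpu")  # stand-in for the buffer_device argument

with torch.device(buffer_device):
    # Allocated on buffer_device because of the surrounding context manager.
    tokens_per_expert = torch.zeros(8, dtype=torch.float32)

print(tokens_per_expert.device)  # cpu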
