@@ -341,7 +341,7 @@ def build_optimizers_with_moe_load_balancing(
     )
 
     # for MoE auxiliary-loss-free load balancing
-    def is_full_recompute(module):
+    def _is_recomputation_enabled(module):
         return getattr(module, "checkpoint_impl", None) is CheckpointImpl.NO_REENTRANT
 
     def _update_expert_bias(
@@ -358,9 +358,8 @@ def _update_expert_bias(
             for transformer_block in model_part.layers.values():
                 if not transformer_block.moe_enabled:
                     continue
-                moe = transformer_block.moe
                 tokens_per_expert = transformer_block.moe.tokens_per_expert
-                if is_full_recompute(transformer_block):
+                if _is_recomputation_enabled(transformer_block):
                     # TODO: This is a hack, we assume with full AC, the tokens_per_expert is counted twice.
                     # This does not affect to expert choice, but affects the experts usage metrics.
                     # We divide by 2 to correct for this double-counting due to recomputation
@@ -372,12 +371,6 @@ def _update_expert_bias(
                 # avoid cat empty tensor
                 return
 
-            n_expert = tokens_per_expert_list[0].numel()
-            assert all(
-                t.numel() == n_expert for t in tokens_per_expert_list
-            ), "All MoE layers must have the same number of experts."
-
-            # [n_layers, n_expert], int32
             tokens_per_expert_by_layer = torch.vstack(tokens_per_expert_list)
 
             if dp_cp_mesh is not None:
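For context, here is a minimal sketch of what the renamed helper and the divide-by-2 correction amount to, pulled out of the optimizer builder. Only the CheckpointImpl.NO_REENTRANT check, the moe_enabled / tokens_per_expert attributes, and the halving under full activation checkpointing come from the diff above; the helper name stack_tokens_per_expert and the blocks iterable are hypothetical stand-ins for illustration.

import torch
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import CheckpointImpl


def _is_recomputation_enabled(module):
    # Blocks wrapped with non-reentrant activation checkpointing expose
    # checkpoint_impl = CheckpointImpl.NO_REENTRANT; unwrapped blocks do not.
    return getattr(module, "checkpoint_impl", None) is CheckpointImpl.NO_REENTRANT


def stack_tokens_per_expert(blocks):
    # Hypothetical helper: `blocks` stands in for the transformer blocks of one
    # model part; each MoE block is assumed to hold an integer counter
    # `moe.tokens_per_expert` of shape [n_experts].
    tokens_per_expert_list = []
    for block in blocks:
        if not getattr(block, "moe_enabled", False):
            continue
        tokens_per_expert = block.moe.tokens_per_expert
        if _is_recomputation_enabled(block):
            # With full activation checkpointing the MoE forward runs twice,
            # so the counter accumulates roughly 2x the real token count.
            tokens_per_expert = tokens_per_expert // 2
        tokens_per_expert_list.append(tokens_per_expert)
    if not tokens_per_expert_list:
        # avoid stacking an empty list
        return None
    # [n_layers, n_experts]; torch.vstack itself errors out if the per-layer
    # expert counts differ, so no separate assert is strictly needed.
    return torch.vstack(tokens_per_expert_list)

Presumably the stacked [n_layers, n_experts] counts are then reduced across the dp_cp mesh (the `if dp_cp_mesh is not None:` branch above) before the per-layer expert bias is updated; that part is outside this diff.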