@@ -455,25 +455,24 @@ def __call__(
     )
     intermediate_inputs = inputs + attention_lnx
 
-    # Fully Connected
-    hidden_states = rms_norm(
+    load_balance_loss = None
+    if self.is_moe_layer:
+      # Fully Connected
+      hidden_states = rms_norm(
           num_features=intermediate_inputs.shape[-1],
           dtype=cfg.dtype,
           weight_dtype=cfg.weight_dtype,
           name="post_self_attention_layer_norm",
           kernel_axes=("norm",),
           epsilon=cfg.normalization_layer_epsilon,
-    )(intermediate_inputs)
-    hidden_states = nn.with_logical_constraint(
-        hidden_states, ("activation_batch", "activation_norm_length", "activation_embed")
-    )
-
-    load_balance_loss = None
-    if self.is_moe_layer:
+      )(intermediate_inputs)
+      hidden_states = nn.with_logical_constraint(
+          hidden_states, ("activation_batch", "activation_norm_length", "activation_embed")
+      )
       # NOTE: the naming mismatch here is to ensure reverse compatibility with existing checkpoints.
       # The `name` represents the weight name in JAX/checkpoints and so the class name
       # is just for readability.
-      mlp_lnx = moe.RoutedAndSharedMoE(
+      mlp_lnx = moe.get_routed_and_shared_moe(
           name="Llama4MoEBlock_0",
           config=cfg,
           mesh=self.mesh,
@@ -484,8 +483,9 @@ def __call__(
           quant=self.quant,
       )(hidden_states)
     else:
+      # MLP block with pre-norm.
       mlp_lnx = mlp_block(
-          in_features=hidden_states.shape[-1],
+          in_features=intermediate_inputs.shape[-1],
           intermediate_dim=cfg.mlp_dim,
           activations=cfg.mlp_activations,
           intermediate_dropout_rate=cfg.dropout_rate,
@@ -494,7 +494,8 @@ def __call__(
           name="mlp",
           config=cfg,
           quant=self.quant,
-      )(hidden_states, deterministic=deterministic)
+          use_pre_norm=True,
+      )(intermediate_inputs, deterministic=deterministic)
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))
 
     layer_output = mlp_lnx + intermediate_inputs
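
For readers unfamiliar with the pattern above: with `use_pre_norm=True`, the dense branch's MLP block is expected to apply its own input normalization, which is why the call site now passes the raw residual `intermediate_inputs` instead of a separately normalized `hidden_states`. Below is a minimal sketch of that pattern in Flax; `PreNormMlp`, its layer names, and the SiLU/Dense shapes are illustrative stand-ins, not MaxText's actual `mlp_block`.

```python
# Minimal sketch of an MLP block with an optional built-in pre-norm.
# Hypothetical module for illustration only; not MaxText's `mlp_block`.
import jax
import jax.numpy as jnp
from flax import linen as nn


class PreNormMlp(nn.Module):
  """Feed-forward block that optionally normalizes its own input."""

  intermediate_dim: int
  use_pre_norm: bool = False

  @nn.compact
  def __call__(self, x):
    out_dim = x.shape[-1]
    if self.use_pre_norm:
      # Pre-norm: normalize the raw residual-stream input before projecting,
      # so the caller does not need a separate rms_norm step.
      x = nn.RMSNorm(name="mlp_layer_norm")(x)
    h = nn.silu(nn.Dense(self.intermediate_dim, name="wi")(x))
    return nn.Dense(out_dim, name="wo")(h)


# Usage mirrors the diff's dense branch: feed the un-normalized residual input.
x = jnp.ones((2, 8, 16))
block = PreNormMlp(intermediate_dim=64, use_pre_norm=True)
params = block.init(jax.random.PRNGKey(0), x)
y = block.apply(params, x)
print(y.shape)  # (2, 8, 16)
```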