 # pylint: disable=no-name-in-module


-from typing import Optional
+from typing import Any

 from jax.ad_checkpoint import checkpoint_name
 from jax.sharding import Mesh
 import jax.numpy as jnp

 from flax import linen as nn
+from flax import nnx

-from MaxText.layers.linears import mlp_block
-from MaxText.layers import models
+from MaxText.layers import initializers, nnx_wrappers
+from MaxText.layers.linears import MlpBlock
+from MaxText.layers.models import Config
+from MaxText.layers.attentions import Attention
 from MaxText.layers import quantizations
-from MaxText.layers.attentions import attention_as_linen
 from MaxText.layers.quantizations import AqtQuantization as Quant
-from MaxText.layers.normalizations import rms_norm
+from MaxText.layers.normalizations import RMSNorm


 # -----------------------------------------
 # The Decoder Layer for Mistral
 # -----------------------------------------


-class MistralDecoderLayer(nn.Module):
+class MistralDecoderLayer(nnx.Module):
   """Transformer decoder layer that attends to the encoder."""

-  config: models.Config
-  mesh: Mesh
-  quant: Optional[Quant] = None
+  def __init__(
+      self,
+      config: Config,
+      mesh: Mesh,
+      quant: Quant | None = None,
+      rngs: nnx.Rngs | None = None,
+      **kwargs: Any,
+  ):
+    self.config = config
+    self.mesh = mesh
+    self.quant = quant
+    self.rngs = rngs if rngs else kwargs.get("rngs", nnx.Rngs(0))

-  @nn.compact
   def __call__(
       self,
       inputs,
@@ -59,47 +69,43 @@ def __call__(
       page_state=None,
       slot=None,
   ):
-    cfg = self.config
-    mesh = self.mesh

     inputs = nn.with_logical_constraint(inputs, ("activation_batch", "activation_norm_length", "activation_embed"))
     inputs = checkpoint_name(inputs, "decoder_layer_input")
-    lnx_rms = rms_norm(
+    lnx_rms = RMSNorm(
         num_features=inputs.shape[-1],
-        dtype=cfg.dtype,
-        weight_dtype=cfg.weight_dtype,
-        name="pre_self_attention_layer_norm",
+        dtype=self.config.dtype,
+        weight_dtype=self.config.weight_dtype,
         kernel_axes=("norm",),
-        epsilon=cfg.normalization_layer_epsilon,
+        epsilon=self.config.normalization_layer_epsilon,
+        rngs=self.rngs,
     )
     lnx = lnx_rms(inputs)

     lnx = nn.with_logical_constraint(lnx, ("activation_batch", "activation_norm_length", "activation_embed"))

     # Self-attention block
-    attention_layer = attention_as_linen(
-        config=cfg,
-        num_query_heads=cfg.num_query_heads,
-        num_kv_heads=cfg.num_kv_heads,
-        head_dim=cfg.head_dim,
-        max_target_length=cfg.max_target_length,
-        max_prefill_predict_length=cfg.max_prefill_predict_length,
-        attention_kernel=cfg.attention,
+    attention_layer = Attention(
+        config=self.config,
+        num_query_heads=self.config.num_query_heads,
+        num_kv_heads=self.config.num_kv_heads,
+        head_dim=self.config.head_dim,
+        max_target_length=self.config.max_target_length,
+        max_prefill_predict_length=self.config.max_prefill_predict_length,
+        attention_kernel=self.config.attention,
+        mesh=self.mesh,
+        dtype=self.config.dtype,
         inputs_q=lnx,
         inputs_kv=lnx,
-        mesh=mesh,
-        dtype=cfg.dtype,
-        weight_dtype=cfg.weight_dtype,
-        dropout_rate=cfg.dropout_rate,
-        name="self_attention",
-        float32_qk_product=cfg.float32_qk_product,
-        float32_logits=cfg.float32_logits,
+        weight_dtype=self.config.weight_dtype,
+        dropout_rate=self.config.dropout_rate,
+        float32_qk_product=self.config.float32_qk_product,
+        float32_logits=self.config.float32_logits,
         quant=self.quant,
-        kv_quant=quantizations.configure_kv_quant(cfg),
-        prefill_cache_axis_order=tuple(map(int, cfg.prefill_cache_axis_order.split(","))),
-        ar_cache_axis_order=tuple(map(int, cfg.ar_cache_axis_order.split(","))),
-        compute_axis_order=tuple(map(int, cfg.compute_axis_order.split(","))),
-        model_mode=model_mode,
+        kv_quant=quantizations.configure_kv_quant(self.config),
+        prefill_cache_axis_order=tuple(map(int, self.config.prefill_cache_axis_order.split(","))),
+        ar_cache_axis_order=tuple(map(int, self.config.ar_cache_axis_order.split(","))),
+        compute_axis_order=tuple(map(int, self.config.compute_axis_order.split(","))),
     )

     attention_lnx = attention_layer(
@@ -118,40 +124,40 @@ def __call__(
     intermediate_inputs = inputs + attention_lnx

     # Fully Connected
-    hidden_states = rms_norm(
+    hidden_states = RMSNorm(
         num_features=intermediate_inputs.shape[-1],
-        dtype=cfg.dtype,
-        weight_dtype=cfg.weight_dtype,
-        name="post_self_attention_layer_norm",
+        dtype=self.config.dtype,
+        weight_dtype=self.config.weight_dtype,
         kernel_axes=("norm",),
-        epsilon=cfg.normalization_layer_epsilon,
+        epsilon=self.config.normalization_layer_epsilon,
+        rngs=self.rngs,
     )(intermediate_inputs)
     hidden_states = nn.with_logical_constraint(
         hidden_states, ("activation_batch", "activation_norm_length", "activation_embed")
     )

-    mlp_lnx = mlp_block(
+    mlp_lnx = MlpBlock(
         in_features=hidden_states.shape[-1],
-        intermediate_dim=cfg.mlp_dim,
-        activations=cfg.mlp_activations,
-        intermediate_dropout_rate=cfg.dropout_rate,
-        dtype=cfg.dtype,
-        weight_dtype=cfg.weight_dtype,
-        name="mlp",
-        config=cfg,
+        intermediate_dim=self.config.mlp_dim,
+        activations=self.config.mlp_activations,
+        intermediate_dropout_rate=self.config.dropout_rate,
+        dtype=self.config.dtype,
+        weight_dtype=self.config.weight_dtype,
+        config=self.config,
         quant=self.quant,
+        rngs=self.rngs,
     )(hidden_states, deterministic=deterministic)
     mlp_lnx = nn.with_logical_constraint(mlp_lnx, ("activation_batch", "activation_norm_length", "activation_embed"))

     layer_output = mlp_lnx + intermediate_inputs
-    layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
+    layer_output = nn.Dropout(rate=self.config.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)

     layer_output = nn.with_logical_constraint(
         layer_output,
         ("activation_batch", "activation_norm_length", "activation_embed"),
     )

-    if cfg.record_internal_nn_metrics:
+    if self.config.record_internal_nn_metrics:
       self.sow("intermediates", "activation_mean", jnp.mean(layer_output))
       self.sow("intermediates", "activation_stdev", jnp.std(layer_output))
       self.sow(
@@ -160,7 +166,14 @@ def __call__(
           jnp.sum(layer_output == 0) / jnp.size(layer_output),
       )

-    if cfg.scan_layers:
+    if self.config.scan_layers:
       return layer_output, None
     else:
       return layer_output
+
+def mistral_decoder_layer_class() -> nn.Module:
+  """Create a MistralDecoderLayer Linen module"""
+  return nnx_wrappers.to_linen_class(
+      MistralDecoderLayer,
+      metadata_fn=initializers.variable_to_logically_partitioned,
+  )
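
# Usage sketch (not part of this commit): a minimal, self-contained illustration of
# the Linen-to-NNX pattern the change applies. Hyper-parameters move from dataclass
# fields read inside an @nn.compact __call__ into an explicit __init__ that builds
# sublayers eagerly from an nnx.Rngs stream. `TinyBlock` and its sizes are
# illustrative assumptions only; they do not exist in MaxText.
import jax.numpy as jnp
from flax import nnx


class TinyBlock(nnx.Module):
  """Toy pre-norm MLP block mirroring the shape of MistralDecoderLayer's rewrite."""

  def __init__(self, features: int, hidden: int, *, rngs: nnx.Rngs):
    # Sublayers are constructed here (eagerly), not inside __call__ as in Linen.
    self.norm = nnx.LayerNorm(num_features=features, rngs=rngs)
    self.up = nnx.Linear(features, hidden, rngs=rngs)
    self.down = nnx.Linear(hidden, features, rngs=rngs)

  def __call__(self, x):
    # Pre-normalization, MLP, then residual add, as in the decoder layer above.
    h = self.norm(x)
    h = self.down(nnx.relu(self.up(h)))
    return x + h


block = TinyBlock(features=8, hidden=32, rngs=nnx.Rngs(params=0))
y = block(jnp.ones((2, 4, 8)))  # output shape matches input: (2, 4, 8)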