Commit 0a94059

Support multimodal in logit checker and match gemma3 logits with HF

Parent: 8def32a

12 files changed: +423 -89 lines changed

MaxText/configs/base.yml

Lines changed: 1 addition & 0 deletions
@@ -593,6 +593,7 @@ use_untrainable_positional_embedding: False
 trainable_position_size: -1 # enable gpt3 position embedding with a positive trainable_position_size
 # RoPE parameters
 rope_type: "default" # one of "default", "llama3.1" or "yarn"
+rope_linear_scaling_factor: 1.0 # linear scaling factor for "default" RoPE (see class `RotaryEmbedding` for more)
 rope_use_scale: True # apply rope scaling for llama3.1 (see class `LLaMARotaryEmbedding` for more)
 rope_min_timescale: 1
 rope_max_timescale: 10_000 # Timescale For global Attention

MaxText/configs/models/gemma3-12b.yml

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,7 @@ base_num_kv_heads: 8
 base_mlp_dim: 15360
 head_dim: 256
 mlp_activations: ["gelu","linear"]
-vocab_size: 262_144
+vocab_size: 262_208
 decoder_block: "gemma3"
 normalization_layer_epsilon: 1e-6
 logits_via_embedding: True
@@ -30,3 +30,4 @@ use_post_attn_norm: true
 use_post_ffw_norm: true
 local_rope_max_timescale: 10_000
 rope_max_timescale: 1_000_000
+rope_linear_scaling_factor: 8.0

MaxText/configs/models/gemma3-27b.yml

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,7 @@ base_num_kv_heads: 16
 base_mlp_dim: 21504
 head_dim: 128
 mlp_activations: ["gelu","linear"]
-vocab_size: 262_144
+vocab_size: 262_208
 decoder_block: "gemma3"
 normalization_layer_epsilon: 1e-6
 logits_via_embedding: True
@@ -30,3 +30,4 @@ use_post_attn_norm: true
 use_post_ffw_norm: true
 local_rope_max_timescale: 10_000
 rope_max_timescale: 1_000_000
+rope_linear_scaling_factor: 8.0

MaxText/configs/models/gemma3-4b.yml

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,7 @@ base_num_kv_heads: 4
 base_mlp_dim: 10240
 head_dim: 256
 mlp_activations: ["gelu","linear"]
-vocab_size: 262_144
+vocab_size: 262_208
 decoder_block: "gemma3"
 normalization_layer_epsilon: 1e-6
 logits_via_embedding: True
@@ -30,3 +30,4 @@ use_post_attn_norm: true
 use_post_ffw_norm: true
 local_rope_max_timescale: 10_000
 rope_max_timescale: 1_000_000
+rope_linear_scaling_factor: 8.0
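
The vocab_size bump from 262_144 to 262_208 appears in all three gemma3 configs above. It lines up with the padded Hugging Face Gemma3 vocabulary and, in particular, leaves room for the image soft-token id 262144 that this commit sets in MaxText/multimodal_utils.py below. A minimal bounds check; the embedding width is a toy value for illustration, not MaxText code:

    import jax.numpy as jnp

    GEMMA_TOKEN_PLACEHOLDER = 262_144  # image soft-token id set in multimodal_utils.py below
    EMB_DIM = 16  # toy width for illustration; the real configs use base_emb_dim

    old_table = jnp.zeros((262_144, EMB_DIM))  # old vocab: placeholder id is one past the last row
    new_table = jnp.zeros((262_208, EMB_DIM))  # new vocab: placeholder id is a valid row
    assert GEMMA_TOKEN_PLACEHOLDER >= old_table.shape[0]
    assert GEMMA_TOKEN_PLACEHOLDER < new_table.shape[0]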

MaxText/layers/attentions.py

Lines changed: 7 additions & 0 deletions
@@ -694,11 +694,18 @@ def init_rotary_embedding(self):
     # For local attention use local_rope_max_timescale if it's positive
     if self.attention_type == AttentionType.LOCAL_SLIDING and self.config.local_rope_max_timescale > 0:
       max_timescale = self.config.local_rope_max_timescale
+
+    rope_linear_scaling_factor = self.config.rope_linear_scaling_factor
+    # In gemma3, linear scaling factor does not apply to local sliding layers.
+    if self.config.model_name.startswith("gemma3") and self.attention_type == AttentionType.LOCAL_SLIDING:
+      rope_linear_scaling_factor = 1.0
+
     rotary_embedding = RotaryEmbedding(
         min_timescale=self.config.rope_min_timescale,
         max_timescale=max_timescale,
         embedding_dims=rope_embedding_dims,
         fprop_dtype=self.dtype,
+        rope_linear_scaling_factor=rope_linear_scaling_factor,
         rngs=self.rngs,
     )
     return rotary_embedding
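
For gemma3 this leaves two RoPE configurations per model: global-attention layers pick up rope_max_timescale: 1_000_000 and rope_linear_scaling_factor: 8.0 from the model configs above, while local sliding layers keep local_rope_max_timescale: 10_000 and a factor of 1.0. A simplified restatement of that selection; the plain strings and the dict are stand-ins for the real AttentionType enum and config object:

    def gemma3_rope_params(attention_type: str, cfg: dict) -> tuple[int, float]:
      """Returns (max_timescale, linear_scaling_factor) for a gemma3 attention layer."""
      if attention_type == "local_sliding" and cfg["local_rope_max_timescale"] > 0:
        # Local sliding layers use the short RoPE period and skip linear scaling.
        return cfg["local_rope_max_timescale"], 1.0
      return cfg["rope_max_timescale"], cfg["rope_linear_scaling_factor"]

    cfg = {"rope_max_timescale": 1_000_000, "local_rope_max_timescale": 10_000, "rope_linear_scaling_factor": 8.0}
    assert gemma3_rope_params("global", cfg) == (1_000_000, 8.0)
    assert gemma3_rope_params("local_sliding", cfg) == (10_000, 1.0)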

MaxText/layers/embeddings.py

Lines changed: 8 additions & 7 deletions
@@ -242,6 +242,7 @@ def __init__(
       fprop_dtype: DType = jnp.bfloat16,
       # Not used in RotaryEmbedding but passed in by nnx.bridge.to_linen.
       # TODO: Remove when bridge no longer needed
+      rope_linear_scaling_factor: float = 1.0,
       rngs: nnx.Rngs = None,
   ):
     """Initializes the RotaryEmbedding module.
@@ -261,6 +262,7 @@ def __init__(
     self.embedding_dims = embedding_dims
     self.cast_as_fprop_dtype = cast_as_fprop_dtype
     self.fprop_dtype = fprop_dtype
+    self.rope_linear_scaling_factor = rope_linear_scaling_factor
 
     if self.embedding_dims % 2:
       raise ValueError("Embedding dim for rotary position embedding must be a multiple of 2.")
@@ -270,7 +272,10 @@ def timescale(self):
     """Returns the timescale for the rotary embedding."""
     half_embedding_dim = self.embedding_dims // 2
     fraction = 2 * jnp.arange(0, half_embedding_dim) / self.embedding_dims
-    return self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction
+    timescale = self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction
+    if self.rope_linear_scaling_factor != 1.0:
+      timescale = timescale * self.rope_linear_scaling_factor
+    return timescale
 
   def __call__(
       self, # pytype: disable=signature-mismatch # overriding-parameter-count-checks
@@ -448,9 +453,7 @@ def __call__(self, inputs: jax.Array, position: None | jax.Array = None) -> jax.Array:
     if len(inputs.shape) != 4:
       raise ValueError("Input is assumed to be a rank 4 tensor of shape [B, S, N, H].")
     if self.embedding_dims != inputs.shape[3]:
-      raise ValueError(
-          "The embedding dims of the rotary position embedding must match the hidden dimension of the inputs."
-      )
+      raise ValueError("The embedding dims of the rotary position embedding must match the hidden dimension of the inputs.")
 
     # Shift the inputs left and right as per LLaMA's specific behavior
     inputs_shifted_left = jnp.concatenate([inputs[..., 1:], inputs[..., :1]], axis=-1)
@@ -649,9 +652,7 @@ def __call__(self, inputs: Array, position: None | Array = None) -> Array:
     if len(inputs.shape) != 4:
       raise ValueError("Input is assumed to be a rank 4 tensor of shape [batch, sequence, heads, dims].")
     if self.embedding_dims != inputs.shape[3]:
-      raise ValueError(
-          "The embedding dims of the rotary position embedding must match the hidden dimension of the inputs."
-      )
+      raise ValueError("The embedding dims of the rotary position embedding must match the hidden dimension of the inputs.")
 
     # Determine positions if not provided
     if position is None:
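
The new scaling is the standard linear (position-interpolation) trick: the rotation angle is position / timescale, so multiplying every timescale by the factor is the same as evaluating the original embedding at position / factor, which stretches the usable context window. Below is a standalone recomputation of the timescale property above with gemma3 global-attention values (head_dim 256, max timescale 1_000_000, factor 8.0, taken from the configs in this commit); rope_timescale is an illustrative helper, not the module itself:

    import jax.numpy as jnp

    def rope_timescale(embedding_dims, min_timescale=1, max_timescale=1_000_000, factor=1.0):
      # Mirrors the `timescale` property above.
      half_embedding_dim = embedding_dims // 2
      fraction = 2 * jnp.arange(0, half_embedding_dim) / embedding_dims
      return min_timescale * (max_timescale / min_timescale) ** fraction * factor

    base = rope_timescale(256)
    scaled = rope_timescale(256, factor=8.0)

    # Scaling every timescale by 8 is equivalent to rotating at position / 8:
    # position / (8 * t) == (position / 8) / t for every timescale t.
    position = 4096.0
    assert jnp.allclose(position / scaled, (position / 8.0) / base)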

MaxText/layers/gemma3.py

Lines changed: 34 additions & 9 deletions
@@ -277,15 +277,16 @@ def _posemb_sincos_2d(
     width: int,
     temperature: float = 10_000.0,
     dtype: jnp.dtype = jnp.float32,
+    precision: str = "default",
 ):
   """Follows the MoCo v3 logic."""
   y, x = jnp.mgrid[:h, :w]  # pylint: disable=unpacking-non-sequence
 
   assert width % 4 == 0, "Width must be mult of 4 for sincos posemb"
   omega = jnp.arange(width // 4) / (width // 4 - 1)
   omega = 1.0 / (temperature**omega)
-  y = jnp.einsum("m,d->md", y.flatten(), omega)
-  x = jnp.einsum("m,d->md", x.flatten(), omega)
+  y = jnp.einsum("m,d->md", y.flatten(), omega, precision=jax.lax.Precision(precision))
+  x = jnp.einsum("m,d->md", x.flatten(), omega, precision=jax.lax.Precision(precision))
   pe = jnp.concatenate([jnp.sin(x), jnp.cos(x), jnp.sin(y), jnp.cos(y)], axis=1)
   return jnp.asarray(pe, dtype)[None, :, :]
 
@@ -297,18 +298,22 @@ class MlpBlockViT(nn.Module):
   dtype_mm: str
   mlp_dim: int | None = None  # Defaults to 4x input dim
   dropout: float = 0.0
+  precision: str = "default"
 
   @nn.compact
   def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
     """Applies Transformer MlpBlock module."""
     inits = {"kernel_init": nn.initializers.xavier_uniform(), "bias_init": nn.initializers.normal(stddev=1e-6)}
 
     d = x.shape[-1]
-    x = nn.Dense(features=self.mlp_dim or 4 * d, dtype=self.dtype_mm, **inits)(x)
+    x = nn.Dense(features=self.mlp_dim or 4 * d, precision=jax.lax.Precision(self.precision), dtype=self.dtype_mm, **inits)(
+        x
+    )
     x = nn.gelu(x)
     x = nn.Dropout(rate=self.dropout)(x, deterministic)
     x = nn.Dense(
         features=d,
+        precision=jax.lax.Precision(self.precision),
         dtype=self.dtype_mm,
         **inits,
     )(x)
@@ -323,6 +328,7 @@ class Encoder1DBlock(nn.Module):
   mlp_dim: int | None = None  # Defaults to 4x input dim
   num_heads: int = 12
   dropout: float = 0.0
+  precision: str = "default"
 
   @nn.compact
   def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
@@ -331,6 +337,7 @@ def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
     y = nn.MultiHeadDotProductAttention(
         num_heads=self.num_heads,
         kernel_init=nn.initializers.xavier_uniform(),
+        precision=jax.lax.Precision(self.precision),
         deterministic=deterministic,
         dtype=self.dtype_mm,
     )(y, y)
@@ -343,6 +350,7 @@ def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
         mlp_dim=self.mlp_dim,
         dropout=self.dropout,
         dtype_mm=self.dtype_mm,
+        precision=self.precision,
     )(y, deterministic)
     y = nn.Dropout(rate=self.dropout)(y, deterministic)
     x = x + y
@@ -358,7 +366,8 @@ class Encoder(nn.Module):
   mlp_dim: int | None = None  # Defaults to 4x input dim
   num_heads: int = 12
   dropout: float = 0.0
-  scan: bool = False
+  scan: bool = False
+  precision: str = "default"
 
   @nn.compact
   def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
@@ -383,6 +392,7 @@ def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
           mlp_dim=self.mlp_dim,
          num_heads=self.num_heads,
          dropout=self.dropout,
+          precision=self.precision,
       )(
           x, deterministic
       )
@@ -396,6 +406,7 @@ def __call__(self, x: jax.Array, deterministic: bool = True) -> jax.Array:
           mlp_dim=self.mlp_dim,
           num_heads=self.num_heads,
           dropout=self.dropout,
+          precision=self.precision,
       )
       x = block_cur(x, deterministic)
     x: jax.Array = nn.LayerNorm(name="encoder_norm")(x)
@@ -409,6 +420,7 @@ class Einsum(nn.Module):
   weight_name: str = "w"
   initializer: nn.initializers.Initializer = nn.initializers.normal()
   dtype: jnp.dtype | None = None
+  precision: str = "default"
 
   @nn.compact
   def __call__(self, eqn: str, x: jax.Array) -> jax.Array:
@@ -418,7 +430,7 @@ def __call__(self, eqn: str, x: jax.Array) -> jax.Array:
         self.shape,
         self.dtype if self.dtype is not None else None,
     )
-    return jnp.einsum(eqn, x, w)
+    return jnp.einsum(eqn, x, w, precision=jax.lax.Precision(self.precision))
 
 
 class VisionEmbedder(nn.Module):
@@ -430,8 +442,10 @@ class VisionEmbedder(nn.Module):
 
   def setup(self):
     if self.vision_proj_dim:
-      self.mm_soft_embedding_norm = rms_norm(self.vision_proj_dim)
-      self.mm_input_projection = Einsum((self.vision_proj_dim, self.config.emb_dim))
+      self.mm_soft_embedding_norm = rms_norm(self.vision_proj_dim, dtype=self.config.dtype_mm)
+      self.mm_input_projection = Einsum(
+          (self.vision_proj_dim, self.config.emb_dim), dtype=self.config.dtype_mm, precision=self.config.matmul_precision
+      )
 
   def encode_vision(self, x: jax.Array) -> jax.Array:
     x = self.mm_soft_embedding_norm(x)
@@ -494,6 +508,7 @@ def _get_posemb(
     width: int,
     name: str,
     dtype: jnp.dtype = jnp.float32,
+    precision: str = "default",
 ):
   """Returns the position embedding."""
   if typ == "learn":
@@ -505,7 +520,7 @@ def _get_posemb(
         dtype,
     )
   elif typ == "sincos2d":
-    return _posemb_sincos_2d(*seqshape, width=width, dtype=dtype)
+    return _posemb_sincos_2d(*seqshape, width=width, dtype=dtype, precision=precision)
   else:
     raise ValueError(f"Unknown posemb type: {typ}")
 
@@ -524,7 +539,15 @@ def __call__(self, inputs, deterministic, train=False):
     b, n, h, w, c = inputs.shape
     x = jnp.reshape(inputs, [b * n, h, w, c])
     # Gemma3 uses conv2d with stride 14 and kernel size 14 to extract patches.
-    x = nn.Conv(features=1152, kernel_size=(14, 14), strides=14, padding="VALID", name="embedding")(x)
+    x = nn.Conv(
+        features=1152,
+        kernel_size=(14, 14),
+        strides=14,
+        padding="VALID",
+        name="embedding",
+        dtype=cfg.dtype_mm,
+        precision=jax.lax.Precision(cfg.matmul_precision),
+    )(x)
     bn, h, w, c = x.shape
     x = jnp.reshape(x, [bn, h * w, c])
 
@@ -535,6 +558,7 @@ def __call__(self, inputs, deterministic, train=False):
         width=c,
         name="pos_embedding",
         dtype=x.dtype,
+        precision=cfg.matmul_precision,
     )
 
     x = nn.Dropout(rate=self.dropout)(x, not train)
@@ -549,6 +573,7 @@ def __call__(self, inputs, deterministic, train=False):
         remat_policy=cfg.remat_policy_for_vit,
         dtype_mm=cfg.dtype_mm,
         name="Transformer",
+        precision=cfg.matmul_precision,
     )(x, deterministic=deterministic)
 
     # Gemma3 use a vision exit layer to downsample the soft tokens to a required output length.
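
The common thread in this file is threading a precision string into every matmul of the vision tower (patch-embedding conv, attention, MLP, and the einsum projection) so it can be forced to full float32 accumulation when comparing logits against the HF reference. A small standalone sketch of the jax.lax.Precision strings being plumbed through, not MaxText code:

    import jax
    import jax.numpy as jnp

    # The same strings the commit passes through: "default", "high", "highest".
    # On TPU, "default" runs float32 matmuls as bfloat16 passes, while "highest"
    # keeps full float32 accumulation, which matters when matching logits closely.
    a = jax.random.normal(jax.random.PRNGKey(0), (256, 256), dtype=jnp.float32)
    b = jax.random.normal(jax.random.PRNGKey(1), (256, 256), dtype=jnp.float32)

    fast = jnp.einsum("ij,jk->ik", a, b, precision=jax.lax.Precision("default"))
    exact = jnp.einsum("ij,jk->ik", a, b, precision=jax.lax.Precision("highest"))
    print(jnp.max(jnp.abs(fast - exact)))  # ~0 on CPU, noticeably larger on TPU defaults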

MaxText/multimodal_utils.py

Lines changed: 2 additions & 2 deletions
@@ -35,9 +35,9 @@
 GEMMA_IMAGE_STD = (127.5,) * 3
 GEMMA_IMAGE_PLACEHOLDER_IN_PROMPT = "<start_of_image>"
 GEMMA_BEGIN_IMAGE_TOKEN = 255999
-GEMMA_END_IMAGE_TOKEN = 262144
+GEMMA_END_IMAGE_TOKEN = 256000
 GEMMA_NEW_LINE_TOKEN = 108
-GEMMA_TOKEN_PLACEHOLDER = -2
+GEMMA_TOKEN_PLACEHOLDER = 262144
 # The number of GEMMA_TOKEN_PLACEHOLDER tokens per image in Gemma3
 GEMMA_NUM_PLACEHOLDER_TOKENS_PER_IMAGE = 256
 # +4 means 4 extra tokens to pad around image: \n\n, <start_of_image>, <end_of_image>, \n\n
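
With the corrected ids, <end_of_image> (256000) sits right after <start_of_image> (255999) in the tokenizer, and 262144 becomes the id of the image soft token that the 256 placeholders stand in for. A sketch of how one image expands in the token stream, based only on the constants and the "+4 extra tokens" comment above; expand_image_tokens_sketch is illustrative, not the real helper in multimodal_utils.py:

    GEMMA_BEGIN_IMAGE_TOKEN = 255999
    GEMMA_END_IMAGE_TOKEN = 256000
    GEMMA_NEW_LINE_TOKEN = 108
    GEMMA_TOKEN_PLACEHOLDER = 262144
    GEMMA_NUM_PLACEHOLDER_TOKENS_PER_IMAGE = 256

    def expand_image_tokens_sketch() -> list[int]:
      # \n\n, <start_of_image>, 256 soft-token placeholders, <end_of_image>, \n\n
      # (the comment above counts each \n\n as a single token)
      return (
          [GEMMA_NEW_LINE_TOKEN, GEMMA_BEGIN_IMAGE_TOKEN]
          + [GEMMA_TOKEN_PLACEHOLDER] * GEMMA_NUM_PLACEHOLDER_TOKENS_PER_IMAGE
          + [GEMMA_END_IMAGE_TOKEN, GEMMA_NEW_LINE_TOKEN]
      )

    assert len(expand_image_tokens_sketch()) == GEMMA_NUM_PLACEHOLDER_TOKENS_PER_IMAGE + 4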
