Skip to content

Commit 765aaff

Browse files
committed
[NVIDIA] Fix Llama4 Scout FP4 functionality issues
Fix the weight-loading issues and accuracy issues when using the NVIDIA ModelOpt Llama4 Scout FP4 model. Signed-off-by: Po-Han Huang <[email protected]>
1 parent fe56180 commit 765aaff

File tree

3 files changed

+56
-30
lines changed

3 files changed

+56
-30
lines changed

vllm/engine/arg_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ class EngineArgs:
362362
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
363363
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
364364

365+
device: Device = DeviceConfig.device
365366
num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
366367
multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
367368
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -810,8 +810,6 @@ def process_weights_after_loading(self, layer: Module) -> None:
810810
# Swizzle the weight blockscale.
811811
# contracting dimension is input dimension
812812
# block_size = 16;
813-
assert (layer.weight_scale.shape[1] % 16 == 0), (
814-
"Expected weight_scale.dim(1) to be divisible by 16")
815813
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
816814
"Weight Block scale must be represented as FP8-E4M3")
817815
swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)

vllm/model_executor/models/llama4.py

Lines changed: 55 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -363,11 +363,22 @@ def load_moe_expert_weights(
363363
continue
364364
param = params_dict[full_param_name]
365365
weight_loader = param.weight_loader
366+
367+
# Helper function to check if the weight is FP4.
368+
# We use uint8 to store FP4 weights for now.
369+
def is_fp4_weight(weight):
370+
return weight.dtype == torch.uint8
371+
366372
if fused:
367373
if "w13" in full_param_name:
368374
shard_idx = 0 if shard_id == "w1" else 1
369375
new_loaded_weight = new_loaded_weight[shard_idx]
370-
new_loaded_weight = new_loaded_weight.transpose(-1, -2)
376+
377+
# Only transpose for non-FP4 weights
378+
# FP4 weights are already in the correct format and shouldn't be transposed here.
379+
if not is_fp4_weight(new_loaded_weight):
380+
new_loaded_weight = new_loaded_weight.transpose(-1, -2)
381+
371382
layer_idx = extract_layer_index(name)
372383
# EP mapping
373384
expert_map = self.layers[
@@ -382,6 +393,11 @@ def load_moe_expert_weights(
382393
else:
383394
# TODO: add EP support for non fused weights
384395
pass
396+
397+
# Only transpose for FP4 weights
398+
if is_fp4_weight(new_loaded_weight):
399+
new_loaded_weight = new_loaded_weight.transpose(-1, -2)
400+
385401
weight_loader(param,
386402
new_loaded_weight,
387403
full_param_name,
@@ -402,6 +418,12 @@ def load_weights(self, weights: Iterable[tuple[str,
402418
(".gate_up_proj", ".gate_proj", 0),
403419
(".gate_up_proj", ".up_proj", 1),
404420
]
421+
expert_scale_params_mapping = [
422+
# (expert_name, expert_id, shard_id)
423+
("w13_", 0, 'w1'),
424+
("w13_", 0, 'w3'),
425+
("w2_", 0, 'w2')
426+
]
405427
fused_experts_params = False
406428
expert_params_mapping = FusedMoE.make_expert_params_mapping(
407429
ckpt_gate_proj_name="gate_proj",
@@ -483,19 +505,19 @@ def load_weights(self, weights: Iterable[tuple[str,
483505
'supports_moe_loading', False)
484506

485507
if supports_moe:
486-
# This is a MoE weight loader
487-
if "w13_" in name:
488-
shard_id = "w1"
489-
elif "w2_" in name:
490-
shard_id = "w2"
491-
else:
492-
shard_id = "w1"
493-
494-
weight_loader(param,
495-
loaded_weight,
496-
name,
497-
shard_id=shard_id,
498-
expert_id=0)
508+
# Transpose if the weights are FP8 or FP4.
509+
if loaded_weight.dtype == torch.uint8 or loaded_weight.dtype == torch.float8_e4m3fn:
510+
loaded_weight = loaded_weight.transpose(-1, -2)
511+
param.data.fill_(0)
512+
513+
for (expert_name, expert_id, shard_id) in expert_scale_params_mapping:
514+
if expert_name in name:
515+
weight_loader(param,
516+
loaded_weight,
517+
name,
518+
shard_id=shard_id,
519+
expert_id=expert_id)
520+
499521
else:
500522
# Regular weight loader (handles both
501523
# param.weight_loader and default_weight_loader)
@@ -560,23 +582,28 @@ def permute_qk_weight_for_rotary(
560582
loaded_weight: torch.Tensor,
561583
) -> tuple[str, torch.Tensor]:
562584

585+
# Helper function to permute the weight's channels
563586
def permute(w: torch.Tensor, n_heads: int):
564-
attn_in = self.config.head_dim * n_heads
565-
attn_out = self.config.hidden_size
566-
567-
return w.view(n_heads, attn_in // n_heads // 2, 2,
568-
attn_out).transpose(1, 2).reshape(attn_in, attn_out)
587+
head_dim = w.shape[0] // n_heads
588+
return (
589+
w.view(n_heads, head_dim // 2, 2, w.shape[1])
590+
.transpose(1, 2)
591+
.reshape(w.shape[0], w.shape[1])
592+
)
569593

570594
modules = name.split(".")
571595

572-
# rotary embeds should be sliced
573-
if ("wk" in modules or "k_proj" in modules) \
574-
and modules[-1] == "weight":
575-
loaded_weight = permute(loaded_weight,
576-
self.config.num_key_value_heads)
577-
elif ("wq" in modules or "q_proj" in modules) \
578-
and modules[-1] == "weight":
579-
loaded_weight = permute(loaded_weight,
580-
self.config.num_attention_heads)
596+
# Permute Q/K weights and weight block scales for rotary embedding
597+
is_weight = modules[-1] == "weight"
598+
is_nvfp4_weight_scale = (modules[-1] == "weight_scale"
599+
and loaded_weight.dtype == torch.float8_e4m3fn)
600+
601+
if is_weight or is_nvfp4_weight_scale:
602+
if ("wk" in modules or "k_proj" in modules):
603+
loaded_weight = permute(loaded_weight,
604+
self.config.num_key_value_heads)
605+
elif ("wq" in modules or "q_proj" in modules):
606+
loaded_weight = permute(loaded_weight,
607+
self.config.num_attention_heads)
581608

582609
return name, loaded_weight

0 commit comments

Comments
 (0)