Skip to content

Commit 668f1ae

Browse files
nvpohanh authored and vadiklyutiy committed
[NVIDIA] Fix Llama4 Scout FP4 functionality issues (vllm-project#21499)
Signed-off-by: Po-Han Huang <[email protected]>
1 parent f01928d commit 668f1ae

File tree

3 files changed

+219
-70
lines changed

3 files changed

+219
-70
lines changed

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,14 @@ def _load_per_tensor_weight_scale(self, shard_id: str,
874874
elif shard_id == "w2":
875875
param_data[expert_id] = loaded_weight
876876

877+
def _load_w13_weight_scale(self, shard_dim: int,
878+
loaded_weight: torch.Tensor,
879+
param: torch.Tensor, tp_rank: int):
880+
shard_size = param.shape[shard_dim]
881+
loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank,
882+
shard_size)
883+
param.copy_(loaded_weight)
884+
877885
def _load_model_weight_or_group_weight_scale(self,
878886
shard_dim: int,
879887
expert_data: torch.Tensor,
@@ -1123,7 +1131,12 @@ def weight_loader(self,
11231131
"weight_scale_2" in weight_name if uses_weight_scale_2 else
11241132
"weight_scale" in weight_name) or "input_scale" in weight_name
11251133

1126-
if per_tensor_conditions:
1134+
if "w13_weight_scale" in weight_name:
1135+
self._load_w13_weight_scale(shard_dim=shard_dim,
1136+
loaded_weight=loaded_weight,
1137+
param=param,
1138+
tp_rank=self.tp_rank)
1139+
elif per_tensor_conditions:
11271140
self._load_per_tensor_weight_scale(
11281141
shard_id=shard_id,
11291142
param=param,

vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -778,8 +778,6 @@ def process_weights_after_loading(self, layer: Module) -> None:
778778
# Swizzle the weight blockscale.
779779
# contracting dimension is input dimension
780780
# block_size = 16;
781-
assert (layer.weight_scale.shape[1] % 16 == 0), (
782-
"Expected weight_scale.dim(1) to be divisible by 16")
783781
assert (layer.weight_scale.dtype == torch.float8_e4m3fn), (
784782
"Weight Block scale must be represented as FP8-E4M3")
785783
swizzled_weight_scale = swizzle_blockscale(layer.weight_scale)

0 commit comments

Comments (0)