
Commit 245de8e

add torch.nn.functional.scaled_grouped_mm. needs testing
Signed-off-by: Masaki Kozuki <[email protected]>
1 parent f8648aa

4 files changed, +676 -0 lines changed

thunder/core/prims.py

Lines changed: 123 additions & 0 deletions
@@ -270,6 +270,7 @@ class PrimIDs(Enum):
     # Linear algebra prims (Mostly experimental)
     MATMUL = auto()
     _GROUPED_MM = auto()  # Used for grouped matmuls
+    SCALED_GROUPED_MM = auto()  # Used for scaled grouped matmuls
     # NN prims (Experimental!)
     CONVOLUTION = auto()
     EMBEDDING = auto()
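
Note: the enum value above only registers the primitive's identity; the meta function and make_prim call that bind it appear later in this diff. Conceptually, a scaled grouped matmul dequantizes each group's operands by their scales before multiplying. The following standalone PyTorch sketch is illustrative only (not this commit's implementation); it assumes per-group 1D scales and the (m, k) x (groups, k, n) layout, with offsets holding the cumulative end row of each group:

import torch

# Illustrative reference (not part of this commit): dequantize each group by
# its scale, then matmul group-by-group.
# a: (m, k) with rows partitioned by offsets; b: (groups, k, n);
# scale_a, scale_b: per-group 1D scales; offsets: cumulative end indices.
def reference_scaled_grouped_mm(a, b, scale_a, scale_b, offsets, out_dtype=torch.float32):
    out = torch.empty(a.shape[0], b.shape[2], dtype=out_dtype, device=a.device)
    start = 0
    for g, end in enumerate(offsets.tolist()):
        lhs = a[start:end].to(torch.float32) * scale_a[g]
        rhs = b[g].to(torch.float32) * scale_b[g]
        out[start:end] = (lhs @ rhs).to(out_dtype)
        start = end
    return out
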
@@ -3792,6 +3793,128 @@ def _grouped_mm_meta(a: TensorProxy, b: TensorProxy, offsets: TensorProxy) -> Te
     )


+def scaled_grouped_mm_meta(
+    a: TensorProxy,
+    b: TensorProxy,
+    scale_a: TensorProxy,
+    scale_b: TensorProxy,
+    offsets: None | TensorProxy = None,
+    bias: None | TensorProxy = None,
+    scale_result: None | TensorProxy = None,
+    out_dtype: None | dtypes.dtype = None,
+) -> TensorProxy:
+    """Meta function for the scaled_grouped_mm primitive.
+
+    Similar to _grouped_mm but with scale tensors for quantization/dequantization.
+    Accepts the following shape combinations:
+    1. (m, k) x (k, n) -> (groups, m, n)
+    2. (groups, m, k) x (k, n) -> (m, n)
+    3. (m, k) x (groups, k, n) -> (m, n)
+
+    Args:
+        a: Input tensor of shape (groups, m, k) or (m, k)
+        b: Input tensor of shape (groups, k, n) or (k, n)
+        scale_a: Scale tensor for a
+        scale_b: Scale tensor for b
+        offsets: Optional offset tensor of shape (groups,)
+        bias: Optional bias tensor
+        scale_result: Optional scale tensor for result
+        out_dtype: Optional output dtype
+
+    Returns:
+        TensorProxy with shape (groups, m, n) or (m, n)
+    """
+    # Validate types
+    utils.check_type(a, TensorProxy)
+    utils.check_type(b, TensorProxy)
+    utils.check_type(scale_a, TensorProxy)
+    utils.check_type(scale_b, TensorProxy)
+
+    # Accept 2D or 3D tensors
+    utils.check(a.ndim in (2, 3), lambda: f"Expected a to have 2 or 3 dimensions, got {a.ndim}")
+    utils.check(b.ndim in (2, 3), lambda: f"Expected b to have 2 or 3 dimensions, got {b.ndim}")
+
+    # Compute output shape using same logic as _grouped_mm
+    if offsets is not None:
+        utils.check_type(offsets, TensorProxy)
+        utils.check(offsets.ndim == 1, lambda: f"`offsets` must be a vector, got shape {offsets.shape}")
+
+        if a.ndim == 2 and b.ndim == 2:
+            utils.check(a.shape[1] == b.shape[0], lambda: f"Inner dimension mismatch: {a.shape} vs {b.shape}")
+            out_shape = (offsets.shape[0], a.shape[0], b.shape[1])
+        elif a.ndim == 3 and b.ndim == 2:
+            utils.check(a.shape[2] == b.shape[0], lambda: f"Inner dimension mismatch: {a.shape} vs {b.shape}")
+            utils.check(a.shape[0] == offsets.shape[0], lambda: f"Group count mismatch: {a.shape} vs {offsets.shape}")
+            out_shape = (a.shape[1], b.shape[1])
+        elif a.ndim == 2 and b.ndim == 3:
+            utils.check(a.shape[1] == b.shape[1], lambda: f"Inner dimension mismatch: {a.shape} vs {b.shape}")
+            utils.check(b.shape[0] == offsets.shape[0], lambda: f"Group count mismatch: {b.shape} vs {offsets.shape}")
+            out_shape = (a.shape[0], b.shape[2])
+        else:
+            utils.check(False, lambda: f"Unexpected shape combination: {a.shape} and {b.shape}")
+    else:
+        # Without offsets, fall back to standard matmul shape logic
+        if a.ndim == 2 and b.ndim == 2:
+            utils.check(a.shape[1] == b.shape[0], lambda: f"Inner dimension mismatch: {a.shape} vs {b.shape}")
+            out_shape = (a.shape[0], b.shape[1])
+        elif a.ndim == 3 and b.ndim == 2:
+            utils.check(a.shape[2] == b.shape[0], lambda: f"Inner dimension mismatch: {a.shape} vs {b.shape}")
+            out_shape = (a.shape[0], a.shape[1], b.shape[1])
+        elif a.ndim == 2 and b.ndim == 3:
+            utils.check(a.shape[1] == b.shape[1], lambda: f"Inner dimension mismatch: {a.shape} vs {b.shape}")
+            out_shape = (b.shape[0], a.shape[0], b.shape[2])
+        else:
+            utils.check(False, lambda: f"Unexpected shape combination: {a.shape} and {b.shape}")
+
+    # Validate scale tensors
+    # Scale tensors are typically 1D with shape matching the number of groups
+    # or they can be scalars
+    utils.check(
+        scale_a.ndim <= 1,
+        lambda: f"Expected scale_a to be a scalar or 1D tensor, got shape {scale_a.shape}",
+    )
+    utils.check(
+        scale_b.ndim <= 1,
+        lambda: f"Expected scale_b to be a scalar or 1D tensor, got shape {scale_b.shape}",
+    )
+
+    # Validate bias if provided
+    if bias is not None:
+        utils.check_type(bias, TensorProxy)
+        utils.check_same_device(a, bias)
+        utils.check_same_dtype(a, bias)
+
+    # Validate scale_result if provided
+    if scale_result is not None:
+        utils.check_type(scale_result, TensorProxy)
+        utils.check(
+            scale_result.ndim <= 1,
+            lambda: f"Expected scale_result to be a scalar or 1D tensor, got shape {scale_result.shape}",
+        )
+
+    utils.check_same_dtype(a, b)
+    utils.check(a.dtype in dtypes.float_math_dtypes, lambda: f"`a` must be 16-bit float or higher, got {a.dtype}")
+    if offsets is not None:
+        utils.check(utils.is_integer_dtype(offsets.dtype), lambda: f"`offsets` must be integers, got {offsets.dtype}")
+
+    utils.check_same_device(a, b, scale_a, scale_b)
+    if offsets is not None:
+        utils.check_same_device(a, offsets)
+
+    # Determine output dtype
+    result_dtype = out_dtype if out_dtype is not None else a.dtype
+
+    return TensorProxy(like=a, shape=out_shape, dtype=result_dtype)
+
+
+scaled_grouped_mm = make_prim(
+    PrimIDs.SCALED_GROUPED_MM,
+    "scaled_grouped_mm",
+    meta=scaled_grouped_mm_meta,
+    tags=(OpTags.MATMUL_OP,),
+)
+
+
 def transpose_meta(a: TensorProxy, /, permutation: tuple[int, ...]) -> TensorProxy:
     utils.check_type(a, TensorProxy)
     utils.check_type(permutation, tuple)
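
For a quick sanity check of the shape rules above without constructing TensorProxy objects, here is a minimal standalone mirror of the offsets-branch logic (a hypothetical helper; the real meta function also validates dtypes, devices, and group counts):

# Hypothetical mirror of the output-shape rules in scaled_grouped_mm_meta's
# offsets branch; shapes are plain tuples.
def scaled_grouped_mm_out_shape(a_shape, b_shape, num_groups):
    if len(a_shape) == 2 and len(b_shape) == 2:  # (m, k) x (k, n) -> (groups, m, n)
        return (num_groups, a_shape[0], b_shape[1])
    if len(a_shape) == 3 and len(b_shape) == 2:  # (groups, m, k) x (k, n) -> (m, n)
        return (a_shape[1], b_shape[1])
    if len(a_shape) == 2 and len(b_shape) == 3:  # (m, k) x (groups, k, n) -> (m, n)
        return (a_shape[0], b_shape[2])
    raise ValueError(f"Unexpected shape combination: {a_shape} and {b_shape}")

assert scaled_grouped_mm_out_shape((16, 32), (32, 8), 4) == (4, 16, 8)
assert scaled_grouped_mm_out_shape((4, 16, 32), (32, 8), 4) == (16, 8)
assert scaled_grouped_mm_out_shape((16, 32), (4, 32, 8), 4) == (16, 8)
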

thunder/executors/nvfuserex_impl.py

Lines changed: 72 additions & 0 deletions
@@ -3231,6 +3231,78 @@ def _grouped_mm_transform(
 register_supported(DTensorPrimIDs._GROUPED_MM, _grouped_mm_transform, _grouped_mm_check)


+def _scaled_grouped_mm_check(
+    a: TensorProxy,
+    b: TensorProxy,
+    scale_a: TensorProxy,
+    scale_b: TensorProxy,
+    offsets: None | TensorProxy = None,
+    bias: None | TensorProxy = None,
+    scale_result: None | TensorProxy = None,
+    out_dtype: None | dtypes.dtype = None,
+) -> bool:
+    # Version gate: scaled_grouped_mm is assumed to require at least the same nvFuser version as grouped_mm
+    if nvfuser_version() < LooseVersion("0.2.28"):
+        return False
+
+    # Check all required tensors are supported
+    if not are_supported_tensors(a, b, scale_a, scale_b):
+        return False
+
+    # Check optional tensors if provided
+    if offsets is not None and not is_supported_tensor(offsets):
+        return False
+    if bias is not None and not is_supported_tensor(bias):
+        return False
+    if scale_result is not None and not is_supported_tensor(scale_result):
+        return False
+
+    # Check that nvfp4 is supported if used
+    if a.dtype == dtypes.float4_e2m1fn_x2 or b.dtype == dtypes.float4_e2m1fn_x2:
+        # nvfp4 requires nvFuser 0.2.28+ (already checked above)
+        # Additionally check device capability for fp8/fp4 support
+        if not device_supports_fp8():
+            return False
+
+    return True
+
+
+def _scaled_grouped_mm_transform(
+    a: TensorProxy,
+    b: TensorProxy,
+    scale_a: TensorProxy,
+    scale_b: TensorProxy,
+    offsets: None | TensorProxy = None,
+    bias: None | TensorProxy = None,
+    scale_result: None | TensorProxy = None,
+    out_dtype: None | dtypes.dtype = None,
+    *,
+    fd: FusionDefinition,
+    lc_to_nv_map: dict,
+) -> Any:
+    nva = getnv(a, fd, lc_to_nv_map)
+    nvb = getnv(b, fd, lc_to_nv_map)
+    nv_scale_a = getnv(scale_a, fd, lc_to_nv_map)
+    nv_scale_b = getnv(scale_b, fd, lc_to_nv_map)
+    nv_offsets = getnv(offsets, fd, lc_to_nv_map) if offsets is not None else None
+    nv_bias = getnv(bias, fd, lc_to_nv_map) if bias is not None else None
+    nv_scale_result = getnv(scale_result, fd, lc_to_nv_map) if scale_result is not None else None
+
+    # Translate out_dtype to nvFuser dtype if provided
+    nv_out_dtype = None
+    if out_dtype is not None:
+        nv_out_dtype = lcdtype_to_nvdtype(out_dtype)
+
+    # Call nvFuser's scaled_grouped_mm operation
+    # The API signature may vary, but typically includes all parameters
+    return fd.ops.scaled_grouped_mm(
+        nva, nvb, nv_scale_a, nv_scale_b, nv_offsets, nv_bias, nv_scale_result, nv_out_dtype
+    )
+
+
+register_supported(prims.scaled_grouped_mm, _scaled_grouped_mm_transform, _scaled_grouped_mm_check)
+
+
 def _cumsum_check(a: TensorProxy, dim: int, /, dtype: dtypes.dtype | None = None) -> bool:
     if nvfuser_version() < LooseVersion("0.2.33") and a.ndim != 1:
         return False
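
Since the commit message flags "needs testing", a smoke test might look roughly like the following. This is only a sketch under stated assumptions: that prims symbols are callable under thunder.jit as in other thunder tests, that nvFuser >= 0.2.28 and an fp8-capable CUDA device are available, and that bfloat16 inputs satisfy the meta checks.

import torch
import thunder
import thunder.core.prims as prims

# Assumed callable under thunder.jit; shapes follow case 3:
# (m, k) x (groups, k, n) -> (m, n), with offsets as cumulative end rows.
def fn(a, b, scale_a, scale_b, offsets):
    return prims.scaled_grouped_mm(a, b, scale_a, scale_b, offsets)

jfn = thunder.jit(fn)

groups, m, k, n = 4, 16, 32, 8
a = torch.randn(m, k, device="cuda", dtype=torch.bfloat16)
b = torch.randn(groups, k, n, device="cuda", dtype=torch.bfloat16)
scale_a = torch.ones(groups, device="cuda")
scale_b = torch.ones(groups, device="cuda")
offsets = torch.tensor([4, 8, 12, 16], device="cuda", dtype=torch.int32)

out = jfn(a, b, scale_a, scale_b, offsets)
assert out.shape == (m, n)
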

0 commit comments