7 changes: 7 additions & 0 deletions apex/contrib/fmha/fmha.py
@@ -33,6 +33,13 @@
class FMHAFun(torch.autograd.Function):
@staticmethod
def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training, zero_tensors):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.fmha` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)
Comment on lines +36 to +41 (Collaborator):
@Aidyn-A shouldn't we propose F.scaled_dot_product_attention instead? as this is torch.autograd.Function

or, have this warning in custom nn.Module.__init__


batch_size = cu_seqlens.numel() - 1
if batch_size < 4:
max_s = 512
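The review comment above suggests pointing users at F.scaled_dot_product_attention rather than the module-level nn.MultiheadAttention, since FMHAFun is a torch.autograd.Function. A minimal sketch of that migration path; the shapes and dropout value are illustrative assumptions, not taken from this PR:

```python
import torch
import torch.nn.functional as F

# Illustrative shapes: (batch, num_heads, seq_len, head_dim).
q = torch.randn(4, 8, 128, 64)
k = torch.randn(4, 8, 128, 64)
v = torch.randn(4, 8, 128, 64)

# Native fused attention; PyTorch dispatches to FlashAttention or the
# memory-efficient kernel when the inputs and hardware allow it.
out = F.scaled_dot_product_attention(q, k, v, dropout_p=0.1, is_causal=False)
```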
7 changes: 7 additions & 0 deletions apex/contrib/multihead_attn/encdec_multihead_attn_func.py
@@ -21,6 +21,13 @@ def forward(
mask,
dropout_prob,
):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

use_biases_t = torch.tensor([input_biases_q is not None])
heads_t = torch.tensor([heads])
scale_t = torch.tensor([scale])
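The comment's second suggestion is to emit the warning once from the wrapping module's constructor instead of on every forward call. A sketch under that assumption; the class name and signature here are hypothetical, not the PR's actual modules:

```python
import torch.nn as nn
from apex import deprecated_warning  # helper used by this PR


class DeprecatedSelfAttn(nn.Module):  # hypothetical wrapper, not from this PR
    def __init__(self, embed_dim: int, num_heads: int):
        super().__init__()
        # Warn at construction time, so the message fires once per module
        # instance rather than once per forward pass.
        deprecated_warning(
            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
            "We encourage you to migrate to PyTorch native MultiheadAttention. "
            "The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
        )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
```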
@@ -18,6 +18,13 @@ def forward(
pad_mask,
dropout_prob,
):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

heads_t = torch.tensor([heads])
dropout_prob_t = torch.tensor([dropout_prob])
null_tensor = torch.tensor([])
@@ -27,6 +27,13 @@ def forward(
pad_mask,
dropout_prob,
):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

heads_t = torch.tensor([heads])
dropout_prob_t = torch.tensor([dropout_prob])
null_tensor = torch.tensor([])
7 changes: 7 additions & 0 deletions apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
@@ -19,6 +19,13 @@ def forward(
mask_additive,
dropout_prob,
):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

use_biases_t = torch.tensor([input_biases is not None])
heads_t = torch.tensor([heads])
dropout_prob_t = torch.tensor([dropout_prob])
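The deprecation message in each hunk points to torch.nn.MultiheadAttention as the migration target. A minimal self-attention example against that native module; the sizes are illustrative assumptions:

```python
import torch
import torch.nn as nn

embed_dim, num_heads = 512, 8  # illustrative sizes
mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=0.1, batch_first=True)

x = torch.randn(4, 128, embed_dim)  # (batch, seq_len, embed_dim) with batch_first=True
out, _ = mha(x, x, x, need_weights=False)  # self-attention: query = key = value
```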
@@ -18,6 +18,13 @@ def forward(
pad_mask,
dropout_prob,
):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

heads_t = torch.tensor([heads])
dropout_prob_t = torch.tensor([dropout_prob])
null_tensor = torch.tensor([])
7 changes: 7 additions & 0 deletions apex/contrib/multihead_attn/mask_softmax_dropout_func.py
@@ -6,6 +6,13 @@
class MaskSoftmaxDropout(torch.autograd.Function):
@staticmethod
def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, dropout_prob):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

heads_t = torch.tensor([heads])
dropout_prob_t = torch.tensor([dropout_prob])
null_tensor = torch.tensor([])
7 changes: 7 additions & 0 deletions apex/contrib/multihead_attn/self_multihead_attn_func.py
@@ -19,6 +19,13 @@ def forward(
is_additive_mask,
dropout_prob,
):
from apex import deprecated_warning
deprecated_warning(
"`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
"We encourage you to migrate to PyTorch native MultiheadAttention. "
"The documentation is available at https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
)

use_biases_t = torch.tensor([input_biases is not None])
heads_t = torch.tensor([heads])
scale_t = torch.tensor([scale])
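For callers who have already scheduled their migration, the new warning can be silenced with the standard warnings machinery. The exact warning category apex.deprecated_warning raises is not shown in this diff, so the filter below matches on the message prefix only:

```python
import warnings

# Match on the message prefix; the category is deliberately left at the
# default (Warning) because this diff does not show which class apex uses.
warnings.filterwarnings(
    "ignore",
    message=r"`apex\.contrib\.multihead_attn` is deprecated",
)
```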