diff --git a/apex/contrib/fmha/fmha.py b/apex/contrib/fmha/fmha.py
index 6aaca804a..b43600c84 100644
--- a/apex/contrib/fmha/fmha.py
+++ b/apex/contrib/fmha/fmha.py
@@ -33,6 +33,13 @@ class FMHAFun(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training, zero_tensors):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.fmha` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         batch_size = cu_seqlens.numel() - 1
         if batch_size < 4:
             max_s = 512
diff --git a/apex/contrib/multihead_attn/encdec_multihead_attn_func.py b/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
index cef255ba8..f32cf1d37 100644
--- a/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
@@ -21,6 +21,13 @@ def forward(
         mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         use_biases_t = torch.tensor([input_biases_q is not None])
         heads_t = torch.tensor([heads])
         scale_t = torch.tensor([scale])
diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
index 9431a4936..a6adae67c 100644
--- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
@@ -18,6 +18,13 @@ def forward(
         pad_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
index 320bebd66..5fe9cc264 100644
--- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
+++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
@@ -27,6 +27,13 @@ def forward(
         pad_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
index 6b50fe227..402a16f97 100644
--- a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
@@ -19,6 +19,13 @@ def forward(
         mask_additive,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         use_biases_t = torch.tensor([input_biases is not None])
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
index 7f110cb33..10329af77 100644
--- a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
+++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
@@ -18,6 +18,13 @@ def forward(
         pad_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
index b34eec444..c3edf2a27 100644
--- a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
+++ b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
@@ -6,6 +6,13 @@ class MaskSoftmaxDropout(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, dropout_prob):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/self_multihead_attn_func.py b/apex/contrib/multihead_attn/self_multihead_attn_func.py
index c27a7203c..3bef54aa8 100644
--- a/apex/contrib/multihead_attn/self_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/self_multihead_attn_func.py
@@ -19,6 +19,13 @@ def forward(
         is_additive_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         use_biases_t = torch.tensor([input_biases is not None])
         heads_t = torch.tensor([heads])
         scale_t = torch.tensor([scale])