diff --git a/apex/contrib/fmha/fmha.py b/apex/contrib/fmha/fmha.py
index 6aaca804a..b43600c84 100644
--- a/apex/contrib/fmha/fmha.py
+++ b/apex/contrib/fmha/fmha.py
@@ -33,6 +33,13 @@ class FMHAFun(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, qkv, cu_seqlens, p_dropout, max_s, is_training, zero_tensors):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.fmha` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         batch_size = cu_seqlens.numel() - 1
         if batch_size < 4:
             max_s = 512
diff --git a/apex/contrib/multihead_attn/encdec_multihead_attn_func.py b/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
index cef255ba8..f32cf1d37 100644
--- a/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
@@ -21,6 +21,13 @@ def forward(
         mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         use_biases_t = torch.tensor([input_biases_q is not None])
         heads_t = torch.tensor([heads])
         scale_t = torch.tensor([scale])
diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
index 9431a4936..a6adae67c 100644
--- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
@@ -18,6 +18,13 @@ def forward(
         pad_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
index 320bebd66..5fe9cc264 100644
--- a/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
+++ b/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
@@ -27,6 +27,13 @@ def forward(
         pad_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
index 6b50fe227..402a16f97 100644
--- a/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
@@ -19,6 +19,13 @@ def forward(
         mask_additive,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         use_biases_t = torch.tensor([input_biases is not None])
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
diff --git a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
index 7f110cb33..10329af77 100644
--- a/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
+++ b/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
@@ -18,6 +18,13 @@ def forward(
         pad_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
index b34eec444..c3edf2a27 100644
--- a/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
+++ b/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
@@ -6,6 +6,13 @@ class MaskSoftmaxDropout(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, is_training, heads, inputs, pad_mask, mask_additive, dropout_prob):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         heads_t = torch.tensor([heads])
         dropout_prob_t = torch.tensor([dropout_prob])
         null_tensor = torch.tensor([])
diff --git a/apex/contrib/multihead_attn/self_multihead_attn_func.py b/apex/contrib/multihead_attn/self_multihead_attn_func.py
index c27a7203c..3bef54aa8 100644
--- a/apex/contrib/multihead_attn/self_multihead_attn_func.py
+++ b/apex/contrib/multihead_attn/self_multihead_attn_func.py
@@ -19,6 +19,13 @@ def forward(
         is_additive_mask,
         dropout_prob,
     ):
+        from apex import deprecated_warning
+        deprecated_warning(
+            "`apex.contrib.multihead_attn` is deprecated and will be removed in March 2026. "
+            "We encourage you to migrate to PyTorch native MultiheadAttention. "
+            "The documentation is available in https://docs.pytorch.org/docs/main/generated/torch.nn.MultiheadAttention.html"
+        )
+
         use_biases_t = torch.tensor([input_biases is not None])
         heads_t = torch.tensor([heads])
         scale_t = torch.tensor([scale])