@@ -223,11 +223,6 @@ def _prepare_from_posids(query, key, value, position_ids):
     key = key.contiguous().view(-1, key.size(-2), key.size(-1))
     value = value.contiguous().view(-1, value.size(-2), value.size(-1))

-    cu_seqlens_k = torch.cat(
-        [torch.tensor([0], dtype=torch.int32, device=query.device), position_ids[:, -1].cumsum(dim=0) + 1], dim=0
-    )
-    max_k = torch.max(position_ids, dim=1).values.max().item() + 1
-
     position_ids = position_ids.flatten()
     indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

@@ -246,7 +241,7 @@ def _prepare_from_posids(query, key, value, position_ids):
     # We should use cu_seq_lens instead of position_ids to get the max length since position_ids is not always increasing
     # for some models (e.g. qwen2-vl).
     max_length = cu_seq_lens.diff().max().item()
-    return (query, key, value, indices_q, (cu_seq_lens, cu_seqlens_k), (max_length, max_k))
+    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


 def _prepare_flash_attention_from_position_ids(query, key, value, position_ids):
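
For context, here is a minimal sketch (hypothetical values, plain PyTorch, no transformers imports) of why a single cu_seq_lens tensor can now serve both the query and key sides: in this packed self-attention layout both sides share the same sequence boundaries, and the deleted cu_seqlens_k/max_k math assumed position ids that increase monotonically to the end of each row, which the in-code comment notes does not hold for models like qwen2-vl. The boundary detection below assumes cu_seq_lens is built from the zero positions of the flattened ids, as in the unchanged middle of this function.

import torch

# Hypothetical packed row holding two sequences of lengths 3 and 2, so the
# position ids reset mid-row instead of increasing monotonically -- the
# qwen2-vl-style case the in-code comment warns about.
position_ids = torch.tensor([[0, 1, 2, 0, 1]])

flat = position_ids.flatten()
indices = torch.arange(flat.size(0), dtype=torch.int32)

# Every position where the id resets to 0 starts a new sequence; appending
# the total token count yields the cumulative sequence lengths.
cu_seq_lens = torch.cat(
    (indices[flat == 0], torch.tensor([flat.numel()], dtype=torch.int32))
)
print(cu_seq_lens)  # tensor([0, 3, 5], dtype=torch.int32)

# Max length derived from the boundaries, as the patched return value does.
max_length = cu_seq_lens.diff().max().item()
print(max_length)  # 3

# The deleted cu_seqlens_k computation trusted the last id of each row, so
# here it reports a single sequence of length 2 and misses the real split.
cu_seqlens_k = torch.cat(
    [torch.tensor([0]), position_ids[:, -1].cumsum(dim=0) + 1], dim=0
)
print(cu_seqlens_k)  # tensor([0, 2])

Returning (cu_seq_lens, cu_seq_lens) and (max_length, max_length) keeps the varlen flash-attention call signature, which expects separate query/key cumulative lengths, while guaranteeing the two sides can never disagree.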