microsoft · justinchuby · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/onnxruntime/core/providers/cpu/llm/attention.cc b/onnxruntime/core/providers/cpu/llm/attention.cc
@@ -199,13 +199,10 @@ void AttentionBase<T>::ComputeAttentionProbs(T* attention_probs,
                                              T* output_qk,                           // Q*K output
                                              ThreadPool* tp,
                                              AllocatorPtr allocator) const {
-  // The case past_key != nullptr and present_key == nullptr is not supported.
-  // We use the fact present_key is requested to avoid any extra allocation.
-  // However, if present_key is not requested, we should avoid allocated more memory than needed but that mean
-  // allocating one buffer per thread. That's why the implementation is not done.
-  // The user should define a model with a present_key even if not used if past_key is not null.
-  ORT_ENFORCE((past_key == nullptr) == (present_key == nullptr),
-              "The implementation only supports past_key and present_key both null or both not null.");
+  if (present_key != nullptr) {
+    ORT_ENFORCE(past_key != nullptr, "past_key must be provided when present_key is requested.");
+  }
+
   const size_t past_chunk_length = static_cast<size_t>(parameters.past_sequence_length) * parameters.head_size;   // P x H
   const size_t q_input_chunk_length = static_cast<size_t>(parameters.q_sequence_length) * parameters.head_size;   // S x H
   const size_t k_input_chunk_length = static_cast<size_t>(parameters.kv_sequence_length) * parameters.head_size;  // L x H