@@ -138,6 +138,20 @@ bool BlockKVCacheExtension::initialize(
138138 // Phase 4: Create managers and pre-compute binding helpers
139139 create_block_managers_and_helpers (layer_blocks, prefill_in_ports, generate_requests, gen_variant_in_ports);
140140
141+ // Phase 5: Snapshot original prefill output tensors for restore_prefill_output_buffers().
142+ // These are the model-owned buffers that exist before any zero-copy redirect happens.
143+ for (const auto & [layer_idx, layer_managers] : m_kv_cache_block_managers) {
144+ const std::string layer_str = std::to_string (layer_idx);
145+ for (const char * kv_type : {" key" , " value" }) {
146+ std::string output_name = " present." + layer_str + " ." + kv_type;
147+ auto port_it = prefill_out_ports.find (output_name);
148+ if (port_it != prefill_out_ports.end ()) {
149+ m_prefill_original_output_tensors[output_name] = prefill_request->get_tensor (port_it->second );
150+ }
151+ }
152+ }
153+ LOG_INFO (" Snapshotted " << m_prefill_original_output_tensors.size () << " original prefill output tensors" );
154+
141155 LOG_INFO (" === Block-based KV Cache Initialization Complete ===" );
142156
143157 m_enabled = true ;
@@ -163,8 +177,9 @@ void BlockKVCacheExtension::reset() {
163177// Prefill path
164178// ============================================================================
165179
166- void BlockKVCacheExtension::bind_prefill_inputs (const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
167- const PortsMap& prefill_in_ports) {
180+ void BlockKVCacheExtension::load_past_kv_blocks_to_prefill (
181+ const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
182+ const PortsMap& prefill_in_ports) {
168183 if (m_kv_cache_block_managers.empty ()) {
169184 return ;
170185 }
@@ -195,9 +210,10 @@ void BlockKVCacheExtension::bind_prefill_inputs(const std::shared_ptr<ov::IAsync
195210 }
196211}
197212
198- bool BlockKVCacheExtension::bind_prefill_outputs (const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
199- const PortsMap& prefill_out_ports,
200- uint32_t num_new_tokens) {
213+ bool BlockKVCacheExtension::redirect_prefill_outputs_to_new_blocks (
214+ const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
215+ const PortsMap& prefill_out_ports,
216+ uint32_t num_new_tokens) {
201217 if (m_kv_cache_block_managers.empty ()) {
202218 return false ;
203219 }
@@ -206,16 +222,15 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
206222 // This lets the model write KV cache straight into BlockManager memory with no post-inference copy.
207223 //
208224 // Steps:
209- // 1. Backup original output tensors (first call only) — needed for non-aligned chunks.
210- // 2. Pre-allocate one block per layer to hold the new tokens.
211- // 3. Bind that block to the output port (present.N.key / present.N.value).
212- // 4. Update block metadata immediately (token count is known before inference).
225+ // 1. Pre-allocate one block per layer to hold the new tokens.
226+ // 2. Redirect that output port to the block tensor (present.N.key / present.N.value).
227+ // 3. Update block metadata immediately (token count is known before inference).
213228 // After infer(), data and metadata are already in place — no additional work needed.
214229 //
215230 // CONSTRAINT: Only valid when the chunk writes into exactly ONE block:
216231 // - current_position must be block-aligned (start of a block boundary).
217232 // - num_new_tokens must not span across a block boundary.
218- // When these constraints are not met, callers must call restore_prefill_outputs () instead
233+ // When these constraints are not met, callers must call restore_prefill_output_buffers () instead
219234 // and fall back to the copy-based copy_prefill_outputs_to_blocks() path.
220235
221236 auto & kvcache_desc = m_compiled_model->m_kvcache_desc ;
@@ -227,25 +242,9 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
227242 return false ;
228243 }
229244
230- LOG_DEBUG (" === Binding Block Tensors to Prefill Model Outputs (Zero-Copy + Metadata Update ) ===" );
245+ LOG_DEBUG (" === Redirecting Prefill Outputs to New Blocks (Zero-Copy) ===" );
231246 LOG_DEBUG (" Pre-allocating blocks for " << num_new_tokens << " new tokens" );
232247
233- // Backup original output tensors on first zero-copy call
234- if (m_prefill_original_output_tensors.empty ()) {
235- LOG_DEBUG (" First zero-copy call - backing up original output tensors" );
236- for (const auto & [layer_idx, layer_managers] : m_kv_cache_block_managers) {
237- const std::string layer_str = std::to_string (layer_idx);
238- for (const char * kv_type : {" key" , " value" }) {
239- std::string output_name = " present." + layer_str + " ." + kv_type;
240- auto port_it = prefill_out_ports.find (output_name);
241- if (port_it != prefill_out_ports.end ()) {
242- m_prefill_original_output_tensors[output_name] = prefill_request->get_tensor (port_it->second );
243- }
244- }
245- }
246- LOG_DEBUG (" Backed up " << m_prefill_original_output_tensors.size () << " original output tensors" );
247- }
248-
249248 size_t total_blocks_allocated = 0 ;
250249 size_t total_outputs_bound = 0 ;
251250
@@ -326,12 +325,13 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
326325
327326 LOG_DEBUG (" Pre-binding complete: allocated=" << total_blocks_allocated << " blocks, bound=" << total_outputs_bound
328327 << " outputs" );
329- m_prefill_outputs_bound_to_blocks = true ;
328+ m_prefill_outputs_redirected = true ;
330329 return true ;
331330}
332331
333- void BlockKVCacheExtension::restore_prefill_outputs (const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
334- const PortsMap& prefill_out_ports) {
332+ void BlockKVCacheExtension::restore_prefill_output_buffers (
333+ const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
334+ const PortsMap& prefill_out_ports) {
335335 if (m_kv_cache_block_managers.empty ()) {
336336 return ;
337337 }
@@ -344,18 +344,16 @@ void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IA
344344 // But "present.0.key" is STILL bound to block 0 from the previous chunk.
345345 // infer() now overwrites block 0 with misaligned data → silent data corruption.
346346 //
347- // Fix: restore the original model output buffers (backed up on the first zero-copy call ).
347+ // Fix: restore the original model output buffers (snapshotted in initialize() ).
348348 // The model then writes into its own buffers, and copy_prefill_outputs_to_blocks() copies
349349 // the result into the correct blocks afterwards.
350350
351- if (!m_prefill_outputs_bound_to_blocks ) {
351+ if (!m_prefill_outputs_redirected ) {
352352 LOG_VERB (" Prefill outputs already pointing to original tensors - no restore needed" );
353353 return ;
354354 }
355- if (m_prefill_original_output_tensors.empty ()) {
356- LOG_WARN (" No original output tensors backed up - cannot restore" );
357- return ;
358- }
355+ OPENVINO_ASSERT (!m_prefill_original_output_tensors.empty (),
356+ " Original output tensors were not snapshotted during initialize()." );
359357
360358 LOG_DEBUG (" === Restoring Original Output Tensors (Non-Zero-Copy Path) ===" );
361359
@@ -379,16 +377,17 @@ void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IA
379377 }
380378
381379 LOG_DEBUG (" Restore complete: restored=" << total_restored << " tensors" );
382- m_prefill_outputs_bound_to_blocks = false ;
380+ m_prefill_outputs_redirected = false ;
383381}
384382
385383// ============================================================================
386384// Generate path
387385// ============================================================================
388386
389- void BlockKVCacheExtension::bind_generate_inputs (const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
390- const PortsMap& kvcache_in_ports,
391- uint32_t num_stored_tokens) {
387+ void BlockKVCacheExtension::init_generate_kv_block_bindings (
388+ const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
389+ const PortsMap& kvcache_in_ports,
390+ uint32_t num_stored_tokens) {
392391 if (m_kv_cache_block_managers.empty ()) {
393392 return ;
394393 }
@@ -661,7 +660,7 @@ void BlockKVCacheExtension::copy_prefill_outputs_to_blocks(
661660 copy_outputs_to_blocks (prefill_request, prefill_out_ports, num_tokens, v_transposed, kv_position);
662661}
663662
664- void BlockKVCacheExtension::absorb_generate_output_and_rebind (
663+ void BlockKVCacheExtension::commit_generate_kv_and_rebind (
665664 uint32_t old_num_tokens,
666665 uint32_t new_num_tokens,
667666 uint32_t input_tokens_len,
0 commit comments