Skip to content

Commit 6c75888

Browse files
committed
Methods rename.
Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
1 parent af12cb7 commit 6c75888

File tree

3 files changed: +90 additions, −90 deletions

3 files changed

+90
-90
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_block_kvcache_extension.cpp

Lines changed: 40 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,20 @@ bool BlockKVCacheExtension::initialize(
138138
// Phase 4: Create managers and pre-compute binding helpers
139139
create_block_managers_and_helpers(layer_blocks, prefill_in_ports, generate_requests, gen_variant_in_ports);
140140

141+
// Phase 5: Snapshot original prefill output tensors for restore_prefill_output_buffers().
142+
// These are the model-owned buffers that exist before any zero-copy redirect happens.
143+
for (const auto& [layer_idx, layer_managers] : m_kv_cache_block_managers) {
144+
const std::string layer_str = std::to_string(layer_idx);
145+
for (const char* kv_type : {"key", "value"}) {
146+
std::string output_name = "present." + layer_str + "." + kv_type;
147+
auto port_it = prefill_out_ports.find(output_name);
148+
if (port_it != prefill_out_ports.end()) {
149+
m_prefill_original_output_tensors[output_name] = prefill_request->get_tensor(port_it->second);
150+
}
151+
}
152+
}
153+
LOG_INFO("Snapshotted " << m_prefill_original_output_tensors.size() << " original prefill output tensors");
154+
141155
LOG_INFO("=== Block-based KV Cache Initialization Complete ===");
142156

143157
m_enabled = true;
@@ -163,8 +177,9 @@ void BlockKVCacheExtension::reset() {
163177
// Prefill path
164178
// ============================================================================
165179

166-
void BlockKVCacheExtension::bind_prefill_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
167-
const PortsMap& prefill_in_ports) {
180+
void BlockKVCacheExtension::load_past_kv_blocks_to_prefill(
181+
const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
182+
const PortsMap& prefill_in_ports) {
168183
if (m_kv_cache_block_managers.empty()) {
169184
return;
170185
}
@@ -195,9 +210,10 @@ void BlockKVCacheExtension::bind_prefill_inputs(const std::shared_ptr<ov::IAsync
195210
}
196211
}
197212

198-
bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
199-
const PortsMap& prefill_out_ports,
200-
uint32_t num_new_tokens) {
213+
bool BlockKVCacheExtension::redirect_prefill_outputs_to_new_blocks(
214+
const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
215+
const PortsMap& prefill_out_ports,
216+
uint32_t num_new_tokens) {
201217
if (m_kv_cache_block_managers.empty()) {
202218
return false;
203219
}
@@ -206,16 +222,15 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
206222
// This lets the model write KV cache straight into BlockManager memory with no post-inference copy.
207223
//
208224
// Steps:
209-
// 1. Backup original output tensors (first call only) — needed for non-aligned chunks.
210-
// 2. Pre-allocate one block per layer to hold the new tokens.
211-
// 3. Bind that block to the output port (present.N.key / present.N.value).
212-
// 4. Update block metadata immediately (token count is known before inference).
225+
// 1. Pre-allocate one block per layer to hold the new tokens.
226+
// 2. Redirect that output port to the block tensor (present.N.key / present.N.value).
227+
// 3. Update block metadata immediately (token count is known before inference).
213228
// After infer(), data and metadata are already in place — no additional work needed.
214229
//
215230
// CONSTRAINT: Only valid when the chunk writes into exactly ONE block:
216231
// - current_position must be block-aligned (start of a block boundary).
217232
// - num_new_tokens must not span across a block boundary.
218-
// When these constraints are not met, callers must call restore_prefill_outputs() instead
233+
// When these constraints are not met, callers must call restore_prefill_output_buffers() instead
219234
// and fall back to the copy-based copy_prefill_outputs_to_blocks() path.
220235

221236
auto& kvcache_desc = m_compiled_model->m_kvcache_desc;
@@ -227,25 +242,9 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
227242
return false;
228243
}
229244

230-
LOG_DEBUG("=== Binding Block Tensors to Prefill Model Outputs (Zero-Copy + Metadata Update) ===");
245+
LOG_DEBUG("=== Redirecting Prefill Outputs to New Blocks (Zero-Copy) ===");
231246
LOG_DEBUG("Pre-allocating blocks for " << num_new_tokens << " new tokens");
232247

233-
// Backup original output tensors on first zero-copy call
234-
if (m_prefill_original_output_tensors.empty()) {
235-
LOG_DEBUG("First zero-copy call - backing up original output tensors");
236-
for (const auto& [layer_idx, layer_managers] : m_kv_cache_block_managers) {
237-
const std::string layer_str = std::to_string(layer_idx);
238-
for (const char* kv_type : {"key", "value"}) {
239-
std::string output_name = "present." + layer_str + "." + kv_type;
240-
auto port_it = prefill_out_ports.find(output_name);
241-
if (port_it != prefill_out_ports.end()) {
242-
m_prefill_original_output_tensors[output_name] = prefill_request->get_tensor(port_it->second);
243-
}
244-
}
245-
}
246-
LOG_DEBUG("Backed up " << m_prefill_original_output_tensors.size() << " original output tensors");
247-
}
248-
249248
size_t total_blocks_allocated = 0;
250249
size_t total_outputs_bound = 0;
251250

@@ -326,12 +325,13 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
326325

327326
LOG_DEBUG("Pre-binding complete: allocated=" << total_blocks_allocated << " blocks, bound=" << total_outputs_bound
328327
<< " outputs");
329-
m_prefill_outputs_bound_to_blocks = true;
328+
m_prefill_outputs_redirected = true;
330329
return true;
331330
}
332331

333-
void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
334-
const PortsMap& prefill_out_ports) {
332+
void BlockKVCacheExtension::restore_prefill_output_buffers(
333+
const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
334+
const PortsMap& prefill_out_ports) {
335335
if (m_kv_cache_block_managers.empty()) {
336336
return;
337337
}
@@ -344,18 +344,16 @@ void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IA
344344
// But "present.0.key" is STILL bound to block 0 from the previous chunk.
345345
// infer() now overwrites block 0 with misaligned data → silent data corruption.
346346
//
347-
// Fix: restore the original model output buffers (backed up on the first zero-copy call).
347+
// Fix: restore the original model output buffers (snapshotted in initialize()).
348348
// The model then writes into its own buffers, and copy_prefill_outputs_to_blocks() copies
349349
// the result into the correct blocks afterwards.
350350

351-
if (!m_prefill_outputs_bound_to_blocks) {
351+
if (!m_prefill_outputs_redirected) {
352352
LOG_VERB("Prefill outputs already pointing to original tensors - no restore needed");
353353
return;
354354
}
355-
if (m_prefill_original_output_tensors.empty()) {
356-
LOG_WARN("No original output tensors backed up - cannot restore");
357-
return;
358-
}
355+
OPENVINO_ASSERT(!m_prefill_original_output_tensors.empty(),
356+
"Original output tensors were not snapshotted during initialize().");
359357

360358
LOG_DEBUG("=== Restoring Original Output Tensors (Non-Zero-Copy Path) ===");
361359

@@ -379,16 +377,17 @@ void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IA
379377
}
380378

381379
LOG_DEBUG("Restore complete: restored=" << total_restored << " tensors");
382-
m_prefill_outputs_bound_to_blocks = false;
380+
m_prefill_outputs_redirected = false;
383381
}
384382

385383
// ============================================================================
386384
// Generate path
387385
// ============================================================================
388386

389-
void BlockKVCacheExtension::bind_generate_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
390-
const PortsMap& kvcache_in_ports,
391-
uint32_t num_stored_tokens) {
387+
void BlockKVCacheExtension::init_generate_kv_block_bindings(
388+
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
389+
const PortsMap& kvcache_in_ports,
390+
uint32_t num_stored_tokens) {
392391
if (m_kv_cache_block_managers.empty()) {
393392
return;
394393
}
@@ -661,7 +660,7 @@ void BlockKVCacheExtension::copy_prefill_outputs_to_blocks(
661660
copy_outputs_to_blocks(prefill_request, prefill_out_ports, num_tokens, v_transposed, kv_position);
662661
}
663662

664-
void BlockKVCacheExtension::absorb_generate_output_and_rebind(
663+
void BlockKVCacheExtension::commit_generate_kv_and_rebind(
665664
uint32_t old_num_tokens,
666665
uint32_t new_num_tokens,
667666
uint32_t input_tokens_len,

src/plugins/intel_npu/src/plugin/npuw/llm_block_kvcache_extension.hpp

Lines changed: 26 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -67,8 +67,9 @@ struct BlockBindingHelper {
6767
* Lifecycle:
6868
* 1. initialize() - detects block format, sets up managers and pre-computed helpers
6969
* 2. reset() - called at the start of each new conversation
70-
* 3. bind_prefill_inputs / bind_prefill_outputs / restore_prefill_outputs - per prefill chunk
71-
* 4. bind_generate_inputs - called once on the first generate step
70+
* 3. load_past_kv_blocks_to_prefill / redirect_prefill_outputs_to_new_blocks / restore_prefill_output_buffers - per
71+
* prefill chunk
72+
* 4. init_generate_kv_block_bindings - called once on the first generate step
7273
* 5. update_generate_bindings - called after every generate inference
7374
*/
7475
class BlockKVCacheExtension {
@@ -132,38 +133,38 @@ class BlockKVCacheExtension {
132133
// -------------------------------------------------------------------------
133134

134135
/**
135-
* @brief Bind previously-computed blocks to prefill model inputs.
136+
* @brief Load previously-computed KV blocks into prefill model inputs.
136137
*
137138
* Called before each prefill chunk inference so the model can read past KV cache.
138139
*/
139-
void bind_prefill_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
140-
const PortsMap& prefill_in_ports);
140+
void load_past_kv_blocks_to_prefill(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
141+
const PortsMap& prefill_in_ports);
141142

142143
/**
143-
* @brief Pre-bind prefill output ports to fresh blocks (zero-copy optimisation).
144+
* @brief Redirect prefill output ports to fresh blocks before inference (zero-copy optimisation).
144145
*
145146
* Call BEFORE prefill inference when the chunk is block-aligned.
146147
* Model will write directly into blocks — no post-inference copy is needed.
147148
*
148149
* @param num_new_tokens Tokens that will be written by this inference pass.
149-
* @return true if binding was applied (block-aligned); call restore_prefill_outputs()
150+
* @return true if redirect was applied (block-aligned); call restore_prefill_output_buffers()
150151
* before inference when this returns false.
151152
*/
152-
bool bind_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
153-
const PortsMap& prefill_out_ports,
154-
uint32_t num_new_tokens);
153+
bool redirect_prefill_outputs_to_new_blocks(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
154+
const PortsMap& prefill_out_ports,
155+
uint32_t num_new_tokens);
155156

156-
bool prefill_outputs_bound() const {
157-
return m_prefill_outputs_bound_to_blocks;
157+
bool prefill_outputs_redirected() const {
158+
return m_prefill_outputs_redirected;
158159
}
159160

160161
/**
161162
* @brief Restore original output buffers when zero-copy cannot be used.
162163
*
163164
* Must be called before non-aligned prefill chunks to prevent block corruption.
164165
*/
165-
void restore_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
166-
const PortsMap& prefill_out_ports);
166+
void restore_prefill_output_buffers(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
167+
const PortsMap& prefill_out_ports);
167168

168169
// -------------------------------------------------------------------------
169170
// Generate path
@@ -177,9 +178,9 @@ class BlockKVCacheExtension {
177178
*
178179
* @param num_stored_tokens Total tokens in KV cache at call time.
179180
*/
180-
void bind_generate_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
181-
const PortsMap& kvcache_in_ports,
182-
uint32_t num_stored_tokens);
181+
void init_generate_kv_block_bindings(const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
182+
const PortsMap& kvcache_in_ports,
183+
uint32_t num_stored_tokens);
183184

184185
/**
185186
* @brief Copy prefill outputs into blocks after inference (zero-copy fallback).
@@ -208,12 +209,12 @@ class BlockKVCacheExtension {
208209
* @param new_num_tokens kvcache_desc.num_stored_tokens AFTER this generate step.
209210
* @param input_tokens_len Number of tokens actually generated (usually 1).
210211
*/
211-
void absorb_generate_output_and_rebind(uint32_t old_num_tokens,
212-
uint32_t new_num_tokens,
213-
uint32_t input_tokens_len,
214-
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
215-
const PortsMap& kvcache_out_ports,
216-
const PortsMap& kvcache_in_ports);
212+
void commit_generate_kv_and_rebind(uint32_t old_num_tokens,
213+
uint32_t new_num_tokens,
214+
uint32_t input_tokens_len,
215+
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
216+
const PortsMap& kvcache_out_ports,
217+
const PortsMap& kvcache_in_ports);
217218

218219
private:
219220
// -------------------------------------------------------------------------
@@ -264,7 +265,7 @@ class BlockKVCacheExtension {
264265
bool v_transposed,
265266
uint32_t current_kv_position);
266267

267-
// Re-bind model inputs after a generate step (called by absorb_generate_output_and_rebind).
268+
// Re-bind model inputs after a generate step (called by commit_generate_kv_and_rebind).
268269
void update_generate_bindings(uint32_t old_num_tokens,
269270
uint32_t new_num_tokens,
270271
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
@@ -287,7 +288,7 @@ class BlockKVCacheExtension {
287288

288289
// Zero-copy prefill output state
289290
std::unordered_map<std::string, ov::SoPtr<ov::ITensor>> m_prefill_original_output_tensors;
290-
bool m_prefill_outputs_bound_to_blocks = false;
291+
bool m_prefill_outputs_redirected = false;
291292
};
292293

293294
} // namespace npuw

0 commit comments

Comments (0)