Skip to content

Commit 6c75888

Browse files
committed
Methods rename.
Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
1 parent af12cb7 commit 6c75888

File tree

3 files changed: +90 additions, −90 deletions

3 files changed

+90
-90
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_block_kvcache_extension.cpp

Lines changed: 40 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,20 @@ bool BlockKVCacheExtension::initialize(
138138
// Phase 4: Create managers and pre-compute binding helpers
139139
create_block_managers_and_helpers(layer_blocks, prefill_in_ports, generate_requests, gen_variant_in_ports);
140140

141+
// Phase 5: Snapshot original prefill output tensors for restore_prefill_output_buffers().
142+
// These are the model-owned buffers that exist before any zero-copy redirect happens.
143+
for (const auto& [layer_idx, layer_managers] : m_kv_cache_block_managers) {
144+
const std::string layer_str = std::to_string(layer_idx);
145+
for (const char* kv_type : {"key", "value"}) {
146+
std::string output_name = "present." + layer_str + "." + kv_type;
147+
auto port_it = prefill_out_ports.find(output_name);
148+
if (port_it != prefill_out_ports.end()) {
149+
m_prefill_original_output_tensors[output_name] = prefill_request->get_tensor(port_it->second);
150+
}
151+
}
152+
}
153+
LOG_INFO("Snapshotted " << m_prefill_original_output_tensors.size() << " original prefill output tensors");
154+
141155
LOG_INFO("=== Block-based KV Cache Initialization Complete ===");
142156

143157
m_enabled = true;
@@ -163,8 +177,9 @@ void BlockKVCacheExtension::reset() {
163177
// Prefill path
164178
// ============================================================================
165179

166-
void BlockKVCacheExtension::bind_prefill_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
167-
const PortsMap& prefill_in_ports) {
180+
void BlockKVCacheExtension::load_past_kv_blocks_to_prefill(
181+
const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
182+
const PortsMap& prefill_in_ports) {
168183
if (m_kv_cache_block_managers.empty()) {
169184
return;
170185
}
@@ -195,9 +210,10 @@ void BlockKVCacheExtension::bind_prefill_inputs(const std::shared_ptr<ov::IAsync
195210
}
196211
}
197212

198-
bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
199-
const PortsMap& prefill_out_ports,
200-
uint32_t num_new_tokens) {
213+
bool BlockKVCacheExtension::redirect_prefill_outputs_to_new_blocks(
214+
const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
215+
const PortsMap& prefill_out_ports,
216+
uint32_t num_new_tokens) {
201217
if (m_kv_cache_block_managers.empty()) {
202218
return false;
203219
}
@@ -206,16 +222,15 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
206222
// This lets the model write KV cache straight into BlockManager memory with no post-inference copy.
207223
//
208224
// Steps:
209-
// 1. Backup original output tensors (first call only) — needed for non-aligned chunks.
210-
// 2. Pre-allocate one block per layer to hold the new tokens.
211-
// 3. Bind that block to the output port (present.N.key / present.N.value).
212-
// 4. Update block metadata immediately (token count is known before inference).
225+
// 1. Pre-allocate one block per layer to hold the new tokens.
226+
// 2. Redirect that output port to the block tensor (present.N.key / present.N.value).
227+
// 3. Update block metadata immediately (token count is known before inference).
213228
// After infer(), data and metadata are already in place — no additional work needed.
214229
//
215230
// CONSTRAINT: Only valid when the chunk writes into exactly ONE block:
216231
// - current_position must be block-aligned (start of a block boundary).
217232
// - num_new_tokens must not span across a block boundary.
218-
// When these constraints are not met, callers must call restore_prefill_outputs() instead
233+
// When these constraints are not met, callers must call restore_prefill_output_buffers() instead
219234
// and fall back to the copy-based copy_prefill_outputs_to_blocks() path.
220235

221236
auto& kvcache_desc = m_compiled_model->m_kvcache_desc;
@@ -227,25 +242,9 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
227242
return false;
228243
}
229244

230-
LOG_DEBUG("=== Binding Block Tensors to Prefill Model Outputs (Zero-Copy + Metadata Update) ===");
245+
LOG_DEBUG("=== Redirecting Prefill Outputs to New Blocks (Zero-Copy) ===");
231246
LOG_DEBUG("Pre-allocating blocks for " << num_new_tokens << " new tokens");
232247

233-
// Backup original output tensors on first zero-copy call
234-
if (m_prefill_original_output_tensors.empty()) {
235-
LOG_DEBUG("First zero-copy call - backing up original output tensors");
236-
for (const auto& [layer_idx, layer_managers] : m_kv_cache_block_managers) {
237-
const std::string layer_str = std::to_string(layer_idx);
238-
for (const char* kv_type : {"key", "value"}) {
239-
std::string output_name = "present." + layer_str + "." + kv_type;
240-
auto port_it = prefill_out_ports.find(output_name);
241-
if (port_it != prefill_out_ports.end()) {
242-
m_prefill_original_output_tensors[output_name] = prefill_request->get_tensor(port_it->second);
243-
}
244-
}
245-
}
246-
LOG_DEBUG("Backed up " << m_prefill_original_output_tensors.size() << " original output tensors");
247-
}
248-
249248
size_t total_blocks_allocated = 0;
250249
size_t total_outputs_bound = 0;
251250

@@ -326,12 +325,13 @@ bool BlockKVCacheExtension::bind_prefill_outputs(const std::shared_ptr<ov::IAsyn
326325

327326
LOG_DEBUG("Pre-binding complete: allocated=" << total_blocks_allocated << " blocks, bound=" << total_outputs_bound
328327
<< " outputs");
329-
m_prefill_outputs_bound_to_blocks = true;
328+
m_prefill_outputs_redirected = true;
330329
return true;
331330
}
332331

333-
void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
334-
const PortsMap& prefill_out_ports) {
332+
void BlockKVCacheExtension::restore_prefill_output_buffers(
333+
const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
334+
const PortsMap& prefill_out_ports) {
335335
if (m_kv_cache_block_managers.empty()) {
336336
return;
337337
}
@@ -344,18 +344,16 @@ void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IA
344344
// But "present.0.key" is STILL bound to block 0 from the previous chunk.
345345
// infer() now overwrites block 0 with misaligned data → silent data corruption.
346346
//
347-
// Fix: restore the original model output buffers (backed up on the first zero-copy call).
347+
// Fix: restore the original model output buffers (snapshotted in initialize()).
348348
// The model then writes into its own buffers, and copy_prefill_outputs_to_blocks() copies
349349
// the result into the correct blocks afterwards.
350350

351-
if (!m_prefill_outputs_bound_to_blocks) {
351+
if (!m_prefill_outputs_redirected) {
352352
LOG_VERB("Prefill outputs already pointing to original tensors - no restore needed");
353353
return;
354354
}
355-
if (m_prefill_original_output_tensors.empty()) {
356-
LOG_WARN("No original output tensors backed up - cannot restore");
357-
return;
358-
}
355+
OPENVINO_ASSERT(!m_prefill_original_output_tensors.empty(),
356+
"Original output tensors were not snapshotted during initialize().");
359357

360358
LOG_DEBUG("=== Restoring Original Output Tensors (Non-Zero-Copy Path) ===");
361359

@@ -379,16 +377,17 @@ void BlockKVCacheExtension::restore_prefill_outputs(const std::shared_ptr<ov::IA
379377
}
380378

381379
LOG_DEBUG("Restore complete: restored=" << total_restored << " tensors");
382-
m_prefill_outputs_bound_to_blocks = false;
380+
m_prefill_outputs_redirected = false;
383381
}
384382

385383
// ============================================================================
386384
// Generate path
387385
// ============================================================================
388386

389-
void BlockKVCacheExtension::bind_generate_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
390-
const PortsMap& kvcache_in_ports,
391-
uint32_t num_stored_tokens) {
387+
void BlockKVCacheExtension::init_generate_kv_block_bindings(
388+
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
389+
const PortsMap& kvcache_in_ports,
390+
uint32_t num_stored_tokens) {
392391
if (m_kv_cache_block_managers.empty()) {
393392
return;
394393
}
@@ -661,7 +660,7 @@ void BlockKVCacheExtension::copy_prefill_outputs_to_blocks(
661660
copy_outputs_to_blocks(prefill_request, prefill_out_ports, num_tokens, v_transposed, kv_position);
662661
}
663662

664-
void BlockKVCacheExtension::absorb_generate_output_and_rebind(
663+
void BlockKVCacheExtension::commit_generate_kv_and_rebind(
665664
uint32_t old_num_tokens,
666665
uint32_t new_num_tokens,
667666
uint32_t input_tokens_len,

src/plugins/intel_npu/src/plugin/npuw/llm_block_kvcache_extension.hpp

Lines changed: 26 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -67,8 +67,9 @@ struct BlockBindingHelper {
6767
* Lifecycle:
6868
* 1. initialize() - detects block format, sets up managers and pre-computed helpers
6969
* 2. reset() - called at the start of each new conversation
70-
* 3. bind_prefill_inputs / bind_prefill_outputs / restore_prefill_outputs - per prefill chunk
71-
* 4. bind_generate_inputs - called once on the first generate step
70+
* 3. load_past_kv_blocks_to_prefill / redirect_prefill_outputs_to_new_blocks / restore_prefill_output_buffers - per
71+
* prefill chunk
72+
* 4. init_generate_kv_block_bindings - called once on the first generate step
7273
* 5. update_generate_bindings - called after every generate inference
7374
*/
7475
class BlockKVCacheExtension {
@@ -132,38 +133,38 @@ class BlockKVCacheExtension {
132133
// -------------------------------------------------------------------------
133134

134135
/**
135-
* @brief Bind previously-computed blocks to prefill model inputs.
136+
* @brief Load previously-computed KV blocks into prefill model inputs.
136137
*
137138
* Called before each prefill chunk inference so the model can read past KV cache.
138139
*/
139-
void bind_prefill_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
140-
const PortsMap& prefill_in_ports);
140+
void load_past_kv_blocks_to_prefill(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
141+
const PortsMap& prefill_in_ports);
141142

142143
/**
143-
* @brief Pre-bind prefill output ports to fresh blocks (zero-copy optimisation).
144+
* @brief Redirect prefill output ports to fresh blocks before inference (zero-copy optimisation).
144145
*
145146
* Call BEFORE prefill inference when the chunk is block-aligned.
146147
* Model will write directly into blocks — no post-inference copy is needed.
147148
*
148149
* @param num_new_tokens Tokens that will be written by this inference pass.
149-
* @return true if binding was applied (block-aligned); call restore_prefill_outputs()
150+
* @return true if redirect was applied (block-aligned); call restore_prefill_output_buffers()
150151
* before inference when this returns false.
151152
*/
152-
bool bind_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
153-
const PortsMap& prefill_out_ports,
154-
uint32_t num_new_tokens);
153+
bool redirect_prefill_outputs_to_new_blocks(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
154+
const PortsMap& prefill_out_ports,
155+
uint32_t num_new_tokens);
155156

156-
bool prefill_outputs_bound() const {
157-
return m_prefill_outputs_bound_to_blocks;
157+
bool prefill_outputs_redirected() const {
158+
return m_prefill_outputs_redirected;
158159
}
159160

160161
/**
161162
* @brief Restore original output buffers when zero-copy cannot be used.
162163
*
163164
* Must be called before non-aligned prefill chunks to prevent block corruption.
164165
*/
165-
void restore_prefill_outputs(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
166-
const PortsMap& prefill_out_ports);
166+
void restore_prefill_output_buffers(const std::shared_ptr<ov::IAsyncInferRequest>& prefill_request,
167+
const PortsMap& prefill_out_ports);
167168

168169
// -------------------------------------------------------------------------
169170
// Generate path
@@ -177,9 +178,9 @@ class BlockKVCacheExtension {
177178
*
178179
* @param num_stored_tokens Total tokens in KV cache at call time.
179180
*/
180-
void bind_generate_inputs(const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
181-
const PortsMap& kvcache_in_ports,
182-
uint32_t num_stored_tokens);
181+
void init_generate_kv_block_bindings(const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
182+
const PortsMap& kvcache_in_ports,
183+
uint32_t num_stored_tokens);
183184

184185
/**
185186
* @brief Copy prefill outputs into blocks after inference (zero-copy fallback).
@@ -208,12 +209,12 @@ class BlockKVCacheExtension {
208209
* @param new_num_tokens kvcache_desc.num_stored_tokens AFTER this generate step.
209210
* @param input_tokens_len Number of tokens actually generated (usually 1).
210211
*/
211-
void absorb_generate_output_and_rebind(uint32_t old_num_tokens,
212-
uint32_t new_num_tokens,
213-
uint32_t input_tokens_len,
214-
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
215-
const PortsMap& kvcache_out_ports,
216-
const PortsMap& kvcache_in_ports);
212+
void commit_generate_kv_and_rebind(uint32_t old_num_tokens,
213+
uint32_t new_num_tokens,
214+
uint32_t input_tokens_len,
215+
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
216+
const PortsMap& kvcache_out_ports,
217+
const PortsMap& kvcache_in_ports);
217218

218219
private:
219220
// -------------------------------------------------------------------------
@@ -264,7 +265,7 @@ class BlockKVCacheExtension {
264265
bool v_transposed,
265266
uint32_t current_kv_position);
266267

267-
// Re-bind model inputs after a generate step (called by absorb_generate_output_and_rebind).
268+
// Re-bind model inputs after a generate step (called by commit_generate_kv_and_rebind).
268269
void update_generate_bindings(uint32_t old_num_tokens,
269270
uint32_t new_num_tokens,
270271
const std::shared_ptr<ov::IAsyncInferRequest>& kvcache_request,
@@ -287,7 +288,7 @@ class BlockKVCacheExtension {
287288

288289
// Zero-copy prefill output state
289290
std::unordered_map<std::string, ov::SoPtr<ov::ITensor>> m_prefill_original_output_tensors;
290-
bool m_prefill_outputs_bound_to_blocks = false;
291+
bool m_prefill_outputs_redirected = false;
291292
};
292293

293294
} // namespace npuw

0 commit comments

Comments (0)