
Commit 744c69f

Authored by AlexanderKalistratov, Your Name, sbalandi, and Wovchena

Extend using of full chat history mode for stateful pipeline for VLM and LLM with encoded inputs (#2835)

## Description

This PR is mostly based on @sbalandi's PR "Extend using of full chat history mode for stateful pipeline for VLM and LLM with encoded inputs" (#2486) and contains a few additional fixes.

Ticket: CVS-168848, CVS-168079

## Checklist:

- [x] Tests have been updated or added to cover the new code
- [x] This patch fully addresses the ticket.
- [ ] I have made corresponding changes to the documentation

Co-authored-by: Your Name <[email protected]>
Co-authored-by: Sofya Balandina <[email protected]>
Co-authored-by: Vladimir Zlobin <[email protected]>
1 parent 5b05799 commit 744c69f
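For reference, a minimal usage sketch of the chat flow this PR targets, using the public `ov::genai::LLMPipeline` API with pre-tokenized (encoded) inputs. The model path and prompts are placeholders, and the snippet is illustrative rather than the exact test added by this PR:

```cpp
#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder model directory; any exported OpenVINO GenAI LLM works here.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");
    ov::genai::Tokenizer tokenizer = pipe.get_tokenizer();

    ov::genai::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 100;

    pipe.start_chat();
    for (std::string prompt : {"What is OpenVINO?", "How does it relate to GenAI?"}) {
        // Encoded-inputs path: tokenize outside the pipeline, then pass
        // TokenizedInputs to generate(); the stateful pipeline must keep the
        // tokenized chat history consistent across such turns.
        ov::genai::TokenizedInputs input = tokenizer.encode(prompt);
        ov::genai::EncodedResults results = pipe.generate(input, config);
        std::cout << tokenizer.decode(results.tokens[0]) << '\n';
    }
    pipe.finish_chat();
}
```

With full chat history mode, each `generate()` call in this loop re-runs the whole accumulated conversation instead of relying on the KV cache tail, which is what the pipeline_stateful.cpp change below enables for encoded inputs.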

File tree

7 files changed (+172, -73 lines)


src/cpp/src/llm/pipeline_stateful.cpp

Lines changed: 3 additions & 0 deletions

@@ -320,6 +320,9 @@ EncodedResults StatefulLLMPipeline::generate(
 
     size_t real_input_ids_size = input_ids.get_shape().at(1);
 
+    if (is_chat_conversation && m_use_full_chat_history)
+        m_kv_cache_state.reset_state();
+
     // Tail of previous output in chat mode is missing in KV cache.
     if (is_chat_conversation && m_chat_input_type == ov::genai::utils::GenerationChatInputsType::ENCODED_INPUTS) {
         ov::Tensor new_chat_tokens = ov::Tensor{ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data()};
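The added guard makes the stateful LLM pipeline start each chat turn from an empty KV cache when full chat history mode is active, so the whole tokenized history is re-processed rather than only the new tokens. A hedged sketch of the history-tensor construction this relies on (the helper name is illustrative and not part of the codebase; the tensor layout matches `new_chat_tokens` in the hunk above):

```cpp
#include <cstdint>
#include <vector>
#include "openvino/openvino.hpp"

// Illustrative helper (not from the repository): wrap the accumulated chat
// history as a [1, history_len] i64 tensor. After m_kv_cache_state.reset_state()
// the pipeline re-feeds exactly such a tensor, matching the new_chat_tokens
// construction shown in the diff above. The tensor aliases the vector's
// memory, so the vector must outlive it.
ov::Tensor build_full_history_input(std::vector<int64_t>& tokenized_chat_history) {
    return ov::Tensor{ov::element::i64,
                      {1, tokenized_chat_history.size()},
                      tokenized_chat_history.data()};
}
```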

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 2 additions & 2 deletions

(Both hunks are whitespace-only: each removed/added pair differs only in whitespace, which this extraction does not preserve.)

@@ -248,7 +248,7 @@ InputsEmbedder::InputsEmbedder(const std::filesystem::path& model_dir,
     } else if (vlm_config.model_type == VLMModelType::QWEN2_5_VL) {
         m_impl = std::make_shared<InputsEmbedderQwen2_5_VL>(vlm_config, model_dir, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::GEMMA3) {
-        m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, model_dir, device, device_config);
+        m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, model_dir, device, device_config);
     } else {
         OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support");
     }
@@ -282,7 +282,7 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map,
     } else if (vlm_config.model_type == VLMModelType::QWEN2_5_VL) {
         m_impl = std::make_shared<InputsEmbedderQwen2_5_VL>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else if (vlm_config.model_type == VLMModelType::GEMMA3) {
-        m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
+        m_impl = std::make_shared<InputsEmbedderGemma3>(vlm_config, models_map, tokenizer, config_dir_path, device, device_config);
     } else {
         OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support");
     }

src/cpp/src/visual_language/inputs_embedder.hpp

Lines changed: 15 additions & 15 deletions

(All hunks in this file are whitespace-only cleanups: each removed/added pair differs only in trailing whitespace, which this extraction does not preserve.)

@@ -55,7 +55,7 @@ class InputsEmbedder {
     std::pair<ov::Tensor, ov::Tensor> get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector<EncodedImage>& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector<size_t>& image_sequence = {});
 
     bool has_token_type_ids() const;
-
+
     std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images);
 
     std::vector<ov::genai::EncodedVideo> encode_videos(const std::vector<ov::Tensor>& videos);
@@ -81,7 +81,7 @@ class InputsEmbedder {
     // set the apply_chat_template flag, which determines whether chat template should be applied for non-chat scenarios
     void set_apply_chat_template_status(bool apply_chat_template);
 
-    // finishes chat and clears a chat history
+    // finishes chat and clears a chat history
     void finish_chat();
 
     virtual NormlizedPrompt normalize_prompt(
@@ -147,21 +147,21 @@ class InputsEmbedder {
     virtual std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images);
 
     virtual std::vector<ov::genai::EncodedVideo> encode_videos(const std::vector<ov::Tensor>& videos);
-
+
     virtual std::pair<ov::Tensor, std::optional<int64_t>> get_position_ids(const size_t inputs_embeds_size, const size_t history_size);
-
+
     EmbeddingsModel::Ptr get_embedding_model() const {
         return m_embedding;
     }
-
+
     Tokenizer get_tokenizer() const {
         return m_tokenizer;
     }
-
+
     utils::KVCacheState& get_kv_cache_state() {
         return m_kv_cache_state;
    }
-
+
     void set_apply_chat_template_status(bool apply_chat_template) {
         m_apply_chat_template = apply_chat_template;
     }
@@ -170,43 +170,43 @@ class InputsEmbedder {
         m_add_special_tokens = value;
         m_add_special_tokens_is_set = true;
     }
-
+
     virtual void start_chat(const std::string& system_message);
-
+
     virtual void update_chat_history(const std::string& decoded_results, const ov::genai::GenerationStatus generation_finish_status);
-
+
     virtual void finish_chat();
 
     virtual NormlizedPrompt normalize_prompt(
         const std::string& prompt,
         size_t base_id,
         const std::vector<EncodedImage>& images
     ) const = 0;
-
+
     virtual NormlizedPrompt normalize_prompt(
         const std::string& prompt,
         size_t base_image_id,
         size_t base_video_id,
         const std::vector<EncodedImage>& images,
         const std::vector<EncodedVideo>& videos) const;
-
+
 protected:
     IInputsEmbedder(
         const VLMConfig& vlm_config,
         const std::filesystem::path& model_dir,
         const std::string& device,
         const ov::AnyMap device_config);
-
+
     IInputsEmbedder(
         const VLMConfig& vlm_config,
         const ModelsMap& models_map,
         const Tokenizer& tokenizer,
         const std::filesystem::path& config_dir_path,
         const std::string& device,
         const ov::AnyMap device_config);
-
+
     virtual ov::Tensor apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics);
-
+
     ov::Tensor update_history(const ov::Tensor& new_chat_tokens);
 
     ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics);

src/cpp/src/visual_language/minicpm/classes.cpp

Lines changed: 1 addition & 0 deletions

@@ -634,6 +634,7 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& unified_p
     CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());
     EmbeddingsRequest& req = embeddings_request_guard.get();
     ov::Tensor inputs_embeds = m_embedding->infer(req, encoded_input);
+
     OPENVINO_ASSERT(
         m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2),
         "Unexpected embedding size"

src/cpp/src/visual_language/phi4mm/classes.cpp

Lines changed: 27 additions & 27 deletions

(The hunks in this file are whitespace-only cleanups: each removed/added pair differs only in trailing whitespace, which this extraction does not preserve.)

@@ -227,80 +227,80 @@ ov::Tensor calculate_patch_position_ids(
     ov::Shape image_embeds_shape = input_image_embeds.get_shape();
     // image_attention_mask: [batch, num_images, mask_height, mask_width]
     ov::Shape image_attention_mask_shape = image_attention_mask.get_shape();
-
+
     size_t batch_size = image_embeds_shape[0];
     size_t num_images = image_embeds_shape[1];
     size_t mask_height = image_attention_mask_shape[2];
     size_t mask_width = image_attention_mask_shape[3];
-
+
     size_t flattened_batch_size = batch_size * num_images;
     size_t total_mask_elements = mask_height * mask_width;
-
+
     std::vector<float> boundaries;
     boundaries.reserve(num_patches_per_side - 1);
     for (size_t i = 1; i < num_patches_per_side; ++i) {
         boundaries.push_back(static_cast<float>(i) / num_patches_per_side);
     }
-
+
     ov::Tensor position_ids{ov::element::i64, {flattened_batch_size, total_mask_elements}};
     int64_t* position_ids_data = position_ids.data<int64_t>();
-
+
     std::fill_n(position_ids_data, flattened_batch_size * total_mask_elements, 0);
-
+
     const float* image_attention_mask_data = image_attention_mask.data<float>();
-
+
     for (size_t flat_batch_idx = 0; flat_batch_idx < flattened_batch_size; ++flat_batch_idx) {
         size_t mask_offset = flat_batch_idx * mask_height * mask_width;
         const float* current_mask = image_attention_mask_data + mask_offset;
-
+
         size_t num_patches_h = 0;
         size_t num_patches_w = 0;
-
+
         for (size_t h = 0; h < mask_height; ++h) {
             if (current_mask[h * mask_width] > 0.0f) {
                 num_patches_h++;
             }
         }
-
+
         for (size_t w = 0; w < mask_width; ++w) {
             if (current_mask[w] > 0.0f) {
                 num_patches_w++;
             }
         }
-
+
         if (num_patches_h == 0 || num_patches_w == 0) {
             continue;
         }
-
+
         std::vector<float> fractional_coords_h;
         std::vector<float> fractional_coords_w;
         fractional_coords_h.reserve(num_patches_h);
         fractional_coords_w.reserve(num_patches_w);
-
+
         const float eps = 1e-6f;
-
+
         for (size_t i = 0; i < num_patches_h; ++i) {
             float coord = static_cast<float>(i) / num_patches_h;
             if (coord >= 1.0f - eps) {
                 coord = 1.0f - eps;
             }
             fractional_coords_h.push_back(coord);
         }
-
+
         for (size_t i = 0; i < num_patches_w; ++i) {
             float coord = static_cast<float>(i) / num_patches_w;
             if (coord >= 1.0f - eps) {
                 coord = 1.0f - eps;
             }
             fractional_coords_w.push_back(coord);
         }
-
+
         // Bucket coordinates (equivalent to torch.bucketize with right=True)
         std::vector<size_t> bucket_coords_h;
         std::vector<size_t> bucket_coords_w;
         bucket_coords_h.reserve(fractional_coords_h.size());
         bucket_coords_w.reserve(fractional_coords_w.size());
-
+
         for (float coord : fractional_coords_h) {
             size_t bucket = 0;
             for (size_t i = 0; i < boundaries.size(); ++i) {
@@ -312,7 +312,7 @@ ov::Tensor calculate_patch_position_ids(
             }
             bucket_coords_h.push_back(bucket);
         }
-
+
         for (float coord : fractional_coords_w) {
             size_t bucket = 0;
             for (size_t i = 0; i < boundaries.size(); ++i) {
@@ -324,26 +324,26 @@ ov::Tensor calculate_patch_position_ids(
             }
             bucket_coords_w.push_back(bucket);
         }
-
+
         std::vector<int64_t> pos_ids;
         pos_ids.reserve(bucket_coords_h.size() * bucket_coords_w.size());
-
+
         for (size_t h_coord : bucket_coords_h) {
             for (size_t w_coord : bucket_coords_w) {
                 pos_ids.push_back(static_cast<int64_t>(h_coord * num_patches_per_side + w_coord));
             }
         }
-
+
         int64_t* batch_position_ids = position_ids_data + flat_batch_idx * total_mask_elements;
         size_t pos_idx = 0;
-
+
         for (size_t i = 0; i < total_mask_elements && pos_idx < pos_ids.size(); ++i) {
             if (current_mask[i] > 0.0f) {
                 batch_position_ids[i] = pos_ids[pos_idx++];
             }
         }
     }
-
+
     return position_ids;
 }
 
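The bucketing logic above mirrors torch.bucketize with right=True: each fractional patch coordinate is mapped to the count of boundaries not greater than it. A self-contained sketch of that computation, consistent with the surrounding diff context (the inner loop body is elided in the hunks above, so this is an illustration, not the verbatim source):

```cpp
#include <cstddef>
#include <vector>

// Map fractional coordinates in [0, 1) onto patch-grid buckets, equivalent
// to torch.bucketize(coords, boundaries, right=True). boundaries[i] is
// (i + 1) / num_patches_per_side, as built in calculate_patch_position_ids.
std::vector<size_t> bucketize_right(const std::vector<float>& coords,
                                    size_t num_patches_per_side) {
    std::vector<float> boundaries;
    boundaries.reserve(num_patches_per_side - 1);
    for (size_t i = 1; i < num_patches_per_side; ++i) {
        boundaries.push_back(static_cast<float>(i) / num_patches_per_side);
    }

    std::vector<size_t> buckets;
    buckets.reserve(coords.size());
    for (float coord : coords) {
        size_t bucket = 0;
        // right=True: advance past every boundary <= coord, so the result
        // is the number of boundaries not exceeding coord.
        while (bucket < boundaries.size() && coord >= boundaries[bucket]) {
            ++bucket;
        }
        buckets.push_back(bucket);
    }
    return buckets;
}
```

For example, with num_patches_per_side = 4 (boundaries 0.25, 0.5, 0.75), coordinates 0.0, 0.3, and 0.5 map to buckets 0, 1, and 2 respectively.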
@@ -650,7 +650,7 @@ VisionEncoderPhi4MM::VisionEncoderPhi4MM(
     const std::filesystem::path& config_dir_path,
     const std::string& device,
     const ov::AnyMap properties
-) :
+) :
     VisionEncoder(models_map, config_dir_path, device, properties),
     m_image_preprocessors{create_image_preprocessors()},
     m_separator_inserters{create_separator_inserters()} {
@@ -675,7 +675,7 @@ EncodedImage VisionEncoderPhi4MM::encode(const ov::Tensor& image, const ov::AnyM
     CircularBufferQueueElementGuard<ov::InferRequest> lock{m_image_preprocessors.get()};
     ov::InferRequest& image_preprocessor = lock.get();
     image_preprocessor.set_tensor("image", image);
-
+
     ov::Tensor new_size_tensor{ov::element::i64, {2}};
     new_size_tensor.data<int64_t>()[0] = target_sizes.width;
     new_size_tensor.data<int64_t>()[1] = target_sizes.height;
@@ -690,7 +690,7 @@ EncodedImage VisionEncoderPhi4MM::encode(const ov::Tensor& image, const ov::AnyM
     image_preprocessor.set_tensor("padding_height", padding_height_tensor);
 
     image_preprocessor.set_tensor("attention_mask", target_sizes.attention_mask);
-
+
     image_preprocessor.infer();
     image_preprocessor.get_tensor("input_image_embeds").copy_to(input_image_embeds);
     image_preprocessor.get_tensor("image_attention_mask").copy_to(image_attention_mask);
@@ -858,7 +858,7 @@ ov::Tensor InputsEmbedderPhi4MM::get_inputs_embeds(
 }
 
 void InputsEmbedderPhi4MM::update_chat_history(
-    const std::string& decoded_results,
+    const std::string& decoded_results,
     const ov::genai::GenerationStatus generation_finish_status
 ) {
     IInputsEmbedder::update_chat_history(decoded_results, generation_finish_status);
