
Commit 6b728f5

Merge branch 'master' into b_whisper_unification
2 parents bd178b4 + 5fd9984 commit 6b728f5

28 files changed: +812 −146 lines

src/cpp/include/openvino/genai/cache_eviction.hpp

Lines changed: 44 additions & 0 deletions
@@ -4,6 +4,8 @@
 #pragma once
 
 #include <cstddef>
+#include <unordered_map>
+#include <sstream>
 
 #include "openvino/core/except.hpp"
 

@@ -66,6 +68,26 @@ class KVCrushConfig {
     std::size_t get_budget() const {
         return budget;
     }
+
+    std::string to_string() const {
+        static const std::unordered_map<KVCrushAnchorPointMode, std::string> kv_crush_anchor_point_mode_to_string = {
+            {KVCrushAnchorPointMode::RANDOM, "RANDOM"},
+            {KVCrushAnchorPointMode::ZEROS, "ZEROS"},
+            {KVCrushAnchorPointMode::ONES, "ONES"},
+            {KVCrushAnchorPointMode::MEAN, "MEAN"},
+            {KVCrushAnchorPointMode::ALTERNATE, "ALTERNATE"},
+        };
+
+        std::ostringstream oss;
+        oss << "KVCrushConfig { " << "\n";
+        oss << " budget: " << budget << "\n";
+        oss << " rng_seed: " << rng_seed << "\n";
+        if (kv_crush_anchor_point_mode_to_string.count(anchor_point_mode) > 0) {
+            oss << " anchor_point_mode: " << kv_crush_anchor_point_mode_to_string.at(anchor_point_mode) << "\n";
+        }
+        oss << " }";
+        return oss.str();
+    }
 };
 
 /**

@@ -122,6 +144,28 @@ class CacheEvictionConfig {
         return m_evictable_size;
     }
 
+    std::string to_string() const {
+        static const std::unordered_map<AggregationMode, std::string> aggregation_mode_to_string = {
+            {AggregationMode::SUM, "SUM"},
+            {AggregationMode::NORM_SUM, "NORM_SUM"},
+        };
+
+        std::ostringstream oss;
+        oss << "CacheEvictionConfig { " << "\n";
+        oss << " start_size: " << m_start_size << "\n";
+        oss << " recent_size: " << m_recent_size << "\n";
+        oss << " max_cache_size: " << m_max_cache_size << "\n";
+        oss << " evictable_size: " << m_evictable_size << "\n";
+        if (aggregation_mode_to_string.count(aggregation_mode) > 0) {
+            oss << " aggregation_mode: " << aggregation_mode_to_string.at(aggregation_mode) << "\n";
+        }
+        oss << " apply_rotation: " << std::boolalpha << apply_rotation << "\n";
+        oss << " snapkv_window_size: " << snapkv_window_size << "\n";
+        oss << kvcrush_config.to_string() << "\n";
+        oss << " }";
+        return oss.str();
+    }
+
     /** The mode used to compute the importance of tokens for eviction */
     AggregationMode aggregation_mode = AggregationMode::NORM_SUM;
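The new to_string() helpers all follow the same idiom: a static map from enum values to display names, guarded by a count() check so an unmapped value is simply omitted rather than throwing. A small self-contained sketch of that idiom (the Color enum below is illustrative, not part of the library):

#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

enum class Color { RED, GREEN, BLUE };

// Same idiom as KVCrushConfig::to_string()/CacheEvictionConfig::to_string():
// a static lookup table plus a count() guard so unknown values are skipped.
std::string color_to_string(Color c) {
    static const std::unordered_map<Color, std::string> names = {
        {Color::RED, "RED"},
        {Color::GREEN, "GREEN"},
        // BLUE intentionally omitted to show the guarded lookup.
    };
    std::ostringstream oss;
    oss << "Color { ";
    if (names.count(c) > 0) {
        oss << names.at(c);
    }
    oss << " }";
    return oss.str();
}

int main() {
    std::cout << color_to_string(Color::RED) << "\n";   // prints: Color { RED }
    std::cout << color_to_string(Color::BLUE) << "\n";  // prints: Color {  }
}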

src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp

Lines changed: 6 additions & 0 deletions
@@ -175,6 +175,12 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
     /// Higher level interface, which can process multiple prompts in continuous batching manner
     std::vector<EncodedGenerationResult> generate(const std::vector<ov::Tensor>& input_ids, const std::vector<ov::genai::GenerationConfig>& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{});
     std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{});
+
+    std::vector<GenerationResult> generate(
+        const std::vector<ChatHistory>& histories,
+        const std::vector<ov::genai::GenerationConfig>& sampling_params,
+        const ov::genai::StreamerVariant& streamer=std::monostate{});
+
     std::vector<VLMDecodedResults> generate(
         const std::vector<std::string>& prompts,
         const std::vector<std::vector<ov::Tensor>>& images,
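A minimal usage sketch of the new batched overload. The model path and prompts are placeholders, and ChatHistory is assumed here to be the usual list of role/content message maps:

#include <iostream>
#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    ov::genai::SchedulerConfig scheduler_config;
    ov::genai::ContinuousBatchingPipeline pipe("path/to/model", scheduler_config, "CPU");

    // One chat history per request in the batch (assumed role/content message maps).
    std::vector<ov::genai::ChatHistory> histories(2);
    histories[0].push_back({{"role", "user"}, {"content", "What is OpenVINO?"}});
    histories[1].push_back({{"role", "user"}, {"content", "What is continuous batching?"}});

    // One GenerationConfig per history, same as the prompt-based overloads.
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;
    std::vector<ov::genai::GenerationConfig> configs(2, config);

    auto results = pipe.generate(histories, configs);
    for (const auto& result : results) {
        std::cout << result.m_generation_ids[0] << "\n";
    }
}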

src/cpp/include/openvino/genai/llm_pipeline.hpp

Lines changed: 51 additions & 0 deletions
@@ -233,6 +233,57 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         return generate(inputs, AnyMap{std::forward<Properties>(properties)...});
     }
 
+    /**
+     * @brief High level generate that receives ChatHistory and returns decoded output.
+     *
+     * @param history ChatHistory with messages
+     * @param generation_config optional GenerationConfig
+     * @param streamer optional streamer
+     * @return DecodedResults decoded resulting text
+     *
+     * Chat template will be applied to the prompt, run `pipe.get_tokenizer().set_chat_template(custom_chat_template)` to update it.
+     * To disable chat template set `generation_config.apply_chat_template` to `false`.
+     */
+    DecodedResults generate(
+        const ChatHistory& history,
+        OptionalGenerationConfig generation_config = std::nullopt,
+        StreamerVariant streamer=std::monostate()
+    );
+
+    /**
+     * @brief High level generate that receives ChatHistory and returns decoded output.
+     * Properties can be in any order pipe.generate(..., ov::genai::max_new_tokens(100), ov::genai::streamer(lambda_func)).
+     *
+     * @param history ChatHistory with messages
+     * @param properties properties
+     * @return DecodedResults decoded resulting text
+     *
+     * Chat template will be applied to the prompt, run `pipe.get_tokenizer().set_chat_template(custom_chat_template)` to update it.
+     * To disable chat template set `generation_config.apply_chat_template` to `false`.
+     */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<DecodedResults, Properties...> generate(
+        const ChatHistory& history,
+        Properties&&... properties) {
+        return generate(history, AnyMap{std::forward<Properties>(properties)...});
+    }
+    DecodedResults generate(const ChatHistory& history, const ov::AnyMap& config_map);
+
+    DecodedResults operator()(
+        const ChatHistory& history,
+        OptionalGenerationConfig generation_config = std::nullopt,
+        StreamerVariant streamer=std::monostate()
+    ) {
+        return generate(history, generation_config, streamer);
+    }
+
+    template <typename... Properties>
+    util::EnableIfAllStringAny<DecodedResults, Properties...> operator()(
+        const ChatHistory& history,
+        Properties&&... properties) {
+        return generate(history, AnyMap{std::forward<Properties>(properties)...});
+    }
+
     /**
      * @brief Low level generate to be called with already encoded input_ids tokens.
      * Streamer cannot be used for multibatch inputs.
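A usage sketch of the new LLMPipeline overloads. The model path is a placeholder, and ChatHistory is assumed to be the usual list of role/content message maps:

#include <iostream>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    ov::genai::LLMPipeline pipe("path/to/chat_model", "CPU");

    // Assumed role/content message maps.
    ov::genai::ChatHistory history;
    history.push_back({{"role", "system"}, {"content", "You are a helpful assistant."}});
    history.push_back({{"role", "user"},   {"content", "What is OpenVINO?"}});

    // The tokenizer's chat template is applied to the history before generation;
    // properties can be passed the same way as for the string-based generate().
    ov::genai::DecodedResults result = pipe.generate(history, ov::genai::max_new_tokens(100));
    std::cout << result.texts[0] << std::endl;

    // operator() is a shorthand for generate().
    ov::genai::DecodedResults shorter = pipe(history, ov::genai::max_new_tokens(50));
    std::cout << shorter.texts[0] << std::endl;
}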

src/cpp/include/openvino/genai/scheduler_config.hpp

Lines changed: 29 additions & 0 deletions
@@ -4,6 +4,7 @@
 #pragma once
 
 #include <cstddef>
+#include <sstream>
 
 #include "openvino/genai/cache_eviction.hpp"
 #include "openvino/genai/sparse_attention.hpp"

@@ -73,5 +74,33 @@ struct SchedulerConfig {
                dynamic_split_fuse == other.dynamic_split_fuse && use_cache_eviction == other.use_cache_eviction &&
                max_num_seqs == other.max_num_seqs && enable_prefix_caching == other.enable_prefix_caching;
     }
+
+    /**
+     * Returns a human-readable string representation of the SchedulerConfig.
+     * The output is a multi-line string listing each configuration field and its value.
+     * This is useful for debugging, logging, or inspecting the current configuration.
+     *
+     * @return A string describing the current SchedulerConfig in a readable format.
+     */
+    std::string to_string() const {
+        std::ostringstream oss;
+        oss << "SchedulerConfig { \n";
+        oss << " max_num_batched_tokens: " << max_num_batched_tokens << "\n";
+        oss << " num_kv_blocks: " << num_kv_blocks << "\n";
+        oss << " cache_size: " << cache_size << "\n";
+        oss << " dynamic_split_fuse: " << std::boolalpha << dynamic_split_fuse << "\n";
+        oss << " use_cache_eviction: " << std::boolalpha << use_cache_eviction << "\n";
+        if (use_cache_eviction) {
+            oss << cache_eviction_config.to_string() << "\n";
+        }
+        oss << " max_num_seqs: " << max_num_seqs << "\n";
+        oss << " enable_prefix_caching: " << std::boolalpha << enable_prefix_caching << "\n";
+        oss << " use_sparse_attention: " << std::boolalpha << use_sparse_attention << "\n";
+        if (use_sparse_attention) {
+            oss << sparse_attention_config.to_string() << "\n";
+        }
+        oss << " }";
+        return oss.str();
+    }
 };
 }
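A short sketch of how the new to_string() can be used for logging; the field values below are illustrative:

#include <iostream>
#include "openvino/genai/scheduler_config.hpp"

int main() {
    ov::genai::SchedulerConfig config;
    config.max_num_batched_tokens = 256;
    config.cache_size = 2;               // KV-cache size (GB)
    config.use_cache_eviction = true;    // nested CacheEvictionConfig is included in the dump
    config.enable_prefix_caching = false;

    // Prints the multi-line "SchedulerConfig { ... }" representation shown above.
    std::cout << config.to_string() << std::endl;
}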

src/cpp/include/openvino/genai/sparse_attention.hpp

Lines changed: 38 additions & 0 deletions
@@ -4,6 +4,8 @@
 #pragma once
 
 #include <cstddef>
+#include <unordered_map>
+#include <sstream>
 
 namespace ov::genai {

@@ -79,6 +81,42 @@ class SparseAttentionConfig {
      * M time to be calculated, then the importance score calculation would be taking `M / xattention_stride` time as
      * overhead. */
     size_t xattention_stride = 8;
+
+    /**
+     * @brief Returns a string representation of the SparseAttentionConfig.
+     *
+     * The returned string contains the values of all configuration fields in a human-readable format, e.g.:
+     * SparseAttentionConfig {
+     *   sparseAttentionMode: TRISHAPE
+     *   num_last_dense_tokens_in_prefill: 100
+     *   num_retained_start_tokens_in_cache: 128
+     *   num_retained_recent_tokens_in_cache: 1920
+     *   xattention_threshold: 0.8
+     *   xattention_block_size: 64
+     *   xattention_stride: 8
+     * }
+     *
+     * @return A string describing the current configuration.
+     */
+    std::string to_string() const {
+        static const std::unordered_map<SparseAttentionMode, std::string> sparse_attention_mode_to_string = {
+            {SparseAttentionMode::TRISHAPE, "TRISHAPE"},
+            {SparseAttentionMode::XATTENTION, "XATTENTION"},
+        };
+        std::ostringstream oss;
+        oss << "SparseAttentionConfig { " << "\n";
+        if (sparse_attention_mode_to_string.count(mode) > 0) {
+            oss << " sparseAttentionMode: " << sparse_attention_mode_to_string.at(mode) << "\n";
+        }
+        oss << " num_last_dense_tokens_in_prefill: " << num_last_dense_tokens_in_prefill << "\n";
+        oss << " num_retained_start_tokens_in_cache: " << num_retained_start_tokens_in_cache << "\n";
+        oss << " num_retained_recent_tokens_in_cache: " << num_retained_recent_tokens_in_cache << "\n";
+        oss << " xattention_threshold: " << xattention_threshold << "\n";
+        oss << " xattention_block_size: " << xattention_block_size << "\n";
+        oss << " xattention_stride: " << xattention_stride << "\n";
+        oss << " }";
+        return oss.str();
+    }
 };
 
 } // namespace ov::genai
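The sparse-attention counterpart, assuming the mode member read by to_string() is publicly settable (as the other SparseAttentionConfig fields shown above are); the values are illustrative:

#include <iostream>
#include "openvino/genai/scheduler_config.hpp"

int main() {
    ov::genai::SchedulerConfig config;
    config.use_sparse_attention = true;
    config.sparse_attention_config.mode = ov::genai::SparseAttentionMode::TRISHAPE;
    config.sparse_attention_config.num_last_dense_tokens_in_prefill = 100;

    // SchedulerConfig::to_string() includes the nested SparseAttentionConfig
    // only when use_sparse_attention is set.
    std::cout << config.to_string() << std::endl;
}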

src/cpp/src/continuous_batching/pipeline.cpp

Lines changed: 23 additions & 0 deletions
@@ -57,6 +57,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
         embedder = std::make_shared<InputsEmbedder>(models_path, device, vision_encoder_properties);
     }
 
+    utils::print_scheduler_config_info(scheduler_config);
+
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");

@@ -97,6 +99,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
         embedder = std::make_shared<InputsEmbedder>(models_path, device, properties_without_draft_model_without_gguf);
     }
 
+    utils::print_scheduler_config_info(scheduler_config);
+
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");

@@ -140,6 +144,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
         }
     }
 
+    utils::print_scheduler_config_info(scheduler_config);
+
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");

@@ -188,6 +194,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
         }
     }
 
+    utils::print_scheduler_config_info(scheduler_config);
+
     if (is_prompt_lookup_enabled) {
         OPENVINO_ASSERT(draft_model_desr.model == nullptr, "Speculative decoding and prompt lookup decoding are mutually exclusive");
         OPENVINO_ASSERT(embedder == nullptr, "Prompt lookup decoding is not supported for models with embeddings");

@@ -265,6 +273,21 @@ std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::ve
     return decoded_results;
 }
 
+std::vector<GenerationResult> ContinuousBatchingPipeline::generate(
+    const std::vector<ChatHistory>& histories,
+    const std::vector<ov::genai::GenerationConfig>&
+        sampling_params,
+    const StreamerVariant& streamer
+) {
+    auto decoded_results = m_impl->generate(histories, sampling_params, streamer);
+
+    for (auto& decoded_result : decoded_results) {
+        decoded_result.perf_metrics.load_time = m_impl->m_load_time_ms;
+    }
+
+    return decoded_results;
+}
+
 std::vector<VLMDecodedResults> ContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& images,
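utils::print_scheduler_config_info is not shown in this excerpt; a plausible sketch of such a helper, assuming it simply writes the SchedulerConfig::to_string() output to the log (the name, namespace, and logging target below are assumptions, not the actual implementation):

// Hypothetical sketch -- the real utils::print_scheduler_config_info is defined
// elsewhere in the repository and may differ.
#include <iostream>
#include "openvino/genai/scheduler_config.hpp"

namespace ov::genai::utils {

inline void print_scheduler_config_info(const SchedulerConfig& scheduler_config) {
    // Dump the full configuration once at pipeline construction time so the
    // effective scheduler settings show up in the logs.
    std::cout << scheduler_config.to_string() << std::endl;
}

}  // namespace ov::genai::utils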

src/cpp/src/continuous_batching/pipeline_base.cpp

Lines changed: 76 additions & 0 deletions
@@ -104,6 +104,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
         timer.end();
     }
 
+    // TODO: Consider moving this to a method and reusing it
     std::vector<EncodedGenerationResult> encoded = generate(input_ids, sampling_params, streamer);
 
     std::vector<GenerationResult> decoded;

@@ -149,6 +150,81 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     return decoded;
 }
 
+std::vector<GenerationResult>
+ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
+    const std::vector<ChatHistory>& histories,
+    const std::vector<ov::genai::GenerationConfig>& sampling_params,
+    const StreamerVariant& streamer
+) {
+    // TODO: Enable chat history input for embeddings models.
+    OPENVINO_ASSERT(m_model_input_type == ModelInputType::TOKENS, "Chat history input is not supported for embeddings models.");
+
+    OPENVINO_ASSERT(histories.size() == sampling_params.size(), "Number of histories must match sampling params");
+    OPENVINO_ASSERT(!m_tokenizer.get_chat_template().empty(), "Chat template must not be empty when using ChatHistory in generate method.");
+
+    auto start_time = std::chrono::steady_clock::now();
+
+    std::vector<ov::Tensor> input_ids;
+    input_ids.reserve(histories.size());
+
+    std::vector<MicroSeconds> tokenization_durations;
+    static ManualTimer timer("tokenize");
+    timer.start();
+
+    for (size_t i = 0; i < histories.size(); i++) {
+        OPENVINO_ASSERT(sampling_params[i].apply_chat_template, "Chat template must be applied when using ChatHistory in generate method.");
+        OPENVINO_ASSERT(!histories[i].empty(), "Chat history must not be empty when using ChatHistory in generate method.");
+        const auto encode_start = std::chrono::steady_clock::now();
+        constexpr bool add_generation_prompt = true;
+        std::string templated_history = m_tokenizer.apply_chat_template(histories[i], add_generation_prompt);
+        input_ids.push_back(
+            m_tokenizer.encode(templated_history, add_special_tokens(false)).input_ids
+        );
+        tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start));
+    }
+
+    timer.end();
+
+    std::vector<EncodedGenerationResult> encoded_results = generate(input_ids, sampling_params, streamer);
+
+    std::vector<GenerationResult> decoded_results;
+    decoded_results.reserve(encoded_results.size());
+    for (size_t i = 0; i < encoded_results.size(); ++i) {
+        EncodedGenerationResult encoded_result = encoded_results[i];
+
+        auto& perf_metrics = encoded_result.perf_metrics;
+        auto& raw_counters = perf_metrics.raw_metrics;
+        raw_counters.tokenization_durations.emplace_back(tokenization_durations[i]);
+
+        std::vector<std::string> decoded_outputs;
+        decoded_outputs.reserve(encoded_result.m_generation_ids.size());
+        for (size_t idx = 0; idx < encoded_result.m_generation_ids.size(); ++idx) {
+            const auto decode_start = std::chrono::steady_clock::now();
+            decoded_outputs.push_back(m_tokenizer.decode(encoded_result.m_generation_ids.at(idx)));
+
+            raw_counters.detokenization_durations.emplace_back(std::chrono::steady_clock::now() - decode_start);
+        }
+
+        // The same perf metrics for each sequence, only tokenization/detokenization will differ.
+        perf_metrics.raw_metrics.generate_durations.clear();
+        perf_metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - start_time));
+        // Re-evaluate, taking into account tokenization/detokenization times.
+        perf_metrics.m_evaluated = false;
+        perf_metrics.evaluate_statistics(start_time);
+
+        decoded_results.push_back(GenerationResult{
+            encoded_result.m_request_id,
+            std::move(decoded_outputs),
+            std::move(encoded_result.m_scores),
+            encoded_result.m_status,
+            std::move(perf_metrics),
+            std::move(encoded_result.extended_perf_metrics)
+        });
+    }
+
+    return decoded_results;
+}
+
 std::vector<VLMDecodedResults>
 ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
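The per-history tokenization above (apply the chat template, then encode without adding special tokens) can also be done standalone with the Tokenizer API. A small sketch, where the model path and message contents are placeholders and ChatHistory is assumed to be the usual role/content message maps:

#include <iostream>
#include "openvino/genai/tokenizer.hpp"

int main() {
    ov::genai::Tokenizer tokenizer("path/to/model");

    ov::genai::ChatHistory history;
    history.push_back({{"role", "user"}, {"content", "What is OpenVINO?"}});

    // Render the history with the model's chat template, then encode without
    // adding special tokens (the template already contains them), mirroring
    // what the new generate() overload does per request.
    constexpr bool add_generation_prompt = true;
    std::string prompt = tokenizer.apply_chat_template(history, add_generation_prompt);
    ov::Tensor input_ids = tokenizer.encode(prompt, ov::genai::add_special_tokens(false)).input_ids;

    std::cout << "Prompt token count: " << input_ids.get_size() << std::endl;
}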

src/cpp/src/continuous_batching/pipeline_base.hpp

Lines changed: 9 additions & 0 deletions
@@ -148,6 +148,15 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
         const std::vector<GenerationConfig>& sampling_params,
         const StreamerVariant& streamer);
 
+
+    /**
+     * Performs monolithic generation based on ChatHistory objects
+     */
+    std::vector<GenerationResult>
+    generate(const std::vector<ChatHistory>& histories,
+             const std::vector<GenerationConfig>& sampling_params,
+             const StreamerVariant& streamer);
+
     /**
      * Starts chat with a given system prompt
      *
