@@ -104,6 +104,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
104104 timer.end ();
105105 }
106106
107+ // TODO Consider moving to method and reuse
107108 std::vector<EncodedGenerationResult> encoded = generate (input_ids, sampling_params, streamer);
108109
109110 std::vector<GenerationResult> decoded;
@@ -149,6 +150,81 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
149150 return decoded;
150151}
151152
153+ std::vector<GenerationResult>
154+ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate (
155+ const std::vector<ChatHistory>& histories,
156+ const std::vector<ov::genai::GenerationConfig>& sampling_params,
157+ const StreamerVariant& streamer
158+ ) {
159+ // TODO Enable chat history input for embeddings models.
160+ OPENVINO_ASSERT (m_model_input_type == ModelInputType::TOKENS, " Chat history input is not supported for embeddings models." );
161+
162+ OPENVINO_ASSERT (histories.size () == sampling_params.size (), " Number of histories must match sampling params" );
163+ OPENVINO_ASSERT (!m_tokenizer.get_chat_template ().empty (), " Chat template must not be empty when using ChatHistory in generate method." );
164+
165+ auto start_time = std::chrono::steady_clock::now ();
166+
167+ std::vector<ov::Tensor> input_ids;
168+ input_ids.reserve (histories.size ());
169+
170+ std::vector<MicroSeconds> tokenization_durations;
171+ static ManualTimer timer (" tokenize" );
172+ timer.start ();
173+
174+ for (size_t i = 0 ; i < histories.size (); i++) {
175+ OPENVINO_ASSERT (sampling_params[i].apply_chat_template , " Chat template must be applied when using ChatHistory in generate method." );
176+ OPENVINO_ASSERT (!histories[i].empty (), " Chat history must not be empty when using ChatHistory in generate method." );
177+ const auto encode_start = std::chrono::steady_clock::now ();
178+ constexpr bool add_generation_prompt = true ;
179+ std::string templated_history = m_tokenizer.apply_chat_template (histories[i], add_generation_prompt);
180+ input_ids.push_back (
181+ m_tokenizer.encode (templated_history, add_special_tokens (false )).input_ids
182+ );
183+ tokenization_durations.emplace_back (PerfMetrics::get_microsec (std::chrono::steady_clock::now () - encode_start));
184+ }
185+
186+ timer.end ();
187+
188+ std::vector<EncodedGenerationResult> encoded_results = generate (input_ids, sampling_params, streamer);
189+
190+ std::vector<GenerationResult> decoded_results;
191+ decoded_results.reserve (encoded_results.size ());
192+ for (size_t i = 0 ; i < encoded_results.size (); ++i) {
193+ EncodedGenerationResult encoded_result = encoded_results[i];
194+
195+ auto & perf_metrics = encoded_result.perf_metrics ;
196+ auto & raw_counters = perf_metrics.raw_metrics ;
197+ raw_counters.tokenization_durations .emplace_back (tokenization_durations[i]);
198+
199+ std::vector<std::string> decoded_outputs;
200+ decoded_outputs.reserve (encoded_result.m_generation_ids .size ());
201+ for (size_t idx = 0 ; idx < encoded_result.m_generation_ids .size (); ++idx) {
202+ const auto decode_start = std::chrono::steady_clock::now ();
203+ decoded_outputs.push_back (m_tokenizer.decode (encoded_result.m_generation_ids .at (idx)));
204+
205+ raw_counters.detokenization_durations .emplace_back (std::chrono::steady_clock::now () - decode_start);
206+ }
207+
208+ // The same perf metrics for each sequence, only tokenization/detokenization will differ.
209+ perf_metrics.raw_metrics .generate_durations .clear ();
210+ perf_metrics.raw_metrics .generate_durations .emplace_back (PerfMetrics::get_microsec (std::chrono::steady_clock::now () - start_time));
211+ // Reevaluate taking into accound tokenization/detokenization times.
212+ perf_metrics.m_evaluated = false ;
213+ perf_metrics.evaluate_statistics (start_time);
214+
215+ decoded_results.push_back (GenerationResult{
216+ encoded_result.m_request_id ,
217+ std::move (decoded_outputs),
218+ std::move (encoded_result.m_scores ),
219+ encoded_result.m_status ,
220+ std::move (perf_metrics),
221+ std::move (encoded_result.extended_perf_metrics )
222+ });
223+ }
224+
225+ return decoded_results;
226+ }
227+
152228std::vector<VLMDecodedResults>
153229ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate (
154230 const std::vector<std::string>& prompts,
0 commit comments