diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 1c620ad7d..02a4b20b3 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -171,6 +171,8 @@ def main(
     image_path = kwargs.pop("image_path", None)
     image_url = kwargs.pop("image_url", None)
+    iteration = kwargs.pop("iteration", 1)
+    automation = kwargs.pop("automation", False)

     config = qeff_model.model.config
     architecture = config.architectures[0] if config.architectures else None
@@ -234,6 +236,8 @@ def main(
         prompt=prompt,
         prompts_txt_file_path=prompts_txt_file_path,
         generation_len=generation_len,
+        iteration=iteration,
+        automation=automation,
     )
diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py
index fd7ef03ff..b09659deb 100755
--- a/QEfficient/generation/text_generation_inference.py
+++ b/QEfficient/generation/text_generation_inference.py
@@ -320,6 +320,7 @@ def cloud_ai_100_exec_kv(
     stream: bool = True,
     write_io_dir: Optional[str] = None,
     automation=False,
+    iteration: int = 1,
     prompt_to_lora_id_mapping: Optional[List[int]] = None,
     is_tlm: bool = False,
 ):
@@ -341,6 +342,7 @@ def cloud_ai_100_exec_kv(
         :stream (bool): If True, enable streamer, which returns tokens one by one as the model generates them. ``Defaults to True``.
         :Write_io_dir (str): Path to write the input and output files. ``Defaults to None``.
         :automation (bool): If true, it prints input, output, and performance stats. ``Defaults to False``.
+        :iteration (int): Number of iterations to run the inference. ``Defaults to 1``.
         :prompt_to_lora_id_mapping (List[int]): Mapping to associate prompts with their respective LoRA adapter.

     Returns:
@@ -373,30 +375,34 @@ def cloud_ai_100_exec_kv(
         full_batch_size=full_batch_size,
         is_tlm=is_tlm,
     )
-    if full_batch_size is None:
-        exec_info = [
-            generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping)
-            for i in range(0, len(prompt), batch_size)
-        ]
-        prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info])
-        decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info])
-        total_perf = np.average([info.perf_metrics.total_perf for info in exec_info])
-        total_time = np.average([info.perf_metrics.total_time for info in exec_info])
-        generated_texts = [info.generated_texts for info in exec_info]
-        generated_ids = [info.generated_ids for info in exec_info]
-
-        exec_info = CloudAI100ExecInfo(
-            batch_size=batch_size,
-            generated_texts=generated_texts,
-            generated_ids=generated_ids,
-            perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
-        )
-    else:
-        exec_info = generate_text.generate(
-            prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
-        )
-    print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
+    for _ in range(0, int(iteration)):
+        if full_batch_size is None:
+            exec_info = [
+                generate_text.generate(prompt[i : i + batch_size], generation_len, stream, prompt_to_lora_id_mapping)
+                for i in range(0, len(prompt), batch_size)
+            ]
+            prefill_time = np.average([info.perf_metrics.prefill_time for info in exec_info])
+            decode_perf = np.average([info.perf_metrics.decode_perf for info in exec_info])
+            total_perf = np.average([info.perf_metrics.total_perf for info in exec_info])
+            total_time = np.average([info.perf_metrics.total_time for info in exec_info])
+            generated_texts = [info.generated_texts for info in exec_info]
+            generated_ids = [info.generated_ids for info in exec_info]
+
+            exec_info = CloudAI100ExecInfo(
+                batch_size=batch_size,
+                generated_texts=generated_texts,
+                generated_ids=generated_ids,
+                perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
+            )
+        else:
+            exec_info = generate_text.generate(
+                prompt=prompt, generation_len=generation_len, prompt_to_lora_id_mapping=prompt_to_lora_id_mapping
+            )
+
+        print_latency_stats_kv(prompt, exec_info=exec_info, automation=automation)
+
+    # TODO: Need to handle the case where exec_info is given for n iterations

     return exec_info

@@ -822,7 +828,9 @@ def run_continuous_batching_decode(self, prompt_queue, generation_len):

         return decode_pause_time

-    def run_decode(self, decode_inputs, generation_len, streamer: Optional[transformers.TextStreamer] = None):
+    def run_decode(
+        self, decode_inputs, generation_len, automation, streamer: Optional[transformers.TextStreamer] = None
+    ):
         """
         Default method for running decode. Executes the decoding process for a given set of inputs and
         a specified generation length.
@@ -857,11 +865,11 @@ def run_decode(self, decode_inputs, generation_len, streamer: Optional[transform
             self.generated_ids[:, num_token] = decode_inputs["input_ids"][:, -1]
             finished_sequences |= decode_inputs["input_ids"] == self.tokenizer.eos_token_id

-            if finished_sequences.all():
+            if finished_sequences.all() and not automation:
                 break

         return num_token

-    def generate_decode_stream(self, decode_inputs, generation_len):
+    def generate_decode_stream(self, decode_inputs, generation_len, automation):
         """
         Generator method for yielding decode tokens.
         Executes the decoding process for a given set of inputs and a specified generation length.
@@ -889,7 +897,7 @@ def generate_decode_stream(self, decode_inputs, generation_len):
             self.generated_ids[:, num_token] = decode_inputs["input_ids"].squeeze(1)
             finished_sequences |= decode_inputs["input_ids"] == self.tokenizer.eos_token_id

-            if finished_sequences.all():
+            if finished_sequences.all() and not automation:
                 break

         yield decode_inputs["input_ids"]  # yield the last token
@@ -953,6 +961,7 @@ def _regular_model_execution(
         prompt: List[str],
         generation_len: Optional[int] = None,
         stream: Optional[bool] = True,
+        automation: Optional[bool] = False,
         prompt_to_lora_id_mapping: Optional[List[int]] = None,
     ):
         """
@@ -980,7 +989,7 @@
         decode_inputs = self._qaic_model.prepare_decode_inputs()

         loop_start = perf_counter()  # Start decode loop timer
-        num_token = self._qaic_model.run_decode(decode_inputs, generation_len, self._text_streamer)
+        num_token = self._qaic_model.run_decode(decode_inputs, generation_len, automation, self._text_streamer)
         end = perf_counter()
         generated_texts = self._tokenizer.batch_decode(self._qaic_model.generated_ids, skip_special_tokens=True)
@@ -1034,6 +1043,7 @@ def generate_stream_tokens(
         self,
         prompt: List[str],
         generation_len: Optional[int] = None,
+        automation: Optional[bool] = False,
         prompt_to_lora_id_mapping: Optional[List[int]] = None,
     ):
         """
@@ -1063,7 +1073,7 @@
         loop_start = perf_counter()  # Start decode loop timer
         num_token = 0
-        for token_id in self._qaic_model.generate_decode_stream(decode_inputs, generation_len):
+        for token_id in self._qaic_model.generate_decode_stream(decode_inputs, generation_len, automation):
             decoded_tokens = []
             for idx in range(self._qaic_model.batch_size):
                 decoded_tokens.append(self._tokenizer.decode(token_id[idx], skip_special_tokens=True))
@@ -1082,6 +1092,7 @@ def generate(
         prompt: List[str],
         generation_len: Optional[int] = None,
         stream: bool = True,
+        automation: Optional[bool] = False,
         prompt_to_lora_id_mapping: Optional[List[int]] = None,
     ):
         """
@@ -1105,7 +1116,7 @@ def generate(
         if stream:
             print("\nPrompt : " + prompt[0] + "\nCompletion :", flush=True, end="")
         perf_metrics, generated_texts = self._regular_model_execution(
-            prompt, generation_len, stream, prompt_to_lora_id_mapping
+            prompt, generation_len, stream, automation, prompt_to_lora_id_mapping
         )

         if stream:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 2f3ee3dc0..72814565b 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1892,6 +1892,8 @@ def generate(
                 prompt=prompts,
                 device_id=device_id,
                 generation_len=generation_len,
+                automation=kwargs.pop("automation", False),
+                iteration=kwargs.pop("iteration", 1),
                 is_tlm=self.is_tlm,
             )
         else:
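
For reviewers, a minimal sketch of how the new `iteration` and `automation` knobs can be exercised through `cloud_ai_100_exec_kv`. Only those two keyword arguments come from this diff; the tokenizer, QPC path, prompt, and generation length below are placeholder values, and the remaining argument names are assumed to match the existing signature:

```python
from transformers import AutoTokenizer

from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv

# Placeholder tokenizer and compiled-QPC path; substitute whatever model you compiled.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
qpc_path = "/path/to/compiled/qpcs"

exec_info = cloud_ai_100_exec_kv(
    tokenizer=tokenizer,
    qpc_path=qpc_path,
    prompt=["My name is"],
    generation_len=32,
    iteration=3,      # new: repeat the full generation loop 3 times
    automation=True,  # new: print stats each iteration and skip the EOS early exit
)
print(exec_info)
```

Note that, per the TODO added in this diff, the returned `exec_info` reflects only the last iteration; per-iteration numbers are printed by `print_latency_stats_kv` inside the loop.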