@@ -16,8 +16,9 @@
 from utils.generation_config import get_greedy, get_beam_search, \
     get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \
     get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p
-from utils.hugging_face import download_and_convert_model
-from utils.ov_genai_pipelines import create_ov_pipeline, create_ov_cb_pipeline, PipelineType, dict_to_scheduler_config, generate_and_compare, prepare_generation_config_by_pipe_type, GenerationChatInputsType
+from utils.hugging_face import download_and_convert_model, run_hugging_face
+from utils.ov_genai_pipelines import create_ov_pipeline, create_ov_cb_pipeline, PipelineType, dict_to_scheduler_config, generate_and_compare, prepare_generation_config_by_pipe_type, convert_decoded_results_to_generation_result, GenerationChatInputsType
+from utils.comparation import compare_generation_results
 from data.models import get_chat_models_list
 from data.test_dataset import get_test_dataset
 
@@ -489,22 +490,45 @@ def get_data_by_pipeline_type(model_path: Path, pipeline_type: str, generation_c
     return pipe, prompt, generation_config
 
 
-def run_extended_perf_metrics_collection(model_id, generation_config: GenerationConfig, prompt: str, pipeline_type: PipelineType):
+def run_extended_perf_metrics_collection(model_id, generation_config: GenerationConfig, prompt: str, pipeline_type: PipelineType, draft_model_id):
     _, _, model_path = download_and_convert_model(model_id)
-    ov_pipe = create_ov_pipeline(model_path, pipeline_type=pipeline_type)
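+    # Convert the optional draft model; with no draft, the pipeline is created with draft_model_path=None.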
+    draft_model_path = None
+    if draft_model_id is not None:
+        _, _, draft_model_path = download_and_convert_model(draft_model_id)
+    ov_pipe = create_ov_pipeline(model_path, pipeline_type=pipeline_type, draft_model_path=draft_model_path)
     return ov_pipe.generate([prompt], generation_config).extended_perf_metrics
 
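+# (main model, EAGLE3 draft model, prompt) triples for the EAGLE3 speculative-decoding tests.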
+eagle_models_and_input = [
+    ("Qwen/Qwen3-1.7B", "AngelSlim/Qwen3-1.7B_eagle3", """Code:
+def add(a, b):
+    return a + b
 
+Question: Can you please add 2 and 3
+A:""")]
+
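+# Cases for extended perf metrics: the first runs without an explicit draft model, the second reuses the EAGLE3 pair above.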
+speculative_cases = [
+    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", None, "Why is the Sun yellow?"),
+    eagle_models_and_input[0],
+]
 @pytest.mark.parametrize("pipeline_type", [PipelineType.PAGED_ATTENTION, PipelineType.SPECULATIVE_DECODING])
+@pytest.mark.parametrize("main_model_id,draft_model_id,prompt", speculative_cases)
 @pytest.mark.precommit
-def test_speculative_decoding_extended_perf_metrics(pipeline_type):
+def test_speculative_decoding_extended_perf_metrics(pipeline_type, main_model_id, draft_model_id, prompt):
     import time
     start_time = time.perf_counter()
-    model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-    generation_config = GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
-    extended_perf_metrics = run_extended_perf_metrics_collection(model_id, generation_config, "Why is the Sun yellow?", pipeline_type)
-    total_time = (time.perf_counter() - start_time) * 1000
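+    # The draft-free case runs on every pipeline type; the EAGLE3 case only applies to the speculative-decoding pipeline.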
+    extended_perf_metrics = None
+    if draft_model_id is None or pipeline_type == PipelineType.SPECULATIVE_DECODING:
+        generation_config = GenerationConfig(do_sample=False, max_new_tokens=20, ignore_eos=True, num_assistant_tokens=5)
+        extended_perf_metrics = run_extended_perf_metrics_collection(main_model_id, generation_config, prompt, pipeline_type, draft_model_id)
+        total_time = (time.perf_counter() - start_time) * 1000
 
     if (pipeline_type == PipelineType.SPECULATIVE_DECODING):
         assert not extended_perf_metrics is None
         assert not extended_perf_metrics.main_model_metrics is None
@@ -542,3 +566,31 @@ def test_speculative_decoding_extended_perf_metrics(pipeline_type):
         assert std_gen_duration == 0
     else:
         assert extended_perf_metrics is None
+
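+# (main_device, draft_device) pairs; only CPU/CPU is exercised for now.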
+devices = [
+    ('CPU', 'CPU')
+]
+@pytest.mark.parametrize("main_model,draft_model,prompt", eagle_models_and_input)
+@pytest.mark.parametrize("main_device,draft_device", devices)
+@pytest.mark.precommit
+def test_eagle3_sd_string_inputs(main_model, main_device, draft_model, draft_device, prompt):
+    # Download and convert the main and draft models:
+    main_opt_model, main_hf_tokenizer, main_model_path = download_and_convert_model(main_model)
+    __, __, draft_model_path = download_and_convert_model(draft_model)
+
+    # Create OpenVINO GenAI pipeline:
+    ov_pipe = create_ov_pipeline(main_model_path, pipeline_type=PipelineType.SPECULATIVE_DECODING, draft_model_path=draft_model_path)
+
+    # Run reference HF model:
+    ov_generation_config = GenerationConfig(max_new_tokens=20)
+    ref_gen_results = run_hugging_face(main_opt_model, main_hf_tokenizer, [prompt], ov_generation_config)
+
+    # Run OpenVINO GenAI pipeline:
+    ov_decoded_results = ov_pipe.generate([prompt], ov_generation_config)
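+    # Wrap the decoded strings as GenerationResult objects so they can be compared against the HF reference.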
+    ov_gen_results = convert_decoded_results_to_generation_result(ov_decoded_results, 1, 1, False)
+
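+    # Drop the pipeline to release plugin resources before comparing results.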
+    del ov_pipe
+
+    # Compare results:
+    compare_generation_results([prompt], ref_gen_results, ov_gen_results, ov_generation_config)