10 changes: 10 additions & 0 deletions tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -1,3 +1,4 @@
from pathlib import Path
import logging
import json
import torch
@@ -70,6 +71,15 @@ def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwargs):
if kwargs.get('gguf_file'):
pipeline_path = os.path.join(model_dir, kwargs['gguf_file'])

draft_model_path = kwargs.get("draft_model", '')
if draft_model_path:
if not Path(draft_model_path).exists():
raise RuntimeError(f'Error: Draft model path does not exist: {draft_model_path}')
draft_device = kwargs.get('draft_device', None) or device
draft_model_load_kwargs = {'scheduler_config': get_scheduler_config_genai(kwargs["draft_cb_config"])}\
if kwargs["draft_cb_config"] is not None else {}
ov_config['draft_model'] = openvino_genai.draft_model(draft_model_path, draft_device.upper(), **draft_model_load_kwargs)

is_continuous_batching = kwargs.get("cb_config", None) is not None

if is_continuous_batching:
11 changes: 6 additions & 5 deletions tools/who_what_benchmark/whowhatbench/text_evaluator.py
@@ -133,7 +133,9 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
return res

def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question, use_chat_template=False):
def default_gen_answer(model, tokenizer, prompt, gen_config, crop_question):
max_new_tokens = gen_config.max_new_tokens
use_chat_template = gen_config.apply_chat_template
is_awq = getattr(model, "is_awq", None) is not None
device = "cpu"
if hasattr(model, "device"):
@@ -184,16 +186,15 @@ def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question,
else prompt_data.values[: self.num_samples]
)

if generation_config is None:
if self.seqs_per_request == 1:
for p in tqdm(prompts, desc="Evaluate pipeline"):
answers.append(
gen_answer_fn(
model,
self.tokenizer,
p,
self.max_new_tokens,
self._crop_question,
self.use_chat_template
generation_config,
self._crop_question
)
)
else:
65 changes: 62 additions & 3 deletions tools/who_what_benchmark/whowhatbench/wwb.py
@@ -232,6 +232,35 @@ def parse_args():
"If the base/target model is a local path, gguf-file should be just the filename (e.g., 'model.gguf'). "
"If the base/target model is a HuggingFace model ID, gguf-file should be a relative path.",
)
parser.add_argument(
"--draft-model",
default=None,
help="Path to draft model folder including IR files for Speculative decoding generation.",
)
parser.add_argument(
"--draft-device",
type=str,
default=None,
help="Inference device for Speculative decoding of draft model, e.g. 'CPU', 'GPU'.",
)
parser.add_argument(
"--draft-cb-config",
type=str,
default=None,
help="Path to file with Continuous Batching Scheduler settings or dict for Speculative decoding of draft model",
)
parser.add_argument(
"--num-assistant-tokens",
type=int,
default=None,
help="Config option num_assistant_tokens for Speculative decoding and Prompt Lookup decoding.",
)
parser.add_argument(
"--assistant-confidence-threshold",
type=float,
default=None,
help="Config option assistant_confidence_threshold for Speculative decoding.",
)

return parser.parse_args()
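Taken together these flags extend the existing wwb command line. A hypothetical invocation, assuming the pre-existing --base-model/--target-model/--gt-data/--genai options and placeholder model paths and file names (none of these values come from this diff), might look like:

wwb --base-model meta-llama/Llama-3.1-8B-Instruct --target-model ./llama-ov-int4 \
    --gt-data gt.csv --genai \
    --draft-model ./llama-draft-ov --draft-device CPU \
    --draft-cb-config draft_scheduler.json --num-assistant-tokens 5

--assistant-confidence-threshold could be passed instead of a fixed --num-assistant-tokens, and the file given to --draft-cb-config is the one read by read_cb_config and turned into a scheduler config for the draft model further down in main().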

@@ -387,11 +416,13 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str:
return "".join(output)


def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens, apply_chat_template=use_chat_template)
def genai_gen_text(model, tokenizer, question, gen_config, skip_question):
return model.generate(question, gen_config)


def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
def llamacpp_gen_text(model, tokenizer, question, gen_config, skip_question):
max_new_tokens = gen_config.max_new_tokens
use_chat_template = gen_config.apply_chat_template
if use_chat_template:
output = model.create_chat_completion(messages=[{"role": "user", "content": question}], max_tokens=max_new_tokens, temperature=0.0)
text = output["choices"][0]["message"]["content"]
@@ -491,6 +522,7 @@ def create_evaluator(base_model, args):
task = args.model_type

try:
import openvino_genai
EvaluatorCLS = EVALUATOR_REGISTRY[task]
prompts = load_prompts(args)

@@ -507,6 +539,21 @@
use_chat_template = (
tokenizer is not None and tokenizer.chat_template is not None and not args.omit_chat_template
)

gen_config = openvino_genai.GenerationConfig()
Contributor:
please, import openvino_genai and create GenerationConfig only if --genai option is set

Contributor:
you can create and set generation config once when you create the GenAI pipeline in model_loaders.py

Author:
Ok, Updated.

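A minimal sketch of the guarded setup the first comment asks for, assuming args.genai is the flag being referred to; apart from the attribute names already used in this diff, this is an illustration rather than the final change:

gen_config = None
if args.genai:
    # Import lazily so runs that never touch the GenAI pipeline do not require the package.
    import openvino_genai

    gen_config = openvino_genai.GenerationConfig()
    gen_config.max_new_tokens = 128
    gen_config.apply_chat_template = use_chat_template
    gen_config.do_sample = False

The second comment goes one step further and would move this setup next to the pipeline construction in model_loaders.py, which is what the author's "Ok, Updated." reply refers to.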
gen_config.max_new_tokens = 128
gen_config.apply_chat_template = use_chat_template
gen_config.do_sample = False
if args.draft_model is not None:
config_info = "Speculative decoding config: "
if args.num_assistant_tokens is not None:
gen_config.num_assistant_tokens = int(args.num_assistant_tokens)
config_info += f" num_assistant_tokens {gen_config.num_assistant_tokens}"
if args.assistant_confidence_threshold is not None:
gen_config.assistant_confidence_threshold = float(args.assistant_confidence_threshold)
config_info += f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}"
logger.info(config_info)

return EvaluatorCLS(
base_model=base_model,
gt_data=args.gt_data,
@@ -516,6 +563,8 @@
num_samples=args.num_samples,
language=args.language,
gen_answer_fn=gen_answer_fn,
generation_config=gen_config,
seqs_per_request=1,
use_chat_template=use_chat_template,
long_prompt=args.long_prompt,
)
@@ -715,11 +764,21 @@ def main():
kwargs["alphas"] = args.alphas
else:
kwargs["alphas"] = [1.0] * len(args.adapters)

kwargs["empty_adapters"] = args.empty_adapters
kwargs["embeds_pooling"] = args.embeds_pooling_type
kwargs["embeds_normalize"] = args.embeds_normalize
kwargs["embeds_padding_side"] = args.embeds_padding_side

if args.draft_model is not None:
kwargs["draft_model"] = args.draft_model
if args.draft_device is not None:
kwargs["draft_device"] = args.draft_device
if args.draft_cb_config is not None:
kwargs["draft_cb_config"] = read_cb_config(args.draft_cb_config)
else:
kwargs["draft_cb_config"] = None

if args.gt_data and os.path.exists(args.gt_data):
evaluator = create_evaluator(None, args)
else: