From d00a7c4041aaaf8c951228f57bed5d8444a90229 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eldar=20Kurti=C4=87?=
Date: Thu, 22 May 2025 12:06:41 +0200
Subject: [PATCH] Add support for vLLM KV-cache quantization

---
 src/lighteval/models/vllm/vllm_model.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index e539b926f..adc04e9ae 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -95,6 +95,8 @@ class VLLMModelConfig(ModelConfig):
     max_num_seqs: PositiveInt = 128  # maximum number of sequences per iteration; This variable and `max_num_batched_tokens` effectively control the batch size at prefill stage. See https://github.com/vllm-project/vllm/issues/2492 for detailed explaination.
     max_num_batched_tokens: PositiveInt = 2048  # maximum number of tokens per batch
     subfolder: str | None = None
+    kv_cache_dtype: str = "auto"
+    calculate_kv_scales: bool = False
 
 
 class VLLMModel(LightevalModel):
@@ -177,6 +179,8 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
             "seed": int(config.seed),
             "max_num_seqs": int(config.max_num_seqs),
             "max_num_batched_tokens": int(config.max_num_batched_tokens),
+            "kv_cache_dtype": config.kv_cache_dtype,
+            "calculate_kv_scales": config.calculate_kv_scales,
         }
 
         if config.quantization is not None:
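
A minimal usage sketch for the new options, assuming lighteval's Python API and that `VLLMModelConfig` accepts a `model_name` field; the model checkpoint and the "fp8" value below are illustrative ("fp8" is one of vLLM's accepted kv_cache_dtype choices alongside the "auto" default):

    from lighteval.models.vllm.vllm_model import VLLMModelConfig

    # Quantize the KV cache to FP8 and let vLLM compute the KV scales at runtime.
    # Both values are forwarded unchanged to vLLM's LLM(...) constructor through
    # the model-args dict shown in the second hunk of the patch.
    config = VLLMModelConfig(
        model_name="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model choice
        kv_cache_dtype="fp8",          # default is "auto" (no KV-cache quantization)
        calculate_kv_scales=True,      # default is False
    )

With the defaults added by this patch (kv_cache_dtype="auto", calculate_kv_scales=False), existing configurations behave exactly as before.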