diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index db9ac99d3..1b7bb5cc9 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -55,6 +55,7 @@
     from vllm import LLM, SamplingParams
     from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
     from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.lora.request import LoRARequest
 
     logging.getLogger("vllm").propagate = True
     logging.getLogger("vllm").handlers.clear()
@@ -93,6 +94,7 @@ class VLLMModelConfig:
     )
     pairwise_tokenization: bool = False  # whether to tokenize the context and continuation separately or together.
     generation_parameters: GenerationParameters = None  # sampling parameters to use for generation
+    lora_path: str | None = None  # path to the LoRA adapter to load on top of the base model
     max_num_seqs: int = 128  # maximum number of sequences per iteration; This variable and `max_num_batched_tokens` effectively control the batch size at prefill stage. See https://github.com/vllm-project/vllm/issues/2492 for detailed explaination.
     max_num_batched_tokens: int = 2048  # maximum number of tokens per batch
 
@@ -132,6 +134,12 @@ def __init__(
 
         self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha)
         self.pairwise_tokenization = config.pairwise_tokenization
+
+        # enable LoRA only when a lora_path is provided
+        if config.lora_path is not None:
+            self.lora_request = LoRARequest("default", 1, config.lora_path)
+        else:
+            self.lora_request = None
 
     @property
     def tokenizer(self):
@@ -183,6 +191,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             "max_model_len": self._max_length,
             "swap_space": 4,
             "seed": int(config.seed),
+            "enable_lora": config.lora_path is not None,
             "max_num_seqs": int(config.max_num_seqs),
             "max_num_batched_tokens": int(config.max_num_batched_tokens),
         }
@@ -342,10 +351,14 @@ def _generate(
             # but then tensor_parallel breaks
             # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set,
             # as VLLM complains about no GPUs available.
+            # Haizhou: LoRA has not been tested with data_parallel_size > 1
             @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
-                return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+                if self.lora_request is not None:
+                    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params, lora_request=self.lora_request)
+                else:
+                    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
 
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
             # interleaved important to balance context lengths across workers
@@ -362,11 +375,19 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
                 if x is not None
             ]
         else:
-            outputs = self.model.generate(
-                prompt_token_ids=inputs,
-                sampling_params=sampling_params,
-                use_tqdm=True,
-            )
+            if self.lora_request is not None:
+                outputs = self.model.generate(
+                    prompt_token_ids=inputs,
+                    sampling_params=sampling_params,
+                    lora_request=self.lora_request,
+                    use_tqdm=True,
+                )
+            else:
+                outputs = self.model.generate(
+                    prompt_token_ids=inputs,
+                    sampling_params=sampling_params,
+                    use_tqdm=True,
+                )
 
         return outputs
 
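
Example usage (a minimal sketch, not part of the diff): point the new lora_path field at a LoRA adapter. The pretrained field name and the paths below are illustrative assumptions about lighteval's existing VLLMModelConfig, not something this patch adds.

    from lighteval.models.vllm.vllm_model import VLLMModelConfig

    config = VLLMModelConfig(
        pretrained="meta-llama/Llama-3.1-8B",  # base model; field name assumed from the existing config
        lora_path="/path/to/lora_adapter",     # hypothetical local adapter path; leaving it None keeps LoRA disabled
    )

With lora_path set, _create_auto_model passes enable_lora=True to vLLM and every generate call attaches the LoRARequest built in __init__; with lora_path left as None the model behaves exactly as before.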