From f826ef2583cb7b36a15020ddb93c1f2346240c12 Mon Sep 17 00:00:00 2001
From: haizhou-shi
Date: Tue, 11 Mar 2025 00:59:19 -0400
Subject: [PATCH] added vllm lora support.

---
 src/lighteval/models/vllm/vllm_model.py | 33 ++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 98309f7d6..8661e539a 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -55,6 +55,7 @@
     from vllm import LLM, SamplingParams
     from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel
     from vllm.transformers_utils.tokenizer import get_tokenizer
+    from vllm.lora.request import LoRARequest
 
     logging.getLogger("vllm").propagate = True
     logging.getLogger("vllm").handlers.clear()
@@ -93,6 +94,7 @@ class VLLMModelConfig:
     )
     pairwise_tokenization: bool = False  # whether to tokenize the context and continuation separately or together.
     generation_parameters: GenerationParameters = None  # sampling parameters to use for generation
+    lora_path: Optional[str] = None  # path to the LoRA modules
 
     subfolder: Optional[str] = None
 
@@ -131,6 +133,12 @@ def __init__(
         self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha)
         self.sampling_params = SamplingParams(**config.generation_parameters.to_vllm_dict())
         self.pairwise_tokenization = config.pairwise_tokenization
+
+        # enable LoRA if lora_path is provided
+        if config.lora_path is not None:
+            self.lora_request = LoRARequest("default", 1, config.lora_path)
+        else:
+            self.lora_request = None
 
     @property
     def tokenizer(self):
@@ -183,6 +191,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             "max_model_len": self._max_length,
             "swap_space": 4,
             "seed": config.seed,
+            "enable_lora": config.lora_path is not None,
         }
         if int(config.data_parallel_size) > 1:
             self.model_args["distributed_executor_backend"] = "ray"
@@ -343,10 +352,14 @@ def _generate(
             # but then tensor_parallel breaks
             # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set,
             # as VLLM complains about no GPUs available.
+            # Haizhou: didn't test lora with data_parallel_size > 1
            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
-                return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+                if self.lora_request is not None:
+                    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params, lora_request=self.lora_request)
+                else:
+                    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
 
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
             # interleaved important to balance context lengths across workers
@@ -363,11 +376,19 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
                 if x is not None
             ]
         else:
-            outputs = self.model.generate(
-                prompt_token_ids=inputs,
-                sampling_params=sampling_params,
-                use_tqdm=True,
-            )
+            if self.lora_request is not None:
+                outputs = self.model.generate(
+                    prompt_token_ids=inputs,
+                    sampling_params=sampling_params,
+                    lora_request=self.lora_request,
+                    use_tqdm=True,
+                )
+            else:
+                outputs = self.model.generate(
+                    prompt_token_ids=inputs,
+                    sampling_params=sampling_params,
+                    use_tqdm=True,
+                )
         return outputs
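
Note (not part of the patch): a minimal, standalone sketch of the vLLM LoRA flow this change wires into lighteval. The engine is built with enable_lora=True and a LoRARequest pointing at the adapter directory is passed to generate(). The model name, adapter path, and prompt below are placeholder values, not anything taken from the patch.

    from vllm import LLM, SamplingParams
    from vllm.lora.request import LoRARequest

    # Build the engine with LoRA enabled (mirrors `"enable_lora": config.lora_path is not None` above).
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_lora=True)

    # LoRARequest(name, id, path) identifies the adapter to apply;
    # the patch constructs it as LoRARequest("default", 1, config.lora_path).
    lora_request = LoRARequest("default", 1, "/path/to/lora_adapter")

    sampling_params = SamplingParams(temperature=0.0, max_tokens=64)

    # Passing lora_request applies the adapter on top of the base weights for this call only.
    outputs = llm.generate(
        ["What is the capital of France?"],
        sampling_params=sampling_params,
        lora_request=lora_request,
    )
    print(outputs[0].outputs[0].text)

In the patch itself the same three pieces appear as the "enable_lora" entry in self.model_args, the self.lora_request built in __init__, and the lora_request= keyword threaded through both generation paths in _generate.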