diff --git a/pyproject.toml b/pyproject.toml
index 7ed055093..46fb06ef1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -94,7 +94,7 @@ nanotron = [
     "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm>=0.8.4", "ray", "more_itertools"]
+vllm = ["vllm>=0.8.5.post1", "ray", "more_itertools"]
 quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest>=7.4.0","deepdiff"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math,extended_tasks,vllm]"]
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index f35eff8d9..53aba0c37 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -399,14 +399,8 @@ def _generate(
             sampling_params.detokenize = False

         if self.data_parallel_size > 1:
-            # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
-            # also seems to only work with decorator and not with ray.remote() fn
-            # see https://github.com/vllm-project/vllm/issues/973
-            # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-            # but then tensor_parallel breaks
-            # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set,
-            # as VLLM complains about no GPUs available.
-            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
+
+            @ray.remote(num_gpus=self.tensor_parallel_size)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
                 return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
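
Note (not part of the patch): a minimal sketch of how the @ray.remote-decorated run_inference_one_model could be fanned out across data-parallel replicas, each reserving tensor_parallel_size GPUs via the decorator's num_gpus. The sharding helper and the run_data_parallel wrapper below are assumptions for illustration; the actual dispatch logic in vllm_model.py may differ.

    # Illustrative sketch only -- not part of the diff above.
    import ray
    from more_itertools import distribute

    def run_data_parallel(run_inference_one_model, model_args, sampling_params, requests, dp_size):
        # Split the tokenized requests into dp_size roughly equal shards.
        shards = [list(shard) for shard in distribute(dp_size, requests)]
        # Launch one Ray task per shard; each task claims the GPUs declared
        # in the @ray.remote(num_gpus=...) decorator from the patch.
        futures = [
            run_inference_one_model.remote(model_args, sampling_params, shard)
            for shard in shards
        ]
        # Block until all replicas finish, then flatten the per-shard outputs.
        results = ray.get(futures)
        return [output for shard_outputs in results for output in shard_outputs]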