From bac98a8c63960e50614c3fefe581885a303c774d Mon Sep 17 00:00:00 2001
From: Boopathy Kannappan
Date: Fri, 8 Aug 2025 16:17:35 +0000
Subject: [PATCH] updates for 2.48.0

Replace the ChatCompletionMessageToolCallParam TypeVar with concrete
TypedDicts for tool calls (typing tool_calls as a List rather than an
Iterable), accept "required" as a tool_choice value, and rework
placement bundles to be CPU-first: a GPU is added only when tensor
parallelism is enabled, and every bundle is guaranteed to request at
least one resource.
---
 .../serve/configs/openai_api_models.py        | 20 +++++++++++++++-----
 .../serve/deployments/llm/vllm/vllm_models.py | 11 ++++++++++-
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/ray/llm/_internal/serve/configs/openai_api_models.py b/python/ray/llm/_internal/serve/configs/openai_api_models.py
index 0936abb9589b..984feae05dbc 100644
--- a/python/ray/llm/_internal/serve/configs/openai_api_models.py
+++ b/python/ray/llm/_internal/serve/configs/openai_api_models.py
@@ -52,9 +52,6 @@
 ChatCompletionContentPartRefusalParam = TypeVar(
     "ChatCompletionContentPartRefusalParam", bound=Any
 )
-ChatCompletionMessageToolCallParam = TypeVar(
-    "ChatCompletionMessageToolCallParam", bound=Any
-)
 OpenAIChatCompletionContentPartParam = TypeVar(
     "OpenAIChatCompletionContentPartParam", bound=Any
 )
@@ -152,6 +149,19 @@ class ChatCompletionContentPartContentParam(TypedDict, total=False):
         str,
     ]
 
+class ChatCompletionMessageFunctionCallParam(TypedDict, total=False):
+    name: Required[str]
+    """The name of the function to call."""
+    arguments: Required[str]
+    """The arguments to pass to the function, as a JSON-encoded string."""
+
+class ChatCompletionMessageToolCallParam(TypedDict, total=False):
+    id: Required[str]
+    """The ID of the tool call."""
+    type: Required[Literal["function"]]
+    """The type of the tool call, which is always "function"."""
+    function: Required[ChatCompletionMessageFunctionCallParam]
+    """The function call details."""
 
 class ChatCompletionMessageParam(TypedDict, total=False):
     """Enables custom roles in the Chat Completion API."""
@@ -172,7 +182,7 @@ class ChatCompletionMessageParam(TypedDict, total=False):
     tool_call_id: Optional[str]
     """Tool call that this message is responding to."""
 
-    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
+    tool_calls: Optional[List[ChatCompletionMessageToolCallParam]]
     """The tool calls generated by the model, such as function calls."""
 
 
@@ -236,7 +246,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = None
     tools: Optional[List[ChatCompletionToolsParam]] = None
     tool_choice: Optional[
-        Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam]
+        Union[Literal["none", "auto", "required"], ChatCompletionNamedToolChoiceParam]
     ] = "none"
 
     # NOTE this will be ignored by vLLM -- the model determines the behavior
diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py
index f0c79e636e23..17f38a12042d 100644
--- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py
+++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py
@@ -196,10 +196,19 @@ def placement_bundles(self) -> List[Dict[str, float]]:
         if self.resources_per_bundle:
             bundle = self.resources_per_bundle
         else:
-            bundle = {"GPU": 1}
+            bundle = {"CPU": 1}
+            # Add GPU resources only when tensor parallelism is enabled.
+            if self.tensor_parallel_degree > 1:
+                bundle["GPU"] = 1
+
         if self.accelerator_type:
             bundle[self.ray_accelerator_type()] = 0.001
         bundles = [bundle for _ in range(self.num_devices)]
+
+        # Ensure each bundle requests at least one resource.
+        for bundle in bundles:
+            if not any(bundle.values()):
+                bundle["CPU"] = 1
         return bundles
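
Below are a few usage sketches for review context. They are
illustrative only: any name not present in the diff (model names, tool
names, call IDs, the build_bundles helper) is a placeholder.

First, a minimal sketch of how the new TypedDicts compose into an
assistant message, assuming the import path matches the module touched
above:

    from ray.llm._internal.serve.configs.openai_api_models import (
        ChatCompletionMessageFunctionCallParam,
        ChatCompletionMessageParam,
        ChatCompletionMessageToolCallParam,
    )

    # Function payload: "arguments" travels as a JSON-encoded string.
    function_call: ChatCompletionMessageFunctionCallParam = {
        "name": "get_weather",  # placeholder tool name
        "arguments": '{"city": "Zurich"}',
    }

    # "type" is pinned by the Literal annotation; TypedDicts cannot
    # carry runtime defaults, so the constant lives in the type.
    tool_call: ChatCompletionMessageToolCallParam = {
        "id": "call_001",  # placeholder call ID
        "type": "function",
        "function": function_call,
    }

    # tool_calls is now a List rather than an arbitrary Iterable, so it
    # can be indexed and re-serialized without being consumed.
    assistant_message: ChatCompletionMessageParam = {
        "role": "assistant",
        "content": "",
        "tool_calls": [tool_call],
    }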
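
Next, a request body exercising the new "required" tool_choice literal;
the payload shape follows the OpenAI chat-completions schema, with a
placeholder model and tool definition:

    request_body = {
        "model": "my-model",  # placeholder
        "messages": [{"role": "user", "content": "Weather in Zurich?"}],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Look up current weather for a city.",
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ],
        # "none": never call tools; "auto": the model decides;
        # "required" (new): the model must call at least one tool.
        "tool_choice": "required",
    }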
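
Finally, the bundle-shaping logic from placement_bundles, pulled out as
a free function so the branches are easy to exercise. Parameter names
mirror the config attributes in the diff; the function itself is a
sketch, not the patched code:

    from typing import Dict, List, Optional

    def build_bundles(
        num_devices: int,
        tensor_parallel_degree: int = 1,
        resources_per_bundle: Optional[Dict[str, float]] = None,
        accelerator_key: Optional[str] = None,
    ) -> List[Dict[str, float]]:
        if resources_per_bundle:
            bundle = dict(resources_per_bundle)
        else:
            bundle = {"CPU": 1}
            # GPUs are requested only when tensor parallelism is enabled.
            if tensor_parallel_degree > 1:
                bundle["GPU"] = 1
        if accelerator_key:
            # A tiny fractional amount steers scheduling onto nodes that
            # expose this accelerator without reserving a whole unit.
            bundle[accelerator_key] = 0.001
        bundles = [dict(bundle) for _ in range(num_devices)]
        # Guarantee every bundle requests at least one resource.
        for b in bundles:
            if not any(b.values()):
                b["CPU"] = 1
        return bundles

    # build_bundles(num_devices=2, tensor_parallel_degree=2)
    # -> [{"CPU": 1, "GPU": 1}, {"CPU": 1, "GPU": 1}]

One difference from the patch: each bundle here is an independent copy,
whereas the patched property shares a single dict across all bundles.
That aliasing is harmless for the emptiness check (mutating one bundle
mutates them all identically) but is worth keeping in mind if bundles
are ever mutated per-replica later.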