digitalocean · bkdoeng · Aug 8, 2025
diff --git a/python/ray/llm/_internal/serve/configs/openai_api_models.py b/python/ray/llm/_internal/serve/configs/openai_api_models.py
@@ -52,9 +52,6 @@
 ChatCompletionContentPartRefusalParam = TypeVar(
     "ChatCompletionContentPartRefusalParam", bound=Any
 )
-ChatCompletionMessageToolCallParam = TypeVar(
-    "ChatCompletionMessageToolCallParam", bound=Any
-)
 OpenAIChatCompletionContentPartParam = TypeVar(
     "OpenAIChatCompletionContentPartParam", bound=Any
 )
@@ -152,6 +149,19 @@ class ChatCompletionContentPartContentParam(TypedDict, total=False):
     str,
 ]
 
+class ChatCompletionMessageFunctionCallParam(TypedDict, total=False):
+    name: Required[str]
+    """The name of the function to call."""
+    arguments: Required[str]
+    """The arguments to pass to the function."""
+
+class ChatCompletionMessageToolCallParam(TypedDict, total=False):
+    id: Required[str]
+    """The ID of the tool call."""
+    type: Required[Literal["function"]]
+    """The type of the tool call; must be given explicitly as "function"."""
+    function: Required[ChatCompletionMessageFunctionCallParam]
+    """The function call details."""
 
 class ChatCompletionMessageParam(TypedDict, total=False):
     """Enables custom roles in the Chat Completion API."""
@@ -172,7 +182,7 @@ class ChatCompletionMessageParam(TypedDict, total=False):
     tool_call_id: Optional[str]
     """Tool call that this message is responding to."""
 
-    tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]]
+    tool_calls: Optional[List[ChatCompletionMessageToolCallParam]]
     """The tool calls generated by the model, such as function calls."""
 
 
@@ -236,7 +246,7 @@ class ChatCompletionRequest(BaseModel):
     top_p: Optional[float] = None
     tools: Optional[List[ChatCompletionToolsParam]] = None
     tool_choice: Optional[
-        Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam]
+        Union[Literal["none"], Literal["auto"], Literal["required"], ChatCompletionNamedToolChoiceParam]
     ] = "none"
 
     # NOTE this will be ignored by vLLM -- the model determines the behavior

diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_models.py
@@ -196,10 +196,19 @@ def placement_bundles(self) -> List[Dict[str, float]]:
         if self.resources_per_bundle:
             bundle = self.resources_per_bundle
         else:
-            bundle = {"GPU": 1}
+            bundle = {"CPU": 1}
+            # Add GPU resources if tensor parallelism is enabled
+            if self.tensor_parallel_degree > 1:
+                bundle["GPU"] = 1
+
         if self.accelerator_type:
             bundle[self.ray_accelerator_type()] = 0.001
         bundles = [bundle for _ in range(self.num_devices)]
+
+        # Ensure we have at least one resource in each bundle
+        for bundle in bundles:
+            if not any(bundle.values()):
+                bundle["CPU"] = 1
 
         return bundles