Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions tests/entrypoints/openai/test_response_api_mcp_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import pytest
import pytest_asyncio
from openai import OpenAI
Expand Down Expand Up @@ -206,6 +207,67 @@ async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_nam
)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_calling_streaming_types(
    mcp_enabled_client: OpenAI, model_name: str
):
    """Verify streaming MCP events arrive as properly nested start/end pairs.

    Every terminal event (``*.done`` / ``*.completed``) must close the most
    recent unclosed start event (``*.added`` / ``*.delta`` / ``in_progress``),
    so the stream is validated with a stack, like balanced parentheses.
    """
    # Maps each terminal event type to the start event that must be on top of
    # the stack when the terminal event arrives.
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.mcp_call_arguments.done": "response.mcp_call_arguments.delta",
        "response.mcp_call.completed": "response.mcp_call.in_progress",
    }

    tools = [
        {
            "type": "mcp",
            "server_label": "code_interpreter",
        }
    ]
    input_text = "What is 13 * 24? Use python to calculate the result."

    stream_response = await mcp_enabled_client.responses.create(
        model=model_name,
        input=input_text,
        tools=tools,
        stream=True,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
    )

    stack_of_event_types = []
    saw_mcp_type = False
    async for event in stream_response:
        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
        if (
            event.type.endswith("added")
            or event.type == "response.mcp_call.in_progress"
        ):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            # Consecutive deltas of the same type collapse into a single stack
            # entry so that the matching ``done`` event pops exactly one frame.
            if stack_of_event_types[-1] == event.type:
                continue
            stack_of_event_types.append(event.type)
        elif event.type.endswith("done") or event.type == "response.mcp_call.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            if "mcp_call" in event.type:
                saw_mcp_type = True
            stack_of_event_types.pop()

    # Every start event must have been closed by its terminal counterpart.
    assert len(stack_of_event_types) == 0
    assert saw_mcp_type, "Should have seen at least one mcp call"


def test_get_tool_description():
"""Test MCPToolServer.get_tool_description filtering logic.

Expand Down
184 changes: 183 additions & 1 deletion tests/entrypoints/openai/test_response_api_with_harmony.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.util
import json
import time

Expand Down Expand Up @@ -43,6 +43,8 @@ def server():
env_dict = dict(
VLLM_ENABLE_RESPONSES_API_STORE="1",
PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS="code_interpreter,container,web_search_preview",
VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS="1",
)

with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
Expand Down Expand Up @@ -850,6 +852,186 @@ async def test_function_calling_with_stream(client: OpenAI, model_name: str):
assert event.response.output_text is not None


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_code_interpreter_streaming(client: OpenAI, model_name: str, server):
    """Stream a code-interpreter run requested via the generic ``mcp`` tool
    type and verify the full ``response.mcp_call*`` event lifecycle:
    item added -> in_progress -> arguments delta/done -> completed -> item done.
    """
    tools = [
        {
            "type": "mcp",
            "server_label": "code_interpreter",
        }
    ]
    input_text = (
        "Calculate 15 * 32 using python. "
        "The python interpreter is not stateful and you must print to see the output."
    )

    stream_response = await client.responses.create(
        model=model_name,
        input=input_text,
        tools=tools,
        stream=True,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
    )

    mcp_call_added = False
    mcp_call_in_progress = False
    mcp_arguments_delta_seen = False
    mcp_arguments_done = False
    mcp_call_completed = False
    mcp_item_done = False

    # Tracks legacy ``response.code_interpreter_call.*`` event types, which
    # must NOT appear when the tool is requested through the ``mcp`` tool
    # type.  The new ``response.mcp_call*`` event types never contain the
    # substring "code_interpreter", so this flag stays False on success.
    code_interpreter_events_seen = False

    async for event in stream_response:
        if "code_interpreter" in event.type:
            code_interpreter_events_seen = True

        if event.type == "response.output_item.added":
            if hasattr(event.item, "type") and event.item.type == "mcp_call":
                mcp_call_added = True
                assert event.item.name == "python"
                assert event.item.server_label == "code_interpreter"

        elif event.type == "response.mcp_call.in_progress":
            mcp_call_in_progress = True

        elif event.type == "response.mcp_call_arguments.delta":
            mcp_arguments_delta_seen = True
            assert event.delta is not None

        elif event.type == "response.mcp_call_arguments.done":
            mcp_arguments_done = True
            assert event.name == "python"
            assert event.arguments is not None

        elif event.type == "response.mcp_call.completed":
            mcp_call_completed = True

        elif (
            event.type == "response.output_item.done"
            and hasattr(event.item, "type")
            and event.item.type == "mcp_call"
        ):
            mcp_item_done = True
            assert event.item.name == "python"
            assert event.item.status == "completed"

    assert mcp_call_added, "MCP call was not added"
    assert mcp_call_in_progress, "MCP call in_progress event not seen"
    assert mcp_arguments_delta_seen, "MCP arguments delta event not seen"
    assert mcp_arguments_done, "MCP arguments done event not seen"
    assert mcp_call_completed, "MCP call completed event not seen"
    assert mcp_item_done, "MCP item done event not seen"

    assert not code_interpreter_events_seen, (
        "Should not see code_interpreter events when using MCP type"
    )
Comment on lines +930 to +932
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Given the previous assertion on line 891, this assertion assert not code_interpreter_events_seen seems contradictory or indicates a misunderstanding of the event types. If the intention is to ensure that legacy code_interpreter events are not seen when using the new mcp type, the check on line 891 should be removed or modified to specifically look for legacy event types, and code_interpreter_events_seen should only be set if a legacy event is encountered. As it stands, the test logic is confusing.



@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
    """Test MCP tool calling across multiple turns.

    This test verifies that MCP tools work correctly in multi-turn conversations,
    maintaining state across turns via the previous_response_id mechanism.
    """
    tools = [
        {
            "type": "mcp",
            "server_label": "code_interpreter",
        }
    ]

    # First turn - make a calculation
    response1 = await client.responses.create(
        model=model_name,
        input="Calculate 50 * 60 using python and print the result.",
        tools=tools,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        extra_body={"enable_response_messages": True},
    )

    assert response1 is not None
    assert response1.status == "completed"

    # Verify the MCP tool was exercised by scanning output_messages for a
    # message addressed to the python tool and a reply authored by it.
    tool_call_found = any(
        (msg.get("recipient") or "").startswith("python")
        for msg in response1.output_messages
    )
    tool_response_found = any(
        msg.get("author", {}).get("role") == "tool"
        and (msg.get("author", {}).get("name") or "").startswith("python")
        for msg in response1.output_messages
    )

    assert tool_call_found, "MCP tool call not found in output_messages"
    assert tool_response_found, "MCP tool response not found in output_messages"

    # Verify input messages: Should have system message with tool, NO developer message
    developer_messages = [
        msg for msg in response1.input_messages if msg["author"]["role"] == "developer"
    ]
    assert len(developer_messages) == 0, (
        "No developer message expected for elevated tools"
    )

    # Second turn - reference previous calculation
    previous_messages = response1.input_messages + response1.output_messages

    response2 = await client.responses.create(
        model=model_name,
        input="Now divide that result by 2.",
        tools=tools,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        extra_body={
            "previous_input_messages": previous_messages,
            "enable_response_messages": True,
        },
    )

    assert response2 is not None
    assert response2.status == "completed"

    def addressed_to_mcp(msg):
        # Assistant message on the analysis channel addressed to the MCP tool.
        recipient = msg.get("recipient")
        return (
            msg["author"]["role"] == "assistant"
            and msg.get("channel") == "analysis"
            and bool(recipient)
            and (recipient.startswith("code_interpreter") or recipient == "python")
        )

    # Verify input messages are correct: should have two messages -
    # one to the python recipient on analysis channel and one from tool role
    mcp_recipient_messages = [
        msg for msg in response2.input_messages if addressed_to_mcp(msg)
    ]
    tool_role_messages = [
        msg for msg in response2.input_messages if msg["author"]["role"] == "tool"
    ]

    assert len(mcp_recipient_messages) > 0, (
        "Expected message(s) to MCP recipient on analysis channel"
    )
    assert len(tool_role_messages) > 0, (
        "Expected message(s) from tool role after MCP call"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
Expand Down
Loading