64 changes: 63 additions & 1 deletion tests/entrypoints/openai/test_response_api_mcp_tools.py
@@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import pytest
import pytest_asyncio
from openai import OpenAI
from openai_harmony import ToolDescription, ToolNamespaceConfig

from openai import OpenAI
from vllm.entrypoints.tool_server import MCPToolServer

from ...utils import RemoteOpenAIServer
@@ -206,6 +207,67 @@ async def test_mcp_tool_env_flag_disabled(mcp_disabled_client: OpenAI, model_nam
)


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_calling_streaming_types(
    mcp_enabled_client: OpenAI, model_name: str
):
    pairs_of_event_types = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.content_part.done": "response.content_part.added",
        "response.output_text.done": "response.output_text.delta",
        "response.reasoning_text.done": "response.reasoning_text.delta",
        "response.reasoning_part.done": "response.reasoning_part.added",
        "response.mcp_call_arguments.done": "response.mcp_call_arguments.delta",
        "response.mcp_call.completed": "response.mcp_call.in_progress",
    }

    tools = [
        {
            "type": "mcp",
            "server_label": "code_interpreter",
        }
    ]
    input_text = "What is 13 * 24? Use python to calculate the result."

    stream_response = await mcp_enabled_client.responses.create(
        model=model_name,
        input=input_text,
        tools=tools,
        stream=True,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
    )

    stack_of_event_types = []
    saw_mcp_type = False
    async for event in stream_response:
        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            stack_of_event_types.pop()
        elif (
            event.type.endswith("added")
            or event.type == "response.mcp_call.in_progress"
        ):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            if stack_of_event_types[-1] == event.type:
                continue
            stack_of_event_types.append(event.type)
        elif event.type.endswith("done") or event.type == "response.mcp_call.completed":
            assert stack_of_event_types[-1] == pairs_of_event_types[event.type]
            if "mcp_call" in event.type:
                saw_mcp_type = True
            stack_of_event_types.pop()

    assert len(stack_of_event_types) == 0
Comment on lines 246 to 267
Contributor

high

The logic in this test has a couple of issues:

  1. The assertion assert "mcp_call" in event.type on line 245 is too strict. The event stream includes many events unrelated to MCP calls (e.g., response.created, response.reasoning_text.delta), which will cause this assertion to fail.
  2. The event handling logic uses an if followed by an elif chain, and then another if (line 252). This second if should be an elif to form a single conditional block. Otherwise, an event matching endswith("added") will be processed by the second if block, and then the subsequent elif blocks for delta and done will be skipped for that event, which is not the intended logic for pairing events.

I've suggested a fix that addresses both points by introducing a flag to check for MCP events and correcting the conditional logic.

    mcp_event_seen = False
    stack_of_event_types = []
    async for event in stream_response:
        if "mcp_call" in event.type:
            mcp_event_seen = True

        if event.type == "response.created":
            stack_of_event_types.append(event.type)
        elif event.type == "response.completed":
            assert stack_of_event_types.pop() == pairs_of_event_types[event.type]
        elif (
            event.type.endswith("added")
            or event.type == "response.mcp_call.in_progress"
        ):
            stack_of_event_types.append(event.type)
        elif event.type.endswith("delta"):
            if not stack_of_event_types or stack_of_event_types[-1] != event.type:
                stack_of_event_types.append(event.type)
        elif event.type.endswith("done") or event.type == "response.mcp_call.completed":
            assert stack_of_event_types.pop() == pairs_of_event_types[event.type]

    assert mcp_event_seen, "No MCP call events were observed in the stream."
    assert len(stack_of_event_types) == 0

    assert saw_mcp_type, "Should have seen at least one mcp call"


def test_get_tool_description():
"""Test MCPToolServer.get_tool_description filtering logic.

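A note on how the new streaming test above checks event ordering: it treats the stream as a stack (LIFO) problem. Every *.added / *.in_progress event, and the first *.delta of a run, opens an entry that must later be closed by its paired *.done / *.completed event from pairs_of_event_types. The standalone sketch below shows the same pairing check in isolation; it is illustrative only and not part of this PR, and the event sequence is a made-up example rather than captured server output.

    # Standalone sketch of the stack-based pairing check (illustration only).
    pairs = {
        "response.completed": "response.created",
        "response.output_item.done": "response.output_item.added",
        "response.mcp_call_arguments.done": "response.mcp_call_arguments.delta",
        "response.mcp_call.completed": "response.mcp_call.in_progress",
    }

    def check_pairing(event_types: list[str]) -> None:
        stack: list[str] = []
        for etype in event_types:
            if etype == "response.created":
                stack.append(etype)
            elif etype.endswith("added") or etype == "response.mcp_call.in_progress":
                stack.append(etype)
            elif etype.endswith("delta"):
                # Only the first delta of a run opens a stack entry.
                if not stack or stack[-1] != etype:
                    stack.append(etype)
            elif etype in pairs:  # *.done, *.completed, response.completed
                assert stack and stack.pop() == pairs[etype], f"unbalanced event: {etype}"
        assert not stack, f"unclosed events: {stack}"

    # Hypothetical, well-formed sequence for one MCP python call.
    check_pairing([
        "response.created",
        "response.output_item.added",
        "response.mcp_call.in_progress",
        "response.mcp_call_arguments.delta",
        "response.mcp_call_arguments.delta",
        "response.mcp_call_arguments.done",
        "response.mcp_call.completed",
        "response.output_item.done",
        "response.completed",
    ])

The real test folds this check into the async for loop over the live stream and additionally records whether any mcp_call event was observed.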
237 changes: 235 additions & 2 deletions tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -1,18 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.util
import json
import time

import pytest
import pytest_asyncio
import requests
from openai import BadRequestError, NotFoundError, OpenAI
from openai_harmony import (
    Message,
)

from openai import BadRequestError, NotFoundError, OpenAI

from ...utils import RemoteOpenAIServer

MODEL_NAME = "openai/gpt-oss-20b"
@@ -44,6 +44,8 @@ def server():
    env_dict = dict(
        VLLM_ENABLE_RESPONSES_API_STORE="1",
        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
        VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS="code_interpreter,container,web_search_preview",
        VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS="1",
    )

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
@@ -855,6 +857,237 @@ async def test_function_calling_with_stream(client: OpenAI, model_name: str):
assert event.response.output_text is not None


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_function_calling_no_code_interpreter_events(
    client: OpenAI, model_name: str
):
    """Verify that function calls don't trigger code_interpreter events.

    This test ensures that function calls (functions.*) use their own
    function_call event types and don't incorrectly emit code_interpreter
    events during streaming.
    """
    tools = [GET_WEATHER_SCHEMA]
    input_list = [
        {
            "role": "user",
            "content": "What's the weather like in Paris today?",
        }
    ]
    stream_response = await client.responses.create(
        model=model_name,
        input=input_list,
        tools=tools,
        stream=True,
    )

    # Track which event types we see
    event_types_seen = set()
    function_call_found = False

    async for event in stream_response:
        event_types_seen.add(event.type)

        if (
            event.type == "response.output_item.added"
            and event.item.type == "function_call"
        ):
            function_call_found = True

        # Ensure NO code_interpreter events are emitted for function calls
        assert "code_interpreter" not in event.type, (
            "Found code_interpreter event "
            f"'{event.type}' during function call. Function calls should only "
            "emit function_call events, not code_interpreter events."
        )

    # Verify we actually saw a function call
    assert function_call_found, "Expected to see a function_call in the stream"

    # Verify we saw the correct function call event types
    assert (
        "response.function_call_arguments.delta" in event_types_seen
        or "response.function_call_arguments.done" in event_types_seen
    ), "Expected to see function_call_arguments events"


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_code_interpreter_streaming(client: OpenAI, model_name: str, server):
    tools = [
        {
            "type": "mcp",
            "server_label": "code_interpreter",
        }
    ]
    input_text = (
        "Calculate 15 * 32 using python. "
        "The python interpreter is not stateful and you must print to see the output."
    )

    stream_response = await client.responses.create(
        model=model_name,
        input=input_text,
        tools=tools,
        stream=True,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
    )

    mcp_call_added = False
    mcp_call_in_progress = False
    mcp_arguments_delta_seen = False
    mcp_arguments_done = False
    mcp_call_completed = False
    mcp_item_done = False

    code_interpreter_events_seen = False

    async for event in stream_response:
        if "code_interpreter" in event.type:
            code_interpreter_events_seen = True

        if event.type == "response.output_item.added":
            if hasattr(event.item, "type") and event.item.type == "mcp_call":
                mcp_call_added = True
                assert event.item.name == "python"
                assert event.item.server_label == "code_interpreter"

        elif event.type == "response.mcp_call.in_progress":
            mcp_call_in_progress = True

        elif event.type == "response.mcp_call_arguments.delta":
            mcp_arguments_delta_seen = True
            assert event.delta is not None

        elif event.type == "response.mcp_call_arguments.done":
            mcp_arguments_done = True
            assert event.name == "python"
            assert event.arguments is not None

        elif event.type == "response.mcp_call.completed":
            mcp_call_completed = True

        elif (
            event.type == "response.output_item.done"
            and hasattr(event.item, "type")
            and event.item.type == "mcp_call"
        ):
            mcp_item_done = True
            assert event.item.name == "python"
            assert event.item.status == "completed"

    assert mcp_call_added, "MCP call was not added"
    assert mcp_call_in_progress, "MCP call in_progress event not seen"
    assert mcp_arguments_delta_seen, "MCP arguments delta event not seen"
    assert mcp_arguments_done, "MCP arguments done event not seen"
    assert mcp_call_completed, "MCP call completed event not seen"
    assert mcp_item_done, "MCP item done event not seen"

    assert not code_interpreter_events_seen, (
        "Should not see code_interpreter events when using MCP type"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_mcp_tool_multi_turn(client: OpenAI, model_name: str, server):
    """Test MCP tool calling across multiple turns.

    This test verifies that MCP tools work correctly in multi-turn conversations,
    maintaining state across turns via the previous_response_id mechanism.
    """
    tools = [
        {
            "type": "mcp",
            "server_label": "code_interpreter",
        }
    ]

    # First turn - make a calculation
    response1 = await client.responses.create(
        model=model_name,
        input="Calculate 123 * 456 using python and print the result.",
        tools=tools,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        extra_body={"enable_response_messages": True},
    )

    assert response1 is not None
    assert response1.status == "completed"

    # Verify MCP call in first response by checking output_messages
    tool_call_found = False
    tool_response_found = False
    for message in response1.output_messages:
        recipient = message.get("recipient")
        if recipient and recipient.startswith("python"):
            tool_call_found = True

        author = message.get("author", {})
        if (
            author.get("role") == "tool"
            and author.get("name")
            and author.get("name").startswith("python")
        ):
            tool_response_found = True

    # Verify MCP tools were actually used
    assert tool_call_found, "MCP tool call not found in output_messages"
    assert tool_response_found, "MCP tool response not found in output_messages"

    # Verify input messages: should have a system message with the tool, NO developer message
    developer_messages = [
        msg for msg in response1.input_messages if msg["author"]["role"] == "developer"
    ]
    assert len(developer_messages) == 0, (
        "No developer message expected for elevated tools"
    )

    # Second turn - reference previous calculation
    response2 = await client.responses.create(
        model=model_name,
        input="Now divide that result by 2.",
        tools=tools,
        temperature=0.0,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
        previous_response_id=response1.id,
        extra_body={"enable_response_messages": True},
    )

    assert response2 is not None
    assert response2.status == "completed"

    # Verify input messages are correct: should have two messages -
    # one to the python recipient on analysis channel and one from tool role
    mcp_recipient_messages = []
    tool_role_messages = []
    for msg in response2.input_messages:
        if msg["author"]["role"] == "assistant":
            # Check if this is a message to MCP recipient on analysis channel
            if msg.get("channel") == "analysis" and msg.get("recipient"):
                recipient = msg.get("recipient")
                if recipient.startswith("code_interpreter") or recipient == "python":
                    mcp_recipient_messages.append(msg)
        elif msg["author"]["role"] == "tool":
            tool_role_messages.append(msg)

    assert len(mcp_recipient_messages) > 0, (
        "Expected message(s) to MCP recipient on analysis channel"
    )
    assert len(tool_role_messages) > 0, (
        "Expected message(s) from tool role after MCP call"
    )


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_output_messages_enabled(client: OpenAI, model_name: str, server):
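Taken together, the multi-turn test above exercises the same call pattern an end user would: the second request passes previous_response_id so the server carries the first turn's MCP call and tool output forward as input messages. A minimal client-side sketch against a locally running vLLM server follows; the base URL, API key, and port are placeholders rather than values from this PR.

    from openai import OpenAI

    # Placeholder endpoint; point this at your own vLLM deployment.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    tools = [{"type": "mcp", "server_label": "code_interpreter"}]

    # First turn: the model is expected to run the calculation via the python MCP tool.
    first = client.responses.create(
        model="openai/gpt-oss-20b",
        input="Calculate 123 * 456 using python and print the result.",
        tools=tools,
        instructions=(
            "You must use the Python tool to execute code. Never simulate execution."
        ),
    )

    # Second turn: previous_response_id chains the conversation, so "that result"
    # resolves to the first turn's output.
    second = client.responses.create(
        model="openai/gpt-oss-20b",
        input="Now divide that result by 2.",
        tools=tools,
        previous_response_id=first.id,
    )
    print(second.output_text)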