Skip to content

Commit 8ed6997

Browse files
committed
refactor(tests): make the responses tests nicer (#3161)
# What does this PR do? A _bunch_ of cleanup for the Responses tests. - Got rid of YAML test cases, moved them to just use simple pydantic models - Splitting the large monolithic test file into multiple focused test files: - `test_basic_responses.py` for basic and image response tests - `test_tool_responses.py` for tool-related tests - `test_file_search.py` for file search specific tests - Adding a `StreamingValidator` helper class to standardize streaming response validation ## Test Plan Run the tests: ``` pytest -s -v tests/integration/non_ci/responses/ \ --stack-config=starter \ --text-model openai/gpt-4o \ --embedding-model=sentence-transformers/all-MiniLM-L6-v2 \ -k "client_with_models" ```
1 parent ba66447 commit 8ed6997

File tree

11 files changed

+1312
-1736
lines changed

11 files changed

+1312
-1736
lines changed

tests/integration/non_ci/responses/fixtures/fixtures.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# the root directory of this source tree.
66

77
import os
8-
import re
98
from pathlib import Path
109

1110
import pytest
@@ -48,19 +47,6 @@ def _load_all_verification_configs():
4847
return {"providers": all_provider_configs}
4948

5049

51-
def case_id_generator(case):
52-
"""Generate a test ID from the case's 'case_id' field, or use a default."""
53-
case_id = case.get("case_id")
54-
if isinstance(case_id, str | int):
55-
return re.sub(r"\\W|^(?=\\d)", "_", str(case_id))
56-
return None
57-
58-
59-
def get_base_test_name(request):
    """Return the original (un-parametrized) name of the test behind *request*."""
    pytest_node = request.node
    return pytest_node.originalname
62-
63-
6450
# --- End Helper Functions ---
6551

6652

tests/integration/non_ci/responses/fixtures/load.py

Lines changed: 0 additions & 16 deletions
This file was deleted.
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the terms described in the LICENSE file in
5+
# the root directory of this source tree.
6+
7+
from typing import Any
8+
9+
import pytest
10+
from pydantic import BaseModel
11+
12+
13+
class ResponsesTestCase(BaseModel):
    """One declarative test case for the Responses API integration tests.

    Only ``input`` and ``expected`` are always meaningful; the optional fields
    are consumed by the specific test module (tools, file search, streaming,
    multi-turn) that a case list belongs to.
    """

    # Input can be a simple string or complex message structure
    input: str | list[dict[str, Any]]
    # Expected text for the response; how it is matched is up to the consuming
    # test (presumably a substring check — confirm against the test modules).
    expected: str
    # Tools as flexible dict structure (gets validated at runtime by the API)
    tools: list[dict[str, Any]] | None = None
    # Multi-turn conversations with input/output pairs
    turns: list[tuple[str | list[dict[str, Any]], str]] | None = None
    # File search specific fields
    file_content: str | None = None
    file_path: str | None = None
    # Streaming flag
    stream: bool | None = None
26+
27+
28+
# Basic response test cases
basic_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Which planet do humans live on?",
            expected="earth",
        ),
        id="earth",
    ),
    pytest.param(
        ResponsesTestCase(
            input="Which planet has rings around it with a name starting with letter S?",
            expected="saturn",
        ),
        id="saturn",
    ),
    pytest.param(
        ResponsesTestCase(
            # Mixed text + image input: two user messages, the second carrying
            # the image URL in OpenAI Responses "input_image" format.
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "what teams are playing in this image?",
                        }
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/3/3b/LeBron_James_Layup_%28Cleveland_vs_Brooklyn_2018%29.jpg",
                        }
                    ],
                },
            ],
            expected="brooklyn nets",
        ),
        id="image_input",
    ),
]

# Multi-turn test cases
multi_turn_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            # Each turn is an (input, expected) pair; the second turn checks
            # that conversation context from the first turn is retained.
            turns=[
                ("Which planet do humans live on?", "earth"),
                ("What is the name of the planet from your previous response?", "earth"),
            ],
        ),
        id="earth",
    ),
]
86+
87+
# Web search test cases
# Single case: a factual question to be answered via the web_search tool.
_web_search_llama_experts = ResponsesTestCase(
    input="How many experts does the Llama 4 Maverick model have?",
    tools=[{"type": "web_search", "search_context_size": "low"}],
    expected="128",
)

web_search_test_cases = [
    pytest.param(_web_search_llama_experts, id="llama_experts"),
]
98+
99+
# File search test cases
file_search_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            # Inline text variant — presumably uploaded as a searchable file by
            # the test; confirm against test_file_search.py.
            file_content="Llama 4 Maverick has 128 experts",
        ),
        id="llama_experts",
    ),
    pytest.param(
        ResponsesTestCase(
            input="How many experts does the Llama 4 Maverick model have?",
            tools=[{"type": "file_search"}],
            expected="128",
            # Fixture-file variant: same question, answer lives in a PDF.
            file_path="pdfs/llama_stack_and_models.pdf",
        ),
        id="llama_experts_pdf",
    ),
]

# MCP tool test cases
mcp_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What is the boiling point of myawesomeliquid in Celsius?",
            # The server_url placeholder is substituted by the test runner.
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="Hello, world!",
        ),
        id="boiling_point_tool",
    ),
]
132+
133+
# Custom tool test cases
custom_tool_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="What's the weather like in San Francisco?",
            # A client-side function tool in OpenAI function-calling schema;
            # the model is expected to emit a call rather than a final answer.
            tools=[
                {
                    "type": "function",
                    "name": "get_weather",
                    "description": "Get current temperature for a given location.",
                    "parameters": {
                        "additionalProperties": False,
                        "properties": {
                            "location": {
                                "description": "City and country e.g. Bogotá, Colombia",
                                "type": "string",
                            }
                        },
                        "required": ["location"],
                        "type": "object",
                    },
                }
            ],
            expected="",  # No specific expected output for custom tools
        ),
        id="sf_weather",
    ),
]

# Image test cases
image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            # Single user message combining input_text and input_image parts.
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_text",
                            "text": "Identify the type of animal in this image.",
                        },
                        {
                            "type": "input_image",
                            "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                        },
                    ],
                },
            ],
            expected="llama",
        ),
        id="llama_image",
    ),
]
186+
187+
# Multi-turn image test cases
multi_turn_image_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="",  # Not used for multi-turn
            expected="",  # Not used for multi-turn
            turns=[
                # Turn 1: image + text input; expect the animal to be named.
                (
                    [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "input_text",
                                    "text": "What type of animal is in this image? Please respond with a single word that starts with the letter 'L'.",
                                },
                                {
                                    "type": "input_image",
                                    "image_url": "https://upload.wikimedia.org/wikipedia/commons/f/f7/Llamas%2C_Vernagt-Stausee%2C_Italy.jpg",
                                },
                            ],
                        },
                    ],
                    "llama",
                ),
                # Turn 2: plain-text follow-up that relies on context from turn 1.
                (
                    "What country do you find this animal primarily in? What continent?",
                    "peru",
                ),
            ],
        ),
        id="llama_image_understanding",
    ),
]
221+
222+
# Multi-turn tool execution test cases
# All cases use an MCP server whose URL is substituted by the test runner.
multi_turn_tool_execution_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="I need to check if user 'alice' can access the file 'document.txt'. First, get alice's user ID, then check if that user ID can access the file 'document.txt'. Do this as a series of steps, where each step is a separate message. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="yes",
        ),
        id="user_file_access_check",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need to get the results for the 'boiling_point' experiment. First, get the experiment ID for 'boiling_point', then use that ID to get the experiment results. Tell me the boiling point in Celsius.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="100°C",
        ),
        id="experiment_results_lookup",
    ),
]

# Multi-turn tool execution streaming test cases
# Same shape as above, but with stream=True so the test exercises the
# streaming response path.
multi_turn_tool_execution_streaming_test_cases = [
    pytest.param(
        ResponsesTestCase(
            input="Help me with this security check: First, get the user ID for 'charlie', then get the permissions for that user ID, and finally check if that user can access 'secret_file.txt'. Stream your progress as you work through each step. Return only one tool call per step. Summarize the final result with a single 'yes' or 'no' response.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="no",
            stream=True,
        ),
        id="user_permissions_workflow",
    ),
    pytest.param(
        ResponsesTestCase(
            input="I need a complete analysis: First, get the experiment ID for 'chemical_reaction', then get the results for that experiment, and tell me if the yield was above 80%. Return only one tool call per step. Please stream your analysis process.",
            tools=[{"type": "mcp", "server_label": "localmcp", "server_url": "<FILLED_BY_TEST_RUNNER>"}],
            expected="85%",
            stream=True,
        ),
        id="experiment_analysis_streaming",
    ),
]

0 commit comments

Comments
 (0)