Commit 94e8ca6
test: add integration tests for query_metrics
The integration tests check that all of the metrics we currently support are actually queryable after inference requests. This also tests things like aggregation, label filtering, etc.

Signed-off-by: Charlie Doern <[email protected]>
1 parent c289691 commit 94e8ca6

15 files changed (+695, -107 lines)
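The new test file itself is not part of this excerpt (only fixtures and recorded responses are shown below). As a rough sketch of the kind of test being added — assuming a `llama_stack_client` fixture and a `telemetry.query_metrics` call; the metric names and signature here are assumptions for illustration, not the commit's actual code:

# Hypothetical sketch, not the code from this commit.
def test_metrics_queryable_after_inference(llama_stack_client, text_model_id):
    # Drive a few inference requests so the token counters have data,
    # mirroring the "Test metrics generation N" prompts recorded below.
    for i in range(3):
        llama_stack_client.inference.chat_completion(
            model_id=text_model_id,
            messages=[{"role": "user", "content": f"Test metrics generation {i}"}],
        )

    # Each supported metric should now return at least one data point.
    for metric in ("prompt_tokens", "completion_tokens", "total_tokens"):
        result = llama_stack_client.telemetry.query_metrics(
            metric_name=metric,
            start_time=0,
        )
        assert result.data, f"no data points returned for {metric}"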

tests/integration/fixtures/common.py

Lines changed: 1 addition & 1 deletion
@@ -278,7 +278,7 @@ def openai_client(llama_stack_client, require_server)
     return OpenAI(base_url=base_url, api_key="fake")


-@pytest.fixture(params=["openai_client", "client_with_models"])
+@pytest.fixture(scope="session", params=["openai_client", "client_with_models"])
 def compat_client(request, client_with_models):
     if request.param == "openai_client" and isinstance(client_with_models, LlamaStackAsLibraryClient):
         # OpenAI client expects a server, so unless we also rewrite OpenAI client's requests
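Scoping the parametrized compat_client fixture to the session means each client variant is constructed once and reused by every test in the run, rather than rebuilt per test — which matters when the metrics tests fire many requests through the same client. A hypothetical consumer, just to illustrate how the fixture resolves (the real tests live elsewhere in this commit):

# Hypothetical usage sketch: compat_client resolves to either an OpenAI
# client or a LlamaStackClient, built once per session for each param value.
def test_openai_compat_smoke(compat_client, text_model_id):
    completion = compat_client.chat.completions.create(
        model=text_model_id,
        messages=[{"role": "user", "content": "OpenAI test 0"}],
    )
    assert completion.choices[0].message.content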
Binary file changed (0 Bytes, not shown).
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/generate",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "raw": true,
+      "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+      "options": {
+        "temperature": 0.0
+      },
+      "stream": false
+    },
+    "endpoint": "/api/generate",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.GenerateResponse",
+      "__data__": {
+        "model": "llama3.2:3b-instruct-fp16",
+        "created_at": "2025-08-11T15:51:18.170868Z",
+        "done": true,
+        "done_reason": "stop",
+        "total_duration": 5240614083,
+        "load_duration": 9823416,
+        "prompt_eval_count": 21,
+        "prompt_eval_duration": 21000000,
+        "eval_count": 310,
+        "eval_duration": 5209000000,
+        "response": "This is the start of a test. I'll provide some sample data and you can try to generate metrics based on it.\n\n**Data:**\n\nLet's say we have a dataset of user interactions with an e-commerce website. The data includes:\n\n| User ID | Product Name | Purchase Date | Quantity | Price |\n| --- | --- | --- | --- | --- |\n| 1 | iPhone 13 | 2022-01-01 | 2 | 999.99 |\n| 1 | MacBook Air | 2022-01-05 | 1 | 1299.99 |\n| 2 | Samsung TV | 2022-01-10 | 3 | 899.99 |\n| 3 | iPhone 13 | 2022-01-15 | 1 | 999.99 |\n| 4 | MacBook Pro | 2022-01-20 | 2 | 1799.99 |\n\n**Task:**\n\nYour task is to generate the following metrics based on this data:\n\n1. Average order value (AOV)\n2. Conversion rate\n3. Average revenue per user (ARPU)\n4. Customer lifetime value (CLV)\n\nPlease provide your answers in a format like this:\n\n| Metric | Value |\n| --- | --- |\n| AOV | 1234.56 |\n| Conversion Rate | 0.25 |\n| ARPU | 1000.00 |\n| CLV | 5000.00 |\n\nGo ahead and generate the metrics!",
+        "thinking": null,
+        "context": null
+      }
+    },
+    "is_streaming": false
+  }
+}

tests/integration/recordings/responses/4a3a4447b16b.json

Lines changed: 3 additions & 99 deletions
@@ -14,7 +14,7 @@
     "models": [
       {
         "model": "nomic-embed-text:latest",
-        "modified_at": "2025-08-18T12:47:56.732989-07:00",
+        "modified_at": "2025-08-22T15:38:31.980156-04:00",
         "digest": "0a109f422b47e3a30ba2b10eca18548e944e8a23073ee3f3e947efcf3c45e59f",
         "size": 274302450,
         "details": {
@@ -28,89 +28,9 @@
           "quantization_level": "F16"
         }
       },
-      {
-        "model": "llama3.2-vision:11b",
-        "modified_at": "2025-07-30T18:45:02.517873-07:00",
-        "digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
-        "size": 7816589186,
-        "details": {
-          "parent_model": "",
-          "format": "gguf",
-          "family": "mllama",
-          "families": [
-            "mllama"
-          ],
-          "parameter_size": "10.7B",
-          "quantization_level": "Q4_K_M"
-        }
-      },
-      {
-        "model": "llama3.2-vision:latest",
-        "modified_at": "2025-07-29T20:18:47.920468-07:00",
-        "digest": "6f2f9757ae97e8a3f8ea33d6adb2b11d93d9a35bef277cd2c0b1b5af8e8d0b1e",
-        "size": 7816589186,
-        "details": {
-          "parent_model": "",
-          "format": "gguf",
-          "family": "mllama",
-          "families": [
-            "mllama"
-          ],
-          "parameter_size": "10.7B",
-          "quantization_level": "Q4_K_M"
-        }
-      },
-      {
-        "model": "llama-guard3:1b",
-        "modified_at": "2025-07-25T14:39:44.978630-07:00",
-        "digest": "494147e06bf99e10dbe67b63a07ac81c162f18ef3341aa3390007ac828571b3b",
-        "size": 1600181919,
-        "details": {
-          "parent_model": "",
-          "format": "gguf",
-          "family": "llama",
-          "families": [
-            "llama"
-          ],
-          "parameter_size": "1.5B",
-          "quantization_level": "Q8_0"
-        }
-      },
       {
         "model": "all-minilm:l6-v2",
-        "modified_at": "2025-07-24T15:15:11.129290-07:00",
-        "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
-        "size": 45960996,
-        "details": {
-          "parent_model": "",
-          "format": "gguf",
-          "family": "bert",
-          "families": [
-            "bert"
-          ],
-          "parameter_size": "23M",
-          "quantization_level": "F16"
-        }
-      },
-      {
-        "model": "llama3.2:1b",
-        "modified_at": "2025-07-17T22:02:24.953208-07:00",
-        "digest": "baf6a787fdffd633537aa2eb51cfd54cb93ff08e28040095462bb63daf552878",
-        "size": 1321098329,
-        "details": {
-          "parent_model": "",
-          "format": "gguf",
-          "family": "llama",
-          "families": [
-            "llama"
-          ],
-          "parameter_size": "1.2B",
-          "quantization_level": "Q8_0"
-        }
-      },
-      {
-        "model": "all-minilm:latest",
-        "modified_at": "2025-06-03T16:50:10.946583-07:00",
+        "modified_at": "2025-07-28T10:48:35.010422-04:00",
         "digest": "1b226e2802dbb772b5fc32a58f103ca1804ef7501331012de126ab22f67475ef",
         "size": 45960996,
         "details": {
@@ -126,7 +46,7 @@
       },
       {
         "model": "llama3.2:3b",
-        "modified_at": "2025-05-01T11:15:23.797447-07:00",
+        "modified_at": "2025-07-11T10:37:11.453812-04:00",
         "digest": "a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72",
         "size": 2019393189,
         "details": {
@@ -139,22 +59,6 @@
           "parameter_size": "3.2B",
           "quantization_level": "Q4_K_M"
         }
-      },
-      {
-        "model": "llama3.2:3b-instruct-fp16",
-        "modified_at": "2025-04-30T15:33:48.939665-07:00",
-        "digest": "195a8c01d91ec3cb1e0aad4624a51f2602c51fa7d96110f8ab5a20c84081804d",
-        "size": 6433703586,
-        "details": {
-          "parent_model": "",
-          "format": "gguf",
-          "family": "llama",
-          "families": [
-            "llama"
-          ],
-          "parameter_size": "3.2B",
-          "quantization_level": "F16"
-        }
       }
     ]
   }
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "OpenAI test 0"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-843",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I don't have any information about an \"OpenAI test 0\". It's possible that you may be referring to a specific experiment or task being performed by OpenAI, but without more context, I can only speculate.\n\nHowever, I can tell you that OpenAI is a research organization that has been involved in various projects and tests related to artificial intelligence. If you could provide more context or clarify what you're referring to, I may be able to help further.\n\nIf you're looking for general information about OpenAI, I can try to provide some background on the organization:\n\nOpenAI is a non-profit research organization that was founded in 2015 with the goal of developing and applying advanced artificial intelligence to benefit humanity. The organization has made significant contributions to the field of AI, including the development of the popular language model, ChatGPT.\n\nIf you could provide more context or clarify what you're looking for, I'll do my best to assist you.",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 1755891518,
+        "model": "llama3.2:3b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 194,
+          "prompt_tokens": 30,
+          "total_tokens": 224,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "OpenAI test 1"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-726",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I'm ready to help with the test. What language would you like to use? Would you like to have a conversation, ask questions, or take a specific type of task?",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 1755891519,
+        "model": "llama3.2:3b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 37,
+          "prompt_tokens": 30,
+          "total_tokens": 67,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/v1/v1/chat/completions",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "OpenAI test 4"
+        }
+      ],
+      "stream": false
+    },
+    "endpoint": "/v1/chat/completions",
+    "model": "llama3.2:3b"
+  },
+  "response": {
+    "body": {
+      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
+      "__data__": {
+        "id": "chatcmpl-581",
+        "choices": [
+          {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+              "content": "I'm ready to help. What would you like to test? We could try a variety of things, such as:\n\n1. Conversational dialogue\n2. Language understanding\n3. Common sense reasoning\n4. Joke or pun generation\n5. Trivia or knowledge-based questions\n6. Creative writing or storytelling\n7. Summarization or paraphrasing\n\nLet me know which area you'd like to test, or suggest something else that's on your mind!",
+              "refusal": null,
+              "role": "assistant",
+              "annotations": null,
+              "audio": null,
+              "function_call": null,
+              "tool_calls": null
+            }
+          }
+        ],
+        "created": 1755891527,
+        "model": "llama3.2:3b",
+        "object": "chat.completion",
+        "service_tier": null,
+        "system_fingerprint": "fp_ollama",
+        "usage": {
+          "completion_tokens": 96,
+          "prompt_tokens": 30,
+          "total_tokens": 126,
+          "completion_tokens_details": null,
+          "prompt_tokens_details": null
+        }
+      }
+    },
+    "is_streaming": false
+  }
+}
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+{
+  "request": {
+    "method": "POST",
+    "url": "http://localhost:11434/api/generate",
+    "headers": {},
+    "body": {
+      "model": "llama3.2:3b-instruct-fp16",
+      "raw": true,
+      "prompt": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nTest metrics generation 0<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+      "options": {
+        "temperature": 0.0
+      },
+      "stream": false
+    },
+    "endpoint": "/api/generate",
+    "model": "llama3.2:3b-instruct-fp16"
+  },
+  "response": {
+    "body": {
+      "__type__": "ollama._types.GenerateResponse",
+      "__data__": {
+        "model": "llama3.2:3b-instruct-fp16",
+        "created_at": "2025-08-11T15:51:12.918723Z",
+        "done": true,
+        "done_reason": "stop",
+        "total_duration": 8868987792,
+        "load_duration": 2793275292,
+        "prompt_eval_count": 21,
+        "prompt_eval_duration": 250000000,
+        "eval_count": 344,
+        "eval_duration": 5823000000,
+        "response": "Here are some common test metrics used to evaluate the performance of a system:\n\n1. **Accuracy**: The proportion of correct predictions or classifications out of total predictions made.\n2. **Precision**: The ratio of true positives (correctly predicted instances) to the sum of true positives and false positives (incorrectly predicted instances).\n3. **Recall**: The ratio of true positives to the sum of true positives and false negatives (missed instances).\n4. **F1-score**: The harmonic mean of precision and recall, providing a balanced measure of both.\n5. **Mean Squared Error (MSE)**: The average squared difference between predicted and actual values.\n6. **Mean Absolute Error (MAE)**: The average absolute difference between predicted and actual values.\n7. **Root Mean Squared Percentage Error (RMSPE)**: The square root of the mean of the squared percentage differences between predicted and actual values.\n8. **Coefficient of Determination (R-squared, R2)**: Measures how well a model fits the data, with higher values indicating better fit.\n9. **Mean Absolute Percentage Error (MAPE)**: The average absolute percentage difference between predicted and actual values.\n10. **Normalized Mean Squared Error (NMSE)**: Similar to MSE, but normalized by the mean of the actual values.\n\nThese metrics can be used for various types of data, including:\n\n* Regression problems (e.g., predicting continuous values)\n* Classification problems (e.g., predicting categorical labels)\n* Time series forecasting\n* Clustering and dimensionality reduction\n\nWhen choosing a metric, consider the specific problem you're trying to solve, the type of data, and the desired level of precision.",
+        "thinking": null,
+        "context": null
+      }
+    },
+    "is_streaming": false
+  }
+}
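A note on the recording files above: the hex names such as 4a3a4447b16b.json are presumably derived from a hash of the normalized request, so replay mode can look up the stored response for an identical request. A sketch of that keying idea — the exact normalization and hash scheme here are assumptions, not taken from this commit:

# Assumed request->recording keying scheme, for illustration only.
import hashlib
import json

def recording_key(method: str, url: str, body: dict) -> str:
    # Canonicalize the request so equivalent requests hash identically.
    normalized = json.dumps({"method": method, "url": url, "body": body}, sort_keys=True)
    return hashlib.sha256(normalized.encode()).hexdigest()[:12]

# e.g. responses/<recording_key(...)>.json would hold the
# {"request": ..., "response": ...} document replayed during tests.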
