ci: Add vLLM support to integration testing infrastructure #3128

Open. Wants to merge 4 commits into main.
7 changes: 4 additions & 3 deletions .github/actions/run-and-record-tests/action.yml
@@ -58,9 +58,9 @@ runs:
      git add tests/integration/recordings/

      if [ "${{ inputs.run-vision-tests }}" == "true" ]; then
-       git commit -m "Recordings update from CI (vision)"
+       git commit -m "Recordings update from CI (vision) (${{ inputs.provider }})"
      else
-       git commit -m "Recordings update from CI"
+       git commit -m "Recordings update from CI (${{ inputs.provider }})"
      fi

      git fetch origin ${{ github.ref_name }}
@@ -76,7 +76,8 @@ runs:
    if: ${{ always() }}
    shell: bash
    run: |
-     sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
+     sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log 2>&1 || true
+     sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log 2>&1 || true

  - name: Upload logs
    if: ${{ always() }}
7 changes: 4 additions & 3 deletions .github/workflows/integration-tests.yml
@@ -21,7 +21,6 @@ on:
  schedule:
    # If changing the cron schedule, update the provider in the test-matrix job
    - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
-   - cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
  workflow_dispatch:
    inputs:
      test-all-client-versions:
@@ -47,7 +46,6 @@ concurrency:
  cancel-in-progress: true

jobs:

  run-replay-mode-tests:
    runs-on: ubuntu-latest
    name: ${{ format('Integration Tests ({0}, {1}, {2}, client={3}, vision={4})', matrix.client-type, matrix.provider, matrix.python-version, matrix.client-version, matrix.run-vision-tests) }}
@@ -57,11 +55,14 @@ jobs:
      matrix:
        client-type: [library, server]
        # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-       provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
+       provider: [ollama, vllm]
        # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
        python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
        client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}
        run-vision-tests: [true, false]
+       exclude:
+         - provider: vllm
+           run-vision-tests: true

    steps:
      - name: Checkout repository
59 changes: 57 additions & 2 deletions llama_stack/testing/inference_recorder.py
@@ -10,12 +10,16 @@
import json
import os
import sqlite3
+import uuid
from collections.abc import Generator
from contextlib import contextmanager
from enum import StrEnum
from pathlib import Path
from typing import Any, Literal, cast

+from openai.pagination import AsyncPage
+from openai.types.chat import ChatCompletion, ChatCompletionChunk

from llama_stack.log import get_logger

logger = get_logger(__name__, category="testing")
@@ -248,6 +252,20 @@ async def _patched_inference_method(original_method, self, client_type, endpoint
        recording = _current_storage.find_recording(request_hash)
        if recording:
            response_body = recording["response"]["body"]
+           if (
+               isinstance(response_body, list)
+               and len(response_body) > 0
+               and isinstance(response_body[0], ChatCompletionChunk)
+           ):
+               # We can't replay chatcompletions with the same id and we store them in a sqlite database with a unique constraint on the id.

Contributor: Can you explain this situation in more detail? Does this happen because we have both ollama and vllm in the same DB, or for some other reason?

Contributor (author): No, we have some tests in the same test run using identical inference requests (both using vllm). When this happens, they use the same recorded request and get the same recorded chat-id.

e.g. in the two variants of

tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=vllm/meta-llama/Llama-3.2-1B-Instruct-inference:chat_completion:non_streaming_01] PASSED
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=vllm/meta-llama/Llama-3.2-1B-Instruct-inference:chat_completion:non_streaming_01] FAILED                      

The second one fails because the vllm provider stored the first response in the DB with that ID; for the second, the server errors with something like:

INFO     2025-08-13 15:41:00,486 console_span_processor:62 telemetry:  14:41:00.412 [ERROR] Error executing endpoint                                  
         route='/v1/openai/v1/chat/completions' method='post': (sqlite3.IntegrityError) UNIQUE constraint failed: chat_completions.id                 
         [SQL: INSERT INTO chat_completions (id, created, model, choices, input_messages, access_attributes, owner_principal) VALUES (?, ?, ?, ?, ?,  
         ?, ?)]                                                                                                                                       
         [parameters: ('chatcmpl-1fda46f3388646e9a3bb7b079f8a8b68', 1755095964, 'meta-llama/Llama-3.2-1B-Instruct', '[{"finish_reason": "stop",       
         "index": 0, "logprobs": null, "message": {"content": "Humans do not live on any planet. Humans live on Earth, which is the ... (223          
         characters truncated) ... "role": "assistant", "annotations": null, "audio": null, "function_call": null, "tool_calls": null,                
         "reasoning_content": null}, "stop_reason": null}]', '[{"role": "user", "content": "Which planet do humans live on?", "name": null}]', 'null',
         None)]                                                                                                                                       
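
The failure mode above can be reproduced in isolation with nothing but sqlite3 and a table that, like the inference store, has a unique primary key on the completion id (the table here is a simplified stand-in, not the real schema):

```python
# Two replays of the same recording try to persist a response with the same id.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE chat_completions (id TEXT PRIMARY KEY, model TEXT)")

recorded_id = "chatcmpl-1fda46f3388646e9a3bb7b079f8a8b68"  # id baked into the recording
row = (recorded_id, "meta-llama/Llama-3.2-1B-Instruct")

# First test to replay the recording: the insert succeeds.
conn.execute("INSERT INTO chat_completions (id, model) VALUES (?, ?)", row)

# Second test replaying the same recording fails with
# "UNIQUE constraint failed: chat_completions.id".
try:
    conn.execute("INSERT INTO chat_completions (id, model) VALUES (?, ?)", row)
except sqlite3.IntegrityError as e:
    print(e)
```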

Contributor: @derekhiggins a bit confused -- this code is being changed during replay time, not recording time. The duplicate error would have happened during recording time, right? How does this fix prevent that?

Contributor (author): During replay, on the first test the recorded data is retrieved with an ID and inserted into the chat_completions table here (I think):
https://github.com/meta-llama/llama-stack/blob/0cbd93c5cc44b790c5b08a2f827944c9ac3223d7/llama_stack/core/routers/inference.py#L530

Then the second test comes along and tries to do the same with its response (which is the same recorded data) and fails, because the id is a primary key and should be unique:
https://github.com/meta-llama/llama-stack/blob/0cbd93c5cc44b790c5b08a2f827944c9ac3223d7/llama_stack/providers/utils/inference/inference_store.py#L37

Looks like we don't hit this problem with ollama because the id is (re)created by the provider:
https://github.com/meta-llama/llama-stack/blob/0cbd93c5cc44b790c5b08a2f827944c9ac3223d7/llama_stack/providers/remote/inference/ollama/ollama.py#L605

Contributor: I too have been seeing some weirdness with replay when it comes to chat completion ids, timestamps, etc. I wonder if something in the logic is slightly off here.

Contributor: I think we should simply make inference_store robust to collisions with an ON DUPLICATE IGNORE kind of clause.

Contributor (author): I'm concerned this would result in LLS silently ignoring problems with the upstream API; wouldn't it be better to refuse to deal with duplicate ids?

If doing this (i.e. ON DUPLICATE IGNORE), I don't see where it would be passed into the sqlstore; I guess it's a new param that would need to be added to the insert API?
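
For reference, this is roughly what duplicate-tolerant insert semantics look like at the raw SQLite level (simplified stand-in table again, not the project's sqlstore API; the sqlstore appears to go through SQLAlchemy, whose SQLite and Postgres dialects expose a similar on_conflict_do_nothing() construct):

```python
# Duplicate-tolerant inserts in SQLite; table and columns are illustrative.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE chat_completions (id TEXT PRIMARY KEY, model TEXT)")
row = ("chatcmpl-1fda46f3388646e9a3bb7b079f8a8b68", "meta-llama/Llama-3.2-1B-Instruct")

conn.execute("INSERT INTO chat_completions (id, model) VALUES (?, ?)", row)

# A second insert with the same id is silently skipped instead of raising IntegrityError.
conn.execute("INSERT OR IGNORE INTO chat_completions (id, model) VALUES (?, ?)", row)

# Equivalent upsert-style spelling (SQLite >= 3.24):
conn.execute("INSERT INTO chat_completions (id, model) VALUES (?, ?) ON CONFLICT(id) DO NOTHING", row)

assert conn.execute("SELECT COUNT(*) FROM chat_completions").fetchone()[0] == 1
```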

Contributor: Maybe Stack should generate an ID for our own purposes (not use the ID we get from inference providers)?

Contributor: Yeah that's a reasonable solution @ehhuang

Contributor (author): This seems like a good change. I've so far avoided any changes to the implementation of the provider to accommodate record/replay. Can I tackle this as a separate piece of work (update the provider and remove this workaround), which I'll take on ASAP? If so, I can open a new issue.

+               # So we generate a new id and replace the old one.
+               newid = uuid.uuid4().hex
+               response_body[0].id = "chatcmpl-" + newid
+           elif isinstance(response_body, ChatCompletion):
+               # We can't replay chatcompletions with the same id and we store them in a sqlite database with a unique constraint on the id.
+               # So we generate a new id and replace the old one.
+               newid = uuid.uuid4().hex
+               response_body.id = "chatcmpl-" + newid

            if recording["response"].get("is_streaming", False):

@@ -279,7 +297,8 @@ async def replay_stream():
        }

        # Determine if this is a streaming request based on request parameters
-       is_streaming = body.get("stream", False)
+       # or if the response is an AsyncPage (like models.list returns)
+       is_streaming = body.get("stream", False) or isinstance(response, AsyncPage)

        if is_streaming:
            # For streaming responses, we need to collect all chunks immediately before yielding
@@ -315,9 +334,11 @@ def patch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
+   from openai.resources.models import AsyncModels

    # Store original methods for both OpenAI and Ollama clients
    _original_methods = {
+       "models_list": AsyncModels.list,
        "chat_completions_create": AsyncChatCompletions.create,
        "completions_create": AsyncCompletions.create,
        "embeddings_create": AsyncEmbeddings.create,
@@ -329,7 +350,38 @@ def patch_inference_clients():
"ollama_list": OllamaAsyncClient.list,
}

    # Create patched methods for OpenAI client
+   # Special handling for models.list which needs to return something directly async-iterable
+   # Direct iteration: async for m in client.models.list()
+   # Await then iterate: res = await client.models.list(); async for m in res
+   def patched_models_list(self, *args, **kwargs):
+       class AsyncIterableModelsWrapper:
+           def __init__(self, original_method, client_self, args, kwargs):
+               self.original_method = original_method
+               self.client_self = client_self
+               self.args = args
+               self.kwargs = kwargs
+               self._result = None
+
+           def __aiter__(self):
+               return self._async_iter()
+
+           async def _async_iter(self):
+               # Get the result from the patched method
+               result = await _patched_inference_method(
+                   self.original_method, self.client_self, "openai", "/v1/models", *self.args, **self.kwargs
+               )
+               async for item in result:
+                   yield item
+
+           def __await__(self):
+               # When awaited, return self (since we're already async-iterable)
+               async def _return_self():
+                   return self
+
+               return _return_self().__await__()
+
+       return AsyncIterableModelsWrapper(_original_methods["models_list"], self, args, kwargs)
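
To make the dual calling convention concrete, here is a minimal standalone sketch of the same awaitable-and-async-iterable pattern, detached from the OpenAI client (names and data are illustrative):

```python
# A wrapper that supports both `async for x in wrapper` and `await wrapper`.
import asyncio


class AwaitableAsyncIterable:
    def __init__(self, items):
        self._items = items

    def __aiter__(self):
        # Each iteration gets a fresh async generator over the items.
        return self._gen()

    async def _gen(self):
        for item in self._items:
            yield item

    def __await__(self):
        # Awaiting the wrapper just returns the wrapper itself.
        async def _return_self():
            return self

        return _return_self().__await__()


async def main():
    # Pattern 1: direct iteration, like `async for m in client.models.list()`
    async for m in AwaitableAsyncIterable(["model-a", "model-b"]):
        print(m)

    # Pattern 2: await first, then iterate, like `res = await client.models.list()`
    res = await AwaitableAsyncIterable(["model-a", "model-b"])
    async for m in res:
        print(m)


asyncio.run(main())
```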

    async def patched_chat_completions_create(self, *args, **kwargs):
        return await _patched_inference_method(
            _original_methods["chat_completions_create"], self, "openai", "/v1/chat/completions", *args, **kwargs
@@ -346,6 +398,7 @@ async def patched_embeddings_create(self, *args, **kwargs):
        )

    # Apply OpenAI patches
+   AsyncModels.list = patched_models_list
    AsyncChatCompletions.create = patched_chat_completions_create
    AsyncCompletions.create = patched_completions_create
    AsyncEmbeddings.create = patched_embeddings_create
@@ -402,8 +455,10 @@ def unpatch_inference_clients():
    from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
    from openai.resources.completions import AsyncCompletions
    from openai.resources.embeddings import AsyncEmbeddings
+   from openai.resources.models import AsyncModels

    # Restore OpenAI client methods
+   AsyncModels.list = _original_methods["models_list"]
    AsyncChatCompletions.create = _original_methods["chat_completions_create"]
    AsyncCompletions.create = _original_methods["completions_create"]
    AsyncEmbeddings.create = _original_methods["embeddings_create"]
4 changes: 2 additions & 2 deletions scripts/integration-tests.sh
@@ -193,7 +193,7 @@ EXCLUDE_TESTS="builtin_tool or safety_with_image or code_interpreter or test_rag

# Additional exclusions for vllm provider
if [[ "$PROVIDER" == "vllm" ]]; then
-   EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls"
+   EXCLUDE_TESTS="${EXCLUDE_TESTS} or test_inference_store_tool_calls or test_text_chat_completion_structured_output"
fi

PYTEST_PATTERN="not( $EXCLUDE_TESTS )"
@@ -240,7 +240,7 @@ TEST_FILES=""
for test_subdir in $(echo "$TEST_SUBDIRS" | tr ',' '\n'); do
    # Skip certain test types for vllm provider
    if [[ "$PROVIDER" == "vllm" ]]; then
-       if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]]; then
+       if [[ "$test_subdir" == "safety" ]] || [[ "$test_subdir" == "post_training" ]] || [[ "$test_subdir" == "tool_runtime" ]] || [[ "$test_subdir" == "agents" ]]; then
            echo "Skipping $test_subdir for vllm provider"
            continue
        fi
Binary file modified tests/integration/recordings/index.sqlite