From b078440482036e9708aa96d0893d63c91b466bf7 Mon Sep 17 00:00:00 2001
From: Kaushik
Date: Thu, 6 Feb 2025 20:01:31 -0800
Subject: [PATCH 01/34] adding the fiddlecube provider

---
 llama_stack/providers/registry/safety.py      |   9 ++
 .../remote/safety/fiddlecube/__init__.py      |  18 +++
 .../remote/safety/fiddlecube/config.py        |  14 +++
 .../remote/safety/fiddlecube/fiddlecube.py    | 104 ++++++++++++++++++
 4 files changed, 145 insertions(+)
 create mode 100644 llama_stack/providers/remote/safety/fiddlecube/__init__.py
 create mode 100644 llama_stack/providers/remote/safety/fiddlecube/config.py
 create mode 100644 llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py

diff --git a/llama_stack/providers/registry/safety.py b/llama_stack/providers/registry/safety.py
index b9f7b6d784..1e57982e9c 100644
--- a/llama_stack/providers/registry/safety.py
+++ b/llama_stack/providers/registry/safety.py
@@ -85,4 +85,13 @@ def available_providers() -> List[ProviderSpec]:
                 config_class="llama_stack.providers.remote.safety.bedrock.BedrockSafetyConfig",
             ),
         ),
+        remote_provider_spec(
+            api=Api.safety,
+            adapter=AdapterSpec(
+                adapter_type="fiddlecube",
+                pip_packages=["httpx"],
+                module="llama_stack.providers.remote.safety.fiddlecube",
+                config_class="llama_stack.providers.remote.safety.fiddlecube.FiddlecubeSafetyConfig",
+            ),
+        ),
     ]
diff --git a/llama_stack/providers/remote/safety/fiddlecube/__init__.py b/llama_stack/providers/remote/safety/fiddlecube/__init__.py
new file mode 100644
index 0000000000..d4cceb6e72
--- /dev/null
+++ b/llama_stack/providers/remote/safety/fiddlecube/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from typing import Any
+
+from .config import FiddlecubeSafetyConfig
+
+
+async def get_adapter_impl(config: FiddlecubeSafetyConfig, _deps) -> Any:
+    from .fiddlecube import FiddlecubeSafetyAdapter
+
+    impl = FiddlecubeSafetyAdapter(config)
+    await impl.initialize()
+    return impl
diff --git a/llama_stack/providers/remote/safety/fiddlecube/config.py b/llama_stack/providers/remote/safety/fiddlecube/config.py
new file mode 100644
index 0000000000..f26e79f20a
--- /dev/null
+++ b/llama_stack/providers/remote/safety/fiddlecube/config.py
@@ -0,0 +1,14 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from pydantic import BaseModel
+
+from llama_models.schema_utils import json_schema_type
+
+
+@json_schema_type
+class FiddlecubeSafetyConfig(BaseModel):
+    pass
diff --git a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py
new file mode 100644
index 0000000000..84228c5f8a
--- /dev/null
+++ b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py
@@ -0,0 +1,104 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import json
+import logging
+
+from typing import Any, Dict, List
+
+from llama_stack.apis.inference import Message
+
+from llama_stack.apis.safety import (
+    RunShieldResponse,
+    Safety,
+    SafetyViolation,
+    ViolationLevel,
+)
+from llama_stack.apis.shields import Shield
+from llama_stack.providers.datatypes import ShieldsProtocolPrivate
+
+from .config import FiddlecubeSafetyConfig
+
+
+logger = logging.getLogger(__name__)
+
+
+class FiddlecubeSafetyAdapter(Safety, ShieldsProtocolPrivate):
+    def __init__(self, config: FiddlecubeSafetyConfig) -> None:
+        self.config = config
+        self.registered_shields = []
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def register_shield(self, shield: Shield) -> None:
+        pass
+
+    async def run_shield(
+        self, shield_id: str, messages: List[Message], params: Dict[str, Any] = None
+    ) -> RunShieldResponse:
+        # Set up FiddleCube API using httpx
+        # [TBD] convert the `messages` into format FiddleCube expects
+        # make a call to the API for guardrails
+        # convert the [TBD] response into the format RunShieldResponse expects
+        # return the response
+        return RunShieldResponse()
+
+        shield = await self.shield_store.get_shield(shield_id)
+        if not shield:
+            raise ValueError(f"Shield {shield_id} not found")
+
+        """This is the implementation for the bedrock guardrails. The input to the guardrails is to be of this format
+        ```content = [
+            {
+                "text": {
+                    "text": "Is the AB503 Product a better investment than the S&P 500?"
+                }
+            }
+        ]```
+        However the incoming messages are of this type UserMessage(content=....) coming from
+        https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/datatypes.py
+
+        They contain content, role . For now we will extract the content and default the "qualifiers": ["query"]
+        """
+
+        shield_params = shield.params
+        logger.debug(f"run_shield::{shield_params}::messages={messages}")
+
+        # - convert the messages into format Bedrock expects
+        content_messages = []
+        for message in messages:
+            content_messages.append({"text": {"text": message.content}})
+        logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:")
+
+        response = self.bedrock_runtime_client.apply_guardrail(
+            guardrailIdentifier=shield.provider_resource_id,
+            guardrailVersion=shield_params["guardrailVersion"],
+            source="OUTPUT",  # or 'INPUT' depending on your use case
+            content=content_messages,
+        )
+        if response["action"] == "GUARDRAIL_INTERVENED":
+            user_message = ""
+            metadata = {}
+            for output in response["outputs"]:
+                # guardrails returns a list - however for this implementation we will leverage the last values
+                user_message = output["text"]
+            for assessment in response["assessments"]:
+                # guardrails returns a list - however for this implementation we will leverage the last values
+                metadata = dict(assessment)
+
+            return RunShieldResponse(
+                violation=SafetyViolation(
+                    user_message=user_message,
+                    violation_level=ViolationLevel.ERROR,
+                    metadata=metadata,
+                )
+            )
+
+        return RunShieldResponse()

From fbcb1b6523290b8717899775b7c08ffabce600e8 Mon Sep 17 00:00:00 2001
From: Vini Katyal
Date: Fri, 7 Feb 2025 16:30:32 +0530
Subject: [PATCH 02/34] chore: calling api in mothership

---
 .../remote/safety/fiddlecube/config.py        |  1 +
 .../remote/safety/fiddlecube/fiddlecube.py    | 65 +++++++------------
 llama_stack/templates/together/build.yaml     |  1 +
 llama_stack/templates/together/run.yaml       |  8 ++-
 llama_stack/templates/together/together.py    |  4 +-
 5 files changed, 34 insertions(+), 45 deletions(-)
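Note (commentary only, ignored by `git am`): a minimal sketch of how the Safety API surface added in PATCH 01 is expected to be exercised, written against the `run_shield` signature above. Calling the adapter directly and the `fiddlecube-redteam` shield id (which only appears later, in `run.yaml` of PATCH 02) are illustrative assumptions, not code from this series.

```python
# Illustrative usage sketch -- not part of the applied diff.
# Assumes the files added in PATCH 01; "fiddlecube-redteam" mirrors the shield id
# registered later in this series and is a placeholder here.
import asyncio

from llama_stack.apis.inference import UserMessage
from llama_stack.providers.remote.safety.fiddlecube.config import FiddlecubeSafetyConfig
from llama_stack.providers.remote.safety.fiddlecube.fiddlecube import FiddlecubeSafetyAdapter


async def main() -> None:
    # Build the adapter directly; a running stack would go through get_adapter_impl().
    adapter = FiddlecubeSafetyAdapter(FiddlecubeSafetyConfig())
    await adapter.initialize()

    response = await adapter.run_shield(
        shield_id="fiddlecube-redteam",
        messages=[UserMessage(role="user", content="What is the capital of France?")],
        params={},
    )
    # At the PATCH 01 stage run_shield is still a stub, so no violation is expected.
    print(response.violation)


asyncio.run(main())
```

In a deployed stack the shield would be resolved through the shield store and the provider wired in via the registry entry added to `safety.py`, rather than instantiated by hand as above.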
diff --git a/llama_stack/providers/remote/safety/fiddlecube/config.py b/llama_stack/providers/remote/safety/fiddlecube/config.py index f26e79f20a..773796c027 100644 --- a/llama_stack/providers/remote/safety/fiddlecube/config.py +++ b/llama_stack/providers/remote/safety/fiddlecube/config.py @@ -11,4 +11,5 @@ @json_schema_type class FiddlecubeSafetyConfig(BaseModel): + api_url: str = "http://localhost:8000/api" pass diff --git a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py index 84228c5f8a..1ece48e8cb 100644 --- a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py +++ b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py @@ -6,6 +6,7 @@ import json import logging +import httpx from typing import Any, Dict, List @@ -38,59 +39,41 @@ async def shutdown(self) -> None: pass async def register_shield(self, shield: Shield) -> None: + print("Shield", shield) pass async def run_shield( self, shield_id: str, messages: List[Message], params: Dict[str, Any] = None ) -> RunShieldResponse: - # Set up FiddleCube API using httpx - # [TBD] convert the `messages` into format FiddleCube expects - # make a call to the API for guardrails - # convert the [TBD] response into the format RunShieldResponse expects - # return the response - return RunShieldResponse() - - shield = await self.shield_store.get_shield(shield_id) - if not shield: - raise ValueError(f"Shield {shield_id} not found") - - """This is the implementation for the bedrock guardrails. The input to the guardrails is to be of this format - ```content = [ - { - "text": { - "text": "Is the AB503 Product a better investment than the S&P 500?" + # Convert the `messages` into the format FiddleCube expects + content_messages = [{"text": {"text": message.content}} for message in messages] + logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:") + print("URL::::", self.config.api_url) + # Make a call to the FiddleCube API for guardrails + async with httpx.AsyncClient() as client: + response = await client.post( + self.config.api_url + '/safety/redteam/benchmark', + json={ + "model_name": "gpt-4o", + "system_prompt": content_messages, } - } - ]``` - However the incoming messages are of this type UserMessage(content=....) coming from - https://github.com/meta-llama/llama-models/blob/main/models/llama3/api/datatypes.py - - They contain content, role . 
For now we will extract the content and default the "qualifiers": ["query"] - """ + ) - shield_params = shield.params - logger.debug(f"run_shield::{shield_params}::messages={messages}") + print("Response:::", response) - # - convert the messages into format Bedrock expects - content_messages = [] - for message in messages: - content_messages.append({"text": {"text": message.content}}) - logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:") + # Check if the response is successful + if response.status_code != 200: + logger.error(f"FiddleCube API error: {response.status_code} - {response.text}") + raise RuntimeError("Failed to run shield with FiddleCube API") - response = self.bedrock_runtime_client.apply_guardrail( - guardrailIdentifier=shield.provider_resource_id, - guardrailVersion=shield_params["guardrailVersion"], - source="OUTPUT", # or 'INPUT' depending on your use case - content=content_messages, - ) - if response["action"] == "GUARDRAIL_INTERVENED": + # Convert the response into the format RunShieldResponse expects + response_data = response.json() + if response_data["action"] == "GUARDRAIL_INTERVENED": user_message = "" metadata = {} - for output in response["outputs"]: - # guardrails returns a list - however for this implementation we will leverage the last values + for output in response_data["outputs"]: user_message = output["text"] - for assessment in response["assessments"]: - # guardrails returns a list - however for this implementation we will leverage the last values + for assessment in response_data["assessments"]: metadata = dict(assessment) return RunShieldResponse( diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 90ee5bcee5..ce5bec920c 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -10,6 +10,7 @@ distribution_spec: - remote::pgvector safety: - inline::llama-guard + - remote::fiddlecube agents: - inline::meta-reference telemetry: diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index c2afd98e9b..3a996915f1 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -32,6 +32,9 @@ providers: - provider_id: llama-guard provider_type: inline::llama-guard config: {} + - provider_id: fiddlecube + provider_type: remote::fiddlecube + config: {} agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -143,8 +146,9 @@ models: model_id: all-MiniLM-L6-v2 provider_id: sentence-transformers model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B +shields: +- shield_id: fiddlecube-redteam + provider_id: fiddlecube vector_dbs: [] datasets: [] scoring_fns: [] diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index b7ac130ed1..bb6a23cb36 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -28,7 +28,7 @@ def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::together"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], + "safety": ["inline::llama-guard", "remote::fiddlecube"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], "eval": ["inline::meta-reference"], @@ -103,7 +103,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ 
"inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], + "vector_io": [vector_io_provider] }, default_models=default_models + [embedding_model], default_tool_groups=default_tool_groups, From fe03e620c392a06f3a0855ab54c1a30b4651cad6 Mon Sep 17 00:00:00 2001 From: Vini Katyal Date: Fri, 7 Feb 2025 17:05:24 +0530 Subject: [PATCH 03/34] chore: hard coded URL for now --- distributions/dependencies.json | 3 ++ .../remote/safety/fiddlecube/fiddlecube.py | 31 +++++-------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 6babf34409..029824a969 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,4 +1,7 @@ { + "fiddlecube": [ + "httpx" + ], "bedrock": [ "aiosqlite", "autoevals", diff --git a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py index 1ece48e8cb..ee208b5590 100644 --- a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py +++ b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py @@ -50,38 +50,21 @@ async def run_shield( logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:") print("URL::::", self.config.api_url) # Make a call to the FiddleCube API for guardrails - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(timeout=30.0) as client: response = await client.post( - self.config.api_url + '/safety/redteam/benchmark', - json={ - "model_name": "gpt-4o", - "system_prompt": content_messages, - } - ) + self.config.api_url + '/safety/redteam/benchmark?model_name=gpt-4o &system_prompt=what%20is%20the%20red%20color%20on%20your%20face' + ) - print("Response:::", response) + print("Response:::", response.status_code) # Check if the response is successful if response.status_code != 200: - logger.error(f"FiddleCube API error: {response.status_code} - {response.text}") + print(f"FiddleCube API error: {response.status_code} - {response.text}") raise RuntimeError("Failed to run shield with FiddleCube API") # Convert the response into the format RunShieldResponse expects response_data = response.json() - if response_data["action"] == "GUARDRAIL_INTERVENED": - user_message = "" - metadata = {} - for output in response_data["outputs"]: - user_message = output["text"] - for assessment in response_data["assessments"]: - metadata = dict(assessment) - - return RunShieldResponse( - violation=SafetyViolation( - user_message=user_message, - violation_level=ViolationLevel.ERROR, - metadata=metadata, - ) - ) + print("Response data", response_data) + return RunShieldResponse() From bf9a733ae60e998fdc094b0f6821077b81894f22 Mon Sep 17 00:00:00 2001 From: Kaushik Date: Mon, 10 Feb 2025 15:30:48 -0800 Subject: [PATCH 04/34] adding prod url also cleans up print messages --- .../remote/safety/fiddlecube/config.py | 8 ++++--- .../remote/safety/fiddlecube/fiddlecube.py | 23 +++++++++++-------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/llama_stack/providers/remote/safety/fiddlecube/config.py b/llama_stack/providers/remote/safety/fiddlecube/config.py index 773796c027..0ba5d7c32f 100644 --- a/llama_stack/providers/remote/safety/fiddlecube/config.py +++ b/llama_stack/providers/remote/safety/fiddlecube/config.py @@ -4,12 +4,14 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-from pydantic import BaseModel +from typing import List + +from pydantic import BaseModel, Field from llama_models.schema_utils import json_schema_type @json_schema_type class FiddlecubeSafetyConfig(BaseModel): - api_url: str = "http://localhost:8000/api" - pass + api_url: str = "https://api.fiddlecube.ai/api" + excluded_categories: List[str] = Field(default_factory=list) diff --git a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py index ee208b5590..25e110b9a3 100644 --- a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py +++ b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py @@ -15,8 +15,6 @@ from llama_stack.apis.safety import ( RunShieldResponse, Safety, - SafetyViolation, - ViolationLevel, ) from llama_stack.apis.shields import Shield from llama_stack.providers.datatypes import ShieldsProtocolPrivate @@ -48,23 +46,30 @@ async def run_shield( # Convert the `messages` into the format FiddleCube expects content_messages = [{"text": {"text": message.content}} for message in messages] logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:") - print("URL::::", self.config.api_url) + # Make a call to the FiddleCube API for guardrails async with httpx.AsyncClient(timeout=30.0) as client: + request_body = { + "messages": [message.model_dump(mode="json") for message in messages], + } + if params.get("excluded_categories"): + request_body["excluded_categories"] = params.get("excluded_categories") + headers = {"Content-Type": "application/json"} response = await client.post( - self.config.api_url + '/safety/redteam/benchmark?model_name=gpt-4o &system_prompt=what%20is%20the%20red%20color%20on%20your%20face' - ) + f"{self.config.api_url}/safety/guard/check", + json=request_body, + headers=headers, + ) - print("Response:::", response.status_code) + logger.debug("Response:::", response.status_code) # Check if the response is successful if response.status_code != 200: - print(f"FiddleCube API error: {response.status_code} - {response.text}") + logger.error(f"FiddleCube API error: {response.status_code} - {response.text}") raise RuntimeError("Failed to run shield with FiddleCube API") # Convert the response into the format RunShieldResponse expects response_data = response.json() - print("Response data", response_data) - + logger.debug("Response data", response_data) return RunShieldResponse() From b15cf63f64e1a58bd8ac4c17088f4150c5d9778f Mon Sep 17 00:00:00 2001 From: Kaushik Date: Mon, 10 Feb 2025 15:51:51 -0800 Subject: [PATCH 05/34] add safety violation code also improved error handling. used to get fields to prevent field errors. 
process safety violation sent from the server and return a response --- .../remote/safety/fiddlecube/fiddlecube.py | 29 +++++++++++++++---- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py index 25e110b9a3..b00066eaff 100644 --- a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py +++ b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py @@ -16,6 +16,7 @@ RunShieldResponse, Safety, ) +from llama_stack.apis.safety.safety import SafetyViolation, ViolationLevel from llama_stack.apis.shields import Shield from llama_stack.providers.datatypes import ShieldsProtocolPrivate @@ -43,11 +44,6 @@ async def register_shield(self, shield: Shield) -> None: async def run_shield( self, shield_id: str, messages: List[Message], params: Dict[str, Any] = None ) -> RunShieldResponse: - # Convert the `messages` into the format FiddleCube expects - content_messages = [{"text": {"text": message.content}} for message in messages] - logger.debug(f"run_shield::final:messages::{json.dumps(content_messages, indent=2)}:") - - # Make a call to the FiddleCube API for guardrails async with httpx.AsyncClient(timeout=30.0) as client: request_body = { "messages": [message.model_dump(mode="json") for message in messages], @@ -70,6 +66,27 @@ async def run_shield( # Convert the response into the format RunShieldResponse expects response_data = response.json() - logger.debug("Response data", response_data) + logger.debug("Response data: %s", json.dumps(response_data, indent=2)) + + # Check if there's a violation based on the response structure + if response_data.get("action") == "GUARDRAIL_INTERVENED": + user_message = "" + metadata = {} + + outputs = response_data.get("outputs", []) + if outputs: + user_message = outputs[-1].get("text", "Safety violation detected") + + assessments = response_data.get("assessments", []) + for assessment in assessments: + metadata.update(dict(assessment)) + + return RunShieldResponse( + violation=SafetyViolation( + user_message=user_message, + violation_level=ViolationLevel.ERROR, + metadata=metadata, + ) + ) return RunShieldResponse() From 9bec7b6fa6fa185daf60beb7da25169419719e35 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 6 Feb 2025 17:30:21 -0800 Subject: [PATCH 06/34] doc: getting started notebook (#996) # What does this PR do? Fix link ## Test Plan --- docs/source/building_applications/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/building_applications/index.md b/docs/source/building_applications/index.md index 45dca5a1cc..e89a90299c 100644 --- a/docs/source/building_applications/index.md +++ b/docs/source/building_applications/index.md @@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them. 
-**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb) +**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) Here are some key topics that will help you build effective agents: From 61f14ed35a47c8d91494c46847706224453e9133 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Thu, 6 Feb 2025 20:19:38 -0800 Subject: [PATCH 07/34] test: fix flaky agent test (#1002) Summary: Test Plan: LLAMA_STACK_CONFIG=fireworks pytest -s -v tests/client-sdk/ --safety-shield meta-llama/Llama-Guard-3-8 all tests passed --- tests/client-sdk/agents/test_agents.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 2b1db7df03..302c1c6e7f 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -269,6 +269,7 @@ def test_override_system_message_behavior(llama_stack_client, agent_config): **agent_config, "instructions": "You are a pirate", "client_tools": [client_tool.get_tool_definition()], + "model": "meta-llama/Llama-3.2-3B-Instruct", } agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,)) From 70197c24f3e6108002ca9acdcf2eb829a4d652ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 17:04:25 +0100 Subject: [PATCH 08/34] test: rm unused exception alias in pytest.raises (#991) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Refactored tests by removing unused exception alias (as exc_info) in pytest.raises, improving code clarity and reducing lint warnings. exc_info was never used. Signed-off-by: Sébastien Han ## Test Plan Please describe: - tests you ran to verify your changes with result summaries. - provide instructions so it can be reproduced. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
Signed-off-by: Sébastien Han --- llama_stack/providers/tests/datasetio/test_datasetio.py | 4 ++-- .../providers/tests/inference/test_model_registration.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py index cf28045a4a..fd76bafe02 100644 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -95,7 +95,7 @@ async def test_register_dataset(self, datasetio_stack): assert len(response) == 1 assert response[0].identifier == "test_dataset" - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): # unregister a dataset that does not exist await datasets_impl.unregister_dataset("test_dataset2") @@ -104,7 +104,7 @@ async def test_register_dataset(self, datasetio_stack): assert isinstance(response, list) assert len(response) == 0 - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await datasets_impl.unregister_dataset("test_dataset") @pytest.mark.asyncio diff --git a/llama_stack/providers/tests/inference/test_model_registration.py b/llama_stack/providers/tests/inference/test_model_registration.py index 96a34ec0e1..664564d222 100644 --- a/llama_stack/providers/tests/inference/test_model_registration.py +++ b/llama_stack/providers/tests/inference/test_model_registration.py @@ -32,7 +32,7 @@ async def test_register_unsupported_model(self, inference_stack, inference_model ) # Try to register a model that's too large for local inference - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3.1-70B-Instruct", ) @@ -42,7 +42,7 @@ async def test_register_nonexistent_model(self, inference_stack): _, models_impl = inference_stack # Try to register a non-existent model - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3-NonExistent-Model", ) @@ -59,7 +59,7 @@ async def test_register_with_llama_model(self, inference_stack): }, ) - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={ @@ -88,7 +88,7 @@ async def test_initialize_model_during_registering(self, inference_stack): async def test_register_with_invalid_llama_model(self, inference_stack): _, models_impl = inference_stack - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={"llama_model": "invalid-llama-model"}, From 47f51f28661ad9db45217ffe4b088f9dbf96d285 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 7 Feb 2025 12:02:15 -0500 Subject: [PATCH 09/34] fix: List providers command prints out non-existing APIs from registry. Fixes #966 (#969) Fixes #966. Verified that: 1. Correct list of APIs are printed out when running `llama stack list-providers` 2. `llama stack list-providers ` works as expected. 
--------- Signed-off-by: Yuan Tang --- llama_stack/cli/stack/list_providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py index 96e978826a..909fea0309 100644 --- a/llama_stack/cli/stack/list_providers.py +++ b/llama_stack/cli/stack/list_providers.py @@ -22,9 +22,9 @@ def __init__(self, subparsers: argparse._SubParsersAction): self.parser.set_defaults(func=self._run_providers_list_cmd) def _add_arguments(self): - from llama_stack.distribution.datatypes import Api + from llama_stack.distribution.distribution import providable_apis - api_values = [a.value for a in Api] + api_values = [api.value for api in providable_apis()] self.parser.add_argument( "api", type=str, From 7d88da411e923114c4ac03e3dedb135b75d4166c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 09:03:35 -0800 Subject: [PATCH 10/34] Delete CHANGELOG.md We use weekly releases as a way to communicate important improvements. Keeping this information synced across is more overhead than we have bandwidth for right now. We may change this process over time. --- CHANGELOG.md | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 04cd097774..0000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,44 +0,0 @@ -# Changelog - -## 0.2.0 - -### Added - -### Changed - -### Removed - - -## 0.0.53 - -### Added -- Resource-oriented design for models, shields, memory banks, datasets and eval tasks -- Persistence for registered objects with distribution -- Ability to persist memory banks created for FAISS -- PostgreSQL KVStore implementation -- Environment variable placeholder support in run.yaml files -- Comprehensive Zero-to-Hero notebooks and quickstart guides -- Support for quantized models in Ollama -- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM -- Bedrock distribution with safety shields support -- Evals API with task registration and scoring functions -- MMLU and SimpleQA benchmark scoring functions -- Huggingface dataset provider integration for benchmarks -- Support for custom dataset registration from local paths -- Benchmark evaluation CLI tools with visualization tables -- RAG evaluation scoring functions and metrics -- Local persistence for datasets and eval tasks - -### Changed -- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) -- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) -- Updated API signatures for dataset and eval task registration -- Restructured folder organization for providers -- Enhanced Docker build configuration -- Added version prefixing for REST API routes -- Enhanced evaluation task registration workflow -- Improved benchmark evaluation output formatting -- Restructured evals folder organization for better modularity - -### Removed -- `llama stack configure` command From 57a6c278bd599e4cdabbeeef14c35b39fdacf640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 18:35:00 +0100 Subject: [PATCH 11/34] chore: add missing ToolConfig import in groq.py (#983) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Imported `ToolConfig` from the `llama_stack.apis.inference` module to resolve missing reference and ensure proper functionality within the `groq.py` file. 
Signed-off-by: Sébastien Han ## Test Plan Without the change, pytest will run with the following error: ``` uv run pytest -v -s -k "ollama" llama_stack/providers/tests/ /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.13/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================ test session starts ============================================= platform darwin -- Python 3.13.1, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.13.1', 'Platform': 'macOS-15.3-arm64-arm-64bit-Mach-O', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 379 items / 1 error / 349 deselected / 30 selected =================================================== ERRORS =================================================== __________________ ERROR collecting llama_stack/providers/tests/inference/groq/test_init.py __________________ llama_stack/providers/tests/inference/groq/test_init.py:11: in from llama_stack.providers.remote.inference.groq.groq import GroqInferenceAdapter llama_stack/providers/remote/inference/groq/groq.py:72: in class GroqInferenceAdapter(Inference, ModelRegistryHelper, NeedsRequestProviderData): llama_stack/providers/remote/inference/groq/groq.py:102: in GroqInferenceAdapter tool_config: Optional[ToolConfig] = None, E NameError: name 'ToolConfig' is not defined ========================================== short test summary info =========================================== ERROR llama_stack/providers/tests/inference/groq/test_init.py - NameError: name 'ToolConfig' is not defined !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! =============================== 349 deselected, 22 warnings, 1 error in 0.28s ================================ ``` With the change the test continues to run and fails with a different error: ``` uv run pytest -v -s llama_stack/providers/tests/ /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.13/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. 
Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================ test session starts ============================================= platform darwin -- Python 3.13.1, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.13.1', 'Platform': 'macOS-15.3-arm64-arm-64bit-Mach-O', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 342 items / 1 error =================================================== ERRORS =================================================== ______________ ERROR collecting llama_stack/providers/tests/inference/test_vision_inference.py _______________ llama_stack/providers/tests/inference/test_vision_inference.py:29: in class TestVisionModelInference: llama_stack/providers/tests/inference/test_vision_inference.py:35: in TestVisionModelInference ImageContentItem(image=dict(data=PASTA_IMAGE)), E pydantic_core._pydantic_core.ValidationError: 1 validation error for ImageContentItem E image.data E Input should be a valid string, unable to parse raw data as a unicode string [type=string_unicode, input_value=b'\xff\xd8\xff\xe0\x00\x1...0\xe6\x9f5\xb5?\xff\xd9', input_type=bytes] E For further information visit https://errors.pydantic.dev/2.10/v/string_unicode ========================================== short test summary info =========================================== ERROR llama_stack/providers/tests/inference/test_vision_inference.py - pydantic_core._pydantic_core.ValidationError: 1 validation error for ImageContentItem !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ======================================= 22 warnings, 1 error in 0.25s ======================================== ``` Which is fixed in https://github.com/meta-llama/llama-stack/pull/1003. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. 
Signed-off-by: Sébastien Han --- llama_stack/providers/remote/inference/groq/groq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 461b3ee61a..4e6cc2d6bd 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -26,6 +26,7 @@ Message, ResponseFormat, ToolChoice, + ToolConfig, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.remote.inference.groq.config import GroqConfig From 4387fb32587ae6291b0621619dbd411eec15f9f7 Mon Sep 17 00:00:00 2001 From: ehhuang Date: Fri, 7 Feb 2025 09:35:38 -0800 Subject: [PATCH 12/34] test: remove flaky agent test (#1006) Summary: Test Plan: --- tests/client-sdk/agents/test_agents.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 302c1c6e7f..85b7af831c 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -263,7 +263,8 @@ def test_custom_tool(llama_stack_client, agent_config): assert "CustomTool" in logs_str -def test_override_system_message_behavior(llama_stack_client, agent_config): +# TODO: fix this flaky test +def xtest_override_system_message_behavior(llama_stack_client, agent_config): client_tool = TestClientTool() agent_config = { **agent_config, From a3d0d3c9e122bd9f384aa6a3aa7cd323119b224e Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Fri, 7 Feb 2025 12:35:49 -0500 Subject: [PATCH 13/34] test: Split inference tests to text and vision (#1008) # What does this PR do? This PR splits the inference tests into text and vision to make testing on vLLM provider easier as mentioned in https://github.com/meta-llama/llama-stack/pull/951 since serving multiple models (e.g. Llama-3.2-11B-Vision-Instruct and Llama-3.1-8B-Instruct) on a single port using the OpenAI API is [not supported yet](https://docs.vllm.ai/en/v0.5.5/serving/faq.html) so it's a bit tricky to test both at the same time. ## Test Plan All previously passing tests related to text still pass: `LLAMA_STACK_BASE_URL=http://localhost:5002 pytest -v tests/client-sdk/inference/test_text_inference.py` All vision tests passed via `LLAMA_STACK_BASE_URL=http://localhost:5002 pytest -v tests/client-sdk/inference/test_vision_inference.py`. 
Signed-off-by: Yuan Tang --- .github/workflows/tests.yml | 2 +- tests/client-sdk/README.md | 6 +- ...st_inference.py => test_text_inference.py} | 118 ---------------- .../inference/test_vision_inference.py | 133 ++++++++++++++++++ 4 files changed, 137 insertions(+), 122 deletions(-) rename tests/client-sdk/inference/{test_inference.py => test_text_inference.py} (73%) create mode 100644 tests/client-sdk/inference/test_vision_inference.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ff13a4cb02..cfc26000bb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,7 +54,7 @@ jobs: echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV" export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct - LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT" + LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT" - name: Output reports to the job summary if: always() diff --git a/tests/client-sdk/README.md b/tests/client-sdk/README.md index 13142d46fd..d4d439d962 100644 --- a/tests/client-sdk/README.md +++ b/tests/client-sdk/README.md @@ -4,18 +4,18 @@ You can run llama stack integration tests on either a Llama Stack Library or a L To test on a Llama Stack library with certain configuration, run ```bash LLAMA_STACK_CONFIG=./llama_stack/templates/cerebras/run.yaml -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` or just the template name ```bash LLAMA_STACK_CONFIG=together -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` To test on a Llama Stack endpoint, run ```bash LLAMA_STACK_BASE_URL=http//localhost:8089 -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference ``` ## Report Generation diff --git a/tests/client-sdk/inference/test_inference.py b/tests/client-sdk/inference/test_text_inference.py similarity index 73% rename from tests/client-sdk/inference/test_inference.py rename to tests/client-sdk/inference/test_text_inference.py index 9bbd1061a1..4b24f1d389 100644 --- a/tests/client-sdk/inference/test_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -4,9 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import base64 -import pathlib - import pytest from pydantic import BaseModel @@ -56,23 +53,6 @@ def get_weather_tool_definition(): } -@pytest.fixture -def image_path(): - return pathlib.Path(__file__).parent / "dog.png" - - -@pytest.fixture -def base64_image_data(image_path): - # Convert the image to base64 - return base64.b64encode(image_path.read_bytes()).decode("utf-8") - - -@pytest.fixture -def base64_image_url(base64_image_data, image_path): - # suffix includes the ., so we remove it - return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" - - def test_text_completion_non_streaming(llama_stack_client, text_model_id): response = llama_stack_client.inference.completion( content="Complete the sentence using one word: Roses are red, violets are ", @@ -299,101 +279,3 @@ class AnswerFormat(BaseModel): assert answer.last_name == "Jordan" assert answer.year_of_birth == 1963 assert answer.num_seasons_in_nba == 15 - - -def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 - assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) - - -def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=True, - ) - streamed_content = "" - for chunk in response: - streamed_content += chunk.event.delta.text.lower() - assert len(streamed_content) > 0 - assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) - - -@pytest.mark.parametrize("type_", ["url", "data"]) -def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): - image_spec = { - "url": { - "type": "image", - "image": { - "url": { - "uri": base64_image_url, - }, - }, - }, - "data": { - "type": "image", - "image": { - "data": base64_image_data, - }, - }, - }[type_] - - message = { - "role": "user", - "content": [ - image_spec, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 diff --git a/tests/client-sdk/inference/test_vision_inference.py b/tests/client-sdk/inference/test_vision_inference.py new file mode 100644 index 0000000000..df4b9d9339 --- /dev/null +++ b/tests/client-sdk/inference/test_vision_inference.py @@ -0,0 +1,133 @@ +# Copyright (c) 
Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import base64 +import pathlib + +import pytest + + +@pytest.fixture(scope="session") +def inference_provider_type(llama_stack_client): + providers = llama_stack_client.providers.list() + inference_providers = [p for p in providers if p.api == "inference"] + assert len(inference_providers) > 0, "No inference providers found" + return inference_providers[0].provider_type + + +@pytest.fixture +def image_path(): + return pathlib.Path(__file__).parent / "dog.png" + + +@pytest.fixture +def base64_image_data(image_path): + # Convert the image to base64 + return base64.b64encode(image_path.read_bytes()).decode("utf-8") + + +@pytest.fixture +def base64_image_url(base64_image_data, image_path): + # suffix includes the ., so we remove it + return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" + + +def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 + assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) + + +def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=True, + ) + streamed_content = "" + for chunk in response: + streamed_content += chunk.event.delta.text.lower() + assert len(streamed_content) > 0 + assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) + + +@pytest.mark.parametrize("type_", ["url", "data"]) +def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): + image_spec = { + "url": { + "type": "image", + "image": { + "url": { + "uri": base64_image_url, + }, + }, + }, + "data": { + "type": "image", + "image": { + "data": base64_image_data, + }, + }, + }[type_] + + message = { + "role": "user", + "content": [ + image_spec, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 From ca80e7a67415c88268109b55e87d84e8c7d7479c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 09:39:08 -0800 Subject: [PATCH 14/34] feat: Add HTTPS serving option (#1000) # What does this PR do? 
Enables HTTPS option for Llama Stack. While doing so, introduces a `ServerConfig` sub-structure to house all server related configuration (port, ssl, etc.) Also simplified the `start_container.sh` entrypoint to simply be `python` instead of a complex bash command line. ## Test Plan Conda: Run: ```bash $ llama stack build --template together $ llama stack run --port 8322 # ensure server starts $ llama-stack-client configure --endpoint http://localhost:8322 $ llama-stack-client models list ``` Create a self-signed SSL key / cert pair. Then, using a local checkout of `llama-stack-client-python`, change https://github.com/meta-llama/llama-stack-client-python/blob/main/src/llama_stack_client/_base_client.py#L759 to add `kwargs.setdefault("verify", False)` so SSL verification is disabled. Then: ```bash $ llama stack run --port 8322 --tls-keyfile --tls-certfile $ llama-stack-client configure --endpoint https://localhost:8322 # notice the `https` $ llama-stack-client models list ``` Also tested with containers (but of course one needs to make sure the cert and key files are appropriately provided to the container.) --- llama_stack/cli/stack/run.py | 13 +++++++ llama_stack/distribution/datatypes.py | 22 +++++++++++ llama_stack/distribution/server/server.py | 42 +++++++++++++++++++-- llama_stack/distribution/start_conda_env.sh | 5 ++- llama_stack/distribution/start_container.sh | 12 +++++- 5 files changed, 88 insertions(+), 6 deletions(-) diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index f84def184b..e7d6df2926 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -55,6 +55,16 @@ def _add_arguments(self): default=[], metavar="KEY=VALUE", ) + self.parser.add_argument( + "--tls-keyfile", + type=str, + help="Path to TLS key file for HTTPS", + ) + self.parser.add_argument( + "--tls-certfile", + type=str, + help="Path to TLS certificate file for HTTPS", + ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: import importlib.resources @@ -178,4 +188,7 @@ def get_conda_prefix(env_name): return run_args.extend(["--env", f"{key}={value}"]) + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + run_with_pty(run_args) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 8b579b6365..97706f22a5 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,23 @@ class Provider(BaseModel): config: Dict[str, Any] +class ServerConfig(BaseModel): + port: int = Field( + default=8321, + description="Port to listen on", + ge=1024, + le=65535, + ) + tls_certfile: Optional[str] = Field( + default=None, + description="Path to TLS certificate file for HTTPS", + ) + tls_keyfile: Optional[str] = Field( + default=None, + description="Path to TLS key file for HTTPS", + ) + + class StackRunConfig(BaseModel): version: str = LLAMA_STACK_RUN_CONFIG_VERSION @@ -159,6 +176,11 @@ class StackRunConfig(BaseModel): eval_tasks: List[EvalTaskInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = Field(default_factory=list) + server: ServerConfig = Field( + default_factory=ServerConfig, + description="Configuration for the HTTP(S) server", + ) + class BuildConfig(BaseModel): version: str = LLAMA_STACK_BUILD_CONFIG_VERSION diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index fcd0e3cad1..d2c32de119 100644 --- 
a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -282,8 +282,19 @@ def main(): action="append", help="Environment variables in KEY=value format. Can be specified multiple times.", ) + parser.add_argument( + "--tls-keyfile", + help="Path to TLS key file for HTTPS", + required="--tls-certfile" in sys.argv, + ) + parser.add_argument( + "--tls-certfile", + help="Path to TLS certificate file for HTTPS", + required="--tls-keyfile" in sys.argv, + ) args = parser.parse_args() + if args.env: for env_pair in args.env: try: @@ -381,11 +392,36 @@ def main(): import uvicorn - # FYI this does not do hot-reloads + # Configure SSL if certificates are provided + port = args.port or config.server.port + + ssl_config = None + if args.tls_keyfile: + keyfile = args.tls_keyfile + certfile = args.tls_certfile + else: + keyfile = config.server.tls_keyfile + certfile = config.server.tls_certfile + + if keyfile and certfile: + ssl_config = { + "ssl_keyfile": keyfile, + "ssl_certfile": certfile, + } + print(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}") listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0" - print(f"Listening on {listen_host}:{args.port}") - uvicorn.run(app, host=listen_host, port=args.port) + print(f"Listening on {listen_host}:{port}") + + uvicorn_config = { + "app": app, + "host": listen_host, + "port": port, + } + if ssl_config: + uvicorn_config.update(ssl_config) + + uvicorn.run(**uvicorn_config) def extract_path_params(route: str) -> List[str]: diff --git a/llama_stack/distribution/start_conda_env.sh b/llama_stack/distribution/start_conda_env.sh index c37f30ef00..fe830059ff 100755 --- a/llama_stack/distribution/start_conda_env.sh +++ b/llama_stack/distribution/start_conda_env.sh @@ -34,6 +34,7 @@ shift # Process environment variables from --env arguments env_vars="" +other_args="" while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -48,6 +49,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -61,4 +63,5 @@ $CONDA_PREFIX/bin/python \ -m llama_stack.distribution.server.server \ --yaml-config "$yaml_config" \ --port "$port" \ - $env_vars + $env_vars \ + $other_args diff --git a/llama_stack/distribution/start_container.sh b/llama_stack/distribution/start_container.sh index 2c5d65d09e..a5f543fb43 100755 --- a/llama_stack/distribution/start_container.sh +++ b/llama_stack/distribution/start_container.sh @@ -40,8 +40,12 @@ shift port="$1" shift +# Initialize other_args +other_args="" + # Process environment variables from --env arguments env_vars="" + while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -55,6 +59,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -93,5 +98,8 @@ $CONTAINER_BINARY run $CONTAINER_OPTS -it \ -v "$yaml_config:/app/config.yaml" \ $mounts \ --env LLAMA_STACK_PORT=$port \ - --entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \ - $container_image:$version_tag + --entrypoint python \ + $container_image:$version_tag \ + -m llama_stack.distribution.server.server \ + --yaml-config /app/config.yaml \ + $other_args From d4cb624df5cbd9d67cce7a369377679818b41c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 18:44:16 +0100 Subject: [PATCH 15/34] test: encode image data as base64 (#1003) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? 
Previously, the test was failing due to a pydantic validation error caused by passing raw binary image data instead of a valid Unicode string. This fix encodes the image data as base64, ensuring it is a valid string format compatible with `ImageContentItem`. Error: ``` ______________ ERROR collecting llama_stack/providers/tests/inference/test_vision_inference.py _______________ llama_stack/providers/tests/inference/test_vision_inference.py:31: in class TestVisionModelInference: llama_stack/providers/tests/inference/test_vision_inference.py:37: in TestVisionModelInference ImageContentItem(image=dict(data=PASTA_IMAGE)), E pydantic_core._pydantic_core.ValidationError: 1 validation error for ImageContentItem E image.data E Input should be a valid string, unable to parse raw data as a unicode string [type=string_unicode, input_value=b'\xff\xd8\xff\xe0\x00\x1...0\xe6\x9f5\xb5?\xff\xd9', input_type=bytes] E For further information visit https://errors.pydantic.dev/2.10/v/string_unicode ``` Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan Execute the following: ``` ollama run llama3.2-vision --keepalive 2m & uv run pytest -v -s -k "ollama" --inference-model=llama3.2-vision:latest llama_stack/providers/tests/inference/test_vision_inference.py llama_stack/providers/tests/inference/test_vision_inference.py::TestVisionModelInference::test_vision_chat_completion_non_streaming[-ollama-image0-expected_strings0] PASSED llama_stack/providers/tests/inference/test_vision_inference.py::TestVisionModelInference::test_vision_chat_completion_non_streaming[-ollama-image1-expected_strings1] FAILED llama_stack/providers/tests/inference/test_vision_inference.py::TestVisionModelInference::test_vision_chat_completion_streaming[-ollama] FAILED ``` The last two tests are failing because Cloudflare blocked me from accessing https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg but this has no impact on the current fix. [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) Signed-off-by: Sébastien Han --- .../providers/tests/inference/test_vision_inference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_stack/providers/tests/inference/test_vision_inference.py b/llama_stack/providers/tests/inference/test_vision_inference.py index 964f709016..a2434ac417 100644 --- a/llama_stack/providers/tests/inference/test_vision_inference.py +++ b/llama_stack/providers/tests/inference/test_vision_inference.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
+import base64 from pathlib import Path import pytest -from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem, URL - +from llama_stack.apis.common.content_types import URL, ImageContentItem, TextContentItem from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseEventType, @@ -23,7 +23,7 @@ THIS_DIR = Path(__file__).parent with open(THIS_DIR / "pasta.jpeg", "rb") as f: - PASTA_IMAGE = f.read() + PASTA_IMAGE = base64.b64encode(f.read()).decode("utf-8") class TestVisionModelInference: From 42e47d541f6143cfc7fc459a1176237160846861 Mon Sep 17 00:00:00 2001 From: Charlie Doern Date: Fri, 7 Feb 2025 12:47:02 -0500 Subject: [PATCH 16/34] fix: Ensure a better error stack trace when llama-stack is not built (#950) # What does this PR do? currently this is the output when you run a distribution locally without running `llama stack build`: ``` Traceback (most recent call last): File "/Users/charliedoern/Documents/llama-sdk.py", line 25, in models = client.models.list() ^^^^^^^^^^^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack-client-python/src/llama_stack_client/resources/models.py", line 107, in list raise exc File "/Users/charliedoern/Documents/llama-stack-client-python/src/llama_stack_client/resources/models.py", line 95, in list return self._get( ^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack-client-python/src/llama_stack_client/_base_client.py", line 1212, in get return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack/llama_stack/distribution/library_client.py", line 168, in request return asyncio.run(self.async_client.request(*args, **kwargs)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Cellar/python@3.11/3.11.10/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/runners.py", line 190, in run return runner.run(main) ^^^^^^^^^^^^^^^^ File "/opt/homebrew/Cellar/python@3.11/3.11.10/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/runners.py", line 118, in run return self._loop.run_until_complete(task) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/Cellar/python@3.11/3.11.10/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py", line 654, in run_until_complete return future.result() ^^^^^^^^^^^^^^^ File "/Users/charliedoern/Documents/llama-stack/llama_stack/distribution/library_client.py", line 258, in request if not self.endpoint_impls: ^^^^^^^^^^^^^^^^^^^ AttributeError: 'AsyncLlamaStackAsLibraryClient' object has no attribute 'endpoint_impls' ``` the intended exception is never raised, add an except for an AttributeError so users can catch when they call things like `models.list()` and so that a more useful error telling them that the client is not properly initialized is printed. ## Test Plan Please describe: - I ran the script found here: https://llama-stack.readthedocs.io/en/latest/getting_started/index.html#run-inference-with-python-sdk locally with the changes in this PR and the exception was caught successfully. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. 
- [ ] Wrote necessary unit or integration tests. --------- Signed-off-by: Charlie Doern Co-authored-by: Ashwin Bharambe --- llama_stack/distribution/library_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 13aa679563..d4a7cde7e9 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -198,6 +198,7 @@ def __init__( async def initialize(self) -> bool: try: + self.endpoint_impls = None self.impls = await construct_stack(self.config, self.custom_provider_registry) except ModuleNotFoundError as _e: cprint(_e.msg, "red") From 5de7cff03a23d592f7d764f886ca05632a8b724e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 7 Feb 2025 18:52:16 +0100 Subject: [PATCH 17/34] refactor(ollama): model availability check (#986) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? Moved model availability check logic into a dedicated check_model_availability function. Eliminated redundant code by reusing the helper function in both embedding and non-embedding model registration. Signed-off-by: Sébastien Han ## Test Plan Run Ollama and serve 2 models to get most the unit test pass: ``` ollama run llama3.2:3b-instruct-fp16 --keepalive 2m & ollama run llama3.1:8b --keepalive 2m & ``` Run the unit test: ``` uv run pytest -v -k "ollama" --inference-model=llama3.2:3b-instruct-fp16 llama_stack/providers/tests/inference/test_model_registration.py /Users/leseb/Documents/AI/llama-stack/.venv/lib/python3.13/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset. The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. 
Valid fixture loop scopes are: "function", "class", "module", "package", "session" warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET)) ============================================ test session starts ============================================= platform darwin -- Python 3.13.1, pytest-8.3.4, pluggy-1.5.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.13.1', 'Platform': 'macOS-15.3-arm64-arm-64bit-Mach-O', 'Packages': {'pytest': '8.3.4', 'pluggy': '1.5.0'}, 'Plugins': {'html': '4.1.1', 'metadata': '3.1.1', 'asyncio': '0.25.3', 'anyio': '4.8.0', 'nbval': '0.11.0'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: html-4.1.1, metadata-3.1.1, asyncio-0.25.3, anyio-4.8.0, nbval-0.11.0 asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None collected 65 items / 60 deselected / 5 selected llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_unsupported_model[-ollama] PASSED [ 20%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_nonexistent_model[-ollama] PASSED [ 40%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_with_llama_model[-ollama] FAILED [ 60%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_initialize_model_during_registering[-ollama] FAILED [ 80%] llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_with_invalid_llama_model[-ollama] PASSED [100%] ================================================== FAILURES ================================================== _______________________ TestModelRegistration.test_register_with_llama_model[-ollama] ________________________ llama_stack/providers/tests/inference/test_model_registration.py:54: in test_register_with_llama_model _ = await models_impl.register_model( llama_stack/providers/utils/telemetry/trace_protocol.py:91: in async_wrapper result = await method(self, *args, **kwargs) llama_stack/distribution/routers/routing_tables.py:245: in register_model registered_model = await self.register_object(model) llama_stack/distribution/routers/routing_tables.py:192: in register_object registered_obj = await register_object_with_provider(obj, p) llama_stack/distribution/routers/routing_tables.py:53: in register_object_with_provider return await p.register_model(obj) llama_stack/providers/utils/telemetry/trace_protocol.py:91: in async_wrapper result = await method(self, *args, **kwargs) llama_stack/providers/remote/inference/ollama/ollama.py:368: in register_model await check_model_availability(model.provider_resource_id) llama_stack/providers/remote/inference/ollama/ollama.py:359: in check_model_availability raise ValueError( E ValueError: Model 'custom-model' is not available in Ollama. Available models: llama3.1:8b, llama3.2:3b-instruct-fp16 __________________ TestModelRegistration.test_initialize_model_during_registering[-ollama] ___________________ llama_stack/providers/tests/inference/test_model_registration.py:85: in test_initialize_model_during_registering mock_load_model.assert_called_once() /opt/homebrew/Cellar/python@3.13/3.13.1/Frameworks/Python.framework/Versions/3.13/lib/python3.13/unittest/mock.py:956: in assert_called_once raise AssertionError(msg) E AssertionError: Expected 'load_model' to have been called once. Called 0 times. 
-------------------------------------------- Captured stderr call -------------------------------------------- W0207 11:55:26.777000 90854 .venv/lib/python3.13/site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs. ========================================== short test summary info =========================================== FAILED llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_register_with_llama_model[-ollama] - ValueError: Model 'custom-model' is not available in Ollama. Available models: llama3.1:8b, llama3.2:3b-i... FAILED llama_stack/providers/tests/inference/test_model_registration.py::TestModelRegistration::test_initialize_model_during_registering[-ollama] - AssertionError: Expected 'load_model' to have been called once. Called 0 times. =========================== 2 failed, 3 passed, 60 deselected, 2 warnings in 1.84s =========================== ``` We only "care" about the `test_register_nonexistent_model` for this code. ## Sources Please link relevant resources if necessary. ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Ran pre-commit to handle lint / formatting issues. - [ ] Read the [contributor guideline](https://github.com/meta-llama/llama-stack/blob/main/CONTRIBUTING.md), Pull Request section? - [ ] Updated relevant documentation. - [ ] Wrote necessary unit or integration tests. Signed-off-by: Sébastien Han --- .../remote/inference/ollama/ollama.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index cff8aa7422..ecd1958541 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -352,24 +352,20 @@ async def embeddings( return EmbeddingsResponse(embeddings=embeddings) async def register_model(self, model: Model) -> Model: - # ollama does not have embedding models running. Check if the model is in list of available models. - if model.model_type == ModelType.embedding: - response = await self.client.list() + async def check_model_availability(model_id: str): + response = await self.client.ps() available_models = [m["model"] for m in response["models"]] - if model.provider_resource_id not in available_models: + if model_id not in available_models: raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. " - f"Available models: {', '.join(available_models)}" + f"Model '{model_id}' is not available in Ollama. Available models: {', '.join(available_models)}" ) + + if model.model_type == ModelType.embedding: + await check_model_availability(model.provider_resource_id) return model + model = await self.register_helper.register_model(model) - models = await self.client.ps() - available_models = [m["model"] for m in models["models"]] - if model.provider_resource_id not in available_models: - raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. 
" - f"Available models: {', '.join(available_models)}" - ) + await check_model_availability(model.provider_resource_id) return model From 08aa6033a6e5aaf8832d9a2613632ce54541e01c Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 09:55:48 -0800 Subject: [PATCH 18/34] Nuke use_proxy from code execution --- .../inline/tool_runtime/code_interpreter/code_execution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index b48f92d361..6f4b25b9d3 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -67,7 +67,6 @@ def generate_bwrap_command(bind_dirs: List[str]) -> str: @dataclass class CodeExecutionContext: matplotlib_dump_dir: str - use_proxy: bool = False @dataclass From 17b9e1665915f34f741e291d800ea2189662e2f0 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 11:36:29 -0800 Subject: [PATCH 19/34] Minor clean up of notebook --- docs/getting_started.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 4e48931580..96e39eb827 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -86,7 +86,6 @@ "# NBVAL_SKIP\n", "\n", "!apt-get install -y bubblewrap\n", - "# install a branch of llama stack\n", "import os\n", "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n", "!pip install uv\n", From 6d6a09e221f9929dbf0cee30613629ea40ec9322 Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Fri, 7 Feb 2025 11:56:22 -0800 Subject: [PATCH 20/34] No spaces in ipynb tests --- docs/conftest.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/conftest.py diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 0000000000..bec535f77d --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ +def pytest_collection_modifyitems(items): + for item in items: + item.name = item.name.replace(' ', '_') From 4922f8c7f4bbe8caaf4664cb446e2de02a24319a Mon Sep 17 00:00:00 2001 From: Hardik Shah Date: Fri, 7 Feb 2025 12:24:07 -0800 Subject: [PATCH 21/34] raise when client initialize fails --- llama_stack/distribution/library_client.py | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index d4a7cde7e9..2c0f739744 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -17,17 +17,6 @@ import httpx import yaml -from llama_stack_client import ( - APIResponse, - AsyncAPIResponse, - AsyncLlamaStackClient, - AsyncStream, - LlamaStackClient, - NOT_GIVEN, -) -from pydantic import BaseModel, TypeAdapter -from rich.console import Console -from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config @@ -46,6 +35,17 @@ setup_logger, start_trace, ) +from llama_stack_client import ( + APIResponse, + AsyncAPIResponse, + AsyncLlamaStackClient, + AsyncStream, + LlamaStackClient, + NOT_GIVEN, +) +from pydantic import BaseModel, TypeAdapter +from rich.console import Console +from termcolor import cprint T = TypeVar("T") @@ -214,7 +214,7 @@ async def initialize(self) -> bool: f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", "yellow", ) - return False + raise _e if Api.telemetry in self.impls: setup_logger(self.impls[Api.telemetry]) From aff96cd94ced08e29dd7075f34e815e7fdb0281f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 7 Feb 2025 21:52:50 +0000 Subject: [PATCH 22/34] Bump version to 0.1.2 --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4020247729..5e9cb75e25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.1" +version = "0.1.2" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ -25,8 +25,8 @@ dependencies = [ "fire", "httpx", "huggingface-hub", - "llama-models>=0.1.1", - "llama-stack-client>=0.1.1", + "llama-models>=0.1.2", + "llama-stack-client>=0.1.2", "prompt-toolkit", "python-dotenv", "pydantic>=2", From bf2e5b1deef1b1f5a6d10feeb5e1cd211e894203 Mon Sep 17 00:00:00 2001 From: Jeff Tang Date: Fri, 7 Feb 2025 15:36:15 -0800 Subject: [PATCH 23/34] Getting started notebook update (#936) # What does this PR do? Added examples (Section 4) of using Llama Stack 0.1 distro on together and Llama 3.2 to answer questions about an image with LS Chat and Agent APIs. --- docs/getting_started.ipynb | 225 +++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 96e39eb827..abe537c8e1 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -3396,6 +3396,231 @@ "response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n", "pprint(response)\n" ] + }, + { + "cell_type": "markdown", + "id": "ad077440", + "metadata": {}, + "source": [ + "## 4. 
Image Understanding with Llama 3.2\n", + "\n", + "Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image." + ] + }, + { + "cell_type": "markdown", + "id": "82e381ec", + "metadata": {}, + "source": [ + "### 4.1 Setup and helpers\n", + "\n", + "Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "865fc5a8", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-stack-client==0.1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44e05e16", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469750f7", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def display_image(path):\n", + " img = Image.open(path)\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()\n", + "\n", + "display_image(\"Llama_Repo.jpeg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c1e1c2", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "\n", + "def encode_image(image_path):\n", + " with open(image_path, \"rb\") as image_file:\n", + " base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + " base64_url = f\"data:image/png;base64,{base64_string}\"\n", + " return base64_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c565f99e", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n", + "LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\"" + ] + }, + { + "cell_type": "markdown", + "id": "7737cd41", + "metadata": {}, + "source": [ + "### 4.2 Using Llama Stack Chat API\n", + "\n", + "The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7914894", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.inference.event_logger import EventLogger\n", + "\n", + "async def run_main(image_path: str, prompt):\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }\n", + "\n", + " response = client.inference.chat_completion(\n", + " messages=[message],\n", + " model_id=LLAMA32_11B_INSTRUCT,\n", + " stream=False,\n", + " )\n", + "\n", + " print(response.completion_message.content.lower().strip())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ee09b97", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] + }, + { + "cell_type": "markdown", + "id": "e741d7b9", + "metadata": {}, + "source": [ + "### 
4.3 Using Llama Stack Agent API\n", + "\n", + "The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9a83275", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.agents.agent import Agent\n", + "from llama_stack_client.lib.agents.event_logger import EventLogger\n", + "from llama_stack_client.types.agent_create_params import AgentConfig\n", + "\n", + "async def run_main(image_path, prompt):\n", + " base64_image = encode_image(image_path)\n", + "\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " agent_config = AgentConfig(\n", + " model=LLAMA32_11B_INSTRUCT,\n", + " instructions=\"You are a helpful assistant\",\n", + " enable_session_persistence=False,\n", + " )\n", + "\n", + " agent = Agent(client, agent_config)\n", + " session_id = agent.create_session(\"test-session\")\n", + "\n", + " response = agent.create_turn(\n", + " messages=[{\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }],\n", + " session_id=session_id,\n", + " )\n", + "\n", + " for log in EventLogger().log(response):\n", + " log.print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15d0098b", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] } ], "metadata": { From 3965464625ccee1e27fc6154affae2c6a0bf45ed Mon Sep 17 00:00:00 2001 From: raghotham Date: Fri, 7 Feb 2025 15:36:20 -0800 Subject: [PATCH 24/34] docs: update index.md for 0.1.2 (#1013) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. *Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --- docs/source/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.md b/docs/source/index.md index 095f50885c..2834f56419 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -2,7 +2,7 @@ ```{admonition} News :class: tip -Llama Stack 0.1.1 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.1) for more details. +Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details. ``` # Llama Stack From 6eb0c409fbca9abf43588bf8a1f01c5cb15d6d9e Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sat, 8 Feb 2025 14:49:46 -0500 Subject: [PATCH 25/34] test: Make text-based chat completion tests run 10x faster (#1016) # What does this PR do? This significantly shortens the test time (about 10x faster) since most of the time is spent on outputing the tokens "there are several planets in our solar system that have...". We want to have an answer quicker, especially when testing even larger models. 
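The fix is purely in the test parametrization: pick questions whose expected answer is a single short word, so the model stops generating almost immediately. A minimal sketch of the pattern is below; the parametrized questions and the test signature mirror the diff in this patch, while the client call and assertion body are illustrative assumptions rather than the file's exact implementation.

```python
import pytest


@pytest.mark.parametrize(
    "question,expected",
    [
        # Short, unambiguous questions yield one-word answers, so generation ends quickly.
        ("Which planet do humans live on?", "Earth"),
        ("Which planet has rings around it with a name starting with letter S?", "Saturn"),
    ],
)
def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected):
    # Sketch of the test body: a single non-streaming chat completion per question.
    response = llama_stack_client.inference.chat_completion(
        model_id=text_model_id,
        messages=[{"role": "user", "content": question}],
        stream=False,
    )
    # The check only needs the keyword, so a short completion is sufficient.
    assert expected.lower() in response.completion_message.content.lower()
```
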
## Test Plan ``` LLAMA_STACK_BASE_URL=http://localhost:5002 pytest -v tests/client-sdk/inference/test_text_inference.py -k "test_text_chat_completion_non_streaming or test_text_chat_completion_streaming" ================================================================== test session starts =================================================================== platform linux -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0 -- /home/yutang/.conda/envs/myenv/bin/python3.10 cachedir: .pytest_cache rootdir: /home/yutang/repos/llama-stack configfile: pyproject.toml plugins: anyio-4.7.0 collected 12 items / 8 deselected / 4 selected tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_non_streaming[meta-llama/Llama-3.1-8B-Instruct-Which planet do humans live on?-Earth] PASSED [ 25%] tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_non_streaming[meta-llama/Llama-3.1-8B-Instruct-Which planet has rings around it with a name starting with letter S?-Saturn] PASSED [ 50%] tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_streaming[meta-llama/Llama-3.1-8B-Instruct-What's the name of the Sun in latin?-Sol] PASSED [ 75%] tests/client-sdk/inference/test_text_inference.py::test_text_chat_completion_streaming[meta-llama/Llama-3.1-8B-Instruct-What is the name of the US captial?-Washington] PASSED [100%] ``` --------- Signed-off-by: Yuan Tang --- tests/client-sdk/inference/test_text_inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py index 4b24f1d389..aa0e510dd6 100644 --- a/tests/client-sdk/inference/test_text_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -156,8 +156,8 @@ class AnswerFormat(BaseModel): @pytest.mark.parametrize( "question,expected", [ - ("What are the names of planets in our solar system?", "Earth"), - ("What are the names of the planets that have rings around them?", "Saturn"), + ("Which planet do humans live on?", "Earth"), + ("Which planet has rings around it with a name starting with letter S?", "Saturn"), ], ) def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected): From a869902ee7b8e86a8d071f81055342b575721001 Mon Sep 17 00:00:00 2001 From: Sarthak Deshpande <60317842+cheesecake100201@users.noreply.github.com> Date: Sun, 9 Feb 2025 01:20:35 +0530 Subject: [PATCH 26/34] chore: Updated requirements.txt (#1017) # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Updated requirements.txt [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --------- Co-authored-by: sarthakdeshpande --- requirements.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 157c688204..497feb7649 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ annotated-types==0.7.0 anyio==4.8.0 blobfile==3.0.0 certifi==2025.1.31 +chardet==5.2.0 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 ; sys_platform == 'win32' @@ -18,8 +19,8 @@ httpx==0.28.1 huggingface-hub==0.28.1 idna==3.10 jinja2==3.1.5 -llama-models==0.1.1 -llama-stack-client==0.1.1 +llama-models==0.1.2 +llama-stack-client==0.1.2 lxml==5.3.0 markdown-it-py==3.0.0 markupsafe==3.0.2 @@ -34,6 +35,7 @@ pycryptodomex==3.21.0 pydantic==2.10.6 pydantic-core==2.27.2 pygments==2.19.1 +pypdf==5.2.0 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 pytz==2025.1 From d066798fdf6ee802788084901375258b681fbd01 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sat, 8 Feb 2025 23:42:57 -0500 Subject: [PATCH 27/34] test: Use JSON tool prompt format for remote::vllm provider (#1019) # What does this PR do? This PR removes the warnings when running tests for `remote::vllm` provider: ``` Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. ``` ## Test Plan All tests passed without the warning messages shown above. Signed-off-by: Yuan Tang --- tests/client-sdk/inference/test_text_inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/client-sdk/inference/test_text_inference.py b/tests/client-sdk/inference/test_text_inference.py index aa0e510dd6..81b4762188 100644 --- a/tests/client-sdk/inference/test_text_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -11,6 +11,7 @@ "remote::ollama": "json", "remote::together": "json", "remote::fireworks": "json", + "remote::vllm": "json", } PROVIDER_LOGPROBS_TOP_K = set( From 626c9ff0402ffe2c114d51eec7a73dba09349184 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sun, 9 Feb 2025 22:26:36 -0500 Subject: [PATCH 28/34] docs: Render check marks correctly on PyPI (#1024) # What does this PR do? The table on the project's PyPI page does not render check marks. This PR switches to use the unicode symbol directly that can be rendered correctly on PyPI. 
Before: ![image](https://github.com/user-attachments/assets/6d01d440-8722-4c37-8b0a-9ba8c0cdb48d) After: ![image](https://github.com/user-attachments/assets/3a7153f2-9468-40f6-97a2-17f903de4287) Signed-off-by: Yuan Tang --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index cdf98dc128..a5e5b217da 100644 --- a/README.md +++ b/README.md @@ -34,22 +34,22 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on ### API Providers Here is a list of the various API providers and available distributions to developers started easily, -| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | -|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:| -| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| SambaNova | Hosted | | :heavy_check_mark: | | | | -| Cerebras | Hosted | | :heavy_check_mark: | | | | -| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | -| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | -| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | | -| Groq | Hosted | | :heavy_check_mark: | | | | -| Ollama | Single Node | | :heavy_check_mark: | | | | -| TGI | Hosted and Single Node | | :heavy_check_mark: | | | | -| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | | -| Chroma | Single Node | | | :heavy_check_mark: | | | -| PG Vector | Single Node | | | :heavy_check_mark: | | | -| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | | -| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | | +| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | +|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:| +| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | +| SambaNova | Hosted | | ✅ | | | | +| Cerebras | Hosted | | ✅ | | | | +| Fireworks | Hosted | ✅ | ✅ | ✅ | | | +| AWS Bedrock | Hosted | | ✅ | | ✅ | | +| Together | Hosted | ✅ | ✅ | | ✅ | | +| Groq | Hosted | | ✅ | | | | +| Ollama | Single Node | | ✅ | | | | +| TGI | Hosted and Single Node | | ✅ | | | | +| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | +| Chroma | Single Node | | | ✅ | | | +| PG Vector | Single Node | | | ✅ | | | +| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | +| vLLM | Hosted and Single Node | | ✅ | | | | ### Distributions From 5faff945fe4561b16c51f718533542423ef63959 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Mon, 10 Feb 2025 09:25:30 -0500 Subject: [PATCH 29/34] docs: update rag.md example code to prevent errors (#1009) --- docs/source/building_applications/rag.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index 6b7a354b7c..5287a2367c 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -36,13 +36,12 @@ chunks = [ "content": "Your document text here", "mime_type": "text/plain", }, - ..., ] 
-client.vector_io.insert(vector_db_id, chunks) +client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks) # You can then query for these chunks chunks_response = client.vector_io.query( - vector_db_id, query="What do you know about..." + vector_db_id=vector_db_id, query="What do you know about..." ) ``` @@ -72,8 +71,8 @@ client.tool_runtime.rag_tool.insert( # Query documents results = client.tool_runtime.rag_tool.query( - vector_db_id=vector_db_id, - query="What do you know about...", + vector_db_ids=[vector_db_id], + content="What do you know about...", ) ``` @@ -82,10 +81,14 @@ results = client.tool_runtime.rag_tool.query( One of the most powerful patterns is combining agents with RAG capabilities. Here's a complete example: ```python +from llama_stack_client.types.agent_create_params import AgentConfig +from llama_stack_client.lib.agents.agent import Agent + # Configure agent with memory agent_config = AgentConfig( - model="Llama3.2-3B-Instruct", + model="meta-llama/Llama-3.2-3B-Instruct", instructions="You are a helpful assistant", + enable_session_persistence=False, toolgroups=[ { "name": "builtin::rag", @@ -105,10 +108,10 @@ response = agent.create_turn( {"role": "user", "content": "I am providing some documents for reference."} ], documents=[ - dict( - content="https://raw.githubusercontent.com/example/doc.rst", - mime_type="text/plain", - ) + { + "content": "https://raw.githubusercontent.com/example/doc.rst", + "mime_type": "text/plain", + } ], session_id=session_id, ) From 9bad0a3836e27227a09dda0d668c42f5aa6fa789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Mon, 10 Feb 2025 17:42:30 +0100 Subject: [PATCH 30/34] build: update uv lock to sync package versions (#1026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # What does this PR do? [Provide a short summary of what this PR does and why. Link to relevant issues if applicable.] Updated `uv.lock` to reflect the latest versions of `llama-models`, `llama-stack`, and `llama-stack-client` (bumped to 0.1.2). This ensures dependency consistency and avoids potential issues with outdated package references. Added `uv-sync` hook from `uv-pre-commit` repository to ensure synchronization of dependencies. Signed-off-by: Sébastien Han [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) Signed-off-by: Sébastien Han --- .github/workflows/pre-commit.yml | 4 ++++ .pre-commit-config.yaml | 1 + uv.lock | 18 +++++++++--------- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index faa2eda319..046387ab92 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -23,3 +23,7 @@ jobs: .pre-commit-config.yaml - uses: pre-commit/action@v3.0.1 + + - name: Verify if there are any diff files after pre-commit + run: | + git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index adafccf64c..bca91081fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,6 +48,7 @@ repos: hooks: - id: uv-export args: ["--frozen", "--no-hashes", "--no-emit-project"] + - id: uv-sync # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.14.0 diff --git a/uv.lock b/uv.lock index f492872bcc..087396eeac 100644 --- a/uv.lock +++ b/uv.lock @@ -687,7 +687,7 @@ wheels = [ [[package]] name = "llama-models" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -696,14 +696,14 @@ dependencies = [ { name = "pyyaml" }, { name = "tiktoken" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/df/80/4a4595cf5e55f71c0e15b85ff2f4c04b0742bf664ede062a09c9d383bf7b/llama_models-0.1.1.tar.gz", hash = "sha256:7cb5a9fe38485b47aff4c93e183d6d390a676a7619f3355502576b652f17733a", size = 1608412 } +sdist = { url = "https://files.pythonhosted.org/packages/b5/f2/ed8310d4677cd38ab45ffba45aea2a4e9882b640045ad9c3198ac69e5a85/llama_models-0.1.2.tar.gz", hash = "sha256:1266eaec7a8db336e4ed034d2b494189ccb7fd6d6b7aefe874eee749a4340b9b", size = 1608069 } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/93/d49dd0f0cd37df1a7a7fb25444d010f626cdf42b21eea11d839b0f6a808a/llama_models-0.1.1-py3-none-any.whl", hash = "sha256:7e4f15dc4f6f011852ea2c42f9770b75140f5eca670b32cc67fc0a4605c55f89", size = 1650981 }, + { url = "https://files.pythonhosted.org/packages/55/a7/34b9e88ef4109759c8881f43b8006139e3d13d54c440b8c571b253655f54/llama_models-0.1.2-py3-none-any.whl", hash = "sha256:8aa5287d1c6325698991ff677e71148cac347e07493bb5b3ab891e614b89e1f8", size = 1651273 }, ] [[package]] name = "llama-stack" -version = "0.1.1" +version = "0.1.2" source = { editable = "." 
} dependencies = [ { name = "blobfile" }, @@ -751,8 +751,8 @@ requires-dist = [ { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, - { name = "llama-models", specifier = ">=0.1.1" }, - { name = "llama-stack-client", specifier = ">=0.1.1" }, + { name = "llama-models", specifier = ">=0.1.2" }, + { name = "llama-stack-client", specifier = ">=0.1.2" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, { name = "pre-commit", marker = "extra == 'dev'" }, @@ -780,7 +780,7 @@ requires-dist = [ [[package]] name = "llama-stack-client" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -797,9 +797,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/07/42/7004958ac1a6da9a8060decf0d9120fdeb3b2775de090a0a473f2ee4a27d/llama_stack_client-0.1.1.tar.gz", hash = "sha256:3e549a848ade959d342fa52ec49b1913b7bb615a77b5b8dcaefe6ff94409049e", size = 179729 } +sdist = { url = "https://files.pythonhosted.org/packages/9e/75/8b41a3026c871a8650cd8d2cfda9f891a9163458813574f36518bb40afe4/llama_stack_client-0.1.2.tar.gz", hash = "sha256:94277ddae52be557d771dcdc15d85af9012b5aa87439dd69ec1dc0ff486b0c8e", size = 188023 } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/66/5255c09dc001ff437fd6fe6fad27142035b60073df243f7df0494095f605/llama_stack_client-0.1.1-py3-none-any.whl", hash = "sha256:e07d58fdcc1eaa370dd00b94c2dd1a8169c0ac60c37f6f2772cbc2c5b63f2e62", size = 348665 }, + { url = "https://files.pythonhosted.org/packages/c4/32/3a3a97eecff1f1e3a1dc90e9b00681abea11ec4f43a7ca549981261e18b6/llama_stack_client-0.1.2-py3-none-any.whl", hash = "sha256:85ff0fb57a62d7d0470cfaa2b07a595c9fb3483297944d5e5a066db850d38ccd", size = 359415 }, ] [[package]] From 44529204699849b725da61c6df50938c2f643a1e Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Mon, 10 Feb 2025 13:24:15 -0800 Subject: [PATCH 31/34] fix: Gaps in doc codegen (#1035) # What does this PR do? Catches docs up to source with: ``` python llama_stack/scripts/distro_codegen.py ``` [//]: # (If resolving an issue, uncomment and update the line below) [//]: # (Closes #[issue-number]) ## Test Plan [Describe the tests you ran to verify your changes with result summaries. 
*Provide clear instructions so the plan can be easily re-executed.*] Manually checked ``` sphinx-autobuild docs/source build/html ``` [//]: # (## Documentation) [//]: # (- [ ] Added a Changelog entry if the change is significant) --- distributions/dependencies.json | 76 +++++++++---------- .../self_hosted_distro/ollama.md | 4 +- llama_stack/scripts/distro_codegen.py | 2 +- llama_stack/templates/bedrock/run.yaml | 2 + llama_stack/templates/cerebras/run.yaml | 2 + .../templates/dell/run-with-safety.yaml | 2 + llama_stack/templates/dell/run.yaml | 2 + .../templates/fireworks/run-with-safety.yaml | 2 + llama_stack/templates/fireworks/run.yaml | 2 + .../hf-endpoint/run-with-safety.yaml | 2 + llama_stack/templates/hf-endpoint/run.yaml | 2 + .../hf-serverless/run-with-safety.yaml | 2 + llama_stack/templates/hf-serverless/run.yaml | 2 + .../meta-reference-gpu/run-with-safety.yaml | 2 + .../templates/meta-reference-gpu/run.yaml | 2 + .../meta-reference-quantized-gpu/run.yaml | 2 + llama_stack/templates/nvidia/run.yaml | 2 + .../templates/ollama/run-with-safety.yaml | 2 + llama_stack/templates/ollama/run.yaml | 2 + .../remote-vllm/run-with-safety.yaml | 2 + llama_stack/templates/remote-vllm/run.yaml | 2 + llama_stack/templates/sambanova/run.yaml | 2 + .../templates/tgi/run-with-safety.yaml | 2 + llama_stack/templates/tgi/run.yaml | 2 + .../templates/together/run-with-safety.yaml | 2 + llama_stack/templates/together/run.yaml | 2 + llama_stack/templates/vllm-gpu/run.yaml | 2 + 27 files changed, 88 insertions(+), 42 deletions(-) diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 029824a969..2e18730d70 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -69,7 +69,8 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "fireworks": [ + "dell": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", @@ -79,10 +80,9 @@ "faiss-cpu", "fastapi", "fire", - "fireworks-ai", "httpx", + "huggingface_hub", "matplotlib", - "mcp", "nltk", "numpy", "openai", @@ -103,8 +103,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "hf-endpoint": [ - "aiohttp", + "fireworks": [ "aiosqlite", "autoevals", "blobfile", @@ -114,8 +113,8 @@ "faiss-cpu", "fastapi", "fire", + "fireworks-ai", "httpx", - "huggingface_hub", "matplotlib", "mcp", "nltk", @@ -138,7 +137,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "hf-serverless": [ + "hf-endpoint": [ "aiohttp", "aiosqlite", "autoevals", @@ -173,20 +172,19 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-gpu": [ - "accelerate", + "hf-serverless": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", "chardet", "chromadb-client", "datasets", - "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", - "lm-format-enforcer", + "huggingface_hub", "matplotlib", "mcp", "nltk", @@ -202,18 +200,14 @@ "requests", "scikit-learn", "scipy", - "sentence-transformers", "sentencepiece", - "torch", - "torchvision", "tqdm", "transformers", "uvicorn", - "zmq", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-quantized-gpu": [ + "meta-reference-gpu": [ "accelerate", "aiosqlite", "autoevals", @@ -224,7 +218,6 @@ "fairscale", "faiss-cpu", "fastapi", - "fbgemm-gpu", "fire", "httpx", "lm-format-enforcer", @@ -246,7 
+239,6 @@ "sentence-transformers", "sentencepiece", "torch", - "torchao==0.5.0", "torchvision", "tqdm", "transformers", @@ -255,22 +247,25 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "ollama": [ - "aiohttp", + "meta-reference-quantized-gpu": [ + "accelerate", "aiosqlite", "autoevals", "blobfile", "chardet", "chromadb-client", "datasets", + "fairscale", "faiss-cpu", "fastapi", + "fbgemm-gpu", "fire", "httpx", + "lm-format-enforcer", "matplotlib", + "mcp", "nltk", "numpy", - "ollama", "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", @@ -282,19 +277,23 @@ "requests", "scikit-learn", "scipy", + "sentence-transformers", "sentencepiece", + "torch", + "torchao==0.5.0", + "torchvision", "tqdm", "transformers", "uvicorn", + "zmq", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "remote-vllm": [ + "nvidia": [ "aiosqlite", "autoevals", "blobfile", "chardet", - "chromadb-client", "datasets", "faiss-cpu", "fastapi", @@ -322,7 +321,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "tgi": [ + "ollama": [ "aiohttp", "aiosqlite", "autoevals", @@ -334,11 +333,10 @@ "fastapi", "fire", "httpx", - "huggingface_hub", "matplotlib", - "mcp", "nltk", "numpy", + "ollama", "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", @@ -357,7 +355,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "together": [ + "remote-vllm": [ "aiosqlite", "autoevals", "blobfile", @@ -384,26 +382,22 @@ "scikit-learn", "scipy", "sentencepiece", - "together", "tqdm", "transformers", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "vllm-gpu": [ + "sambanova": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", - "datasets", "faiss-cpu", "fastapi", "fire", "httpx", "matplotlib", - "mcp", "nltk", "numpy", "openai", @@ -421,20 +415,22 @@ "tqdm", "transformers", "uvicorn", - "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "nvidia": [ + "tgi": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", "chardet", + "chromadb-client", "datasets", "faiss-cpu", "fastapi", "fire", "httpx", + "huggingface_hub", "matplotlib", "mcp", "nltk", @@ -457,16 +453,19 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "sambanova": [ + "together": [ "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + "datasets", "faiss-cpu", "fastapi", "fire", "httpx", "matplotlib", + "mcp", "nltk", "numpy", "openai", @@ -481,14 +480,14 @@ "scikit-learn", "scipy", "sentencepiece", + "together", "tqdm", "transformers", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "dell": [ - "aiohttp", + "vllm-gpu": [ "aiosqlite", "autoevals", "blobfile", @@ -499,8 +498,8 @@ "fastapi", "fire", "httpx", - "huggingface_hub", "matplotlib", + "mcp", "nltk", "numpy", "openai", @@ -518,6 +517,7 @@ "tqdm", "transformers", "uvicorn", + "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ] diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 54f6b8fdfb..73a609421b 100644 
--- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -26,9 +26,7 @@ The `llamastack/distribution-ollama` distribution consists of the following prov | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` | -You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. - -### Environment Variables +You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables The following environment variables can be configured: diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 7064d3104c..c73c15d415 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -29,7 +29,7 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]: if not templates_dir.exists(): raise FileNotFoundError(f"Templates directory not found: {templates_dir}") - return (d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") + return sorted(d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") def process_template(template_dir: Path, progress) -> None: diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39408c1bd7..be6c9a928c 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 5a70890a89..05d3f45259 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run-with-safety.yaml b/llama_stack/templates/dell/run-with-safety.yaml index bdc82d03a9..04c5957d46 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -116,3 +116,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 2ba62a7821..706444eb1b 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -107,3 +107,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index a4b425436f..0fbe14a5a5 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -172,3 +172,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index 
a497317bde..ccf67dcbb0 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -161,3 +161,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 0329f580ba..f520a2fdab 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 8163fe28e6..708cb1bcc6 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 9cee920a5b..7f0abf5be0 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c8ad0d38da..c0b7a4c605 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 0faaabb159..c5286fc6be 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6ffe1fa360..310585f23f 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 5ff87a9010..d43cf3917e 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 6dc325e9dd..c8ae362f54 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -147,3 +147,5 @@ tool_groups: provider_id: rag-runtime - 
toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 5b5c9c253a..ac5dab7552 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -121,3 +121,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3cc1cb2ac6..4852236750 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -110,3 +110,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 4a0fa9a857..1fe998a1f9 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9631f94a2f..9d3db8a31e 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 6cec51824c..39b0f3c4e8 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 503505c326..ed6c9ef6f2 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index f1953c513e..8bf76f37b1 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -113,3 +113,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index ec351108e5..2989266307 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -167,3 +167,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index 3a996915f1..142a9460ec 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -160,3 +160,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: 
diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml
index 165e4d51db..41a545e1ae 100644
--- a/llama_stack/templates/vllm-gpu/run.yaml
+++ b/llama_stack/templates/vllm-gpu/run.yaml
@@ -117,3 +117,5 @@ tool_groups:
   provider_id: rag-runtime
 - toolgroup_id: builtin::code_interpreter
   provider_id: code-interpreter
+server:
+  port: 8321
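The hunks above apply the same two-line `server` block to every template's run.yaml. A change this mechanical can also be scripted rather than hand-edited; the sketch below is only an illustration of that idea — the glob pattern, the idempotency check, and the script itself are assumptions, not part of the patch:

```
# Illustrative sketch: append a default `server` block to each template run.yaml
# that does not already define one. The port value mirrors the hunks above; the
# glob pattern and the "already configured" check are assumptions.
from pathlib import Path

SERVER_BLOCK = "server:\n  port: 8321\n"

def add_server_block(run_yaml: Path) -> bool:
    text = run_yaml.read_text()
    if "\nserver:" in text or text.startswith("server:"):
        return False  # already configured; leave the file alone
    if not text.endswith("\n"):
        text += "\n"
    run_yaml.write_text(text + SERVER_BLOCK)
    return True

if __name__ == "__main__":
    for path in sorted(Path("llama_stack/templates").glob("*/run*.yaml")):
        changed = add_server_block(path)
        print(f"{'updated' if changed else 'skipped'}: {path}")
```

Run from the repository root, this would print one line per template file, which makes it easy to confirm that only the intended run.yaml files were touched.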
From 60f7510906aa1a7af5f56f1d6dbdb6ccff506d15 Mon Sep 17 00:00:00 2001
From: Ellis Tarn
Date: Mon, 10 Feb 2025 13:35:16 -0800
Subject: [PATCH 32/34] fix: Readthedocs cannot parse comments, resulting in docs bugs (#1033)

---
 docs/source/distributions/self_hosted_distro/dell.md   | 2 +-
 .../distributions/self_hosted_distro/fireworks.md      | 2 +-
 .../self_hosted_distro/meta-reference-gpu.md           | 2 +-
 .../meta-reference-quantized-gpu.md                    | 2 +-
 .../source/distributions/self_hosted_distro/ollama.md  | 2 +-
 .../distributions/self_hosted_distro/remote-vllm.md    | 2 +-
 .../distributions/self_hosted_distro/sambanova.md      | 2 +-
 docs/source/distributions/self_hosted_distro/tgi.md    | 2 +-
 .../distributions/self_hosted_distro/together.md       | 2 +-
 llama_stack/templates/template.py                      | 11 +++++++++--
 10 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md
index be326ffa59..aef3ecf589 100644
--- a/docs/source/distributions/self_hosted_distro/dell.md
+++ b/docs/source/distributions/self_hosted_distro/dell.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Dell Distribution of Llama Stack
diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md
index 9afeb4894c..f77d9f656a 100644
--- a/docs/source/distributions/self_hosted_distro/fireworks.md
+++ b/docs/source/distributions/self_hosted_distro/fireworks.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Fireworks Distribution
 
 ```{toctree}
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
index d00d8177f5..b183757dbf 100644
--- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Meta Reference Distribution
 
 ```{toctree}
diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
index e46c2d112a..9aeb7a88b8 100644
--- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
+++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Meta Reference Quantized Distribution
 
 ```{toctree}
diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md
index 73a609421b..c015b9610e 100644
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Ollama Distribution
 
 ```{toctree}
diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md
index ff626d40d0..6c3bbd1d03 100644
--- a/docs/source/distributions/self_hosted_distro/remote-vllm.md
+++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Remote vLLM Distribution
 
 ```{toctree}
 :maxdepth: 2
diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md
index 86ef4ac581..e6ac616be4 100644
--- a/docs/source/distributions/self_hosted_distro/sambanova.md
+++ b/docs/source/distributions/self_hosted_distro/sambanova.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # SambaNova Distribution
 
 ```{toctree}
diff --git a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md
index b970ab9fe1..f4eecf2cda 100644
--- a/docs/source/distributions/self_hosted_distro/tgi.md
+++ b/docs/source/distributions/self_hosted_distro/tgi.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # TGI Distribution
diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md
index 45ae462d55..8e36c1eb0c 100644
--- a/docs/source/distributions/self_hosted_distro/together.md
+++ b/docs/source/distributions/self_hosted_distro/together.md
@@ -1,7 +1,7 @@
-<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 ---
 orphan: true
 ---
+<!-- This file was auto-generated by distro_codegen.py, please edit source -->
 
 # Together Distribution
 
 ```{toctree}
diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py
index 09efd20384..04a09741cc 100644
--- a/llama_stack/templates/template.py
+++ b/llama_stack/templates/template.py
@@ -131,8 +131,15 @@ def generate_markdown_docs(self) -> str:
             providers_str = ", ".join(f"`{p}`" for p in providers)
             providers_table += f"| {api} | {providers_str} |\n"
 
-        template = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
-        template += self.template_path.read_text()
+        template = self.template_path.read_text()
+        comment = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
+        orphantext = "---\norphan: true\n---\n"
+
+        if template.startswith(orphantext):
+            template = template.replace(orphantext, orphantext + comment)
+        else:
+            template = comment + template
+
         # Render template with rich-generated table
         env = jinja2.Environment(
             trim_blocks=True,
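The `template.py` hunk above is the heart of this patch: Sphinx/ReadTheDocs only honours the `orphan: true` front matter when it is the very first thing in the file, so the auto-generation comment has to move below it. A minimal standalone sketch of that ordering rule (the function name and sample text are illustrative, and the comment string is assumed to be the usual codegen notice, not quoted from the source):

```
# Sketch of the ordering rule introduced above: the codegen comment must come
# *after* the `orphan` front matter, otherwise the front matter is not parsed.
# Function name, sample input, and the exact comment wording are illustrative.
COMMENT = "<!-- This file was auto-generated by distro_codegen.py, please edit source -->\n"
ORPHAN = "---\norphan: true\n---\n"

def place_comment(doc: str) -> str:
    if doc.startswith(ORPHAN):
        # Keep the front matter first so the `orphan: true` flag is honoured.
        return doc.replace(ORPHAN, ORPHAN + COMMENT, 1)
    # No front matter: the comment can safely lead the file.
    return COMMENT + doc

if __name__ == "__main__":
    sample = ORPHAN + "\n# Dell Distribution of Llama Stack\n"
    assert place_comment(sample).startswith(ORPHAN + COMMENT)
    print(place_comment(sample))
```

The `else` branch keeps the previous behaviour for templates that carry no front matter at all.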
From 2f03a803c34099765d557b835abfd6c31e7d7de2 Mon Sep 17 00:00:00 2001
From: Ellis Tarn
Date: Mon, 10 Feb 2025 14:27:17 -0800
Subject: [PATCH 33/34] fix: a bad newline in ollama docs (#1036)

# What does this PR do?
Catches a bug in the previous codegen which was removing newlines.

[//]: # (If resolving an issue, uncomment and update the line below)
[//]: # (Closes #[issue-number])

## Test Plan
```
python llama_stack/scripts/distro_codegen.py
```

[//]: # (## Documentation)
[//]: # (- [ ] Added a Changelog entry if the change is significant)
---
 docs/source/distributions/self_hosted_distro/ollama.md | 4 +++-
 llama_stack/templates/ollama/doc_template.md           | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md
index c015b9610e..a3a45f9a89 100644
--- a/docs/source/distributions/self_hosted_distro/ollama.md
+++ b/docs/source/distributions/self_hosted_distro/ollama.md
@@ -26,7 +26,9 @@ The `llamastack/distribution-ollama` distribution consists of the following prov
 | vector_io | `inline::faiss`, `remote::chromadb`, `remote::pgvector` |
 
 
-You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.### Environment Variables
+You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
+
+### Environment Variables
 
 The following environment variables can be configured:
 
diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md
index eb4aadd292..29efe39c33 100644
--- a/llama_stack/templates/ollama/doc_template.md
+++ b/llama_stack/templates/ollama/doc_template.md
@@ -16,7 +16,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following
 
 You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration.
 
-{%- if run_config_env_vars %}
+{% if run_config_env_vars %}
 ### Environment Variables
 
 The following environment variables can be configured:
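The one-character change above (`{%- if %}` to `{% if %}`) is a Jinja2 whitespace-control issue: the `-` modifier strips the whitespace, including the newline, that precedes the tag, which is what glued `### Environment Variables` onto the previous sentence in the generated ollama docs. A small sketch of the difference, using the same `trim_blocks`/`lstrip_blocks` settings the generator uses (the template strings here are illustrative, not the real doc_template.md):

```
# Demonstrates why `{%- if %}` ate the newline in the generated ollama docs:
# the `-` modifier strips whitespace (including the preceding newline) before the tag.
import jinja2

env = jinja2.Environment(trim_blocks=True, lstrip_blocks=True)

eager = env.from_string("GPU acceleration.\n{%- if show %}\n### Environment Variables\n{% endif %}")
plain = env.from_string("GPU acceleration.\n{% if show %}\n### Environment Variables\n{% endif %}")

print(repr(eager.render(show=True)))  # 'GPU acceleration.### Environment Variables\n' (newline stripped)
print(repr(plain.render(show=True)))  # 'GPU acceleration.\n### Environment Variables\n'
```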
From cca703009d63e32506e7b165be672b0bba171e12 Mon Sep 17 00:00:00 2001
From: Bill Murdock
Date: Mon, 10 Feb 2025 18:08:33 -0500
Subject: [PATCH 34/34] fix: Update Qdrant support post-refactor (#1022)

# What does this PR do?

I tried running the Qdrant provider and found some bugs. See #1021 for details. @terrytangyuan wrote there:

> Please feel free to submit your changes in a PR. I fixed similar issues for pgvector provider. This might be an issue introduced from a refactoring.

So I am submitting this PR.

Closes #1021

## Test Plan

Here are the highlights for what I did to test this:

References:
- https://llama-stack.readthedocs.io/en/latest/getting_started/index.html
- https://github.com/meta-llama/llama-stack-apps/blob/main/examples/agents/rag_with_vector_db.py
- https://github.com/meta-llama/llama-stack/blob/main/docs/zero_to_hero_guide/README.md#build-configure-and-run-llama-stack

Install and run Qdrant server:

```
podman pull qdrant/qdrant
mkdir qdrant-data
podman run -p 6333:6333 -v $(pwd)/qdrant-data:/qdrant/storage qdrant/qdrant
```

Install and run Llama Stack from the venv-support PR (mainly because I didn't want to install conda):

```
brew install cmake # Should just need this once
git clone https://github.com/meta-llama/llama-models.git
gh repo clone cdoern/llama-stack
cd llama-stack
gh pr checkout 1018 # This is the checkout that introduces venv support for build/run. Otherwise you have to use conda. Eventually this will be part of main, hopefully.
uv sync --extra dev
uv pip install -e .
source .venv/bin/activate
uv pip install qdrant_client

LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack build --template ollama --image-type venv
```

```
edit llama_stack/templates/ollama/run.yaml
```

in that editor under:
```
  vector_io:
```
add:
```
  - provider_id: qdrant
    provider_type: remote::qdrant
    config: {}
```

see https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/vector_io/qdrant/config.py#L14 for config options (but I didn't need any)

```
LLAMA_STACK_DIR=$(pwd) LLAMA_MODELS_DIR=../llama-models llama stack run ollama --image-type venv \
   --port $LLAMA_STACK_PORT \
   --env INFERENCE_MODEL=$INFERENCE_MODEL \
   --env SAFETY_MODEL=$SAFETY_MODEL \
   --env OLLAMA_URL=$OLLAMA_URL
```

Then I tested it out in a notebook. Key highlights included:

```
qdrant_provider = None
for provider in client.providers.list():
    if provider.api == "vector_io" and provider.provider_id == "qdrant":
        qdrant_provider = provider
qdrant_provider
assert qdrant_provider is not None, "QDrant is not a provider. You need to edit the run yaml file you use in your `llama stack run` call"

vector_db_id = f"test-vector-db-{uuid.uuid4().hex}"
client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model="all-MiniLM-L6-v2",
    embedding_dimension=384,
    provider_id=qdrant_provider.provider_id,
)
```

Other than that, I just followed what was in https://llama-stack.readthedocs.io/en/latest/getting_started/index.html

It would be good to have automated tests for this in the future, but that would be a big undertaking.

Signed-off-by: Bill Murdock
---
 llama_stack/providers/remote/vector_io/qdrant/__init__.py | 4 ++--
 llama_stack/providers/remote/vector_io/qdrant/qdrant.py   | 5 ++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llama_stack/providers/remote/vector_io/qdrant/__init__.py b/llama_stack/providers/remote/vector_io/qdrant/__init__.py
index 54605fcf91..c584e29efe 100644
--- a/llama_stack/providers/remote/vector_io/qdrant/__init__.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/__init__.py
@@ -12,8 +12,8 @@
 
 
 async def get_adapter_impl(config: QdrantConfig, deps: Dict[Api, ProviderSpec]):
-    from .qdrant import QdrantVectorMemoryAdapter
+    from .qdrant import QdrantVectorDBAdapter
 
-    impl = QdrantVectorMemoryAdapter(config, deps[Api.inference])
+    impl = QdrantVectorDBAdapter(config, deps[Api.inference])
     await impl.initialize()
     return impl
diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
index 719070528d..e7ad136eba 100644
--- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
+++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py
@@ -55,7 +55,7 @@ async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray):
 
         points = []
         for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            chunk_id = f"{chunk.document_id}:chunk-{i}"
+            chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}"
             points.append(
                 PointStruct(
                     id=convert_id(chunk_id),
@@ -93,6 +93,9 @@ async def query(self, embedding: NDArray, k: int, score_threshold: float) -> Que
 
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
+    async def delete(self):
+        await self.client.delete_collection(collection_name=self.collection_name)
+
 
 class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate):
     def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None:
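For context on the `chunk_id` change above: after the refactor the document ID lives in `chunk.metadata["document_id"]` rather than on the chunk object itself, and Qdrant point IDs must be UUIDs or unsigned integers, which is why the ID string is converted before being upserted. A minimal sketch of that flow follows — the `uuid5`-based conversion and the `Chunk` stand-in are assumptions for illustration only; the adapter uses its own `convert_id()` helper:

```
# Sketch of the ID handling the fix above relies on: chunk IDs are derived from
# chunk.metadata["document_id"], then converted into a form Qdrant accepts as a
# point ID (UUID or unsigned integer). The uuid5 scheme below is illustrative.
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class Chunk:  # minimal stand-in for the llama-stack Chunk type
    content: str
    metadata: Dict[str, Any] = field(default_factory=dict)

def chunk_point_id(chunk: Chunk, index: int) -> str:
    chunk_id = f"{chunk.metadata['document_id']}:chunk-{index}"
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk_id))

chunks = [Chunk(content="hello qdrant", metadata={"document_id": "doc-1"})]
for i, chunk in enumerate(chunks):
    print(chunk_point_id(chunk, i))  # stable ID for the same document and chunk index
```

Deriving the ID deterministically from the document ID and chunk index is the usual reason for hashing an external identifier into a stable UUID: re-inserting the same chunk maps to the same point.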