diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index faa2eda319..046387ab92 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -23,3 +23,7 @@ jobs: .pre-commit-config.yaml - uses: pre-commit/action@v3.0.1 + + - name: Verify if there are any diff files after pre-commit + run: | + git diff --exit-code || (echo "There are uncommitted changes, run pre-commit locally and commit again" && exit 1) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ff13a4cb02..cfc26000bb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,7 +54,7 @@ jobs: echo "REPORT_FILE=${REPORT_OUTPUT}" >> "$GITHUB_ENV" export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct - LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/test_inference.py --md-report-output "$REPORT_OUTPUT" + LLAMA_STACK_CONFIG=./llama_stack/templates/${{ matrix.provider }}/run.yaml pytest --md-report --md-report-verbose=1 ./tests/client-sdk/inference/ --md-report-output "$REPORT_OUTPUT" - name: Output reports to the job summary if: always() diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index adafccf64c..bca91081fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,6 +48,7 @@ repos: hooks: - id: uv-export args: ["--frozen", "--no-hashes", "--no-emit-project"] + - id: uv-sync # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.14.0 diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 04cd097774..0000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,44 +0,0 @@ -# Changelog - -## 0.2.0 - -### Added - -### Changed - -### Removed - - -## 0.0.53 - -### Added -- Resource-oriented design for models, shields, memory banks, datasets and eval tasks -- Persistence for registered objects with distribution -- Ability to persist memory banks created for FAISS -- PostgreSQL KVStore implementation -- Environment variable placeholder support in run.yaml files -- Comprehensive Zero-to-Hero notebooks and quickstart guides -- Support for quantized models in Ollama -- Vision models support for Together, Fireworks, Meta-Reference, and Ollama, and vLLM -- Bedrock distribution with safety shields support -- Evals API with task registration and scoring functions -- MMLU and SimpleQA benchmark scoring functions -- Huggingface dataset provider integration for benchmarks -- Support for custom dataset registration from local paths -- Benchmark evaluation CLI tools with visualization tables -- RAG evaluation scoring functions and metrics -- Local persistence for datasets and eval tasks - -### Changed -- Split safety into distinct providers (llama-guard, prompt-guard, code-scanner) -- Changed provider naming convention (`impls` → `inline`, `adapters` → `remote`) -- Updated API signatures for dataset and eval task registration -- Restructured folder organization for providers -- Enhanced Docker build configuration -- Added version prefixing for REST API routes -- Enhanced evaluation task registration workflow -- Improved benchmark evaluation output formatting -- Restructured evals folder organization for better modularity - -### Removed -- `llama stack configure` command diff --git a/README.md b/README.md index cdf98dc128..a5e5b217da 100644 --- a/README.md +++ b/README.md @@ -34,22 +34,22 @@ By reducing friction and complexity, Llama Stack empowers developers to focus on ### API Providers Here is a list 
of the various API providers and available distributions to developers started easily, -| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | -|:------------------------------------------------------------------------------------------:|:----------------------:|:------------------:|:------------------:|:------------------:|:------------------:|:------------------:| -| Meta Reference | Single Node | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| SambaNova | Hosted | | :heavy_check_mark: | | | | -| Cerebras | Hosted | | :heavy_check_mark: | | | | -| Fireworks | Hosted | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | | | -| AWS Bedrock | Hosted | | :heavy_check_mark: | | :heavy_check_mark: | | -| Together | Hosted | :heavy_check_mark: | :heavy_check_mark: | | :heavy_check_mark: | | -| Groq | Hosted | | :heavy_check_mark: | | | | -| Ollama | Single Node | | :heavy_check_mark: | | | | -| TGI | Hosted and Single Node | | :heavy_check_mark: | | | | -| NVIDIA NIM | Hosted and Single Node | | :heavy_check_mark: | | | | -| Chroma | Single Node | | | :heavy_check_mark: | | | -| PG Vector | Single Node | | | :heavy_check_mark: | | | -| PyTorch ExecuTorch | On-device iOS | :heavy_check_mark: | :heavy_check_mark: | | | | -| vLLM | Hosted and Single Node | | :heavy_check_mark: | | | | +| **API Provider Builder** | **Environments** | **Agents** | **Inference** | **Memory** | **Safety** | **Telemetry** | +|:------------------------:|:----------------------:|:----------:|:-------------:|:----------:|:----------:|:-------------:| +| Meta Reference | Single Node | ✅ | ✅ | ✅ | ✅ | ✅ | +| SambaNova | Hosted | | ✅ | | | | +| Cerebras | Hosted | | ✅ | | | | +| Fireworks | Hosted | ✅ | ✅ | ✅ | | | +| AWS Bedrock | Hosted | | ✅ | | ✅ | | +| Together | Hosted | ✅ | ✅ | | ✅ | | +| Groq | Hosted | | ✅ | | | | +| Ollama | Single Node | | ✅ | | | | +| TGI | Hosted and Single Node | | ✅ | | | | +| NVIDIA NIM | Hosted and Single Node | | ✅ | | | | +| Chroma | Single Node | | | ✅ | | | +| PG Vector | Single Node | | | ✅ | | | +| PyTorch ExecuTorch | On-device iOS | ✅ | ✅ | | | | +| vLLM | Hosted and Single Node | | ✅ | | | | ### Distributions diff --git a/distributions/dependencies.json b/distributions/dependencies.json index 6babf34409..2e18730d70 100644 --- a/distributions/dependencies.json +++ b/distributions/dependencies.json @@ -1,4 +1,7 @@ { + "fiddlecube": [ + "httpx" + ], "bedrock": [ "aiosqlite", "autoevals", @@ -66,7 +69,8 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "fireworks": [ + "dell": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", @@ -76,10 +80,9 @@ "faiss-cpu", "fastapi", "fire", - "fireworks-ai", "httpx", + "huggingface_hub", "matplotlib", - "mcp", "nltk", "numpy", "openai", @@ -100,8 +103,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "hf-endpoint": [ - "aiohttp", + "fireworks": [ "aiosqlite", "autoevals", "blobfile", @@ -111,8 +113,8 @@ "faiss-cpu", "fastapi", "fire", + "fireworks-ai", "httpx", - "huggingface_hub", "matplotlib", "mcp", "nltk", @@ -135,7 +137,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "hf-serverless": [ + "hf-endpoint": [ "aiohttp", "aiosqlite", "autoevals", @@ -170,20 +172,19 @@ "sentence-transformers --no-deps", "torch torchvision 
--index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-gpu": [ - "accelerate", + "hf-serverless": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", "chardet", "chromadb-client", "datasets", - "fairscale", "faiss-cpu", "fastapi", "fire", "httpx", - "lm-format-enforcer", + "huggingface_hub", "matplotlib", "mcp", "nltk", @@ -199,18 +200,14 @@ "requests", "scikit-learn", "scipy", - "sentence-transformers", "sentencepiece", - "torch", - "torchvision", "tqdm", "transformers", "uvicorn", - "zmq", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "meta-reference-quantized-gpu": [ + "meta-reference-gpu": [ "accelerate", "aiosqlite", "autoevals", @@ -221,7 +218,6 @@ "fairscale", "faiss-cpu", "fastapi", - "fbgemm-gpu", "fire", "httpx", "lm-format-enforcer", @@ -243,7 +239,6 @@ "sentence-transformers", "sentencepiece", "torch", - "torchao==0.5.0", "torchvision", "tqdm", "transformers", @@ -252,22 +247,25 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "ollama": [ - "aiohttp", + "meta-reference-quantized-gpu": [ + "accelerate", "aiosqlite", "autoevals", "blobfile", "chardet", "chromadb-client", "datasets", + "fairscale", "faiss-cpu", "fastapi", + "fbgemm-gpu", "fire", "httpx", + "lm-format-enforcer", "matplotlib", + "mcp", "nltk", "numpy", - "ollama", "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", @@ -279,19 +277,23 @@ "requests", "scikit-learn", "scipy", + "sentence-transformers", "sentencepiece", + "torch", + "torchao==0.5.0", + "torchvision", "tqdm", "transformers", "uvicorn", + "zmq", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "remote-vllm": [ + "nvidia": [ "aiosqlite", "autoevals", "blobfile", "chardet", - "chromadb-client", "datasets", "faiss-cpu", "fastapi", @@ -319,7 +321,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "tgi": [ + "ollama": [ "aiohttp", "aiosqlite", "autoevals", @@ -331,11 +333,10 @@ "fastapi", "fire", "httpx", - "huggingface_hub", "matplotlib", - "mcp", "nltk", "numpy", + "ollama", "openai", "opentelemetry-exporter-otlp-proto-http", "opentelemetry-sdk", @@ -354,7 +355,7 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "together": [ + "remote-vllm": [ "aiosqlite", "autoevals", "blobfile", @@ -381,26 +382,22 @@ "scikit-learn", "scipy", "sentencepiece", - "together", "tqdm", "transformers", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "vllm-gpu": [ + "sambanova": [ "aiosqlite", - "autoevals", "blobfile", "chardet", "chromadb-client", - "datasets", "faiss-cpu", "fastapi", "fire", "httpx", "matplotlib", - "mcp", "nltk", "numpy", "openai", @@ -418,20 +415,22 @@ "tqdm", "transformers", "uvicorn", - "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "nvidia": [ + "tgi": [ + "aiohttp", "aiosqlite", "autoevals", "blobfile", "chardet", + "chromadb-client", "datasets", "faiss-cpu", "fastapi", "fire", "httpx", + "huggingface_hub", "matplotlib", "mcp", "nltk", @@ -454,16 +453,19 @@ "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "sambanova": [ + "together": [ "aiosqlite", + "autoevals", "blobfile", "chardet", "chromadb-client", + 
"datasets", "faiss-cpu", "fastapi", "fire", "httpx", "matplotlib", + "mcp", "nltk", "numpy", "openai", @@ -478,14 +480,14 @@ "scikit-learn", "scipy", "sentencepiece", + "together", "tqdm", "transformers", "uvicorn", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ], - "dell": [ - "aiohttp", + "vllm-gpu": [ "aiosqlite", "autoevals", "blobfile", @@ -496,8 +498,8 @@ "fastapi", "fire", "httpx", - "huggingface_hub", "matplotlib", + "mcp", "nltk", "numpy", "openai", @@ -515,6 +517,7 @@ "tqdm", "transformers", "uvicorn", + "vllm", "sentence-transformers --no-deps", "torch torchvision --index-url https://download.pytorch.org/whl/cpu" ] diff --git a/docs/conftest.py b/docs/conftest.py new file mode 100644 index 0000000000..bec535f77d --- /dev/null +++ b/docs/conftest.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +def pytest_collection_modifyitems(items): + for item in items: + item.name = item.name.replace(' ', '_') diff --git a/docs/getting_started.ipynb b/docs/getting_started.ipynb index 4e48931580..abe537c8e1 100644 --- a/docs/getting_started.ipynb +++ b/docs/getting_started.ipynb @@ -86,7 +86,6 @@ "# NBVAL_SKIP\n", "\n", "!apt-get install -y bubblewrap\n", - "# install a branch of llama stack\n", "import os\n", "os.environ[\"UV_SYSTEM_PYTHON\"] = \"1\"\n", "!pip install uv\n", @@ -3397,6 +3396,231 @@ "response = client.scoring.score(input_rows=rows, scoring_functions=scoring_params)\n", "pprint(response)\n" ] + }, + { + "cell_type": "markdown", + "id": "ad077440", + "metadata": {}, + "source": [ + "## 4. Image Understanding with Llama 3.2\n", + "\n", + "Below is a complete example of using Together's Llama Stack 0.1 server at https://llama-stack.together.ai to ask Llama 3.2 questions about an image." 
+ ] + }, + { + "cell_type": "markdown", + "id": "82e381ec", + "metadata": {}, + "source": [ + "### 4.1 Setup and helpers\n", + "\n", + "Below we install the Llama Stack client 0.1, download the example image, define two image helpers, and set Llama Stack Together server URL and Llama 3.2 model name.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "865fc5a8", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-stack-client==0.1.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44e05e16", + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/meta-llama/llama-models/refs/heads/main/Llama_Repo.jpeg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "469750f7", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def display_image(path):\n", + " img = Image.open(path)\n", + " plt.imshow(img)\n", + " plt.axis('off')\n", + " plt.show()\n", + "\n", + "display_image(\"Llama_Repo.jpeg\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2c1e1c2", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "\n", + "def encode_image(image_path):\n", + " with open(image_path, \"rb\") as image_file:\n", + " base64_string = base64.b64encode(image_file.read()).decode(\"utf-8\")\n", + " base64_url = f\"data:image/png;base64,{base64_string}\"\n", + " return base64_url" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c565f99e", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client import LlamaStackClient\n", + "\n", + "LLAMA_STACK_API_TOGETHER_URL=\"https://llama-stack.together.ai\"\n", + "LLAMA32_11B_INSTRUCT = \"meta-llama/Llama-3.2-11B-Vision-Instruct\"" + ] + }, + { + "cell_type": "markdown", + "id": "7737cd41", + "metadata": {}, + "source": [ + "### 4.2 Using Llama Stack Chat API\n", + "\n", + "The code below uses the Llama Stack 0.1's chat API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7914894", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.inference.event_logger import EventLogger\n", + "\n", + "async def run_main(image_path: str, prompt):\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }\n", + "\n", + " response = client.inference.chat_completion(\n", + " messages=[message],\n", + " model_id=LLAMA32_11B_INSTRUCT,\n", + " stream=False,\n", + " )\n", + "\n", + " print(response.completion_message.content.lower().strip())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ee09b97", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] + }, + { + "cell_type": "markdown", + "id": "e741d7b9", + "metadata": {}, + "source": [ + "### 4.3 Using Llama Stack Agent API\n", + "\n", + "The code below uses the Llama Stack 0.1's Agent API to interact with Llama 3.2:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"f9a83275", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_stack_client.lib.agents.agent import Agent\n", + "from llama_stack_client.lib.agents.event_logger import EventLogger\n", + "from llama_stack_client.types.agent_create_params import AgentConfig\n", + "\n", + "async def run_main(image_path, prompt):\n", + " base64_image = encode_image(image_path)\n", + "\n", + " client = LlamaStackClient(\n", + " base_url=LLAMA_STACK_API_TOGETHER_URL,\n", + " )\n", + "\n", + " agent_config = AgentConfig(\n", + " model=LLAMA32_11B_INSTRUCT,\n", + " instructions=\"You are a helpful assistant\",\n", + " enable_session_persistence=False,\n", + " )\n", + "\n", + " agent = Agent(client, agent_config)\n", + " session_id = agent.create_session(\"test-session\")\n", + "\n", + " response = agent.create_turn(\n", + " messages=[{\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"image\",\n", + " \"image\": {\n", + " \"url\": {\n", + " \"uri\": encode_image(image_path)\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\",\n", + " \"text\": prompt,\n", + " }\n", + " ]\n", + " }],\n", + " session_id=session_id,\n", + " )\n", + "\n", + " for log in EventLogger().log(response):\n", + " log.print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15d0098b", + "metadata": {}, + "outputs": [], + "source": [ + "await run_main(\"Llama_Repo.jpeg\",\n", + " \"How many different colors are those llamas?\\\n", + " What are those colors?\")" + ] } ], "metadata": { diff --git a/docs/source/building_applications/index.md b/docs/source/building_applications/index.md index 45dca5a1cc..e89a90299c 100644 --- a/docs/source/building_applications/index.md +++ b/docs/source/building_applications/index.md @@ -4,7 +4,7 @@ Llama Stack provides all the building blocks needed to create sophisticated AI a The best way to get started is to look at this notebook which walks through the various APIs (from basic inference, to RAG agents) and how to use them. -**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/notebooks/Llama_Stack_Benchmark_Evals.ipynb) +**Notebook**: [Building AI Applications](https://github.com/meta-llama/llama-stack/blob/main/docs/getting_started.ipynb) Here are some key topics that will help you build effective agents: diff --git a/docs/source/building_applications/rag.md b/docs/source/building_applications/rag.md index 6b7a354b7c..5287a2367c 100644 --- a/docs/source/building_applications/rag.md +++ b/docs/source/building_applications/rag.md @@ -36,13 +36,12 @@ chunks = [ "content": "Your document text here", "mime_type": "text/plain", }, - ..., ] -client.vector_io.insert(vector_db_id, chunks) +client.vector_io.insert(vector_db_id=vector_db_id, chunks=chunks) # You can then query for these chunks chunks_response = client.vector_io.query( - vector_db_id, query="What do you know about..." + vector_db_id=vector_db_id, query="What do you know about..." ) ``` @@ -72,8 +71,8 @@ client.tool_runtime.rag_tool.insert( # Query documents results = client.tool_runtime.rag_tool.query( - vector_db_id=vector_db_id, - query="What do you know about...", + vector_db_ids=[vector_db_id], + content="What do you know about...", ) ``` @@ -82,10 +81,14 @@ results = client.tool_runtime.rag_tool.query( One of the most powerful patterns is combining agents with RAG capabilities. 
Here's a complete example: ```python +from llama_stack_client.types.agent_create_params import AgentConfig +from llama_stack_client.lib.agents.agent import Agent + # Configure agent with memory agent_config = AgentConfig( - model="Llama3.2-3B-Instruct", + model="meta-llama/Llama-3.2-3B-Instruct", instructions="You are a helpful assistant", + enable_session_persistence=False, toolgroups=[ { "name": "builtin::rag", @@ -105,10 +108,10 @@ response = agent.create_turn( {"role": "user", "content": "I am providing some documents for reference."} ], documents=[ - dict( - content="https://raw.githubusercontent.com/example/doc.rst", - mime_type="text/plain", - ) + { + "content": "https://raw.githubusercontent.com/example/doc.rst", + "mime_type": "text/plain", + } ], session_id=session_id, ) diff --git a/docs/source/distributions/self_hosted_distro/dell.md b/docs/source/distributions/self_hosted_distro/dell.md index be326ffa59..aef3ecf589 100644 --- a/docs/source/distributions/self_hosted_distro/dell.md +++ b/docs/source/distributions/self_hosted_distro/dell.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Dell Distribution of Llama Stack diff --git a/docs/source/distributions/self_hosted_distro/fireworks.md b/docs/source/distributions/self_hosted_distro/fireworks.md index 9afeb4894c..f77d9f656a 100644 --- a/docs/source/distributions/self_hosted_distro/fireworks.md +++ b/docs/source/distributions/self_hosted_distro/fireworks.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Fireworks Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md index d00d8177f5..b183757dbf 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-gpu.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Meta Reference Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md index e46c2d112a..9aeb7a88b8 100644 --- a/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md +++ b/docs/source/distributions/self_hosted_distro/meta-reference-quantized-gpu.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Meta Reference Quantized Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/ollama.md b/docs/source/distributions/self_hosted_distro/ollama.md index 54f6b8fdfb..a3a45f9a89 100644 --- a/docs/source/distributions/self_hosted_distro/ollama.md +++ b/docs/source/distributions/self_hosted_distro/ollama.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Ollama Distribution ```{toctree} diff --git a/docs/source/distributions/self_hosted_distro/remote-vllm.md b/docs/source/distributions/self_hosted_distro/remote-vllm.md index ff626d40d0..6c3bbd1d03 100644 --- a/docs/source/distributions/self_hosted_distro/remote-vllm.md +++ b/docs/source/distributions/self_hosted_distro/remote-vllm.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Remote vLLM Distribution ```{toctree} :maxdepth: 2 diff --git a/docs/source/distributions/self_hosted_distro/sambanova.md b/docs/source/distributions/self_hosted_distro/sambanova.md index 86ef4ac581..e6ac616be4 100644 --- a/docs/source/distributions/self_hosted_distro/sambanova.md +++ b/docs/source/distributions/self_hosted_distro/sambanova.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # SambaNova Distribution ```{toctree} diff --git 
a/docs/source/distributions/self_hosted_distro/tgi.md b/docs/source/distributions/self_hosted_distro/tgi.md index b970ab9fe1..f4eecf2cda 100644 --- a/docs/source/distributions/self_hosted_distro/tgi.md +++ b/docs/source/distributions/self_hosted_distro/tgi.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # TGI Distribution diff --git a/docs/source/distributions/self_hosted_distro/together.md b/docs/source/distributions/self_hosted_distro/together.md index 45ae462d55..8e36c1eb0c 100644 --- a/docs/source/distributions/self_hosted_distro/together.md +++ b/docs/source/distributions/self_hosted_distro/together.md @@ -1,7 +1,7 @@ - --- orphan: true --- + # Together Distribution ```{toctree} diff --git a/docs/source/index.md b/docs/source/index.md index 095f50885c..2834f56419 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -2,7 +2,7 @@ ```{admonition} News :class: tip -Llama Stack 0.1.1 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.1) for more details. +Llama Stack 0.1.2 is now available! See the [release notes](https://github.com/meta-llama/llama-stack/releases/tag/v0.1.2) for more details. ``` # Llama Stack diff --git a/llama_stack/cli/stack/list_providers.py b/llama_stack/cli/stack/list_providers.py index 96e978826a..909fea0309 100644 --- a/llama_stack/cli/stack/list_providers.py +++ b/llama_stack/cli/stack/list_providers.py @@ -22,9 +22,9 @@ def __init__(self, subparsers: argparse._SubParsersAction): self.parser.set_defaults(func=self._run_providers_list_cmd) def _add_arguments(self): - from llama_stack.distribution.datatypes import Api + from llama_stack.distribution.distribution import providable_apis - api_values = [a.value for a in Api] + api_values = [api.value for api in providable_apis()] self.parser.add_argument( "api", type=str, diff --git a/llama_stack/cli/stack/run.py b/llama_stack/cli/stack/run.py index f84def184b..e7d6df2926 100644 --- a/llama_stack/cli/stack/run.py +++ b/llama_stack/cli/stack/run.py @@ -55,6 +55,16 @@ def _add_arguments(self): default=[], metavar="KEY=VALUE", ) + self.parser.add_argument( + "--tls-keyfile", + type=str, + help="Path to TLS key file for HTTPS", + ) + self.parser.add_argument( + "--tls-certfile", + type=str, + help="Path to TLS certificate file for HTTPS", + ) def _run_stack_run_cmd(self, args: argparse.Namespace) -> None: import importlib.resources @@ -178,4 +188,7 @@ def get_conda_prefix(env_name): return run_args.extend(["--env", f"{key}={value}"]) + if args.tls_keyfile and args.tls_certfile: + run_args.extend(["--tls-keyfile", args.tls_keyfile, "--tls-certfile", args.tls_certfile]) + run_with_pty(run_args) diff --git a/llama_stack/distribution/datatypes.py b/llama_stack/distribution/datatypes.py index 8b579b6365..97706f22a5 100644 --- a/llama_stack/distribution/datatypes.py +++ b/llama_stack/distribution/datatypes.py @@ -117,6 +117,23 @@ class Provider(BaseModel): config: Dict[str, Any] +class ServerConfig(BaseModel): + port: int = Field( + default=8321, + description="Port to listen on", + ge=1024, + le=65535, + ) + tls_certfile: Optional[str] = Field( + default=None, + description="Path to TLS certificate file for HTTPS", + ) + tls_keyfile: Optional[str] = Field( + default=None, + description="Path to TLS key file for HTTPS", + ) + + class StackRunConfig(BaseModel): version: str = LLAMA_STACK_RUN_CONFIG_VERSION @@ -159,6 +176,11 @@ class StackRunConfig(BaseModel): eval_tasks: List[EvalTaskInput] = Field(default_factory=list) tool_groups: List[ToolGroupInput] = 
Field(default_factory=list) + server: ServerConfig = Field( + default_factory=ServerConfig, + description="Configuration for the HTTP(S) server", + ) + class BuildConfig(BaseModel): version: str = LLAMA_STACK_BUILD_CONFIG_VERSION diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py index 13aa679563..2c0f739744 100644 --- a/llama_stack/distribution/library_client.py +++ b/llama_stack/distribution/library_client.py @@ -17,17 +17,6 @@ import httpx import yaml -from llama_stack_client import ( - APIResponse, - AsyncAPIResponse, - AsyncLlamaStackClient, - AsyncStream, - LlamaStackClient, - NOT_GIVEN, -) -from pydantic import BaseModel, TypeAdapter -from rich.console import Console -from termcolor import cprint from llama_stack.distribution.build import print_pip_install_help from llama_stack.distribution.configure import parse_and_maybe_upgrade_config @@ -46,6 +35,17 @@ setup_logger, start_trace, ) +from llama_stack_client import ( + APIResponse, + AsyncAPIResponse, + AsyncLlamaStackClient, + AsyncStream, + LlamaStackClient, + NOT_GIVEN, +) +from pydantic import BaseModel, TypeAdapter +from rich.console import Console +from termcolor import cprint T = TypeVar("T") @@ -198,6 +198,7 @@ def __init__( async def initialize(self) -> bool: try: + self.endpoint_impls = None self.impls = await construct_stack(self.config, self.custom_provider_registry) except ModuleNotFoundError as _e: cprint(_e.msg, "red") @@ -213,7 +214,7 @@ async def initialize(self) -> bool: f"Please run:\n\n{prefix}llama stack build --template {self.config_path_or_template_name} --image-type venv\n\n", "yellow", ) - return False + raise _e if Api.telemetry in self.impls: setup_logger(self.impls[Api.telemetry]) diff --git a/llama_stack/distribution/server/server.py b/llama_stack/distribution/server/server.py index fcd0e3cad1..d2c32de119 100644 --- a/llama_stack/distribution/server/server.py +++ b/llama_stack/distribution/server/server.py @@ -282,8 +282,19 @@ def main(): action="append", help="Environment variables in KEY=value format. 
Can be specified multiple times.", ) + parser.add_argument( + "--tls-keyfile", + help="Path to TLS key file for HTTPS", + required="--tls-certfile" in sys.argv, + ) + parser.add_argument( + "--tls-certfile", + help="Path to TLS certificate file for HTTPS", + required="--tls-keyfile" in sys.argv, + ) args = parser.parse_args() + if args.env: for env_pair in args.env: try: @@ -381,11 +392,36 @@ def main(): import uvicorn - # FYI this does not do hot-reloads + # Configure SSL if certificates are provided + port = args.port or config.server.port + + ssl_config = None + if args.tls_keyfile: + keyfile = args.tls_keyfile + certfile = args.tls_certfile + else: + keyfile = config.server.tls_keyfile + certfile = config.server.tls_certfile + + if keyfile and certfile: + ssl_config = { + "ssl_keyfile": keyfile, + "ssl_certfile": certfile, + } + print(f"HTTPS enabled with certificates:\n Key: {keyfile}\n Cert: {certfile}") listen_host = ["::", "0.0.0.0"] if not args.disable_ipv6 else "0.0.0.0" - print(f"Listening on {listen_host}:{args.port}") - uvicorn.run(app, host=listen_host, port=args.port) + print(f"Listening on {listen_host}:{port}") + + uvicorn_config = { + "app": app, + "host": listen_host, + "port": port, + } + if ssl_config: + uvicorn_config.update(ssl_config) + + uvicorn.run(**uvicorn_config) def extract_path_params(route: str) -> List[str]: diff --git a/llama_stack/distribution/start_conda_env.sh b/llama_stack/distribution/start_conda_env.sh index c37f30ef00..fe830059ff 100755 --- a/llama_stack/distribution/start_conda_env.sh +++ b/llama_stack/distribution/start_conda_env.sh @@ -34,6 +34,7 @@ shift # Process environment variables from --env arguments env_vars="" +other_args="" while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -48,6 +49,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -61,4 +63,5 @@ $CONDA_PREFIX/bin/python \ -m llama_stack.distribution.server.server \ --yaml-config "$yaml_config" \ --port "$port" \ - $env_vars + $env_vars \ + $other_args diff --git a/llama_stack/distribution/start_container.sh b/llama_stack/distribution/start_container.sh index 2c5d65d09e..a5f543fb43 100755 --- a/llama_stack/distribution/start_container.sh +++ b/llama_stack/distribution/start_container.sh @@ -40,8 +40,12 @@ shift port="$1" shift +# Initialize other_args +other_args="" + # Process environment variables from --env arguments env_vars="" + while [[ $# -gt 0 ]]; do case "$1" in --env) @@ -55,6 +59,7 @@ while [[ $# -gt 0 ]]; do fi ;; *) + other_args="$other_args $1" shift ;; esac @@ -93,5 +98,8 @@ $CONTAINER_BINARY run $CONTAINER_OPTS -it \ -v "$yaml_config:/app/config.yaml" \ $mounts \ --env LLAMA_STACK_PORT=$port \ - --entrypoint='["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"]' \ - $container_image:$version_tag + --entrypoint python \ + $container_image:$version_tag \ + -m llama_stack.distribution.server.server \ + --yaml-config /app/config.yaml \ + $other_args diff --git a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py index b48f92d361..6f4b25b9d3 100644 --- a/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py +++ b/llama_stack/providers/inline/tool_runtime/code_interpreter/code_execution.py @@ -67,7 +67,6 @@ def generate_bwrap_command(bind_dirs: List[str]) -> str: @dataclass class CodeExecutionContext: matplotlib_dump_dir: str - use_proxy: bool = False @dataclass diff --git 
a/llama_stack/providers/registry/safety.py b/llama_stack/providers/registry/safety.py index b9f7b6d784..1e57982e9c 100644 --- a/llama_stack/providers/registry/safety.py +++ b/llama_stack/providers/registry/safety.py @@ -85,4 +85,13 @@ def available_providers() -> List[ProviderSpec]: config_class="llama_stack.providers.remote.safety.bedrock.BedrockSafetyConfig", ), ), + remote_provider_spec( + api=Api.safety, + adapter=AdapterSpec( + adapter_type="fiddlecube", + pip_packages=["httpx"], + module="llama_stack.providers.remote.safety.fiddlecube", + config_class="llama_stack.providers.remote.safety.fiddlecube.FiddlecubeSafetyConfig", + ), + ), ] diff --git a/llama_stack/providers/remote/inference/groq/groq.py b/llama_stack/providers/remote/inference/groq/groq.py index 461b3ee61a..4e6cc2d6bd 100644 --- a/llama_stack/providers/remote/inference/groq/groq.py +++ b/llama_stack/providers/remote/inference/groq/groq.py @@ -26,6 +26,7 @@ Message, ResponseFormat, ToolChoice, + ToolConfig, ) from llama_stack.distribution.request_headers import NeedsRequestProviderData from llama_stack.providers.remote.inference.groq.config import GroqConfig diff --git a/llama_stack/providers/remote/inference/ollama/ollama.py b/llama_stack/providers/remote/inference/ollama/ollama.py index cff8aa7422..ecd1958541 100644 --- a/llama_stack/providers/remote/inference/ollama/ollama.py +++ b/llama_stack/providers/remote/inference/ollama/ollama.py @@ -352,24 +352,20 @@ async def embeddings( return EmbeddingsResponse(embeddings=embeddings) async def register_model(self, model: Model) -> Model: - # ollama does not have embedding models running. Check if the model is in list of available models. - if model.model_type == ModelType.embedding: - response = await self.client.list() + async def check_model_availability(model_id: str): + response = await self.client.ps() available_models = [m["model"] for m in response["models"]] - if model.provider_resource_id not in available_models: + if model_id not in available_models: raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. " - f"Available models: {', '.join(available_models)}" + f"Model '{model_id}' is not available in Ollama. Available models: {', '.join(available_models)}" ) + + if model.model_type == ModelType.embedding: + await check_model_availability(model.provider_resource_id) return model + model = await self.register_helper.register_model(model) - models = await self.client.ps() - available_models = [m["model"] for m in models["models"]] - if model.provider_resource_id not in available_models: - raise ValueError( - f"Model '{model.provider_resource_id}' is not available in Ollama. " - f"Available models: {', '.join(available_models)}" - ) + await check_model_availability(model.provider_resource_id) return model diff --git a/llama_stack/providers/remote/safety/fiddlecube/__init__.py b/llama_stack/providers/remote/safety/fiddlecube/__init__.py new file mode 100644 index 0000000000..d4cceb6e72 --- /dev/null +++ b/llama_stack/providers/remote/safety/fiddlecube/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ + +from typing import Any + +from .config import FiddlecubeSafetyConfig + + +async def get_adapter_impl(config: FiddlecubeSafetyConfig, _deps) -> Any: + from .fiddlecube import FiddlecubeSafetyAdapter + + impl = FiddlecubeSafetyAdapter(config) + await impl.initialize() + return impl diff --git a/llama_stack/providers/remote/safety/fiddlecube/config.py b/llama_stack/providers/remote/safety/fiddlecube/config.py new file mode 100644 index 0000000000..0ba5d7c32f --- /dev/null +++ b/llama_stack/providers/remote/safety/fiddlecube/config.py @@ -0,0 +1,17 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +from typing import List + +from pydantic import BaseModel, Field + +from llama_models.schema_utils import json_schema_type + + +@json_schema_type +class FiddlecubeSafetyConfig(BaseModel): + api_url: str = "https://api.fiddlecube.ai/api" + excluded_categories: List[str] = Field(default_factory=list) diff --git a/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py new file mode 100644 index 0000000000..b00066eaff --- /dev/null +++ b/llama_stack/providers/remote/safety/fiddlecube/fiddlecube.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import json +import logging +import httpx + +from typing import Any, Dict, List + +from llama_stack.apis.inference import Message + +from llama_stack.apis.safety import ( + RunShieldResponse, + Safety, +) +from llama_stack.apis.safety.safety import SafetyViolation, ViolationLevel +from llama_stack.apis.shields import Shield +from llama_stack.providers.datatypes import ShieldsProtocolPrivate + +from .config import FiddlecubeSafetyConfig + + +logger = logging.getLogger(__name__) + + +class FiddlecubeSafetyAdapter(Safety, ShieldsProtocolPrivate): + def __init__(self, config: FiddlecubeSafetyConfig) -> None: + self.config = config + self.registered_shields = [] + + async def initialize(self) -> None: + pass + + async def shutdown(self) -> None: + pass + + async def register_shield(self, shield: Shield) -> None: + print("Shield", shield) + pass + + async def run_shield( + self, shield_id: str, messages: List[Message], params: Dict[str, Any] = None + ) -> RunShieldResponse: + async with httpx.AsyncClient(timeout=30.0) as client: + request_body = { + "messages": [message.model_dump(mode="json") for message in messages], + } + if params.get("excluded_categories"): + request_body["excluded_categories"] = params.get("excluded_categories") + headers = {"Content-Type": "application/json"} + response = await client.post( + f"{self.config.api_url}/safety/guard/check", + json=request_body, + headers=headers, + ) + + logger.debug("Response:::", response.status_code) + + # Check if the response is successful + if response.status_code != 200: + logger.error(f"FiddleCube API error: {response.status_code} - {response.text}") + raise RuntimeError("Failed to run shield with FiddleCube API") + + # Convert the response into the format RunShieldResponse expects + response_data = response.json() + logger.debug("Response data: %s", json.dumps(response_data, indent=2)) + + # Check if there's a violation based on the response structure + if response_data.get("action") == 
"GUARDRAIL_INTERVENED": + user_message = "" + metadata = {} + + outputs = response_data.get("outputs", []) + if outputs: + user_message = outputs[-1].get("text", "Safety violation detected") + + assessments = response_data.get("assessments", []) + for assessment in assessments: + metadata.update(dict(assessment)) + + return RunShieldResponse( + violation=SafetyViolation( + user_message=user_message, + violation_level=ViolationLevel.ERROR, + metadata=metadata, + ) + ) + + return RunShieldResponse() diff --git a/llama_stack/providers/remote/vector_io/qdrant/__init__.py b/llama_stack/providers/remote/vector_io/qdrant/__init__.py index 54605fcf91..c584e29efe 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/__init__.py +++ b/llama_stack/providers/remote/vector_io/qdrant/__init__.py @@ -12,8 +12,8 @@ async def get_adapter_impl(config: QdrantConfig, deps: Dict[Api, ProviderSpec]): - from .qdrant import QdrantVectorMemoryAdapter + from .qdrant import QdrantVectorDBAdapter - impl = QdrantVectorMemoryAdapter(config, deps[Api.inference]) + impl = QdrantVectorDBAdapter(config, deps[Api.inference]) await impl.initialize() return impl diff --git a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py index 719070528d..e7ad136eba 100644 --- a/llama_stack/providers/remote/vector_io/qdrant/qdrant.py +++ b/llama_stack/providers/remote/vector_io/qdrant/qdrant.py @@ -55,7 +55,7 @@ async def add_chunks(self, chunks: List[Chunk], embeddings: NDArray): points = [] for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)): - chunk_id = f"{chunk.document_id}:chunk-{i}" + chunk_id = f"{chunk.metadata['document_id']}:chunk-{i}" points.append( PointStruct( id=convert_id(chunk_id), @@ -93,6 +93,9 @@ async def query(self, embedding: NDArray, k: int, score_threshold: float) -> Que return QueryChunksResponse(chunks=chunks, scores=scores) + async def delete(self): + await self.client.delete_collection(collection_name=self.collection_name) + class QdrantVectorDBAdapter(VectorIO, VectorDBsProtocolPrivate): def __init__(self, config: QdrantConfig, inference_api: Api.inference) -> None: diff --git a/llama_stack/providers/tests/datasetio/test_datasetio.py b/llama_stack/providers/tests/datasetio/test_datasetio.py index cf28045a4a..fd76bafe02 100644 --- a/llama_stack/providers/tests/datasetio/test_datasetio.py +++ b/llama_stack/providers/tests/datasetio/test_datasetio.py @@ -95,7 +95,7 @@ async def test_register_dataset(self, datasetio_stack): assert len(response) == 1 assert response[0].identifier == "test_dataset" - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): # unregister a dataset that does not exist await datasets_impl.unregister_dataset("test_dataset2") @@ -104,7 +104,7 @@ async def test_register_dataset(self, datasetio_stack): assert isinstance(response, list) assert len(response) == 0 - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await datasets_impl.unregister_dataset("test_dataset") @pytest.mark.asyncio diff --git a/llama_stack/providers/tests/inference/test_model_registration.py b/llama_stack/providers/tests/inference/test_model_registration.py index 96a34ec0e1..664564d222 100644 --- a/llama_stack/providers/tests/inference/test_model_registration.py +++ b/llama_stack/providers/tests/inference/test_model_registration.py @@ -32,7 +32,7 @@ async def test_register_unsupported_model(self, inference_stack, inference_model ) # Try to register a model that's too large for local inference 
- with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3.1-70B-Instruct", ) @@ -42,7 +42,7 @@ async def test_register_nonexistent_model(self, inference_stack): _, models_impl = inference_stack # Try to register a non-existent model - with pytest.raises(Exception) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="Llama3-NonExistent-Model", ) @@ -59,7 +59,7 @@ async def test_register_with_llama_model(self, inference_stack): }, ) - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={ @@ -88,7 +88,7 @@ async def test_initialize_model_during_registering(self, inference_stack): async def test_register_with_invalid_llama_model(self, inference_stack): _, models_impl = inference_stack - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError): await models_impl.register_model( model_id="custom-model-2", metadata={"llama_model": "invalid-llama-model"}, diff --git a/llama_stack/providers/tests/inference/test_vision_inference.py b/llama_stack/providers/tests/inference/test_vision_inference.py index 964f709016..a2434ac417 100644 --- a/llama_stack/providers/tests/inference/test_vision_inference.py +++ b/llama_stack/providers/tests/inference/test_vision_inference.py @@ -4,12 +4,12 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. +import base64 from pathlib import Path import pytest -from llama_stack.apis.common.content_types import ImageContentItem, TextContentItem, URL - +from llama_stack.apis.common.content_types import URL, ImageContentItem, TextContentItem from llama_stack.apis.inference import ( ChatCompletionResponse, ChatCompletionResponseEventType, @@ -23,7 +23,7 @@ THIS_DIR = Path(__file__).parent with open(THIS_DIR / "pasta.jpeg", "rb") as f: - PASTA_IMAGE = f.read() + PASTA_IMAGE = base64.b64encode(f.read()).decode("utf-8") class TestVisionModelInference: diff --git a/llama_stack/scripts/distro_codegen.py b/llama_stack/scripts/distro_codegen.py index 7064d3104c..c73c15d415 100644 --- a/llama_stack/scripts/distro_codegen.py +++ b/llama_stack/scripts/distro_codegen.py @@ -29,7 +29,7 @@ def find_template_dirs(templates_dir: Path) -> Iterator[Path]: if not templates_dir.exists(): raise FileNotFoundError(f"Templates directory not found: {templates_dir}") - return (d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") + return sorted(d for d in templates_dir.iterdir() if d.is_dir() and d.name != "__pycache__") def process_template(template_dir: Path, progress) -> None: diff --git a/llama_stack/templates/bedrock/run.yaml b/llama_stack/templates/bedrock/run.yaml index 39408c1bd7..be6c9a928c 100644 --- a/llama_stack/templates/bedrock/run.yaml +++ b/llama_stack/templates/bedrock/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/cerebras/run.yaml b/llama_stack/templates/cerebras/run.yaml index 5a70890a89..05d3f45259 100644 --- a/llama_stack/templates/cerebras/run.yaml +++ b/llama_stack/templates/cerebras/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run-with-safety.yaml 
b/llama_stack/templates/dell/run-with-safety.yaml index bdc82d03a9..04c5957d46 100644 --- a/llama_stack/templates/dell/run-with-safety.yaml +++ b/llama_stack/templates/dell/run-with-safety.yaml @@ -116,3 +116,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/dell/run.yaml b/llama_stack/templates/dell/run.yaml index 2ba62a7821..706444eb1b 100644 --- a/llama_stack/templates/dell/run.yaml +++ b/llama_stack/templates/dell/run.yaml @@ -107,3 +107,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run-with-safety.yaml b/llama_stack/templates/fireworks/run-with-safety.yaml index a4b425436f..0fbe14a5a5 100644 --- a/llama_stack/templates/fireworks/run-with-safety.yaml +++ b/llama_stack/templates/fireworks/run-with-safety.yaml @@ -172,3 +172,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/fireworks/run.yaml b/llama_stack/templates/fireworks/run.yaml index a497317bde..ccf67dcbb0 100644 --- a/llama_stack/templates/fireworks/run.yaml +++ b/llama_stack/templates/fireworks/run.yaml @@ -161,3 +161,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run-with-safety.yaml b/llama_stack/templates/hf-endpoint/run-with-safety.yaml index 0329f580ba..f520a2fdab 100644 --- a/llama_stack/templates/hf-endpoint/run-with-safety.yaml +++ b/llama_stack/templates/hf-endpoint/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-endpoint/run.yaml b/llama_stack/templates/hf-endpoint/run.yaml index 8163fe28e6..708cb1bcc6 100644 --- a/llama_stack/templates/hf-endpoint/run.yaml +++ b/llama_stack/templates/hf-endpoint/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run-with-safety.yaml b/llama_stack/templates/hf-serverless/run-with-safety.yaml index 9cee920a5b..7f0abf5be0 100644 --- a/llama_stack/templates/hf-serverless/run-with-safety.yaml +++ b/llama_stack/templates/hf-serverless/run-with-safety.yaml @@ -124,3 +124,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/hf-serverless/run.yaml b/llama_stack/templates/hf-serverless/run.yaml index c8ad0d38da..c0b7a4c605 100644 --- a/llama_stack/templates/hf-serverless/run.yaml +++ b/llama_stack/templates/hf-serverless/run.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml index 0faaabb159..c5286fc6be 100644 --- a/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml +++ b/llama_stack/templates/meta-reference-gpu/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - 
toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-gpu/run.yaml b/llama_stack/templates/meta-reference-gpu/run.yaml index 6ffe1fa360..310585f23f 100644 --- a/llama_stack/templates/meta-reference-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-gpu/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml index 5ff87a9010..d43cf3917e 100644 --- a/llama_stack/templates/meta-reference-quantized-gpu/run.yaml +++ b/llama_stack/templates/meta-reference-quantized-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/nvidia/run.yaml b/llama_stack/templates/nvidia/run.yaml index 6dc325e9dd..c8ae362f54 100644 --- a/llama_stack/templates/nvidia/run.yaml +++ b/llama_stack/templates/nvidia/run.yaml @@ -147,3 +147,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/doc_template.md b/llama_stack/templates/ollama/doc_template.md index eb4aadd292..29efe39c33 100644 --- a/llama_stack/templates/ollama/doc_template.md +++ b/llama_stack/templates/ollama/doc_template.md @@ -16,7 +16,7 @@ The `llamastack/distribution-{{ name }}` distribution consists of the following You should use this distribution if you have a regular desktop machine without very powerful GPUs. Of course, if you have powerful GPUs, you can still continue using this distribution since Ollama supports GPU acceleration. 
-{%- if run_config_env_vars %} +{% if run_config_env_vars %} ### Environment Variables The following environment variables can be configured: diff --git a/llama_stack/templates/ollama/run-with-safety.yaml b/llama_stack/templates/ollama/run-with-safety.yaml index 5b5c9c253a..ac5dab7552 100644 --- a/llama_stack/templates/ollama/run-with-safety.yaml +++ b/llama_stack/templates/ollama/run-with-safety.yaml @@ -121,3 +121,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/ollama/run.yaml b/llama_stack/templates/ollama/run.yaml index 3cc1cb2ac6..4852236750 100644 --- a/llama_stack/templates/ollama/run.yaml +++ b/llama_stack/templates/ollama/run.yaml @@ -110,3 +110,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run-with-safety.yaml b/llama_stack/templates/remote-vllm/run-with-safety.yaml index 4a0fa9a857..1fe998a1f9 100644 --- a/llama_stack/templates/remote-vllm/run-with-safety.yaml +++ b/llama_stack/templates/remote-vllm/run-with-safety.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/remote-vllm/run.yaml b/llama_stack/templates/remote-vllm/run.yaml index 9631f94a2f..9d3db8a31e 100644 --- a/llama_stack/templates/remote-vllm/run.yaml +++ b/llama_stack/templates/remote-vllm/run.yaml @@ -115,3 +115,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/sambanova/run.yaml b/llama_stack/templates/sambanova/run.yaml index 6cec51824c..39b0f3c4e8 100644 --- a/llama_stack/templates/sambanova/run.yaml +++ b/llama_stack/templates/sambanova/run.yaml @@ -126,3 +126,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/template.py b/llama_stack/templates/template.py index 09efd20384..04a09741cc 100644 --- a/llama_stack/templates/template.py +++ b/llama_stack/templates/template.py @@ -131,8 +131,15 @@ def generate_markdown_docs(self) -> str: providers_str = ", ".join(f"`{p}`" for p in providers) providers_table += f"| {api} | {providers_str} |\n" - template = "\n" - template += self.template_path.read_text() + template = self.template_path.read_text() + comment = "\n" + orphantext = "---\norphan: true\n---\n" + + if template.startswith(orphantext): + template = template.replace(orphantext, orphantext + comment) + else: + template = comment + template + # Render template with rich-generated table env = jinja2.Environment( trim_blocks=True, diff --git a/llama_stack/templates/tgi/run-with-safety.yaml b/llama_stack/templates/tgi/run-with-safety.yaml index 503505c326..ed6c9ef6f2 100644 --- a/llama_stack/templates/tgi/run-with-safety.yaml +++ b/llama_stack/templates/tgi/run-with-safety.yaml @@ -114,3 +114,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/tgi/run.yaml b/llama_stack/templates/tgi/run.yaml index f1953c513e..8bf76f37b1 100644 --- a/llama_stack/templates/tgi/run.yaml +++ b/llama_stack/templates/tgi/run.yaml @@ -113,3 +113,5 @@ tool_groups: provider_id: 
rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/build.yaml b/llama_stack/templates/together/build.yaml index 90ee5bcee5..ce5bec920c 100644 --- a/llama_stack/templates/together/build.yaml +++ b/llama_stack/templates/together/build.yaml @@ -10,6 +10,7 @@ distribution_spec: - remote::pgvector safety: - inline::llama-guard + - remote::fiddlecube agents: - inline::meta-reference telemetry: diff --git a/llama_stack/templates/together/run-with-safety.yaml b/llama_stack/templates/together/run-with-safety.yaml index ec351108e5..2989266307 100644 --- a/llama_stack/templates/together/run-with-safety.yaml +++ b/llama_stack/templates/together/run-with-safety.yaml @@ -167,3 +167,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/run.yaml b/llama_stack/templates/together/run.yaml index c2afd98e9b..142a9460ec 100644 --- a/llama_stack/templates/together/run.yaml +++ b/llama_stack/templates/together/run.yaml @@ -32,6 +32,9 @@ providers: - provider_id: llama-guard provider_type: inline::llama-guard config: {} + - provider_id: fiddlecube + provider_type: remote::fiddlecube + config: {} agents: - provider_id: meta-reference provider_type: inline::meta-reference @@ -143,8 +146,9 @@ models: model_id: all-MiniLM-L6-v2 provider_id: sentence-transformers model_type: embedding -shields: -- shield_id: meta-llama/Llama-Guard-3-8B +shields: +- shield_id: fiddlecube-redteam + provider_id: fiddlecube vector_dbs: [] datasets: [] scoring_fns: [] @@ -156,3 +160,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/llama_stack/templates/together/together.py b/llama_stack/templates/together/together.py index b7ac130ed1..bb6a23cb36 100644 --- a/llama_stack/templates/together/together.py +++ b/llama_stack/templates/together/together.py @@ -28,7 +28,7 @@ def get_distribution_template() -> DistributionTemplate: providers = { "inference": ["remote::together"], "vector_io": ["inline::faiss", "remote::chromadb", "remote::pgvector"], - "safety": ["inline::llama-guard"], + "safety": ["inline::llama-guard", "remote::fiddlecube"], "agents": ["inline::meta-reference"], "telemetry": ["inline::meta-reference"], "eval": ["inline::meta-reference"], @@ -103,7 +103,7 @@ def get_distribution_template() -> DistributionTemplate: "run.yaml": RunConfigSettings( provider_overrides={ "inference": [inference_provider, embedding_provider], - "vector_io": [vector_io_provider], + "vector_io": [vector_io_provider] }, default_models=default_models + [embedding_model], default_tool_groups=default_tool_groups, diff --git a/llama_stack/templates/vllm-gpu/run.yaml b/llama_stack/templates/vllm-gpu/run.yaml index 165e4d51db..41a545e1ae 100644 --- a/llama_stack/templates/vllm-gpu/run.yaml +++ b/llama_stack/templates/vllm-gpu/run.yaml @@ -117,3 +117,5 @@ tool_groups: provider_id: rag-runtime - toolgroup_id: builtin::code_interpreter provider_id: code-interpreter +server: + port: 8321 diff --git a/pyproject.toml b/pyproject.toml index 4020247729..5e9cb75e25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "llama_stack" -version = "0.1.1" +version = "0.1.2" authors = [{ name = "Meta Llama", email = "llama-oss@meta.com" }] description = "Llama Stack" readme = "README.md" @@ 
-25,8 +25,8 @@ dependencies = [ "fire", "httpx", "huggingface-hub", - "llama-models>=0.1.1", - "llama-stack-client>=0.1.1", + "llama-models>=0.1.2", + "llama-stack-client>=0.1.2", "prompt-toolkit", "python-dotenv", "pydantic>=2", diff --git a/requirements.txt b/requirements.txt index 157c688204..497feb7649 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ annotated-types==0.7.0 anyio==4.8.0 blobfile==3.0.0 certifi==2025.1.31 +chardet==5.2.0 charset-normalizer==3.4.1 click==8.1.8 colorama==0.4.6 ; sys_platform == 'win32' @@ -18,8 +19,8 @@ httpx==0.28.1 huggingface-hub==0.28.1 idna==3.10 jinja2==3.1.5 -llama-models==0.1.1 -llama-stack-client==0.1.1 +llama-models==0.1.2 +llama-stack-client==0.1.2 lxml==5.3.0 markdown-it-py==3.0.0 markupsafe==3.0.2 @@ -34,6 +35,7 @@ pycryptodomex==3.21.0 pydantic==2.10.6 pydantic-core==2.27.2 pygments==2.19.1 +pypdf==5.2.0 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 pytz==2025.1 diff --git a/tests/client-sdk/README.md b/tests/client-sdk/README.md index 13142d46fd..d4d439d962 100644 --- a/tests/client-sdk/README.md +++ b/tests/client-sdk/README.md @@ -4,18 +4,18 @@ You can run llama stack integration tests on either a Llama Stack Library or a L To test on a Llama Stack library with certain configuration, run ```bash LLAMA_STACK_CONFIG=./llama_stack/templates/cerebras/run.yaml -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` or just the template name ```bash LLAMA_STACK_CONFIG=together -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference/ ``` To test on a Llama Stack endpoint, run ```bash LLAMA_STACK_BASE_URL=http//localhost:8089 -pytest -s -v tests/client-sdk/inference/test_inference.py +pytest -s -v tests/client-sdk/inference ``` ## Report Generation diff --git a/tests/client-sdk/agents/test_agents.py b/tests/client-sdk/agents/test_agents.py index 2b1db7df03..85b7af831c 100644 --- a/tests/client-sdk/agents/test_agents.py +++ b/tests/client-sdk/agents/test_agents.py @@ -263,12 +263,14 @@ def test_custom_tool(llama_stack_client, agent_config): assert "CustomTool" in logs_str -def test_override_system_message_behavior(llama_stack_client, agent_config): +# TODO: fix this flaky test +def xtest_override_system_message_behavior(llama_stack_client, agent_config): client_tool = TestClientTool() agent_config = { **agent_config, "instructions": "You are a pirate", "client_tools": [client_tool.get_tool_definition()], + "model": "meta-llama/Llama-3.2-3B-Instruct", } agent = Agent(llama_stack_client, agent_config, client_tools=(client_tool,)) diff --git a/tests/client-sdk/inference/test_inference.py b/tests/client-sdk/inference/test_text_inference.py similarity index 72% rename from tests/client-sdk/inference/test_inference.py rename to tests/client-sdk/inference/test_text_inference.py index 9bbd1061a1..81b4762188 100644 --- a/tests/client-sdk/inference/test_inference.py +++ b/tests/client-sdk/inference/test_text_inference.py @@ -4,9 +4,6 @@ # This source code is licensed under the terms described in the LICENSE file in # the root directory of this source tree. 
-import base64 -import pathlib - import pytest from pydantic import BaseModel @@ -14,6 +11,7 @@ "remote::ollama": "json", "remote::together": "json", "remote::fireworks": "json", + "remote::vllm": "json", } PROVIDER_LOGPROBS_TOP_K = set( @@ -56,23 +54,6 @@ def get_weather_tool_definition(): } -@pytest.fixture -def image_path(): - return pathlib.Path(__file__).parent / "dog.png" - - -@pytest.fixture -def base64_image_data(image_path): - # Convert the image to base64 - return base64.b64encode(image_path.read_bytes()).decode("utf-8") - - -@pytest.fixture -def base64_image_url(base64_image_data, image_path): - # suffix includes the ., so we remove it - return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" - - def test_text_completion_non_streaming(llama_stack_client, text_model_id): response = llama_stack_client.inference.completion( content="Complete the sentence using one word: Roses are red, violets are ", @@ -176,8 +157,8 @@ class AnswerFormat(BaseModel): @pytest.mark.parametrize( "question,expected", [ - ("What are the names of planets in our solar system?", "Earth"), - ("What are the names of the planets that have rings around them?", "Saturn"), + ("Which planet do humans live on?", "Earth"), + ("Which planet has rings around it with a name starting with letter S?", "Saturn"), ], ) def test_text_chat_completion_non_streaming(llama_stack_client, text_model_id, question, expected): @@ -299,101 +280,3 @@ class AnswerFormat(BaseModel): assert answer.last_name == "Jordan" assert answer.year_of_birth == 1963 assert answer.num_seasons_in_nba == 15 - - -def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 - assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) - - -def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): - message = { - "role": "user", - "content": [ - { - "type": "image", - "image": { - "url": { - # TODO: Replace with Github based URI to resources/sample1.jpg - "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" - }, - }, - }, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=True, - ) - streamed_content = "" - for chunk in response: - streamed_content += chunk.event.delta.text.lower() - assert len(streamed_content) > 0 - assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) - - -@pytest.mark.parametrize("type_", ["url", "data"]) -def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): - image_spec = { - "url": { - "type": "image", - "image": { - "url": { - "uri": base64_image_url, - }, - }, - }, - "data": { - "type": "image", - "image": { - "data": base64_image_data, - }, - }, - }[type_] - - message 
= { - "role": "user", - "content": [ - image_spec, - { - "type": "text", - "text": "Describe what is in this image.", - }, - ], - } - response = llama_stack_client.inference.chat_completion( - model_id=vision_model_id, - messages=[message], - stream=False, - ) - message_content = response.completion_message.content.lower().strip() - assert len(message_content) > 0 diff --git a/tests/client-sdk/inference/test_vision_inference.py b/tests/client-sdk/inference/test_vision_inference.py new file mode 100644 index 0000000000..df4b9d9339 --- /dev/null +++ b/tests/client-sdk/inference/test_vision_inference.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + +import base64 +import pathlib + +import pytest + + +@pytest.fixture(scope="session") +def inference_provider_type(llama_stack_client): + providers = llama_stack_client.providers.list() + inference_providers = [p for p in providers if p.api == "inference"] + assert len(inference_providers) > 0, "No inference providers found" + return inference_providers[0].provider_type + + +@pytest.fixture +def image_path(): + return pathlib.Path(__file__).parent / "dog.png" + + +@pytest.fixture +def base64_image_data(image_path): + # Convert the image to base64 + return base64.b64encode(image_path.read_bytes()).decode("utf-8") + + +@pytest.fixture +def base64_image_url(base64_image_data, image_path): + # suffix includes the ., so we remove it + return f"data:image/{image_path.suffix[1:]};base64,{base64_image_data}" + + +def test_image_chat_completion_non_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 + assert any(expected in message_content for expected in {"dog", "puppy", "pup"}) + + +def test_image_chat_completion_streaming(llama_stack_client, vision_model_id): + message = { + "role": "user", + "content": [ + { + "type": "image", + "image": { + "url": { + # TODO: Replace with Github based URI to resources/sample1.jpg + "uri": "https://www.healthypawspetinsurance.com/Images/V3/DogAndPuppyInsurance/Dog_CTA_Desktop_HeroImage.jpg" + }, + }, + }, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=True, + ) + streamed_content = "" + for chunk in response: + streamed_content += chunk.event.delta.text.lower() + assert len(streamed_content) > 0 + assert any(expected in streamed_content for expected in {"dog", "puppy", "pup"}) + + +@pytest.mark.parametrize("type_", ["url", "data"]) +def test_image_chat_completion_base64(llama_stack_client, vision_model_id, base64_image_data, base64_image_url, type_): + image_spec = { + "url": { + "type": "image", + "image": { + "url": { + "uri": base64_image_url, + }, + }, + }, + "data": { + "type": "image", + "image": { + 
"data": base64_image_data, + }, + }, + }[type_] + + message = { + "role": "user", + "content": [ + image_spec, + { + "type": "text", + "text": "Describe what is in this image.", + }, + ], + } + response = llama_stack_client.inference.chat_completion( + model_id=vision_model_id, + messages=[message], + stream=False, + ) + message_content = response.completion_message.content.lower().strip() + assert len(message_content) > 0 diff --git a/uv.lock b/uv.lock index f492872bcc..087396eeac 100644 --- a/uv.lock +++ b/uv.lock @@ -687,7 +687,7 @@ wheels = [ [[package]] name = "llama-models" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -696,14 +696,14 @@ dependencies = [ { name = "pyyaml" }, { name = "tiktoken" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/df/80/4a4595cf5e55f71c0e15b85ff2f4c04b0742bf664ede062a09c9d383bf7b/llama_models-0.1.1.tar.gz", hash = "sha256:7cb5a9fe38485b47aff4c93e183d6d390a676a7619f3355502576b652f17733a", size = 1608412 } +sdist = { url = "https://files.pythonhosted.org/packages/b5/f2/ed8310d4677cd38ab45ffba45aea2a4e9882b640045ad9c3198ac69e5a85/llama_models-0.1.2.tar.gz", hash = "sha256:1266eaec7a8db336e4ed034d2b494189ccb7fd6d6b7aefe874eee749a4340b9b", size = 1608069 } wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/93/d49dd0f0cd37df1a7a7fb25444d010f626cdf42b21eea11d839b0f6a808a/llama_models-0.1.1-py3-none-any.whl", hash = "sha256:7e4f15dc4f6f011852ea2c42f9770b75140f5eca670b32cc67fc0a4605c55f89", size = 1650981 }, + { url = "https://files.pythonhosted.org/packages/55/a7/34b9e88ef4109759c8881f43b8006139e3d13d54c440b8c571b253655f54/llama_models-0.1.2-py3-none-any.whl", hash = "sha256:8aa5287d1c6325698991ff677e71148cac347e07493bb5b3ab891e614b89e1f8", size = 1651273 }, ] [[package]] name = "llama-stack" -version = "0.1.1" +version = "0.1.2" source = { editable = "." 
} dependencies = [ { name = "blobfile" }, @@ -751,8 +751,8 @@ requires-dist = [ { name = "fire" }, { name = "httpx" }, { name = "huggingface-hub" }, - { name = "llama-models", specifier = ">=0.1.1" }, - { name = "llama-stack-client", specifier = ">=0.1.1" }, + { name = "llama-models", specifier = ">=0.1.2" }, + { name = "llama-stack-client", specifier = ">=0.1.2" }, { name = "myst-parser", marker = "extra == 'docs'" }, { name = "nbval", marker = "extra == 'dev'" }, { name = "pre-commit", marker = "extra == 'dev'" }, @@ -780,7 +780,7 @@ requires-dist = [ [[package]] name = "llama-stack-client" -version = "0.1.1" +version = "0.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -797,9 +797,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/07/42/7004958ac1a6da9a8060decf0d9120fdeb3b2775de090a0a473f2ee4a27d/llama_stack_client-0.1.1.tar.gz", hash = "sha256:3e549a848ade959d342fa52ec49b1913b7bb615a77b5b8dcaefe6ff94409049e", size = 179729 } +sdist = { url = "https://files.pythonhosted.org/packages/9e/75/8b41a3026c871a8650cd8d2cfda9f891a9163458813574f36518bb40afe4/llama_stack_client-0.1.2.tar.gz", hash = "sha256:94277ddae52be557d771dcdc15d85af9012b5aa87439dd69ec1dc0ff486b0c8e", size = 188023 } wheels = [ - { url = "https://files.pythonhosted.org/packages/80/66/5255c09dc001ff437fd6fe6fad27142035b60073df243f7df0494095f605/llama_stack_client-0.1.1-py3-none-any.whl", hash = "sha256:e07d58fdcc1eaa370dd00b94c2dd1a8169c0ac60c37f6f2772cbc2c5b63f2e62", size = 348665 }, + { url = "https://files.pythonhosted.org/packages/c4/32/3a3a97eecff1f1e3a1dc90e9b00681abea11ec4f43a7ca549981261e18b6/llama_stack_client-0.1.2-py3-none-any.whl", hash = "sha256:85ff0fb57a62d7d0470cfaa2b07a595c9fb3483297944d5e5a066db850d38ccd", size = 359415 }, ] [[package]]
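The inference test split above moves the image fixtures and vision tests, unchanged, into `tests/client-sdk/inference/test_vision_inference.py`. For reference, the same base64 data-URL pattern those fixtures build can be used directly against a running stack — a sketch, with the model id as a placeholder for whatever vision model the distribution actually registers:

```python
# Sketch: send a local image as a base64 data URL, mirroring the fixtures in
# tests/client-sdk/inference/test_vision_inference.py. Assumes a running stack.
import base64
import pathlib

from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

image_path = pathlib.Path("dog.png")  # any local image
b64 = base64.b64encode(image_path.read_bytes()).decode("utf-8")
data_url = f"data:image/{image_path.suffix[1:]};base64,{b64}"  # suffix includes the dot

response = client.inference.chat_completion(
    model_id="meta-llama/Llama-3.2-11B-Vision-Instruct",  # placeholder vision model id
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image", "image": {"url": {"uri": data_url}}},
                {"type": "text", "text": "Describe what is in this image."},
            ],
        }
    ],
    stream=False,
)
print(response.completion_message.content)
```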
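In the agents changes above, `test_override_system_message_behavior` is parked by renaming it with an `x` prefix so pytest no longer collects it. An equivalent way to shelve a flaky test while keeping it visible in reports is pytest's skip marker — a sketch of the same test left in place under its original name:

```python
import pytest

# Equivalent to the xtest_ rename: pytest still collects the test but reports it
# as skipped, so the flaky-test TODO stays visible in CI output.
@pytest.mark.skip(reason="flaky; see TODO in tests/client-sdk/agents/test_agents.py")
def test_override_system_message_behavior(llama_stack_client, agent_config):
    ...  # body unchanged from the diff above
```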