diff --git a/.github/actions/setup-vllm-gpu/action.yml b/.github/actions/setup-vllm-gpu/action.yml new file mode 100644 index 0000000000..efb8ea706f --- /dev/null +++ b/.github/actions/setup-vllm-gpu/action.yml @@ -0,0 +1,63 @@ +name: Setup local vLLM +description: Start vLLM Inference Server on localhost +inputs: + + vllm-image-version: + description: vLLM Docker Image version + required: true + default: latest + + vllm-container: + description: vLLM Docker Container name + required: true + default: vllm_server + + vllm-server-port: + description: vLLM Inference Server port + required: true + default: "8000" + + model: + description: Model to serve by vLLM Inference Server + required: true + default: meta-llama/Llama-3.2-3B-Instruct + + tool-calling-parser: + description: Model Tool Calling parser + required: true + default: llama3_json + + huggingface-pat: + description: HuggingFace Personal Access Token + required: true + default: N/A + +runs: + using: "composite" + steps: + + - name: Start vLLM Inference Server + shell: bash + run: | + + if sudo docker ps -a | grep -q "${{ inputs.vllm-container }}"; then + echo "Container '${{ inputs.vllm-container }}' exists." + else + + echo "Starting vLLM Inference Server serving ${{ inputs.model }} ..." + + sudo docker run \ + --name ${{ inputs.vllm-container }} \ + --runtime nvidia \ + --gpus all \ + --detach \ + --volume ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=${{ inputs.huggingface-pat }}" \ + --publish ${{ inputs.vllm-server-port }}:8000 \ + --ipc=host \ + vllm/vllm-openai:${{ inputs.vllm-image-version }} \ + --model ${{ inputs.model }} \ + --max_model_len 124832 \ + --enable-auto-tool-choice \ + --tool-call-parser ${{ inputs.tool-calling-parser }} + fi diff --git a/.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml b/.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml new file mode 100644 index 0000000000..563feef9c9 --- /dev/null +++ b/.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml @@ -0,0 +1,268 @@ +name: "[Self-Hosted runner] Integration Tests vLLM Inference Server" + +on: + workflow_dispatch: # Only triggering workflow manually. 
+ inputs: + + model: + description: Model to serve by vLLM Inference Server + required: true + default: meta-llama/Llama-3.2-3B-Instruct + + vllm-server-ip: + description: IP address of the vLLM Inference Server + required: true + default: "0.0.0.0" + + server-timeout: + description: Time to wait for server to come online + required: true + default: "300" + + test-all-client-versions: + description: 'Test against both the latest and published versions' + type: boolean + default: false + +# push: +# branches: [ main ] +# pull_request: +# branches: [ main ] +# paths: +# - 'llama_stack/**' +# - 'tests/integration/**' +# - 'uv.lock' +# - 'pyproject.toml' +# - 'requirements.txt' +# - '.github/workflows/integration-tests.yml' # This workflow + +env: + CONTAINER_NAME: vllm_server + LLAMA_STACK_TEMPLATE: remote-vllm-gpu + EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + discover-tests: + runs-on: ubuntu-latest + outputs: + test-type: ${{ steps.generate-matrix.outputs.test-type }} + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Generate test matrix + id: generate-matrix + run: | + # Get test directories dynamically, excluding non-test directories + TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | + grep -Ev "^(__pycache__|fixtures|test_cases)$" | + sort | jq -R -s -c 'split("\n")[:-1]') + echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT + + echo "Test Matrix: " + cat $GITHUB_OUTPUT + + prepare_self_hosted_runner: + runs-on: [self-hosted, linux, X64] + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + + - name: Setup vllm GPU + uses: ./.github/actions/setup-vllm-gpu + with: + model: ${{ inputs.model }} + huggingface-pat: ${{ secrets.HF_PAT }} + vllm-container: ${{ env.CONTAINER_NAME }} + + + - name: Verify VLLM remote server is healthy + env: + RETRY_INTERVAL: 1 + run: | + + if sudo docker ps -a | grep -q "${{ env.CONTAINER_NAME }}"; then + echo "Container '${{ env.CONTAINER_NAME }}' exists." + echo "Checking if vLLM Inference Server is online..." + else + echo "Container '${{ env.CONTAINER_NAME }}' does not exist." + exit 1 + fi + + echo "Waiting for vLLM Inference Server to come online..." + + for i in $(seq 1 ${{ inputs.server-timeout }}); do + if curl --silent --fail "http://${{ inputs.vllm-server-ip }}:8000/health" > /dev/null; then + echo "vLLM server is up and running!" + exit 0 + else + echo "[Attempt $i/${{ inputs.server-timeout }}] vLLM server not yet available. Retrying in ${RETRY_INTERVAL} seconds..." 
+ sleep $RETRY_INTERVAL + fi + done + + echo "vLLM server failed to start after ${{ inputs.server-timeout }} seconds of waiting" + + sudo docker logs ${{ env.CONTAINER_NAME }} > ${{ env.CONTAINER_NAME }}.log + cat ${{ env.CONTAINER_NAME }}.log + + exit 1 + + + # - name: Build Llama Stack + # run: | + # uv run llama stack build --template ${{ env.LLAMA_STACK_TEMPLATE }} --image-type venv + + + # - name: Start Llama Stack server in background + # # if: matrix.client-type == 'http' + # env: + # INFERENCE_MODEL: ${{ inputs.model }} + # ENABLE_VLLM: "vllm" + # VLLM_URL: "http://${{ inputs.vllm-server-ip }}:8000/v1/" + # VLLM_INFERENCE_MODEL: ${{ inputs.model }} + # run: | + + # LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run \ + # $HOME/.llama/distributions/${{ env.LLAMA_STACK_TEMPLATE }}/${{ env.LLAMA_STACK_TEMPLATE }}-run.yaml \ + # --image-type venv \ + # --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + # --env VLLM_URL=${{ env.VLLM_URL }} \ + # --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} & + + + # - name: Wait for Llama Stack server to be ready + # # if: matrix.client-type == 'http' + # run: | + # echo "Waiting for Llama Stack server..." + # for i in {1..30}; do + # if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + # echo "Llama Stack server is up!" + # exit 0 + # fi + # sleep 1 + # done + # echo "Llama Stack server failed to start" + # cat server.log + # exit 1 + + test-matrix: + runs-on: [self-hosted, linux, X64] + needs: [discover-tests, prepare_self_hosted_runner] + + strategy: + fail-fast: false + matrix: + test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }} + # client-type: [library, server] + # python-version: ["3.12", "3.13"] + # test-type: [inference] + client-type: [server] + python-version: ["3.13"] + client-version: ${{ (github.event_name == 'schedule' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} + + + # strategy: + # fail-fast: false # we want to run all tests regardless of failure + # matrix: + # test-type: [inference] + # client-type: [server] + # python-version: ["3.12"] + # client-version: [latest] + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Install dependencies + uses: ./.github/actions/setup-runner + with: + python-version: ${{ matrix.python-version }} + client-version: ${{ matrix.client-version }} + + + - name: Check Storage and Memory Available Before Tests + if: ${{ always() }} + run: | + free -h + df -h + + - name: Build Llama Stack + run: | + uv run llama stack build --template ${{ env.LLAMA_STACK_TEMPLATE }} --image-type venv + + + - name: Run Integration Tests + env: + INFERENCE_MODEL: ${{ inputs.model }} + ENABLE_VLLM: "vllm" + VLLM_URL: "http://${{ inputs.vllm-server-ip }}:8000/v1/" + VLLM_INFERENCE_MODEL: ${{ inputs.model }} + run: | + + set -e + + if [ "${{ matrix.client-type }}" == "library" ]; then + stack_config="${{ env.LLAMA_STACK_TEMPLATE }}" + else + stack_config="server:${{ env.LLAMA_STACK_TEMPLATE }}" + fi + + start_time=$(date +%s) + echo "[DEBUG] Start time of the test: ${start_time}" + + uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + --text-model="vllm/${{ env.INFERENCE_MODEL }}" \ + --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + --env VLLM_URL=${{ env.VLLM_URL }} \ + --env VLLM_INFERENCE_MODEL=${{ 
env.VLLM_INFERENCE_MODEL }} \
+            --embedding-model=${{ env.EMBEDDING_MODEL }} \
+            --color=yes \
+            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
+
+          end_time=$(date +%s)
+          echo "[DEBUG] End time of the test: ${end_time}"
+
+          test_run_duration=$((end_time - start_time))
+          echo "[DEBUG] Duration of the test run: ${test_run_duration} seconds"
+
+          ## Grabbing vLLM inference server logs for the duration of a given test run:
+          sudo docker logs ${{ env.CONTAINER_NAME }} --since "${test_run_duration}s" > ${{ env.CONTAINER_NAME }}-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}.log
+
+          cat ${{ env.CONTAINER_NAME }}-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}.log
+
+
+      - name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
+          path: |
+            *.log
+          retention-days: 1
+
+
+  stop_inference_server:
+    runs-on: [self-hosted, linux, X64]
+    needs: test-matrix
+
+    steps:
+      - name: Stop vLLM server and write logs to file
+        if: ${{ always() }}
+        run: |
+          sudo docker stop ${{ env.CONTAINER_NAME }}
+          sudo docker rm ${{ env.CONTAINER_NAME }}
\ No newline at end of file
diff --git a/.github/workflows/integration-tests-remote-vllm.yaml b/.github/workflows/integration-tests-remote-vllm.yaml
new file mode 100644
index 0000000000..66d038f0c7
--- /dev/null
+++ b/.github/workflows/integration-tests-remote-vllm.yaml
@@ -0,0 +1,177 @@
+name: Integration Tests with remote vLLM Inference Server
+
+on:
+  workflow_dispatch: # Only triggering workflow manually.
+    inputs:
+      vllm_server_ip:
+        description: IP address of the vLLM Inference Server
+        required: true
+        default: "0.0.0.0"
+
+# push:
+#   branches: [ main ]
+# pull_request:
+#   branches: [ main ]
+#   paths:
+#     - 'llama_stack/**'
+#     - 'tests/integration/**'
+#     - 'uv.lock'
+#     - 'pyproject.toml'
+#     - 'requirements.txt'
+#     - '.github/workflows/integration-tests.yml' # This workflow
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-matrix:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Listing tests manually since some of them currently fail
+        # TODO: generate matrix list from tests/integration when fixed
+        # test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
+        test-type: [inference]
+        client-type: [library, http]
+        # python-version: ["3.12", "3.13"]
+        python-version: ["3.12"]
+      fail-fast: false # we want to run all tests regardless of failure
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # - name: Setup ollama
+      #   uses: ./.github/actions/setup-ollama
+
+      - name: Verify VLLM remote server is healthy
+        # if: matrix.client-type == 'http'
+        run: |
+          echo "Verifying vLLM status..."
+ vllm_status=$(curl -s -o /dev/null -w "%{http_code}" http://${{ github.event.inputs.vllm_server_ip }}:8000/health) + echo "vLLM remote server /health status: $vllm_status" + if [ "$vllm_status" != "200" ]; then + echo "vLLM health check failed" + exit 1 + fi + + - name: Build Llama Stack + run: | + uv run llama stack build --template starter --image-type venv + + - name: Start Llama Stack server in background + if: matrix.client-type == 'http' + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + ENABLE_VLLM: "vllm" + VLLM_URL: "http://${{ github.event.inputs.vllm_server_ip }}:8000/v1/" + VLLM_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + VLLM_API_TOKEN: "remote_vllm" + run: | + ls -la ./llama_stack/templates + echo "GITHUB_WORKSPACE ${GITHUB_WORKSPACE}" + echo "github.workspace ${{ github.workspace }}" + + LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run $GITHUB_WORKSPACE/llama_stack/templates/starter/run.yaml \ + --image-type venv \ + --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + --env VLLM_URL=${{ env.VLLM_URL }} \ + --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} \ + --env VLLM_API_TOKEN=${{ env.VLLM_API_TOKEN }} & + + - name: Wait for Llama Stack server to be ready + if: matrix.client-type == 'http' + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo "Llama Stack server is up!" + exit 0 + fi + sleep 1 + done + echo "Llama Stack server failed to start" + cat server.log + exit 1 + + # - name: Verify Ollama status is OK + # if: matrix.client-type == 'http' + # run: | + # echo "Verifying Ollama status..." + # ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status) + # echo "Ollama status: $ollama_status" + # if [ "$ollama_status" != "OK" ]; then + # echo "Ollama health check failed" + # exit 1 + # fi + + - name: Check Storage and Memory Available Before Tests + if: ${{ always() }} + run: | + free -h + df -h + + - name: Run Integration Tests + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + ENABLE_VLLM: "vllm" + VLLM_URL: "http://${{ github.event.inputs.vllm_server_ip }}:8000/v1/" + VLLM_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + VLLM_API_TOKEN: "remote_vllm" + run: | + if [ "${{ matrix.client-type }}" == "library" ]; then + stack_config="starter" + else + stack_config="server:starter" + fi + uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + --text-model="vllm/${{ env.INFERENCE_MODEL }}" \ + --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + --env VLLM_URL=${{ env.VLLM_URL }} \ + --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} \ + --env VLLM_API_TOKEN=${{ env.VLLM_API_TOKEN }} \ + --embedding-model=all-MiniLM-L6-v2 \ + --color=yes \ + --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log + + # - name: Run Integration Tests + # env: + # INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + # OLLAMA_URL: "http://0.0.0.0:11434" + # run: | + # if [ "${{ matrix.client-type }}" == "library" ]; then + # stack_config="ollama" + # else + # stack_config="http://localhost:8321" + # fi + # uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + # -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + # --text-model="meta-llama/Llama-3.2-3B-Instruct" \ + # --embedding-model=all-MiniLM-L6-v2 + + - 
name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      # - name: Write ollama logs to file
+      #   if: ${{ always() }}
+      #   run: |
+      #     sudo docker logs ollama > ollama.log
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
+          path: |
+            *.log
+          retention-days: 1
diff --git a/.github/workflows/tests_on_self_hosted_ec2_runner.yaml b/.github/workflows/tests_on_self_hosted_ec2_runner.yaml
new file mode 100644
index 0000000000..fd544d1b75
--- /dev/null
+++ b/.github/workflows/tests_on_self_hosted_ec2_runner.yaml
@@ -0,0 +1,142 @@
+name: Llama Stack tests on Self-Hosted EC2 Runner.
+run-name: ${{ github.actor }} Triggered Action.
+on:
+# pull_request:
+#   branches:
+#     - 'main'
+#   paths:
+#     - '*.py'
+
+  workflow_dispatch: # Only triggering workflow manually.
+    inputs:
+      aws_ec2_type:
+        description: 'EC2 Instance Type to run tests on.'
+        required: true
+        default: "t2.large"
+      aws_ec2_ami:
+        description: 'AMI Image (use default if unsure)'
+        required: true
+        default: "ami-05f991c49d264708f" # Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
+      aws_sg_id:
+        description: 'Security Group ID to be attached to the self-hosted runner (use default if unsure)'
+        required: true
+        default: "sg-0f1e12826ee9b097b"
+
+env:
+  AWS_REGION: "us-west-2"
+  AWS_ACCOUNT: "495599741739"
+  AWS_SUBNET_ID: "subnet-01ec950bba37d393a"
+  AWS_EC2_TYPE: "t2.large"
+  AWS_EC2_AMI: "ami-05f991c49d264708f" # Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
+  AWS_SG_ID: "sg-0162d28d02b2e3cc5"
+
+permissions:
+  id-token: write # This is required for requesting the JWT
+  contents: read  # This is required for actions/checkout
+  checks: write
+  pull-requests: write
+
+jobs:
+  start-self-hosted-ec2-runner:
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      # If workflow is triggered manually via workflow_dispatch event then both AWS_EC2_TYPE and AWS_EC2_AMI
+      # env variables need to be overwritten with input values.
+      - name: "DEBUG: Overriding AWS_EC2_TYPE and AWS_EC2_AMI env variables with 'workflow_dispatch' inputs"
+        if: ${{ github.event.inputs.aws_ec2_type != '' }}
+        run: |
+          echo "AWS_EC2_TYPE=${{ inputs.aws_ec2_type }}" >> $GITHUB_ENV
+          echo "AWS_EC2_AMI=${{ inputs.aws_ec2_ami }}" >> $GITHUB_ENV
+          echo "AWS_SG_ID=${{ inputs.aws_sg_id }}" >> $GITHUB_ENV
+
+      - name: "DEBUG: AWS_EC2_TYPE and AWS_EC2_AMI env variables values"
+        run: |
+          echo ${{ env.AWS_EC2_TYPE }}
+          echo ${{ env.AWS_EC2_AMI }}
+          echo ${{ env.AWS_SG_ID }}
+
+      - name: Configure AWS credentials
+        id: configure_aws_security_credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT }}:role/GitHub_SelfHostedRunner
+          role-session-name: GitHubOIDCSession
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} # added to repo secrets.
+          ec2-image-id: ${{ env.AWS_EC2_AMI }}
+          ec2-instance-type: ${{ env.AWS_EC2_TYPE }}
+          subnet-id: ${{ env.AWS_SUBNET_ID }}
+          security-group-id: ${{ env.AWS_SG_ID }}
+          # GitHub_EC2AccessToAWSServicesRole is associated with the Instance Profile assumed by newly configured EC2 instance(s)
+          # to perform permitted Actions on AWS resources.
+          iam-role-name: GitHub_EC2AccessToAWSServicesInstanceProfile
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "ec2-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
+  run_workload_on_the_runner:
+    name: Execute the job on the runner
+    needs: start-self-hosted-ec2-runner # required to start the main job when the runner is ready
+    runs-on: ${{ needs.start-self-hosted-ec2-runner.outputs.label }} # run the job on the newly created runner
+    steps:
+
+      - name: "ls -la"
+        run: |
+          ls -la
+          whoami
+
+      # - name: "Checkout 'llama-recipes' repository"
+      #   uses: actions/checkout@v4
+      #   with:
+      #     ref: ${{ inputs.branch_name }}
+
+      # - name: "Installing dependencies from requirements.txt file."
+      #   run: |
+      #     pip install -U pip setuptools
+      #     pip install -r requirements.txt
+      #     pip install --extra-index-url https://download.pytorch.org/whl/test/cu118 -e '.[tests,auditnlg,vllm]'
+
+      # - name: "Running llama-recipes pytest tests on Self Hosted EC2 Runner"
+      #   run: |
+      #     echo "Running llama-recipes pytest tests on Self Hosted ${{ env.AWS_EC2_TYPE }} EC2 Runner"
+      #     cd $GITHUB_WORKSPACE && python3 -m pytest --junitxml="$GITHUB_WORKSPACE/result.xml"
+
+      # - name: Test Summary
+      #   uses: test-summary/action@v2
+      #   with:
+      #     paths: "**/*.xml"
+      #   if: always()
+
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-self-hosted-ec2-runner # required to get output from the start-runner job
+      - run_workload_on_the_runner # required to wait when the main job is done
+    runs-on: ubuntu-latest
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        id: configure_aws_security_credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT }}:role/GitHub_SelfHostedRunner
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-self-hosted-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-self-hosted-ec2-runner.outputs.ec2-instance-id }}
\ No newline at end of file
diff --git a/coverage.svg b/coverage.svg
index 636889bb0d..a5804ea07e 100644
--- a/coverage.svg
+++ b/coverage.svg
@@ -15,7 +15,7 @@
 coverage
 coverage
-    44%
-    44%
+    46%
+    46%
diff --git a/llama_stack/templates/remote-vllm-gpu/__init__.py b/llama_stack/templates/remote-vllm-gpu/__init__.py
new file mode 100644
index 0000000000..81d8f8da18
--- /dev/null
+++ b/llama_stack/templates/remote-vllm-gpu/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+ +from .remote_vllm_gpu import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/remote-vllm-gpu/build.yaml b/llama_stack/templates/remote-vllm-gpu/build.yaml new file mode 100644 index 0000000000..e12b0de084 --- /dev/null +++ b/llama_stack/templates/remote-vllm-gpu/build.yaml @@ -0,0 +1,55 @@ +version: 2 +distribution_spec: + description: CI tests for Llama Stack on Remote VLLM (GPU) Inference Server + providers: + inference: + - provider_type: remote::cerebras + - provider_type: remote::ollama + - provider_type: remote::vllm + - provider_type: remote::tgi + - provider_type: remote::fireworks + - provider_type: remote::together + - provider_type: remote::bedrock + - provider_type: remote::nvidia + - provider_type: remote::openai + - provider_type: remote::anthropic + - provider_type: remote::gemini + - provider_type: remote::groq + - provider_type: remote::sambanova + - provider_type: inline::sentence-transformers + vector_io: + - provider_type: inline::faiss + - provider_type: inline::sqlite-vec + - provider_type: inline::milvus + - provider_type: remote::chromadb + - provider_type: remote::pgvector + files: + - provider_type: inline::localfs + safety: + - provider_type: inline::llama-guard + agents: + - provider_type: inline::meta-reference + telemetry: + - provider_type: inline::meta-reference + post_training: + - provider_type: inline::huggingface + eval: + - provider_type: inline::meta-reference + datasetio: + - provider_type: remote::huggingface + - provider_type: inline::localfs + scoring: + - provider_type: inline::basic + - provider_type: inline::llm-as-judge + - provider_type: inline::braintrust + tool_runtime: + - provider_type: remote::brave-search + - provider_type: remote::tavily-search + - provider_type: inline::rag-runtime + - provider_type: remote::model-context-protocol +image_type: conda +image_name: ci-tests +additional_pip_packages: +- aiosqlite +- asyncpg +- sqlalchemy[asyncio] diff --git a/llama_stack/templates/remote-vllm-gpu/remote_vllm_gpu.py b/llama_stack/templates/remote-vllm-gpu/remote_vllm_gpu.py new file mode 100644 index 0000000000..62334cc0fe --- /dev/null +++ b/llama_stack/templates/remote-vllm-gpu/remote_vllm_gpu.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ + +from llama_stack.templates.template import DistributionTemplate + +from ..starter.starter import get_distribution_template as get_starter_distribution_template + + +def get_distribution_template() -> DistributionTemplate: + template = get_starter_distribution_template() + name = "remote-vllm-gpu" + template.name = name + template.description = "CI tests for Llama Stack on Remote VLLM (GPU) Inference Server" + + return template \ No newline at end of file diff --git a/llama_stack/templates/remote-vllm-gpu/run.yaml b/llama_stack/templates/remote-vllm-gpu/run.yaml new file mode 100644 index 0000000000..e609b932fc --- /dev/null +++ b/llama_stack/templates/remote-vllm-gpu/run.yaml @@ -0,0 +1,225 @@ +version: 2 +image_name: remote-vllm-gpu +apis: +- agents +- datasetio +- eval +- files +- inference +- post_training +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + + - provider_id: ${env.VLLM_URL:+vllm} + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:=} + max_tokens: ${env.VLLM_MAX_TOKENS:=4096} + api_token: ${env.VLLM_API_TOKEN:=fake} + tls_verify: ${env.VLLM_TLS_VERIFY:=true} + + # - provider_id: ollama + # provider_type: remote::ollama + # config: + # url: ${env.OLLAMA_URL:=http://localhost:11434} + + - provider_id: ${env.CEREBRAS_API_KEY:+cerebras} + provider_type: remote::cerebras + config: + base_url: https://api.cerebras.ai + api_key: ${env.CEREBRAS_API_KEY:=} + + - provider_id: ${env.TGI_URL:+tgi} + provider_type: remote::tgi + config: + url: ${env.TGI_URL:=} + - provider_id: fireworks + provider_type: remote::fireworks + config: + url: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY:=} + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY:=} + - provider_id: bedrock + provider_type: remote::bedrock + - provider_id: ${env.NVIDIA_API_KEY:+nvidia} + provider_type: remote::nvidia + config: + url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} + api_key: ${env.NVIDIA_API_KEY:=} + append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY:=} + - provider_id: anthropic + provider_type: remote::anthropic + config: + api_key: ${env.ANTHROPIC_API_KEY:=} + - provider_id: gemini + provider_type: remote::gemini + config: + api_key: ${env.GEMINI_API_KEY:=} + - provider_id: groq + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY:=} + - provider_id: sambanova + provider_type: remote::sambanova + config: + url: https://api.sambanova.ai/v1 + api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db + - provider_id: ${env.MILVUS_URL:+milvus} + provider_type: inline::milvus + config: + db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db + - 
provider_id: ${env.CHROMADB_URL:+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db + - provider_id: ${env.PGVECTOR_DB:+pgvector} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:=localhost} + port: ${env.PGVECTOR_PORT:=5432} + db: ${env.PGVECTOR_DB:=} + user: ${env.PGVECTOR_USER:=} + password: ${env.PGVECTOR_PASSWORD:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/agents_store.db + responses_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/responses_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" + sinks: ${env.TELEMETRY_SINKS:=console,sqlite} + sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/trace_store.db + otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} + post_training: + - provider_id: huggingface + provider_type: inline::huggingface + config: + checkpoint_format: huggingface + distributed_backend: null + device: cpu + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:=} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/registry.db +inference_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/inference_store.db +models: [] +shields: +- shield_id: llama-guard + provider_id: ${env.SAFETY_MODEL:+llama-guard} + 
provider_shield_id: ${env.SAFETY_MODEL:=} +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321
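
A minimal local sketch of what the setup-vllm-gpu composite action and the health-check step do, useful for smoke-testing before dispatching the workflow. Assumptions not present in the diff: Docker with the NVIDIA runtime is installed, port 8000 is free, and an HF_TOKEN shell variable holds a HuggingFace token (the action itself receives this as the huggingface-pat input); all flags below mirror the action's defaults.

#!/usr/bin/env bash
# Sketch only: mirrors .github/actions/setup-vllm-gpu/action.yml defaults.
set -euo pipefail

MODEL="meta-llama/Llama-3.2-3B-Instruct"   # action default
CONTAINER="vllm_server"                    # action default

# Start the vLLM OpenAI-compatible server the same way the composite action does.
sudo docker run \
  --name "${CONTAINER}" \
  --runtime nvidia \
  --gpus all \
  --detach \
  --volume ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}" \
  --publish 8000:8000 \
  --ipc=host \
  vllm/vllm-openai:latest \
  --model "${MODEL}" \
  --max_model_len 124832 \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json

# Poll /health the same way the workflow's readiness step does.
for i in $(seq 1 300); do
  if curl --silent --fail http://localhost:8000/health > /dev/null; then
    echo "vLLM server is up"
    break
  fi
  echo "[Attempt $i/300] vLLM server not yet available, retrying..."
  sleep 1
done

# One sample request against the OpenAI-compatible chat completions endpoint.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d "{\"model\": \"${MODEL}\", \"messages\": [{\"role\": \"user\", \"content\": \"Say hello\"}]}"

If the request succeeds, the same endpoint is what the workflows reach through VLLM_URL; container logs can be inspected with `sudo docker logs vllm_server`, as the test step does.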