Changes from all commits
48 commits
ad91298  Adding GitHub Action to provision Self-Hosted Runner EC2 instance (Jun 24, 2025)
def646a  Merge branch 'meta-llama:main' into main (maximgroshev, Jun 24, 2025)
251db97  Updating Subnet ID (Jun 24, 2025)
3fb8884  Updating Llama Stack tests on Self-Hosted EC2 Runner to use meta-llam… (Jun 26, 2025)
3c192cb  Updating default AMI image ID to Ubuntu 24.04 LTS (Jun 26, 2025)
c60c7f8  Adding default input for Security Group (Jun 28, 2025)
d5388fa  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 7, 2025)
fc58034  adding early version of integration-tests-remote-vllm workflow for te… (Jul 17, 2025)
65ae939  Adding integration test itself via pytest invokation. (Jul 18, 2025)
d48b141  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 22, 2025)
7d15860  adding boilerplate for integration test with vLLM workflow (Jul 22, 2025)
208fa19  [DEBUG] updating integration-tests-on-gpu-self-hosted-runner.yaml (Jul 22, 2025)
be29355  adding composite Action to start vLLM server (Jul 22, 2025)
f3dd2f0  [DEBUG] updating integration-tests-on-gpu-self-hosted-runner.yaml (Jul 22, 2025)
ca38a65  [DEBUG] updating integration-tests-on-gpu-self-hosted-runner.yaml (Jul 22, 2025)
4298d3b  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 23, 2025)
8b4306b  updating setup-vllm action to start the docker container and detach t… (Jul 23, 2025)
5e35b73  updating setup-vllm action to start the docker container and detach t… (Jul 23, 2025)
b7f406a  updating setup-vllm action to start the docker container and detach t… (Jul 23, 2025)
e163634  addding timeout for vLLM server coming online check (Jul 23, 2025)
5c6f839  addding timeout for vLLM server coming online check (Jul 23, 2025)
369c77b  addding timeout for vLLM server coming online check (Jul 23, 2025)
433cee5  addding docker container existance vLLM server coming online check (Jul 23, 2025)
ddb974b  addding docker container existance vLLM server coming online check (Jul 23, 2025)
b0be99a  addding docker container existance vLLM server coming online check (Jul 23, 2025)
fd5f73c  addding docker container existance vLLM server coming online check (Jul 23, 2025)
553ee1f  updating integration-tests-on-gpu-self-hosted-runner.yaml by adding o… (Jul 23, 2025)
8996181  updating setup-vllm action to check for presence of the docker contai… (Jul 23, 2025)
e73d38f  updating setup-vllm action to stop and remove the vLLM docker contain… (Jul 23, 2025)
007c31d  updating setup-vllm action to stop and remove the vLLM docker contain… (Jul 23, 2025)
e63fd60  updating integration test (Jul 23, 2025)
167c8aa  updating vLLM integration test (Jul 24, 2025)
bf27926  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 24, 2025)
6613b24  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 24, 2025)
06ed844  updating the tests doscovery (Jul 24, 2025)
c583a00  refactoring integration-tests-on-gpu-self-hosted-runner.yaml workflow… (Jul 24, 2025)
95089a9  refactoring integration-tests-on-gpu-self-hosted-runner.yaml workflow… (Jul 24, 2025)
010080f  refactoring integration-tests-on-gpu-self-hosted-runner.yaml workflow… (Jul 24, 2025)
c007e89  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 25, 2025)
710b993  added setup-vllm-gpu composite action (Jul 25, 2025)
d008a4e  adding repo checkout step to the prepare_self_hosted_runner job (Jul 25, 2025)
4ee1e01  adding repo checkout step to the prepare_self_hosted_runner job (Jul 25, 2025)
38fcf0f  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 28, 2025)
7920959  all-MiniLM-L6-v2 -> sentence-transformers/all-MiniLM-L6-v2 (Jul 28, 2025)
e684a3a  limiting tests to 'inference' only (Jul 28, 2025)
c13e432  Merge branch 'meta-llama:main' into main (maximgroshev, Jul 29, 2025)
a939c3b  adding 'remote-vllm-gpu' template (Jul 29, 2025)
0e8847c  Updated coverage.svg (github-actions[bot], Jul 29, 2025)
.github/actions/setup-vllm-gpu/action.yml (new file: 63 additions, 0 deletions)
@@ -0,0 +1,63 @@
name: Setup local vLLM
description: Start vLLM Inference Server on localhost
inputs:

vllm-image-version:
description: vLLM Docker Image version
required: true
default: latest

vllm-container:
description: vLLM Docker Container name
required: true
default: vllm_server

vllm-server-port:
description: vLLM Inference Server port
required: true
default: "8000"

model:
description: Model to serve by vLLM Inference Server
required: true
default: meta-llama/Llama-3.2-3B-Instruct

tool-calling-parser:
description: Model Tool Calling parser
required: true
default: llama3_json

huggingface-pat:
description: HuggingFace Personal Access Token
required: true
default: N/A

runs:
using: "composite"
steps:

- name: Start vLLM Inference Server
shell: bash
run: |

if sudo docker ps -a | grep -q "${{ inputs.vllm-container }}"; then
echo "Container '${{ inputs.vllm-container }}' exists."
else

echo "Starting vLLM Inference Server serving ${{ inputs.model }} ..."

sudo docker run \
--name ${{ inputs.vllm-container }} \
--runtime nvidia \
--gpus all \
--detach \
--volume ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=${{ inputs.huggingface-pat }}" \
--publish ${{ inputs.vllm-server-port }}:8000 \
--ipc=host \
vllm/vllm-openai:${{ inputs.vllm-image-version }} \
--model ${{ inputs.model }} \
--max_model_len 124832 \
--enable-auto-tool-choice \
--tool-call-parser ${{ inputs.tool-calling-parser }}
fi
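
For reference, the workflow file below calls this composite action roughly like the following minimal sketch; the model value shown is the action's default, and HF_PAT and vllm_server are the secret and container name used in that workflow. Inputs not listed (vllm-image-version, vllm-server-port, tool-calling-parser) fall back to their defaults.

- name: Setup vllm GPU
  uses: ./.github/actions/setup-vllm-gpu
  with:
    model: meta-llama/Llama-3.2-3B-Instruct
    huggingface-pat: ${{ secrets.HF_PAT }}
    vllm-container: vllm_server
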
.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml (new file: 268 additions, 0 deletions)
@@ -0,0 +1,268 @@
name: "[Self-Hosted runner] Integration Tests vLLM Inference Server"

on:
workflow_dispatch: # Only triggering workflow manually.
inputs:

model:
description: Model to serve by vLLM Inference Server
required: true
default: meta-llama/Llama-3.2-3B-Instruct

vllm-server-ip:
description: IP address of the vLLM Inference Server
required: true
default: "0.0.0.0"

server-timeout:
description: Time to wait for server to come online
required: true
default: "300"

test-all-client-versions:
description: 'Test against both the latest and published versions'
type: boolean
default: false

# push:
# branches: [ main ]
# pull_request:
# branches: [ main ]
# paths:
# - 'llama_stack/**'
# - 'tests/integration/**'
# - 'uv.lock'
# - 'pyproject.toml'
# - 'requirements.txt'
# - '.github/workflows/integration-tests.yml' # This workflow

env:
CONTAINER_NAME: vllm_server
LLAMA_STACK_TEMPLATE: remote-vllm-gpu
EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
discover-tests:
runs-on: ubuntu-latest
outputs:
test-type: ${{ steps.generate-matrix.outputs.test-type }}
steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Generate test matrix
id: generate-matrix
run: |
# Get test directories dynamically, excluding non-test directories
TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
grep -Ev "^(__pycache__|fixtures|test_cases)$" |
sort | jq -R -s -c 'split("\n")[:-1]')
echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT

echo "Test Matrix: "
cat $GITHUB_OUTPUT

prepare_self_hosted_runner:
runs-on: [self-hosted, linux, X64]

steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2


- name: Setup vllm GPU
uses: ./.github/actions/setup-vllm-gpu
with:
model: ${{ inputs.model }}
huggingface-pat: ${{ secrets.HF_PAT }}
vllm-container: ${{ env.CONTAINER_NAME }}


- name: Verify VLLM remote server is healthy
env:
RETRY_INTERVAL: 1
run: |

if sudo docker ps -a | grep -q "${{ env.CONTAINER_NAME }}"; then
echo "Container '${{ env.CONTAINER_NAME }}' exists."
echo "Checking if vLLM Inference Server is online..."
else
echo "Container '${{ env.CONTAINER_NAME }}' does not exist."
exit 1
fi

echo "Waiting for vLLM Inference Server to come online..."

for i in $(seq 1 ${{ inputs.server-timeout }}); do
if curl --silent --fail "http://${{ inputs.vllm-server-ip }}:8000/health" > /dev/null; then
echo "vLLM server is up and running!"
exit 0
else
echo "[Attempt $i/${{ inputs.server-timeout }}] vLLM server not yet available. Retrying in ${RETRY_INTERVAL} seconds..."
sleep $RETRY_INTERVAL
fi
done

echo "vLLM server failed to start after ${{ inputs.server-timeout }} seconds of waiting"

sudo docker logs ${{ env.CONTAINER_NAME }} > ${{ env.CONTAINER_NAME }}.log
cat ${{ env.CONTAINER_NAME }}.log

exit 1


# - name: Build Llama Stack
# run: |
# uv run llama stack build --template ${{ env.LLAMA_STACK_TEMPLATE }} --image-type venv


# - name: Start Llama Stack server in background
# # if: matrix.client-type == 'http'
# env:
# INFERENCE_MODEL: ${{ inputs.model }}
# ENABLE_VLLM: "vllm"
# VLLM_URL: "http://${{ inputs.vllm-server-ip }}:8000/v1/"
# VLLM_INFERENCE_MODEL: ${{ inputs.model }}
# run: |

# LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run \
# $HOME/.llama/distributions/${{ env.LLAMA_STACK_TEMPLATE }}/${{ env.LLAMA_STACK_TEMPLATE }}-run.yaml \
# --image-type venv \
# --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \
# --env VLLM_URL=${{ env.VLLM_URL }} \
# --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} &


# - name: Wait for Llama Stack server to be ready
# # if: matrix.client-type == 'http'
# run: |
# echo "Waiting for Llama Stack server..."
# for i in {1..30}; do
# if curl -s http://localhost:8321/v1/health | grep -q "OK"; then
# echo "Llama Stack server is up!"
# exit 0
# fi
# sleep 1
# done
# echo "Llama Stack server failed to start"
# cat server.log
# exit 1

test-matrix:
runs-on: [self-hosted, linux, X64]
needs: [discover-tests, prepare_self_hosted_runner]

strategy:
fail-fast: false
matrix:
test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }}
# client-type: [library, server]
# python-version: ["3.12", "3.13"]
# test-type: [inference]
client-type: [server]
python-version: ["3.13"]
client-version: ${{ (github.event_name == 'schedule' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}


# strategy:
# fail-fast: false # we want to run all tests regardless of failure
# matrix:
# test-type: [inference]
# client-type: [server]
# python-version: ["3.12"]
# client-version: [latest]

steps:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install dependencies
uses: ./.github/actions/setup-runner
with:
python-version: ${{ matrix.python-version }}
client-version: ${{ matrix.client-version }}


- name: Check Storage and Memory Available Before Tests
if: ${{ always() }}
run: |
free -h
df -h

- name: Build Llama Stack
run: |
uv run llama stack build --template ${{ env.LLAMA_STACK_TEMPLATE }} --image-type venv


- name: Run Integration Tests
env:
INFERENCE_MODEL: ${{ inputs.model }}
ENABLE_VLLM: "vllm"
VLLM_URL: "http://${{ inputs.vllm-server-ip }}:8000/v1/"
VLLM_INFERENCE_MODEL: ${{ inputs.model }}
run: |

set -e

if [ "${{ matrix.client-type }}" == "library" ]; then
stack_config="${{ env.LLAMA_STACK_TEMPLATE }}"
else
stack_config="server:${{ env.LLAMA_STACK_TEMPLATE }}"
fi

start_time=$(date +%s)
echo "[DEBUG] Start time of the test: ${start_time}"

uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \
-k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \
--text-model="vllm/${{ env.INFERENCE_MODEL }}" \
--env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \
--env VLLM_URL=${{ env.VLLM_URL }} \
--env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} \
--embedding-model=${{ env.EMBEDDING_MODEL }} \
--color=yes \
--capture=tee-sys | tee pytest-${{ matrix.test-type }}.log

end_time=$(date +%s)
echo "[DEBUG] End time of the test: ${end_time}"

test_run_duration=$((end_time - start_time))
echo "[DEBUG] Duration of the test run: ${test_run_duration} seconds"

## Grabbing vLLM inference server logs for the duration of a given test run:
sudo docker logs ${{ env.CONTAINER_NAME }} --since "${test_run_duration}s" > ${{ env.CONTAINER_NAME }}-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}.log

cat ${{ env.CONTAINER_NAME }}-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}.log


- name: Check Storage and Memory Available After Tests
if: ${{ always() }}
run: |
free -h
df -h


- name: Upload all logs to artifacts
if: ${{ always() }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
path: |
*.log
retention-days: 1


stop_inference_server:
runs-on: [self-hosted, linux, X64]
needs: test-matrix

steps:
- name: Stop vLLM server and write logs to file
if: ${{ always() }}
run: |
sudo docker stop ${{ env.CONTAINER_NAME }}
sudo docker rm ${{ env.CONTAINER_NAME }}
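
Because the workflow is gated on workflow_dispatch, runs have to be started manually. A minimal sketch of triggering it with the GitHub CLI, assuming gh is authenticated against this repository and using the default inputs declared above:

# Dispatch the self-hosted GPU integration test workflow with its default inputs
gh workflow run integration-tests-on-gpu-self-hosted-runner.yaml \
  -f model=meta-llama/Llama-3.2-3B-Instruct \
  -f vllm-server-ip=0.0.0.0 \
  -f server-timeout=300 \
  -f test-all-client-versions=false

The same inputs can also be supplied from the "Run workflow" form on the Actions tab.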