diff --git a/.github/actions/setup-vllm-gpu/action.yml b/.github/actions/setup-vllm-gpu/action.yml new file mode 100644 index 0000000000..efb8ea706f --- /dev/null +++ b/.github/actions/setup-vllm-gpu/action.yml @@ -0,0 +1,63 @@ +name: Setup local vLLM +description: Start vLLM Inference Server on localhost +inputs: + + vllm-image-version: + description: vLLM Docker Image version + required: true + default: latest + + vllm-container: + description: vLLM Docker Container name + required: true + default: vllm_server + + vllm-server-port: + description: vLLM Inference Server port + required: true + default: "8000" + + model: + description: Model to serve by vLLM Inference Server + required: true + default: meta-llama/Llama-3.2-3B-Instruct + + tool-calling-parser: + description: Model Tool Calling parser + required: true + default: llama3_json + + huggingface-pat: + description: HuggingFace Personal Access Token + required: true + default: N/A + +runs: + using: "composite" + steps: + + - name: Start vLLM Inference Server + shell: bash + run: | + + if sudo docker ps -a | grep -q "${{ inputs.vllm-container }}"; then + echo "Container '${{ inputs.vllm-container }}' exists." + else + + echo "Starting vLLM Inference Server serving ${{ inputs.model }} ..." + + sudo docker run \ + --name ${{ inputs.vllm-container }} \ + --runtime nvidia \ + --gpus all \ + --detach \ + --volume ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=${{ inputs.huggingface-pat }}" \ + --publish ${{ inputs.vllm-server-port }}:8000 \ + --ipc=host \ + vllm/vllm-openai:${{ inputs.vllm-image-version }} \ + --model ${{ inputs.model }} \ + --max_model_len 124832 \ + --enable-auto-tool-choice \ + --tool-call-parser ${{ inputs.tool-calling-parser }} + fi diff --git a/.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml b/.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml new file mode 100644 index 0000000000..563feef9c9 --- /dev/null +++ b/.github/workflows/integration-tests-on-gpu-self-hosted-runner.yaml @@ -0,0 +1,268 @@ +name: "[Self-Hosted runner] Integration Tests vLLM Inference Server" + +on: + workflow_dispatch: # Only triggering workflow manually. 
+ inputs: + + model: + description: Model to serve by vLLM Inference Server + required: true + default: meta-llama/Llama-3.2-3B-Instruct + + vllm-server-ip: + description: IP address of the vLLM Inference Server + required: true + default: "0.0.0.0" + + server-timeout: + description: Time to wait for server to come online + required: true + default: "300" + + test-all-client-versions: + description: 'Test against both the latest and published versions' + type: boolean + default: false + +# push: +# branches: [ main ] +# pull_request: +# branches: [ main ] +# paths: +# - 'llama_stack/**' +# - 'tests/integration/**' +# - 'uv.lock' +# - 'pyproject.toml' +# - 'requirements.txt' +# - '.github/workflows/integration-tests.yml' # This workflow + +env: + CONTAINER_NAME: vllm_server + LLAMA_STACK_TEMPLATE: remote-vllm-gpu + EMBEDDING_MODEL: sentence-transformers/all-MiniLM-L6-v2 + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + discover-tests: + runs-on: ubuntu-latest + outputs: + test-type: ${{ steps.generate-matrix.outputs.test-type }} + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Generate test matrix + id: generate-matrix + run: | + # Get test directories dynamically, excluding non-test directories + TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" | + grep -Ev "^(__pycache__|fixtures|test_cases)$" | + sort | jq -R -s -c 'split("\n")[:-1]') + echo "test-type=$TEST_TYPES" >> $GITHUB_OUTPUT + + echo "Test Matrix: " + cat $GITHUB_OUTPUT + + prepare_self_hosted_runner: + runs-on: [self-hosted, linux, X64] + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + + - name: Setup vllm GPU + uses: ./.github/actions/setup-vllm-gpu + with: + model: ${{ inputs.model }} + huggingface-pat: ${{ secrets.HF_PAT }} + vllm-container: ${{ env.CONTAINER_NAME }} + + + - name: Verify VLLM remote server is healthy + env: + RETRY_INTERVAL: 1 + run: | + + if sudo docker ps -a | grep -q "${{ env.CONTAINER_NAME }}"; then + echo "Container '${{ env.CONTAINER_NAME }}' exists." + echo "Checking if vLLM Inference Server is online..." + else + echo "Container '${{ env.CONTAINER_NAME }}' does not exist." + exit 1 + fi + + echo "Waiting for vLLM Inference Server to come online..." + + for i in $(seq 1 ${{ inputs.server-timeout }}); do + if curl --silent --fail "http://${{ inputs.vllm-server-ip }}:8000/health" > /dev/null; then + echo "vLLM server is up and running!" + exit 0 + else + echo "[Attempt $i/${{ inputs.server-timeout }}] vLLM server not yet available. Retrying in ${RETRY_INTERVAL} seconds..." 
+ sleep $RETRY_INTERVAL + fi + done + + echo "vLLM server failed to start after ${{ inputs.server-timeout }} seconds of waiting" + + sudo docker logs ${{ env.CONTAINER_NAME }} > ${{ env.CONTAINER_NAME }}.log + cat ${{ env.CONTAINER_NAME }}.log + + exit 1 + + + # - name: Build Llama Stack + # run: | + # uv run llama stack build --template ${{ env.LLAMA_STACK_TEMPLATE }} --image-type venv + + + # - name: Start Llama Stack server in background + # # if: matrix.client-type == 'http' + # env: + # INFERENCE_MODEL: ${{ inputs.model }} + # ENABLE_VLLM: "vllm" + # VLLM_URL: "http://${{ inputs.vllm-server-ip }}:8000/v1/" + # VLLM_INFERENCE_MODEL: ${{ inputs.model }} + # run: | + + # LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run \ + # $HOME/.llama/distributions/${{ env.LLAMA_STACK_TEMPLATE }}/${{ env.LLAMA_STACK_TEMPLATE }}-run.yaml \ + # --image-type venv \ + # --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + # --env VLLM_URL=${{ env.VLLM_URL }} \ + # --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} & + + + # - name: Wait for Llama Stack server to be ready + # # if: matrix.client-type == 'http' + # run: | + # echo "Waiting for Llama Stack server..." + # for i in {1..30}; do + # if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + # echo "Llama Stack server is up!" + # exit 0 + # fi + # sleep 1 + # done + # echo "Llama Stack server failed to start" + # cat server.log + # exit 1 + + test-matrix: + runs-on: [self-hosted, linux, X64] + needs: [discover-tests, prepare_self_hosted_runner] + + strategy: + fail-fast: false + matrix: + test-type: ${{ fromJson(needs.discover-tests.outputs.test-type) }} + # client-type: [library, server] + # python-version: ["3.12", "3.13"] + # test-type: [inference] + client-type: [server] + python-version: ["3.13"] + client-version: ${{ (github.event_name == 'schedule' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }} + + + # strategy: + # fail-fast: false # we want to run all tests regardless of failure + # matrix: + # test-type: [inference] + # client-type: [server] + # python-version: ["3.12"] + # client-version: [latest] + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Install dependencies + uses: ./.github/actions/setup-runner + with: + python-version: ${{ matrix.python-version }} + client-version: ${{ matrix.client-version }} + + + - name: Check Storage and Memory Available Before Tests + if: ${{ always() }} + run: | + free -h + df -h + + - name: Build Llama Stack + run: | + uv run llama stack build --template ${{ env.LLAMA_STACK_TEMPLATE }} --image-type venv + + + - name: Run Integration Tests + env: + INFERENCE_MODEL: ${{ inputs.model }} + ENABLE_VLLM: "vllm" + VLLM_URL: "http://${{ inputs.vllm-server-ip }}:8000/v1/" + VLLM_INFERENCE_MODEL: ${{ inputs.model }} + run: | + + set -e + + if [ "${{ matrix.client-type }}" == "library" ]; then + stack_config="${{ env.LLAMA_STACK_TEMPLATE }}" + else + stack_config="server:${{ env.LLAMA_STACK_TEMPLATE }}" + fi + + start_time=$(date +%s) + echo "[DEBUG] Start time of the test: ${start_time}" + + uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + --text-model="vllm/${{ env.INFERENCE_MODEL }}" \ + --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + --env VLLM_URL=${{ env.VLLM_URL }} \ + --env VLLM_INFERENCE_MODEL=${{ 
env.VLLM_INFERENCE_MODEL }} \
+            --embedding-model=${{ env.EMBEDDING_MODEL }} \
+            --color=yes \
+            --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log
+
+          end_time=$(date +%s)
+          echo "[DEBUG] End time of the test: ${end_time}"
+
+          test_run_duration=$((end_time - start_time))
+          echo "[DEBUG] Duration of the test run: ${test_run_duration} seconds"
+
+          ## Grabbing vLLM inference server logs for the duration of a given test run:
+          sudo docker logs ${{ env.CONTAINER_NAME }} --since "${test_run_duration}s" > ${{ env.CONTAINER_NAME }}-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}.log
+
+          cat ${{ env.CONTAINER_NAME }}-logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}.log
+
+
+      - name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}-${{ matrix.client-version }}
+          path: |
+            *.log
+          retention-days: 1
+
+
+  stop_inference_server:
+    runs-on: [self-hosted, linux, X64]
+    needs: test-matrix
+
+    steps:
+      - name: Stop vLLM server and write logs to file
+        if: ${{ always() }}
+        run: |
+          sudo docker stop ${{ env.CONTAINER_NAME }}
+          sudo docker rm ${{ env.CONTAINER_NAME }}
\ No newline at end of file
diff --git a/.github/workflows/integration-tests-remote-vllm.yaml b/.github/workflows/integration-tests-remote-vllm.yaml
new file mode 100644
index 0000000000..66d038f0c7
--- /dev/null
+++ b/.github/workflows/integration-tests-remote-vllm.yaml
@@ -0,0 +1,177 @@
+name: Integration Tests with remote vLLM Inference Server
+
+on:
+  workflow_dispatch: # Only triggering workflow manually.
+    inputs:
+      vllm_server_ip:
+        description: IP address of the vLLM Inference Server
+        required: true
+        default: "0.0.0.0"
+
+# push:
+#   branches: [ main ]
+# pull_request:
+#   branches: [ main ]
+#   paths:
+#     - 'llama_stack/**'
+#     - 'tests/integration/**'
+#     - 'uv.lock'
+#     - 'pyproject.toml'
+#     - 'requirements.txt'
+#     - '.github/workflows/integration-tests.yml' # This workflow
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test-matrix:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Listing tests manually since some of them currently fail
+        # TODO: generate matrix list from tests/integration when fixed
+        # test-type: [agents, inference, datasets, inspect, scoring, post_training, providers, tool_runtime, vector_io]
+        test-type: [inference]
+        client-type: [library, http]
+        # python-version: ["3.12", "3.13"]
+        python-version: ["3.12"]
+      fail-fast: false # we want to run all tests regardless of failure
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Install dependencies
+        uses: ./.github/actions/setup-runner
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # - name: Setup ollama
+      #   uses: ./.github/actions/setup-ollama
+
+      - name: Verify VLLM remote server is healthy
+        # if: matrix.client-type == 'http'
+        run: |
+          echo "Verifying vLLM status..."
+ vllm_status=$(curl -s -o /dev/null -w "%{http_code}" http://${{ github.event.inputs.vllm_server_ip }}:8000/health) + echo "vLLM remote server /health status: $vllm_status" + if [ "$vllm_status" != "200" ]; then + echo "vLLM health check failed" + exit 1 + fi + + - name: Build Llama Stack + run: | + uv run llama stack build --template starter --image-type venv + + - name: Start Llama Stack server in background + if: matrix.client-type == 'http' + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + ENABLE_VLLM: "vllm" + VLLM_URL: "http://${{ github.event.inputs.vllm_server_ip }}:8000/v1/" + VLLM_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + VLLM_API_TOKEN: "remote_vllm" + run: | + ls -la ./llama_stack/templates + echo "GITHUB_WORKSPACE ${GITHUB_WORKSPACE}" + echo "github.workspace ${{ github.workspace }}" + + LLAMA_STACK_LOG_FILE=server.log nohup uv run llama stack run $GITHUB_WORKSPACE/llama_stack/templates/starter/run.yaml \ + --image-type venv \ + --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + --env VLLM_URL=${{ env.VLLM_URL }} \ + --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} \ + --env VLLM_API_TOKEN=${{ env.VLLM_API_TOKEN }} & + + - name: Wait for Llama Stack server to be ready + if: matrix.client-type == 'http' + run: | + echo "Waiting for Llama Stack server..." + for i in {1..30}; do + if curl -s http://localhost:8321/v1/health | grep -q "OK"; then + echo "Llama Stack server is up!" + exit 0 + fi + sleep 1 + done + echo "Llama Stack server failed to start" + cat server.log + exit 1 + + # - name: Verify Ollama status is OK + # if: matrix.client-type == 'http' + # run: | + # echo "Verifying Ollama status..." + # ollama_status=$(curl -s -L http://127.0.0.1:8321/v1/providers/ollama|jq --raw-output .health.status) + # echo "Ollama status: $ollama_status" + # if [ "$ollama_status" != "OK" ]; then + # echo "Ollama health check failed" + # exit 1 + # fi + + - name: Check Storage and Memory Available Before Tests + if: ${{ always() }} + run: | + free -h + df -h + + - name: Run Integration Tests + env: + INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + ENABLE_VLLM: "vllm" + VLLM_URL: "http://${{ github.event.inputs.vllm_server_ip }}:8000/v1/" + VLLM_INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + VLLM_API_TOKEN: "remote_vllm" + run: | + if [ "${{ matrix.client-type }}" == "library" ]; then + stack_config="starter" + else + stack_config="server:starter" + fi + uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + --text-model="vllm/${{ env.INFERENCE_MODEL }}" \ + --env ENABLE_VLLM=${{ env.ENABLE_VLLM }} \ + --env VLLM_URL=${{ env.VLLM_URL }} \ + --env VLLM_INFERENCE_MODEL=${{ env.VLLM_INFERENCE_MODEL }} \ + --env VLLM_API_TOKEN=${{ env.VLLM_API_TOKEN }} \ + --embedding-model=all-MiniLM-L6-v2 \ + --color=yes \ + --capture=tee-sys | tee pytest-${{ matrix.test-type }}.log + + # - name: Run Integration Tests + # env: + # INFERENCE_MODEL: "meta-llama/Llama-3.2-3B-Instruct" + # OLLAMA_URL: "http://0.0.0.0:11434" + # run: | + # if [ "${{ matrix.client-type }}" == "library" ]; then + # stack_config="ollama" + # else + # stack_config="http://localhost:8321" + # fi + # uv run pytest -s -v tests/integration/${{ matrix.test-type }} --stack-config=${stack_config} \ + # -k "not(builtin_tool or safety_with_image or code_interpreter or test_rag)" \ + # --text-model="meta-llama/Llama-3.2-3B-Instruct" \ + # --embedding-model=all-MiniLM-L6-v2 + + - 
name: Check Storage and Memory Available After Tests
+        if: ${{ always() }}
+        run: |
+          free -h
+          df -h
+
+      # - name: Write ollama logs to file
+      #   if: ${{ always() }}
+      #   run: |
+      #     sudo docker logs ollama > ollama.log
+
+      - name: Upload all logs to artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: logs-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.client-type }}-${{ matrix.test-type }}-${{ matrix.python-version }}
+          path: |
+            *.log
+          retention-days: 1
diff --git a/.github/workflows/tests_on_self_hosted_ec2_runner.yaml b/.github/workflows/tests_on_self_hosted_ec2_runner.yaml
new file mode 100644
index 0000000000..fd544d1b75
--- /dev/null
+++ b/.github/workflows/tests_on_self_hosted_ec2_runner.yaml
@@ -0,0 +1,142 @@
+name: Llama Stack tests on Self-Hosted EC2 Runner.
+run-name: ${{ github.actor }} Triggered Action.
+on:
+# pull_request:
+#   branches:
+#     - 'main'
+#   paths:
+#     - '*.py'
+
+  workflow_dispatch: # Only triggering workflow manually.
+    inputs:
+      aws_ec2_type:
+        description: 'EC2 Instance Type to run tests on.'
+        required: true
+        default: "t2.large"
+      aws_ec2_ami:
+        description: 'AMI Image (use default if unsure)'
+        required: true
+        default: "ami-05f991c49d264708f" # Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
+      aws_sg_id:
+        description: 'Security Group ID to be attached to the self-hosted runner (use default if unsure)'
+        required: true
+        default: "sg-0f1e12826ee9b097b"
+
+env:
+  AWS_REGION: "us-west-2"
+  AWS_ACCOUNT: "495599741739"
+  AWS_SUBNET_ID: "subnet-01ec950bba37d393a"
+  AWS_EC2_TYPE: "t2.large"
+  AWS_EC2_AMI: "ami-05f991c49d264708f" # Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
+  AWS_SG_ID: "sg-0162d28d02b2e3cc5"
+
+permissions:
+  id-token: write # This is required for requesting the JWT
+  contents: read  # This is required for actions/checkout
+  checks: write
+  pull-requests: write
+
+jobs:
+  start-self-hosted-ec2-runner:
+    name: Start self-hosted EC2 runner
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      # If workflow is triggered manually via workflow_dispatch event then both AWS_EC2_TYPE and AWS_EC2_AMI
+      # env variables need to be overwritten with input values.
+      - name: "DEBUG: Overriding AWS_EC2_TYPE and AWS_EC2_AMI env variables with 'workflow_dispatch' inputs"
+        if: ${{ github.event.inputs.aws_ec2_type != '' }}
+        run: |
+          echo "AWS_EC2_TYPE=${{ inputs.aws_ec2_type }}" >> $GITHUB_ENV
+          echo "AWS_EC2_AMI=${{ inputs.aws_ec2_ami }}" >> $GITHUB_ENV
+          echo "AWS_SG_ID=${{ inputs.aws_sg_id }}" >> $GITHUB_ENV
+
+      - name: "DEBUG: AWS_EC2_TYPE and AWS_EC2_AMI env variables values"
+        run: |
+          echo ${{ env.AWS_EC2_TYPE }}
+          echo ${{ env.AWS_EC2_AMI }}
+          echo ${{ env.AWS_SG_ID }}
+
+      - name: Configure AWS credentials
+        id: configure_aws_security_credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT }}:role/GitHub_SelfHostedRunner
+          role-session-name: GitHubOIDCSession
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} # added to repo secrets.
+          ec2-image-id: ${{ env.AWS_EC2_AMI }}
+          ec2-instance-type: ${{ env.AWS_EC2_TYPE }}
+          subnet-id: ${{ env.AWS_SUBNET_ID }}
+          security-group-id: ${{ env.AWS_SG_ID }}
+          # GitHub_EC2AccessToAWSServicesRole is associated with the Instance Profile assumed by newly configured EC2 instance(s)
+          # to perform permitted Actions on AWS resources.
+          iam-role-name: GitHub_EC2AccessToAWSServicesInstanceProfile
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "ec2-github-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
+            ]
+
+  run_workload_on_the_runner:
+    name: Execute the job on the runner
+    needs: start-self-hosted-ec2-runner # required to start the main job when the runner is ready
+    runs-on: ${{ needs.start-self-hosted-ec2-runner.outputs.label }} # run the job on the newly created runner
+    steps:
+
+      - name: "ls -la"
+        run: |
+          ls -la
+          whoami
+
+      # - name: "Checkout 'llama-recipes' repository"
+      #   uses: actions/checkout@v4
+      #   with:
+      #     ref: ${{ inputs.branch_name }}
+
+      # - name: "Installing dependencies from requirements.txt file."
+      #   run: |
+      #     pip install -U pip setuptools
+      #     pip install -r requirements.txt
+      #     pip install --extra-index-url https://download.pytorch.org/whl/test/cu118 -e '.[tests,auditnlg,vllm]'
+
+      # - name: "Running llama-recipes pytest tests on Self Hosted EC2 Runner"
+      #   run: |
+      #     echo "Running llama-recipes pytest tests on Self Hosted ${{ env.AWS_EC2_TYPE }} EC2 Runner"
+      #     cd $GITHUB_WORKSPACE && python3 -m pytest --junitxml="$GITHUB_WORKSPACE/result.xml"
+
+      # - name: Test Summary
+      #   uses: test-summary/action@v2
+      #   with:
+      #     paths: "**/*.xml"
+      #   if: always()
+
+  stop-runner:
+    name: Stop self-hosted EC2 runner
+    needs:
+      - start-self-hosted-ec2-runner # required to get output from the start-runner job
+      - run_workload_on_the_runner # required to wait when the main job is done
+    runs-on: ubuntu-latest
+    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
+    steps:
+      - name: Configure AWS credentials
+        id: configure_aws_security_credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT }}:role/GitHub_SelfHostedRunner
+          aws-region: ${{ env.AWS_REGION }}
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@v2
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-self-hosted-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-self-hosted-ec2-runner.outputs.ec2-instance-id }}
\ No newline at end of file
diff --git a/coverage.svg b/coverage.svg
index 636889bb0d..a5804ea07e 100644
--- a/coverage.svg
+++ b/coverage.svg
@@ -15,7 +15,7 @@
 coverage
 coverage
-    44%
-    44%
+    46%
+    46%
diff --git a/llama_stack/templates/remote-vllm-gpu/__init__.py b/llama_stack/templates/remote-vllm-gpu/__init__.py
new file mode 100644
index 0000000000..81d8f8da18
--- /dev/null
+++ b/llama_stack/templates/remote-vllm-gpu/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+ +from .remote_vllm_gpu import get_distribution_template # noqa: F401 diff --git a/llama_stack/templates/remote-vllm-gpu/build.yaml b/llama_stack/templates/remote-vllm-gpu/build.yaml new file mode 100644 index 0000000000..e12b0de084 --- /dev/null +++ b/llama_stack/templates/remote-vllm-gpu/build.yaml @@ -0,0 +1,55 @@ +version: 2 +distribution_spec: + description: CI tests for Llama Stack on Remote VLLM (GPU) Inference Server + providers: + inference: + - provider_type: remote::cerebras + - provider_type: remote::ollama + - provider_type: remote::vllm + - provider_type: remote::tgi + - provider_type: remote::fireworks + - provider_type: remote::together + - provider_type: remote::bedrock + - provider_type: remote::nvidia + - provider_type: remote::openai + - provider_type: remote::anthropic + - provider_type: remote::gemini + - provider_type: remote::groq + - provider_type: remote::sambanova + - provider_type: inline::sentence-transformers + vector_io: + - provider_type: inline::faiss + - provider_type: inline::sqlite-vec + - provider_type: inline::milvus + - provider_type: remote::chromadb + - provider_type: remote::pgvector + files: + - provider_type: inline::localfs + safety: + - provider_type: inline::llama-guard + agents: + - provider_type: inline::meta-reference + telemetry: + - provider_type: inline::meta-reference + post_training: + - provider_type: inline::huggingface + eval: + - provider_type: inline::meta-reference + datasetio: + - provider_type: remote::huggingface + - provider_type: inline::localfs + scoring: + - provider_type: inline::basic + - provider_type: inline::llm-as-judge + - provider_type: inline::braintrust + tool_runtime: + - provider_type: remote::brave-search + - provider_type: remote::tavily-search + - provider_type: inline::rag-runtime + - provider_type: remote::model-context-protocol +image_type: conda +image_name: ci-tests +additional_pip_packages: +- aiosqlite +- asyncpg +- sqlalchemy[asyncio] diff --git a/llama_stack/templates/remote-vllm-gpu/remote_vllm_gpu.py b/llama_stack/templates/remote-vllm-gpu/remote_vllm_gpu.py new file mode 100644 index 0000000000..62334cc0fe --- /dev/null +++ b/llama_stack/templates/remote-vllm-gpu/remote_vllm_gpu.py @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. 
+ + +from llama_stack.templates.template import DistributionTemplate + +from ..starter.starter import get_distribution_template as get_starter_distribution_template + + +def get_distribution_template() -> DistributionTemplate: + template = get_starter_distribution_template() + name = "remote-vllm-gpu" + template.name = name + template.description = "CI tests for Llama Stack on Remote VLLM (GPU) Inference Server" + + return template \ No newline at end of file diff --git a/llama_stack/templates/remote-vllm-gpu/run.yaml b/llama_stack/templates/remote-vllm-gpu/run.yaml new file mode 100644 index 0000000000..e609b932fc --- /dev/null +++ b/llama_stack/templates/remote-vllm-gpu/run.yaml @@ -0,0 +1,225 @@ +version: 2 +image_name: remote-vllm-gpu +apis: +- agents +- datasetio +- eval +- files +- inference +- post_training +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + + - provider_id: ${env.VLLM_URL:+vllm} + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:=} + max_tokens: ${env.VLLM_MAX_TOKENS:=4096} + api_token: ${env.VLLM_API_TOKEN:=fake} + tls_verify: ${env.VLLM_TLS_VERIFY:=true} + + # - provider_id: ollama + # provider_type: remote::ollama + # config: + # url: ${env.OLLAMA_URL:=http://localhost:11434} + + - provider_id: ${env.CEREBRAS_API_KEY:+cerebras} + provider_type: remote::cerebras + config: + base_url: https://api.cerebras.ai + api_key: ${env.CEREBRAS_API_KEY:=} + + - provider_id: ${env.TGI_URL:+tgi} + provider_type: remote::tgi + config: + url: ${env.TGI_URL:=} + - provider_id: fireworks + provider_type: remote::fireworks + config: + url: https://api.fireworks.ai/inference/v1 + api_key: ${env.FIREWORKS_API_KEY:=} + - provider_id: together + provider_type: remote::together + config: + url: https://api.together.xyz/v1 + api_key: ${env.TOGETHER_API_KEY:=} + - provider_id: bedrock + provider_type: remote::bedrock + - provider_id: ${env.NVIDIA_API_KEY:+nvidia} + provider_type: remote::nvidia + config: + url: ${env.NVIDIA_BASE_URL:=https://integrate.api.nvidia.com} + api_key: ${env.NVIDIA_API_KEY:=} + append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True} + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY:=} + - provider_id: anthropic + provider_type: remote::anthropic + config: + api_key: ${env.ANTHROPIC_API_KEY:=} + - provider_id: gemini + provider_type: remote::gemini + config: + api_key: ${env.GEMINI_API_KEY:=} + - provider_id: groq + provider_type: remote::groq + config: + url: https://api.groq.com + api_key: ${env.GROQ_API_KEY:=} + - provider_id: sambanova + provider_type: remote::sambanova + config: + url: https://api.sambanova.ai/v1 + api_key: ${env.SAMBANOVA_API_KEY:=} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/faiss_store.db + - provider_id: sqlite-vec + provider_type: inline::sqlite-vec + config: + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sqlite_vec_registry.db + - provider_id: ${env.MILVUS_URL:+milvus} + provider_type: inline::milvus + config: + db_path: ${env.MILVUS_DB_PATH:=~/.llama/distributions/starter}/milvus.db + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/milvus_registry.db + - 
provider_id: ${env.CHROMADB_URL:+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter/}/chroma_remote_registry.db + - provider_id: ${env.PGVECTOR_DB:+pgvector} + provider_type: remote::pgvector + config: + host: ${env.PGVECTOR_HOST:=localhost} + port: ${env.PGVECTOR_PORT:=5432} + db: ${env.PGVECTOR_DB:=} + user: ${env.PGVECTOR_USER:=} + password: ${env.PGVECTOR_PASSWORD:=} + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/pgvector_registry.db + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/agents_store.db + responses_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/responses_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" + sinks: ${env.TELEMETRY_SINKS:=console,sqlite} + sqlite_db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/trace_store.db + otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=} + post_training: + - provider_id: huggingface + provider_type: inline::huggingface + config: + checkpoint_format: huggingface + distributed_backend: null + device: cpu + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/meta_reference_eval.db + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/huggingface_datasetio.db + - provider_id: localfs + provider_type: inline::localfs + config: + kvstore: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/localfs_datasetio.db + scoring: + - provider_id: basic + provider_type: inline::basic + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + - provider_id: braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:=} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:=} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/registry.db +inference_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/remote-vllm-gpu}/inference_store.db +models: [] +shields: +- shield_id: llama-guard + provider_id: ${env.SAFETY_MODEL:+llama-guard} + 
provider_shield_id: ${env.SAFETY_MODEL:=} +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: tavily-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +server: + port: 8321
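
A minimal local sketch of what the setup-vllm-gpu composite action and the health-check step do, useful for smoke-testing before dispatching the workflow. Assumptions not present in the diff: Docker with the NVIDIA runtime is installed, port 8000 is free, and an HF_TOKEN shell variable holds a HuggingFace token (the action itself receives this as the huggingface-pat input); all flags below mirror the action's defaults.

#!/usr/bin/env bash
# Sketch only: mirrors .github/actions/setup-vllm-gpu/action.yml defaults.
set -euo pipefail

MODEL="meta-llama/Llama-3.2-3B-Instruct"   # action default
CONTAINER="vllm_server"                    # action default

# Start the vLLM OpenAI-compatible server the same way the composite action does.
sudo docker run \
  --name "${CONTAINER}" \
  --runtime nvidia \
  --gpus all \
  --detach \
  --volume ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}" \
  --publish 8000:8000 \
  --ipc=host \
  vllm/vllm-openai:latest \
  --model "${MODEL}" \
  --max_model_len 124832 \
  --enable-auto-tool-choice \
  --tool-call-parser llama3_json

# Poll /health the same way the workflow's readiness step does.
for i in $(seq 1 300); do
  if curl --silent --fail http://localhost:8000/health > /dev/null; then
    echo "vLLM server is up"
    break
  fi
  echo "[Attempt $i/300] vLLM server not yet available, retrying..."
  sleep 1
done

# One sample request against the OpenAI-compatible chat completions endpoint.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d "{\"model\": \"${MODEL}\", \"messages\": [{\"role\": \"user\", \"content\": \"Say hello\"}]}"

If the request succeeds, the same endpoint is what the workflows reach through VLLM_URL; container logs can be inspected with `sudo docker logs vllm_server`, as the test step does.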