diff --git a/.github/workflows/a3mega-workload.yaml b/.github/workflows/a3mega-workload.yaml
new file mode 100644
index 000000000..08fd1b449
--- /dev/null
+++ b/.github/workflows/a3mega-workload.yaml
@@ -0,0 +1,83 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+# Reusable workflow (workflow_call): creates a one-node GPU cluster with xpk,
+# runs a small test workload on it, then deletes the workload and the cluster.
+name: a3mega-nightly
+
+on:
+  workflow_call:
+
+env:
+  # Names must be unique in parallel running tests.
+  GPU_CLUSTER_NAME: nightly-gpu-a3mega
+  WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
+  # Device type shared by the cluster and workload steps. It matches the
+  # gpu-type value for which nightly_tests.yaml calls this workflow.
+  DEVICE_TYPE: h100-mega-80gb-8
+
+jobs:
+  gpu-a3mega-workload:
+    runs-on: [ubuntu-22.04]
+    concurrency: # We support one build test to run at a time currently.
+      group: nightly-test-cluster-group-gpu
+      cancel-in-progress: false
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          credentials_json: '${{ secrets.GCP_SA_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify gcp setup
+        run: gcloud info
+      - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
+        run: |
+          gcloud config set compute/zone us-east4-a
+          gcloud config get compute/zone
+      - name: Install xpk dependencies
+        run: |
+          make install
+          echo $PWD/bin >> "$GITHUB_PATH"
+      - name: Check xpk installation
+        run: xpk --help
+      # Every xpk command below passes --project and --zone explicitly so that all
+      # steps, including cleanup, act on the cluster created by this step.
+      - name: Create an XPK Cluster with one gpu nodepool
+        run: python xpk.py cluster create --cluster $GPU_CLUSTER_NAME --device-type=$DEVICE_TYPE --num-nodes=1 --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}} --reservation=${{secrets.A3M_RESERVATION}}
+      - name: Authenticate Docker
+        run: gcloud auth configure-docker --quiet
+      - name: Create test script to execute in workloads
+        run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
+      - name: Run a base-docker-image workload
+        run: python xpk.py workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --device-type=$DEVICE_TYPE --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}}
+      - name: List out the workloads on the cluster
+        run: python3 xpk.py workload list --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}}
+      - name: Run xpk inspector with the workload created above
+        run: python3 xpk.py inspector --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}} --workload $WORKLOAD_NAME
+      - name: Wait for workload completion and confirm it succeeded
+        run: python3 xpk.py workload list --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}} --wait-for-job-completion $WORKLOAD_NAME --timeout 300
+      - name: Run xpk info command
+        run: python3 xpk.py info --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}}
+      - name: Delete the workload on the cluster
+        run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}}
+      - name: Delete the cluster created
+        # Runs even if an earlier step failed, so the cluster is not left behind.
+        if: always()
+        run: python xpk.py cluster delete --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3M_GPU_PROJECT}} --zone=${{secrets.A3M_GPU_ZONE}} --force
diff --git a/.github/workflows/a3u-workload.yaml
b/.github/workflows/a3u-workload.yaml
new file mode 100644
index 000000000..eda178cce
--- /dev/null
+++ b/.github/workflows/a3u-workload.yaml
@@ -0,0 +1,85 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+# Reusable workflow (workflow_call): creates a one-node GPU cluster with xpk,
+# runs a small test workload on it, then deletes the workload and the cluster.
+name: a3u-nightly
+
+on:
+  workflow_call:
+
+env:
+  # Names must be unique in parallel running tests.
+  GPU_CLUSTER_NAME: nightly-gpu-a3ultra
+  WORKLOAD_NAME: xpktest-nightly-${{ github.run_attempt }}
+  # Device type shared by the cluster and workload steps. It matches the
+  # gpu-type value for which nightly_tests.yaml calls this workflow.
+  DEVICE_TYPE: h200-141gb-8
+
+jobs:
+  gpu-a3u-workload:
+    runs-on: [ubuntu-22.04]
+    concurrency: # We support one build test to run at a time currently.
+      group: nightly-test-cluster-group-gpu
+      cancel-in-progress: false
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install expect package
+        # -y keeps the install non-interactive regardless of the runner's apt config.
+        run: sudo apt-get install -y expect
+      - uses: 'google-github-actions/auth@v2'
+        with:
+          credentials_json: '${{ secrets.GCP_SA_KEY }}'
+      - uses: google-github-actions/setup-gcloud@v2
+        with:
+          version: '>= 363.0.0'
+          install_components: 'beta,gke-gcloud-auth-plugin'
+      - name: Verify gcp setup
+        run: gcloud info
+      - name: Set Google Cloud CLI properties to a unused zone to verify --zone arg is passed properly in commands.
+        run: |
+          gcloud config set compute/zone us-east4-a
+          gcloud config get compute/zone
+      - name: Install xpk dependencies
+        run: |
+          make install
+          echo $PWD/bin >> "$GITHUB_PATH"
+      - name: Check xpk installation
+        run: xpk --help
+      # Every xpk command below passes --project and --zone explicitly so that all
+      # steps, including cleanup, act on the cluster created by this step.
+      - name: Create an XPK Cluster with one gpu nodepool
+        run: python xpk.py cluster create --cluster $GPU_CLUSTER_NAME --device-type=$DEVICE_TYPE --num-nodes=1 --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}} --reservation=${{secrets.A3U_RESERVATION}}
+      - name: Authenticate Docker
+        run: gcloud auth configure-docker --quiet
+      - name: Create test script to execute in workloads
+        run: echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > workload.sh
+      - name: Run a base-docker-image workload
+        run: python xpk.py workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --command "bash workload.sh" --device-type=$DEVICE_TYPE --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}}
+      - name: List out the workloads on the cluster
+        run: python3 xpk.py workload list --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}}
+      - name: Run xpk inspector with the workload created above
+        run: python3 xpk.py inspector --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}} --workload $WORKLOAD_NAME
+      - name: Wait for workload completion and confirm it succeeded
+        run: python3 xpk.py workload list --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}} --wait-for-job-completion $WORKLOAD_NAME --timeout 300
+      - name: Run xpk info command
+        run: python3 xpk.py info --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}}
+      - name: Delete the workload on the cluster
+        run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}}
+      - name: Delete the cluster created
+        # Runs even if an earlier step failed, so the cluster is not left behind.
+        if: always()
+        run: python xpk.py cluster delete --cluster $GPU_CLUSTER_NAME --project=${{secrets.A3U_GPU_PROJECT}} --zone=${{secrets.A3U_GPU_ZONE}} --force
diff --git a/.github/workflows/nightly_tests.yaml
b/.github/workflows/nightly_tests.yaml
index 2dff1261c..682c771bb 100644
--- a/.github/workflows/nightly_tests.yaml
+++ b/.github/workflows/nightly_tests.yaml
@@ -16,6 +16,16 @@ name: Nightly Tests
 
 on:
   workflow_dispatch:
+    inputs:  # Optional input offered on manually dispatched runs.
+      gpu-type:  # Compared against by the `if:` of the a3u-test and a3mega-test jobs.
+        description: 'GPU Type'
+        required: false
+        default: ''  # NOTE(review): '' is not one of the options listed below - confirm the intended default.
+        type: choice
+        options:
+          - 'h200-141gb-8'  # Selects the a3u-test job.
+          - 'h100-mega-80gb-8'  # Selects the a3mega-test job.
+          - 'h100-80gb-8'  # NOTE(review): no job visible in this diff is gated on this value - confirm.
   schedule:
     # Schedule the job run at 12AM PST daily.
     - cron: '0 8 * * *'
@@ -31,6 +41,14 @@ env:
   RAYCLUSTER_TPU_CLUSTER_NAME: rc-nightly-test-2-v4-8-nodepools
 
 jobs:
+  a3u-test:  # Calls the a3u reusable workflow.
+    if: inputs.gpu-type == 'h200-141gb-8'  # Skipped unless this GPU type was selected.
+    uses: ./.github/workflows/a3u-workload.yaml
+    secrets: inherit  # Forward this workflow's secrets to the called workflow.
+  a3mega-test:  # Calls the a3mega reusable workflow.
+    if: inputs.gpu-type == 'h100-mega-80gb-8'  # Skipped unless this GPU type was selected.
+    uses: ./.github/workflows/a3mega-workload.yaml
+    secrets: inherit  # Forward this workflow's secrets to the called workflow.
   cluster-create-and-delete:
     runs-on: [ubuntu-22.04]
     concurrency: # We support one build test to run at a time currently.