From 6181d03a20aa6e16d4066ede839d7646cedba3d3 Mon Sep 17 00:00:00 2001
From: Charlie Doern
Date: Tue, 10 Jun 2025 15:39:43 -0400
Subject: [PATCH] fix: wait for medium e2e to finish and pass

Signed-off-by: Charlie Doern
---
Review notes (text between "---" and the first "diff --git" is not applied):
 * mergify.yml: the fallback "and:" branch of the new rule now uses negated
   "-files" conditions, like the actionlint rule right above it. With positive
   conditions a PR that touches none of the trigger paths could never satisfy
   the rule, because the e2e check is never reported for such a PR.
 * Comments now name the workflow that actually defines the
   "e2e-medium-workflow-complete" job, and no longer say "This workflow" next
   to patterns that match every workflow file.
 * NOTE(review): "job_ids" lists "e2e-medium-test" while the other jobs in
   this workflow are named "*-large-*" - confirm the id matches the real job.
 * The "index" lines still carry the original blob ids; regenerate the patch
   with "git format-patch" after applying if exact ids matter.

 .github/mergify.yml                          | 22 +++++++
 .github/workflows/e2e-nvidia-l40s-x4-sdk.yml | 39 +++++++++----
 .github/workflows/status-checks.yml          | 60 ++++++++++++++++++++
 3 files changed, 111 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/status-checks.yml

diff --git a/.github/mergify.yml b/.github/mergify.yml
index e82bca96..0b4bc6db 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -24,6 +24,28 @@ pull_request_rules:
           - -files~=^\.github/(actions|workflows)/.*\.ya?ml$
           - -files~=^\.github/workflows/actionlint\.
 
+      # medium e2e workflow
+      - or:
+        - and:
+          # note this should match the triggering criteria in 'e2e-nvidia-l40s-x4-sdk.yml'
+          - check-success~=e2e-medium-workflow-complete
+          - or:
+            - files~=\.py$
+            - files=pyproject.toml
+            - files=tox.ini
+            - files=requirements.txt
+            - files=requirements-dev.txt
+            - files=constraints-dev.txt
+            - files~=^\.github/workflows/.*\.yml$ # Any workflow file
+        - and:
+          - -files~=\.py$
+          - -files=pyproject.toml
+          - -files=tox.ini
+          - -files=requirements.txt
+          - -files=requirements-dev.txt
+          - -files=constraints-dev.txt
+          - -files~=^\.github/workflows/.*\.yml$ # Any workflow file
+
       # code lint workflow
       - or:
         - and:
diff --git a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
index 3f416642..86e6e5f4 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
@@ -4,9 +4,11 @@ name: E2E (NVIDIA L40S x4) SDK Test
 
 on:
   # only run on PRs that touch certain regex paths
-  pull_request:
+  # only run on PRs that touch certain regex paths
+  pull_request_target:
     branches:
       - main
+      - release-*
     paths:
       # note this should match the merging criteria in 'mergify.yml'
       - "**.py"
@@ -15,13 +17,8 @@ on:
       - "requirements.txt"
       - "requirements-dev.txt"
       - "constraints-dev.txt"
-      - ".github/workflows/e2e-nvidia-l40s-x4-sdk.yaml" # This workflow
-  workflow_dispatch:
-    inputs:
-      pr_or_branch:
-        description: 'pull request number or branch name'
-        required: true
-        default: 'main'
+      - ".github/workflows/*.yml" # Any workflow file
+  workflow_dispatch: {}
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
   cancel-in-progress: true
@@ -29,6 +26,14 @@ concurrency:
 env:
   TMPDIR: /home/tmp
 
+defaults:
+  run:
+    shell: bash
+
+
+permissions:
+  contents: read
+
 jobs:
   start-large-ec2-runner:
     runs-on: ubuntu-latest
@@ -97,8 +102,11 @@ jobs:
       - start-large-ec2-runner
     runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
 
-    permissions:
-      pull-requests: write
+
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part (e2e-medium-test) is where we are running
+    # untrusted code from PRs.
+    permissions: {}
 
     steps:
       - name: Install Packages
@@ -308,3 +316,14 @@ jobs:
         run: |
           echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
           echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+
+  e2e-medium-workflow-complete:
+    # we don't want to block PRs on failed EC2 cleanup
+    # so not requiring "stop-medium-ec2-runner" as well
+    permissions:
+      checks: read
+    uses: ./.github/workflows/status-checks.yml
+    with:
+      job_ids: >- # Space-separated job ids to wait on for status checks
+        start-large-ec2-runner
+        e2e-medium-test
diff --git a/.github/workflows/status-checks.yml b/.github/workflows/status-checks.yml
new file mode 100644
index 00000000..55ed5055
--- /dev/null
+++ b/.github/workflows/status-checks.yml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+# yamllint disable rule:line-length
+
+name: Status Checks Reusable Workflow
+
+on:
+  workflow_call:
+    inputs:
+      job_ids:
+        description: 'Space-separated job ids to wait on for status checks'
+        required: true
+        type: string
+      delay:
+        description: 'Period in seconds to wait before first poll of GitHub Check Runs'
+        required: false
+        type: number
+        default: 10
+      interval:
+        description: 'Interval or period in seconds between polling GitHub Check Runs'
+        required: false
+        type: number
+        default: 10
+      timeout:
+        description: 'Timeout in seconds to complete polling GitHub Check Runs'
+        required: false
+        type: number
+        default: 3600
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  checks: read
+
+jobs:
+  status-checks:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
+        with:
+          egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
+
+      - name: "Set status check variables"
+        id: set_variables
+        run: |
+          jq -nr '[$ARGS.positional[] | split("\\s"; null) | map(select(. != ""))] | flatten | join("|") | ("match_pattern=(" + . + ")")' --args "${{ inputs.job_ids }}" >> "$GITHUB_OUTPUT"
+
+      - name: "Wait for status checks"
+        uses: poseidon/wait-for-status-checks@899c768d191b56eef585c18f8558da19e1f3e707 # v0.6.0
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          match_pattern: ${{ steps.set_variables.outputs.match_pattern }}
+          delay: ${{ inputs.delay }}
+          interval: ${{ inputs.interval }}
+          timeout: ${{ inputs.timeout }}