Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 36 additions & 16 deletions scripts/ci-gate-watchdog.sh
Original file line number Diff line number Diff line change
@@ -1,16 +1,22 @@
#!/usr/bin/env bash
# ci-gate-watchdog.sh — invoked by the `watchdog` job in _ci-gate.yml.
#
# After QUEUE_TIMEOUT_MINUTES, cancels every sibling job in the current
# workflow run that is still in `queued` state, except for "Queue Watchdog"
# (this script's own job) and "Merge Gate" (the gate job, which is still
# pending here because `needs: watchdog`). Cancellation forces a terminal
# state so `gate` can finally schedule and `re-actors/alls-green` can
# evaluate, ensuring the required `Merge Gate` status always reports.
# Poll the run's sibling jobs and EXIT AS SOON AS none is still `queued`
# (everything got scheduled) — only cancel jobs that remain stuck in `queued`
# after QUEUE_TIMEOUT_MINUTES. Cancelling forces a terminal state so `gate` can
# finally schedule and `re-actors/alls-green` can evaluate, ensuring the required
# `Merge Gate` status always reports.
#
# Why poll instead of `sleep $TIMEOUT`: a fixed sleep billed a full
# QUEUE_TIMEOUT_MINUTES of runner time on EVERY run, even though jobs only get
# stuck in `queued` on self-hosted runners that never pick them up. On GitHub-
# hosted runners siblings schedule within seconds, so this now exits in seconds.
# "Queue Watchdog" (this job) and "Merge Gate" (awaiting this job) are exempt —
# their `queued`/pending state here is intentional.
#
# Required env:
# GH_TOKEN — GitHub token with actions:write on the run
# QUEUE_TIMEOUT_MINUTES — minutes to wait before evaluating queued jobs
# QUEUE_TIMEOUT_MINUTES — minutes to wait before cancelling still-queued jobs
# REPO — owner/repo of the current workflow run
# RUN_ID — workflow run id

Expand All @@ -21,19 +27,33 @@ set -euo pipefail
: "${REPO:?required}"
: "${RUN_ID:?required}"

sleep "$(awk "BEGIN{printf \"%d\", $QUEUE_TIMEOUT_MINUTES * 60}")"

# Names whose `status == "queued"` is intentional at this point in the run.
# The watchdog itself is mid-execution; the gate is awaiting watchdog.
exempt_filter='.name != "Queue Watchdog" and .name != "Merge Gate"'

stuck=$(gh api --paginate "repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100" \
--jq ".jobs[] | select(.status == \"queued\" and ${exempt_filter}) | \"\(.id)\t\(.name)\"")
# List sibling jobs of this workflow run that are still `queued`, skipping the
# names matched by ${exempt_filter}.
# Globals:   REPO, RUN_ID, exempt_filter (all read)
# Outputs:   one "<job id><TAB><job name>" line per matching job, on stdout
# Returns:   exit status of the `gh api` call
queued_siblings() {
  local endpoint="repos/${REPO}/actions/runs/${RUN_ID}/jobs?per_page=100"
  # Single-quoted jq program with the exemption clause spliced into select().
  local jq_filter='.jobs[] | select(.status == "queued" and '"${exempt_filter}"') | "\(.id)\t\(.name)"'
  gh api --paginate "$endpoint" --jq "$jq_filter"
}

# Convert a minutes value (integer or fractional, e.g. 0.5) to whole seconds.
# awk does the arithmetic because bash arithmetic only handles integers; %d
# truncates any fractional remainder. The value is handed to awk as data via -v
# instead of being spliced into the program text, so it is never parsed as awk
# code (e.g. "2 + 1" is read as the number 2, not evaluated as an expression).
# Arguments: $1 - minutes
# Outputs:   whole seconds on stdout
minutes_to_seconds() {
  awk -v minutes="$1" 'BEGIN { printf "%d", minutes * 60 }'
}

# Deadline (in seconds of script runtime) after which still-queued jobs are
# cancelled, and the pause between successive polls of the jobs API.
limit_seconds=$(minutes_to_seconds "$QUEUE_TIMEOUT_MINUTES")
poll_interval=15

if [ -z "$stuck" ]; then
echo "No stuck queued jobs found."
exit 0
fi
# Poll until either no sibling job is queued (exit 0 immediately) or the
# deadline passes (fall through with $stuck holding the still-queued jobs).
# Elapsed time comes from bash's $SECONDS variable, so no external clock
# command is needed on each iteration.
while true; do
  stuck="$(queued_siblings)"

  # Nothing left in `queued`: every sibling was scheduled, so we are done.
  [ -n "$stuck" ] || {
    echo "No queued sibling jobs — all scheduled. Nothing to cancel."
    exit 0
  }

  # Deadline reached with jobs still queued: leave the loop to cancel them.
  if [ "$SECONDS" -ge "$limit_seconds" ]; then
    echo "Timeout (${QUEUE_TIMEOUT_MINUTES}m) reached; cancelling jobs still stuck in queued:"
    break
  fi

  sleep "${poll_interval}"
done

while IFS=$'\t' read -r job_id job_name; do
echo "Cancelling stuck queued job: ${job_name} (id=${job_id})"
Expand Down