Cloud Monitor #30917

Workflow file for this run

name: Cloud Monitor
on:
  workflow_dispatch:
  schedule:
    # Run every 30 minutes
    - cron: '*/30 * * * *'
jobs:
  # AWS Cloud Monitor Job
  aws-monitor:
    name: AWS Cloud Monitor
    runs-on: ubuntu-22.04
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Kubernetes command line tools
        run: |
          curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          aws --version
          kubectl version --client=true
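      # NOTE: The step below flags any nodegroup whose status is not ACTIVE
      # (e.g. CREATING, DEGRADED, or DELETING). The shell truncates exit code
      # 911 to 143, but any non-zero value is enough to fail the step.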
      - name: Check EKS cluster nodegroups
        if: always()
        run: |
          nodeGroupNames=$(aws eks list-nodegroups --cluster-name zephyr-alpha | jq -r '.nodegroups[]')
          isFailed="no"
          for nodeGroupName in $nodeGroupNames; do
            nodeGroup=$(aws eks describe-nodegroup --cluster-name zephyr-alpha --nodegroup-name $nodeGroupName)
            nodeGroupStatus=$(echo "$nodeGroup" | jq -r '.nodegroup.status')
            if [ "$nodeGroupStatus" != 'ACTIVE' ]; then
              .github/log.sh ERROR "aws: zephyr-alpha: Found nodegroup '${nodeGroupName}' with inactive status."
              isFailed="yes"
            fi
          done
          if [ "$isFailed" = "yes" ]; then
            exit 911
          fi
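      # NOTE: update-kubeconfig writes the cluster credentials to ./config;
      # exporting KUBECONFIG through $GITHUB_ENV makes that path visible to
      # every subsequent step in this job.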
      - name: Get EKS cluster configuration
        if: always()
        run: |
          aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config
          echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
      - name: Check Kubernetes nodes
        if: always()
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "aws: zephyr-alpha: Found ${notReadyCount} node(s) with not-ready status."
            exit 911
          fi
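      # NOTE: failure() runs this step only when an earlier step in the job
      # has failed; log.sh presumably forwards the message to the Discord
      # webhook configured in env above.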
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "aws: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"
  # Centrinix Cloud Monitor Job
  # cnx-monitor:
  #   name: Centrinix Cloud Monitor
  #   runs-on: ubuntu-22.04
  #   env:
  #     DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
  #     OS_AUTH_TYPE: "v3applicationcredential"
  #     OS_AUTH_URL: "https://openstack.gumi.centrinix.cloud:5000"
  #     OS_IDENTITY_API_VERSION: "3"
  #     OS_REGION_NAME: "Gumi"
  #     OS_INTERFACE: "public"
  #     OS_APPLICATION_CREDENTIAL_ID: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_ID }}"
  #     OS_APPLICATION_CREDENTIAL_SECRET: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_SECRET }}"
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@v4
  #     - name: Install OpenVPN client
  #       run: |
  #         sudo apt update
  #         sudo apt install -y openvpn
  #     - name: Install OpenStack command line tools
  #       run: |
  #         pip install python-openstackclient
  #         pip install python-magnumclient
  #     - name: Install Kubernetes command line tools
  #       run: |
  #         curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
  #         sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
  #     - name: Check environment
  #       run: |
  #         jq --version
  #         openvpn --version
  #         openstack --version
  #         kubectl version --client=true
  #     - name: Load SSH key
  #       uses: webfactory/ssh-agent@v0.9.0
  #       with:
  #         ssh-private-key: ${{ secrets.CNX_ZEPHYR_CI_SSH_KEY }}
  #     - name: Connect to Centrinix CGN VPN
  #       uses: kota65535/github-openvpn-connect-action@v3
  #       with:
  #         config_file: .github/cnx-cgn.ovpn
  #         username: ${{ secrets.CNX_CGNVPN_USERNAME }}
  #         password: ${{ secrets.CNX_CGNVPN_PASSWORD }}
  #     - name: Check OpenStack server instances
  #       if: always()
  #       run: |
  #         openstack server list
  #         serverList=$(openstack server list -f json)
  #         nonActiveCount=$(echo "$serverList" | jq -r '
  #           [
  #             .[] |
  #             select(.Status != "ACTIVE")
  #           ] | length')
  #         #
  #         # NOTE: OpenStack active server instance count check is disabled because
  #         # CNX is no longer the main hosting provider.
  #         #
  #         # if [ "$nonActiveCount" -gt "0" ]; then
  #         #   .github/log.sh ERROR "cnx: Found ${nonActiveCount} server(s) with non-active status."
  #         #   exit 911
  #         # fi
  #     - name: Check OpenStack COE cluster nodegroups
  #       if: always()
  #       run: |
  #         openstack coe nodegroup list zephyr-ci
  #         nodeGroupList=$(openstack coe nodegroup list zephyr-ci -f json)
  #         incompleteCount=$(echo "$nodeGroupList" | jq -r '
  #           [
  #             .[] |
  #             select(
  #               .status != "CREATE_COMPLETE" and
  #               .status != "UPDATE_COMPLETE")
  #           ] | length')
  #         #
  #         # NOTE: OpenStack complete COE cluster nodegroup check is disabled
  #         # because CNX is no longer the main hosting provider.
  #         #
  #         # if [ "$incompleteCount" -gt "0" ]; then
  #         #   .github/log.sh ERROR "cnx: zephyr-ci: Found ${incompleteCount} nodegroup(s) with incomplete status."
  #         #   exit 911
  #         # fi
  #     - name: Get OpenStack COE cluster configuration
  #       if: always()
  #       run: |
  #         for ((i=0; i<10; ++i)); do
  #           openstack coe cluster config zephyr-ci || true
  #           [ -f config ] && break
  #         done
  #         echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
  #     - name: Check Kubernetes nodes
  #       if: always()
  #       run: |
  #         kubectl get nodes
  #         nodeList=$(kubectl get nodes -o json)
  #         notReadyCount=$(echo "$nodeList" | jq -r '
  #           [
  #             .items[] |
  #             select(.spec.unschedulable == null) |
  #             .status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status != "True")
  #           ] | length')
  #         # NOTE: Not-ready Kubernetes node count check threshold has been set to
  #         # 80 because CNX is no longer the main hosting provider and is
  #         # operating at a reduced capacity.
  #         if [ "$notReadyCount" -gt "80" ]; then
  #           .github/log.sh ERROR "cnx: zephyr-ci: Found ${notReadyCount} node(s) with not-ready status."
  #           exit 911
  #         fi
  #     #
  #     # NOTE: KeyDB cache pod check is disabled because CNX is no longer the main
  #     # hosting provider and cache nodes are no longer active on it.
  #     #
  #     # - name: Check KeyDB cache pods
  #     #   if: always()
  #     #   run: |
  #     #     kubectl -n keydb-cache get pods
  #     #     podList=$(kubectl -n keydb-cache get pods -o json)
  #     #     readyCount=$(echo "$podList" | jq -r '
  #     #       [
  #     #         .items[].status.conditions[] |
  #     #         select(.type == "Ready") |
  #     #         select(.status == "True")
  #     #       ] | length')
  #     #     if [ "$readyCount" -lt "3" ]; then
  #     #       .github/log.sh ERROR "cnx: zephyr-ci: Found ${readyCount} KeyDB cache pod with ready status (expected 3)."
  #     #       exit 911
  #     #     fi
  #     - name: Check Actions Runner Controller pods
  #       if: always()
  #       run: |
  #         kubectl -n arc-systems get pods
  #         podList=$(kubectl -n arc-systems get pods -o json)
  #         notReadyCount=$(echo "$podList" | jq -r '
  #           [
  #             .items[].status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status != "True")
  #           ] | length')
  #         if [ "$notReadyCount" -gt "0" ]; then
  #           .github/log.sh ERROR "cnx: zephyr-ci: Found ${notReadyCount} ARC pods with not-ready status."
  #           exit 911
  #         fi
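  #     # NOTE: For each Released PV stuck with "Volume in use"
  #     # VolumeFailedDelete events, the step below SSHes to the node that
  #     # owns the volume and removes the backing rawfile disk image so the
  #     # PV can finally be reclaimed.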
  #     - name: Clean up stuck PVs
  #       if: always()
  #       run: |
  #         releasedPvList=$(kubectl get pv -o json | jq '
  #           .items[] |
  #           select(.status.phase == "Released")')
  #         for pvName in $(echo "$releasedPvList" | jq -r '.metadata.name'); do
  #           pvEventList=$(kubectl -n default get events \
  #             --field-selector involvedObject.name=$pvName -o json)
  #           pvFailedDeleteVolumeInUseCount=$(echo "$pvEventList" | jq -r '
  #             [
  #               .items[] |
  #               select(.reason == "VolumeFailedDelete") |
  #               select(.message | contains("Volume in use"))
  #             ] | length')
  #           if [ "$pvFailedDeleteVolumeInUseCount" -gt "0" ]; then
  #             pvNodeName=$(echo "$releasedPvList" | jq -r '
  #               select(.metadata.name == '\"$pvName\"') |
  #               .spec.nodeAffinity.required.nodeSelectorTerms[].matchExpressions[] |
  #               select(.key == "hostname") |
  #               .values |
  #               first
  #             ')
  #             pvNodeIp=$(kubectl get node $pvNodeName -o json | jq -r '
  #               .status.addresses[] |
  #               select(.type == "ExternalIP") |
  #               .address')
  #             ssh -T -o LogLevel=ERROR -o StrictHostKeyChecking=no \
  #               core@$pvNodeIp \
  #               -t 'sudo rm -fv /var/lib/containerd/openebs/rawfile/'\"$pvName\"'/disk.img'
  #             .github/log.sh INFO "cnx: zephyr-ci: Cleaned up stuck PV '$pvName' on '$pvNodeName' ($pvNodeIp)"
  #           fi
  #         done
  #     - name: Report failure
  #       if: failure()
  #       run: |
  #         jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
  #         .github/log.sh ERROR "cnx: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"
  # Hetzner Cloud Monitor Job
  hzr-monitor:
    name: Hetzner Cloud Monitor
    runs-on: ubuntu-22.04
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install WireGuard client
        run: |
          sudo apt update
          sudo apt install -y wireguard openresolv
      - name: Install Kubernetes command line tools
        run: |
          curl -LO https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true
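      # NOTE: The step below writes the WireGuard config from the secret and
      # brings up the wg0 tunnel; openresolv is installed above so wg-quick
      # can apply any DNS settings in the config.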
      - name: Connect to Hetzner VPN
        run: |
          sudo sh -c 'echo "${{ secrets.HZR_WG_CONF }}" > /etc/wireguard/wg0.conf'
          sudo wg-quick up wg0
      - name: Set up Kubernetes cluster configuration
        run: |
          echo '${{ secrets.HZR_KUBECONFIG }}' > config
          echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
      - name: Check Kubernetes nodes
        if: always()
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} node(s) with not-ready status."
            exit 911
          fi
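      # NOTE: The step below expects all three KeyDB cache pods to be ready;
      # presumably the cache runs as three replicas.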
      - name: Check KeyDB cache pods
        if: always()
        run: |
          kubectl -n keydb-cache get pods
          podList=$(kubectl -n keydb-cache get pods -o json)
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          if [ "$readyCount" -lt "3" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${readyCount} KeyDB cache pod(s) with ready status (expected 3)."
            exit 911
          fi
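      # NOTE: If the Actions Runner Controller (ARC) system pods are not
      # ready, CI runner scaling can stall, so any not-ready pod is treated
      # as a failure.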
      - name: Check Actions Runner Controller pods
        if: always()
        run: |
          kubectl -n arc-systems get pods
          podList=$(kubectl -n arc-systems get pods -o json)
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} ARC pod(s) with not-ready status."
            exit 911
          fi
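      # NOTE: Ephemeral runners that end up in the Failed phase can linger,
      # so the step below extracts their runner names with jq and deletes
      # them; this is a warning, not a job failure.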
      - name: Clean up failed ephemeral runners
        if: always()
        run: |
          kubectl -n arc-runners get ephemeralrunners
          runnerList=$(kubectl -n arc-runners get ephemeralrunners -o json)
          failedCount=$(echo "$runnerList" | jq -r '
            [
              .items[].status |
              select(.phase == "Failed")
            ] | length')
          if [ "$failedCount" -gt "0" ]; then
            .github/log.sh WARN "hzr: ci-main: Found ${failedCount} ephemeral runner(s) with failed status."
            echo "$runnerList" |
              jq -r '.items[].status | select(.phase == "Failed").runnerName' |
              xargs kubectl -n arc-runners delete ephemeralrunner
            .github/log.sh INFO "hzr: ci-main: Cleaned up ${failedCount} ephemeral runner(s) with failed status."
          fi
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "hzr: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"