Cloud Monitor #30917

Workflow file for this run

name: Cloud Monitor
on:
  workflow_dispatch:
  schedule:
    # Run every 30 minutes
    - cron: '*/30 * * * *'
jobs:
  # AWS Cloud Monitor Job
  aws-monitor:
    name: AWS Cloud Monitor
    runs-on: ubuntu-22.04
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Kubernetes command line tools
        run: |
          curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          aws --version
          kubectl version --client=true
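      # NOTE: The step below flags any nodegroup whose status is not ACTIVE
      # (e.g. CREATING, DEGRADED, or DELETING). The shell truncates exit code
      # 911 to 143, but any non-zero value is enough to fail the step.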
      - name: Check EKS cluster nodegroups
        if: always()
        run: |
          nodeGroupNames=$(aws eks list-nodegroups --cluster-name zephyr-alpha | jq -r '.nodegroups[]')
          isFailed="no"
          for nodeGroupName in $nodeGroupNames; do
            nodeGroup=$(aws eks describe-nodegroup --cluster-name zephyr-alpha --nodegroup-name $nodeGroupName)
            nodeGroupStatus=$(echo "$nodeGroup" | jq -r '.nodegroup.status')
            if [ "$nodeGroupStatus" != 'ACTIVE' ]; then
              .github/log.sh ERROR "aws: zephyr-alpha: Found nodegroup '${nodeGroupName}' with inactive status."
              isFailed="yes"
            fi
          done
          if [ "$isFailed" = "yes" ]; then
            exit 911
          fi
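      # NOTE: update-kubeconfig writes the cluster credentials to ./config;
      # exporting KUBECONFIG through $GITHUB_ENV makes that path visible to
      # every subsequent step in this job.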
      - name: Get EKS cluster configuration
        if: always()
        run: |
          aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config
          echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
      - name: Check Kubernetes nodes
        if: always()
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "aws: zephyr-alpha: Found ${notReadyCount} node(s) with not-ready status."
            exit 911
          fi
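      # NOTE: failure() runs this step only when an earlier step in the job
      # has failed; log.sh presumably forwards the message to the Discord
      # webhook configured in env above.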
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "aws: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"
  # Centrinix Cloud Monitor Job
  # cnx-monitor:
  #   name: Centrinix Cloud Monitor
  #   runs-on: ubuntu-22.04
  #   env:
  #     DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
  #     OS_AUTH_TYPE: "v3applicationcredential"
  #     OS_AUTH_URL: "https://openstack.gumi.centrinix.cloud:5000"
  #     OS_IDENTITY_API_VERSION: "3"
  #     OS_REGION_NAME: "Gumi"
  #     OS_INTERFACE: "public"
  #     OS_APPLICATION_CREDENTIAL_ID: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_ID }}"
  #     OS_APPLICATION_CREDENTIAL_SECRET: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_SECRET }}"
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@v4
  #     - name: Install OpenVPN client
  #       run: |
  #         sudo apt update
  #         sudo apt install -y openvpn
  #     - name: Install OpenStack command line tools
  #       run: |
  #         pip install python-openstackclient
  #         pip install python-magnumclient
  #     - name: Install Kubernetes command line tools
  #       run: |
  #         curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
  #         sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
  #     - name: Check environment
  #       run: |
  #         jq --version
  #         openvpn --version
  #         openstack --version
  #         kubectl version --client=true
  #     - name: Load SSH key
  #       uses: webfactory/ssh-agent@v0.9.0
  #       with:
  #         ssh-private-key: ${{ secrets.CNX_ZEPHYR_CI_SSH_KEY }}
  #     - name: Connect to Centrinix CGN VPN
  #       uses: kota65535/github-openvpn-connect-action@v3
  #       with:
  #         config_file: .github/cnx-cgn.ovpn
  #         username: ${{ secrets.CNX_CGNVPN_USERNAME }}
  #         password: ${{ secrets.CNX_CGNVPN_PASSWORD }}
  #     - name: Check OpenStack server instances
  #       if: always()
  #       run: |
  #         openstack server list
  #         serverList=$(openstack server list -f json)
  #         nonActiveCount=$(echo "$serverList" | jq -r '
  #           [
  #             .[] |
  #             select(.Status != "ACTIVE")
  #           ] | length')
  #         #
  #         # NOTE: OpenStack active server instance count check is disabled because
  #         # CNX is no longer the main hosting provider.
  #         #
  #         # if [ "$nonActiveCount" -gt "0" ]; then
  #         #   .github/log.sh ERROR "cnx: Found ${nonActiveCount} server(s) with non-active status."
  #         #   exit 911
  #         # fi
  #     - name: Check OpenStack COE cluster nodegroups
  #       if: always()
  #       run: |
  #         openstack coe nodegroup list zephyr-ci
  #         nodeGroupList=$(openstack coe nodegroup list zephyr-ci -f json)
  #         incompleteCount=$(echo "$nodeGroupList" | jq -r '
  #           [
  #             .[] |
  #             select(
  #               .status != "CREATE_COMPLETE" and
  #               .status != "UPDATE_COMPLETE")
  #           ] | length')
  #         #
  #         # NOTE: OpenStack complete COE cluster nodegroup check is disabled
  #         # because CNX is no longer the main hosting provider.
  #         #
  #         # if [ "$incompleteCount" -gt "0" ]; then
  #         #   .github/log.sh ERROR "cnx: zephyr-ci: Found ${incompleteCount} nodegroup(s) with incomplete status."
  #         #   exit 911
  #         # fi
  #     - name: Get OpenStack COE cluster configuration
  #       if: always()
  #       run: |
  #         for ((i=0; i<10; ++i)); do
  #           openstack coe cluster config zephyr-ci || true
  #           [ -f config ] && break
  #         done
  #         echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
  #     - name: Check Kubernetes nodes
  #       if: always()
  #       run: |
  #         kubectl get nodes
  #         nodeList=$(kubectl get nodes -o json)
  #         notReadyCount=$(echo "$nodeList" | jq -r '
  #           [
  #             .items[] |
  #             select(.spec.unschedulable == null) |
  #             .status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status != "True")
  #           ] | length')
  #         # NOTE: Not-ready Kubernetes node count check threshold has been set to
  #         # 80 because CNX is no longer the main hosting provider and is
  #         # operating at a reduced capacity.
  #         if [ "$notReadyCount" -gt "80" ]; then
  #           .github/log.sh ERROR "cnx: zephyr-ci: Found ${notReadyCount} node(s) with not-ready status."
  #           exit 911
  #         fi
  #     #
  #     # NOTE: KeyDB cache pod check is disabled because CNX is no longer the main
  #     # hosting provider and cache nodes are no longer active on it.
  #     #
  #     # - name: Check KeyDB cache pods
  #     #   if: always()
  #     #   run: |
  #     #     kubectl -n keydb-cache get pods
  #     #     podList=$(kubectl -n keydb-cache get pods -o json)
  #     #     readyCount=$(echo "$podList" | jq -r '
  #     #       [
  #     #         .items[].status.conditions[] |
  #     #         select(.type == "Ready") |
  #     #         select(.status == "True")
  #     #       ] | length')
  #     #     if [ "$readyCount" -lt "3" ]; then
  #     #       .github/log.sh ERROR "cnx: zephyr-ci: Found ${readyCount} KeyDB cache pod with ready status (expected 3)."
  #     #       exit 911
  #     #     fi
  #     - name: Check Actions Runner Controller pods
  #       if: always()
  #       run: |
  #         kubectl -n arc-systems get pods
  #         podList=$(kubectl -n arc-systems get pods -o json)
  #         notReadyCount=$(echo "$podList" | jq -r '
  #           [
  #             .items[].status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status != "True")
  #           ] | length')
  #         if [ "$notReadyCount" -gt "0" ]; then
  #           .github/log.sh ERROR "cnx: zephyr-ci: Found ${notReadyCount} ARC pods with not-ready status."
  #           exit 911
  #         fi
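  #     # NOTE: For each Released PV stuck with "Volume in use"
  #     # VolumeFailedDelete events, the step below SSHes to the node that
  #     # owns the volume and removes the backing rawfile disk image so the
  #     # PV can finally be reclaimed.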
  #     - name: Clean up stuck PVs
  #       if: always()
  #       run: |
  #         releasedPvList=$(kubectl get pv -o json | jq '
  #           .items[] |
  #           select(.status.phase == "Released")')
  #         for pvName in $(echo "$releasedPvList" | jq -r '.metadata.name'); do
  #           pvEventList=$(kubectl -n default get events \
  #             --field-selector involvedObject.name=$pvName -o json)
  #           pvFailedDeleteVolumeInUseCount=$(echo "$pvEventList" | jq -r '
  #             [
  #               .items[] |
  #               select(.reason == "VolumeFailedDelete") |
  #               select(.message | contains("Volume in use"))
  #             ] | length')
  #           if [ "$pvFailedDeleteVolumeInUseCount" -gt "0" ]; then
  #             pvNodeName=$(echo "$releasedPvList" | jq -r '
  #               select(.metadata.name == '\"$pvName\"') |
  #               .spec.nodeAffinity.required.nodeSelectorTerms[].matchExpressions[] |
  #               select(.key == "hostname") |
  #               .values |
  #               first
  #             ')
  #             pvNodeIp=$(kubectl get node $pvNodeName -o json | jq -r '
  #               .status.addresses[] |
  #               select(.type == "ExternalIP") |
  #               .address')
  #             ssh -T -o LogLevel=ERROR -o StrictHostKeyChecking=no \
  #               core@$pvNodeIp \
  #               -t 'sudo rm -fv /var/lib/containerd/openebs/rawfile/'\"$pvName\"'/disk.img'
  #             .github/log.sh INFO "cnx: zephyr-ci: Cleaned up stuck PV '$pvName' on '$pvNodeName' ($pvNodeIp)"
  #           fi
  #         done
  #     - name: Report failure
  #       if: failure()
  #       run: |
  #         jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
  #         .github/log.sh ERROR "cnx: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"
  # Hetzner Cloud Monitor Job
  hzr-monitor:
    name: Hetzner Cloud Monitor
    runs-on: ubuntu-22.04
    env:
      DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install WireGuard client
        run: |
          sudo apt update
          sudo apt install -y wireguard openresolv
      - name: Install Kubernetes command line tools
        run: |
          curl -LO https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true
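      # NOTE: The step below writes the WireGuard config from the secret and
      # brings up the wg0 tunnel; openresolv is installed above so wg-quick
      # can apply any DNS settings in the config.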
      - name: Connect to Hetzner VPN
        run: |
          sudo sh -c 'echo "${{ secrets.HZR_WG_CONF }}" > /etc/wireguard/wg0.conf'
          sudo wg-quick up wg0
      - name: Set up Kubernetes cluster configuration
        run: |
          echo '${{ secrets.HZR_KUBECONFIG }}' > config
          echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
      - name: Check Kubernetes nodes
        if: always()
        run: |
          kubectl get nodes
          nodeList=$(kubectl get nodes -o json)
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} node(s) with not-ready status."
            exit 911
          fi
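      # NOTE: The step below expects all three KeyDB cache pods to be ready;
      # presumably the cache runs as three replicas.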
      - name: Check KeyDB cache pods
        if: always()
        run: |
          kubectl -n keydb-cache get pods
          podList=$(kubectl -n keydb-cache get pods -o json)
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          if [ "$readyCount" -lt "3" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${readyCount} KeyDB cache pod(s) with ready status (expected 3)."
            exit 911
          fi
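      # NOTE: If the Actions Runner Controller (ARC) system pods are not
      # ready, CI runner scaling can stall, so any not-ready pod is treated
      # as a failure.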
      - name: Check Actions Runner Controller pods
        if: always()
        run: |
          kubectl -n arc-systems get pods
          podList=$(kubectl -n arc-systems get pods -o json)
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} ARC pod(s) with not-ready status."
            exit 911
          fi
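      # NOTE: Ephemeral runners that end up in the Failed phase can linger,
      # so the step below extracts their runner names with jq and deletes
      # them; this is a warning, not a job failure.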
      - name: Clean up failed ephemeral runners
        if: always()
        run: |
          kubectl -n arc-runners get ephemeralrunners
          runnerList=$(kubectl -n arc-runners get ephemeralrunners -o json)
          failedCount=$(echo "$runnerList" | jq -r '
            [
              .items[].status |
              select(.phase == "Failed")
            ] | length')
          if [ "$failedCount" -gt "0" ]; then
            .github/log.sh WARN "hzr: ci-main: Found ${failedCount} ephemeral runner(s) with failed status."
            echo "$runnerList" |
              jq -r '.items[].status | select(.phase == "Failed").runnerName' |
              xargs kubectl -n arc-runners delete ephemeralrunner
            .github/log.sh INFO "hzr: ci-main: Cleaned up ${failedCount} ephemeral runner(s) with failed status."
          fi
      - name: Report failure
        if: failure()
        run: |
          jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
          .github/log.sh ERROR "hzr: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"