Cloud Monitor #30917
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cloud Monitor
#
# Periodically checks the health of the CI cloud infrastructure. Each job
# below covers one cloud provider and reports problems through
# `.github/log.sh`.
name: Cloud Monitor

on:
  # Allow the monitor to be started manually. `workflow_dispatch` accepts no
  # branch filter (only `inputs`), so none is specified here.
  workflow_dispatch:
  schedule:
    # Run every 30 minutes
    - cron: '*/30 * * * *'

jobs:
| # AWS Cloud Monitor Job | |
| aws-monitor: | |
| name: AWS Cloud Monitor | |
| runs-on: ubuntu-22.04 | |
| env: | |
| DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}" | |
| AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}" | |
| AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}" | |
| AWS_DEFAULT_REGION: "us-east-2" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Install Kubernetes command line tools | |
| run: | | |
| curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl | |
| sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl | |
| - name: Check environment | |
| run: | | |
| jq --version | |
| aws --version | |
| kubectl version --client=true | |
| - name: Check EKS cluster nodegroups | |
| if: always() | |
| run: | | |
| nodeGroupNames=$(aws eks list-nodegroups --cluster-name zephyr-alpha | jq -r '.nodegroups[]') | |
| isFailed="no" | |
| for nodeGroupName in $nodeGroupNames; do | |
| nodeGroup=$(aws eks describe-nodegroup --cluster-name zephyr-alpha --nodegroup-name $nodeGroupName) | |
| nodeGroupStatus=$(echo "$nodeGroup" | jq -r '.nodegroup.status') | |
| if [ "$nodeGroupStatus" != 'ACTIVE' ]; then | |
| .github/log.sh ERROR "aws: zephyr-alpha: Found nodegroup '${nodeGroupName}' with inactive status." | |
| isFailed="yes" | |
| fi | |
| done | |
| if [ "$isFailed" = "yes" ]; then | |
| exit 911 | |
| fi | |
| - name: Get EKS cluster configuration | |
| if: always() | |
| run: | | |
| aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config | |
| echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV | |
| - name: Check Kubernetes nodes | |
| if: always() | |
| run: | | |
| kubectl get nodes | |
| nodeList=$(kubectl get nodes -o json) | |
| notReadyCount=$(echo "$nodeList" | jq -r ' | |
| [ | |
| .items[] | | |
| select(.spec.unschedulable == null) | | |
| .status.conditions[] | | |
| select(.type == "Ready") | | |
| select(.status != "True") | |
| ] | length') | |
| if [ "$notReadyCount" -gt "0" ]; then | |
| .github/log.sh ERROR "aws: zephyr-alpha: Found ${notReadyCount} node(s) with not-ready status." | |
| exit 911 | |
| fi | |
| - name: Report failure | |
| if: failure() | |
| run: | | |
| jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| .github/log.sh ERROR "aws: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}" | |
  # Centrinix Cloud Monitor Job
  #
  # NOTE: This entire job is currently disabled (commented out) and is kept
  #       here for reference only. Lines that are commented twice (`# #`)
  #       were already disabled inside the job; the notes next to them say
  #       that CNX is no longer the main hosting provider.
  #cnx-monitor:
  #  name: Centrinix Cloud Monitor
  #  runs-on: ubuntu-22.04
  #  env:
  #    DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}"
  #    OS_AUTH_TYPE: "v3applicationcredential"
  #    OS_AUTH_URL: "https://openstack.gumi.centrinix.cloud:5000"
  #    OS_IDENTITY_API_VERSION: "3"
  #    OS_REGION_NAME: "Gumi"
  #    OS_INTERFACE: "public"
  #    OS_APPLICATION_CREDENTIAL_ID: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_ID }}"
  #    OS_APPLICATION_CREDENTIAL_SECRET: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_SECRET }}"
  #  steps:
  #    - name: Checkout
  #      uses: actions/checkout@v4
  #    - name: Install OpenVPN client
  #      run: |
  #        sudo apt update
  #        sudo apt install -y openvpn
  #    - name: Install OpenStack command line tools
  #      run: |
  #        pip install python-openstackclient
  #        pip install python-magnumclient
  #    - name: Install Kubernetes command line tools
  #      run: |
  #        curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
  #        sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
  #    - name: Check environment
  #      run: |
  #        jq --version
  #        openvpn --version
  #        openstack --version
  #        kubectl version --client=true
  #    - name: Load SSH key
  #      uses: webfactory/ssh-agent@v0.9.0
  #      with:
  #        ssh-private-key: ${{ secrets.CNX_ZEPHYR_CI_SSH_KEY }}
  #    - name: Connect to Centrinix CGN VPN
  #      uses: kota65535/github-openvpn-connect-action@v3
  #      with:
  #        config_file: .github/cnx-cgn.ovpn
  #        username: ${{ secrets.CNX_CGNVPN_USERNAME }}
  #        password: ${{ secrets.CNX_CGNVPN_PASSWORD }}
  #    - name: Check OpenStack server instances
  #      if: always()
  #      run: |
  #        openstack server list
  #        serverList=$(openstack server list -f json)
  #        nonActiveCount=$(echo "$serverList" | jq -r '
  #          [
  #            .[] |
  #            select(.Status != "ACTIVE")
  #          ] | length')
  #        #
  #        # NOTE: OpenStack active server instance count check is disabled because
  #        #       CNX is no longer the main hosting provider.
  #        #
  #        # if [ "$nonActiveCount" -gt "0" ]; then
  #        #   .github/log.sh ERROR "cnx: Found ${nonActiveCount} server(s) with non-active status."
  #        #   exit 911
  #        # fi
  #    - name: Check OpenStack COE cluster nodegroups
  #      if: always()
  #      run: |
  #        openstack coe nodegroup list zephyr-ci
  #        nodeGroupList=$(openstack coe nodegroup list zephyr-ci -f json)
  #        incompleteCount=$(echo "$nodeGroupList" | jq -r '
  #          [
  #            .[] |
  #            select(
  #              .status != "CREATE_COMPLETE" and
  #              .status != "UPDATE_COMPLETE")
  #          ] | length')
  #        #
  #        # NOTE: OpenStack complete COE cluster nodegroup check is disabled
  #        #       because CNX is no longer the main hosting provider.
  #        #
  #        # if [ "$incompleteCount" -gt "0" ]; then
  #        #   .github/log.sh ERROR "cnx: zephyr-ci: Found ${incompleteCount} nodegroup(s) with incomplete status."
  #        #   exit 911
  #        # fi
  #    - name: Get OpenStack COE cluster configuration
  #      if: always()
  #      run: |
  #        for ((i=0; i<10; ++i)); do
  #          openstack coe cluster config zephyr-ci || true
  #          [ -f config ] && break
  #        done
  #        echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
  #    - name: Check Kubernetes nodes
  #      if: always()
  #      run: |
  #        kubectl get nodes
  #        nodeList=$(kubectl get nodes -o json)
  #        notReadyCount=$(echo "$nodeList" | jq -r '
  #          [
  #            .items[] |
  #            select(.spec.unschedulable == null) |
  #            .status.conditions[] |
  #            select(.type == "Ready") |
  #            select(.status != "True")
  #          ] | length')
  #        # NOTE: Not-ready Kubernetes node count check threshold has been set to
  #        #       80 because CNX is no longer the main hosting provider and is
  #        #       operating at a reduced capacity.
  #        if [ "$notReadyCount" -gt "80" ]; then
  #          .github/log.sh ERROR "cnx: zephyr-ci: Found ${notReadyCount} node(s) with not-ready status."
  #          exit 911
  #        fi
  #    #
  #    # NOTE: KeyDB cache pod check is disabled because CNX is no longer the main
  #    #       hosting provider and cache nodes are no longer active on it.
  #    #
  #    # - name: Check KeyDB cache pods
  #    #   if: always()
  #    #   run: |
  #    #     kubectl -n keydb-cache get pods
  #    #     podList=$(kubectl -n keydb-cache get pods -o json)
  #    #     readyCount=$(echo "$podList" | jq -r '
  #    #       [
  #    #         .items[].status.conditions[] |
  #    #         select(.type == "Ready") |
  #    #         select(.status == "True")
  #    #       ] | length')
  #    #     if [ "$readyCount" -lt "3" ]; then
  #    #       .github/log.sh ERROR "cnx: zephyr-ci: Found ${readyCount} KeyDB cache pod with ready status (expected 3)."
  #    #       exit 911
  #    #     fi
  #    - name: Check Actions Runner Controller pods
  #      if: always()
  #      run: |
  #        kubectl -n arc-systems get pods
  #        podList=$(kubectl -n arc-systems get pods -o json)
  #        notReadyCount=$(echo "$podList" | jq -r '
  #          [
  #            .items[].status.conditions[] |
  #            select(.type == "Ready") |
  #            select(.status != "True")
  #          ] | length')
  #        if [ "$notReadyCount" -gt "0" ]; then
  #          .github/log.sh ERROR "cnx: zephyr-ci: Found ${notReadyCount} ARC pods with not-ready status."
  #          exit 911
  #        fi
  #    - name: Clean up stuck PVs
  #      if: always()
  #      run: |
  #        releasedPvList=$(kubectl get pv -o json | jq '
  #          .items[] |
  #          select(.status.phase == "Released")')
  #        for pvName in $(echo "$releasedPvList" | jq -r '.metadata.name'); do
  #          pvEventList=$(kubectl -n default get events \
  #            --field-selector involvedObject.name=$pvName -o json)
  #          pvFailedDeleteVolumeInUseCount=$(echo "$pvEventList" | jq -r '
  #            [
  #              .items[] |
  #              select(.reason == "VolumeFailedDelete") |
  #              select(.message | contains("Volume in use"))
  #            ] | length')
  #          if [ "$pvFailedDeleteVolumeInUseCount" -gt "0" ]; then
  #            pvNodeName=$(echo "$releasedPvList" | jq -r '
  #              select(.metadata.name == '\"$pvName\"') |
  #              .spec.nodeAffinity.required.nodeSelectorTerms[].matchExpressions[] |
  #              select(.key == "hostname") |
  #              .values |
  #              first
  #              ')
  #            pvNodeIp=$(kubectl get node $pvNodeName -o json | jq -r '
  #              .status.addresses[] |
  #              select(.type == "ExternalIP") |
  #              .address')
  #            ssh -T -o LogLevel=ERROR -o StrictHostKeyChecking=no \
  #              core@$pvNodeIp \
  #              -t 'sudo rm -fv /var/lib/containerd/openebs/rawfile/'\"$pvName\"'/disk.img'
  #            .github/log.sh INFO "cnx: zephyr-ci: Cleaned up stuck PV '$pvName' on '$pvNodeName' ($pvNodeIp)"
  #          fi
  #        done
  #    - name: Report failure
  #      if: failure()
  #      run: |
  #        jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
  #        .github/log.sh ERROR "cnx: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}"

| # Hetzner Cloud Monitor Job | |
| hzr-monitor: | |
| name: Hetzner Cloud Monitor | |
| runs-on: ubuntu-22.04 | |
| env: | |
| DISCORD_INFRA_MONITOR_WEBHOOK: "${{ secrets.DISCORD_INFRA_MONITOR_WEBHOOK }}" | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v4 | |
| - name: Install WireGuard client | |
| run: | | |
| sudo apt install wireguard openresolv | |
| - name: Install Kubernetes command line tools | |
| run: | | |
| curl -LO https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl | |
| sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl | |
| - name: Check environment | |
| run: | | |
| jq --version | |
| wg --version | |
| kubectl version --client=true | |
| - name: Connect to Hetzner VPN | |
| run: | | |
| sudo sh -c 'echo "${{ secrets.HZR_WG_CONF }}" > /etc/wireguard/wg0.conf' | |
| sudo wg-quick up wg0 | |
| - name: Set up Kubernetes cluster configuration | |
| run: | | |
| echo '${{ secrets.HZR_KUBECONFIG }}' > config | |
| echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV | |
| - name: Check Kubernetes nodes | |
| if: always() | |
| run: | | |
| kubectl get nodes | |
| nodeList=$(kubectl get nodes -o json) | |
| notReadyCount=$(echo "$nodeList" | jq -r ' | |
| [ | |
| .items[] | | |
| select(.spec.unschedulable == null) | | |
| .status.conditions[] | | |
| select(.type == "Ready") | | |
| select(.status != "True") | |
| ] | length') | |
| if [ "$notReadyCount" -gt "0" ]; then | |
| .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} node(s) with not-ready status." | |
| exit 911 | |
| fi | |
| - name: Check KeyDB cache pods | |
| if: always() | |
| run: | | |
| kubectl -n keydb-cache get pods | |
| podList=$(kubectl -n keydb-cache get pods -o json) | |
| readyCount=$(echo "$podList" | jq -r ' | |
| [ | |
| .items[].status.conditions[] | | |
| select(.type == "Ready") | | |
| select(.status == "True") | |
| ] | length') | |
| if [ "$readyCount" -lt "3" ]; then | |
| .github/log.sh ERROR "hzr: ci-main: Found ${readyCount} KeyDB cache pod with ready status (expected 3)." | |
| exit 911 | |
| fi | |
| - name: Check Actions Runner Controller pods | |
| if: always() | |
| run: | | |
| kubectl -n arc-systems get pods | |
| podList=$(kubectl -n arc-systems get pods -o json) | |
| notReadyCount=$(echo "$podList" | jq -r ' | |
| [ | |
| .items[].status.conditions[] | | |
| select(.type == "Ready") | | |
| select(.status != "True") | |
| ] | length') | |
| if [ "$notReadyCount" -gt "0" ]; then | |
| .github/log.sh ERROR "hzr: ci-main: Found ${notReadyCount} ARC pods with not-ready status." | |
| exit 911 | |
| fi | |
| - name: Clean up failed ephemeral runners | |
| if: always() | |
| run: | | |
| kubectl -n arc-runners get ephemeralrunners | |
| runnerList=$(kubectl -n arc-runners get ephemeralrunners -o json) | |
| failedCount=$(echo "$runnerList" | jq -r ' | |
| [ | |
| .items[].status | | |
| select(.phase == "Failed") | |
| ] | length') | |
| if [ "$failedCount" -gt "0" ]; then | |
| .github/log.sh WARN "hzr: ci-main: Found ${failedCount} ephemeral runners with failed status." | |
| echo "$runnerList" | | |
| jq -r '.items[].status | select(.phase == "Failed").runnerName' | | |
| xargs kubectl delete ephemeralrunner | |
| .github/log.sh INFO "hzr: ci-main: Cleaned up ${failedCount} ephemeral runners with failed status." | |
| fi | |
| - name: Report failure | |
| if: failure() | |
| run: | | |
| jobUrl="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | |
| .github/log.sh ERROR "hzr: Cloud monitor job ${{ github.run_id }} failed.\n${jobUrl}" |