# Status Publisher #86806 (page header text captured together with the workflow source)

name: Status Publisher

# Triggers:
#   * workflow_dispatch - manual runs. This event accepts only `inputs`; it has
#     no branch filter, so the former `branches: [main]` entry was dropped.
#   * schedule          - periodic runs driven by the cron expression below.
on:
  workflow_dispatch:
  schedule:
    # Run every 5 minutes
    - cron: '*/5 * * * *'
jobs:
# AWS Cloud Status Publisher Job
#
# Queries the `zephyr-alpha` EKS cluster with kubectl, writes one
# `status.aws_*.json` file per monitored component and copies those files to
# the `statuspage-data` S3 bucket.
  aws-status-publisher:
    name: AWS Cloud Status Publisher
    runs-on: ubuntu-22.04
    env:
      # Credentials and region picked up by the `aws` CLI in every step.
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Download a pinned kubectl binary and install it system-wide.
      - name: Install Kubernetes command line tools
        run: |
          curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

      # Print tool versions so they show up in the job log.
      - name: Check environment
        run: |
          jq --version
          aws --version
          kubectl version --client=true

      # Write the cluster kubeconfig to ./config and export KUBECONFIG so that
      # the following steps use it.
      - name: Get EKS cluster configuration
        run: |
          aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config
          echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV

      # Count nodes without `.spec.unschedulable` whose Ready condition is not
      # "True", then map the count to a status:
      #   > 3 majorOutage, > 2 partialOutage, > 1 degradedPerformance,
      #   otherwise operational.
      # `rawData` carries the plain `kubectl get nodes` table as a JSON string.
      - name: Check Kubernetes Cluster status
        if: always()
        run: |
          nodeList=$(kubectl get nodes -o json)
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "3" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "1" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_kubernetes_cluster.json
          {
          "status": "${status}",
          "rawData": $(kubectl get nodes | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Read `.status.health` of the `main` Elasticsearch resource in the
      # `elastic-stack` namespace; only "red" is reported as an outage.
      - name: Check Elasticsearch status
        if: always()
        run: |
          esData=$(kubectl -n elastic-stack get elasticsearches main -o json)
          esHealth=$(echo "$esData" | jq -r '.status.health')
          if [ "$esHealth" == "red" ]; then
            status="majorOutage"
          # NOTE: We are intentionally running Elasticsearch with reduced
          #       replicas (i.e. yellow) for cost saving reasons.
          # elif [ "$esHealth" == "yellow" ]; then
          #   status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_elasticsearch.json
          {
          "status": "${status}",
          "rawData": $(kubectl -n elastic-stack get elasticsearches main | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Read `.status.health` of the `main` Kibana resource:
      #   red -> majorOutage, yellow -> partialOutage, anything else -> operational.
      - name: Check Kibana status
        if: always()
        run: |
          kbData=$(kubectl -n elastic-stack get kibanas main -o json)
          kbHealth=$(echo "$kbData" | jq -r '.status.health')
          if [ "$kbHealth" == "red" ]; then
            status="majorOutage"
          elif [ "$kbHealth" == "yellow" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_kibana.json
          {
          "status": "${status}",
          "rawData": $(kubectl -n elastic-stack get kibanas main | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Copy only the generated status.*.json files from the workspace.
      - name: Upload status data to S3 bucket
        if: always()
        run: |
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'
# Centrinix Cloud Status Publisher Job (disabled: the whole job below is commented out)
# cnx-status-publisher:
# name: Centrinix Cloud Status Publisher
# runs-on: ubuntu-22.04
# env:
# AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
# AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
# AWS_DEFAULT_REGION: "us-east-2"
# OS_AUTH_TYPE: "v3applicationcredential"
# OS_AUTH_URL: "https://openstack.gumi.centrinix.cloud:5000"
# OS_IDENTITY_API_VERSION: "3"
# OS_REGION_NAME: "Gumi"
# OS_INTERFACE: "public"
# OS_APPLICATION_CREDENTIAL_ID: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_ID }}"
# OS_APPLICATION_CREDENTIAL_SECRET: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_SECRET }}"
# steps:
# - name: Checkout
# uses: actions/checkout@v4
# - name: Install OpenVPN client
# run: |
# sudo apt update
# sudo apt install -y openvpn
# - name: Install OpenStack command line tools
# run: |
# pip install python-openstackclient
# pip install python-magnumclient
# - name: Install Kubernetes command line tools
# run: |
# curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
# sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
# - name: Check environment
# run: |
# jq --version
# openvpn --version
# openstack --version
# kubectl version --client=true
# - name: Load SSH key
# uses: webfactory/ssh-agent@v0.9.0
# with:
# ssh-private-key: ${{ secrets.CNX_ZEPHYR_CI_SSH_KEY }}
# - name: Connect to Centrinix CGN VPN
# uses: kota65535/github-openvpn-connect-action@v3
# with:
# config_file: .github/cnx-cgn.ovpn
# username: ${{ secrets.CNX_CGNVPN_USERNAME }}
# password: ${{ secrets.CNX_CGNVPN_PASSWORD }}
# - name: Get OpenStack COE cluster configuration
# if: always()
# run: |
# for ((i=0; i<10; ++i)); do
# openstack coe cluster config zephyr-ci || true
# [ -f config ] && break
# done
# echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
# - name: Check Kubernetes Cluster status
# if: always()
# run: |
# nodeList=$(kubectl get nodes -o json)
# notReadyCount=$(echo "$nodeList" | jq -r '
# [
# .items[] |
# select(.spec.unschedulable == null) |
# .status.conditions[] |
# select(.type == "Ready") |
# select(.status != "True")
# ] | length')
# if [ "$notReadyCount" -gt "85" ]; then
# status="majorOutage"
# elif [ "$notReadyCount" -gt "82" ]; then
# status="partialOutage"
# elif [ "$notReadyCount" -gt "80" ]; then
# status="degradedPerformance"
# else
# status="operational"
# fi
# cat <<EOF > status.cnx_kubernetes_cluster.json
# {
# "status": "${status}",
# "rawData": $(kubectl get nodes | jq -sR),
# "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
# }
# EOF
# - name: Check KeyDB Cache status
# if: always()
# run: |
# podList=$(kubectl -n keydb-cache get pods -o json)
# readyCount=$(echo "$podList" | jq -r '
# [
# .items[].status.conditions[] |
# select(.type == "Ready") |
# select(.status == "True")
# ] | length')
# if [ "$readyCount" -lt "1" ]; then
# status="inactive"
# elif [ "$readyCount" -lt "2" ]; then
# status="majorOutage"
# elif [ "$readyCount" -lt "3" ]; then
# status="partialOutage"
# else
# status="operational"
# fi
# cat <<EOF > status.cnx_keydb_cache.json
# {
# "status": "${status}",
# "rawData": $(kubectl -n keydb-cache get pods | jq -sR),
# "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
# }
# EOF
# - name: Check Actions Runner Controller status
# if: always()
# run: |
# podList=$(kubectl -n arc-systems get pods -o json)
# notReadyCount=$(echo "$podList" | jq -r '
# [
# .items[].status.conditions[] |
# select(.type == "Ready") |
# select(.status != "True")
# ] | length')
# if [ "$notReadyCount" -gt "0" ]; then
# status="majorOutage"
# else
# status="operational"
# fi
# cat <<EOF > status.cnx_actions_runner_controller.json
# {
# "status": "${status}",
# "rawData": $(kubectl -n arc-systems get pods | jq -sR),
# "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
# }
# EOF
# - name: Check Runner Scaling Set linux-arm64-4xlarge status
# if: always()
# run: |
# podList=$(kubectl -n arc-systems get pods -o json)
# podName=$(echo "$podList" | jq -r '
# .items[] |
# select(.metadata.name | startswith("zrv2-linux-arm64-4xlarge-cnx")) |
# .metadata.name')
# if [ "$podName" != "" ]; then
# podData=$(kubectl -n arc-systems get pods ${podName} -o json)
# podStatus=$(echo "$podData" | jq -r '
# .status.conditions[] |
# select(.type == "Ready") |
# .status
# ')
# if [ "$podStatus" != "True" ]; then
# status="majorOutage"
# else
# status="operational"
# fi
# runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-cnx -o json)"
# runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
# pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
# rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
# rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-cnx)"
# else
# status="inactive"
# runningRunnerCount=0
# pendingRunnerCount=0
# rawData="$(kubectl -n arc-systems get pods)"
# fi
# cat <<EOF > status.cnx_runner_scale_set-linux_arm64_4xlarge.json
# {
# "status": "${status}",
# "runningRunnerCount": ${runningRunnerCount},
# "pendingRunnerCount": ${pendingRunnerCount},
# "rawData": $(echo "${rawData}" | jq -sR),
# "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
# }
# EOF
# - name: Check Runner Scaling Set linux-x64-4xlarge status
# if: always()
# run: |
# podList=$(kubectl -n arc-systems get pods -o json)
# podName=$(echo "$podList" | jq -r '
# .items[] |
# select(.metadata.name | startswith("zrv2-linux-x64-4xlarge-cnx")) |
# .metadata.name')
# if [ "$podName" != "" ]; then
# podData=$(kubectl -n arc-systems get pods ${podName} -o json)
# podStatus=$(echo "$podData" | jq -r '
# .status.conditions[] |
# select(.type == "Ready") |
# .status
# ')
# if [ "$podStatus" != "True" ]; then
# status="majorOutage"
# else
# status="operational"
# fi
# runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-cnx -o json)"
# runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
# pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
# rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
# rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-cnx)"
# else
# status="inactive"
# runningRunnerCount=0
# pendingRunnerCount=0
# rawData="$(kubectl -n arc-systems get pods)"
# fi
# cat <<EOF > status.cnx_runner_scale_set-linux_x64_4xlarge.json
# {
# "status": "${status}",
# "runningRunnerCount": ${runningRunnerCount},
# "pendingRunnerCount": ${pendingRunnerCount},
# "rawData": $(echo "${rawData}" | jq -sR),
# "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
# }
# EOF
# - name: Upload status data to S3 bucket
# if: always()
# run: |
# aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'
# Hetzner Cloud Status Publisher Job
#
# Brings up a WireGuard tunnel, queries the Hetzner-hosted Kubernetes cluster
# with kubectl, writes one `status.hzr_*.json` file per monitored component and
# copies those files to the `statuspage-data` S3 bucket.
  hzr-status-publisher:
    name: Hetzner Cloud Status Publisher
    runs-on: ubuntu-22.04
    env:
      # Credentials and region picked up by the `aws` CLI in the upload step.
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Refresh the package index first and pass -y so the install never waits
      # for a confirmation prompt on the non-interactive runner.
      - name: Install WireGuard client
        run: |
          sudo apt-get update
          sudo apt-get install -y wireguard openresolv

      # Download a pinned kubectl binary and install it system-wide.
      - name: Install Kubernetes command line tools
        run: |
          curl -LO https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

      # Print tool versions so they show up in the job log.
      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true

      # The secret is handed over through an environment variable instead of
      # being expanded into the script text, so quotes, `$` or backticks in it
      # cannot alter the command. The file is created under umask 077 because
      # it holds the tunnel's private key.
      - name: Connect to Hetzner VPN
        env:
          HZR_WG_CONF: "${{ secrets.HZR_WG_CONF }}"
        run: |
          printf '%s\n' "$HZR_WG_CONF" | sudo sh -c 'umask 077; cat > /etc/wireguard/wg0.conf'
          sudo wg-quick up wg0

      # Write the kubeconfig secret to ./config (owner-only permissions) and
      # export KUBECONFIG so that the following steps use it.
      - name: Set up Kubernetes cluster configuration
        env:
          HZR_KUBECONFIG: "${{ secrets.HZR_KUBECONFIG }}"
        run: |
          (umask 077 && printf '%s\n' "$HZR_KUBECONFIG" > config)
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"

      # Count nodes without `.spec.unschedulable` whose Ready condition is not
      # "True", then map the count to a status:
      #   > 10 majorOutage, > 5 partialOutage, > 2 degradedPerformance,
      #   otherwise operational.
      # `rawData` carries the plain `kubectl get nodes` table as a JSON string.
      - name: Check Kubernetes Cluster status
        if: always()
        run: |
          nodeList=$(kubectl get nodes -o json)
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "10" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "5" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_kubernetes_cluster.json
          {
          "status": "${status}",
          "rawData": $(kubectl get nodes | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Count Ready conditions equal to "True" across pods in `keydb-cache`:
      #   0 -> inactive, 1 -> majorOutage, 2 -> partialOutage, 3+ -> operational.
      - name: Check KeyDB Cache status
        if: always()
        run: |
          podList=$(kubectl -n keydb-cache get pods -o json)
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          if [ "$readyCount" -lt "1" ]; then
            status="inactive"
          elif [ "$readyCount" -lt "2" ]; then
            status="majorOutage"
          elif [ "$readyCount" -lt "3" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_keydb_cache.json
          {
          "status": "${status}",
          "rawData": $(kubectl -n keydb-cache get pods | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Any pod in `arc-systems` whose Ready condition is not "True" is
      # reported as a major outage.
      - name: Check Actions Runner Controller status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            status="majorOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_actions_runner_controller.json
          {
          "status": "${status}",
          "rawData": $(kubectl -n arc-systems get pods | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Look up the `arc-systems` pod whose name starts with
      # "zrv2-linux-arm64-4xlarge-hzr". If none exists the scale set is
      # reported as inactive with zero runners; otherwise the pod's Ready
      # condition decides between operational and majorOutage, and the
      # running/pending runner counts are read from the matching
      # autoscalingrunnersets resource in `arc-runners`.
      - name: Check Runner Scaling Set linux-arm64-4xlarge status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          podName=$(echo "$podList" | jq -r '
            .items[] |
            select(.metadata.name | startswith("zrv2-linux-arm64-4xlarge-hzr")) |
            .metadata.name')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods ${podName} -o json)
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[] |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-hzr -o json)"
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
            rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-hzr)"
          else
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.hzr_runner_scale_set-linux_arm64_4xlarge.json
          {
          "status": "${status}",
          "runningRunnerCount": ${runningRunnerCount},
          "pendingRunnerCount": ${pendingRunnerCount},
          "rawData": $(echo "${rawData}" | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Same procedure as the arm64 step, for the pod and scale set named
      # "zrv2-linux-x64-4xlarge-hzr".
      - name: Check Runner Scaling Set linux-x64-4xlarge status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          podName=$(echo "$podList" | jq -r '
            .items[] |
            select(.metadata.name | startswith("zrv2-linux-x64-4xlarge-hzr")) |
            .metadata.name')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods ${podName} -o json)
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[] |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-hzr -o json)"
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
            rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-hzr)"
          else
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.hzr_runner_scale_set-linux_x64_4xlarge.json
          {
          "status": "${status}",
          "runningRunnerCount": ${runningRunnerCount},
          "pendingRunnerCount": ${pendingRunnerCount},
          "rawData": $(echo "${rawData}" | jq -sR),
          "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      # Copy only the generated status.*.json files from the workspace.
      - name: Upload status data to S3 bucket
        if: always()
        run: |
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'