# Status Publisher #86806
# NOTE: the viewer this file was copied from warned that it contains hidden or
# bidirectional Unicode text that may be interpreted or compiled differently
# than what appears below. To review, open the file in an editor that reveals
# hidden Unicode characters.
# Status Publisher workflow: each job below writes status.*.json files and
# copies them to the statuspage-data S3 bucket.
name: Status Publisher

on:
  # Manual trigger. `workflow_dispatch` only accepts `inputs`, so the
  # `branches` filter that used to be nested here has been removed.
  workflow_dispatch:
  schedule:
    # Run every 5 minutes
    - cron: '*/5 * * * *'

# The jobs authenticate to AWS and the clusters with their own secrets; the
# workflow token is only needed by the checkout step.
permissions:
  contents: read

jobs:
# AWS Cloud Status Publisher Job
#
# Queries the "zephyr-alpha" EKS cluster (nodes, Elasticsearch, Kibana), writes
# one status.aws_*.json file per component and copies those files to the
# statuspage-data S3 bucket.
  aws-status-publisher:
    name: AWS Cloud Status Publisher
    runs-on: ubuntu-22.04
    env:
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Kubernetes command line tools
        run: |
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as "kubectl"; the published SHA-256 is verified before the
          # binary is installed.
          kubectl_url="https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl"
          curl -fsSLO "${kubectl_url}"
          curl -fsSLO "${kubectl_url}.sha256"
          echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

      - name: Check environment
        run: |
          jq --version
          aws --version
          kubectl version --client=true

      - name: Get EKS cluster configuration
        run: |
          # Write the kubeconfig into the workspace and export its path so the
          # following steps' kubectl calls use it.
          aws eks update-kubeconfig --name zephyr-alpha --kubeconfig config
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"

      - name: Check Kubernetes Cluster status
        if: always()
        run: |
          nodeList=$(kubectl get nodes -o json)
          # Count nodes without .spec.unschedulable whose "Ready" condition is
          # anything other than "True".
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "3" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "1" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          # rawData is the human-readable `kubectl get nodes` table, encoded as
          # a single JSON string by `jq -sR`.
          cat <<EOF > status.aws_kubernetes_cluster.json
          {
            "status": "${status}",
            "rawData": $(kubectl get nodes | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Check Elasticsearch status
        if: always()
        run: |
          esData=$(kubectl -n elastic-stack get elasticsearches main -o json)
          esHealth=$(echo "$esData" | jq -r '.status.health')
          if [ "$esHealth" == "red" ]; then
            status="majorOutage"
          # NOTE: We are intentionally running Elasticsearch with reduced
          #       replicas (i.e. yellow) for cost saving reasons.
          # elif [ "$esHealth" == "yellow" ]; then
          #   status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_elasticsearch.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n elastic-stack get elasticsearches main | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Check Kibana status
        if: always()
        run: |
          kbData=$(kubectl -n elastic-stack get kibanas main -o json)
          kbHealth=$(echo "$kbData" | jq -r '.status.health')
          if [ "$kbHealth" == "red" ]; then
            status="majorOutage"
          elif [ "$kbHealth" == "yellow" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.aws_kibana.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n elastic-stack get kibanas main | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Upload status data to S3 bucket
        if: always()
        run: |
          # Only files matching status.*.json are copied; everything else in
          # the workspace is excluded.
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'
# Centrinix Cloud Status Publisher Job
# NOTE: this job is currently disabled; every line of it is commented out.
  # cnx-status-publisher:
  #   name: Centrinix Cloud Status Publisher
  #   runs-on: ubuntu-22.04
  #   env:
  #     AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
  #     AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
  #     AWS_DEFAULT_REGION: "us-east-2"
  #     OS_AUTH_TYPE: "v3applicationcredential"
  #     OS_AUTH_URL: "https://openstack.gumi.centrinix.cloud:5000"
  #     OS_IDENTITY_API_VERSION: "3"
  #     OS_REGION_NAME: "Gumi"
  #     OS_INTERFACE: "public"
  #     OS_APPLICATION_CREDENTIAL_ID: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_ID }}"
  #     OS_APPLICATION_CREDENTIAL_SECRET: "${{ secrets.CNX_OS_APPLICATION_CREDENTIAL_SECRET }}"
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@v4
  #     - name: Install OpenVPN client
  #       run: |
  #         sudo apt update
  #         sudo apt install -y openvpn
  #     - name: Install OpenStack command line tools
  #       run: |
  #         pip install python-openstackclient
  #         pip install python-magnumclient
  #     - name: Install Kubernetes command line tools
  #       run: |
  #         curl -LO https://dl.k8s.io/release/v1.23.0/bin/linux/amd64/kubectl
  #         sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
  #     - name: Check environment
  #       run: |
  #         jq --version
  #         openvpn --version
  #         openstack --version
  #         kubectl version --client=true
  #     - name: Load SSH key
  #       uses: webfactory/ssh-agent@v0.9.0
  #       with:
  #         ssh-private-key: ${{ secrets.CNX_ZEPHYR_CI_SSH_KEY }}
  #     - name: Connect to Centrinix CGN VPN
  #       uses: kota65535/github-openvpn-connect-action@v3
  #       with:
  #         config_file: .github/cnx-cgn.ovpn
  #         username: ${{ secrets.CNX_CGNVPN_USERNAME }}
  #         password: ${{ secrets.CNX_CGNVPN_PASSWORD }}
  #     - name: Get OpenStack COE cluster configuration
  #       if: always()
  #       run: |
  #         for ((i=0; i<10; ++i)); do
  #           openstack coe cluster config zephyr-ci || true
  #           [ -f config ] && break
  #         done
  #         echo "KUBECONFIG=${PWD}/config" >> $GITHUB_ENV
  #     - name: Check Kubernetes Cluster status
  #       if: always()
  #       run: |
  #         nodeList=$(kubectl get nodes -o json)
  #         notReadyCount=$(echo "$nodeList" | jq -r '
  #           [
  #             .items[] |
  #             select(.spec.unschedulable == null) |
  #             .status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status != "True")
  #           ] | length')
  #         if [ "$notReadyCount" -gt "85" ]; then
  #           status="majorOutage"
  #         elif [ "$notReadyCount" -gt "82" ]; then
  #           status="partialOutage"
  #         elif [ "$notReadyCount" -gt "80" ]; then
  #           status="degradedPerformance"
  #         else
  #           status="operational"
  #         fi
  #         cat <<EOF > status.cnx_kubernetes_cluster.json
  #         {
  #           "status": "${status}",
  #           "rawData": $(kubectl get nodes | jq -sR),
  #           "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  #         }
  #         EOF
  #     - name: Check KeyDB Cache status
  #       if: always()
  #       run: |
  #         podList=$(kubectl -n keydb-cache get pods -o json)
  #         readyCount=$(echo "$podList" | jq -r '
  #           [
  #             .items[].status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status == "True")
  #           ] | length')
  #         if [ "$readyCount" -lt "1" ]; then
  #           status="inactive"
  #         elif [ "$readyCount" -lt "2" ]; then
  #           status="majorOutage"
  #         elif [ "$readyCount" -lt "3" ]; then
  #           status="partialOutage"
  #         else
  #           status="operational"
  #         fi
  #         cat <<EOF > status.cnx_keydb_cache.json
  #         {
  #           "status": "${status}",
  #           "rawData": $(kubectl -n keydb-cache get pods | jq -sR),
  #           "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  #         }
  #         EOF
  #     - name: Check Actions Runner Controller status
  #       if: always()
  #       run: |
  #         podList=$(kubectl -n arc-systems get pods -o json)
  #         notReadyCount=$(echo "$podList" | jq -r '
  #           [
  #             .items[].status.conditions[] |
  #             select(.type == "Ready") |
  #             select(.status != "True")
  #           ] | length')
  #         if [ "$notReadyCount" -gt "0" ]; then
  #           status="majorOutage"
  #         else
  #           status="operational"
  #         fi
  #         cat <<EOF > status.cnx_actions_runner_controller.json
  #         {
  #           "status": "${status}",
  #           "rawData": $(kubectl -n arc-systems get pods | jq -sR),
  #           "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  #         }
  #         EOF
  #     - name: Check Runner Scaling Set linux-arm64-4xlarge status
  #       if: always()
  #       run: |
  #         podList=$(kubectl -n arc-systems get pods -o json)
  #         podName=$(echo "$podList" | jq -r '
  #           .items[] |
  #           select(.metadata.name | startswith("zrv2-linux-arm64-4xlarge-cnx")) |
  #           .metadata.name')
  #         if [ "$podName" != "" ]; then
  #           podData=$(kubectl -n arc-systems get pods ${podName} -o json)
  #           podStatus=$(echo "$podData" | jq -r '
  #             .status.conditions[] |
  #             select(.type == "Ready") |
  #             .status
  #             ')
  #           if [ "$podStatus" != "True" ]; then
  #             status="majorOutage"
  #           else
  #             status="operational"
  #           fi
  #           runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-cnx -o json)"
  #           runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
  #           pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
  #           rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
  #           rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-cnx)"
  #         else
  #           status="inactive"
  #           runningRunnerCount=0
  #           pendingRunnerCount=0
  #           rawData="$(kubectl -n arc-systems get pods)"
  #         fi
  #         cat <<EOF > status.cnx_runner_scale_set-linux_arm64_4xlarge.json
  #         {
  #           "status": "${status}",
  #           "runningRunnerCount": ${runningRunnerCount},
  #           "pendingRunnerCount": ${pendingRunnerCount},
  #           "rawData": $(echo "${rawData}" | jq -sR),
  #           "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  #         }
  #         EOF
  #     - name: Check Runner Scaling Set linux-x64-4xlarge status
  #       if: always()
  #       run: |
  #         podList=$(kubectl -n arc-systems get pods -o json)
  #         podName=$(echo "$podList" | jq -r '
  #           .items[] |
  #           select(.metadata.name | startswith("zrv2-linux-x64-4xlarge-cnx")) |
  #           .metadata.name')
  #         if [ "$podName" != "" ]; then
  #           podData=$(kubectl -n arc-systems get pods ${podName} -o json)
  #           podStatus=$(echo "$podData" | jq -r '
  #             .status.conditions[] |
  #             select(.type == "Ready") |
  #             .status
  #             ')
  #           if [ "$podStatus" != "True" ]; then
  #             status="majorOutage"
  #           else
  #             status="operational"
  #           fi
  #           runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-cnx -o json)"
  #           runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
  #           pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
  #           rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
  #           rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-cnx)"
  #         else
  #           status="inactive"
  #           runningRunnerCount=0
  #           pendingRunnerCount=0
  #           rawData="$(kubectl -n arc-systems get pods)"
  #         fi
  #         cat <<EOF > status.cnx_runner_scale_set-linux_x64_4xlarge.json
  #         {
  #           "status": "${status}",
  #           "runningRunnerCount": ${runningRunnerCount},
  #           "pendingRunnerCount": ${pendingRunnerCount},
  #           "rawData": $(echo "${rawData}" | jq -sR),
  #           "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  #         }
  #         EOF
  #     - name: Upload status data to S3 bucket
  #       if: always()
  #       run: |
  #         aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'
# Hetzner Cloud Status Publisher Job
#
# Brings up a WireGuard tunnel, queries the cluster described by the
# HZR_KUBECONFIG secret (nodes, KeyDB cache, Actions Runner Controller and two
# runner scale sets), writes one status.hzr_*.json file per component and
# copies those files to the statuspage-data S3 bucket.
  hzr-status-publisher:
    name: Hetzner Cloud Status Publisher
    runs-on: ubuntu-22.04
    env:
      AWS_ACCESS_KEY_ID: "${{ secrets.AWS_ACCESS_KEY_ID }}"
      AWS_SECRET_ACCESS_KEY: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"
      AWS_DEFAULT_REGION: "us-east-2"
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install WireGuard client
        run: |
          # Refresh the package index first and install without prompting.
          sudo apt-get update
          sudo apt-get install -y wireguard openresolv

      - name: Install Kubernetes command line tools
        run: |
          # -f makes curl fail on an HTTP error instead of saving the error
          # page as "kubectl"; the published SHA-256 is verified before the
          # binary is installed.
          kubectl_url="https://dl.k8s.io/release/v1.31.4/bin/linux/amd64/kubectl"
          curl -fsSLO "${kubectl_url}"
          curl -fsSLO "${kubectl_url}.sha256"
          echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
          sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

      - name: Check environment
        run: |
          jq --version
          wg --version
          kubectl version --client=true

      - name: Connect to Hetzner VPN
        env:
          HZR_WG_CONF: "${{ secrets.HZR_WG_CONF }}"
        run: |
          # The secret is read from the environment instead of being expanded
          # into the script text, so quotes, `$` or backticks in it cannot
          # change the command. The file is created owner-only (0600) before
          # the content is written; a WireGuard config normally carries the
          # interface's private key.
          sudo install -m 0600 /dev/null /etc/wireguard/wg0.conf
          printf '%s\n' "$HZR_WG_CONF" | sudo tee /etc/wireguard/wg0.conf > /dev/null
          sudo wg-quick up wg0

      - name: Set up Kubernetes cluster configuration
        env:
          HZR_KUBECONFIG: "${{ secrets.HZR_KUBECONFIG }}"
        run: |
          # Same reasoning as the VPN step: take the secret from the
          # environment and keep the resulting file owner-only.
          (umask 077 && printf '%s\n' "$HZR_KUBECONFIG" > config)
          echo "KUBECONFIG=${PWD}/config" >> "$GITHUB_ENV"

      - name: Check Kubernetes Cluster status
        if: always()
        run: |
          nodeList=$(kubectl get nodes -o json)
          # Count nodes without .spec.unschedulable whose "Ready" condition is
          # anything other than "True".
          notReadyCount=$(echo "$nodeList" | jq -r '
            [
              .items[] |
              select(.spec.unschedulable == null) |
              .status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "10" ]; then
            status="majorOutage"
          elif [ "$notReadyCount" -gt "5" ]; then
            status="partialOutage"
          elif [ "$notReadyCount" -gt "2" ]; then
            status="degradedPerformance"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_kubernetes_cluster.json
          {
            "status": "${status}",
            "rawData": $(kubectl get nodes | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Check KeyDB Cache status
        if: always()
        run: |
          podList=$(kubectl -n keydb-cache get pods -o json)
          # Count "Ready" conditions that are "True" across the namespace's pods.
          readyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status == "True")
            ] | length')
          if [ "$readyCount" -lt "1" ]; then
            status="inactive"
          elif [ "$readyCount" -lt "2" ]; then
            status="majorOutage"
          elif [ "$readyCount" -lt "3" ]; then
            status="partialOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_keydb_cache.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n keydb-cache get pods | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Check Actions Runner Controller status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          # Any pod in arc-systems that is not Ready is reported as a major outage.
          notReadyCount=$(echo "$podList" | jq -r '
            [
              .items[].status.conditions[] |
              select(.type == "Ready") |
              select(.status != "True")
            ] | length')
          if [ "$notReadyCount" -gt "0" ]; then
            status="majorOutage"
          else
            status="operational"
          fi
          cat <<EOF > status.hzr_actions_runner_controller.json
          {
            "status": "${status}",
            "rawData": $(kubectl -n arc-systems get pods | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Check Runner Scaling Set linux-arm64-4xlarge status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          # NOTE(review): if several pod names share this prefix, podName holds
          # more than one name and the single-pod query below no longer
          # applies - confirm that at most one such pod can exist.
          podName=$(echo "$podList" | jq -r '
            .items[] |
            select(.metadata.name | startswith("zrv2-linux-arm64-4xlarge-hzr")) |
            .metadata.name')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods ${podName} -o json)
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[] |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-hzr -o json)"
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
            # rawData: the pod table, a blank line, then the runner set table.
            rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-arm64-4xlarge-hzr)"
          else
            # No pod with the expected prefix was found in arc-systems.
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.hzr_runner_scale_set-linux_arm64_4xlarge.json
          {
            "status": "${status}",
            "runningRunnerCount": ${runningRunnerCount},
            "pendingRunnerCount": ${pendingRunnerCount},
            "rawData": $(echo "${rawData}" | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Check Runner Scaling Set linux-x64-4xlarge status
        if: always()
        run: |
          podList=$(kubectl -n arc-systems get pods -o json)
          # NOTE(review): if several pod names share this prefix, podName holds
          # more than one name and the single-pod query below no longer
          # applies - confirm that at most one such pod can exist.
          podName=$(echo "$podList" | jq -r '
            .items[] |
            select(.metadata.name | startswith("zrv2-linux-x64-4xlarge-hzr")) |
            .metadata.name')
          if [ "$podName" != "" ]; then
            podData=$(kubectl -n arc-systems get pods ${podName} -o json)
            podStatus=$(echo "$podData" | jq -r '
              .status.conditions[] |
              select(.type == "Ready") |
              .status
              ')
            if [ "$podStatus" != "True" ]; then
              status="majorOutage"
            else
              status="operational"
            fi
            runnerData="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-hzr -o json)"
            runningRunnerCount=$(echo "$runnerData" | jq -r '.status.runningEphemeralRunners')
            pendingRunnerCount=$(echo "$runnerData" | jq -r '.status.pendingEphemeralRunners')
            # rawData: the pod table, a blank line, then the runner set table.
            rawData="$(kubectl -n arc-systems get pods ${podName})"$'\n\n'
            rawData+="$(kubectl -n arc-runners get autoscalingrunnersets zrv2-linux-x64-4xlarge-hzr)"
          else
            # No pod with the expected prefix was found in arc-systems.
            status="inactive"
            runningRunnerCount=0
            pendingRunnerCount=0
            rawData="$(kubectl -n arc-systems get pods)"
          fi
          cat <<EOF > status.hzr_runner_scale_set-linux_x64_4xlarge.json
          {
            "status": "${status}",
            "runningRunnerCount": ${runningRunnerCount},
            "pendingRunnerCount": ${pendingRunnerCount},
            "rawData": $(echo "${rawData}" | jq -sR),
            "updatedAt": "$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
          }
          EOF

      - name: Upload status data to S3 bucket
        if: always()
        run: |
          # Only files matching status.*.json are copied; the kubeconfig and
          # everything else in the workspace are excluded.
          aws s3 cp . s3://statuspage-data/ --recursive --exclude '*' --include 'status.*.json'