diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml
index b1fcdc7c1..5fa992365 100644
--- a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml
+++ b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml
@@ -9,3 +9,4 @@ image_list:
 - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" }
 - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" }
 - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" }
+- { name: "quay.io/prometheus/blackbox-exporter", tag: "{{ kube_prometheus_stack_blackbox_exporter_image_tag }}" }
diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml
index bf6097089..427083ec6 100644
--- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml
+++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml
@@ -20,6 +20,12 @@ kube_prometheus_stack_wait_timeout: 5m
 kube_prometheus_stack_metrics_image_tag: v2.12.0
 kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6
 
+kube_prometheus_stack_blackbox_exporter_release_version: 9.0.1
+kube_prometheus_stack_blackbox_exporter_image_tag: v0.25.0
+kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter
+
+kube_prometheus_stack_blackbox_modules: {}
+
 control_ip: "{{ hostvars[groups['control'].0].ansible_host }}"
 
 grafana_auth_anonymous: false
diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml
index 13488de58..b6f9c4f86 100644
--- a/ansible/roles/kube_prometheus_stack/tasks/main.yml
+++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml
@@ -176,6 +176,26 @@
   ansible.builtin.import_role:
     name: grafana-dashboards
 
+- name: Install blackbox exporter helm chart
+  no_log: true # may expose testuser password
+  kubernetes.core.helm:
+    chart_ref: prometheus-blackbox-exporter
+    chart_repo_url: https://prometheus-community.github.io/helm-charts
+    chart_version: "{{ kube_prometheus_stack_blackbox_exporter_release_version }}"
+    release_name: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}"
+    release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
+    release_values:
+      nodeSelector:
+        clusterrole: "server"
+      config:
+        modules: "{{ kube_prometheus_stack_blackbox_modules }}"
+      configReloader:
+        image:
+          tag: "{{ kube_prometheus_stack_app_version }}" # keep consistent with the pre-pulled kube-prometheus-stack image
+      image:
+        tag: "{{ kube_prometheus_stack_blackbox_exporter_image_tag }}"
+    wait: yes
+
 - name: Install kube-prometheus-stack on target Kubernetes cluster
   kubernetes.core.helm:
     chart_ref: "{{ kube_prometheus_stack_chart_name }}"
diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md
index c1e00f5a1..d88edd02b 100644
--- a/docs/monitoring-and-logging.md
+++ b/docs/monitoring-and-logging.md
@@ -28,6 +28,10 @@ Metrics are scraped from exporters. Exporters are services which expose HTTP end
 
 Tool which parses slurm accounting data and produces a log file that is suitable for ingest by filebeat.
 
+### [blackbox-exporter](https://github.com/prometheus/blackbox_exporter)
+
+Tool which allows blackbox probing of endpoints over HTTP, HTTPS, DNS, TCP, ICMP and gRPC.
+
 ## Definition of terms
 
 In this section we define any terms that may not be widely understood.
@@ -290,6 +294,21 @@ slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools colle
 The `slurm_stats` group controls the placement of the `slurm_stats` service. This should be configured to be a group with a single host. That host must be co-located on the same host as the `filebeat` service that scrapes its output.
 
+## blackbox-exporter
+
+### Defaults and adding jobs
+
+Blackbox exporter is configured using role variables in the [kube_prometheus_stack role](../ansible/roles/kube_prometheus_stack/defaults/main). Blackbox uses modules to
+probe service endpoints. Modules can be configured by overriding the maps in [environments/common/inventory/group_vars/all/blackbox_exporter.yml](../environments/common/inventory/group_vars/all/blackbox_exporter.yml); see the [upstream docs](https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md)
+and the [underlying Helm chart values](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-blackbox-exporter/values.yaml#L162) for module configuration options. Probes are defined through Prometheus scrape jobs, which can be added in [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml); see the upstream docs for configuring blackbox-exporter scrape jobs, and the example sketch at the end of this section.
+By default, an HTTPS probe for Open OnDemand is added if there are hosts in the `openondemand` group. The module and scrape job for this are defined in
+[environments/common/inventory/group_vars/all/openondemand.yml](../environments/common/inventory/group_vars/all/openondemand.yml) (these are merged into the config in [blackbox_exporter.yml](../environments/common/inventory/group_vars/all/blackbox_exporter.yml) and [prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) respectively).
+
+### Placement
+
+Installed as part of the `kube_prometheus_stack` role, whose placement is controlled by the `prometheus` group. As above, there is currently no load balancing support, so it should only be placed on a single node (by default, the Slurm control node).
+
+### Access
+
+Probes can be viewed through the `Prometheus Blackbox Exporter` Grafana dashboard.
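+
+### Example
+
+As a concrete illustration, the following sketch adds a hypothetical TCP probe. The `tcp_connect` module name, job name and target here are illustrative, not appliance defaults; the module is defined by overriding `kube_prometheus_stack_blackbox_modules_defaults` and the job is appended to `prometheus_scrape_configs`:
+
+```yaml
+# e.g. in an environment's blackbox_exporter.yml: define an extra module
+# (merged into the Helm release values via kube_prometheus_stack_blackbox_modules)
+kube_prometheus_stack_blackbox_modules_defaults:
+  tcp_connect:
+    prober: tcp
+    timeout: 5s
+
+# e.g. appended to prometheus_scrape_configs in prometheus.yml: a job probing
+# a (hypothetical) endpoint through the exporter
+- job_name: "tcp-blackbox-probe"
+  metrics_path: /probe
+  params:
+    module: [tcp_connect]
+  static_configs:
+    - targets:
+        - "example-db-host:5432" # illustrative target
+  relabel_configs:
+    # rewrite the scrape so Prometheus queries the exporter, passing the real
+    # target as the ?target= parameter and keeping it as a 'target' label
+    - source_labels: [__address__]
+      target_label: __param_target
+    - source_labels: [__param_target]
+      target_label: target
+    - target_label: __address__
+      replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"
+```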
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 6b440865e..71f016b53 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,7 +1,7 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241029-0905-f23c2fca",
-        "RL9": "openhpc-RL9-241029-0949-f23c2fca",
-        "RL9-cuda": "openhpc-cuda-RL9-241029-0905-f23c2fca"
+        "RL8": "openhpc-RL8-241106-1149-6e780c0d",
+        "RL9": "openhpc-RL9-241106-1149-6e780c0d",
+        "RL9-cuda": "openhpc-cuda-RL9-241106-1149-6e780c0d"
     }
 }
diff --git a/environments/common/inventory/group_vars/all/blackbox_exporter.yml b/environments/common/inventory/group_vars/all/blackbox_exporter.yml
new file mode 100644
index 000000000..5a0fdcd60
--- /dev/null
+++ b/environments/common/inventory/group_vars/all/blackbox_exporter.yml
@@ -0,0 +1,7 @@
+# Note: the underlying Helm chart is configured with a default module 'http_2xx'; defining a module with this name here will merge its values with the existing module,
+# see https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-blackbox-exporter/values.yaml#L163
+kube_prometheus_stack_blackbox_modules_defaults: {}
+
+kube_prometheus_stack_blackbox_modules: "{{ kube_prometheus_stack_blackbox_modules_defaults if ( groups['openondemand'] | count == 0 ) else ( kube_prometheus_stack_blackbox_modules_defaults | combine(openondemand_blackbox_modules) ) }}"
+
+# See prometheus_scrape_configs in prometheus.yml to add additional scrape jobs to probe services
diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml
index ee874d2ed..45841a3a4 100644
--- a/environments/common/inventory/group_vars/all/grafana.yml
+++ b/environments/common/inventory/group_vars/all/grafana.yml
@@ -38,6 +38,12 @@ grafana_dashboards_default:
     - placeholder: DS_PROMETHEUS
       replacement: prometheus
     revision_id: 3
+  # blackbox probes
+  - dashboard_id: 14928
+    replacements:
+    - placeholder: DS_PROMETHEUS
+      replacement: prometheus
+    revision_id: 6
 
 grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}"
 
 # Configmap names of kube prometheus stack's default dashboards to exclude
diff --git a/environments/common/inventory/group_vars/all/monitoring.yml b/environments/common/inventory/group_vars/all/monitoring.yml
index 098039b44..d9d726e3d 100644
--- a/environments/common/inventory/group_vars/all/monitoring.yml
+++ b/environments/common/inventory/group_vars/all/monitoring.yml
@@ -2,5 +2,6 @@ kube_prometheus_stack_chart_version: 59.1.0
 kube_prometheus_stack_release_namespace: monitoring-system
 kube_prometheus_stack_release_name: kube-prometheus-stack
 kube_prometheus_stack_wait_timeout: 5m
+kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter
 
 # See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services
diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml
index 71ffff844..ae1b576df 100644
--- a/environments/common/inventory/group_vars/all/openondemand.yml
+++ b/environments/common/inventory/group_vars/all/openondemand.yml
@@ -182,6 +182,34 @@ openondemand_scrape_configs:
       labels:
         environment: "{{ appliances_environment_name }}"
         service: "openondemand"
+  - job_name: "ood-blackbox-probe"
+    metrics_path: /probe
+    params:
+      module: [ood_http_2xx]
+    static_configs:
+      - targets:
+          - "{{ openondemand_address }}"
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: target
+      - target_label: __address__
+        replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"
+
+openondemand_blackbox_modules:
+  ood_http_2xx:
+    prober: http
+    timeout: 5s
+    http:
+      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
+      follow_redirects: true
+      preferred_ip_protocol: "ip4"
+      tls_config:
+        insecure_skip_verify: true
+      basic_auth:
+        username: "testuser"
+        password: "{{ vault_testuser_password }}"
 
 openondemand_dashboard:
   - dashboard_id: 13465
diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml
index 4bde55b3d..9d59d651a 100644
--- a/environments/common/inventory/group_vars/all/prometheus.yml
+++ b/environments/common/inventory/group_vars/all/prometheus.yml
@@ -28,6 +28,54 @@ prometheus_extra_rules:
     expr: "slurm_nodes_down > 0\n"
     labels:
       severity: critical
+  - alert: BlackboxProbeFailed
+    expr: probe_success == 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: '{% raw %}Blackbox probe failed (target {{ $labels.target }}){% endraw %}'
+      description: "{% raw %}Blackbox probe '{{ $labels.target }}' failed{% endraw %}"
+  - alert: BlackboxSlowProbe
+    expr: avg_over_time(probe_duration_seconds[1m]) > 1.25 # around 1.14s expected due to indirection in the cluster
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: '{% raw %}Blackbox slow probe (target {{ $labels.target }}){% endraw %}'
+      description: "{% raw %}Blackbox probe '{{ $labels.target }}' took more than 1.25s to complete - {{ $value }}{% endraw %}"
+  - alert: BlackboxProbeHttpFailure
+    expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: '{% raw %}Blackbox probe HTTP failure (target {{ $labels.target }}){% endraw %}'
+      description: "{% raw %}Blackbox probe '{{ $labels.target }}' returned an HTTP error status - {{ $value }}{% endraw %}"
+  - alert: BlackboxSslCertificateWillExpireSoon
+    expr: (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600)
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: '{% raw %}Blackbox SSL certificate will expire soon (target {{ $labels.target }}){% endraw %}'
+      description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
+  - alert: BlackboxSslCertificateWillExpireVerySoon
+    expr: 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600)
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: '{% raw %}Blackbox SSL certificate will expire very soon (target {{ $labels.target }}){% endraw %}'
+      description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
+  - alert: BlackboxSslCertificateExpired
+    expr: (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: '{% raw %}Blackbox SSL certificate expired (target {{ $labels.target }}){% endraw %}'
+      description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' has expired{% endraw %}"
   - record: node_cpu_system_seconds:record
     expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
   - record: node_cpu_user_seconds:record
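
For reference, when there are hosts in the `openondemand` group, the Jinja2 `combine` in `blackbox_exporter.yml` above resolves the modules map passed to the Helm release to the following (a sketch of the merged result, not literal file contents):

```yaml
# Resolved value of kube_prometheus_stack_blackbox_modules:
# the empty defaults map combined with openondemand_blackbox_modules
ood_http_2xx:
  prober: http
  timeout: 5s
  http:
    valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
    follow_redirects: true
    preferred_ip_protocol: "ip4"
    tls_config:
      insecure_skip_verify: true
    basic_auth:
      username: "testuser"
      password: "{{ vault_testuser_password }}"
```

With no `openondemand` hosts it resolves to `{}`, leaving only the chart's built-in `http_2xx` default module configured.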