diff --git a/images/flashbox-l2.conf b/images/flashbox-l2.conf
index 459dfc04..eff7e0d6 100644
--- a/images/flashbox-l2.conf
+++ b/images/flashbox-l2.conf
@@ -2,6 +2,7 @@
 Include=shared/mkosi.conf
 Include=modules/flashbox/common/mkosi.conf
 Include=modules/flashbox/flashbox-l2/mkosi.conf
+Include=modules/flashbox/observability/mkosi.conf
 
 [Config]
 Profiles=gcp
diff --git a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
index 8701b55e..67acef20 100755
--- a/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
+++ b/modules/flashbox/common/mkosi.extra/usr/bin/init-firewall.sh
@@ -151,6 +151,10 @@ drop_dst_ip() {
 #
 # `source` is not supported in dash
 ###########################################################################
+
+# Optional file written by fetch-observability-config.sh; it defines METRICS_ENDPOINT, which firewall-config reads below
+[ -f /etc/flashbox/observability.env ] && . /etc/flashbox/observability.env
+
 . /etc/bob/firewall-config
 
 ###########################################################################
diff --git a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config
index bee194c2..44a06dcb 100644
--- a/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config
+++ b/modules/flashbox/flashbox-l2/mkosi.extra/etc/bob/firewall-config
@@ -50,6 +50,11 @@ accept_dst_port $CHAIN_ALWAYS_IN tcp $CVM_REVERSE_PROXY_PORT "CVM reverse-proxy"
 accept_dst_port $CHAIN_ALWAYS_OUT udp $NTP_PORT "NTP"
 accept_dst_port $CHAIN_ALWAYS_OUT tcp $NTP_NTS_PORT "NTP-NTS"
 
+# Allow outbound HTTPS to the metrics endpoint, but only when METRICS_ENDPOINT was set by /etc/flashbox/observability.env
+if [ -n "${METRICS_ENDPOINT:-}" ]; then
+    accept_dst_ip_port $CHAIN_ALWAYS_OUT tcp "$METRICS_ENDPOINT" $HTTPS_PORT "Metrics endpoint (Flashbots)"
+fi
+
 ###########################################################################
 # (3) MAINTENANCE_IN: Inbound rules for Maintenance Mode
 ###########################################################################
diff --git a/modules/flashbox/observability/mkosi.build b/modules/flashbox/observability/mkosi.build
new file mode 100755
index 00000000..e8f52796
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.build
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -euxo pipefail
+
+source scripts/make_git_package.sh
+
+# Build gomplate from a pinned tag; prometheus.service uses it to render prometheus.yml from its template
+make_git_package \
+    "gomplate" \
+    "v4.3.3" \
+    "https://github.com/hairyhenderson/gomplate" \
+    'go build -trimpath -ldflags "-s -w -buildid=" -o ./build/gomplate ./cmd/gomplate' \
+    "build/gomplate:/usr/bin/gomplate"
diff --git a/modules/flashbox/observability/mkosi.conf b/modules/flashbox/observability/mkosi.conf
new file mode 100644
index 00000000..f23f464c
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.conf
@@ -0,0 +1,15 @@
+[Build]
+WithNetwork=true
+
+[Content]
+ExtraTrees=modules/flashbox/observability/mkosi.extra
+PostInstallationScripts=modules/flashbox/observability/mkosi.postinst
+BuildScripts=modules/flashbox/observability/mkosi.build
+
+Packages=prometheus
+         prometheus-node-exporter
+         prometheus-process-exporter
+
+BuildPackages=build-essential
+              git
+              golang
diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml
new file mode 100644
index 00000000..033f901d
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/process-exporter.yml
@@ -0,0 +1,5 @@
+process_names:
+  # One group for the searcher container: matches its conmon command line (process-exporter.service also passes --children)
+  - name: "searcher-container"
+    cmdline:
+      - 'conmon.*searcher-container'
diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl
new file mode 100644
index 00000000..2e2dc00d
--- /dev/null
+++
b/modules/flashbox/observability/mkosi.extra/etc/prometheus/prometheus.yml.tmpl
@@ -0,0 +1,43 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+# Recording rules for aggregated metrics
+rule_files:
+  - /etc/prometheus/recording_rules.yml
+
+# Scrape configurations
+scrape_configs:
+  # Node exporter on localhost
+  - job_name: 'node'
+    static_configs:
+      - targets: ['localhost:9100']
+    metric_relabel_configs:
+      # Keep only these node_* families; every other series is dropped at scrape time (not merely before remote write)
+      - source_labels: [__name__]
+        regex: 'node_(cpu|memory|disk|filesystem|network|vmstat)_.*'
+        action: keep
+
+  # Process exporter for container monitoring
+  - job_name: 'process'
+    static_configs:
+      - targets: ['localhost:9256']
+
+{{- $config := (datasource "config") }}
+{{- if $config.remote_write_flashbots_url }}
+
+# Remote write configuration (rendered only when a URL is configured)
+remote_write:
+  # Flashbots endpoint; values go through `quote` so special characters stay inside a valid YAML string
+  - url: {{ $config.remote_write_flashbots_url | quote }}
+    write_relabel_configs:
+      # Only send flashbox: prefixed metrics
+      - source_labels: [__name__]
+        regex: 'flashbox:.*'
+        action: keep
+    {{- if $config.remote_write_flashbots_auth }}
+    basic_auth:
+      username: {{ $config.remote_write_flashbots_username | quote }}
+      password: {{ $config.remote_write_flashbots_password | quote }}
+    {{- end }}
+{{- end }}
diff --git a/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml
new file mode 100644
index 00000000..79a370cc
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/prometheus/recording_rules.yml
@@ -0,0 +1,39 @@
+groups:
+  # Base metrics — local: prefix means they stay inside the TEE
+  # (remote_write only forwards flashbox:*)
+  - name: local_container_metrics
+    interval: 30s
+    rules:
+      - record: local:container_cpu_percent
+        expr: sum(rate(namedprocess_namegroup_cpu_seconds_total{groupname=~".*searcher-container.*"}[5m])) * 100
+
+  # Forwarded metrics — flashbox: prefix, picked up by remote_write
+  - name: flashbox_health
+    interval: 30s
+    rules:
+      - record: flashbox:container_alive
+        expr: up{job="process"} * on(instance) group_left(cgroup) namedprocess_namegroup_num_procs{groupname=~".*searcher-container.*"}
+
+      # Spike-guarded: current 15m avg must be under 80%,
+      # AND the 10m max ending 5m ago must have been under 70%
+      - record: flashbox:container_average_cpu_is_under_80_percent
+        expr: >
+          (avg_over_time(local:container_cpu_percent[15m]) < bool 80)
+          * (max_over_time(local:container_cpu_percent[10m] offset 5m) < bool 70)
+
+      - record: flashbox:container_oom_kills_count
+        expr: node_vmstat_oom_kill
+
+      - record: flashbox:disk_free_space_is_over_10_percent
+        expr: >
+          (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) > bool 0.1
+
+      - record: flashbox:disk_free_space_is_over_128_gb
+        expr: >
+          (node_filesystem_avail_bytes{mountpoint="/persistent"}) > bool (128 * 1024 * 1024 * 1024)
+
+      - record: flashbox:network_is_up
+        expr: >
+          (sum(rate(node_network_receive_bytes_total{device!~"lo"}[5m]))
+          + sum(rate(node_network_transmit_bytes_total{device!~"lo"}[5m])))
+          > bool 0
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service
new file mode 100644
index 00000000..f45ccb49
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/fetch-observability-config.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Fetch observability configuration
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+ExecStart=/usr/bin/fetch-observability-config.sh
+RemainAfterExit=yes
+StandardOutput=journal
+StandardError=journal
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service
b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service
new file mode 100644
index 00000000..1f6d2a3d
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/node-exporter.service
@@ -0,0 +1,58 @@
+[Unit]
+Description=Prometheus Node Exporter
+Documentation=https://github.com/prometheus/node_exporter
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=prometheus
+Group=prometheus
+# Listens on loopback only. Collectors are selected explicitly: --collector.*
+# flags enable one, --no-collector.* flags disable one.
+ExecStart=/usr/bin/prometheus-node-exporter \
+    --web.listen-address=127.0.0.1:9100 \
+    --collector.cpu \
+    --collector.meminfo \
+    --collector.diskstats \
+    --collector.filesystem \
+    --collector.netdev \
+    --collector.loadavg \
+    --no-collector.arp \
+    --no-collector.bcache \
+    --no-collector.bonding \
+    --no-collector.conntrack \
+    --no-collector.cpufreq \
+    --no-collector.edac \
+    --no-collector.entropy \
+    --no-collector.filefd \
+    --no-collector.hwmon \
+    --no-collector.infiniband \
+    --no-collector.ipvs \
+    --no-collector.mdadm \
+    --no-collector.netclass \
+    --no-collector.netstat \
+    --no-collector.nfs \
+    --no-collector.nfsd \
+    --no-collector.pressure \
+    --no-collector.rapl \
+    --no-collector.schedstat \
+    --no-collector.sockstat \
+    --no-collector.softnet \
+    --no-collector.stat \
+    --no-collector.textfile \
+    --no-collector.thermal_zone \
+    --no-collector.time \
+    --no-collector.timex \
+    --no-collector.udp_queues \
+    --no-collector.uname \
+    --collector.vmstat \
+    --no-collector.xfs \
+    --no-collector.zfs \
+    --no-collector.systemd \
+    --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker)($|/)
+Restart=on-failure
+RestartSec=5s
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service
new file mode 100644
index 00000000..30b1257c
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/process-exporter.service
@@ -0,0 +1,20 @@
+[Unit]
+Description=Prometheus Process Exporter
+Documentation=https://github.com/ncabatoff/process-exporter
+After=network-online.target searcher-container.service
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=prometheus
+Group=prometheus
+# Listens on loopback only; the process groups come from process-exporter.yml.
+ExecStart=/usr/bin/prometheus-process-exporter \
+    --web.listen-address=127.0.0.1:9256 \
+    --config.path=/etc/prometheus/process-exporter.yml \
+    --children
+Restart=on-failure
+RestartSec=5s
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service
new file mode 100644
index 00000000..fb6397ca
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/prometheus.service
@@ -0,0 +1,29 @@
+[Unit]
+Description=Prometheus Monitoring System
+Documentation=https://prometheus.io/docs/introduction/overview/
+After=network-online.target fetch-observability-config.service
+Wants=network-online.target
+Requires=fetch-observability-config.service
+
+[Service]
+Type=simple
+User=prometheus
+Group=prometheus
+# Render the config as root ("+" prefix) on every start. The output can contain
+# the remote_write password, so it is restricted to root and the prometheus
+# group instead of inheriting the template's file mode.
+ExecStartPre=+/usr/bin/gomplate -f /etc/prometheus/prometheus.yml.tmpl -o /etc/prometheus/prometheus.yml -d config=/etc/flashbox/observability-config.json --chmod 640
+ExecStartPre=+/bin/chown root:prometheus /etc/prometheus/prometheus.yml
+ExecStart=/usr/bin/prometheus \
+    --config.file=/etc/prometheus/prometheus.yml \
+    --storage.tsdb.path=/var/lib/prometheus/ \
+    --storage.tsdb.retention.time=24h \
+    --web.console.templates=/usr/share/prometheus/consoles \
+    --web.console.libraries=/usr/share/prometheus/console_libraries \
+    --web.listen-address=127.0.0.1:9090
+ExecReload=/bin/kill -HUP $MAINPID
+Restart=on-failure
+RestartSec=5s
+
+[Install]
+WantedBy=minimal.target
diff --git a/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf
b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf
new file mode 100644
index 00000000..b11c3917
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/etc/systemd/system/searcher-firewall.service.d/needs-observability.conf
@@ -0,0 +1,3 @@
+[Unit]
+After=fetch-observability-config.service
+Wants=fetch-observability-config.service
diff --git a/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh b/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh
new file mode 100755
index 00000000..f319a28f
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.extra/usr/bin/fetch-observability-config.sh
@@ -0,0 +1,159 @@
+#!/bin/sh
+# Fetches observability configuration (metrics endpoint credentials) and writes:
+#   /etc/flashbox/observability-config.json — consumed by gomplate for Prometheus config
+#   /etc/flashbox/observability.env — sourced by firewall for metrics endpoint IP
+#
+# On failure: logs a warning and writes empty defaults. Prometheus runs locally
+# without remote_write. This is intentional — observability should never block boot.
+
+# pipefail is not implemented by every /bin/sh, and an unsupported option makes
+# `set` abort the script, so turn it on only where the shell accepts it.
+set -eu
+# shellcheck disable=SC3040
+if (set -o pipefail) 2>/dev/null; then
+    set -o pipefail
+fi
+
+OBSERVABILITY_CONFIG_PATH=/etc/flashbox/observability-config.json
+OBSERVABILITY_ENV_PATH=/etc/flashbox/observability.env
+
+# Per-request time limit in seconds, so an unreachable metadata server or Vault
+# cannot hold up the units ordered after this service.
+CURL_MAX_TIME=10
+
+# write_config URL USERNAME PASSWORD
+# Writes both output files; empty arguments give the "no remote_write" defaults.
+write_config() {
+    local url="${1:-}"
+    local username="${2:-}"
+    local password="${3:-}"
+    local auth=""
+    local host=""
+    local metrics_endpoint=""
+
+    # The firewall rule takes an IPv4 address: reduce the URL to its host part
+    # (scheme, path, userinfo and port removed) and accept only a dotted quad.
+    if [ -n "$url" ]; then
+        host="${url#*://}"
+        host="${host%%/*}"
+        host="${host##*@}"
+        host="${host%%:*}"
+        if printf '%s\n' "$host" | grep -qE '^[0-9]{1,3}(\.[0-9]{1,3}){3}$'; then
+            metrics_endpoint="$host"
+        else
+            echo "WARNING: metrics URL host is not an IPv4 address, no firewall rule will be added"
+        fi
+    fi
+
+    # The Prometheus template only tests this field for emptiness.
+    if [ -n "$username" ]; then
+        auth="true"
+    fi
+
+    mkdir -p /etc/flashbox
+
+    # JSON config for Prometheus gomplate template. jq does the string escaping;
+    # values travel via the environment rather than argv, and the file is created
+    # root-only because it holds the password.
+    (
+        umask 077
+        OBS_URL="$url" OBS_USERNAME="$username" \
+        OBS_PASSWORD="$password" OBS_AUTH="$auth" \
+            jq -n '{
+                remote_write_flashbots_url: env.OBS_URL,
+                remote_write_flashbots_username: env.OBS_USERNAME,
+                remote_write_flashbots_password: env.OBS_PASSWORD,
+                remote_write_flashbots_auth: env.OBS_AUTH
+            }' > "$OBSERVABILITY_CONFIG_PATH"
+    )
+
+    # Env file for firewall (sourced by init-firewall.sh)
+    printf "METRICS_ENDPOINT='%s'\n" "$metrics_endpoint" > "$OBSERVABILITY_ENV_PATH"
+
+    echo "Observability config written (endpoint: ${metrics_endpoint:-none})"
+}
+
+# fallback MESSAGE
+# Logs MESSAGE as a warning, writes the empty defaults and exits successfully.
+fallback() {
+    echo "WARNING: $1, writing empty observability config"
+    write_config "" "" ""
+    exit 0
+}
+
+# Don't override if config already exists
+if [ -f "$OBSERVABILITY_CONFIG_PATH" ]; then
+    echo "Observability config already exists, skipping"
+    exit 0
+fi
+
+# Local QEMU dev: no remote_write
+if dmidecode -s system-manufacturer 2>/dev/null | grep -q "QEMU" && \
+   [ -f /etc/systemd/system/serial-console.service ]; then
+    echo "QEMU dev environment, writing empty observability config"
+    write_config "" "" ""
+    exit 0
+fi
+
+# Production: fetch from Vault (non-fatal on failure)
+echo "Fetching observability config from Vault..."
+
+# fetch_metadata_value KEY
+# Prints the instance attribute KEY from the GCP metadata server.
+fetch_metadata_value() {
+    curl -sf --max-time "$CURL_MAX_TIME" \
+        --header "Metadata-Flavor: Google" \
+        "http://metadata/computeMetadata/v1/instance/attributes/$1"
+}
+
+if ! instance_name=$(fetch_metadata_value "name") || \
+   ! vault_addr=$(fetch_metadata_value "vault_addr") || \
+   ! vault_auth_mount=$(fetch_metadata_value "vault_auth_mount_gcp") || \
+   ! vault_kv_path=$(fetch_metadata_value "vault_kv_path") || \
+   ! vault_kv_common_suffix=$(fetch_metadata_value "vault_kv_common_suffix"); then
+    fallback "Could not fetch GCP metadata"
+fi
+
+# Authenticate with Vault using GCP identity. --get makes curl send the
+# --data-urlencode pairs as a query string instead of a POST body.
+gcp_token=$(curl -sf --max-time "$CURL_MAX_TIME" --get \
+    --header "Metadata-Flavor: Google" \
+    --data-urlencode "audience=http://vault/$instance_name" \
+    --data-urlencode "format=full" \
+    "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity") || true
+
+if [ -z "${gcp_token:-}" ]; then
+    fallback "Could not get GCP identity token"
+fi
+
+# "// empty" makes jq print nothing for a null or missing field, so the -z
+# checks also catch responses that parse but do not carry the value.
+vault_token=$(curl -sf --max-time "$CURL_MAX_TIME" \
+    --data "$(printf '{"role":"%s","jwt":"%s"}' "$instance_name" "$gcp_token")" \
+    "${vault_addr}/v1/${vault_auth_mount}/login" | \
+    jq -r '.auth.client_token // empty') || true
+
+if [ -z "${vault_token:-}" ]; then
+    fallback "Could not authenticate with Vault"
+fi
+
+# Fetch common data (observability keys live here)
+common_data=$(curl -sf --max-time "$CURL_MAX_TIME" \
+    --header "X-Vault-Token: ${vault_token}" \
+    "${vault_addr}/v1/${vault_kv_path}/node/${vault_kv_common_suffix}" |
+    jq -c '.data.data // empty') || true
+
+if [ -z "${common_data:-}" ]; then
+    fallback "Could not fetch Vault data"
+fi
+
+# get_value KEY
+# Prints the value stored under KEY in the fetched Vault data ("" if absent).
+get_value() {
+    printf '%s\n' "$common_data" | jq -rc --arg key "$1" '.[$key] // ""'
+}
+
+write_config \
+    "$(get_value metrics_flashbots_url)" \
+    "$(get_value metrics_flashbots_username)" \
+    "$(get_value metrics_flashbots_password)"
diff --git a/modules/flashbox/observability/mkosi.postinst b/modules/flashbox/observability/mkosi.postinst
new file mode 100755
index 00000000..e0d88484
--- /dev/null
+++ b/modules/flashbox/observability/mkosi.postinst
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -euxo pipefail
+
+# Ensure prometheus owns its data directory
+mkosi-chroot chown -R prometheus:prometheus /var/lib/prometheus
+
+# Enable observability services
+mkosi-chroot systemctl add-wants minimal.target \
+    fetch-observability-config.service \
+    prometheus.service \
+    node-exporter.service \
+    process-exporter.service