From aad3e7283fd26b14128d1a8326638f4ee76afc6b Mon Sep 17 00:00:00 2001 From: pincher95 Date: Fri, 25 Jul 2025 12:29:26 -0400 Subject: [PATCH 01/25] Add multi-target support Signed-off-by: pincher95 --- CHANGELOG.md | 4 + README.md | 497 ++++++++++++++++++-------------- config/config.go | 37 +++ config/config_test.go | 34 +++ examples/auth_modules.yml | 25 ++ examples/example-prometheus.yml | 19 ++ go.mod | 5 +- main.go | 102 ++++++- probe.go | 54 ++++ probe_test.go | 44 +++ 10 files changed, 595 insertions(+), 226 deletions(-) create mode 100644 config/config.go create mode 100644 config/config_test.go create mode 100644 examples/auth_modules.yml create mode 100644 examples/example-prometheus.yml create mode 100644 probe.go create mode 100644 probe_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 67d0bfec..50d97789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ The flag `--es.ilm` has been renamed to `--collector.ilm`. * [CHANGE] Rename --es.data_stream to --collector.data-stream #983 * [CHANGE] Rename --es.ilm to --collector.ilm #999 +## [Unreleased] +### Added +- Multi-target scraping via `/probe` endpoint with optional auth modules (compatible with postgres_exporter style) (#PR) + ## 1.9.0 / 2025-02-27 BREAKING CHANGES: diff --git a/README.md b/README.md index f813534d..75b747d3 100644 --- a/README.md +++ b/README.md @@ -52,32 +52,32 @@ Below is the command line options summary: elasticsearch_exporter --help ``` -| Argument | Introduced in Version | Description | Default | -| ----------------------- | --------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- | -| collector.clustersettings| 
1.6.0 | If true, query stats for cluster settings (As of v1.6.0, this flag has replaced "es.cluster_settings"). | false | -| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to. This could be a local node (`localhost:9200`, for instance), or the address of a remote Elasticsearch server. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | | -| es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false | -| es.indices | 1.0.2 | If true, query stats for all indices in the cluster. | false | -| es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false | -| es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false | -| es.aliases | 1.0.4rc1 | If true, include informational aliases metrics. | true | -| es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. -| es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | -| collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). | false | -| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | -| es.slm | | If true, query stats for SLM. | false | -| es.data_stream | | If true, query state for Data Steams. | false | -| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | -| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | -| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. 
| | -| es.client-cert | 1.0.2 | Path to PEM file that contains the corresponding cert for the private key to connect to Elasticsearch. | | -| es.clusterinfo.interval | 1.1.0rc1 | Cluster info update interval for the cluster label | 5m | -| es.ssl-skip-verify | 1.0.4rc1 | Skip SSL verification when connecting to Elasticsearch. | false | -| web.listen-address | 1.0.2 | Address to listen on for web interface and telemetry. | :9114 | -| web.telemetry-path | 1.0.2 | Path under which to expose metrics. | /metrics | -| aws.region | 1.5.0 | Region for AWS elasticsearch | | -| aws.role-arn | 1.6.0 | Role ARN of an IAM role to assume. | | -| version | 1.0.2 | Show version info on stdout and exit. | | +| Argument | Introduced in Version | Description | Default | +| ------------------------- | --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------- | +| collector.clustersettings | 1.6.0 | If true, query stats for cluster settings (As of v1.6.0, this flag has replaced "es.cluster_settings"). | false | +| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to. This could be a local node (`localhost:9200`, for instance), or the address of a remote Elasticsearch server. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | | +| es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false | +| es.indices | 1.0.2 | If true, query stats for all indices in the cluster. 
| false | +| es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false | +| es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false | +| es.aliases | 1.0.4rc1 | If true, include informational aliases metrics. | true | +| es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. | +| es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | +| collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). | false | +| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | +| es.slm | | If true, query stats for SLM. | false | +| es.data_stream | | If true, query state for Data Steams. | false | +| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | +| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | +| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. | | +| es.client-cert | 1.0.2 | Path to PEM file that contains the corresponding cert for the private key to connect to Elasticsearch. | | +| es.clusterinfo.interval | 1.1.0rc1 | Cluster info update interval for the cluster label | 5m | +| es.ssl-skip-verify | 1.0.4rc1 | Skip SSL verification when connecting to Elasticsearch. | false | +| web.listen-address | 1.0.2 | Address to listen on for web interface and telemetry. | :9114 | +| web.telemetry-path | 1.0.2 | Path under which to expose metrics. | /metrics | +| aws.region | 1.5.0 | Region for AWS elasticsearch | | +| aws.role-arn | 1.6.0 | Role ARN of an IAM role to assume. | | +| version | 1.0.2 | Show version info on stdout and exit. 
| | Commandline parameters start with a single `-` for versions less than `1.1.0rc1`. For versions greater than `1.1.0rc1`, commandline parameters are specified with `--`. @@ -95,17 +95,17 @@ Specifying those two environment variables will override authentication passed i ES 7.x supports RBACs. The following security privileges are required for the elasticsearch_exporter. -Setting | Privilege Required | Description -:---- | :---- | :---- -collector.clustersettings| `cluster` `monitor` | -exporter defaults | `cluster` `monitor` | All cluster read-only operations, like cluster health and state, hot threads, node info, node and cluster stats, and pending cluster tasks. | -es.indices | `indices` `monitor` (per index or `*`) | All actions that are required for monitoring (recovery, segments info, index stats and status) -es.indices_settings | `indices` `monitor` (per index or `*`) | -es.indices_mappings | `indices` `view_index_metadata` (per index or `*`) | -es.shards | not sure if `indices` or `cluster` `monitor` or both | -collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) -es.slm | `manage_slm` -es.data_stream | `monitor` or `manage` (per index or `*`) | +| Setting | Privilege Required | Description | +| :------------------------ | :----------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------ | +| collector.clustersettings | `cluster` `monitor` | +| exporter defaults | `cluster` `monitor` | All cluster read-only operations, like cluster health and state, hot threads, node info, node and cluster stats, and pending cluster tasks. 
| +| es.indices | `indices` `monitor` (per index or `*`) | All actions that are required for monitoring (recovery, segments info, index stats and status) | +| es.indices_settings | `indices` `monitor` (per index or `*`) | +| es.indices_mappings | `indices` `view_index_metadata` (per index or `*`) | +| es.shards | not sure if `indices` or `cluster` `monitor` or both | +| collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) | +| es.slm | `manage_slm` | +| es.data_stream | `monitor` or `manage` (per index or `*`) | Further Information @@ -113,193 +113,246 @@ Further Information - [Defining Roles](https://www.elastic.co/guide/en/elastic-stack-overview/7.3/defining-roles.html) - [Privileges](https://www.elastic.co/guide/en/elastic-stack-overview/7.3/security-privileges.html) +### Multi-Target Scraping (beta) + +From v2.X the exporter exposes `/probe` allowing one running instance to scrape many clusters. 
+ +Supported `auth_module` types: + +| type | YAML fields | Injected into request | +| ---------- | ----------------------------------------------------------------- | ------------------------------------------------------------------ | +| `userpass` | `userpass.username`, `userpass.password`, optional `options:` map | Sets HTTP basic-auth header, appends `options` as query parameters | +| `apikey` | `apikey:` Base64 API-Key string, optional `options:` map | Adds `Authorization: ApiKey …` header, appends `options` | + +Example config: + +```yaml +# exporter-config.yml +auth_modules: + prod_basic: + type: userpass + userpass: + username: metrics + password: s3cr3t + + staging_key: + type: apikey + apikey: "bXk6YXBpa2V5Ig==" # base64 id:key + options: + sslmode: disable +``` + +Run exporter: + +```bash +./elasticsearch_exporter --config.file=exporter-config.yml +``` + +Prometheus scrape_config: + +```yaml +- job_name: es + metrics_path: /probe + params: + auth_module: [staging_key] + static_configs: + - targets: ["https://es-stage:9200"] + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: exporter:9114 +``` + ### Metrics -| Name | Type | Cardinality | Help | -|----------------------------------------------------------------------|------------|-------------|-----------------------------------------------------------------------------------------------------| -| elasticsearch_breakers_estimated_size_bytes | gauge | 4 | Estimated size in bytes of breaker | -| elasticsearch_breakers_limit_size_bytes | gauge | 4 | Limit size in bytes for breaker | -| elasticsearch_breakers_tripped | counter | 4 | tripped for breaker | -| elasticsearch_cluster_health_active_primary_shards | gauge | 1 | The number of primary shards in your cluster. This is an aggregate total across all indices. 
| -| elasticsearch_cluster_health_active_shards | gauge | 1 | Aggregate total of all shards across all indices, which includes replica shards. | -| elasticsearch_cluster_health_delayed_unassigned_shards | gauge | 1 | Shards delayed to reduce reallocation overhead | -| elasticsearch_cluster_health_initializing_shards | gauge | 1 | Count of shards that are being freshly created. | -| elasticsearch_cluster_health_number_of_data_nodes | gauge | 1 | Number of data nodes in the cluster. | -| elasticsearch_cluster_health_number_of_in_flight_fetch | gauge | 1 | The number of ongoing shard info requests. | -| elasticsearch_cluster_health_number_of_nodes | gauge | 1 | Number of nodes in the cluster. | -| elasticsearch_cluster_health_number_of_pending_tasks | gauge | 1 | Cluster level changes which have not yet been executed | -| elasticsearch_cluster_health_task_max_waiting_in_queue_millis | gauge | 1 | Max time in millis that a task is waiting in queue. | -| elasticsearch_cluster_health_relocating_shards | gauge | 1 | The number of shards that are currently moving from one node to another node. | -| elasticsearch_cluster_health_status | gauge | 3 | Whether all primary and replica shards are allocated. | -| elasticsearch_cluster_health_unassigned_shards | gauge | 1 | The number of shards that exist in the cluster state, but cannot be found in the cluster itself. | -| elasticsearch_clustersettings_stats_max_shards_per_node | gauge | 0 | Current maximum number of shards per node setting. | -| elasticsearch_clustersettings_allocation_threshold_enabled | gauge | 0 | Is disk allocation decider enabled. | -| elasticsearch_clustersettings_allocation_watermark_flood_stage_bytes | gauge | 0 | Flood stage watermark as in bytes. | -| elasticsearch_clustersettings_allocation_watermark_high_bytes | gauge | 0 | High watermark for disk usage in bytes. | -| elasticsearch_clustersettings_allocation_watermark_low_bytes | gauge | 0 | Low watermark for disk usage in bytes. 
| -| elasticsearch_clustersettings_allocation_watermark_flood_stage_ratio | gauge | 0 | Flood stage watermark as a ratio. | -| elasticsearch_clustersettings_allocation_watermark_high_ratio | gauge | 0 | High watermark for disk usage as a ratio. | -| elasticsearch_clustersettings_allocation_watermark_low_ratio | gauge | 0 | Low watermark for disk usage as a ratio. | -| elasticsearch_filesystem_data_available_bytes | gauge | 1 | Available space on block device in bytes | -| elasticsearch_filesystem_data_free_bytes | gauge | 1 | Free space on block device in bytes | -| elasticsearch_filesystem_data_size_bytes | gauge | 1 | Size of block device in bytes | -| elasticsearch_filesystem_io_stats_device_operations_count | gauge | 1 | Count of disk operations | -| elasticsearch_filesystem_io_stats_device_read_operations_count | gauge | 1 | Count of disk read operations | -| elasticsearch_filesystem_io_stats_device_write_operations_count | gauge | 1 | Count of disk write operations | -| elasticsearch_filesystem_io_stats_device_read_size_kilobytes_sum | gauge | 1 | Total kilobytes read from disk | -| elasticsearch_filesystem_io_stats_device_write_size_kilobytes_sum | gauge | 1 | Total kilobytes written to disk | -| elasticsearch_ilm_status | gauge | 1 | Current status of ILM. Status can be `STOPPED`, `RUNNING`, `STOPPING`. 
| -| elasticsearch_ilm_index_status | gauge | 4 | Status of ILM policy for index | -| elasticsearch_indices_active_queries | gauge | 1 | The number of currently active queries | -| elasticsearch_indices_docs | gauge | 1 | Count of documents on this node | -| elasticsearch_indices_docs_deleted | gauge | 1 | Count of deleted documents on this node | -| elasticsearch_indices_deleted_docs_primary | gauge | 1 | Count of deleted documents with only primary shards | -| elasticsearch_indices_docs_primary | gauge | 1 | Count of documents with only primary shards on all nodes | -| elasticsearch_indices_docs_total | gauge | | Count of documents with shards on all nodes | -| elasticsearch_indices_fielddata_evictions | counter | 1 | Evictions from field data | -| elasticsearch_indices_fielddata_memory_size_bytes | gauge | 1 | Field data cache memory usage in bytes | -| elasticsearch_indices_filter_cache_evictions | counter | 1 | Evictions from filter cache | -| elasticsearch_indices_filter_cache_memory_size_bytes | gauge | 1 | Filter cache memory usage in bytes | -| elasticsearch_indices_flush_time_seconds | counter | 1 | Cumulative flush time in seconds | -| elasticsearch_indices_flush_total | counter | 1 | Total flushes | -| elasticsearch_indices_get_exists_time_seconds | counter | 1 | Total time get exists in seconds | -| elasticsearch_indices_get_exists_total | counter | 1 | Total get exists operations | -| elasticsearch_indices_get_missing_time_seconds | counter | 1 | Total time of get missing in seconds | -| elasticsearch_indices_get_missing_total | counter | 1 | Total get missing | -| elasticsearch_indices_get_time_seconds | counter | 1 | Total get time in seconds | -| elasticsearch_indices_get_total | counter | 1 | Total get | -| elasticsearch_indices_indexing_delete_time_seconds_total | counter | 1 | Total time indexing delete in seconds | -| elasticsearch_indices_indexing_delete_total | counter | 1 | Total indexing deletes | -| elasticsearch_indices_index_current | 
gauge | 1 | The number of documents currently being indexed to an index | -| elasticsearch_indices_indexing_index_time_seconds_total | counter | 1 | Cumulative index time in seconds | -| elasticsearch_indices_indexing_index_total | counter | 1 | Total index calls | -| elasticsearch_indices_mappings_stats_fields | gauge | 1 | Count of fields currently mapped by index | -| elasticsearch_indices_mappings_stats_json_parse_failures_total | counter | 0 | Number of errors while parsing JSON | -| elasticsearch_indices_mappings_stats_scrapes_total | counter | 0 | Current total Elasticsearch Indices Mappings scrapes | -| elasticsearch_indices_mappings_stats_up | gauge | 0 | Was the last scrape of the Elasticsearch Indices Mappings endpoint successful | -| elasticsearch_indices_merges_docs_total | counter | 1 | Cumulative docs merged | -| elasticsearch_indices_merges_total | counter | 1 | Total merges | -| elasticsearch_indices_merges_total_size_bytes_total | counter | 1 | Total merge size in bytes | -| elasticsearch_indices_merges_total_time_seconds_total | counter | 1 | Total time spent merging in seconds | -| elasticsearch_indices_query_cache_cache_total | counter | 1 | Count of query cache | -| elasticsearch_indices_query_cache_cache_size | gauge | 1 | Size of query cache | -| elasticsearch_indices_query_cache_count | counter | 2 | Count of query cache hit/miss | -| elasticsearch_indices_query_cache_evictions | counter | 1 | Evictions from query cache | -| elasticsearch_indices_query_cache_memory_size_bytes | gauge | 1 | Query cache memory usage in bytes | -| elasticsearch_indices_query_cache_total | counter | 1 | Size of query cache total | -| elasticsearch_indices_refresh_time_seconds_total | counter | 1 | Total time spent refreshing in seconds | -| elasticsearch_indices_refresh_total | counter | 1 | Total refreshes | -| elasticsearch_indices_request_cache_count | counter | 2 | Count of request cache hit/miss | -| elasticsearch_indices_request_cache_evictions | counter 
| 1 | Evictions from request cache | -| elasticsearch_indices_request_cache_memory_size_bytes | gauge | 1 | Request cache memory usage in bytes | -| elasticsearch_indices_search_fetch_time_seconds | counter | 1 | Total search fetch time in seconds | -| elasticsearch_indices_search_fetch_total | counter | 1 | Total number of fetches | -| elasticsearch_indices_search_query_time_seconds | counter | 1 | Total search query time in seconds | -| elasticsearch_indices_search_query_total | counter | 1 | Total number of queries | -| elasticsearch_indices_segments_count | gauge | 1 | Count of index segments on this node | -| elasticsearch_indices_segments_memory_bytes | gauge | 1 | Current memory size of segments in bytes | -| elasticsearch_indices_settings_creation_timestamp_seconds | gauge | 1 | Timestamp of the index creation in seconds | -| elasticsearch_indices_settings_stats_read_only_indices | gauge | 1 | Count of indices that have read_only_allow_delete=true | -| elasticsearch_indices_settings_total_fields | gauge | | Index setting value for index.mapping.total_fields.limit (total allowable mapped fields in a index) | -| elasticsearch_indices_settings_replicas | gauge | | Index setting value for index.replicas | -| elasticsearch_indices_shards_docs | gauge | 3 | Count of documents on this shard | -| elasticsearch_indices_shards_docs_deleted | gauge | 3 | Count of deleted documents on each shard | -| elasticsearch_indices_store_size_bytes | gauge | 1 | Current size of stored index data in bytes | -| elasticsearch_indices_store_size_bytes_primary | gauge | | Current size of stored index data in bytes with only primary shards on all nodes | -| elasticsearch_indices_store_size_bytes_total | gauge | | Current size of stored index data in bytes with all shards on all nodes | -| elasticsearch_indices_store_throttle_time_seconds_total | counter | 1 | Throttle time for index store in seconds | -| elasticsearch_indices_translog_operations | counter | 1 | Total translog 
operations | -| elasticsearch_indices_translog_size_in_bytes | counter | 1 | Total translog size in bytes | -| elasticsearch_indices_warmer_time_seconds_total | counter | 1 | Total warmer time in seconds | -| elasticsearch_indices_warmer_total | counter | 1 | Total warmer count | -| elasticsearch_jvm_gc_collection_seconds_count | counter | 2 | Count of JVM GC runs | -| elasticsearch_jvm_gc_collection_seconds_sum | counter | 2 | GC run time in seconds | -| elasticsearch_jvm_memory_committed_bytes | gauge | 2 | JVM memory currently committed by area | -| elasticsearch_jvm_memory_max_bytes | gauge | 1 | JVM memory max | -| elasticsearch_jvm_memory_used_bytes | gauge | 2 | JVM memory currently used by area | -| elasticsearch_jvm_memory_pool_used_bytes | gauge | 3 | JVM memory currently used by pool | -| elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool | -| elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by pool | -| elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool | -| elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS | -| elasticsearch_os_load1 | gauge | 1 | Shortterm load average | -| elasticsearch_os_load5 | gauge | 1 | Midterm load average | -| elasticsearch_os_load15 | gauge | 1 | Longterm load average | -| elasticsearch_process_cpu_percent | gauge | 1 | Percent CPU used by process | -| elasticsearch_process_cpu_seconds_total | counter | 1 | Process CPU time in seconds | -| elasticsearch_process_mem_resident_size_bytes | gauge | 1 | Resident memory in use by process in bytes | -| elasticsearch_process_mem_share_size_bytes | gauge | 1 | Shared memory in use by process in bytes | -| elasticsearch_process_mem_virtual_size_bytes | gauge | 1 | Total virtual memory used in bytes | -| elasticsearch_process_open_files_count | gauge | 1 | Open file descriptors | -| elasticsearch_snapshot_stats_number_of_snapshots | gauge | 1 | Total number of 
snapshots | -| elasticsearch_snapshot_stats_oldest_snapshot_timestamp | gauge | 1 | Oldest snapshot timestamp | -| elasticsearch_snapshot_stats_snapshot_start_time_timestamp | gauge | 1 | Last snapshot start timestamp | -| elasticsearch_snapshot_stats_latest_snapshot_timestamp_seconds | gauge | 1 | Timestamp of the latest SUCCESS or PARTIAL snapshot | -| elasticsearch_snapshot_stats_snapshot_end_time_timestamp | gauge | 1 | Last snapshot end timestamp | -| elasticsearch_snapshot_stats_snapshot_number_of_failures | gauge | 1 | Last snapshot number of failures | -| elasticsearch_snapshot_stats_snapshot_number_of_indices | gauge | 1 | Last snapshot number of indices | -| elasticsearch_snapshot_stats_snapshot_failed_shards | gauge | 1 | Last snapshot failed shards | -| elasticsearch_snapshot_stats_snapshot_successful_shards | gauge | 1 | Last snapshot successful shards | -| elasticsearch_snapshot_stats_snapshot_total_shards | gauge | 1 | Last snapshot total shard | -| elasticsearch_thread_pool_active_count | gauge | 14 | Thread Pool threads active | -| elasticsearch_thread_pool_completed_count | counter | 14 | Thread Pool operations completed | -| elasticsearch_thread_pool_largest_count | gauge | 14 | Thread Pool largest threads count | -| elasticsearch_thread_pool_queue_count | gauge | 14 | Thread Pool operations queued | -| elasticsearch_thread_pool_rejected_count | counter | 14 | Thread Pool operations rejected | -| elasticsearch_thread_pool_threads_count | gauge | 14 | Thread Pool current threads count | -| elasticsearch_transport_rx_packets_total | counter | 1 | Count of packets received | -| elasticsearch_transport_rx_size_bytes_total | counter | 1 | Total number of bytes received | -| elasticsearch_transport_tx_packets_total | counter | 1 | Count of packets sent | -| elasticsearch_transport_tx_size_bytes_total | counter | 1 | Total number of bytes sent | -| elasticsearch_clusterinfo_last_retrieval_success_ts | gauge | 1 | Timestamp of the last successful cluster 
info retrieval | -| elasticsearch_clusterinfo_up | gauge | 1 | Up metric for the cluster info collector | -| elasticsearch_clusterinfo_version_info | gauge | 6 | Constant metric with ES version information as labels | -| elasticsearch_slm_stats_up | gauge | 0 | Up metric for SLM collector | -| elasticsearch_slm_stats_total_scrapes | counter | 0 | Number of scrapes for SLM collector | -| elasticsearch_slm_stats_json_parse_failures | counter | 0 | JSON parse failures for SLM collector | -| elasticsearch_slm_stats_retention_runs_total | counter | 0 | Total retention runs | -| elasticsearch_slm_stats_retention_failed_total | counter | 0 | Total failed retention runs | -| elasticsearch_slm_stats_retention_timed_out_total | counter | 0 | Total retention run timeouts | -| elasticsearch_slm_stats_retention_deletion_time_seconds | gauge | 0 | Retention run deletion time | -| elasticsearch_slm_stats_total_snapshots_taken_total | counter | 0 | Total snapshots taken | -| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | -| elasticsearch_slm_stats_total_snapshots_deleted_total | counter | 0 | Total snapshots deleted | -| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | -| elasticsearch_slm_stats_snapshots_taken_total | counter | 1 | Snapshots taken by policy | -| elasticsearch_slm_stats_snapshots_failed_total | counter | 1 | Snapshots failed by policy | -| elasticsearch_slm_stats_snapshots_deleted_total | counter | 1 | Snapshots deleted by policy | -| elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 1 | Snapshot deletion failures by policy | -| elasticsearch_slm_stats_operation_mode | gauge | 1 | SLM operation mode (Running, stopping, stopped) | -| elasticsearch_data_stream_stats_up | gauge | 0 | Up metric for Data Stream collection | -| elasticsearch_data_stream_stats_total_scrapes | counter | 0 | Total scrapes for Data Stream stats | -| 
elasticsearch_data_stream_stats_json_parse_failures | counter | 0 | Number of parsing failures for Data Stream stats | -| elasticsearch_data_stream_backing_indices_total | gauge | 1 | Number of backing indices for Data Stream | -| elasticsearch_data_stream_store_size_bytes | gauge | 1 | Current size of data stream backing indices in bytes | -| elasticsearch_health_report_creating_primaries | gauge | 1 | The number of creating primary shards | -| elasticsearch_health_report_creating_replicas | gauge | 1 | The number of creating replica shards | -| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 2 | Data stream lifecycle status | -| elasticsearch_health_report_disk_status | gauge | 2 | disk status | -| elasticsearch_health_report_ilm_policies | gauge | 1 | The number of ILM Policies | -| elasticsearch_health_report_ilm_stagnating_indices | gauge | 1 | The number of stagnating indices | -| elasticsearch_health_report_ilm_status | gauge | 2 | ILM status | -| elasticsearch_health_report_initializing_primaries | gauge | 1 | The number of initializing primary shards | -| elasticsearch_health_report_initializing_replicas | gauge | 1 | The number of initializing replica shards | -| elasticsearch_health_report_master_is_stable_status | gauge | 2 | Master is stable status | -| elasticsearch_health_report_max_shards_in_cluster_data | gauge | 1 | The number of maximum shards in a cluster | -| elasticsearch_health_report_max_shards_in_cluster_frozen | gauge | 1 | The number of maximum frozen shards in a cluster | -| elasticsearch_health_report_repository_integrity_status | gauge | 2 | Repository integrity status | -| elasticsearch_health_report_restarting_primaries | gauge | 1 | The number of restarting primary shards | -| elasticsearch_health_report_restarting_replicas | gauge | 1 | The number of restarting replica shards | -| elasticsearch_health_report_shards_availabilty_status | gauge | 2 | Shards availabilty status | -| 
elasticsearch_health_report_shards_capacity_status | gauge | 2 | Shards capacity status | -| elasticsearch_health_report_slm_policies | gauge | 1 | The number of SLM policies | -| elasticsearch_health_report_slm_status | gauge | 2 | SLM status | -| elasticsearch_health_report_started_primaries | gauge | 1 | The number of started primary shards | -| elasticsearch_health_report_started_replicas | gauge | 1 | The number of started replica shards | -| elasticsearch_health_report_status | gauge | 2 | Overall cluster status | -| elasticsearch_health_report_total_repositories | gauge | 1 | The number snapshot repositories | -| elasticsearch_health_report_unassigned_primaries | gauge | 1 | The number of unassigned primary shards | -| elasticsearch_health_report_unassigned_replicas | gauge | 1 | The number of unassigned replica shards | +| Name | Type | Cardinality | Help | +| -------------------------------------------------------------------- | ------- | ----------- | --------------------------------------------------------------------------------------------------- | +| elasticsearch_breakers_estimated_size_bytes | gauge | 4 | Estimated size in bytes of breaker | +| elasticsearch_breakers_limit_size_bytes | gauge | 4 | Limit size in bytes for breaker | +| elasticsearch_breakers_tripped | counter | 4 | tripped for breaker | +| elasticsearch_cluster_health_active_primary_shards | gauge | 1 | The number of primary shards in your cluster. This is an aggregate total across all indices. | +| elasticsearch_cluster_health_active_shards | gauge | 1 | Aggregate total of all shards across all indices, which includes replica shards. | +| elasticsearch_cluster_health_delayed_unassigned_shards | gauge | 1 | Shards delayed to reduce reallocation overhead | +| elasticsearch_cluster_health_initializing_shards | gauge | 1 | Count of shards that are being freshly created. | +| elasticsearch_cluster_health_number_of_data_nodes | gauge | 1 | Number of data nodes in the cluster. 
| +| elasticsearch_cluster_health_number_of_in_flight_fetch | gauge | 1 | The number of ongoing shard info requests. | +| elasticsearch_cluster_health_number_of_nodes | gauge | 1 | Number of nodes in the cluster. | +| elasticsearch_cluster_health_number_of_pending_tasks | gauge | 1 | Cluster level changes which have not yet been executed | +| elasticsearch_cluster_health_task_max_waiting_in_queue_millis | gauge | 1 | Max time in millis that a task is waiting in queue. | +| elasticsearch_cluster_health_relocating_shards | gauge | 1 | The number of shards that are currently moving from one node to another node. | +| elasticsearch_cluster_health_status | gauge | 3 | Whether all primary and replica shards are allocated. | +| elasticsearch_cluster_health_unassigned_shards | gauge | 1 | The number of shards that exist in the cluster state, but cannot be found in the cluster itself. | +| elasticsearch_clustersettings_stats_max_shards_per_node | gauge | 0 | Current maximum number of shards per node setting. | +| elasticsearch_clustersettings_allocation_threshold_enabled | gauge | 0 | Is disk allocation decider enabled. | +| elasticsearch_clustersettings_allocation_watermark_flood_stage_bytes | gauge | 0 | Flood stage watermark as in bytes. | +| elasticsearch_clustersettings_allocation_watermark_high_bytes | gauge | 0 | High watermark for disk usage in bytes. | +| elasticsearch_clustersettings_allocation_watermark_low_bytes | gauge | 0 | Low watermark for disk usage in bytes. | +| elasticsearch_clustersettings_allocation_watermark_flood_stage_ratio | gauge | 0 | Flood stage watermark as a ratio. | +| elasticsearch_clustersettings_allocation_watermark_high_ratio | gauge | 0 | High watermark for disk usage as a ratio. | +| elasticsearch_clustersettings_allocation_watermark_low_ratio | gauge | 0 | Low watermark for disk usage as a ratio. 
| +| elasticsearch_filesystem_data_available_bytes | gauge | 1 | Available space on block device in bytes | +| elasticsearch_filesystem_data_free_bytes | gauge | 1 | Free space on block device in bytes | +| elasticsearch_filesystem_data_size_bytes | gauge | 1 | Size of block device in bytes | +| elasticsearch_filesystem_io_stats_device_operations_count | gauge | 1 | Count of disk operations | +| elasticsearch_filesystem_io_stats_device_read_operations_count | gauge | 1 | Count of disk read operations | +| elasticsearch_filesystem_io_stats_device_write_operations_count | gauge | 1 | Count of disk write operations | +| elasticsearch_filesystem_io_stats_device_read_size_kilobytes_sum | gauge | 1 | Total kilobytes read from disk | +| elasticsearch_filesystem_io_stats_device_write_size_kilobytes_sum | gauge | 1 | Total kilobytes written to disk | +| elasticsearch_ilm_status | gauge | 1 | Current status of ILM. Status can be `STOPPED`, `RUNNING`, `STOPPING`. | +| elasticsearch_ilm_index_status | gauge | 4 | Status of ILM policy for index | +| elasticsearch_indices_active_queries | gauge | 1 | The number of currently active queries | +| elasticsearch_indices_docs | gauge | 1 | Count of documents on this node | +| elasticsearch_indices_docs_deleted | gauge | 1 | Count of deleted documents on this node | +| elasticsearch_indices_deleted_docs_primary | gauge | 1 | Count of deleted documents with only primary shards | +| elasticsearch_indices_docs_primary | gauge | 1 | Count of documents with only primary shards on all nodes | +| elasticsearch_indices_docs_total | gauge | | Count of documents with shards on all nodes | +| elasticsearch_indices_fielddata_evictions | counter | 1 | Evictions from field data | +| elasticsearch_indices_fielddata_memory_size_bytes | gauge | 1 | Field data cache memory usage in bytes | +| elasticsearch_indices_filter_cache_evictions | counter | 1 | Evictions from filter cache | +| elasticsearch_indices_filter_cache_memory_size_bytes | gauge | 1 | 
Filter cache memory usage in bytes | +| elasticsearch_indices_flush_time_seconds | counter | 1 | Cumulative flush time in seconds | +| elasticsearch_indices_flush_total | counter | 1 | Total flushes | +| elasticsearch_indices_get_exists_time_seconds | counter | 1 | Total time get exists in seconds | +| elasticsearch_indices_get_exists_total | counter | 1 | Total get exists operations | +| elasticsearch_indices_get_missing_time_seconds | counter | 1 | Total time of get missing in seconds | +| elasticsearch_indices_get_missing_total | counter | 1 | Total get missing | +| elasticsearch_indices_get_time_seconds | counter | 1 | Total get time in seconds | +| elasticsearch_indices_get_total | counter | 1 | Total get | +| elasticsearch_indices_indexing_delete_time_seconds_total | counter | 1 | Total time indexing delete in seconds | +| elasticsearch_indices_indexing_delete_total | counter | 1 | Total indexing deletes | +| elasticsearch_indices_index_current | gauge | 1 | The number of documents currently being indexed to an index | +| elasticsearch_indices_indexing_index_time_seconds_total | counter | 1 | Cumulative index time in seconds | +| elasticsearch_indices_indexing_index_total | counter | 1 | Total index calls | +| elasticsearch_indices_mappings_stats_fields | gauge | 1 | Count of fields currently mapped by index | +| elasticsearch_indices_mappings_stats_json_parse_failures_total | counter | 0 | Number of errors while parsing JSON | +| elasticsearch_indices_mappings_stats_scrapes_total | counter | 0 | Current total Elasticsearch Indices Mappings scrapes | +| elasticsearch_indices_mappings_stats_up | gauge | 0 | Was the last scrape of the Elasticsearch Indices Mappings endpoint successful | +| elasticsearch_indices_merges_docs_total | counter | 1 | Cumulative docs merged | +| elasticsearch_indices_merges_total | counter | 1 | Total merges | +| elasticsearch_indices_merges_total_size_bytes_total | counter | 1 | Total merge size in bytes | +| 
elasticsearch_indices_merges_total_time_seconds_total | counter | 1 | Total time spent merging in seconds | +| elasticsearch_indices_query_cache_cache_total | counter | 1 | Count of query cache | +| elasticsearch_indices_query_cache_cache_size | gauge | 1 | Size of query cache | +| elasticsearch_indices_query_cache_count | counter | 2 | Count of query cache hit/miss | +| elasticsearch_indices_query_cache_evictions | counter | 1 | Evictions from query cache | +| elasticsearch_indices_query_cache_memory_size_bytes | gauge | 1 | Query cache memory usage in bytes | +| elasticsearch_indices_query_cache_total | counter | 1 | Size of query cache total | +| elasticsearch_indices_refresh_time_seconds_total | counter | 1 | Total time spent refreshing in seconds | +| elasticsearch_indices_refresh_total | counter | 1 | Total refreshes | +| elasticsearch_indices_request_cache_count | counter | 2 | Count of request cache hit/miss | +| elasticsearch_indices_request_cache_evictions | counter | 1 | Evictions from request cache | +| elasticsearch_indices_request_cache_memory_size_bytes | gauge | 1 | Request cache memory usage in bytes | +| elasticsearch_indices_search_fetch_time_seconds | counter | 1 | Total search fetch time in seconds | +| elasticsearch_indices_search_fetch_total | counter | 1 | Total number of fetches | +| elasticsearch_indices_search_query_time_seconds | counter | 1 | Total search query time in seconds | +| elasticsearch_indices_search_query_total | counter | 1 | Total number of queries | +| elasticsearch_indices_segments_count | gauge | 1 | Count of index segments on this node | +| elasticsearch_indices_segments_memory_bytes | gauge | 1 | Current memory size of segments in bytes | +| elasticsearch_indices_settings_creation_timestamp_seconds | gauge | 1 | Timestamp of the index creation in seconds | +| elasticsearch_indices_settings_stats_read_only_indices | gauge | 1 | Count of indices that have read_only_allow_delete=true | +| 
elasticsearch_indices_settings_total_fields | gauge | | Index setting value for index.mapping.total_fields.limit (total allowable mapped fields in a index) | +| elasticsearch_indices_settings_replicas | gauge | | Index setting value for index.replicas | +| elasticsearch_indices_shards_docs | gauge | 3 | Count of documents on this shard | +| elasticsearch_indices_shards_docs_deleted | gauge | 3 | Count of deleted documents on each shard | +| elasticsearch_indices_store_size_bytes | gauge | 1 | Current size of stored index data in bytes | +| elasticsearch_indices_store_size_bytes_primary | gauge | | Current size of stored index data in bytes with only primary shards on all nodes | +| elasticsearch_indices_store_size_bytes_total | gauge | | Current size of stored index data in bytes with all shards on all nodes | +| elasticsearch_indices_store_throttle_time_seconds_total | counter | 1 | Throttle time for index store in seconds | +| elasticsearch_indices_translog_operations | counter | 1 | Total translog operations | +| elasticsearch_indices_translog_size_in_bytes | counter | 1 | Total translog size in bytes | +| elasticsearch_indices_warmer_time_seconds_total | counter | 1 | Total warmer time in seconds | +| elasticsearch_indices_warmer_total | counter | 1 | Total warmer count | +| elasticsearch_jvm_gc_collection_seconds_count | counter | 2 | Count of JVM GC runs | +| elasticsearch_jvm_gc_collection_seconds_sum | counter | 2 | GC run time in seconds | +| elasticsearch_jvm_memory_committed_bytes | gauge | 2 | JVM memory currently committed by area | +| elasticsearch_jvm_memory_max_bytes | gauge | 1 | JVM memory max | +| elasticsearch_jvm_memory_used_bytes | gauge | 2 | JVM memory currently used by area | +| elasticsearch_jvm_memory_pool_used_bytes | gauge | 3 | JVM memory currently used by pool | +| elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool | +| elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by 
pool | +| elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool | +| elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS | +| elasticsearch_os_load1 | gauge | 1 | Shortterm load average | +| elasticsearch_os_load5 | gauge | 1 | Midterm load average | +| elasticsearch_os_load15 | gauge | 1 | Longterm load average | +| elasticsearch_process_cpu_percent | gauge | 1 | Percent CPU used by process | +| elasticsearch_process_cpu_seconds_total | counter | 1 | Process CPU time in seconds | +| elasticsearch_process_mem_resident_size_bytes | gauge | 1 | Resident memory in use by process in bytes | +| elasticsearch_process_mem_share_size_bytes | gauge | 1 | Shared memory in use by process in bytes | +| elasticsearch_process_mem_virtual_size_bytes | gauge | 1 | Total virtual memory used in bytes | +| elasticsearch_process_open_files_count | gauge | 1 | Open file descriptors | +| elasticsearch_snapshot_stats_number_of_snapshots | gauge | 1 | Total number of snapshots | +| elasticsearch_snapshot_stats_oldest_snapshot_timestamp | gauge | 1 | Oldest snapshot timestamp | +| elasticsearch_snapshot_stats_snapshot_start_time_timestamp | gauge | 1 | Last snapshot start timestamp | +| elasticsearch_snapshot_stats_latest_snapshot_timestamp_seconds | gauge | 1 | Timestamp of the latest SUCCESS or PARTIAL snapshot | +| elasticsearch_snapshot_stats_snapshot_end_time_timestamp | gauge | 1 | Last snapshot end timestamp | +| elasticsearch_snapshot_stats_snapshot_number_of_failures | gauge | 1 | Last snapshot number of failures | +| elasticsearch_snapshot_stats_snapshot_number_of_indices | gauge | 1 | Last snapshot number of indices | +| elasticsearch_snapshot_stats_snapshot_failed_shards | gauge | 1 | Last snapshot failed shards | +| elasticsearch_snapshot_stats_snapshot_successful_shards | gauge | 1 | Last snapshot successful shards | +| elasticsearch_snapshot_stats_snapshot_total_shards | gauge | 1 | Last snapshot total shard | +| 
elasticsearch_thread_pool_active_count | gauge | 14 | Thread Pool threads active | +| elasticsearch_thread_pool_completed_count | counter | 14 | Thread Pool operations completed | +| elasticsearch_thread_pool_largest_count | gauge | 14 | Thread Pool largest threads count | +| elasticsearch_thread_pool_queue_count | gauge | 14 | Thread Pool operations queued | +| elasticsearch_thread_pool_rejected_count | counter | 14 | Thread Pool operations rejected | +| elasticsearch_thread_pool_threads_count | gauge | 14 | Thread Pool current threads count | +| elasticsearch_transport_rx_packets_total | counter | 1 | Count of packets received | +| elasticsearch_transport_rx_size_bytes_total | counter | 1 | Total number of bytes received | +| elasticsearch_transport_tx_packets_total | counter | 1 | Count of packets sent | +| elasticsearch_transport_tx_size_bytes_total | counter | 1 | Total number of bytes sent | +| elasticsearch_clusterinfo_last_retrieval_success_ts | gauge | 1 | Timestamp of the last successful cluster info retrieval | +| elasticsearch_clusterinfo_up | gauge | 1 | Up metric for the cluster info collector | +| elasticsearch_clusterinfo_version_info | gauge | 6 | Constant metric with ES version information as labels | +| elasticsearch_slm_stats_up | gauge | 0 | Up metric for SLM collector | +| elasticsearch_slm_stats_total_scrapes | counter | 0 | Number of scrapes for SLM collector | +| elasticsearch_slm_stats_json_parse_failures | counter | 0 | JSON parse failures for SLM collector | +| elasticsearch_slm_stats_retention_runs_total | counter | 0 | Total retention runs | +| elasticsearch_slm_stats_retention_failed_total | counter | 0 | Total failed retention runs | +| elasticsearch_slm_stats_retention_timed_out_total | counter | 0 | Total retention run timeouts | +| elasticsearch_slm_stats_retention_deletion_time_seconds | gauge | 0 | Retention run deletion time | +| elasticsearch_slm_stats_total_snapshots_taken_total | counter | 0 | Total snapshots taken | +| 
elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | +| elasticsearch_slm_stats_total_snapshots_deleted_total | counter | 0 | Total snapshots deleted | +| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | +| elasticsearch_slm_stats_snapshots_taken_total | counter | 1 | Snapshots taken by policy | +| elasticsearch_slm_stats_snapshots_failed_total | counter | 1 | Snapshots failed by policy | +| elasticsearch_slm_stats_snapshots_deleted_total | counter | 1 | Snapshots deleted by policy | +| elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 1 | Snapshot deletion failures by policy | +| elasticsearch_slm_stats_operation_mode | gauge | 1 | SLM operation mode (Running, stopping, stopped) | +| elasticsearch_data_stream_stats_up | gauge | 0 | Up metric for Data Stream collection | +| elasticsearch_data_stream_stats_total_scrapes | counter | 0 | Total scrapes for Data Stream stats | +| elasticsearch_data_stream_stats_json_parse_failures | counter | 0 | Number of parsing failures for Data Stream stats | +| elasticsearch_data_stream_backing_indices_total | gauge | 1 | Number of backing indices for Data Stream | +| elasticsearch_data_stream_store_size_bytes | gauge | 1 | Current size of data stream backing indices in bytes | +| elasticsearch_health_report_creating_primaries | gauge | 1 | The number of creating primary shards | +| elasticsearch_health_report_creating_replicas | gauge | 1 | The number of creating replica shards | +| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 2 | Data stream lifecycle status | +| elasticsearch_health_report_disk_status | gauge | 2 | disk status | +| elasticsearch_health_report_ilm_policies | gauge | 1 | The number of ILM Policies | +| elasticsearch_health_report_ilm_stagnating_indices | gauge | 1 | The number of stagnating indices | +| elasticsearch_health_report_ilm_status | gauge | 2 | ILM status | +| 
elasticsearch_health_report_initializing_primaries | gauge | 1 | The number of initializing primary shards | +| elasticsearch_health_report_initializing_replicas | gauge | 1 | The number of initializing replica shards | +| elasticsearch_health_report_master_is_stable_status | gauge | 2 | Master is stable status | +| elasticsearch_health_report_max_shards_in_cluster_data | gauge | 1 | The number of maximum shards in a cluster | +| elasticsearch_health_report_max_shards_in_cluster_frozen | gauge | 1 | The number of maximum frozen shards in a cluster | +| elasticsearch_health_report_repository_integrity_status | gauge | 2 | Repository integrity status | +| elasticsearch_health_report_restarting_primaries | gauge | 1 | The number of restarting primary shards | +| elasticsearch_health_report_restarting_replicas | gauge | 1 | The number of restarting replica shards | +| elasticsearch_health_report_shards_availabilty_status | gauge | 2 | Shards availabilty status | +| elasticsearch_health_report_shards_capacity_status | gauge | 2 | Shards capacity status | +| elasticsearch_health_report_slm_policies | gauge | 1 | The number of SLM policies | +| elasticsearch_health_report_slm_status | gauge | 2 | SLM status | +| elasticsearch_health_report_started_primaries | gauge | 1 | The number of started primary shards | +| elasticsearch_health_report_started_replicas | gauge | 1 | The number of started replica shards | +| elasticsearch_health_report_status | gauge | 2 | Overall cluster status | +| elasticsearch_health_report_total_repositories | gauge | 1 | The number snapshot repositories | +| elasticsearch_health_report_unassigned_primaries | gauge | 1 | The number of unassigned primary shards | +| elasticsearch_health_report_unassigned_replicas | gauge | 1 | The number of unassigned replica shards | ### Alerts & Recording Rules diff --git a/config/config.go b/config/config.go new file mode 100644 index 00000000..e50660ea --- /dev/null +++ b/config/config.go @@ -0,0 +1,37 @@ 
+package config + +import ( + "os" + + "gopkg.in/yaml.v3" +) + +// Config represents the YAML configuration file structure. +type Config struct { + AuthModules map[string]AuthModule `yaml:"auth_modules"` +} + +type AuthModule struct { + Type string `yaml:"type"` + UserPass *UserPassConfig `yaml:"userpass,omitempty"` + APIKey string `yaml:"apikey,omitempty"` + Options map[string]string `yaml:"options,omitempty"` +} + +type UserPassConfig struct { + Username string `yaml:"username"` + Password string `yaml:"password"` +} + +// LoadConfig reads and parses YAML config file. +func LoadConfig(path string) (*Config, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, err + } + return &cfg, nil +} diff --git a/config/config_test.go b/config/config_test.go new file mode 100644 index 00000000..00caac26 --- /dev/null +++ b/config/config_test.go @@ -0,0 +1,34 @@ +package config + +import ( + "os" + "testing" +) + +func TestLoadConfig(t *testing.T) { + yaml := `auth_modules: + foo: + type: userpass + userpass: + username: bar + password: baz + options: + sslmode: disable +` + tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") + if err != nil { + t.Fatalf("temp file: %v", err) + } + if _, err := tmp.WriteString(yaml); err != nil { + t.Fatalf("write: %v", err) + } + tmp.Close() + + cfg, err := LoadConfig(tmp.Name()) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + if cfg.AuthModules["foo"].UserPass.Username != "bar" { + t.Fatalf("unexpected username: %s", cfg.AuthModules["foo"].UserPass.Username) + } +} diff --git a/examples/auth_modules.yml b/examples/auth_modules.yml new file mode 100644 index 00000000..b1b8d925 --- /dev/null +++ b/examples/auth_modules.yml @@ -0,0 +1,25 @@ +# Example exporter-config.yml demonstrating multiple auth modules +# Each module can be referenced with ?auth_module= in /probe requests. + +auth_modules: + # 1. 
Simple basic-auth over HTTPS ---------------------------------------- + prod_basic: + type: userpass + userpass: + username: metrics + password: s3cr3t + # extra URL query parameters are appended to the target DSN + options: + sslmode: disable # example option – becomes ?sslmode=disable + + # 2. Read-only account for staging cluster + staging_ro: + type: userpass + userpass: + username: readonly + password: changeme + + # 3. API-Key authentication (future) + prod_key: + type: apikey + apikey: BASE64-ENCODED-KEY== diff --git a/examples/example-prometheus.yml b/examples/example-prometheus.yml new file mode 100644 index 00000000..b2f8b19f --- /dev/null +++ b/examples/example-prometheus.yml @@ -0,0 +1,19 @@ +scrape_configs: + - job_name: es-multi + metrics_path: /probe + params: + auth_module: [prod_key] # default for all targets in this block + static_configs: + - targets: + - https://es-prod:9200 # uses prod_key + - targets: + - https://es-stage:9200 # will override auth_module below + labels: + __param_auth_module: staging_basic + relabel_configs: + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: exporter:9114 # host:port of the single exporter diff --git a/go.mod b/go.mod index 0ba71fca..5679a417 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,6 @@ module github.com/prometheus-community/elasticsearch_exporter -go 1.23.0 - -toolchain go1.24.1 +go 1.24.1 require ( github.com/alecthomas/kingpin/v2 v2.4.0 @@ -15,6 +13,7 @@ require ( github.com/prometheus/client_golang v1.22.0 github.com/prometheus/common v0.65.0 github.com/prometheus/exporter-toolkit v0.14.0 + gopkg.in/yaml.v3 v3.0.1 ) require ( diff --git a/main.go b/main.go index a8405f45..89533875 100644 --- a/main.go +++ b/main.go @@ -36,6 +36,7 @@ import ( webflag "github.com/prometheus/exporter-toolkit/web/kingpinflag" "github.com/prometheus-community/elasticsearch_exporter/collector" + 
"github.com/prometheus-community/elasticsearch_exporter/config" "github.com/prometheus-community/elasticsearch_exporter/pkg/clusterinfo" "github.com/prometheus-community/elasticsearch_exporter/pkg/roundtripper" ) @@ -109,6 +110,7 @@ func main() { awsRoleArn = kingpin.Flag("aws.role-arn", "Role ARN of an IAM role to assume."). Default("").String() + configFile = kingpin.Flag("config.file", "Path to YAML configuration file.").Default("").String() ) promslogConfig := &promslog.Config{} @@ -117,6 +119,18 @@ func main() { kingpin.CommandLine.HelpFlag.Short('h') kingpin.Parse() + // Load optional YAML config + var cfg *config.Config + if *configFile != "" { + var cfgErr error + cfg, cfgErr = config.LoadConfig(*configFile) + if cfgErr != nil { + // At this stage logger not yet created; fallback to stderr + fmt.Fprintf(os.Stderr, "failed to load config file: %v\n", cfgErr) + os.Exit(1) + } + } + var w io.Writer switch strings.ToLower(*logOutput) { case "stderr": @@ -238,7 +252,19 @@ func main() { // register cluster info retriever as prometheus collector prometheus.MustRegister(clusterInfoRetriever) - http.Handle(*metricsPath, promhttp.Handler()) + http.HandleFunc(*metricsPath, func(w http.ResponseWriter, r *http.Request) { + // If query has target param treat like probe endpoint + if r.URL.Query().Has("target") { + // reuse probe logic by delegating to /probe handler implementation + r.URL.Path = "/probe" // set path for consistency in logs + if probeHandler, _ := http.DefaultServeMux.Handler(&http.Request{URL: &url.URL{Path: "/probe"}}); probeHandler != nil { + probeHandler.ServeHTTP(w, r) + return + } + } + promhttp.Handler().ServeHTTP(w, r) + }) + if *metricsPath != "/" && *metricsPath != "" { landingConfig := web.LandingConfig{ Name: "Elasticsearch Exporter", @@ -264,6 +290,80 @@ func main() { http.Error(w, http.StatusText(http.StatusOK), http.StatusOK) }) + // probe endpoint + http.HandleFunc("/probe", func(w http.ResponseWriter, r *http.Request) { + origQuery := 
r.URL.Query() + targetStr, am, valErr := validateProbeParams(cfg, origQuery) + if valErr != nil { + http.Error(w, valErr.Error(), http.StatusBadRequest) + return + } + targetURL, _ := url.Parse(targetStr) + if am != nil { + if am.UserPass != nil { + targetURL.User = url.UserPassword(am.UserPass.Username, am.UserPass.Password) + } + if len(am.Options) > 0 { + q := targetURL.Query() + for k, v := range am.Options { + q.Set(k, v) + } + targetURL.RawQuery = q.Encode() + } + } + + // Build a dedicated HTTP client for this probe request (reuse TLS opts, timeout, etc.). + tlsCfg := createTLSConfig(*esCA, *esClientCert, *esClientPrivateKey, *esInsecureSkipVerify) + var transport http.RoundTripper = &http.Transport{ + TLSClientConfig: tlsCfg, + Proxy: http.ProxyFromEnvironment, + } + + // inject API key header if auth_module is of type apikey + if am != nil && strings.EqualFold(am.Type, "apikey") && am.APIKey != "" { + transport = &transportWithAPIKey{ + underlyingTransport: transport, + apiKey: am.APIKey, + } + } + probeClient := &http.Client{ + Timeout: *esTimeout, + Transport: transport, + } + + reg := prometheus.NewRegistry() + + // Core exporter collector + exp, err := collector.NewElasticsearchCollector( + logger, + []string{}, + collector.WithElasticsearchURL(targetURL), + collector.WithHTTPClient(probeClient), + ) + if err != nil { + http.Error(w, "failed to create exporter", http.StatusInternalServerError) + return + } + reg.MustRegister(exp) + // Basic additional collectors – reuse global CLI flags + reg.MustRegister(collector.NewClusterHealth(logger, probeClient, targetURL)) + reg.MustRegister(collector.NewNodes(logger, probeClient, targetURL, *esAllNodes, *esNode)) + if *esExportIndices || *esExportShards { + shardsC := collector.NewShards(logger, probeClient, targetURL) + indicesC := collector.NewIndices(logger, probeClient, targetURL, *esExportShards, *esExportIndexAliases) + reg.MustRegister(shardsC) + reg.MustRegister(indicesC) + } + if 
*esExportIndicesSettings { + reg.MustRegister(collector.NewIndicesSettings(logger, probeClient, targetURL)) + } + if *esExportIndicesMappings { + reg.MustRegister(collector.NewIndicesMappings(logger, probeClient, targetURL)) + } + + promhttp.HandlerFor(reg, promhttp.HandlerOpts{}).ServeHTTP(w, r) + }) + server := &http.Server{} go func() { if err = web.ListenAndServe(server, toolkitFlags, logger); err != nil { diff --git a/probe.go b/probe.go new file mode 100644 index 00000000..db9cc814 --- /dev/null +++ b/probe.go @@ -0,0 +1,54 @@ +package main + +import ( + "errors" + "net/url" + "strings" + + "github.com/prometheus-community/elasticsearch_exporter/config" +) + +var ( + errMissingTarget = errors.New("missing target parameter") + errInvalidTarget = errors.New("invalid target parameter") + errModuleNotFound = errors.New("auth_module not found") + errUnsupportedModule = errors.New("unsupported auth_module type") +) + +// validateProbeParams performs upfront validation of the query parameters. +// It returns the target string (as given), the resolved AuthModule (optional), or an error. 
+func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.AuthModule, error) { + target := q.Get("target") + if target == "" { + return "", nil, errMissingTarget + } + if _, err := url.Parse(target); err != nil { + return "", nil, errInvalidTarget + } + + modu := q.Get("auth_module") + if modu == "" { + return target, nil, nil // no auth module requested + } + if cfg == nil { + return "", nil, errModuleNotFound + } + am, ok := cfg.AuthModules[modu] + if !ok { + return "", nil, errModuleNotFound + } + switch strings.ToLower(am.Type) { + case "userpass": + if am.UserPass != nil { + return target, &am, nil + } + return "", nil, errUnsupportedModule + case "apikey": + if am.APIKey != "" { + return target, &am, nil + } + return "", nil, errUnsupportedModule + default: + return "", nil, errUnsupportedModule + } +} diff --git a/probe_test.go b/probe_test.go new file mode 100644 index 00000000..ad95a940 --- /dev/null +++ b/probe_test.go @@ -0,0 +1,44 @@ +package main + +import ( + "net/url" + "testing" + + "github.com/prometheus-community/elasticsearch_exporter/config" +) + +func TestValidateProbeParams(t *testing.T) { + cfg := &config.Config{AuthModules: map[string]config.AuthModule{}} + // missing target + _, _, err := validateProbeParams(cfg, url.Values{}) + if err != errMissingTarget { + t.Fatalf("expected missing target error, got %v", err) + } + + // invalid target + vals := url.Values{} + vals.Set("target", "http://[::1") + _, _, err = validateProbeParams(cfg, vals) + if err == nil { + t.Fatalf("expected invalid target error") + } + + // unknown module + vals = url.Values{} + vals.Set("target", "http://localhost:9200") + vals.Set("auth_module", "foo") + _, _, err = validateProbeParams(cfg, vals) + if err != errModuleNotFound { + t.Fatalf("expected module not found error, got %v", err) + } + + // good path + cfg.AuthModules["foo"] = config.AuthModule{Type: "userpass", UserPass: &config.UserPassConfig{Username: "u", Password: "p"}} + vals = 
url.Values{} + vals.Set("target", "http://localhost:9200") + vals.Set("auth_module", "foo") + tgt, am, err := validateProbeParams(cfg, vals) + if err != nil || am == nil || tgt == "" { + t.Fatalf("expected success, got err=%v", err) + } +} From 0a54b3f4746d56f7e2d85d41ae476d37a8671b9c Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 27 Jul 2025 10:12:33 -0400 Subject: [PATCH 02/25] Update example-prometheus.yml Signed-off-by: pincher95 --- examples/example-prometheus.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/example-prometheus.yml b/examples/example-prometheus.yml index b2f8b19f..f63be37a 100644 --- a/examples/example-prometheus.yml +++ b/examples/example-prometheus.yml @@ -1,19 +1,33 @@ scrape_configs: - job_name: es-multi metrics_path: /probe + # Default parameters for all scrapes in this job. + # Can be overridden by labels on a per-target basis. params: - auth_module: [prod_key] # default for all targets in this block + auth_module: [prod_key] static_configs: + # This is a target group. All targets here will use the default 'prod_key' auth_module. - targets: - - https://es-prod:9200 # uses prod_key + - https://es-prod-1:9200 + - https://es-prod-2:9200 + # This is another target group. - targets: - - https://es-stage:9200 # will override auth_module below + - https://es-stage:9200 + # The __param_ prefix on a label causes it to be added as a URL parameter. + # This will override the default auth_module for this target. labels: __param_auth_module: staging_basic relabel_configs: + # The following relabeling rules are applied to every target. + + # 1. The special label __address__ (the target address) is saved as the 'target' URL parameter. - source_labels: [__address__] target_label: __param_target + + # 2. The 'target' parameter is used as the 'instance' label for the scraped metrics. - source_labels: [__param_target] target_label: instance + + # 3. 
The scrape address is rewritten to point to the exporter. - target_label: __address__ replacement: exporter:9114 # host:port of the single exporter From a5d094228738335644c584b15023590705bed866 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 27 Jul 2025 10:17:22 -0400 Subject: [PATCH 03/25] Make `es.uri` optional by setting default to empty string check if it's empty and if so, don't parse it Signed-off-by: pincher95 Signed-off-by: pincher95 --- main.go | 182 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 93 insertions(+), 89 deletions(-) diff --git a/main.go b/main.go index 89533875..cc5066e4 100644 --- a/main.go +++ b/main.go @@ -61,7 +61,7 @@ func main() { toolkitFlags = webflag.AddFlags(kingpin.CommandLine, ":9114") esURI = kingpin.Flag("es.uri", "HTTP API address of an Elasticsearch node."). - Default("http://localhost:9200").String() + Default("").String() esTimeout = kingpin.Flag("es.timeout", "Timeout for trying to get stats from Elasticsearch."). Default("5s").Duration() @@ -143,114 +143,118 @@ func main() { promslogConfig.Writer = w logger := promslog.New(promslogConfig) - esURL, err := url.Parse(*esURI) - if err != nil { - logger.Error("failed to parse es.uri", "err", err) - os.Exit(1) - } + // version metric + prometheus.MustRegister(versioncollector.NewCollector(name)) - esUsername := os.Getenv("ES_USERNAME") - esPassword := os.Getenv("ES_PASSWORD") + // Create a context that is cancelled on SIGKILL or SIGINT. + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill) + defer cancel() - if esUsername != "" && esPassword != "" { - esURL.User = url.UserPassword(esUsername, esPassword) - } + if *esURI != "" { + esURL, err := url.Parse(*esURI) + if err != nil { + logger.Error("failed to parse es.uri", "err", err) + os.Exit(1) + } - // returns nil if not provided and falls back to simple TCP. 
- tlsConfig := createTLSConfig(*esCA, *esClientCert, *esClientPrivateKey, *esInsecureSkipVerify) + esUsername := os.Getenv("ES_USERNAME") + esPassword := os.Getenv("ES_PASSWORD") - var httpTransport http.RoundTripper + if esUsername != "" && esPassword != "" { + esURL.User = url.UserPassword(esUsername, esPassword) + } - httpTransport = &http.Transport{ - TLSClientConfig: tlsConfig, - Proxy: http.ProxyFromEnvironment, - } + // returns nil if not provided and falls back to simple TCP. + tlsConfig := createTLSConfig(*esCA, *esClientCert, *esClientPrivateKey, *esInsecureSkipVerify) - esAPIKey := os.Getenv("ES_API_KEY") + var httpTransport http.RoundTripper - if esAPIKey != "" { - httpTransport = &transportWithAPIKey{ - underlyingTransport: httpTransport, - apiKey: esAPIKey, + httpTransport = &http.Transport{ + TLSClientConfig: tlsConfig, + Proxy: http.ProxyFromEnvironment, } - } - httpClient := &http.Client{ - Timeout: *esTimeout, - Transport: httpTransport, - } + esAPIKey := os.Getenv("ES_API_KEY") - if *awsRegion != "" { - httpClient.Transport, err = roundtripper.NewAWSSigningTransport(httpTransport, *awsRegion, *awsRoleArn, logger) - if err != nil { - logger.Error("failed to create AWS transport", "err", err) - os.Exit(1) + if esAPIKey != "" { + httpTransport = &transportWithAPIKey{ + underlyingTransport: httpTransport, + apiKey: esAPIKey, + } } - } - // version metric - prometheus.MustRegister(versioncollector.NewCollector(name)) + httpClient := &http.Client{ + Timeout: *esTimeout, + Transport: httpTransport, + } - // create the exporter - exporter, err := collector.NewElasticsearchCollector( - logger, - []string{}, - collector.WithElasticsearchURL(esURL), - collector.WithHTTPClient(httpClient), - ) - if err != nil { - logger.Error("failed to create Elasticsearch collector", "err", err) - os.Exit(1) - } - prometheus.MustRegister(exporter) - - // TODO(@sysadmind): Remove this when we have a better way to get the cluster name to down stream collectors. 
- // cluster info retriever - clusterInfoRetriever := clusterinfo.New(logger, httpClient, esURL, *esClusterInfoInterval) - - prometheus.MustRegister(collector.NewClusterHealth(logger, httpClient, esURL)) - prometheus.MustRegister(collector.NewNodes(logger, httpClient, esURL, *esAllNodes, *esNode)) - - if *esExportIndices || *esExportShards { - sC := collector.NewShards(logger, httpClient, esURL) - prometheus.MustRegister(sC) - iC := collector.NewIndices(logger, httpClient, esURL, *esExportShards, *esExportIndexAliases) - prometheus.MustRegister(iC) - if registerErr := clusterInfoRetriever.RegisterConsumer(iC); registerErr != nil { - logger.Error("failed to register indices collector in cluster info") - os.Exit(1) + if *awsRegion != "" { + var err error + httpClient.Transport, err = roundtripper.NewAWSSigningTransport(httpTransport, *awsRegion, *awsRoleArn, logger) + if err != nil { + logger.Error("failed to create AWS transport", "err", err) + os.Exit(1) + } } - if registerErr := clusterInfoRetriever.RegisterConsumer(sC); registerErr != nil { - logger.Error("failed to register shards collector in cluster info") + + // create the exporter + exporter, err := collector.NewElasticsearchCollector( + logger, + []string{}, + collector.WithElasticsearchURL(esURL), + collector.WithHTTPClient(httpClient), + ) + if err != nil { + logger.Error("failed to create Elasticsearch collector", "err", err) os.Exit(1) } - } + prometheus.MustRegister(exporter) - if *esExportIndicesSettings { - prometheus.MustRegister(collector.NewIndicesSettings(logger, httpClient, esURL)) - } + // TODO(@sysadmind): Remove this when we have a better way to get the cluster name to down stream collectors. 
+ // cluster info retriever + clusterInfoRetriever := clusterinfo.New(logger, httpClient, esURL, *esClusterInfoInterval) - if *esExportIndicesMappings { - prometheus.MustRegister(collector.NewIndicesMappings(logger, httpClient, esURL)) - } + prometheus.MustRegister(collector.NewClusterHealth(logger, httpClient, esURL)) + prometheus.MustRegister(collector.NewNodes(logger, httpClient, esURL, *esAllNodes, *esNode)) - // Create a context that is cancelled on SIGKILL or SIGINT. - ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill) - defer cancel() + if *esExportIndices || *esExportShards { + sC := collector.NewShards(logger, httpClient, esURL) + prometheus.MustRegister(sC) + iC := collector.NewIndices(logger, httpClient, esURL, *esExportShards, *esExportIndexAliases) + prometheus.MustRegister(iC) + if registerErr := clusterInfoRetriever.RegisterConsumer(iC); registerErr != nil { + logger.Error("failed to register indices collector in cluster info") + os.Exit(1) + } + if registerErr := clusterInfoRetriever.RegisterConsumer(sC); registerErr != nil { + logger.Error("failed to register shards collector in cluster info") + os.Exit(1) + } + } - // start the cluster info retriever - switch runErr := clusterInfoRetriever.Run(ctx); runErr { - case nil: - logger.Info("started cluster info retriever", "interval", (*esClusterInfoInterval).String()) - case clusterinfo.ErrInitialCallTimeout: - logger.Info("initial cluster info call timed out") - default: - logger.Error("failed to run cluster info retriever", "err", err) - os.Exit(1) - } + if *esExportIndicesSettings { + prometheus.MustRegister(collector.NewIndicesSettings(logger, httpClient, esURL)) + } - // register cluster info retriever as prometheus collector - prometheus.MustRegister(clusterInfoRetriever) + if *esExportIndicesMappings { + prometheus.MustRegister(collector.NewIndicesMappings(logger, httpClient, esURL)) + } + + // start the cluster info retriever + switch runErr := 
clusterInfoRetriever.Run(ctx); runErr { + case nil: + logger.Info("started cluster info retriever", "interval", (*esClusterInfoInterval).String()) + case clusterinfo.ErrInitialCallTimeout: + logger.Info("initial cluster info call timed out") + default: + var err error + logger.Error("failed to run cluster info retriever", "err", err) + os.Exit(1) + } + + // register cluster info retriever as prometheus collector + prometheus.MustRegister(clusterInfoRetriever) + } http.HandleFunc(*metricsPath, func(w http.ResponseWriter, r *http.Request) { // If query has target param treat like probe endpoint @@ -366,7 +370,7 @@ func main() { server := &http.Server{} go func() { - if err = web.ListenAndServe(server, toolkitFlags, logger); err != nil { + if err := web.ListenAndServe(server, toolkitFlags, logger); err != nil { logger.Error("http server quit", "err", err) os.Exit(1) } From 5957b0d230046cad9b1505a95a657bc83f183ac7 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 27 Jul 2025 10:25:12 -0400 Subject: [PATCH 04/25] Update README.md Signed-off-by: pincher95 --- README.md | 75 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 75b747d3..adeaefab 100644 --- a/README.md +++ b/README.md @@ -52,32 +52,33 @@ Below is the command line options summary: elasticsearch_exporter --help ``` -| Argument | Introduced in Version | Description | Default | -| ------------------------- | --------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------- | -| collector.clustersettings | 1.6.0 | If true, query stats for cluster settings (As of 
v1.6.0, this flag has replaced "es.cluster_settings"). | false | -| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to. This could be a local node (`localhost:9200`, for instance), or the address of a remote Elasticsearch server. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | | -| es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false | -| es.indices | 1.0.2 | If true, query stats for all indices in the cluster. | false | -| es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false | -| es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false | -| es.aliases | 1.0.4rc1 | If true, include informational aliases metrics. | true | -| es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. | -| es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | -| collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). | false | -| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | -| es.slm | | If true, query stats for SLM. | false | -| es.data_stream | | If true, query state for Data Steams. | false | -| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | -| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | -| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. 
| | -| es.client-cert | 1.0.2 | Path to PEM file that contains the corresponding cert for the private key to connect to Elasticsearch. | | -| es.clusterinfo.interval | 1.1.0rc1 | Cluster info update interval for the cluster label | 5m | -| es.ssl-skip-verify | 1.0.4rc1 | Skip SSL verification when connecting to Elasticsearch. | false | -| web.listen-address | 1.0.2 | Address to listen on for web interface and telemetry. | :9114 | -| web.telemetry-path | 1.0.2 | Path under which to expose metrics. | /metrics | -| aws.region | 1.5.0 | Region for AWS elasticsearch | | -| aws.role-arn | 1.6.0 | Role ARN of an IAM role to assume. | | -| version | 1.0.2 | Show version info on stdout and exit. | | +| Argument | Introduced in Version | Description | Default | +| ------------------------- | --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | +| collector.clustersettings | 1.6.0 | If true, query stats for cluster settings (As of v1.6.0, this flag has replaced "es.cluster_settings"). | false | +| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to **when running in single-target mode**. Leave empty (the default) when you want to run the exporter only as a multi-target `/probe` endpoint. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | "" | +| es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false | +| es.indices | 1.0.2 | If true, query stats for all indices in the cluster. 
| false | +| es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false | +| es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false | +| es.aliases | 1.0.4rc1 | If true, include informational aliases metrics. | true | +| es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. | +| es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | +| collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). | false | +| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | +| es.slm | | If true, query stats for SLM. | false | +| es.data_stream | | If true, query state for Data Steams. | false | +| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | +| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | +| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. | | +| es.client-cert | 1.0.2 | Path to PEM file that contains the corresponding cert for the private key to connect to Elasticsearch. | | +| es.clusterinfo.interval | 1.1.0rc1 | Cluster info update interval for the cluster label | 5m | +| es.ssl-skip-verify | 1.0.4rc1 | Skip SSL verification when connecting to Elasticsearch. | false | +| web.listen-address | 1.0.2 | Address to listen on for web interface and telemetry. | :9114 | +| web.telemetry-path | 1.0.2 | Path under which to expose metrics. | /metrics | +| aws.region | 1.5.0 | Region for AWS elasticsearch | | +| aws.role-arn | 1.6.0 | Role ARN of an IAM role to assume. 
| | +| config.file | 2.0.0 | Path to a YAML configuration file that defines `auth_modules:` used by the `/probe` multi-target endpoint. Leave unset when not using multi-target mode. | | +| version | 1.0.2 | Show version info on stdout and exit. | | Commandline parameters start with a single `-` for versions less than `1.1.0rc1`. For versions greater than `1.1.0rc1`, commandline parameters are specified with `--`. @@ -95,17 +96,17 @@ Specifying those two environment variables will override authentication passed i ES 7.x supports RBACs. The following security privileges are required for the elasticsearch_exporter. -| Setting | Privilege Required | Description | -| :------------------------ | :----------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------ | -| collector.clustersettings | `cluster` `monitor` | -| exporter defaults | `cluster` `monitor` | All cluster read-only operations, like cluster health and state, hot threads, node info, node and cluster stats, and pending cluster tasks. 
| -| es.indices | `indices` `monitor` (per index or `*`) | All actions that are required for monitoring (recovery, segments info, index stats and status) | -| es.indices_settings | `indices` `monitor` (per index or `*`) | -| es.indices_mappings | `indices` `view_index_metadata` (per index or `*`) | -| es.shards | not sure if `indices` or `cluster` `monitor` or both | -| collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) | -| es.slm | `manage_slm` | -| es.data_stream | `monitor` or `manage` (per index or `*`) | +| Setting | Privilege Required | Description | +| :---------------------------- | :----------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------ | +| collector.**clustersettings** | `cluster` `monitor` | +| exporter defaults | `cluster` `monitor` | All cluster read-only operations, like cluster health and state, hot threads, node info, node and cluster stats, and pending cluster tasks. 
| +| es.indices | `indices` `monitor` (per index or `*`) | All actions that are required for monitoring (recovery, segments info, index stats and status) | +| es.indices_settings | `indices` `monitor` (per index or `*`) | +| es.indices_mappings | `indices` `view_index_metadata` (per index or `*`) | +| es.shards | not sure if `indices` or `cluster` `monitor` or both | +| collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) | +| es.slm | `manage_slm` | +| es.data_stream | `monitor` or `manage` (per index or `*`) | Further Information From 90609cc1da8f2c19f1984240adaa6b9e8eaa4506 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Mon, 28 Jul 2025 13:23:07 -0400 Subject: [PATCH 05/25] Add sanity target scheme validation Signed-off-by: pincher95 --- probe.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/probe.go b/probe.go index db9cc814..331f49d3 100644 --- a/probe.go +++ b/probe.go @@ -22,6 +22,13 @@ func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.Auth if target == "" { return "", nil, errMissingTarget } + + // If the target does not contain an URL scheme, default to http. + // This allows users to pass "host:port" without the "http://" prefix. 
+ if !strings.Contains(target, "://") { + target = "http://" + target + } + if _, err := url.Parse(target); err != nil { return "", nil, errInvalidTarget } From 1c08e686c0700b4d781d4c58391ba865400e5376 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Wed, 30 Jul 2025 07:20:55 -0400 Subject: [PATCH 06/25] Change yaml package to go.yaml.in/yaml/v3 Signed-off-by: pincher95 --- config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.go b/config/config.go index e50660ea..646f4fa9 100644 --- a/config/config.go +++ b/config/config.go @@ -3,7 +3,7 @@ package config import ( "os" - "gopkg.in/yaml.v3" + "go.yaml.in/yaml/v3" ) // Config represents the YAML configuration file structure. From daecd52aa0897f7a3a090898f326c3e95404c9d7 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Wed, 30 Jul 2025 07:23:06 -0400 Subject: [PATCH 07/25] Update yaml package to go.yaml.in/yaml/v3 Signed-off-by: pincher95 --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 5679a417..faf92040 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/prometheus/client_golang v1.22.0 github.com/prometheus/common v0.65.0 github.com/prometheus/exporter-toolkit v0.14.0 - gopkg.in/yaml.v3 v3.0.1 + go.yaml.in/yaml/v3 v3.0.4 ) require ( diff --git a/go.sum b/go.sum index 98d5e97f..6599a6bf 100644 --- a/go.sum +++ b/go.sum @@ -82,6 +82,8 @@ github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOf github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.38.0 
h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= From 58d29655442e8e9748eebc7b78ed034ac666dc8c Mon Sep 17 00:00:00 2001 From: pincher95 Date: Wed, 30 Jul 2025 07:24:21 -0400 Subject: [PATCH 08/25] Update CHANGELOG.md Signed-off-by: pincher95 --- CHANGELOG.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50d97789..98bd0508 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,17 +1,18 @@ ## master / unreleased +### Added +- Multi-target scraping via `/probe` endpoint with optional auth modules (compatible with postgres_exporter style) #1063 + BREAKING CHANGES: +* [CHANGE] Set `--es.uri` by default to empty string #1063 + The flag `--es.data_stream` has been renamed to `--collector.data-stream`. The flag `--es.ilm` has been renamed to `--collector.ilm`. * [CHANGE] Rename --es.data_stream to --collector.data-stream #983 * [CHANGE] Rename --es.ilm to --collector.ilm #999 -## [Unreleased] -### Added -- Multi-target scraping via `/probe` endpoint with optional auth modules (compatible with postgres_exporter style) (#PR) - ## 1.9.0 / 2025-02-27 BREAKING CHANGES: From bd4e1c46d37e32e5474a316fab275ef8f9e22085 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Thu, 31 Jul 2025 07:29:22 -0400 Subject: [PATCH 09/25] Remove whitespaces from README.md Signed-off-by: pincher95 --- README.md | 446 +++++++++++++++++++++++++++--------------------------- 1 file changed, 223 insertions(+), 223 deletions(-) diff --git a/README.md b/README.md index adeaefab..2ffd330c 100644 --- a/README.md +++ b/README.md @@ -52,33 +52,33 @@ Below is the command line options summary: elasticsearch_exporter --help ``` -| Argument | Introduced in Version | Description | Default | -| ------------------------- | --------------------- | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -| collector.clustersettings | 1.6.0 | If true, query stats for cluster settings (As of v1.6.0, this flag has replaced "es.cluster_settings"). | false | -| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to **when running in single-target mode**. Leave empty (the default) when you want to run the exporter only as a multi-target `/probe` endpoint. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | "" | -| es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false | -| es.indices | 1.0.2 | If true, query stats for all indices in the cluster. | false | -| es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false | -| es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false | -| es.aliases | 1.0.4rc1 | If true, include informational aliases metrics. | true | -| es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster. | -| es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false | -| collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). 
| false | -| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | -| es.slm | | If true, query stats for SLM. | false | -| es.data_stream | | If true, query state for Data Steams. | false | -| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | -| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | -| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. | | -| es.client-cert | 1.0.2 | Path to PEM file that contains the corresponding cert for the private key to connect to Elasticsearch. | | -| es.clusterinfo.interval | 1.1.0rc1 | Cluster info update interval for the cluster label | 5m | -| es.ssl-skip-verify | 1.0.4rc1 | Skip SSL verification when connecting to Elasticsearch. | false | -| web.listen-address | 1.0.2 | Address to listen on for web interface and telemetry. | :9114 | -| web.telemetry-path | 1.0.2 | Path under which to expose metrics. | /metrics | -| aws.region | 1.5.0 | Region for AWS elasticsearch | | -| aws.role-arn | 1.6.0 | Role ARN of an IAM role to assume. | | -| config.file | 2.0.0 | Path to a YAML configuration file that defines `auth_modules:` used by the `/probe` multi-target endpoint. Leave unset when not using multi-target mode. | | -| version | 1.0.2 | Show version info on stdout and exit. 
| |
+| Argument | Introduced in Version | Description | Default |
+| ----------------------- | --------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- |
+| collector.clustersettings| 1.6.0 | If true, query stats for cluster settings (As of v1.6.0, this flag has replaced "es.cluster_settings"). | false |
+| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to **when running in single-target mode**. Leave empty (the default) when you want to run the exporter only as a multi-target `/probe` endpoint. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | "" |
+| es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false |
+| es.indices | 1.0.2 | If true, query stats for all indices in the cluster. | false |
+| es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false |
+| es.indices_mappings | 1.2.0 | If true, query stats for mappings of all indices of the cluster. | false |
+| es.aliases | 1.0.4rc1 | If true, include informational aliases metrics. | true |
+| es.ilm | 1.6.0 | If true, query index lifecycle policies for indices in the cluster.
+| es.shards | 1.0.3rc1 | If true, query stats for all indices in the cluster, including shard-level stats (implies `es.indices=true`). | false |
+| collector.snapshots | 1.0.4rc1 | If true, query stats for the cluster snapshots. (As of v1.7.0, this flag has replaced "es.snapshots"). 
| false | +| collector.health-report | 1.10.0 | If true, query the health report (requires elasticsearch 8.7.0 or later) | false | +| es.slm | | If true, query stats for SLM. | false | +| es.data_stream | | If true, query state for Data Steams. | false | +| es.timeout | 1.0.2 | Timeout for trying to get stats from Elasticsearch. (ex: 20s) | 5s | +| es.ca | 1.0.2 | Path to PEM file that contains trusted Certificate Authorities for the Elasticsearch connection. | | +| es.client-private-key | 1.0.2 | Path to PEM file that contains the private key for client auth when connecting to Elasticsearch. | | +| es.client-cert | 1.0.2 | Path to PEM file that contains the corresponding cert for the private key to connect to Elasticsearch. | | +| es.clusterinfo.interval | 1.1.0rc1 | Cluster info update interval for the cluster label | 5m | +| es.ssl-skip-verify | 1.0.4rc1 | Skip SSL verification when connecting to Elasticsearch. | false | +| web.listen-address | 1.0.2 | Address to listen on for web interface and telemetry. | :9114 | +| web.telemetry-path | 1.0.2 | Path under which to expose metrics. | /metrics | +| aws.region | 1.5.0 | Region for AWS elasticsearch | | +| aws.role-arn | 1.6.0 | Role ARN of an IAM role to assume. | | +| config.file | 2.0.0 | Path to a YAML configuration file that defines `auth_modules:` used by the `/probe` multi-target endpoint. Leave unset when not using multi-target mode. | | +| version | 1.0.2 | Show version info on stdout and exit. | | Commandline parameters start with a single `-` for versions less than `1.1.0rc1`. For versions greater than `1.1.0rc1`, commandline parameters are specified with `--`. @@ -96,17 +96,17 @@ Specifying those two environment variables will override authentication passed i ES 7.x supports RBACs. The following security privileges are required for the elasticsearch_exporter. 
-| Setting | Privilege Required | Description | -| :---------------------------- | :----------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------ | -| collector.**clustersettings** | `cluster` `monitor` | -| exporter defaults | `cluster` `monitor` | All cluster read-only operations, like cluster health and state, hot threads, node info, node and cluster stats, and pending cluster tasks. | -| es.indices | `indices` `monitor` (per index or `*`) | All actions that are required for monitoring (recovery, segments info, index stats and status) | -| es.indices_settings | `indices` `monitor` (per index or `*`) | -| es.indices_mappings | `indices` `view_index_metadata` (per index or `*`) | -| es.shards | not sure if `indices` or `cluster` `monitor` or both | -| collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) | -| es.slm | `manage_slm` | -| es.data_stream | `monitor` or `manage` (per index or `*`) | +Setting | Privilege Required | Description +:---- | :---- | :---- +collector.clustersettings| `cluster` `monitor` | +exporter defaults | `cluster` `monitor` | All cluster read-only operations, like cluster health and state, hot threads, node info, node and cluster stats, and pending cluster tasks. 
| +es.indices | `indices` `monitor` (per index or `*`) | All actions that are required for monitoring (recovery, segments info, index stats and status) +es.indices_settings | `indices` `monitor` (per index or `*`) | +es.indices_mappings | `indices` `view_index_metadata` (per index or `*`) | +es.shards | not sure if `indices` or `cluster` `monitor` or both | +collector.snapshots | `cluster:admin/snapshot/status` and `cluster:admin/repository/get` | [ES Forum Post](https://discuss.elastic.co/t/permissions-for-backup-user-with-x-pack/88057) +es.slm | `manage_slm` +es.data_stream | `monitor` or `manage` (per index or `*`) | Further Information @@ -169,191 +169,191 @@ Prometheus scrape_config: ### Metrics -| Name | Type | Cardinality | Help | -| -------------------------------------------------------------------- | ------- | ----------- | --------------------------------------------------------------------------------------------------- | -| elasticsearch_breakers_estimated_size_bytes | gauge | 4 | Estimated size in bytes of breaker | -| elasticsearch_breakers_limit_size_bytes | gauge | 4 | Limit size in bytes for breaker | -| elasticsearch_breakers_tripped | counter | 4 | tripped for breaker | -| elasticsearch_cluster_health_active_primary_shards | gauge | 1 | The number of primary shards in your cluster. This is an aggregate total across all indices. | -| elasticsearch_cluster_health_active_shards | gauge | 1 | Aggregate total of all shards across all indices, which includes replica shards. | -| elasticsearch_cluster_health_delayed_unassigned_shards | gauge | 1 | Shards delayed to reduce reallocation overhead | -| elasticsearch_cluster_health_initializing_shards | gauge | 1 | Count of shards that are being freshly created. | -| elasticsearch_cluster_health_number_of_data_nodes | gauge | 1 | Number of data nodes in the cluster. | -| elasticsearch_cluster_health_number_of_in_flight_fetch | gauge | 1 | The number of ongoing shard info requests. 
| -| elasticsearch_cluster_health_number_of_nodes | gauge | 1 | Number of nodes in the cluster. | -| elasticsearch_cluster_health_number_of_pending_tasks | gauge | 1 | Cluster level changes which have not yet been executed | -| elasticsearch_cluster_health_task_max_waiting_in_queue_millis | gauge | 1 | Max time in millis that a task is waiting in queue. | -| elasticsearch_cluster_health_relocating_shards | gauge | 1 | The number of shards that are currently moving from one node to another node. | -| elasticsearch_cluster_health_status | gauge | 3 | Whether all primary and replica shards are allocated. | -| elasticsearch_cluster_health_unassigned_shards | gauge | 1 | The number of shards that exist in the cluster state, but cannot be found in the cluster itself. | -| elasticsearch_clustersettings_stats_max_shards_per_node | gauge | 0 | Current maximum number of shards per node setting. | -| elasticsearch_clustersettings_allocation_threshold_enabled | gauge | 0 | Is disk allocation decider enabled. | -| elasticsearch_clustersettings_allocation_watermark_flood_stage_bytes | gauge | 0 | Flood stage watermark as in bytes. | -| elasticsearch_clustersettings_allocation_watermark_high_bytes | gauge | 0 | High watermark for disk usage in bytes. | -| elasticsearch_clustersettings_allocation_watermark_low_bytes | gauge | 0 | Low watermark for disk usage in bytes. | -| elasticsearch_clustersettings_allocation_watermark_flood_stage_ratio | gauge | 0 | Flood stage watermark as a ratio. | -| elasticsearch_clustersettings_allocation_watermark_high_ratio | gauge | 0 | High watermark for disk usage as a ratio. | -| elasticsearch_clustersettings_allocation_watermark_low_ratio | gauge | 0 | Low watermark for disk usage as a ratio. 
| -| elasticsearch_filesystem_data_available_bytes | gauge | 1 | Available space on block device in bytes | -| elasticsearch_filesystem_data_free_bytes | gauge | 1 | Free space on block device in bytes | -| elasticsearch_filesystem_data_size_bytes | gauge | 1 | Size of block device in bytes | -| elasticsearch_filesystem_io_stats_device_operations_count | gauge | 1 | Count of disk operations | -| elasticsearch_filesystem_io_stats_device_read_operations_count | gauge | 1 | Count of disk read operations | -| elasticsearch_filesystem_io_stats_device_write_operations_count | gauge | 1 | Count of disk write operations | -| elasticsearch_filesystem_io_stats_device_read_size_kilobytes_sum | gauge | 1 | Total kilobytes read from disk | -| elasticsearch_filesystem_io_stats_device_write_size_kilobytes_sum | gauge | 1 | Total kilobytes written to disk | -| elasticsearch_ilm_status | gauge | 1 | Current status of ILM. Status can be `STOPPED`, `RUNNING`, `STOPPING`. | -| elasticsearch_ilm_index_status | gauge | 4 | Status of ILM policy for index | -| elasticsearch_indices_active_queries | gauge | 1 | The number of currently active queries | -| elasticsearch_indices_docs | gauge | 1 | Count of documents on this node | -| elasticsearch_indices_docs_deleted | gauge | 1 | Count of deleted documents on this node | -| elasticsearch_indices_deleted_docs_primary | gauge | 1 | Count of deleted documents with only primary shards | -| elasticsearch_indices_docs_primary | gauge | 1 | Count of documents with only primary shards on all nodes | -| elasticsearch_indices_docs_total | gauge | | Count of documents with shards on all nodes | -| elasticsearch_indices_fielddata_evictions | counter | 1 | Evictions from field data | -| elasticsearch_indices_fielddata_memory_size_bytes | gauge | 1 | Field data cache memory usage in bytes | -| elasticsearch_indices_filter_cache_evictions | counter | 1 | Evictions from filter cache | -| elasticsearch_indices_filter_cache_memory_size_bytes | gauge | 1 | 
Filter cache memory usage in bytes | -| elasticsearch_indices_flush_time_seconds | counter | 1 | Cumulative flush time in seconds | -| elasticsearch_indices_flush_total | counter | 1 | Total flushes | -| elasticsearch_indices_get_exists_time_seconds | counter | 1 | Total time get exists in seconds | -| elasticsearch_indices_get_exists_total | counter | 1 | Total get exists operations | -| elasticsearch_indices_get_missing_time_seconds | counter | 1 | Total time of get missing in seconds | -| elasticsearch_indices_get_missing_total | counter | 1 | Total get missing | -| elasticsearch_indices_get_time_seconds | counter | 1 | Total get time in seconds | -| elasticsearch_indices_get_total | counter | 1 | Total get | -| elasticsearch_indices_indexing_delete_time_seconds_total | counter | 1 | Total time indexing delete in seconds | -| elasticsearch_indices_indexing_delete_total | counter | 1 | Total indexing deletes | -| elasticsearch_indices_index_current | gauge | 1 | The number of documents currently being indexed to an index | -| elasticsearch_indices_indexing_index_time_seconds_total | counter | 1 | Cumulative index time in seconds | -| elasticsearch_indices_indexing_index_total | counter | 1 | Total index calls | -| elasticsearch_indices_mappings_stats_fields | gauge | 1 | Count of fields currently mapped by index | -| elasticsearch_indices_mappings_stats_json_parse_failures_total | counter | 0 | Number of errors while parsing JSON | -| elasticsearch_indices_mappings_stats_scrapes_total | counter | 0 | Current total Elasticsearch Indices Mappings scrapes | -| elasticsearch_indices_mappings_stats_up | gauge | 0 | Was the last scrape of the Elasticsearch Indices Mappings endpoint successful | -| elasticsearch_indices_merges_docs_total | counter | 1 | Cumulative docs merged | -| elasticsearch_indices_merges_total | counter | 1 | Total merges | -| elasticsearch_indices_merges_total_size_bytes_total | counter | 1 | Total merge size in bytes | -| 
elasticsearch_indices_merges_total_time_seconds_total | counter | 1 | Total time spent merging in seconds | -| elasticsearch_indices_query_cache_cache_total | counter | 1 | Count of query cache | -| elasticsearch_indices_query_cache_cache_size | gauge | 1 | Size of query cache | -| elasticsearch_indices_query_cache_count | counter | 2 | Count of query cache hit/miss | -| elasticsearch_indices_query_cache_evictions | counter | 1 | Evictions from query cache | -| elasticsearch_indices_query_cache_memory_size_bytes | gauge | 1 | Query cache memory usage in bytes | -| elasticsearch_indices_query_cache_total | counter | 1 | Size of query cache total | -| elasticsearch_indices_refresh_time_seconds_total | counter | 1 | Total time spent refreshing in seconds | -| elasticsearch_indices_refresh_total | counter | 1 | Total refreshes | -| elasticsearch_indices_request_cache_count | counter | 2 | Count of request cache hit/miss | -| elasticsearch_indices_request_cache_evictions | counter | 1 | Evictions from request cache | -| elasticsearch_indices_request_cache_memory_size_bytes | gauge | 1 | Request cache memory usage in bytes | -| elasticsearch_indices_search_fetch_time_seconds | counter | 1 | Total search fetch time in seconds | -| elasticsearch_indices_search_fetch_total | counter | 1 | Total number of fetches | -| elasticsearch_indices_search_query_time_seconds | counter | 1 | Total search query time in seconds | -| elasticsearch_indices_search_query_total | counter | 1 | Total number of queries | -| elasticsearch_indices_segments_count | gauge | 1 | Count of index segments on this node | -| elasticsearch_indices_segments_memory_bytes | gauge | 1 | Current memory size of segments in bytes | -| elasticsearch_indices_settings_creation_timestamp_seconds | gauge | 1 | Timestamp of the index creation in seconds | -| elasticsearch_indices_settings_stats_read_only_indices | gauge | 1 | Count of indices that have read_only_allow_delete=true | -| 
elasticsearch_indices_settings_total_fields | gauge | | Index setting value for index.mapping.total_fields.limit (total allowable mapped fields in a index) | -| elasticsearch_indices_settings_replicas | gauge | | Index setting value for index.replicas | -| elasticsearch_indices_shards_docs | gauge | 3 | Count of documents on this shard | -| elasticsearch_indices_shards_docs_deleted | gauge | 3 | Count of deleted documents on each shard | -| elasticsearch_indices_store_size_bytes | gauge | 1 | Current size of stored index data in bytes | -| elasticsearch_indices_store_size_bytes_primary | gauge | | Current size of stored index data in bytes with only primary shards on all nodes | -| elasticsearch_indices_store_size_bytes_total | gauge | | Current size of stored index data in bytes with all shards on all nodes | -| elasticsearch_indices_store_throttle_time_seconds_total | counter | 1 | Throttle time for index store in seconds | -| elasticsearch_indices_translog_operations | counter | 1 | Total translog operations | -| elasticsearch_indices_translog_size_in_bytes | counter | 1 | Total translog size in bytes | -| elasticsearch_indices_warmer_time_seconds_total | counter | 1 | Total warmer time in seconds | -| elasticsearch_indices_warmer_total | counter | 1 | Total warmer count | -| elasticsearch_jvm_gc_collection_seconds_count | counter | 2 | Count of JVM GC runs | -| elasticsearch_jvm_gc_collection_seconds_sum | counter | 2 | GC run time in seconds | -| elasticsearch_jvm_memory_committed_bytes | gauge | 2 | JVM memory currently committed by area | -| elasticsearch_jvm_memory_max_bytes | gauge | 1 | JVM memory max | -| elasticsearch_jvm_memory_used_bytes | gauge | 2 | JVM memory currently used by area | -| elasticsearch_jvm_memory_pool_used_bytes | gauge | 3 | JVM memory currently used by pool | -| elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool | -| elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by 
pool | -| elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool | -| elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS | -| elasticsearch_os_load1 | gauge | 1 | Shortterm load average | -| elasticsearch_os_load5 | gauge | 1 | Midterm load average | -| elasticsearch_os_load15 | gauge | 1 | Longterm load average | -| elasticsearch_process_cpu_percent | gauge | 1 | Percent CPU used by process | -| elasticsearch_process_cpu_seconds_total | counter | 1 | Process CPU time in seconds | -| elasticsearch_process_mem_resident_size_bytes | gauge | 1 | Resident memory in use by process in bytes | -| elasticsearch_process_mem_share_size_bytes | gauge | 1 | Shared memory in use by process in bytes | -| elasticsearch_process_mem_virtual_size_bytes | gauge | 1 | Total virtual memory used in bytes | -| elasticsearch_process_open_files_count | gauge | 1 | Open file descriptors | -| elasticsearch_snapshot_stats_number_of_snapshots | gauge | 1 | Total number of snapshots | -| elasticsearch_snapshot_stats_oldest_snapshot_timestamp | gauge | 1 | Oldest snapshot timestamp | -| elasticsearch_snapshot_stats_snapshot_start_time_timestamp | gauge | 1 | Last snapshot start timestamp | -| elasticsearch_snapshot_stats_latest_snapshot_timestamp_seconds | gauge | 1 | Timestamp of the latest SUCCESS or PARTIAL snapshot | -| elasticsearch_snapshot_stats_snapshot_end_time_timestamp | gauge | 1 | Last snapshot end timestamp | -| elasticsearch_snapshot_stats_snapshot_number_of_failures | gauge | 1 | Last snapshot number of failures | -| elasticsearch_snapshot_stats_snapshot_number_of_indices | gauge | 1 | Last snapshot number of indices | -| elasticsearch_snapshot_stats_snapshot_failed_shards | gauge | 1 | Last snapshot failed shards | -| elasticsearch_snapshot_stats_snapshot_successful_shards | gauge | 1 | Last snapshot successful shards | -| elasticsearch_snapshot_stats_snapshot_total_shards | gauge | 1 | Last snapshot total shard | -| 
elasticsearch_thread_pool_active_count | gauge | 14 | Thread Pool threads active | -| elasticsearch_thread_pool_completed_count | counter | 14 | Thread Pool operations completed | -| elasticsearch_thread_pool_largest_count | gauge | 14 | Thread Pool largest threads count | -| elasticsearch_thread_pool_queue_count | gauge | 14 | Thread Pool operations queued | -| elasticsearch_thread_pool_rejected_count | counter | 14 | Thread Pool operations rejected | -| elasticsearch_thread_pool_threads_count | gauge | 14 | Thread Pool current threads count | -| elasticsearch_transport_rx_packets_total | counter | 1 | Count of packets received | -| elasticsearch_transport_rx_size_bytes_total | counter | 1 | Total number of bytes received | -| elasticsearch_transport_tx_packets_total | counter | 1 | Count of packets sent | -| elasticsearch_transport_tx_size_bytes_total | counter | 1 | Total number of bytes sent | -| elasticsearch_clusterinfo_last_retrieval_success_ts | gauge | 1 | Timestamp of the last successful cluster info retrieval | -| elasticsearch_clusterinfo_up | gauge | 1 | Up metric for the cluster info collector | -| elasticsearch_clusterinfo_version_info | gauge | 6 | Constant metric with ES version information as labels | -| elasticsearch_slm_stats_up | gauge | 0 | Up metric for SLM collector | -| elasticsearch_slm_stats_total_scrapes | counter | 0 | Number of scrapes for SLM collector | -| elasticsearch_slm_stats_json_parse_failures | counter | 0 | JSON parse failures for SLM collector | -| elasticsearch_slm_stats_retention_runs_total | counter | 0 | Total retention runs | -| elasticsearch_slm_stats_retention_failed_total | counter | 0 | Total failed retention runs | -| elasticsearch_slm_stats_retention_timed_out_total | counter | 0 | Total retention run timeouts | -| elasticsearch_slm_stats_retention_deletion_time_seconds | gauge | 0 | Retention run deletion time | -| elasticsearch_slm_stats_total_snapshots_taken_total | counter | 0 | Total snapshots taken | -| 
elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | -| elasticsearch_slm_stats_total_snapshots_deleted_total | counter | 0 | Total snapshots deleted | -| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | -| elasticsearch_slm_stats_snapshots_taken_total | counter | 1 | Snapshots taken by policy | -| elasticsearch_slm_stats_snapshots_failed_total | counter | 1 | Snapshots failed by policy | -| elasticsearch_slm_stats_snapshots_deleted_total | counter | 1 | Snapshots deleted by policy | -| elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 1 | Snapshot deletion failures by policy | -| elasticsearch_slm_stats_operation_mode | gauge | 1 | SLM operation mode (Running, stopping, stopped) | -| elasticsearch_data_stream_stats_up | gauge | 0 | Up metric for Data Stream collection | -| elasticsearch_data_stream_stats_total_scrapes | counter | 0 | Total scrapes for Data Stream stats | -| elasticsearch_data_stream_stats_json_parse_failures | counter | 0 | Number of parsing failures for Data Stream stats | -| elasticsearch_data_stream_backing_indices_total | gauge | 1 | Number of backing indices for Data Stream | -| elasticsearch_data_stream_store_size_bytes | gauge | 1 | Current size of data stream backing indices in bytes | -| elasticsearch_health_report_creating_primaries | gauge | 1 | The number of creating primary shards | -| elasticsearch_health_report_creating_replicas | gauge | 1 | The number of creating replica shards | -| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 2 | Data stream lifecycle status | -| elasticsearch_health_report_disk_status | gauge | 2 | disk status | -| elasticsearch_health_report_ilm_policies | gauge | 1 | The number of ILM Policies | -| elasticsearch_health_report_ilm_stagnating_indices | gauge | 1 | The number of stagnating indices | -| elasticsearch_health_report_ilm_status | gauge | 2 | ILM status | -| 
elasticsearch_health_report_initializing_primaries | gauge | 1 | The number of initializing primary shards | -| elasticsearch_health_report_initializing_replicas | gauge | 1 | The number of initializing replica shards | -| elasticsearch_health_report_master_is_stable_status | gauge | 2 | Master is stable status | -| elasticsearch_health_report_max_shards_in_cluster_data | gauge | 1 | The number of maximum shards in a cluster | -| elasticsearch_health_report_max_shards_in_cluster_frozen | gauge | 1 | The number of maximum frozen shards in a cluster | -| elasticsearch_health_report_repository_integrity_status | gauge | 2 | Repository integrity status | -| elasticsearch_health_report_restarting_primaries | gauge | 1 | The number of restarting primary shards | -| elasticsearch_health_report_restarting_replicas | gauge | 1 | The number of restarting replica shards | -| elasticsearch_health_report_shards_availabilty_status | gauge | 2 | Shards availabilty status | -| elasticsearch_health_report_shards_capacity_status | gauge | 2 | Shards capacity status | -| elasticsearch_health_report_slm_policies | gauge | 1 | The number of SLM policies | -| elasticsearch_health_report_slm_status | gauge | 2 | SLM status | -| elasticsearch_health_report_started_primaries | gauge | 1 | The number of started primary shards | -| elasticsearch_health_report_started_replicas | gauge | 1 | The number of started replica shards | -| elasticsearch_health_report_status | gauge | 2 | Overall cluster status | -| elasticsearch_health_report_total_repositories | gauge | 1 | The number snapshot repositories | -| elasticsearch_health_report_unassigned_primaries | gauge | 1 | The number of unassigned primary shards | -| elasticsearch_health_report_unassigned_replicas | gauge | 1 | The number of unassigned replica shards | +| Name | Type | Cardinality | Help | 
+|----------------------------------------------------------------------|------------|-------------|-----------------------------------------------------------------------------------------------------| +| elasticsearch_breakers_estimated_size_bytes | gauge | 4 | Estimated size in bytes of breaker | +| elasticsearch_breakers_limit_size_bytes | gauge | 4 | Limit size in bytes for breaker | +| elasticsearch_breakers_tripped | counter | 4 | tripped for breaker | +| elasticsearch_cluster_health_active_primary_shards | gauge | 1 | The number of primary shards in your cluster. This is an aggregate total across all indices. | +| elasticsearch_cluster_health_active_shards | gauge | 1 | Aggregate total of all shards across all indices, which includes replica shards. | +| elasticsearch_cluster_health_delayed_unassigned_shards | gauge | 1 | Shards delayed to reduce reallocation overhead | +| elasticsearch_cluster_health_initializing_shards | gauge | 1 | Count of shards that are being freshly created. | +| elasticsearch_cluster_health_number_of_data_nodes | gauge | 1 | Number of data nodes in the cluster. | +| elasticsearch_cluster_health_number_of_in_flight_fetch | gauge | 1 | The number of ongoing shard info requests. | +| elasticsearch_cluster_health_number_of_nodes | gauge | 1 | Number of nodes in the cluster. | +| elasticsearch_cluster_health_number_of_pending_tasks | gauge | 1 | Cluster level changes which have not yet been executed | +| elasticsearch_cluster_health_task_max_waiting_in_queue_millis | gauge | 1 | Max time in millis that a task is waiting in queue. | +| elasticsearch_cluster_health_relocating_shards | gauge | 1 | The number of shards that are currently moving from one node to another node. | +| elasticsearch_cluster_health_status | gauge | 3 | Whether all primary and replica shards are allocated. | +| elasticsearch_cluster_health_unassigned_shards | gauge | 1 | The number of shards that exist in the cluster state, but cannot be found in the cluster itself. 
| +| elasticsearch_clustersettings_stats_max_shards_per_node | gauge | 0 | Current maximum number of shards per node setting. | +| elasticsearch_clustersettings_allocation_threshold_enabled | gauge | 0 | Is disk allocation decider enabled. | +| elasticsearch_clustersettings_allocation_watermark_flood_stage_bytes | gauge | 0 | Flood stage watermark as in bytes. | +| elasticsearch_clustersettings_allocation_watermark_high_bytes | gauge | 0 | High watermark for disk usage in bytes. | +| elasticsearch_clustersettings_allocation_watermark_low_bytes | gauge | 0 | Low watermark for disk usage in bytes. | +| elasticsearch_clustersettings_allocation_watermark_flood_stage_ratio | gauge | 0 | Flood stage watermark as a ratio. | +| elasticsearch_clustersettings_allocation_watermark_high_ratio | gauge | 0 | High watermark for disk usage as a ratio. | +| elasticsearch_clustersettings_allocation_watermark_low_ratio | gauge | 0 | Low watermark for disk usage as a ratio. | +| elasticsearch_filesystem_data_available_bytes | gauge | 1 | Available space on block device in bytes | +| elasticsearch_filesystem_data_free_bytes | gauge | 1 | Free space on block device in bytes | +| elasticsearch_filesystem_data_size_bytes | gauge | 1 | Size of block device in bytes | +| elasticsearch_filesystem_io_stats_device_operations_count | gauge | 1 | Count of disk operations | +| elasticsearch_filesystem_io_stats_device_read_operations_count | gauge | 1 | Count of disk read operations | +| elasticsearch_filesystem_io_stats_device_write_operations_count | gauge | 1 | Count of disk write operations | +| elasticsearch_filesystem_io_stats_device_read_size_kilobytes_sum | gauge | 1 | Total kilobytes read from disk | +| elasticsearch_filesystem_io_stats_device_write_size_kilobytes_sum | gauge | 1 | Total kilobytes written to disk | +| elasticsearch_ilm_status | gauge | 1 | Current status of ILM. Status can be `STOPPED`, `RUNNING`, `STOPPING`. 
| +| elasticsearch_ilm_index_status | gauge | 4 | Status of ILM policy for index | +| elasticsearch_indices_active_queries | gauge | 1 | The number of currently active queries | +| elasticsearch_indices_docs | gauge | 1 | Count of documents on this node | +| elasticsearch_indices_docs_deleted | gauge | 1 | Count of deleted documents on this node | +| elasticsearch_indices_deleted_docs_primary | gauge | 1 | Count of deleted documents with only primary shards | +| elasticsearch_indices_docs_primary | gauge | 1 | Count of documents with only primary shards on all nodes | +| elasticsearch_indices_docs_total | gauge | | Count of documents with shards on all nodes | +| elasticsearch_indices_fielddata_evictions | counter | 1 | Evictions from field data | +| elasticsearch_indices_fielddata_memory_size_bytes | gauge | 1 | Field data cache memory usage in bytes | +| elasticsearch_indices_filter_cache_evictions | counter | 1 | Evictions from filter cache | +| elasticsearch_indices_filter_cache_memory_size_bytes | gauge | 1 | Filter cache memory usage in bytes | +| elasticsearch_indices_flush_time_seconds | counter | 1 | Cumulative flush time in seconds | +| elasticsearch_indices_flush_total | counter | 1 | Total flushes | +| elasticsearch_indices_get_exists_time_seconds | counter | 1 | Total time get exists in seconds | +| elasticsearch_indices_get_exists_total | counter | 1 | Total get exists operations | +| elasticsearch_indices_get_missing_time_seconds | counter | 1 | Total time of get missing in seconds | +| elasticsearch_indices_get_missing_total | counter | 1 | Total get missing | +| elasticsearch_indices_get_time_seconds | counter | 1 | Total get time in seconds | +| elasticsearch_indices_get_total | counter | 1 | Total get | +| elasticsearch_indices_indexing_delete_time_seconds_total | counter | 1 | Total time indexing delete in seconds | +| elasticsearch_indices_indexing_delete_total | counter | 1 | Total indexing deletes | +| elasticsearch_indices_index_current | 
gauge | 1 | The number of documents currently being indexed to an index | +| elasticsearch_indices_indexing_index_time_seconds_total | counter | 1 | Cumulative index time in seconds | +| elasticsearch_indices_indexing_index_total | counter | 1 | Total index calls | +| elasticsearch_indices_mappings_stats_fields | gauge | 1 | Count of fields currently mapped by index | +| elasticsearch_indices_mappings_stats_json_parse_failures_total | counter | 0 | Number of errors while parsing JSON | +| elasticsearch_indices_mappings_stats_scrapes_total | counter | 0 | Current total Elasticsearch Indices Mappings scrapes | +| elasticsearch_indices_mappings_stats_up | gauge | 0 | Was the last scrape of the Elasticsearch Indices Mappings endpoint successful | +| elasticsearch_indices_merges_docs_total | counter | 1 | Cumulative docs merged | +| elasticsearch_indices_merges_total | counter | 1 | Total merges | +| elasticsearch_indices_merges_total_size_bytes_total | counter | 1 | Total merge size in bytes | +| elasticsearch_indices_merges_total_time_seconds_total | counter | 1 | Total time spent merging in seconds | +| elasticsearch_indices_query_cache_cache_total | counter | 1 | Count of query cache | +| elasticsearch_indices_query_cache_cache_size | gauge | 1 | Size of query cache | +| elasticsearch_indices_query_cache_count | counter | 2 | Count of query cache hit/miss | +| elasticsearch_indices_query_cache_evictions | counter | 1 | Evictions from query cache | +| elasticsearch_indices_query_cache_memory_size_bytes | gauge | 1 | Query cache memory usage in bytes | +| elasticsearch_indices_query_cache_total | counter | 1 | Size of query cache total | +| elasticsearch_indices_refresh_time_seconds_total | counter | 1 | Total time spent refreshing in seconds | +| elasticsearch_indices_refresh_total | counter | 1 | Total refreshes | +| elasticsearch_indices_request_cache_count | counter | 2 | Count of request cache hit/miss | +| elasticsearch_indices_request_cache_evictions | counter 
| 1 | Evictions from request cache | +| elasticsearch_indices_request_cache_memory_size_bytes | gauge | 1 | Request cache memory usage in bytes | +| elasticsearch_indices_search_fetch_time_seconds | counter | 1 | Total search fetch time in seconds | +| elasticsearch_indices_search_fetch_total | counter | 1 | Total number of fetches | +| elasticsearch_indices_search_query_time_seconds | counter | 1 | Total search query time in seconds | +| elasticsearch_indices_search_query_total | counter | 1 | Total number of queries | +| elasticsearch_indices_segments_count | gauge | 1 | Count of index segments on this node | +| elasticsearch_indices_segments_memory_bytes | gauge | 1 | Current memory size of segments in bytes | +| elasticsearch_indices_settings_creation_timestamp_seconds | gauge | 1 | Timestamp of the index creation in seconds | +| elasticsearch_indices_settings_stats_read_only_indices | gauge | 1 | Count of indices that have read_only_allow_delete=true | +| elasticsearch_indices_settings_total_fields | gauge | | Index setting value for index.mapping.total_fields.limit (total allowable mapped fields in a index) | +| elasticsearch_indices_settings_replicas | gauge | | Index setting value for index.replicas | +| elasticsearch_indices_shards_docs | gauge | 3 | Count of documents on this shard | +| elasticsearch_indices_shards_docs_deleted | gauge | 3 | Count of deleted documents on each shard | +| elasticsearch_indices_store_size_bytes | gauge | 1 | Current size of stored index data in bytes | +| elasticsearch_indices_store_size_bytes_primary | gauge | | Current size of stored index data in bytes with only primary shards on all nodes | +| elasticsearch_indices_store_size_bytes_total | gauge | | Current size of stored index data in bytes with all shards on all nodes | +| elasticsearch_indices_store_throttle_time_seconds_total | counter | 1 | Throttle time for index store in seconds | +| elasticsearch_indices_translog_operations | counter | 1 | Total translog 
operations | +| elasticsearch_indices_translog_size_in_bytes | counter | 1 | Total translog size in bytes | +| elasticsearch_indices_warmer_time_seconds_total | counter | 1 | Total warmer time in seconds | +| elasticsearch_indices_warmer_total | counter | 1 | Total warmer count | +| elasticsearch_jvm_gc_collection_seconds_count | counter | 2 | Count of JVM GC runs | +| elasticsearch_jvm_gc_collection_seconds_sum | counter | 2 | GC run time in seconds | +| elasticsearch_jvm_memory_committed_bytes | gauge | 2 | JVM memory currently committed by area | +| elasticsearch_jvm_memory_max_bytes | gauge | 1 | JVM memory max | +| elasticsearch_jvm_memory_used_bytes | gauge | 2 | JVM memory currently used by area | +| elasticsearch_jvm_memory_pool_used_bytes | gauge | 3 | JVM memory currently used by pool | +| elasticsearch_jvm_memory_pool_max_bytes | counter | 3 | JVM memory max by pool | +| elasticsearch_jvm_memory_pool_peak_used_bytes | counter | 3 | JVM memory peak used by pool | +| elasticsearch_jvm_memory_pool_peak_max_bytes | counter | 3 | JVM memory peak max by pool | +| elasticsearch_os_cpu_percent | gauge | 1 | Percent CPU used by the OS | +| elasticsearch_os_load1 | gauge | 1 | Shortterm load average | +| elasticsearch_os_load5 | gauge | 1 | Midterm load average | +| elasticsearch_os_load15 | gauge | 1 | Longterm load average | +| elasticsearch_process_cpu_percent | gauge | 1 | Percent CPU used by process | +| elasticsearch_process_cpu_seconds_total | counter | 1 | Process CPU time in seconds | +| elasticsearch_process_mem_resident_size_bytes | gauge | 1 | Resident memory in use by process in bytes | +| elasticsearch_process_mem_share_size_bytes | gauge | 1 | Shared memory in use by process in bytes | +| elasticsearch_process_mem_virtual_size_bytes | gauge | 1 | Total virtual memory used in bytes | +| elasticsearch_process_open_files_count | gauge | 1 | Open file descriptors | +| elasticsearch_snapshot_stats_number_of_snapshots | gauge | 1 | Total number of 
snapshots | +| elasticsearch_snapshot_stats_oldest_snapshot_timestamp | gauge | 1 | Oldest snapshot timestamp | +| elasticsearch_snapshot_stats_snapshot_start_time_timestamp | gauge | 1 | Last snapshot start timestamp | +| elasticsearch_snapshot_stats_latest_snapshot_timestamp_seconds | gauge | 1 | Timestamp of the latest SUCCESS or PARTIAL snapshot | +| elasticsearch_snapshot_stats_snapshot_end_time_timestamp | gauge | 1 | Last snapshot end timestamp | +| elasticsearch_snapshot_stats_snapshot_number_of_failures | gauge | 1 | Last snapshot number of failures | +| elasticsearch_snapshot_stats_snapshot_number_of_indices | gauge | 1 | Last snapshot number of indices | +| elasticsearch_snapshot_stats_snapshot_failed_shards | gauge | 1 | Last snapshot failed shards | +| elasticsearch_snapshot_stats_snapshot_successful_shards | gauge | 1 | Last snapshot successful shards | +| elasticsearch_snapshot_stats_snapshot_total_shards | gauge | 1 | Last snapshot total shard | +| elasticsearch_thread_pool_active_count | gauge | 14 | Thread Pool threads active | +| elasticsearch_thread_pool_completed_count | counter | 14 | Thread Pool operations completed | +| elasticsearch_thread_pool_largest_count | gauge | 14 | Thread Pool largest threads count | +| elasticsearch_thread_pool_queue_count | gauge | 14 | Thread Pool operations queued | +| elasticsearch_thread_pool_rejected_count | counter | 14 | Thread Pool operations rejected | +| elasticsearch_thread_pool_threads_count | gauge | 14 | Thread Pool current threads count | +| elasticsearch_transport_rx_packets_total | counter | 1 | Count of packets received | +| elasticsearch_transport_rx_size_bytes_total | counter | 1 | Total number of bytes received | +| elasticsearch_transport_tx_packets_total | counter | 1 | Count of packets sent | +| elasticsearch_transport_tx_size_bytes_total | counter | 1 | Total number of bytes sent | +| elasticsearch_clusterinfo_last_retrieval_success_ts | gauge | 1 | Timestamp of the last successful cluster 
info retrieval | +| elasticsearch_clusterinfo_up | gauge | 1 | Up metric for the cluster info collector | +| elasticsearch_clusterinfo_version_info | gauge | 6 | Constant metric with ES version information as labels | +| elasticsearch_slm_stats_up | gauge | 0 | Up metric for SLM collector | +| elasticsearch_slm_stats_total_scrapes | counter | 0 | Number of scrapes for SLM collector | +| elasticsearch_slm_stats_json_parse_failures | counter | 0 | JSON parse failures for SLM collector | +| elasticsearch_slm_stats_retention_runs_total | counter | 0 | Total retention runs | +| elasticsearch_slm_stats_retention_failed_total | counter | 0 | Total failed retention runs | +| elasticsearch_slm_stats_retention_timed_out_total | counter | 0 | Total retention run timeouts | +| elasticsearch_slm_stats_retention_deletion_time_seconds | gauge | 0 | Retention run deletion time | +| elasticsearch_slm_stats_total_snapshots_taken_total | counter | 0 | Total snapshots taken | +| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | +| elasticsearch_slm_stats_total_snapshots_deleted_total | counter | 0 | Total snapshots deleted | +| elasticsearch_slm_stats_total_snapshots_failed_total | counter | 0 | Total snapshots failed | +| elasticsearch_slm_stats_snapshots_taken_total | counter | 1 | Snapshots taken by policy | +| elasticsearch_slm_stats_snapshots_failed_total | counter | 1 | Snapshots failed by policy | +| elasticsearch_slm_stats_snapshots_deleted_total | counter | 1 | Snapshots deleted by policy | +| elasticsearch_slm_stats_snapshot_deletion_failures_total | counter | 1 | Snapshot deletion failures by policy | +| elasticsearch_slm_stats_operation_mode | gauge | 1 | SLM operation mode (Running, stopping, stopped) | +| elasticsearch_data_stream_stats_up | gauge | 0 | Up metric for Data Stream collection | +| elasticsearch_data_stream_stats_total_scrapes | counter | 0 | Total scrapes for Data Stream stats | +| 
elasticsearch_data_stream_stats_json_parse_failures | counter | 0 | Number of parsing failures for Data Stream stats | +| elasticsearch_data_stream_backing_indices_total | gauge | 1 | Number of backing indices for Data Stream | +| elasticsearch_data_stream_store_size_bytes | gauge | 1 | Current size of data stream backing indices in bytes | +| elasticsearch_health_report_creating_primaries | gauge | 1 | The number of creating primary shards | +| elasticsearch_health_report_creating_replicas | gauge | 1 | The number of creating replica shards | +| elasticsearch_health_report_data_stream_lifecycle_status | gauge | 2 | Data stream lifecycle status | +| elasticsearch_health_report_disk_status | gauge | 2 | disk status | +| elasticsearch_health_report_ilm_policies | gauge | 1 | The number of ILM Policies | +| elasticsearch_health_report_ilm_stagnating_indices | gauge | 1 | The number of stagnating indices | +| elasticsearch_health_report_ilm_status | gauge | 2 | ILM status | +| elasticsearch_health_report_initializing_primaries | gauge | 1 | The number of initializing primary shards | +| elasticsearch_health_report_initializing_replicas | gauge | 1 | The number of initializing replica shards | +| elasticsearch_health_report_master_is_stable_status | gauge | 2 | Master is stable status | +| elasticsearch_health_report_max_shards_in_cluster_data | gauge | 1 | The number of maximum shards in a cluster | +| elasticsearch_health_report_max_shards_in_cluster_frozen | gauge | 1 | The number of maximum frozen shards in a cluster | +| elasticsearch_health_report_repository_integrity_status | gauge | 2 | Repository integrity status | +| elasticsearch_health_report_restarting_primaries | gauge | 1 | The number of restarting primary shards | +| elasticsearch_health_report_restarting_replicas | gauge | 1 | The number of restarting replica shards | +| elasticsearch_health_report_shards_availabilty_status | gauge | 2 | Shards availabilty status | +| 
elasticsearch_health_report_shards_capacity_status | gauge | 2 | Shards capacity status | +| elasticsearch_health_report_slm_policies | gauge | 1 | The number of SLM policies | +| elasticsearch_health_report_slm_status | gauge | 2 | SLM status | +| elasticsearch_health_report_started_primaries | gauge | 1 | The number of started primary shards | +| elasticsearch_health_report_started_replicas | gauge | 1 | The number of started replica shards | +| elasticsearch_health_report_status | gauge | 2 | Overall cluster status | +| elasticsearch_health_report_total_repositories | gauge | 1 | The number snapshot repositories | +| elasticsearch_health_report_unassigned_primaries | gauge | 1 | The number of unassigned primary shards | +| elasticsearch_health_report_unassigned_replicas | gauge | 1 | The number of unassigned replica shards | ### Alerts & Recording Rules From d77cf8e4638bcb5e89dda8c1705271e6dbc1e3f0 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sat, 2 Aug 2025 14:37:05 -0500 Subject: [PATCH 10/25] Add testing for apikey authentication module Update examples/auth_modules.yml Fix main.go to apply userpass credentials only if the module type is explicitly set to userpass. Signed-off-by: pincher95 --- config/config_test.go | 52 +++++++++++++++++++++++++++++++++++++++ examples/auth_modules.yml | 2 +- main.go | 3 ++- probe_test.go | 25 ++++++++++++++++++- 4 files changed, 79 insertions(+), 3 deletions(-) diff --git a/config/config_test.go b/config/config_test.go index 00caac26..16d18f2f 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -32,3 +32,55 @@ func TestLoadConfig(t *testing.T) { t.Fatalf("unexpected username: %s", cfg.AuthModules["foo"].UserPass.Username) } } + +// Additional test coverage for apikey based authentication modules. +// Technically, a module could specify both userpass and apikey configs, but +// the `type` field should dictate which credentials are considered valid by +// the application logic. 
+func TestLoadConfigAPIKey(t *testing.T) { + yaml := `auth_modules: + api_only: + type: apikey + apikey: secretkey123 + mixed: + type: apikey + apikey: anotherkey456 + userpass: + username: should + password: be_ignored +` + tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") + if err != nil { + t.Fatalf("temp file: %v", err) + } + if _, err := tmp.WriteString(yaml); err != nil { + t.Fatalf("write: %v", err) + } + tmp.Close() + + cfg, err := LoadConfig(tmp.Name()) + if err != nil { + t.Fatalf("loadConfig: %v", err) + } + + am := cfg.AuthModules["api_only"] + if am.Type != "apikey" { + t.Fatalf("expected module type apikey, got %s", am.Type) + } + if am.APIKey != "secretkey123" { + t.Fatalf("unexpected apikey value: %s", am.APIKey) + } + + mixed := cfg.AuthModules["mixed"] + if mixed.Type != "apikey" { + t.Fatalf("expected mixed module type apikey, got %s", mixed.Type) + } + if mixed.APIKey != "anotherkey456" { + t.Fatalf("unexpected mixed apikey value: %s", mixed.APIKey) + } + // The userpass credentials should still be parsed but are expected to be ignored + // by the application when the type is apikey. + if mixed.UserPass == nil { + t.Fatalf("expected userpass section to be parsed for mixed module") + } +} diff --git a/examples/auth_modules.yml b/examples/auth_modules.yml index b1b8d925..eeec23f4 100644 --- a/examples/auth_modules.yml +++ b/examples/auth_modules.yml @@ -19,7 +19,7 @@ auth_modules: username: readonly password: changeme - # 3. API-Key authentication (future) + # 3. API-Key authentication prod_key: type: apikey apikey: BASE64-ENCODED-KEY== diff --git a/main.go b/main.go index cc5066e4..5332f8c7 100644 --- a/main.go +++ b/main.go @@ -304,7 +304,8 @@ func main() { } targetURL, _ := url.Parse(targetStr) if am != nil { - if am.UserPass != nil { + // Apply userpass credentials only if the module type is explicitly set to userpass. 
+ if strings.EqualFold(am.Type, "userpass") && am.UserPass != nil { targetURL.User = url.UserPassword(am.UserPass.Username, am.UserPass.Password) } if len(am.Options) > 0 { diff --git a/probe_test.go b/probe_test.go index ad95a940..5b0cf5d5 100644 --- a/probe_test.go +++ b/probe_test.go @@ -32,7 +32,7 @@ func TestValidateProbeParams(t *testing.T) { t.Fatalf("expected module not found error, got %v", err) } - // good path + // good path (userpass) cfg.AuthModules["foo"] = config.AuthModule{Type: "userpass", UserPass: &config.UserPassConfig{Username: "u", Password: "p"}} vals = url.Values{} vals.Set("target", "http://localhost:9200") @@ -41,4 +41,27 @@ func TestValidateProbeParams(t *testing.T) { if err != nil || am == nil || tgt == "" { t.Fatalf("expected success, got err=%v", err) } + + // good path (apikey) with both userpass and apikey set - apikey should be accepted + cfg.AuthModules["api"] = config.AuthModule{ + Type: "apikey", + APIKey: "mysecret", + UserPass: &config.UserPassConfig{Username: "u", Password: "p"}, + } + vals = url.Values{} + vals.Set("target", "http://localhost:9200") + vals.Set("auth_module", "api") + tgt, am, err = validateProbeParams(cfg, vals) + if err != nil { + t.Fatalf("expected success for apikey module, got err=%v", err) + } + if am == nil || am.Type != "apikey" { + t.Fatalf("expected apikey module, got %+v", am) + } + if am.APIKey != "mysecret" { + t.Fatalf("unexpected apikey value: %s", am.APIKey) + } + if tgt == "" { + t.Fatalf("expected non-empty target string") + } } From d329fb312a0ed44aaa5364137b40bcb0294bfbd9 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sat, 2 Aug 2025 16:39:22 -0500 Subject: [PATCH 11/25] Add Load-time validation for the auth module config file during startup Keep light-weight validation for the probe params during runtime Add AWS SigV4 authentication module support Signed-off-by: pincher95 --- config/config.go | 37 ++++++++++++++++++++++- config/config_test.go | 69 
++++++++++++++++++++++++++++--------------- main.go | 19 ++++++++---- probe.go | 9 +++--- probe_test.go | 25 ++++++++++++++-- 5 files changed, 122 insertions(+), 37 deletions(-) diff --git a/config/config.go b/config/config.go index 646f4fa9..6867746b 100644 --- a/config/config.go +++ b/config/config.go @@ -1,7 +1,9 @@ package config import ( + "fmt" "os" + "strings" "go.yaml.in/yaml/v3" ) @@ -15,15 +17,45 @@ type AuthModule struct { Type string `yaml:"type"` UserPass *UserPassConfig `yaml:"userpass,omitempty"` APIKey string `yaml:"apikey,omitempty"` + AWS *AWSConfig `yaml:"aws,omitempty"` Options map[string]string `yaml:"options,omitempty"` } +// AWSConfig contains settings for SigV4 authentication. +type AWSConfig struct { + Region string `yaml:"region"` + RoleARN string `yaml:"role_arn,omitempty"` +} + type UserPassConfig struct { Username string `yaml:"username"` Password string `yaml:"password"` } -// LoadConfig reads and parses YAML config file. +// validate ensures every auth module has the required fields according to its type. +func (c *Config) validate() error { + for name, am := range c.AuthModules { + switch strings.ToLower(am.Type) { + case "userpass": + if am.UserPass == nil || am.UserPass.Username == "" || am.UserPass.Password == "" { + return fmt.Errorf("auth_module %s type userpass requires username and password", name) + } + case "apikey": + if am.APIKey == "" { + return fmt.Errorf("auth_module %s type apikey requires apikey", name) + } + case "aws": + if am.AWS == nil || am.AWS.Region == "" { + return fmt.Errorf("auth_module %s type aws requires region", name) + } + default: + return fmt.Errorf("auth_module %s has unsupported type %s", name, am.Type) + } + } + return nil +} + +// LoadConfig reads, parses, and validates the YAML config file. 
func LoadConfig(path string) (*Config, error) { data, err := os.ReadFile(path) if err != nil { @@ -33,5 +65,8 @@ func LoadConfig(path string) (*Config, error) { if err := yaml.Unmarshal(data, &cfg); err != nil { return nil, err } + if err := cfg.validate(); err != nil { + return nil, err + } return &cfg, nil } diff --git a/config/config_test.go b/config/config_test.go index 16d18f2f..7c242810 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -33,21 +33,12 @@ func TestLoadConfig(t *testing.T) { } } -// Additional test coverage for apikey based authentication modules. -// Technically, a module could specify both userpass and apikey configs, but -// the `type` field should dictate which credentials are considered valid by -// the application logic. +// Additional test coverage for apikey and aws based authentication modules. func TestLoadConfigAPIKey(t *testing.T) { yaml := `auth_modules: api_only: type: apikey apikey: secretkey123 - mixed: - type: apikey - apikey: anotherkey456 - userpass: - username: should - password: be_ignored ` tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") if err != nil { @@ -64,23 +55,55 @@ func TestLoadConfigAPIKey(t *testing.T) { } am := cfg.AuthModules["api_only"] - if am.Type != "apikey" { - t.Fatalf("expected module type apikey, got %s", am.Type) + if am.Type != "apikey" || am.APIKey != "secretkey123" { + t.Fatalf("unexpected apikey module: %+v", am) + } +} + +func TestLoadConfigAWS(t *testing.T) { + yaml := `auth_modules: + awsmod: + type: aws + aws: + region: us-east-1 + role_arn: arn:aws:iam::123456789012:role/metrics +` + tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") + if err != nil { + t.Fatalf("temp file: %v", err) } - if am.APIKey != "secretkey123" { - t.Fatalf("unexpected apikey value: %s", am.APIKey) + if _, err := tmp.WriteString(yaml); err != nil { + t.Fatalf("write: %v", err) } + tmp.Close() - mixed := cfg.AuthModules["mixed"] - if mixed.Type != "apikey" { - t.Fatalf("expected mixed module type 
apikey, got %s", mixed.Type) + cfg, err := LoadConfig(tmp.Name()) + if err != nil { + t.Fatalf("loadConfig: %v", err) } - if mixed.APIKey != "anotherkey456" { - t.Fatalf("unexpected mixed apikey value: %s", mixed.APIKey) + + awsMod := cfg.AuthModules["awsmod"] + if awsMod.Type != "aws" || awsMod.AWS == nil || awsMod.AWS.Region != "us-east-1" { + t.Fatalf("unexpected aws module: %+v", awsMod) } - // The userpass credentials should still be parsed but are expected to be ignored - // by the application when the type is apikey. - if mixed.UserPass == nil { - t.Fatalf("expected userpass section to be parsed for mixed module") +} + +func TestLoadConfigInvalidUserPass(t *testing.T) { + // missing userpass section for type=userpass + yaml := `auth_modules: + bad: + type: userpass +` + tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") + if err != nil { + t.Fatalf("temp file: %v", err) + } + if _, err := tmp.WriteString(yaml); err != nil { + t.Fatalf("write: %v", err) + } + tmp.Close() + + if _, err := LoadConfig(tmp.Name()); err == nil { + t.Fatalf("expected validation error for missing credentials, got nil") } } diff --git a/main.go b/main.go index 5332f8c7..e15d7bb9 100644 --- a/main.go +++ b/main.go @@ -324,11 +324,20 @@ func main() { Proxy: http.ProxyFromEnvironment, } - // inject API key header if auth_module is of type apikey - if am != nil && strings.EqualFold(am.Type, "apikey") && am.APIKey != "" { - transport = &transportWithAPIKey{ - underlyingTransport: transport, - apiKey: am.APIKey, + // inject authentication based on auth_module type + if am != nil { + if strings.EqualFold(am.Type, "apikey") && am.APIKey != "" { + transport = &transportWithAPIKey{ + underlyingTransport: transport, + apiKey: am.APIKey, + } + } else if strings.EqualFold(am.Type, "aws") && am.AWS != nil { + var err error + transport, err = roundtripper.NewAWSSigningTransport(transport, am.AWS.Region, am.AWS.RoleARN, logger) + if err != nil { + http.Error(w, "failed to create AWS signing 
transport", http.StatusInternalServerError) + return + } } } probeClient := &http.Client{ diff --git a/probe.go b/probe.go index 331f49d3..cc4e19cc 100644 --- a/probe.go +++ b/probe.go @@ -46,12 +46,11 @@ func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.Auth } switch strings.ToLower(am.Type) { case "userpass": - if am.UserPass != nil { - return target, &am, nil - } - return "", nil, errUnsupportedModule + return target, &am, nil case "apikey": - if am.APIKey != "" { + return target, &am, nil + case "aws": + if am.AWS != nil && am.AWS.Region != "" { return target, &am, nil } return "", nil, errUnsupportedModule diff --git a/probe_test.go b/probe_test.go index 5b0cf5d5..8ada7ea7 100644 --- a/probe_test.go +++ b/probe_test.go @@ -51,7 +51,7 @@ func TestValidateProbeParams(t *testing.T) { vals = url.Values{} vals.Set("target", "http://localhost:9200") vals.Set("auth_module", "api") - tgt, am, err = validateProbeParams(cfg, vals) + _, am, err = validateProbeParams(cfg, vals) if err != nil { t.Fatalf("expected success for apikey module, got err=%v", err) } @@ -61,7 +61,26 @@ func TestValidateProbeParams(t *testing.T) { if am.APIKey != "mysecret" { t.Fatalf("unexpected apikey value: %s", am.APIKey) } - if tgt == "" { - t.Fatalf("expected non-empty target string") + + // good path (aws) + cfg.AuthModules["awsmod"] = config.AuthModule{ + Type: "aws", + AWS: &config.AWSConfig{ + Region: "us-east-1", + RoleARN: "arn:aws:iam::123456789012:role/metrics", + }, + } + vals = url.Values{} + vals.Set("target", "http://localhost:9200") + vals.Set("auth_module", "awsmod") + _, am, err = validateProbeParams(cfg, vals) + if err != nil { + t.Fatalf("expected success for aws module, got err=%v", err) + } + if am == nil || am.Type != "aws" { + t.Fatalf("expected aws module, got %+v", am) + } + if am.AWS == nil || am.AWS.Region != "us-east-1" { + t.Fatalf("unexpected aws config: %+v", am.AWS) } } From 5f754b71e21c387d71e28b47b629b9f4605c75bc Mon Sep 17 00:00:00 
2001 From: pincher95 Date: Sun, 3 Aug 2025 09:12:55 -0500 Subject: [PATCH 12/25] Expose error in the logger Signed-off-by: pincher95 --- main.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/main.go b/main.go index e15d7bb9..149006df 100644 --- a/main.go +++ b/main.go @@ -247,8 +247,7 @@ func main() { case clusterinfo.ErrInitialCallTimeout: logger.Info("initial cluster info call timed out") default: - var err error - logger.Error("failed to run cluster info retriever", "err", err) + logger.Error("failed to run cluster info retriever", "err", runErr) os.Exit(1) } From 4a660def10f505b682b79d3dd34bf5607eee7109 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 3 Aug 2025 09:35:11 -0500 Subject: [PATCH 13/25] Add TLS config per target support Add TLS config validation Update config test to include TLS config Signed-off-by: pincher95 --- config/config.go | 24 +++++++ config/config_test.go | 155 ++++++++++++++++++++---------------------- main.go | 14 +++- 3 files changed, 112 insertions(+), 81 deletions(-) diff --git a/config/config.go b/config/config.go index 6867746b..c50a9726 100644 --- a/config/config.go +++ b/config/config.go @@ -18,6 +18,7 @@ type AuthModule struct { UserPass *UserPassConfig `yaml:"userpass,omitempty"` APIKey string `yaml:"apikey,omitempty"` AWS *AWSConfig `yaml:"aws,omitempty"` + TLS *TLSConfig `yaml:"tls,omitempty"` Options map[string]string `yaml:"options,omitempty"` } @@ -27,6 +28,14 @@ type AWSConfig struct { RoleARN string `yaml:"role_arn,omitempty"` } +// TLSConfig allows per-target TLS options. 
+type TLSConfig struct { + CAFile string `yaml:"ca_file,omitempty"` + CertFile string `yaml:"cert_file,omitempty"` + KeyFile string `yaml:"key_file,omitempty"` + InsecureSkipVerify bool `yaml:"insecure_skip_verify,omitempty"` +} + type UserPassConfig struct { Username string `yaml:"username"` Password string `yaml:"password"` @@ -51,6 +60,21 @@ func (c *Config) validate() error { default: return fmt.Errorf("auth_module %s has unsupported type %s", name, am.Type) } + + // TLS validation (optional but if specified must be coherent) + if am.TLS != nil { + if (am.TLS.CertFile == "") != (am.TLS.KeyFile == "") { + return fmt.Errorf("auth_module %s tls requires both cert_file and key_file or neither", name) + } + for _, p := range []string{am.TLS.CAFile, am.TLS.CertFile, am.TLS.KeyFile} { + if p == "" { + continue + } + if _, err := os.Stat(p); err != nil { + return fmt.Errorf("auth_module %s tls file %s not accessible: %w", name, p, err) + } + } + } } return nil } diff --git a/config/config_test.go b/config/config_test.go index 7c242810..8d23598b 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -5,105 +5,100 @@ import ( "testing" ) -func TestLoadConfig(t *testing.T) { - yaml := `auth_modules: - foo: - type: userpass - userpass: - username: bar - password: baz - options: - sslmode: disable -` - tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") +func mustTempFile(t *testing.T) string { + f, err := os.CreateTemp(t.TempDir(), "pem-*.crt") if err != nil { t.Fatalf("temp file: %v", err) } - if _, err := tmp.WriteString(yaml); err != nil { - t.Fatalf("write: %v", err) - } - tmp.Close() - - cfg, err := LoadConfig(tmp.Name()) - if err != nil { - t.Fatalf("loadConfig: %v", err) - } - if cfg.AuthModules["foo"].UserPass.Username != "bar" { - t.Fatalf("unexpected username: %s", cfg.AuthModules["foo"].UserPass.Username) - } + f.Close() + return f.Name() } -// Additional test coverage for apikey and aws based authentication modules. 
-func TestLoadConfigAPIKey(t *testing.T) { +func TestLoadConfigTLSValid(t *testing.T) { + ca := mustTempFile(t) + cert := mustTempFile(t) + key := mustTempFile(t) yaml := `auth_modules: - api_only: - type: apikey - apikey: secretkey123 + secure: + type: userpass + userpass: + username: foo + password: bar + tls: + ca_file: ` + ca + ` + cert_file: ` + cert + ` + key_file: ` + key + ` ` - tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") - if err != nil { - t.Fatalf("temp file: %v", err) - } - if _, err := tmp.WriteString(yaml); err != nil { - t.Fatalf("write: %v", err) - } + tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") + tmp.WriteString(yaml) tmp.Close() - - cfg, err := LoadConfig(tmp.Name()) - if err != nil { - t.Fatalf("loadConfig: %v", err) - } - - am := cfg.AuthModules["api_only"] - if am.Type != "apikey" || am.APIKey != "secretkey123" { - t.Fatalf("unexpected apikey module: %+v", am) + if _, err := LoadConfig(tmp.Name()); err != nil { + t.Fatalf("expected config to load, got %v", err) } } -func TestLoadConfigAWS(t *testing.T) { +func TestLoadConfigTLSMissingKey(t *testing.T) { + cert := mustTempFile(t) yaml := `auth_modules: - awsmod: - type: aws - aws: - region: us-east-1 - role_arn: arn:aws:iam::123456789012:role/metrics + badtls: + type: userpass + userpass: + username: foo + password: bar + tls: + cert_file: ` + cert + ` ` - tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") - if err != nil { - t.Fatalf("temp file: %v", err) - } - if _, err := tmp.WriteString(yaml); err != nil { - t.Fatalf("write: %v", err) - } + tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") + tmp.WriteString(yaml) tmp.Close() - - cfg, err := LoadConfig(tmp.Name()) - if err != nil { - t.Fatalf("loadConfig: %v", err) - } - - awsMod := cfg.AuthModules["awsmod"] - if awsMod.Type != "aws" || awsMod.AWS == nil || awsMod.AWS.Region != "us-east-1" { - t.Fatalf("unexpected aws module: %+v", awsMod) + if _, err := LoadConfig(tmp.Name()); err == nil { + t.Fatalf("expected validation error 
for missing key_file") } } -func TestLoadConfigInvalidUserPass(t *testing.T) { - // missing userpass section for type=userpass - yaml := `auth_modules: +func TestLoadConfigValidationErrors(t *testing.T) { + badPath := "/path/does/not/exist" + key := mustTempFile(t) + cases := []struct { + name string + yaml string + }{ + { + "tlsMissingCert", + `auth_modules: bad: type: userpass -` - tmp, err := os.CreateTemp(t.TempDir(), "cfg-*.yml") - if err != nil { - t.Fatalf("temp file: %v", err) - } - if _, err := tmp.WriteString(yaml); err != nil { - t.Fatalf("write: %v", err) + userpass: {username: u, password: p} + tls: {key_file: ` + key + `}`, + }, + { + "tlsBadCAPath", + `auth_modules: + bad: + type: userpass + userpass: {username: u, password: p} + tls: {ca_file: ` + badPath + `}`, + }, + { + "awsNoRegion", + `auth_modules: + bad: + type: aws + aws: {}`, + }, + { + "unsupportedType", + `auth_modules: + bad: + type: foobar`, + }, } - tmp.Close() - - if _, err := LoadConfig(tmp.Name()); err == nil { - t.Fatalf("expected validation error for missing credentials, got nil") + for _, c := range cases { + tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") + tmp.WriteString(c.yaml) + tmp.Close() + if _, err := LoadConfig(tmp.Name()); err == nil { + t.Fatalf("%s: expected validation error, got nil", c.name) + } } } diff --git a/main.go b/main.go index 149006df..9b92d19a 100644 --- a/main.go +++ b/main.go @@ -317,7 +317,19 @@ func main() { } // Build a dedicated HTTP client for this probe request (reuse TLS opts, timeout, etc.). 
- tlsCfg := createTLSConfig(*esCA, *esClientCert, *esClientPrivateKey, *esInsecureSkipVerify) + pemCA := *esCA + pemCert := *esClientCert + pemKey := *esClientPrivateKey + insecure := *esInsecureSkipVerify + if am != nil && am.TLS != nil { + pemCA = am.TLS.CAFile + pemCert = am.TLS.CertFile + pemKey = am.TLS.KeyFile + if am.TLS.InsecureSkipVerify { + insecure = true + } + } + tlsCfg := createTLSConfig(pemCA, pemCert, pemKey, insecure) var transport http.RoundTripper = &http.Transport{ TLSClientConfig: tlsCfg, Proxy: http.ProxyFromEnvironment, From 5e32ad8830f75bcd6cb94c7ecad813bd84ebe09a Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 3 Aug 2025 10:10:02 -0500 Subject: [PATCH 14/25] Indices and Shards collectors now fetch cluster_name once from GET / when no clusterinfo retriever is attached, avoiding the previous "unknown_cluster" label. Signed-off-by: pincher95 --- collector/indices.go | 23 +++++++++++++++++++---- collector/shards.go | 38 +++++++++++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/collector/indices.go b/collector/indices.go index dd7e7274..81b6705a 100644 --- a/collector/indices.go +++ b/collector/indices.go @@ -620,13 +620,28 @@ func (i *Indices) fetchAndDecodeIndexStats(ctx context.Context) (indexStatsRespo return isr, nil } -// getCluserName returns the name of the cluster from the clusterinfo -// if the clusterinfo is nil, it returns "unknown_cluster" -// TODO(@sysadmind): this should be removed once we have a better way to handle clusterinfo +// getClusterName returns the cluster name. If no clusterinfo retriever is +// attached (e.g. /probe mode) it performs a lightweight call to the root +// endpoint once and caches the result. 
func (i *Indices) getClusterName() string { - if i.lastClusterInfo != nil { + if i.lastClusterInfo != nil && i.lastClusterInfo.ClusterName != "unknown_cluster" { return i.lastClusterInfo.ClusterName } + u := *i.url + u.Path = "/" + resp, err := i.client.Get(u.String()) + if err == nil { + defer resp.Body.Close() + if resp.StatusCode == http.StatusOK { + var root struct { + ClusterName string `json:"cluster_name"` + } + if err := json.NewDecoder(resp.Body).Decode(&root); err == nil && root.ClusterName != "" { + i.lastClusterInfo = &clusterinfo.Response{ClusterName: root.ClusterName} + return root.ClusterName + } + } + } return "unknown_cluster" } diff --git a/collector/shards.go b/collector/shards.go index 136ea671..6496be26 100644 --- a/collector/shards.go +++ b/collector/shards.go @@ -64,23 +64,50 @@ type nodeShardMetric struct { Labels labels } +// fetchClusterNameOnce performs a single request to the root endpoint to obtain the cluster name. +func fetchClusterNameOnce(s *Shards) string { + if s.lastClusterInfo != nil && s.lastClusterInfo.ClusterName != "unknown_cluster" { + return s.lastClusterInfo.ClusterName + } + u := *s.url + u.Path = "/" + resp, err := s.client.Get(u.String()) + if err == nil { + defer resp.Body.Close() + if resp.StatusCode == http.StatusOK { + var root struct { + ClusterName string `json:"cluster_name"` + } + if err := json.NewDecoder(resp.Body).Decode(&root); err == nil && root.ClusterName != "" { + s.lastClusterInfo = &clusterinfo.Response{ClusterName: root.ClusterName} + return root.ClusterName + } + } + } + return "unknown_cluster" +} + // NewShards defines Shards Prometheus metrics func NewShards(logger *slog.Logger, client *http.Client, url *url.URL) *Shards { + var shardPtr *Shards nodeLabels := labels{ keys: func(...string) []string { return []string{"node", "cluster"} }, - values: func(lastClusterinfo *clusterinfo.Response, s ...string) []string { + values: func(lastClusterinfo *clusterinfo.Response, base ...string) []string { if 
lastClusterinfo != nil { - return append(s, lastClusterinfo.ClusterName) + return append(base, lastClusterinfo.ClusterName) } - // this shouldn't happen, as the clusterinfo Retriever has a blocking - // Run method. It blocks until the first clusterinfo call has succeeded - return append(s, "unknown_cluster") + if shardPtr != nil { + return append(base, fetchClusterNameOnce(shardPtr)) + } + return append(base, "unknown_cluster") }, } shards := &Shards{ + // will assign later + logger: logger, client: client, url: url, @@ -123,6 +150,7 @@ func NewShards(logger *slog.Logger, client *http.Client, url *url.URL) *Shards { logger.Debug("exiting cluster info receive loop") }() + shardPtr = shards return shards } From d674f6d97b8374816d89fbb20fda214d331a69a8 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 3 Aug 2025 10:16:40 -0500 Subject: [PATCH 15/25] Removed the special-case logic that redirected /metrics?target= requests to /probe. Updated auth_modules.yml to include AWS SigV4 signing and mTLS support. Signed-off-by: pincher95 --- examples/auth_modules.yml | 44 +++++++++++++++++++++++++++++++++++---- main.go | 11 ++-------- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/examples/auth_modules.yml b/examples/auth_modules.yml index eeec23f4..93837aad 100644 --- a/examples/auth_modules.yml +++ b/examples/auth_modules.yml @@ -2,7 +2,9 @@ # Each module can be referenced with ?auth_module= in /probe requests. auth_modules: - # 1. Simple basic-auth over HTTPS ---------------------------------------- + ########################################################################### + # 1. Simple basic-auth over HTTPS # + ########################################################################### prod_basic: type: userpass userpass: @@ -10,16 +12,50 @@ auth_modules: password: s3cr3t # extra URL query parameters are appended to the target DSN options: - sslmode: disable # example option – becomes ?sslmode=disable + sslmode: disable # becomes ?sslmode=disable - # 2. 
Read-only account for staging cluster + ########################################################################### + # 2. Read-only account for staging cluster # + ########################################################################### staging_ro: type: userpass userpass: username: readonly password: changeme - # 3. API-Key authentication + ########################################################################### + # 3. Userpass + mTLS (per-cluster client certs) # + ########################################################################### + prod_tls: + type: userpass + userpass: + username: metrics + password: certsecret + tls: + ca_file: /etc/ssl/prod/ca.pem + cert_file: /etc/ssl/prod/client.pem + key_file: /etc/ssl/prod/client-key.pem + insecure_skip_verify: false + + ########################################################################### + # 4. API-Key authentication # + ########################################################################### prod_key: type: apikey apikey: BASE64-ENCODED-KEY== + # per-cluster TLS settings can still be supplied if needed + # tls: + # ca_file: /etc/ssl/prod/ca.pem + + ########################################################################### + # 5. AWS SigV4 signing # + ########################################################################### + aws_sigv4: + type: aws + aws: + region: us-east-1 + # role_arn is optional; uncomment to assume a role + # role_arn: arn:aws:iam::123456789012:role/metrics-reader + # optional custom TLS + tls: + insecure_skip_verify: true # e.g. 
self-signed certs on ALB diff --git a/main.go b/main.go index 9b92d19a..6ef5da09 100644 --- a/main.go +++ b/main.go @@ -256,15 +256,8 @@ func main() { } http.HandleFunc(*metricsPath, func(w http.ResponseWriter, r *http.Request) { - // If query has target param treat like probe endpoint - if r.URL.Query().Has("target") { - // reuse probe logic by delegating to /probe handler implementation - r.URL.Path = "/probe" // set path for consistency in logs - if probeHandler, _ := http.DefaultServeMux.Handler(&http.Request{URL: &url.URL{Path: "/probe"}}); probeHandler != nil { - probeHandler.ServeHTTP(w, r) - return - } - } + // /metrics endpoint is reserved for single-target mode only. + // For per-scrape overrides use the dedicated /probe endpoint. promhttp.Handler().ServeHTTP(w, r) }) From 9fa061076ca730e5b4cd1ebc3cc32ac4bc9f2cad Mon Sep 17 00:00:00 2001 From: pincher95 Date: Tue, 5 Aug 2025 12:56:38 -0500 Subject: [PATCH 16/25] Add license headers to all new files Signed-off-by: pincher95 --- config/config.go | 13 +++++++++++++ config/config_test.go | 13 +++++++++++++ probe.go | 13 +++++++++++++ probe_test.go | 13 +++++++++++++ 4 files changed, 52 insertions(+) diff --git a/config/config.go b/config/config.go index c50a9726..7eae4066 100644 --- a/config/config.go +++ b/config/config.go @@ -1,3 +1,16 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package config import ( diff --git a/config/config_test.go b/config/config_test.go index 8d23598b..9d09edcb 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1,3 +1,16 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package config import ( diff --git a/probe.go b/probe.go index cc4e19cc..d303eaec 100644 --- a/probe.go +++ b/probe.go @@ -1,3 +1,16 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package main import ( diff --git a/probe_test.go b/probe_test.go index 8ada7ea7..50b22e22 100644 --- a/probe_test.go +++ b/probe_test.go @@ -1,3 +1,16 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package main import ( From d5f818bda84c1ec1db2792e157752d3fa3b9d85d Mon Sep 17 00:00:00 2001 From: pincher95 Date: Tue, 5 Aug 2025 14:07:35 -0500 Subject: [PATCH 17/25] Fixes for relative paths in multi-target mode Signed-off-by: pincher95 --- collector/indices.go | 3 ++- collector/shards.go | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/collector/indices.go b/collector/indices.go index 81b6705a..fdad33f5 100644 --- a/collector/indices.go +++ b/collector/indices.go @@ -19,6 +19,7 @@ import ( "log/slog" "net/http" "net/url" + "path" "sort" "strconv" @@ -628,7 +629,7 @@ func (i *Indices) getClusterName() string { return i.lastClusterInfo.ClusterName } u := *i.url - u.Path = "/" + u.Path = path.Join(u.Path, "/") resp, err := i.client.Get(u.String()) if err == nil { defer resp.Body.Close() diff --git a/collector/shards.go b/collector/shards.go index 6496be26..351680ca 100644 --- a/collector/shards.go +++ b/collector/shards.go @@ -70,7 +70,7 @@ func fetchClusterNameOnce(s *Shards) string { return s.lastClusterInfo.ClusterName } u := *s.url - u.Path = "/" + u.Path = path.Join(u.Path, "/") resp, err := s.client.Get(u.String()) if err == nil { defer resp.Body.Close() From 750e0da70d72a95859c6bff1fd12417e3d83aa6c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Aug 2025 22:05:43 -0400 Subject: [PATCH 18/25] Bump github.com/prometheus/client_golang from 1.22.0 to 1.23.0 (#1065) Bumps [github.com/prometheus/client_golang](https://github.com/prometheus/client_golang) from 
1.22.0 to 1.23.0. - [Release notes](https://github.com/prometheus/client_golang/releases) - [Changelog](https://github.com/prometheus/client_golang/blob/main/CHANGELOG.md) - [Commits](https://github.com/prometheus/client_golang/compare/v1.22.0...v1.23.0) --- updated-dependencies: - dependency-name: github.com/prometheus/client_golang dependency-version: 1.23.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Signed-off-by: pincher95 --- go.mod | 4 ++-- go.sum | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index faf92040..057c72f6 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/aws/aws-sdk-go-v2/service/sts v1.34.0 github.com/blang/semver/v4 v4.0.0 github.com/imdario/mergo v0.3.13 - github.com/prometheus/client_golang v1.22.0 + github.com/prometheus/client_golang v1.23.0 github.com/prometheus/common v0.65.0 github.com/prometheus/exporter-toolkit v0.14.0 go.yaml.in/yaml/v3 v3.0.4 @@ -37,7 +37,7 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/procfs v0.15.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect github.com/xhit/go-str2duration/v2 v2.1.0 // indirect golang.org/x/crypto v0.38.0 // indirect golang.org/x/net v0.40.0 // indirect diff --git a/go.sum b/go.sum index 6599a6bf..0a857670 100644 --- a/go.sum +++ b/go.sum @@ -64,16 +64,16 @@ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+ github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= -github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc= +github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/prometheus/exporter-toolkit v0.14.0 h1:NMlswfibpcZZ+H0sZBiTjrA3/aBFHkNZqE+iCj5EmRg= github.com/prometheus/exporter-toolkit v0.14.0/go.mod h1:Gu5LnVvt7Nr/oqTBUC23WILZepW0nffNo10XdhQcwWA= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -84,6 +84,8 @@ github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8 github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod 
h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= From 3ea24f0bbfddf08c7dc255d471153a24dc303dbe Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 10 Aug 2025 09:59:26 -0400 Subject: [PATCH 19/25] Add target schema validation, http/https only Add tls auth type support in multi-target mode Update README.md, examples/auth_modules.yml, tests Signed-off-by: pincher95 --- README.md | 18 +++- config/config.go | 46 +++++++-- config/config_test.go | 190 ++++++++++++++++++++++++-------------- examples/auth_modules.yml | 40 ++++---- main.go | 49 +++++++--- probe.go | 17 +++- probe_test.go | 27 ++++++ 7 files changed, 264 insertions(+), 123 deletions(-) diff --git a/README.md b/README.md index 2ffd330c..08bd9851 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ elasticsearch_exporter --help | Argument | Introduced in Version | Description | Default | | ----------------------- | --------------------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- | | collector.clustersettings| 1.6.0 | If true, query stats for cluster settings (As of v1.6.0, this flag has replaced "es.cluster_settings"). | false | -| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to **when running in single-target mode**. 
Leave empty (the default) when you want to run the exporter only as a multi-target `/probe` endpoint. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | ״״ | +| es.uri | 1.0.2 | Address (host and port) of the Elasticsearch node we should connect to **when running in single-target mode**. Leave empty (the default) when you want to run the exporter only as a multi-target `/probe` endpoint. When basic auth is needed, specify as: `://:@:`. E.G., `http://admin:pass@localhost:9200`. Special characters in the user credentials need to be URL-encoded. | "" | | es.all | 1.0.2 | If true, query stats for all nodes in the cluster, rather than just the node we connect to. | false | | es.indices | 1.0.2 | If true, query stats for all indices in the cluster. | false | | es.indices_settings | 1.0.4rc1 | If true, query settings stats for all indices in the cluster. | false | @@ -120,10 +120,12 @@ From v2.X the exporter exposes `/probe` allowing one running instance to scrape Supported `auth_module` types: -| type | YAML fields | Injected into request | -| ---------- | ----------------------------------------------------------------- | ------------------------------------------------------------------ | -| `userpass` | `userpass.username`, `userpass.password`, optional `options:` map | Sets HTTP basic-auth header, appends `options` as query parameters | -| `apikey` | `apikey:` Base64 API-Key string, optional `options:` map | Adds `Authorization: ApiKey …` header, appends `options` | +| type | YAML fields | Injected into request | +| ---------- | ----------------------------------------------------------------- | ------------------------------------------------------------------------------------- | +| `userpass` | `userpass.username`, `userpass.password`, optional `options:` map | Sets HTTP basic-auth header, appends `options` as query parameters | +| `apikey` | `apikey:` 
Base64 API-Key string, optional `options:` map | Adds `Authorization: ApiKey …` header, appends `options` | +| `aws` | `aws.region`, optional `aws.role_arn`, optional `options:` map | Uses AWS SigV4 signing transport for HTTP(S) requests, appends `options` | +| `tls` | `tls.ca_file`, `tls.cert_file`, `tls.key_file` | Uses client certificate authentication via TLS; cannot be mixed with other auth types | Example config: @@ -167,6 +169,12 @@ Prometheus scrape_config: replacement: exporter:9114 ``` +Notes: +- `/metrics` serves a single, process-wide registry and is intended for single-target mode. +- `/probe` creates a fresh registry per scrape for the given `target` allowing multi-target scraping. +- Any `options:` under an auth module will be appended as URL query parameters to the target URL. +- The `tls` auth module (client certificate authentication) is intended for self‑managed Elasticsearch/OpenSearch deployments. Amazon OpenSearch Service typically authenticates at the domain edge with IAM/SigV4 and does not support client certificate authentication; use the `aws` auth module instead when scraping Amazon OpenSearch Service domains. + ### Metrics | Name | Type | Cardinality | Help | diff --git a/config/config.go b/config/config.go index 7eae4066..d2b9cb74 100644 --- a/config/config.go +++ b/config/config.go @@ -57,6 +57,7 @@ type UserPassConfig struct { // validate ensures every auth module has the required fields according to its type. 
func (c *Config) validate() error { for name, am := range c.AuthModules { + // Validate fields based on auth type switch strings.ToLower(am.Type) { case "userpass": if am.UserPass == nil || am.UserPass.Username == "" || am.UserPass.Password == "" { @@ -70,21 +71,52 @@ func (c *Config) validate() error { if am.AWS == nil || am.AWS.Region == "" { return fmt.Errorf("auth_module %s type aws requires region", name) } + case "tls": + // TLS auth type means client certificate authentication only (no other auth) + if am.TLS == nil { + return fmt.Errorf("auth_module %s type tls requires tls configuration section", name) + } + if am.TLS.CertFile == "" || am.TLS.KeyFile == "" { + return fmt.Errorf("auth_module %s type tls requires cert_file and key_file for client certificate authentication", name) + } + // Validate that other auth fields are not set when using TLS auth type + if am.UserPass != nil { + return fmt.Errorf("auth_module %s type tls cannot have userpass configuration", name) + } + if am.APIKey != "" { + return fmt.Errorf("auth_module %s type tls cannot have apikey", name) + } + if am.AWS != nil { + return fmt.Errorf("auth_module %s type tls cannot have aws configuration", name) + } default: return fmt.Errorf("auth_module %s has unsupported type %s", name, am.Type) } - // TLS validation (optional but if specified must be coherent) + // Validate TLS configuration (optional for all auth types, provides transport security) if am.TLS != nil { - if (am.TLS.CertFile == "") != (am.TLS.KeyFile == "") { - return fmt.Errorf("auth_module %s tls requires both cert_file and key_file or neither", name) + // For cert-based auth (type: tls), cert and key are required + // For other auth types, TLS config is optional and used for transport security + if strings.ToLower(am.Type) == "tls" { + // Already validated above that cert and key are present + } else { + // For non-TLS auth types, if cert/key are provided, both must be present + if (am.TLS.CertFile != "") != (am.TLS.KeyFile != 
"") { + return fmt.Errorf("auth_module %s: if providing client certificate, both cert_file and key_file must be specified", name) + } } - for _, p := range []string{am.TLS.CAFile, am.TLS.CertFile, am.TLS.KeyFile} { - if p == "" { + + // Validate file accessibility + for fileType, path := range map[string]string{ + "ca_file": am.TLS.CAFile, + "cert_file": am.TLS.CertFile, + "key_file": am.TLS.KeyFile, + } { + if path == "" { continue } - if _, err := os.Stat(p); err != nil { - return fmt.Errorf("auth_module %s tls file %s not accessible: %w", name, p, err) + if _, err := os.Stat(path); err != nil { + return fmt.Errorf("auth_module %s: %s '%s' not accessible: %w", name, fileType, path, err) } } } diff --git a/config/config_test.go b/config/config_test.go index 9d09edcb..985e32fa 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1,16 +1,3 @@ -// Copyright The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- package config import ( @@ -27,91 +14,152 @@ func mustTempFile(t *testing.T) string { return f.Name() } -func TestLoadConfigTLSValid(t *testing.T) { +// ---------------------------- Positive cases ---------------------------- +func TestLoadConfigPositiveVariants(t *testing.T) { ca := mustTempFile(t) cert := mustTempFile(t) key := mustTempFile(t) - yaml := `auth_modules: - secure: + + positive := []struct { + name string + yaml string + }{{ + "userpass", + `auth_modules: + basic: type: userpass userpass: - username: foo - password: bar + username: u + password: p`, + }, { + "userpass-with-tls", + `auth_modules: + basic: + type: userpass + userpass: + username: u + password: p + tls: + ca_file: ` + ca + ` + insecure_skip_verify: true`, + }, { + "apikey", + `auth_modules: + key: + type: apikey + apikey: ZXhhbXBsZQ==`, + }, { + "apikey-with-tls", + `auth_modules: + key: + type: apikey + apikey: ZXhhbXBsZQ== tls: ca_file: ` + ca + ` cert_file: ` + cert + ` - key_file: ` + key + ` -` - tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") - tmp.WriteString(yaml) - tmp.Close() - if _, err := LoadConfig(tmp.Name()); err != nil { - t.Fatalf("expected config to load, got %v", err) - } -} - -func TestLoadConfigTLSMissingKey(t *testing.T) { - cert := mustTempFile(t) - yaml := `auth_modules: - badtls: - type: userpass - userpass: - username: foo - password: bar + key_file: ` + key + ``, + }, { + "aws-with-tls", + `auth_modules: + awsmod: + type: aws + aws: + region: us-east-1 tls: + insecure_skip_verify: true`, + }, { + "tls-only", + `auth_modules: + pki: + type: tls + tls: + ca_file: ` + ca + ` cert_file: ` + cert + ` -` - tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") - tmp.WriteString(yaml) - tmp.Close() - if _, err := LoadConfig(tmp.Name()); err == nil { - t.Fatalf("expected validation error for missing key_file") + key_file: ` + key + ``, + }} + + for _, c := range positive { + tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") + tmp.WriteString(c.yaml) + tmp.Close() + if 
_, err := LoadConfig(tmp.Name()); err != nil { + t.Fatalf("%s: expected success, got %v", c.name, err) + } } } -func TestLoadConfigValidationErrors(t *testing.T) { - badPath := "/path/does/not/exist" +// ---------------------------- Negative cases ---------------------------- +func TestLoadConfigNegativeVariants(t *testing.T) { + cert := mustTempFile(t) key := mustTempFile(t) - cases := []struct { + + negative := []struct { name string yaml string - }{ - { - "tlsMissingCert", - `auth_modules: + }{{ + "userpassMissingPassword", + `auth_modules: bad: type: userpass - userpass: {username: u, password: p} + userpass: {username: u}`, + }, { + "tlsMissingCert", + `auth_modules: + bad: + type: tls tls: {key_file: ` + key + `}`, - }, - { - "tlsBadCAPath", - `auth_modules: + }, { + "tlsMissingKey", + `auth_modules: bad: - type: userpass - userpass: {username: u, password: p} - tls: {ca_file: ` + badPath + `}`, - }, - { - "awsNoRegion", - `auth_modules: + type: tls + tls: {cert_file: ` + cert + `}`, + }, { + "tlsMissingConfig", + `auth_modules: bad: - type: aws - aws: {}`, - }, - { - "unsupportedType", - `auth_modules: + type: tls`, + }, { + "tlsWithUserpass", + `auth_modules: + bad: + type: tls + tls: {cert_file: ` + cert + `, key_file: ` + key + `} + userpass: {username: u, password: p}`, + }, { + "tlsWithAPIKey", + `auth_modules: + bad: + type: tls + tls: {cert_file: ` + cert + `, key_file: ` + key + `} + apikey: ZXhhbXBsZQ==`, + }, { + "tlsWithAWS", + `auth_modules: + bad: + type: tls + tls: {cert_file: ` + cert + `, key_file: ` + key + `} + aws: {region: us-east-1}`, + }, { + "tlsIncompleteCert", + `auth_modules: + bad: + type: apikey + apikey: ZXhhbXBsZQ== + tls: {cert_file: ` + cert + `}`, + }, { + "unsupportedType", + `auth_modules: bad: type: foobar`, - }, - } - for _, c := range cases { + }} + + for _, c := range negative { tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") tmp.WriteString(c.yaml) tmp.Close() if _, err := LoadConfig(tmp.Name()); err == nil { - 
t.Fatalf("%s: expected validation error, got nil", c.name) + t.Fatalf("%s: expected validation error, got none", c.name) } } } diff --git a/examples/auth_modules.yml b/examples/auth_modules.yml index 93837aad..7603aa8c 100644 --- a/examples/auth_modules.yml +++ b/examples/auth_modules.yml @@ -24,38 +24,32 @@ auth_modules: password: changeme ########################################################################### - # 3. Userpass + mTLS (per-cluster client certs) # - ########################################################################### - prod_tls: - type: userpass - userpass: - username: metrics - password: certsecret - tls: - ca_file: /etc/ssl/prod/ca.pem - cert_file: /etc/ssl/prod/client.pem - key_file: /etc/ssl/prod/client-key.pem - insecure_skip_verify: false - - ########################################################################### - # 4. API-Key authentication # + # 3. API-Key authentication # ########################################################################### prod_key: type: apikey apikey: BASE64-ENCODED-KEY== - # per-cluster TLS settings can still be supplied if needed - # tls: - # ca_file: /etc/ssl/prod/ca.pem ########################################################################### - # 5. AWS SigV4 signing # + # 4. AWS SigV4 signing with optional TLS settings # ########################################################################### aws_sigv4: type: aws aws: region: us-east-1 - # role_arn is optional; uncomment to assume a role - # role_arn: arn:aws:iam::123456789012:role/metrics-reader - # optional custom TLS + # role_arn is optional + # Optional TLS configuration for transport security + tls: + ca_file: /etc/ssl/ca.pem + insecure_skip_verify: false + + ########################################################################### + # 5. 
Client certificate authentication only (no username/password) # + ########################################################################### + pki_mtls: + type: tls # This auth type uses ONLY client certificates for authentication tls: - insecure_skip_verify: true # e.g. self-signed certs on ALB + ca_file: /etc/ssl/pki/ca.pem # Optional: CA for server verification + cert_file: /etc/ssl/pki/client.pem # Required: Client certificate for auth + key_file: /etc/ssl/pki/client-key.pem # Required: Client private key for auth + insecure_skip_verify: false # Optional: Skip server cert validation diff --git a/main.go b/main.go index 6ef5da09..c638c987 100644 --- a/main.go +++ b/main.go @@ -314,10 +314,20 @@ func main() { pemCert := *esClientCert pemKey := *esClientPrivateKey insecure := *esInsecureSkipVerify + + // Apply TLS configuration from auth module if provided (for transport security) + // This matches single-target behavior where TLS settings are always applied if am != nil && am.TLS != nil { - pemCA = am.TLS.CAFile - pemCert = am.TLS.CertFile - pemKey = am.TLS.KeyFile + // Override with module-specific TLS settings + if am.TLS.CAFile != "" { + pemCA = am.TLS.CAFile + } + if am.TLS.CertFile != "" { + pemCert = am.TLS.CertFile + } + if am.TLS.KeyFile != "" { + pemKey = am.TLS.KeyFile + } if am.TLS.InsecureSkipVerify { insecure = true } @@ -330,18 +340,31 @@ func main() { // inject authentication based on auth_module type if am != nil { - if strings.EqualFold(am.Type, "apikey") && am.APIKey != "" { - transport = &transportWithAPIKey{ - underlyingTransport: transport, - apiKey: am.APIKey, + switch strings.ToLower(am.Type) { + case "apikey": + if am.APIKey != "" { + transport = &transportWithAPIKey{ + underlyingTransport: transport, + apiKey: am.APIKey, + } } - } else if strings.EqualFold(am.Type, "aws") && am.AWS != nil { - var err error - transport, err = roundtripper.NewAWSSigningTransport(transport, am.AWS.Region, am.AWS.RoleARN, logger) - if err != nil { - 
http.Error(w, "failed to create AWS signing transport", http.StatusInternalServerError) - return + case "aws": + if am.AWS != nil { + if am.AWS.Region == "" { + http.Error(w, "aws.region is required for aws auth_module", http.StatusBadRequest) + return + } + var err error + transport, err = roundtripper.NewAWSSigningTransport(transport, am.AWS.Region, am.AWS.RoleARN, logger) + if err != nil { + http.Error(w, "failed to create AWS signing transport", http.StatusInternalServerError) + return + } } + case "tls": + // No additional auth wrapper needed - client certificates in TLS config handle authentication + case "userpass": + // Already handled above by setting targetURL.User } } probeClient := &http.Client{ diff --git a/probe.go b/probe.go index d303eaec..78a0f65b 100644 --- a/probe.go +++ b/probe.go @@ -42,7 +42,11 @@ func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.Auth target = "http://" + target } - if _, err := url.Parse(target); err != nil { + u, err := url.Parse(target) + if err != nil { + return "", nil, errInvalidTarget + } + if u.Scheme != "http" && u.Scheme != "https" { return "", nil, errInvalidTarget } @@ -63,11 +67,16 @@ func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.Auth case "apikey": return target, &am, nil case "aws": - if am.AWS != nil && am.AWS.Region != "" { - return target, &am, nil + if am.AWS == nil || am.AWS.Region == "" { + return "", nil, errUnsupportedModule } - return "", nil, errUnsupportedModule + return target, &am, nil + case "tls": + // TLS auth type is valid; detailed TLS validation is performed during config load. 
+ return target, &am, nil default: return "", nil, errUnsupportedModule } } + +// region inference removed by design; aws.region must be supplied in config diff --git a/probe_test.go b/probe_test.go index 50b22e22..a2cc3bbf 100644 --- a/probe_test.go +++ b/probe_test.go @@ -36,6 +36,14 @@ func TestValidateProbeParams(t *testing.T) { t.Fatalf("expected invalid target error") } + // invalid scheme + vals = url.Values{} + vals.Set("target", "ftp://example.com") + _, _, err = validateProbeParams(cfg, vals) + if err == nil { + t.Fatalf("expected invalid target error for unsupported scheme") + } + // unknown module vals = url.Values{} vals.Set("target", "http://localhost:9200") @@ -96,4 +104,23 @@ func TestValidateProbeParams(t *testing.T) { if am.AWS == nil || am.AWS.Region != "us-east-1" { t.Fatalf("unexpected aws config: %+v", am.AWS) } + + // invalid path (aws with empty region - rejected at config load; simulate here by passing nil cfg lookup) + // No additional test needed as config.LoadConfig enforces region. 
+ + // good path (tls) + cfg.AuthModules["mtls"] = config.AuthModule{ + Type: "tls", + TLS: &config.TLSConfig{CAFile: "/dev/null", CertFile: "/dev/null", KeyFile: "/dev/null"}, + } + vals = url.Values{} + vals.Set("target", "http://localhost:9200") + vals.Set("auth_module", "mtls") + _, am, err = validateProbeParams(cfg, vals) + if err != nil { + t.Fatalf("expected success for tls module, got err=%v", err) + } + if am == nil || am.Type != "tls" { + t.Fatalf("expected tls module, got %+v", am) + } } From 03d1a70eadb90b1e6efd564cf0b870c08e5b5dba Mon Sep 17 00:00:00 2001 From: pincher95 Date: Sun, 10 Aug 2025 10:08:58 -0400 Subject: [PATCH 20/25] Cleanup Signed-off-by: pincher95 --- probe.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/probe.go b/probe.go index 78a0f65b..62ca31cf 100644 --- a/probe.go +++ b/probe.go @@ -78,5 +78,3 @@ func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.Auth return "", nil, errUnsupportedModule } } - -// region inference removed by design; aws.region must be supplied in config From b0965369df491d4364b8156f6ad09995ba2d3719 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Mon, 11 Aug 2025 08:09:08 -0400 Subject: [PATCH 21/25] Fix tls auth type validation Signed-off-by: pincher95 --- config/config.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/config/config.go b/config/config.go index d2b9cb74..c97e8212 100644 --- a/config/config.go +++ b/config/config.go @@ -97,9 +97,7 @@ func (c *Config) validate() error { if am.TLS != nil { // For cert-based auth (type: tls), cert and key are required // For other auth types, TLS config is optional and used for transport security - if strings.ToLower(am.Type) == "tls" { - // Already validated above that cert and key are present - } else { + if strings.ToLower(am.Type) != "tls" { // For non-TLS auth types, if cert/key are provided, both must be present if (am.TLS.CertFile != "") != (am.TLS.KeyFile != "") { return fmt.Errorf("auth_module %s: if 
providing client certificate, both cert_file and key_file must be specified", name) From 236586c09c83c9152ba12d0512918acab1b87fc3 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Mon, 11 Aug 2025 08:11:49 -0400 Subject: [PATCH 22/25] Remove aws.region validation Signed-off-by: pincher95 --- config/config.go | 2 +- main.go | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/config/config.go b/config/config.go index c97e8212..07d3c479 100644 --- a/config/config.go +++ b/config/config.go @@ -68,7 +68,7 @@ func (c *Config) validate() error { return fmt.Errorf("auth_module %s type apikey requires apikey", name) } case "aws": - if am.AWS == nil || am.AWS.Region == "" { + if am.AWS == nil { return fmt.Errorf("auth_module %s type aws requires region", name) } case "tls": diff --git a/main.go b/main.go index c638c987..dfd996a7 100644 --- a/main.go +++ b/main.go @@ -350,10 +350,6 @@ func main() { } case "aws": if am.AWS != nil { - if am.AWS.Region == "" { - http.Error(w, "aws.region is required for aws auth_module", http.StatusBadRequest) - return - } var err error transport, err = roundtripper.NewAWSSigningTransport(transport, am.AWS.Region, am.AWS.RoleARN, logger) if err != nil { From bd1c0a809f4f55f38399ec42c1115cf009f4dee6 Mon Sep 17 00:00:00 2001 From: pincher95 Date: Mon, 11 Aug 2025 08:29:35 -0400 Subject: [PATCH 23/25] Add temp file cleanup in config_test.go Signed-off-by: pincher95 --- config/config_test.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/config/config_test.go b/config/config_test.go index 985e32fa..9a9f70d7 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -11,7 +11,10 @@ func mustTempFile(t *testing.T) string { t.Fatalf("temp file: %v", err) } f.Close() - return f.Name() + // Ensure temp file is removed even if created outside of test's TempDir semantics change + path := f.Name() + t.Cleanup(func() { _ = os.Remove(path) }) + return path } // ---------------------------- Positive 
cases ---------------------------- @@ -80,8 +83,9 @@ func TestLoadConfigPositiveVariants(t *testing.T) { for _, c := range positive { tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") - tmp.WriteString(c.yaml) - tmp.Close() + _, _ = tmp.WriteString(c.yaml) + _ = tmp.Close() + t.Cleanup(func() { _ = os.Remove(tmp.Name()) }) if _, err := LoadConfig(tmp.Name()); err != nil { t.Fatalf("%s: expected success, got %v", c.name, err) } @@ -156,8 +160,9 @@ func TestLoadConfigNegativeVariants(t *testing.T) { for _, c := range negative { tmp, _ := os.CreateTemp(t.TempDir(), "cfg-*.yml") - tmp.WriteString(c.yaml) - tmp.Close() + _, _ = tmp.WriteString(c.yaml) + _ = tmp.Close() + t.Cleanup(func() { _ = os.Remove(tmp.Name()) }) if _, err := LoadConfig(tmp.Name()); err == nil { t.Fatalf("%s: expected validation error, got none", c.name) } From c7ca445f241238353240948d838898c155d1529a Mon Sep 17 00:00:00 2001 From: pincher95 Date: Mon, 11 Aug 2025 09:29:56 -0400 Subject: [PATCH 24/25] Add copyright header to config_test.go Signed-off-by: pincher95 --- config/config_test.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/config/config_test.go b/config/config_test.go index 9a9f70d7..f5147db6 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1,3 +1,16 @@ +// Copyright The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package config import ( From 438cc76ef8bd873fb26dcf5c41c80e5c0064cbad Mon Sep 17 00:00:00 2001 From: pincher95 Date: Wed, 13 Aug 2025 13:43:08 -0400 Subject: [PATCH 25/25] Add version metric to the per-probe registry Update roundtripper.go to use region from config or environment resolver if not provided in config file (AWS_REGION) Update probe.go to accept module even if region omitted; environment resolver can provide it Update config.go to use region as optional field Update main.go to use region from config or environment resolver if not provided in config file (AWS_REGION) and update roundtripper.go to use region from config or environment resolver if not provided in config file (AWS_REGION) Signed-off-by: pincher95 --- config/config.go | 6 ++---- main.go | 19 ++++++++++++------- pkg/roundtripper/roundtripper.go | 7 ++++++- probe.go | 4 +--- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/config/config.go b/config/config.go index 07d3c479..2137cc5e 100644 --- a/config/config.go +++ b/config/config.go @@ -37,7 +37,7 @@ type AuthModule struct { // AWSConfig contains settings for SigV4 authentication. type AWSConfig struct { - Region string `yaml:"region"` + Region string `yaml:"region,omitempty"` RoleARN string `yaml:"role_arn,omitempty"` } @@ -68,9 +68,7 @@ func (c *Config) validate() error { return fmt.Errorf("auth_module %s type apikey requires apikey", name) } case "aws": - if am.AWS == nil { - return fmt.Errorf("auth_module %s type aws requires region", name) - } + // No strict validation: region can come from environment/defaults; role_arn is optional. 
case "tls": // TLS auth type means client certificate authentication only (no other auth) if am.TLS == nil { diff --git a/main.go b/main.go index dfd996a7..c739c28d 100644 --- a/main.go +++ b/main.go @@ -349,13 +349,15 @@ func main() { } } case "aws": - if am.AWS != nil { - var err error - transport, err = roundtripper.NewAWSSigningTransport(transport, am.AWS.Region, am.AWS.RoleARN, logger) - if err != nil { - http.Error(w, "failed to create AWS signing transport", http.StatusInternalServerError) - return - } + // Guard against a nil AWS section: config validation no longer requires it, + // and probe.go accepts aws modules without a region (env/IMDS can resolve it). + var region, roleARN string + if am.AWS != nil { + region, roleARN = am.AWS.Region, am.AWS.RoleARN + } + var err error + transport, err = roundtripper.NewAWSSigningTransport(transport, region, roleARN, logger) + if err != nil { + http.Error(w, "failed to create AWS signing transport", http.StatusInternalServerError) + return } case "tls": // No additional auth wrapper needed - client certificates in TLS config handle authentication @@ -370,6 +374,9 @@ func main() { reg := prometheus.NewRegistry() + // version metric + reg.MustRegister(versioncollector.NewCollector(name)) + // Core exporter collector exp, err := collector.NewElasticsearchCollector( logger, diff --git a/pkg/roundtripper/roundtripper.go b/pkg/roundtripper/roundtripper.go index 4c4dd026..8f1cfd3f 100644 --- a/pkg/roundtripper/roundtripper.go +++ b/pkg/roundtripper/roundtripper.go @@ -42,7 +42,12 @@ type AWSSigningTransport struct { } func NewAWSSigningTransport(transport http.RoundTripper, region string, roleArn string, log *slog.Logger) (*AWSSigningTransport, error) { - cfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion(region)) + // Only set region explicitly when provided; otherwise allow env/IMDS resolution + var opts []func(*config.LoadOptions) error + if region != "" { + opts = append(opts, config.WithRegion(region)) + } + cfg, err := config.LoadDefaultConfig(context.Background(), opts...) 
if err != nil { log.Error("failed to load aws default config", "err", err) return nil, err diff --git a/probe.go b/probe.go index 62ca31cf..2b999604 100644 --- a/probe.go +++ b/probe.go @@ -67,9 +67,7 @@ func validateProbeParams(cfg *config.Config, q url.Values) (string, *config.Auth case "apikey": return target, &am, nil case "aws": - if am.AWS == nil || am.AWS.Region == "" { - return "", nil, errUnsupportedModule - } + // Accept module even if region omitted; environment resolver can provide it. return target, &am, nil case "tls": // TLS auth type is valid; detailed TLS validation is performed during config load.