Skip to content

Rework of ElasticsearchWriter performance data #10511

@martialblog

Description

@martialblog

Hi,

I have some suggestions for improving the ElasticsearchWriter.

Currently the ElasticsearchWriter creates a new field for every metric in a given check command.
This leads to a vast number of fields in the Elasticsearch/OpenSearch indices.

Here are some examples:

{
  "_index": "icinga",
  "_id": "UQdo5ZcByMLqDokf4UFI",
  "_source": {
    "@timestamp": "2025-07-07T15:02:20.573+0000",
    "check_command": "load",
    "check_result.check_source": "homestead",
    "check_result.command": [
      "/usr/lib/nagios/plugins/check_load",
      "-c",
      "10,6,4",
      "-w",
      "5,4,3"
    ],
    "check_result.execution_end": "2025-07-07T15:02:20.573+0000",
    "check_result.execution_start": "2025-07-07T15:02:20.570+0000",
    "check_result.execution_time": 0.0027740001678466797,
    "check_result.exit_status": 0,
    "check_result.latency": 0.0003490447998046875,
    "check_result.output": "OK - load average: 0.24, 0.25, 0.26",
    "check_result.perfdata.load1.crit": 10,
    "check_result.perfdata.load1.min": 0,
    "check_result.perfdata.load1.value": 0.24,
    "check_result.perfdata.load1.warn": 5,
    "check_result.perfdata.load15.crit": 4,
    "check_result.perfdata.load15.min": 0,
    "check_result.perfdata.load15.value": 0.26,
    "check_result.perfdata.load15.warn": 3,
    "check_result.perfdata.load5.crit": 6,
    "check_result.perfdata.load5.min": 0,
    "check_result.perfdata.load5.value": 0.25,
    "check_result.perfdata.load5.warn": 4,
    "check_result.schedule_end": "2025-07-07T15:02:20.573+0000",
    "check_result.schedule_start": "2025-07-07T15:02:20.569+0000",
    "check_result.state": 0,
    "check_result.vars_after": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "check_result.vars_before": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "current_check_attempt": 1,
    "host": "homestead",
    "last_hard_state": 0,
    "last_state": 0,
    "max_check_attempts": 5,
    "reachable": true,
    "service": "load",
    "state": 0,
    "state_type": 1,
    "timestamp": "2025-07-07T15:02:20.573+0000",
    "type": "icinga2.event.checkresult"
  }
},
{
  "_index": "icinga",
  "_id": "-j6O6pcBNgAT8LEo7v53",
  "_score": 1,
  "_source": {
    "@timestamp": "2025-07-08T15:01:56.548+0000",
    "check_command": "ping4",
    "check_result.check_source": "homestead",
    "check_result.command": [
      "/usr/lib/nagios/plugins/check_ping",
      "-4",
      "-H",
      "127.0.0.1",
      "-c",
      "200,15%",
      "-w",
      "100,5%"
    ],
    "check_result.execution_end": "2025-07-08T15:01:56.548+0000",
    "check_result.execution_start": "2025-07-08T15:01:52.466+0000",
    "check_result.execution_time": 4.081341981887817,
    "check_result.exit_status": 0,
    "check_result.latency": 0.0006992816925048828,
    "check_result.output": "PING OK - Packet loss = 0%, RTA = 0.07 ms",
    "check_result.perfdata.pl.crit": 15,
    "check_result.perfdata.pl.min": 0,
    "check_result.perfdata.pl.unit": "percent",
    "check_result.perfdata.pl.value": 0,
    "check_result.perfdata.pl.warn": 5,
    "check_result.perfdata.rta.crit": 0.2,
    "check_result.perfdata.rta.min": 0,
    "check_result.perfdata.rta.unit": "seconds",
    "check_result.perfdata.rta.value": 7.4e-05,
    "check_result.perfdata.rta.warn": 0.1,
    "check_result.schedule_end": "2025-07-08T15:01:56.548+0000",
    "check_result.schedule_start": "2025-07-08T15:01:52.466+0000",
    "check_result.state": 0,
    "check_result.vars_after": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "check_result.vars_before": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "current_check_attempt": 1,
    "host": "Demo1",
    "last_hard_state": 0,
    "last_state": 0,
    "max_check_attempts": 5,
    "reachable": true,
    "service": "ping4",
    "state": 0,
    "state_type": 1,
    "timestamp": "2025-07-08T15:01:56.548+0000",
    "type": "icinga2.event.checkresult"
  }
}

This means the ElasticsearchWriter is practically unusable in large Icinga2 setups when enable_send_perfdata is enabled, because a large number of fields can lead to performance degradation and memory issues in clusters (see https://www.elastic.co/guide/en/elasticsearch/reference/8.18/mapping-settings-limit.html).

Elasticsearch/OpenSearch only allow 1000 fields per index by default, to safeguard clusters against indices that create too many fields.

As mentioned, the Icinga2 ElasticsearchWriter creates a new field for each metric in a check plugin. Each new plugin, each new metric adds new fields to the index. See also: #6805

A possible solution

One solution could be to redesign the output of the ElasticsearchWriter to use an array of nested objects for the performance data.
These are supported in both Elasticsearch and OpenSearch:

The nested type is a specialised version of the object data type that allows arrays of objects to be indexed in a way that they can be queried independently of each other.

Example from the docs:

{
  "patients": [
    {"name" : "John Doe", "age" : 56, "smoker" : true},
    {"name" : "Mary Major", "age" : 85, "smoker" : false}
  ]
}

There are some pitfalls to look out for. By default, arrays of objects are flattened during indexing, meaning the ElasticsearchWriter needs to make sure the nested field type is used.

An example of what this would look like in Icinga2:

{
  "_index": "icinga",
  "_id": "UQdo5ZcByMLqDokf4UFI",
  "_score": 1,
  "_source": {
    "@timestamp": "2025-07-07T15:02:20.573+0000",
    "check_command": "load",
    "check_result.check_source": "homestead",
    "check_result.command": [
      "/usr/lib/nagios/plugins/check_load",
      "-c",
      "10,6,4",
      "-w",
      "5,4,3"
    ],
    "check_result.execution_end": "2025-07-07T15:02:20.573+0000",
    "check_result.execution_start": "2025-07-07T15:02:20.570+0000",
    "check_result.execution_time": 0.0027740001678466797,
    "check_result.exit_status": 0,
    "check_result.latency": 0.0003490447998046875,
    "check_result.output": "OK - load average: 0.24, 0.25, 0.26",
    "check_result.perfdata": [
        {
            "metric_name": "load1",
            "crit": 10,
            "min": 0,
            "value": 0.24,
            "warn": 5
        },
        {
            "metric_name": "load15",
            "crit": 4,
            "min": 0,
            "value": 0.26,
            "warn": 3
        },
        {
            "metric_name": "load5",
            "crit": 6,
            "min": 0,
            "value": 0.25,
            "warn": 4
        }
    ],
    "check_result.schedule_end": "2025-07-07T15:02:20.573+0000",
    "check_result.schedule_start": "2025-07-07T15:02:20.569+0000",
    "check_result.state": 0,
    "check_result.vars_after": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "check_result.vars_before": {
      "attempt": 1,
      "reachable": true,
      "state": 0,
      "state_type": 1
    },
    "current_check_attempt": 1,
    "host": "homestead",
    "last_hard_state": 0,
    "last_state": 0,
    "max_check_attempts": 5,
    "reachable": true,
    "service": "load",
    "state": 0,
    "state_type": 1,
    "timestamp": "2025-07-07T15:02:20.573+0000",
    "type": "icinga2.event.checkresult"
  }
}

Benefits:

  1. Avoids creating new fields for each metric in a check plugin
  2. Makes the index fields more predictable when searching
  3. Makes the perfdata independently searchable

Obviously, this would be a breaking change.

However, I don't see how the ElasticsearchWriter enable_send_perfdata option would ever be usable in practice without reworking the current performance data output, regardless of how it is reworked: with the current implementation, every cluster is doomed to run into the field mapping limit eventually, as fields cannot be removed from the mapping.

Note: I do realize that the current daily index rotation might alleviate this somewhat, until the number of plugins/metrics reaches the threshold.

Regards,
Markus

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions