From 51d3fbea353c0fd3194f40782ab4072abe0edb0c Mon Sep 17 00:00:00 2001 From: May Lee Date: Tue, 2 Sep 2025 16:51:13 -0400 Subject: [PATCH 1/7] add cases --- .../get_started_with_the_custom_processor.md | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index 8c1937f8c411d..7b2624a3b42f0 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -311,6 +311,158 @@ For this example, you have a service field that contains an incorrect service na } ``` +## Remove attributes containing null values + +Attributes with null or empty values can add unnecessary bloat to your logs. Remove null values to trim the log and only send attributes that provide information. In the script below, the `empty_patterns` section contains the list of empty patterns to check for in your logs. You can add and remove patterns to fit your use case. + +``` +# Define your empty patterns +empty_patterns = ["null", "NULL", "N/A", "n/a", "none", "NONE", "-", "undefined"] + +# Apply generic cleanup +. = compact(map_values(., recursive: true) -> |v| { + if is_null(v) || + includes(empty_patterns, v) || + (is_string(v) && strip_whitespace!(v) == "") || + (is_array(v) && length!(v) == 0) || + (is_object(v) && length!(v) == 0) { + null + } else { + v + } +}) +``` + +## Merge nested attributes to root level + +Targeting nested objects or fields in a filter query may require multiple paths to define. This is common when working with the message field, where the resulting parsed contents are nested in an object. When you use the Observability Pipelines' filter syntax, accessing a nested field requires the . notation. 
+ +For example, this log contains a stringified JSON message: + +```json +{ + "level": "info", + "message": "{\"event_type\":\"user_login\",\"result\":\"success\",\"login_method\":\"oauth\",\"user_agent\":\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\",\"ip_address\":\"192.168.1.100\",\"session_id\":\"sess_abc123xyz\",\"duration_ms\":245}", + "timestamp": "2019-03-12T11:30:00Z", + "processed_ts": "2025-05-22T14:30:00Z", + "user_id": "12345", + "app_id": "streaming-services", + "ddtags": [ + "kube_service:my-service", + "k8_deployment:your-host", + "kube_cronjob:myjob" + ] +} +``` + +Now see the output after the `message` field is parsed. The parsed content is nested in the `message` object. + +``` +{ + "app_id": "streaming-services", + "ddtags": [ + "kube_service:my-service", + "k8_deployment:your-host", + "kube_cronjob:myjob" + ], + "level": "info", + "message": { + "duration_ms": 245, + "event_type": "user_login", + "ip_address": "192.168.1.100", + "login_method": "oauth", + "result": "success", + "session_id": "sess_abc123xyz", + "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + }, + "processed_ts": "2025-05-22T14:30:00Z", + "timestamp": "2019-03-12T11:30:00Z", + "user_id": "12345" +} +``` +In this case, to filter for `event_type`, you need to specify` @message.event_type`. Though that works, it can be difficult to do at scale. Therefore, Datadog recommends flattening the object to the root level. + +In order to merge the events from the `message` object to the root level, use this script: + +``` +if is_object(.message) { + . = merge!(., .message) + del(.message) +} +``` + +**Note**: This script works with any JSON object. You just need to replace the `message` attribute with the name of the field you are trying to flatten. 
+ +This results in the log with flattened attributes that you can filter directly: + +``` +{ + "app_id": "streaming-services", + "ddtags": [ + "kube_service:my-service", + "k8_deployment:your-host", + "kube_cronjob:myjob" + ], + "duration_ms": 245, + "event_type": "user_login", + "ip_address": "192.168.1.100", + "level": "info", + "login_method": "oauth", + "processed_ts": "2025-05-22T14:30:00Z", + "result": "success", + "session_id": "sess_abc123xyz", + "timestamp": "2019-03-12T11:30:00Z", + "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "user_id": "12345" +} +``` + +## Serialize outbound logs in _raw format + +Splunk and CrowdStrike prefer a format called `_raw` for log ingestion. Sending data in `_raw` normalizes your logs and allows you to benefit from their out-of-the-box dashboards, monitors, and threat detection content. To ensure the `_raw` log format gets applied, you can serialize the outbound event in `_raw`. + +**Notes**: +- You should add any processing, remapping, or parsing before using this step. +- Select `Raw` as the encoding option when you set up the Splunk HEC or Crowdstrike destination. + +An example input log: + +``` +{ + "app_id": "streaming-services", + "level": "info", + "message": { + "duration_ms": 245, + "event_type": "user_login", + "ip_address": "192.168.1.100", + "login_method": "oauth", + "result": "success", + "session_id": "sess_abc123xyz", + "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + }, + "processed_ts": "2025-05-22T14:30:00Z", + "timestamp": "2019-03-12T11:30:00Z", + "user_id": "12345" +} +``` + +This custom function serializes the event to `_raw` format: + +``` +# Serialize the entire event into _raw +._raw = encode_key_value(.) +# Only keep _raw +. 
= { "_raw": ._raw } +``` + +This is the output of the example log after it's been processed by the custom script: + +``` +{ + "_raw": "app_id=streaming-services level=info message.duration_ms=245 message.event_type=user_login message.ip_address=192.168.1.100 message.login_method=oauth message.result=success message.session_id=sess_abc123xyz message.user_agent=\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\" processed_ts=2025-05-22T14:30:00Z timestamp=2019-03-12T11:30:00Z user_id=12345" +} +``` + ## Further reading {{< partial name="whats-next/whats-next.html" >}} From 5a93b852b6d0a10fbbcb76af6a11b2e89b6e4380 Mon Sep 17 00:00:00 2001 From: May Lee Date: Tue, 2 Sep 2025 16:55:08 -0400 Subject: [PATCH 2/7] add to overview list --- .../guide/get_started_with_the_custom_processor.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index 7b2624a3b42f0..d587e8aebf795 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -28,6 +28,9 @@ It also goes over example scripts that address common use cases, such as: - [Remap timestamps for historical logs](#remap-timestamps-for-historical-logs) - [Extract a field from the Datadog tags array (`ddtags`)](#extract-a-field-from-the-datadog-tags-array) - [Reference another field's value](#reference-another-fields-value) +- [Remove attributes containing null values](#remove-attributes-containing-null-values) +- [Merge nested attributes to root level](#merge-nested-attributes-to-root-level) +- [Serialize outbound logs in _raw format](#serialize-outbound-logs-in-_raw-format) ## Decode Base64 From 06a7cc0be42cd3d0223f4596e9ca3fcc48043b46 Mon Sep 17 00:00:00 2001 From: May Lee Date: Wed, 10 Sep 2025 16:12:43 -0400 Subject: 
[PATCH 3/7] small edits --- .../guide/get_started_with_the_custom_processor.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index d587e8aebf795..48f7c7ce14b41 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -338,7 +338,7 @@ empty_patterns = ["null", "NULL", "N/A", "n/a", "none", "NONE", "-", "undefined" ## Merge nested attributes to root level -Targeting nested objects or fields in a filter query may require multiple paths to define. This is common when working with the message field, where the resulting parsed contents are nested in an object. When you use the Observability Pipelines' filter syntax, accessing a nested field requires the . notation. +Targeting nested objects or fields in a filter query may require you to define multiple paths. This is common when working with the message field, where the resulting parsed contents are nested in an object. When you use the Observability Pipelines' filter syntax, accessing a nested field requires the . notation. For example, this log contains a stringified JSON message: @@ -358,7 +358,7 @@ For example, this log contains a stringified JSON message: } ``` -Now see the output after the `message` field is parsed. The parsed content is nested in the `message` object. +This is the output after the `message` field has been parsed. The parsed content is nested in the `message` object. ``` { @@ -383,9 +383,9 @@ Now see the output after the `message` field is parsed. The parsed content is ne "user_id": "12345" } ``` -In this case, to filter for `event_type`, you need to specify` @message.event_type`. Though that works, it can be difficult to do at scale. 
Therefore, Datadog recommends flattening the object to the root level. +In this case, to filter for `event_type`, you need to specify` @message.event_type`. While that works, it can be difficult to do so at scale. Therefore, Datadog recommends flattening the object to the root level. -In order to merge the events from the `message` object to the root level, use this script: +In order to merge the events from the `message` object to root level, use this script: ``` if is_object(.message) { @@ -420,12 +420,14 @@ This results in the log with flattened attributes that you can filter directly: } ``` +**Note**: If you flatten the message field and send the logs to Datadog, the resulting log does not have a "Log message" in the Log Explorer. + ## Serialize outbound logs in _raw format Splunk and CrowdStrike prefer a format called `_raw` for log ingestion. Sending data in `_raw` normalizes your logs and allows you to benefit from their out-of-the-box dashboards, monitors, and threat detection content. To ensure the `_raw` log format gets applied, you can serialize the outbound event in `_raw`. **Notes**: -- You should add any processing, remapping, or parsing before using this step. +- You should add other processing, remapping, and parsing steps before serializing your logs in `_raw` format. - Select `Raw` as the encoding option when you set up the Splunk HEC or Crowdstrike destination. 
An example input log: @@ -449,7 +451,7 @@ An example input log: } ``` -This custom function serializes the event to `_raw` format: +This custom function serializes the event into `_raw` format: ``` # Serialize the entire event into _raw From 2a8a21b19789592fed68865c421eb1e322b33bc4 Mon Sep 17 00:00:00 2001 From: May Lee Date: Wed, 10 Sep 2025 16:53:07 -0400 Subject: [PATCH 4/7] small edit --- .../guide/get_started_with_the_custom_processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index 48f7c7ce14b41..655b6ae243a7d 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -420,7 +420,7 @@ This results in the log with flattened attributes that you can filter directly: } ``` -**Note**: If you flatten the message field and send the logs to Datadog, the resulting log does not have a "Log message" in the Log Explorer. +**Note**: If you flatten the message field, the resulting log no longer has a message object. This means if the log is sent to Datadog, when you view the log in Log Explorer, you will not see a Log Message section in the log side panel. 
## Serialize outbound logs in _raw format From 3b3fbf48bb4dbe4e2773037dc1e28c6a681d1041 Mon Sep 17 00:00:00 2001 From: May Lee Date: Thu, 11 Sep 2025 12:51:05 -0400 Subject: [PATCH 5/7] Apply suggestions from code review Co-authored-by: Joe Peeples --- .../guide/get_started_with_the_custom_processor.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index 655b6ae243a7d..910a1d5a399ee 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -338,7 +338,7 @@ empty_patterns = ["null", "NULL", "N/A", "n/a", "none", "NONE", "-", "undefined" ## Merge nested attributes to root level -Targeting nested objects or fields in a filter query may require you to define multiple paths. This is common when working with the message field, where the resulting parsed contents are nested in an object. When you use the Observability Pipelines' filter syntax, accessing a nested field requires the . notation. +Targeting nested objects or fields in a filter query may require you to define multiple paths. This is common when working with the message field, where the resulting parsed contents are nested in an object. When you use the Observability Pipelines' filter syntax, accessing a nested field requires the `.` notation. For example, this log contains a stringified JSON message: @@ -383,9 +383,9 @@ This is the output after the `message` field has been parsed. The parsed content "user_id": "12345" } ``` -In this case, to filter for `event_type`, you need to specify` @message.event_type`. While that works, it can be difficult to do so at scale. Therefore, Datadog recommends flattening the object to the root level. 
+In this case, to filter for `event_type`, you need to specify `@message.event_type`. While that works, it can be difficult to do so at scale. Therefore, Datadog recommends flattening the object to the root level. -In order to merge the events from the `message` object to root level, use this script: +To merge the events from the `message` object to root level, use this script: ``` if is_object(.message) { @@ -420,7 +420,7 @@ This results in the log with flattened attributes that you can filter directly: } ``` -**Note**: If you flatten the message field, the resulting log no longer has a message object. This means if the log is sent to Datadog, when you view the log in Log Explorer, you will not see a Log Message section in the log side panel. +**Note**: If you flatten the `message` field, the resulting log no longer has a `message` object. As a result, if you send the log to Datadog, the log side panel in Log Explorer does not show a **Log Message** section. ## Serialize outbound logs in _raw format @@ -428,7 +428,7 @@ Splunk and CrowdStrike prefer a format called `_raw` for log ingestion. Sending **Notes**: - You should add other processing, remapping, and parsing steps before serializing your logs in `_raw` format. -- Select `Raw` as the encoding option when you set up the Splunk HEC or Crowdstrike destination. +- Select `Raw` as the encoding option when you set up the Splunk HEC or CrowdStrike destination. 
An example input log: From 062bc60cee0dec3a1d20ac3a89d4f4acbda9d705 Mon Sep 17 00:00:00 2001 From: May Lee Date: Thu, 11 Sep 2025 12:53:46 -0400 Subject: [PATCH 6/7] add json --- .../guide/get_started_with_the_custom_processor.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index 910a1d5a399ee..8dd0d16dc4b24 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -318,7 +318,7 @@ For this example, you have a service field that contains an incorrect service na Attributes with null or empty values can add unnecessary bloat to your logs. Remove null values to trim the log and only send attributes that provide information. In the script below, the `empty_patterns` section contains the list of empty patterns to check for in your logs. You can add and remove patterns to fit your use case. -``` +```json # Define your empty patterns empty_patterns = ["null", "NULL", "N/A", "n/a", "none", "NONE", "-", "undefined"] @@ -360,7 +360,7 @@ For example, this log contains a stringified JSON message: This is the output after the `message` field has been parsed. The parsed content is nested in the `message` object. -``` +```json { "app_id": "streaming-services", "ddtags": [ @@ -387,7 +387,7 @@ In this case, to filter for `event_type`, you need to specify `@message.event_ty To merge the events from the `message` object to root level, use this script: -``` +```json if is_object(.message) { . 
= merge!(., .message) del(.message) @@ -398,7 +398,7 @@ if is_object(.message) { This results in the log with flattened attributes that you can filter directly: -``` +```json { "app_id": "streaming-services", "ddtags": [ @@ -432,7 +432,7 @@ Splunk and CrowdStrike prefer a format called `_raw` for log ingestion. Sending An example input log: -``` +```json { "app_id": "streaming-services", "level": "info", @@ -453,7 +453,7 @@ An example input log: This custom function serializes the event into `_raw` format: -``` +```json # Serialize the entire event into _raw ._raw = encode_key_value(.) # Only keep _raw @@ -462,7 +462,7 @@ This custom function serializes the event into `_raw` format: This is the output of the example log after it's been processed by the custom script: -``` +```json { "_raw": "app_id=streaming-services level=info message.duration_ms=245 message.event_type=user_login message.ip_address=192.168.1.100 message.login_method=oauth message.result=success message.session_id=sess_abc123xyz message.user_agent=\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36\" processed_ts=2025-05-22T14:30:00Z timestamp=2019-03-12T11:30:00Z user_id=12345" } From cc05119988d8d264007ed53d425653eb83aade8d Mon Sep 17 00:00:00 2001 From: May Lee Date: Thu, 11 Sep 2025 14:53:26 -0400 Subject: [PATCH 7/7] Update content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md --- .../guide/get_started_with_the_custom_processor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md index 8dd0d16dc4b24..aa87ffee23b70 100644 --- a/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md +++ b/content/en/observability_pipelines/guide/get_started_with_the_custom_processor.md @@ -383,7 +383,7 @@ This is the output after the `message` field has been parsed. 
The parsed content "user_id": "12345" } ``` -In this case, to filter for `event_type`, you need to specify `@message.event_type`. While that works, it can be difficult to do so at scale. Therefore, Datadog recommends flattening the object to the root level. +In this case, to filter for `event_type`, you need to specify `@message.event_type`. To filter directly for `event_type` (for example, with `@event_type`) or another field within an object, Datadog recommends flattening the object to the root level. To merge the events from the `message` object to root level, use this script: