Skip to content

Commit 20d1be5

Browse files
committed
Set up dcgmexporter interval to 1000ms when accelerated_compute_gpu_metrics_collection_interval is present and less than 60
1 parent 566a75b commit 20d1be5

File tree

2 files changed

+22
-3
lines changed

2 files changed

+22
-3
lines changed

charts/amazon-cloudwatch-observability/templates/_helpers.tpl

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,26 @@ Get the current recommended neuron-monitor image for a region
248248
{{- printf "%s/%s:%s" $imageDomain .Values.neuronMonitor.image.repository .Values.neuronMonitor.image.tag -}}
249249
{{- end -}}
250250

251+
{{/*
252+
Render the optional DCGM_EXPORTER_INTERVAL environment variable entry for dcgmExporter.
Each entry of .Values.agents is merged with a deep copy of .Values.agent, and its `config` (falling back to `defaultConfig`) is checked for logs.metrics_collected.kubernetes.accelerated_compute_gpu_metrics_collection_interval.
When that key is present and its value, converted with `int`, is less than 60, a single list item with name DCGM_EXPORTER_INTERVAL and value "1000" is emitted; otherwise the template renders nothing.
If more than one agent sets the key, the value read from the last such agent in the list is the one compared against 60.
NOTE(review): "1000" is presumably milliseconds and the threshold 60 presumably seconds - confirm against the DCGM exporter and CloudWatch agent documentation.
NOTE(review): the chained hasKey guard dereferences nested keys inside a single `and`; this looks like it relies on `and` short-circuiting - confirm the minimum supported Helm/Go template version.
253+
*/}}
254+
{{- define "dcgm-exporter.env" -}}
255+
{{- $intervalFound := false -}}
256+
{{- $intervalValue := 0 -}}
257+
{{- range .Values.agents -}}
258+
{{- $agent := merge . (deepCopy $.Values.agent) -}}
259+
{{- $agentConfig := $agent.config | default $agent.defaultConfig -}}
260+
{{- if and (hasKey $agentConfig "logs") (hasKey $agentConfig.logs "metrics_collected") (hasKey $agentConfig.logs.metrics_collected "kubernetes") (hasKey $agentConfig.logs.metrics_collected.kubernetes "accelerated_compute_gpu_metrics_collection_interval") -}}
261+
{{- $intervalFound = true -}}
262+
{{- $intervalValue = $agentConfig.logs.metrics_collected.kubernetes.accelerated_compute_gpu_metrics_collection_interval -}}
263+
{{- end -}}
264+
{{- end -}}
265+
{{- if and $intervalFound (lt ($intervalValue | int) 60) -}}
266+
- name: DCGM_EXPORTER_INTERVAL
267+
value: "1000"
268+
{{- end -}}
269+
{{- end -}}
270+
251271
{{/*
252272
Get the current recommended auto instrumentation java image
253273
*/}}
@@ -407,5 +427,3 @@ Get namespaceSelector value for admission webhooks
407427
{{- end -}}
408428
{{- end -}}
409429
{{- end -}}
410-
411-

charts/amazon-cloudwatch-observability/templates/linux/dcgm-exporter-daemonset.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ spec:
3131
valueFrom:
3232
fieldRef:
3333
fieldPath: spec.nodeName
34+
{{- include "dcgm-exporter.env" . | nindent 2 }}
3435
ports:
3536
- name: "metrics"
3637
port: {{ .Values.dcgmExporter.service.port }}
@@ -69,4 +70,4 @@ spec:
6970
cert_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.crt
7071
key_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.key
7172
{{- dict "component" .Values.dcgmExporter "context" . | include "amazon-cloudwatch-observability.common.tolerations" | nindent 2 }}
72-
{{- end }}
73+
{{- end }}

0 commit comments

Comments
 (0)