From 67c592ffbcec00c22756d4e9a587aa9754ce9017 Mon Sep 17 00:00:00 2001 From: Ronaldo Saheki Date: Wed, 27 Aug 2025 10:53:15 +0100 Subject: [PATCH 1/2] Add annotations support to DCGM Exporter service configuration - Introduced an `Annotations` field in `DCGMExporterServiceConfig` to allow custom annotations for the DCGM Exporter service. - Updated the deepcopy function to handle the new `Annotations` field. - Modified the service transformation logic to apply specified annotations to the DCGM Exporter service. - Enhanced the CRD definition to include the new annotations property. - Added unit tests to verify the correct application of annotations in various scenarios. Signed-off-by: Ronaldo Saheki --- api/nvidia/v1/clusterpolicy_types.go | 6 + api/nvidia/v1/zz_generated.deepcopy.go | 7 ++ .../crd/bases/nvidia.com_clusterpolicies.yaml | 6 + controllers/object_controls.go | 10 ++ controllers/transforms_test.go | 118 ++++++++++++++++++ deployments/gpu-operator/values.yaml | 1 + 6 files changed, 148 insertions(+) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 5b9535dba..7dc131b5e 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -946,6 +946,12 @@ type DCGMExporterServiceConfig struct { // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Internal Traffic Policy for the DCGM Exporter K8s Service" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" InternalTrafficPolicy *corev1.ServiceInternalTrafficPolicy `json:"internalTrafficPolicy,omitempty"` + + // Annotations to be added to the DCGM Exporter service + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Annotations for the DCGM Exporter K8s Service" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text" + Annotations map[string]string `json:"annotations,omitempty"` } // DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 1735b0699..9ccafa89a 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -281,6 +281,13 @@ func (in *DCGMExporterServiceConfig) DeepCopyInto(out *DCGMExporterServiceConfig *out = new(corev1.ServiceInternalTrafficPolicy) **out = **in } + if in.Annotations != nil { + in, out := &in.Annotations, &out.Annotations + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DCGMExporterServiceConfig. diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index c032907c6..ac4104c98 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -385,6 +385,12 @@ spec: description: 'Optional: Service configuration for NVIDIA DCGM Exporter' properties: + annotations: + additionalProperties: + type: string + description: Annotations to be added to the DCGM Exporter + service + type: object internalTrafficPolicy: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 917187e21..3595cc656 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -998,6 +998,16 @@ func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPoli if serviceConfig.InternalTrafficPolicy != nil { obj.Spec.InternalTrafficPolicy = serviceConfig.InternalTrafficPolicy } + + // set annotations if specified for dcgm-exporter service + if len(serviceConfig.Annotations) > 0 { + if obj.ObjectMeta.Annotations == nil { + obj.ObjectMeta.Annotations = make(map[string]string) + } + for annKey, annValue := range serviceConfig.Annotations { + obj.ObjectMeta.Annotations[annKey] = annValue + } + } } return nil } diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index ece2e015c..0f24df16f 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1600,6 +1600,124 @@ func TestTransformNodeStatusExporter(t *testing.T) { } } +func TestTransformDCGMExporterService(t *testing.T) { + testCases := []struct { + description string + service *corev1.Service + cpSpec *gpuv1.ClusterPolicySpec + expectedService *corev1.Service + }{ + { + description: "service without annotations", + service: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-dcgm-exporter", + Annotations: map[string]string{ + "prometheus.io/scrape": "true", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + }, + }, + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + ServiceSpec: &gpuv1.DCGMExporterServiceConfig{ + Type: corev1.ServiceTypeNodePort, + }, + }, + }, + expectedService: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-dcgm-exporter", + Annotations: map[string]string{ + "prometheus.io/scrape": "true", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeNodePort, + }, + }, + }, + { + description: "service with custom annotations", + service: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-dcgm-exporter", + Annotations: map[string]string{ + "prometheus.io/scrape": "true", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + }, + }, + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + ServiceSpec: &gpuv1.DCGMExporterServiceConfig{ + Type: corev1.ServiceTypeNodePort, + Annotations: map[string]string{ + "custom.annotation/key": "custom-value", + "another.annotation": "another-value", + }, + }, + }, + }, + expectedService: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-dcgm-exporter", + Annotations: map[string]string{ + "prometheus.io/scrape": "true", + "custom.annotation/key": "custom-value", + "another.annotation": "another-value", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeNodePort, + }, + }, + }, + { + description: "service with nil service spec", + service: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-dcgm-exporter", + Annotations: map[string]string{ + "prometheus.io/scrape": "true", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + }, + }, + cpSpec: &gpuv1.ClusterPolicySpec{ + DCGMExporter: gpuv1.DCGMExporterSpec{ + ServiceSpec: nil, + }, + }, + expectedService: &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-dcgm-exporter", + Annotations: map[string]string{ + "prometheus.io/scrape": "true", + }, + }, + Spec: corev1.ServiceSpec{ + Type: corev1.ServiceTypeClusterIP, + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformDCGMExporterService(tc.service, tc.cpSpec) + require.NoError(t, err) + require.EqualValues(t, tc.expectedService, tc.service) + }) + } +} + func TestTransformDriver(t *testing.T) { initMockK8sClients() testCases := []struct { diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 98acfcda1..dfd461162 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -295,6 +295,7 @@ dcgmExporter: resources: {} service: internalTrafficPolicy: Cluster + annotations: {} serviceMonitor: enabled: false interval: 15s From 600df94734e83ca14d51a4e8bb02ea9ba36c6323 Mon Sep 17 00:00:00 2001 From: Ronaldo Saheki Date: Wed, 27 Aug 2025 15:36:31 +0100 Subject: [PATCH 2/2] Add annotations property to DCGM Exporter service configuration in CRDs - Enhanced the CRD definitions for both `nvidia.com_clusterpolicies.yaml` files to include an `annotations` field. - This new field allows users to specify custom annotations for the DCGM Exporter service. Signed-off-by: Ronaldo Saheki --- bundle/manifests/nvidia.com_clusterpolicies.yaml | 6 ++++++ .../gpu-operator/crds/nvidia.com_clusterpolicies.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/bundle/manifests/nvidia.com_clusterpolicies.yaml b/bundle/manifests/nvidia.com_clusterpolicies.yaml index c032907c6..ac4104c98 100644 --- a/bundle/manifests/nvidia.com_clusterpolicies.yaml +++ b/bundle/manifests/nvidia.com_clusterpolicies.yaml @@ -385,6 +385,12 @@ spec: description: 'Optional: Service configuration for NVIDIA DCGM Exporter' properties: + annotations: + additionalProperties: + type: string + description: Annotations to be added to the DCGM Exporter + service + type: object internalTrafficPolicy: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP. diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index c032907c6..ac4104c98 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -385,6 +385,12 @@ spec: description: 'Optional: Service configuration for NVIDIA DCGM Exporter' properties: + annotations: + additionalProperties: + type: string + description: Annotations to be added to the DCGM Exporter + service + type: object internalTrafficPolicy: description: InternalTrafficPolicy describes how nodes distribute service traffic they receive on the ClusterIP.