Skip to content

Commit 004b2c6

Browse files
committed
add HAMi Recording rules
Signed-off-by: frezes <[email protected]>
1 parent b640662 commit 004b2c6

File tree

2 files changed

+182
-0
lines changed

2 files changed

+182
-0
lines changed

ks-prometheus/components/wiztelemetry-mixin/rules/gpu.libsonnet

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,109 @@
494494
},
495495
],
496496
},
// Recording-rule group "wiztelemetry-hami.rules".
// Derives container-level and device-level GPU series (presumably from HAMi exporters,
// per the group name — confirm) and per-node vGPU counts from kube-state-metrics.
// Expressions are templated with $._config (clusterLabel, kubeStateMetricsSelector).
497+
{
498+
name: "wiztelemetry-hami.rules",
499+
rules: [
// Container GPU utilization.
// Copies podnamespace/podname/ctrname into namespace/pod/container (label_replace with
// "(.*)" copies the whole source value), then sums Device_utilization_desc_of_container
// over every label not listed in the `by` clause.
// NOTE(review): the aggregation keeps `node`, but no label_replace here creates it —
// assumes the source series already carries a `node` label; confirm.
500+
{
501+
record: 'node_namespace_pod_container:container_gpu_utilization',
502+
expr: |||
503+
sum by (%(clusterLabel)s, node, namespace, pod, container) (
504+
label_replace(
505+
label_replace(
506+
label_replace(Device_utilization_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
507+
"pod",
508+
"$1",
509+
"podname",
510+
"(.*)"
511+
),
512+
"container",
513+
"$1",
514+
"ctrname",
515+
"(.*)"
516+
)
517+
)
518+
||| % $._config,
519+
},
// Container GPU memory usage.
// Same relabel-then-sum shape as the utilization rule above, applied to
// Device_memory_desc_of_container.
// NOTE(review): same assumption about a pre-existing `node` label; confirm.
520+
{
521+
record: 'node_namespace_pod_container:container_gpu_memory_usage',
522+
expr: |||
523+
sum by (%(clusterLabel)s, node, namespace, pod, container) (
524+
label_replace(
525+
label_replace(
526+
label_replace(Device_memory_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
527+
"pod",
528+
"$1",
529+
"podname",
530+
"(.*)"
531+
),
532+
"container",
533+
"$1",
534+
"ctrname",
535+
"(.*)"
536+
)
537+
)
538+
||| % $._config,
539+
},
// Per-device ratio GPUDeviceSharedNum / GPUDeviceCoreLimit * 100, with nodeid copied to
// `node` and deviceidx copied to `device_num`.
// NOTE(review): this is the only ratio in the group multiplied by 100; the core and
// memory ratios below are not scaled — confirm the intended unit.
// The expression contains no %(...)s placeholders, so `% $._config` substitutes nothing here.
540+
{
541+
record: 'node:vgpu_device:vgpu_allocated_utilization',
542+
expr: |||
543+
label_replace(
544+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit * 100, "node", "$1", "nodeid", "(.*)"),
545+
"device_num",
546+
"$1",
547+
"deviceidx",
548+
"(.*)"
549+
)
550+
||| % $._config,
551+
},
// Per-device ratio GPUDeviceCoreAllocated / GPUDeviceCoreLimit (unscaled), with nodeid
// copied to `node` and deviceidx copied to `device_num`.
552+
{
553+
record: 'node:vgpu_device:vgpu_core_allocated_utilization',
554+
expr: |||
555+
label_replace(
556+
label_replace(GPUDeviceCoreAllocated / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
557+
"device_num",
558+
"$1",
559+
"deviceidx",
560+
"(.*)"
561+
)
562+
||| % $._config,
563+
},
// Per-device ratio of allocated memory to GPUDeviceMemoryLimit (unscaled), relabelled
// like the two rules above.
// GPUDeviceMemoryAllocated is summed without `devicecores` before the division —
// presumably so its label set matches GPUDeviceMemoryLimit; verify against the exporter.
564+
{
565+
record: 'node:vgpu_device:vgpu_memory_allocated_utilization',
566+
expr: |||
567+
label_replace(
568+
label_replace(
569+
sum without (devicecores) (GPUDeviceMemoryAllocated) / GPUDeviceMemoryLimit,
570+
"node",
571+
"$1",
572+
"nodeid",
573+
"(.*)"
574+
),
575+
"device_num",
576+
"$1",
577+
"deviceidx",
578+
"(.*)"
579+
)
580+
||| % $._config,
581+
},
// Per-node sum of container resource requests for the nvidia_com_vgpu resource.
// NOTE(review): `=~` is a regex matcher, but the pattern has no metacharacters, so it
// can only match the exact name; `=` would express the same selection.
// NOTE(review): no pod-phase filter is applied — confirm whether requests from
// completed pods should be counted.
582+
{
583+
record: 'node:node_gpu_allocated_num:sum',
584+
expr: |||
585+
sum by (%(clusterLabel)s, node) (
586+
kube_pod_container_resource_requests{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu"}
587+
)
588+
||| % $._config,
589+
},
// Per-node sum of allocatable nvidia_com_vgpu resources.
// NOTE(review): the generated manifest shows another rule already recording
// node:node_gpu_num:sum (for huawei_com_Ascend resources); confirm that sharing the
// record name across groups is intended.
590+
{
591+
record: 'node:node_gpu_num:sum',
592+
expr: |||
593+
sum by(%(clusterLabel)s, node) (
594+
kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource=~"nvidia_com_vgpu"}
595+
)
596+
||| % $._config,
597+
},
598+
],
599+
},
497600
],
498601
},
499602
}

ks-prometheus/manifests/wiztelemetry-prometheusRule.yaml

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,85 @@ spec:
964964
kube_node_status_allocatable{job="kube-state-metrics",resource=~"huawei_com_Ascend(.*)"}
965965
)
966966
record: node:node_gpu_num:sum
# Rule group "wiztelemetry-hami.rules".
# NOTE(review): the expressions mirror the wiztelemetry-mixin gpu.libsonnet group with
# the cluster label rendered as `cluster` and the kube-state-metrics selector rendered
# as job="kube-state-metrics" — presumably generated output; confirm before hand-editing.
967+
- name: wiztelemetry-hami.rules
968+
rules:
# Container GPU utilization: copies podnamespace/podname/ctrname into
# namespace/pod/container, then sums Device_utilization_desc_of_container by
# cluster, node, namespace, pod, container.
# NOTE(review): assumes the source series already carries a `node` label; confirm.
969+
- expr: |
970+
sum by (cluster, node, namespace, pod, container) (
971+
label_replace(
972+
label_replace(
973+
label_replace(Device_utilization_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
974+
"pod",
975+
"$1",
976+
"podname",
977+
"(.*)"
978+
),
979+
"container",
980+
"$1",
981+
"ctrname",
982+
"(.*)"
983+
)
984+
)
985+
record: node_namespace_pod_container:container_gpu_utilization
# Container GPU memory usage: same relabel-then-sum shape, applied to
# Device_memory_desc_of_container.
986+
- expr: |
987+
sum by (cluster, node, namespace, pod, container) (
988+
label_replace(
989+
label_replace(
990+
label_replace(Device_memory_desc_of_container, "namespace", "$1", "podnamespace", "(.*)"),
991+
"pod",
992+
"$1",
993+
"podname",
994+
"(.*)"
995+
),
996+
"container",
997+
"$1",
998+
"ctrname",
999+
"(.*)"
1000+
)
1001+
)
1002+
record: node_namespace_pod_container:container_gpu_memory_usage
# Per-device ratio GPUDeviceSharedNum / GPUDeviceCoreLimit * 100, with nodeid copied to
# `node` and deviceidx copied to `device_num`.
# NOTE(review): the only ratio in this group multiplied by 100 — confirm the intended unit.
1003+
- expr: |
1004+
label_replace(
1005+
label_replace(GPUDeviceSharedNum / GPUDeviceCoreLimit * 100, "node", "$1", "nodeid", "(.*)"),
1006+
"device_num",
1007+
"$1",
1008+
"deviceidx",
1009+
"(.*)"
1010+
)
1011+
record: node:vgpu_device:vgpu_allocated_utilization
# Per-device ratio GPUDeviceCoreAllocated / GPUDeviceCoreLimit (unscaled), relabelled
# the same way.
1012+
- expr: |
1013+
label_replace(
1014+
label_replace(GPUDeviceCoreAllocated / GPUDeviceCoreLimit, "node", "$1", "nodeid", "(.*)"),
1015+
"device_num",
1016+
"$1",
1017+
"deviceidx",
1018+
"(.*)"
1019+
)
1020+
record: node:vgpu_device:vgpu_core_allocated_utilization
# Per-device ratio of allocated memory to GPUDeviceMemoryLimit (unscaled).
# GPUDeviceMemoryAllocated is summed without `devicecores` before dividing —
# presumably so its labels match GPUDeviceMemoryLimit; verify against the exporter.
1021+
- expr: |
1022+
label_replace(
1023+
label_replace(
1024+
sum without (devicecores) (GPUDeviceMemoryAllocated) / GPUDeviceMemoryLimit,
1025+
"node",
1026+
"$1",
1027+
"nodeid",
1028+
"(.*)"
1029+
),
1030+
"device_num",
1031+
"$1",
1032+
"deviceidx",
1033+
"(.*)"
1034+
)
1035+
record: node:vgpu_device:vgpu_memory_allocated_utilization
# Per-node sum of container resource requests for nvidia_com_vgpu.
# NOTE(review): `=~` with a pattern that has no metacharacters selects only the exact
# name; `=` would express the same selection.
# NOTE(review): no pod-phase filter is applied — confirm whether completed pods should count.
1036+
- expr: |
1037+
sum by (cluster, node) (
1038+
kube_pod_container_resource_requests{job="kube-state-metrics",resource=~"nvidia_com_vgpu"}
1039+
)
1040+
record: node:node_gpu_allocated_num:sum
# Per-node sum of allocatable nvidia_com_vgpu resources.
# NOTE(review): the preceding group also records node:node_gpu_num:sum (for
# huawei_com_Ascend resources); confirm that sharing the record name is intended.
1041+
- expr: |
1042+
sum by(cluster, node) (
1043+
kube_node_status_allocatable{job="kube-state-metrics",resource=~"nvidia_com_vgpu"}
1044+
)
1045+
record: node:node_gpu_num:sum
9671046
- name: wiztelemetry-kubelet.rules
9681047
rules:
9691048
- expr: |

0 commit comments

Comments
 (0)