
Commit 8102d0c

Use kube-scheduler's metrics instead of kube-state-metrics

Use kube-scheduler's metrics instead of kube-state-metrics, as they are more precise. Refer to the links below for more details.

* https://github.com/kubernetes/enhancements/tree/master/keps/sig-instrumentation/1748-pod-resource-metrics?rgh-link-date=2022-10-07T13%3A34%3A39Z#expose-new-metrics
* kubernetes/kube-state-metrics#1849
1 parent a1c276d commit 8102d0c
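The namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum and namespace_memory:...:sum recording rules referenced throughout this diff are defined in one of the other changed files, which is not shown in this excerpt. As a rough sketch only (not the actual rule from this commit), the name suggests a fallback pattern along these lines, assuming the kube-scheduler's kube_pod_resource_request metric (KEP 1748) is scraped with job="kube-scheduler" and kube-state-metrics remains available as the fallback source:

    # Hypothetical sketch of the fallback pattern -- not the rule definition from this commit.
    # record: namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum
    sum by (namespace, cluster) (
        kube_pod_resource_request{resource="cpu", job="kube-scheduler"}
    )
    or
    sum by (namespace, cluster) (
        kube_pod_container_resource_requests{resource="cpu", job="kube-state-metrics"}
    )

The point of such a fallback is that clusters whose kube-scheduler does not yet expose the KEP 1748 metrics keep getting values from kube-state-metrics.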

11 files changed (+383, -434 lines)

alerts/resource_alerts.libsonnet

Lines changed: 10 additions & 10 deletions
@@ -34,18 +34,18 @@
       } +
       if $._config.showMultiCluster then {
         expr: |||
-          sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(kubeStateMetricsSelector)s,%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+          sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
           and
-          (sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu"}) by (%(clusterLabel)s)) > 0
+          (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
         ||| % $._config,
         annotations+: {
           description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
         },
       } else {
         expr: |||
-          sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
+          sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
           and
-          (sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})) > 0
+          (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
         ||| % $._config,
         annotations+: {
           description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
@@ -63,19 +63,19 @@
       } +
       if $._config.showMultiCluster then {
         expr: |||
-          sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+          sum(namespace_memory:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
           and
-          (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)) > 0
+          (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
         ||| % $._config,
         annotations+: {
           description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
         },
       } else
       {
         expr: |||
-          sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+          sum(namespace_memory:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
           and
-          (sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) - max(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})) > 0
+          (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
         ||| % $._config,
         annotations+: {
           description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
@@ -199,9 +199,9 @@
       {
         alert: 'CPUThrottlingHigh',
         expr: |||
-          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
+          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
            /
-          sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
+          sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
           > ( %(cpuThrottlingPercent)s / 100 )
         ||| % $._config,
         'for': '15m',
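For illustration only: with the Jsonnet placeholders substituted (assuming %(clusterLabel)s renders as "cluster" and leaving the overprovisioned-workload selector empty, both of which are assumptions about the config defaults and not shown in this diff), the new multi-cluster CPU overcommit expression above would render roughly as:

    sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum) by (cluster)
      - (sum(kube_node_status_allocatable{resource="cpu"}) by (cluster)
         - max(kube_node_status_allocatable{resource="cpu"}) by (cluster)) > 0
    and
    (sum(kube_node_status_allocatable{resource="cpu"}) by (cluster)
      - max(kube_node_status_allocatable{resource="cpu"}) by (cluster)) > 0

Subtracting the largest node's allocatable CPU from the cluster total models the loss of one node, so the alert fires when the requested CPU would no longer fit after such a failure.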

dashboards/resources/cluster.libsonnet

Lines changed: 12 additions & 12 deletions
@@ -146,23 +146,23 @@ local template = grafana.template;
   )
   .addPanel(
     g.panel('CPU Requests Commitment') +
-    g.statPanel('sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % $._config)
+    g.statPanel('sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % $._config)
   )
   .addPanel(
     g.panel('CPU Limits Commitment') +
-    g.statPanel('sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % $._config)
+    g.statPanel('sum(namespace_cpu:kube_pod_resource_limit_or_kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="cpu",%(clusterLabel)s="$cluster"})' % $._config)
   )
   .addPanel(
     g.panel('Memory Utilisation') +
     g.statPanel('1 - sum(:node_memory_MemAvailable_bytes:sum{%(clusterLabel)s="$cluster"}) / sum(node_memory_MemTotal_bytes{%(nodeExporterSelector)s,%(clusterLabel)s="$cluster"})' % $._config)
   )
   .addPanel(
     g.panel('Memory Requests Commitment') +
-    g.statPanel('sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % $._config)
+    g.statPanel('sum(namespace_memory:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % $._config)
   )
   .addPanel(
     g.panel('Memory Limits Commitment') +
-    g.statPanel('sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % $._config)
+    g.statPanel('sum(namespace_memory:kube_pod_resource_limit_or_kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) / sum(kube_node_status_allocatable{%(kubeStateMetricsSelector)s,resource="memory",%(clusterLabel)s="$cluster"})' % $._config)
   )
 )
 .addRow(
@@ -179,10 +179,10 @@ local template = grafana.template;
     g.panel('CPU Quota') +
     g.tablePanel(podWorkloadColumns + [
       'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(namespace_cpu:kube_pod_resource_limit_or_kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{%(clusterLabel)s="$cluster"}) by (namespace) / sum(namespace_cpu:kube_pod_resource_limit_or_kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
     ], tableStyles {
       'Value #C': { alias: 'CPU Usage' },
       'Value #D': { alias: 'CPU Requests' },
@@ -209,10 +209,10 @@ local template = grafana.template;
     g.tablePanel(podWorkloadColumns + [
       // Not using container_memory_usage_bytes here because that includes page cache
       'sum(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""}) by (namespace)' % $._config,
-      'sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
-      'sum(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(namespace_memory:kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(namespace_memory:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(namespace_memory:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(namespace_memory:kube_pod_resource_limit_or_kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
+      'sum(container_memory_rss{%(cadvisorSelector)s, %(clusterLabel)s="$cluster", container!=""}) by (namespace) / sum(namespace_memory:kube_pod_resource_limit_or_kube_pod_container_resource_limits:sum{%(clusterLabel)s="$cluster"}) by (namespace)' % $._config,
     ], tableStyles {
       'Value #C': { alias: 'Memory Usage', unit: 'bytes' },
       'Value #D': { alias: 'Memory Requests', unit: 'bytes' },
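To make the dashboard change concrete: assuming config values of "cluster" for %(clusterLabel)s and job="kube-state-metrics" for %(kubeStateMetricsSelector)s (assumed defaults, not shown in this diff), the CPU Requests Commitment stat panel query would expand to roughly:

    sum(namespace_cpu:kube_pod_resource_request_or_kube_pod_container_resource_requests:sum{cluster="$cluster"})
    /
    sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu",cluster="$cluster"})

Note that only the request and limit numerators move to the new recording rules; node allocatable capacity in the denominators is still read from kube-state-metrics.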
