Provide a metric for cohort resource reservations#9833
Provide a metric for cohort resource reservations#9833mszadkow wants to merge 2 commits intokubernetes-sigs:mainfrom
Conversation
|
[APPROVALNOTIFIER] This PR is NOT APPROVED This pull-request has been approved by: mszadkow The full list of commands accepted by this bot can be found here. DetailsNeeds approval from an approver in each of these files:Approvers can indicate their approval by writing |
✅ Deploy Preview for kubernetes-sigs-kueue canceled.
|
1cc9dc8 to
081ab40
Compare
test/integration/singlecluster/scheduler/fairsharing/metrics_test.go
Outdated
Show resolved
Hide resolved
test/integration/singlecluster/scheduler/fairsharing/metrics_test.go
Outdated
Show resolved
Hide resolved
f87ae90 to
a26f57a
Compare
a26f57a to
c94a40d
Compare
|
/lgtm |
|
LGTM label has been added. DetailsGit tree hash: 63f91d381ca482809071ea08ba34d1800d818d03 |
c94a40d to
35c178e
Compare
|
New changes are detected. LGTM label has been removed. |
11cf83f to
8761860
Compare
|
@mbobrovskyi please check again |
8761860 to
b85a8be
Compare
b85a8be to
58b1de4
Compare
58b1de4 to
06f08e6
Compare
35dee7e to
ff6a34b
Compare
| // collectCohortMetricPoints prepares metric values for the target cohort and each ancestor. | ||
| // When simulateRemoval=true, it computes the remaining ancestor subtree values after | ||
| // subtracting the target cohort subtree quotas and reservations. |
There was a problem hiding this comment.
Is it the intention?
// collectCohortMetricPoints prepares subtree metric points for the target cohort
// and all cohorts on the path from the target to the root.
// When simulateRemoval=true, it computes post-removal subtree values by subtracting
// the target cohort's subtree contribution from each cohort's current subtree totals.
// This is used when clearing metrics so ancestor subtree gauges are updated too,
// rather than left with stale values after the target cohort is removed.
There was a problem hiding this comment.
Yes, that's the point of this function
| removedSubtreeQuota := ch.resourceNode.SubtreeQuota | ||
|
|
||
| // Cache subtree reservation aggregations once per cohort during this run to avoid | ||
| // repeated recursion when walking ancestor paths. | ||
| reservationsCache := make(map[*cohort]resources.FlavorResourceQuantities) | ||
| removedSubtreeReservations := totalSubtreeReservationsWithCache(ch, reservationsCache) | ||
|
|
||
| var points []cohortMetricPoint | ||
| for ancestor := range ch.PathSelfToRoot() { | ||
| quotas := ancestor.resourceNode.SubtreeQuota | ||
| cohortSubtreeQuota := ancestor.resourceNode.SubtreeQuota | ||
| cohortSubtreeReservations := totalSubtreeReservationsWithCache(ancestor, reservationsCache) | ||
|
|
||
| var ancestorCurrentSubtreeReservations resources.FlavorResourceQuantities | ||
| if simulateRemoval { | ||
| quotas = removedQuota | ||
| // In removal mode, subtract target subtree values from the ancestor current values | ||
| // to obtain the post-removal snapshot used for clearing metrics. | ||
| cohortSubtreeQuota = removedSubtreeQuota | ||
| cohortSubtreeReservations = removedSubtreeReservations | ||
| ancestorCurrentSubtreeReservations = totalSubtreeReservationsWithCache(ancestor, reservationsCache) | ||
| } | ||
|
|
||
| for flr, qty := range quotas { | ||
| flavorResourceKeys := sets.New[resources.FlavorResource]() | ||
| flavorResourceKeys.Insert(slices.Collect(maps.Keys(cohortSubtreeQuota))...) | ||
| flavorResourceKeys.Insert(slices.Collect(maps.Keys(cohortSubtreeReservations))...) | ||
|
|
||
| for fr := range flavorResourceKeys { | ||
| quotaQty := cohortSubtreeQuota[fr] | ||
| reservationsQty := cohortSubtreeReservations[fr] | ||
| if simulateRemoval { | ||
| qty = max(ancestor.resourceNode.SubtreeQuota[flr]-qty, 0) | ||
| quotaQty = max(ancestor.resourceNode.SubtreeQuota[fr]-quotaQty, 0) | ||
| reservationsQty = max(ancestorCurrentSubtreeReservations[fr]-reservationsQty, 0) | ||
| } | ||
| points = append(points, cohortMetricPoint{ | ||
| cohortName: ancestor.Name, | ||
| flavorResource: flr, | ||
| qty: qty, | ||
| cohortName: ancestor.Name, | ||
| flavorResource: fr, | ||
| quotaQty: quotaQty, | ||
| reservationsQty: reservationsQty, | ||
| }) |
There was a problem hiding this comment.
The code is quite complex I would say. Maybe worth considering a helper to Subtract the usage maps represtented as ResourceFlavorQuantity maps to hide technicalities behind an abstraction and avoid some weird tricks with pinning / unpining the cohort variables.
I imagine the flow could be overall somthing like :
currentCohortQuota := ch.resourceNode.SubtreeQuota
for ancestor := range ch.PathSelfToRoot() {
ancestorCohortQuota := ancestor.resourceNode.SubtreeQuota
if simulateRemoval {
currentCohortQuota.Subtract(ancestorCohortQuota) // or currentCohortQuotaWithRemove = currentCohortQuota.Sub(ancestorCohortQuota) if we need to also keep previous value
}This would iiuc allow us later to skip the simulateRemoval handling in the flow.
Let me know if I missing something or your encounter some obstacles.
| } | ||
|
|
||
| removedQuota := ch.resourceNode.SubtreeQuota | ||
| removedSubtreeQuota := ch.resourceNode.SubtreeQuota |
There was a problem hiding this comment.
Let's also add some unit tests for the function which would allow quick verification of correctness with debugger without running the heavy machinery of integration tests.
ff6a34b to
60b5ea1
Compare
|
PR needs rebase. DetailsInstructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes-sigs/prow repository. |
60b5ea1 to
fa2f796
Compare
|
@mszadkow: The following tests failed, say
Full PR test history. Your PR dashboard. Please help us cut down on flakes by linking to an open issue when you hit one in your PR. DetailsInstructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes-sigs/prow repository. I understand the commands that are listed here. |
What type of PR is this?
/kind feature
What this PR does / why we need it:
Add metrics for cohort's resource reservations.
Which issue(s) this PR fixes:
Relates to #7539
Special notes for your reviewer:
Does this PR introduce a user-facing change?