Skip to content

Commit be19da0

Browse files
committed
feat: add condition type RecommendationProvided to workload
1 parent 6cbb753 commit be19da0

File tree

5 files changed

+19
-6
lines changed

5 files changed

+19
-6
lines changed

internal/autoscaler/autoscaler_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
"github.com/samber/lo"
3535
corev1 "k8s.io/api/core/v1"
3636
"k8s.io/apimachinery/pkg/api/errors"
37+
"k8s.io/apimachinery/pkg/api/meta"
3738
"k8s.io/apimachinery/pkg/api/resource"
3839
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3940
"k8s.io/apimachinery/pkg/types"
@@ -483,6 +484,8 @@ func verifyRecommendationStatus(workload *tfv1.TensorFusionWorkload, expectedRes
483484
Eventually(func(g Gomega) {
484485
g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed())
485486
g.Expect(workload.Status.Recommendation.Equal(expectedRes)).To(BeTrue())
487+
g.Expect(meta.FindStatusCondition(workload.Status.Conditions,
488+
constants.ConditionStatusTypeRecommendationProvided)).To(Not(BeNil()))
486489
res, _ := utils.GPUResourcesFromAnnotations(getWorkers(workload)[0].Annotations)
487490
g.Expect(res.Equal(expectedRes)).To(BeTrue())
488491
}).Should(Succeed())

internal/autoscaler/recommender/percentile_recommender.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workloa
103103
UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(aggregator)),
104104
}
105105

106-
log.Info("current recommended resources from percentile recommender", "workload", workload.Name, "resources", rr)
106+
log.V(6).Info("current recommended resources from percentile recommender", "workload", workload.Name, "resources", rr)
107107

108108
targetRes := &tfv1.Resources{}
109109
if curRes.Requests.Tflops.Cmp(rr.LowerBoundTflops) < 0 ||

internal/autoscaler/workload/handler.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@ import (
44
"context"
55
"fmt"
66
"maps"
7+
"time"
78

89
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
910
"github.com/NexusGPU/tensor-fusion/internal/constants"
1011
"github.com/NexusGPU/tensor-fusion/internal/gpuallocator"
1112
"github.com/NexusGPU/tensor-fusion/internal/utils"
1213
corev1 "k8s.io/api/core/v1"
14+
"k8s.io/apimachinery/pkg/api/meta"
15+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1316
"sigs.k8s.io/controller-runtime/pkg/client"
1417
"sigs.k8s.io/controller-runtime/pkg/log"
1518
)
@@ -95,6 +98,12 @@ func (h *handler) updateWorkload(
9598

9699
if workload.Status.Recommendation == nil || !workload.Status.Recommendation.Equal(targetRes) {
97100
workload.Status.Recommendation = targetRes
101+
meta.SetStatusCondition(&workload.Status.Conditions, metav1.Condition{
102+
Type: constants.ConditionStatusTypeRecommendationProvided,
103+
Status: metav1.ConditionTrue,
104+
Reason: "GPUResourcesRecommended",
105+
LastTransitionTime: metav1.NewTime(time.Now()),
106+
})
98107
if err := h.Status().Patch(ctx, workload, patch); err != nil {
99108
return fmt.Errorf("failed to patch workload status %s: %v", workload.Name, err)
100109
}

internal/constants/constants.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ const (
135135
ConditionStatusTypeGPUPool = "GPUPoolReady"
136136
ConditionStatusTypeTimeSeriesDatabase = "TimeSeriesDatabaseReady"
137137
ConditionStatusTypeCloudVendorConnection = "CloudVendorConnectionReady"
138+
139+
ConditionStatusTypeRecommendationProvided = "RecommendationProvided"
138140
)
139141

140142
const (

internal/controller/tensorfusionworkload_controller.go

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323

2424
corev1 "k8s.io/api/core/v1"
2525
"k8s.io/apimachinery/pkg/api/errors"
26+
"k8s.io/apimachinery/pkg/api/meta"
2627
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2728
"k8s.io/apimachinery/pkg/runtime"
2829
"k8s.io/client-go/tools/record"
@@ -333,7 +334,6 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
333334

334335
// Determine workload phase
335336
var phase tfv1.TensorFusionWorkloadPhase
336-
var conditions []metav1.Condition
337337

338338
// Update Ready condition based on readyReplicas and desired replicas
339339
readyCondition := metav1.Condition{
@@ -365,7 +365,8 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
365365
readyCondition.Reason = "WaitingForWorkers"
366366
readyCondition.Message = fmt.Sprintf("Ready replicas: %d/%d", readyReplicas, *workload.Spec.Replicas)
367367
}
368-
conditions = append(conditions, readyCondition)
368+
369+
conditionsChanged := meta.SetStatusCondition(&workload.Status.Conditions, readyCondition)
369370

370371
// Check if we need to update status
371372
totalReplicasChangedInDynamicReplicaMode :=
@@ -374,13 +375,11 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
374375
workload.Status.WorkerCount = int32(len(pods))
375376
}
376377
statusChanged := totalReplicasChangedInDynamicReplicaMode || workload.Status.ReadyWorkers != readyReplicas ||
377-
workload.Status.Phase != phase ||
378-
!utils.EqualConditionsDisregardTransitionTime(workload.Status.Conditions, conditions)
378+
workload.Status.Phase != phase || conditionsChanged
379379

380380
if statusChanged {
381381
log.Info("Updating workload status", "phase", phase, "readyReplicas", readyReplicas)
382382
workload.Status.Phase = phase
383-
workload.Status.Conditions = conditions
384383
workload.Status.ReadyWorkers = readyReplicas
385384
if err := r.Status().Update(ctx, workload); err != nil {
386385
return fmt.Errorf("update workload status: %w", err)

0 commit comments

Comments
 (0)