feat: apply updates to specified target resources

knave · knave · commit c748df6ed49e · 2025-07-01T02:10:09.000+08:00
diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go
@@ -99,7 +99,6 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) {
 
 	observedWorkloads := map[string]bool{}
 	for _, workload := range workloadList.Items {
-		autoScalingConfig := workload.Spec.AutoScalingConfig
 		if !workload.DeletionTimestamp.IsZero() {
 			continue
 		}
@@ -111,7 +110,7 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) {
 		}
 		workloadState.Namespace = workload.Namespace
 		workloadState.Resources = workload.Spec.Resources
-		workloadState.AutoScalingConfig = autoScalingConfig
+		workloadState.AutoScalingConfig = workload.Spec.AutoScalingConfig
 		s.WorkloadStates[workloadName] = workloadState
 
 		observedWorkloads[workloadName] = true
@@ -218,44 +217,40 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) {
 			continue
 		}
 
-		// TODO: apply config
-		// asConfig := workloadState.AutoScalingConfig
-		// NewResourceRecommenderFromAutoScalingConfig(ResouceRecomenderConfig{
-		// }).GetRecommendedResources(workloadState)
 		rr := s.ResourceRecommender.GetRecommendedResources(workloadState)
-		log.Info("Autoscaler processWorkloads", "recommended resources", rr)
+		log.Info("recommend resources", "workload", workloadState.Name, "resources", rr)
 
 		for _, worker := range podList.Items {
 			if !worker.DeletionTimestamp.IsZero() {
 				continue
 			}
 
-			if err := s.updateWorker(ctx, &worker, rr); err != nil {
+			if err := s.updateWorkerResourcesIfNeeded(ctx, workloadState, &worker, rr); err != nil {
 				log.Error(err, "failed to update worker")
 			}
 		}
 	}
 }
 
-func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *RecommendedResources) error {
-	annotations := worker.GetAnnotations()
-	newAnnotations := map[string]string{}
-
+func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workloadState *WorkloadState, worker *corev1.Pod, rr *RecommendedResources) error {
 	resourcesInfo := []struct {
+		name       string
 		requestKey string
 		limitKey   string
 		lowerBound ResourceAmount
 		upperBound ResourceAmount
 		target     ResourceAmount
 	}{
 		{
+			name:       "tflops",
 			requestKey: constants.TFLOPSRequestAnnotation,
 			limitKey:   constants.TFLOPSLimitAnnotation,
 			lowerBound: rr.LowerBoundTflops,
 			upperBound: rr.UpperBoundTflops,
 			target:     rr.TargetTflops,
 		},
 		{
+			name:       "vram",
 			requestKey: constants.VRAMRequestAnnotation,
 			limitKey:   constants.VRAMLimitAnnotation,
 			lowerBound: rr.LowerBoundVram,
@@ -264,8 +259,13 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R
 		},
 	}
 
+	annotations := worker.GetAnnotations()
+	newAnnotations := map[string]string{}
 	for _, resInfo := range resourcesInfo {
-		if err := updateResource(
+		if !workloadState.IsTargetResource(resInfo.name) {
+			continue
+		}
+		if err := detectResourceChanges(
 			annotations, newAnnotations,
 			resInfo.requestKey, resInfo.limitKey,
 			resInfo.lowerBound, resInfo.upperBound, resInfo.target,
@@ -291,7 +291,7 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R
 	return nil
 }
 
-func updateResource(annotations, newAnnotations map[string]string, requestKey, limitKey string, lowerBound, upperBound, target ResourceAmount) error {
+func detectResourceChanges(annotations, newAnnotations map[string]string, requestKey, limitKey string, lowerBound, upperBound, target ResourceAmount) error {
 	currentRequest, err := resource.ParseQuantity(annotations[requestKey])
 	if err != nil {
 		return fmt.Errorf("failed to parse %s: %v", requestKey, err)
diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go
@@ -158,7 +158,7 @@ var _ = Describe("Autoscaler", func() {
 	})
 
 	Context("when processing workloads", func() {
-		FIt("should update only those resources exceeding the recommended resource boundaries", func() {
+		It("should update only those resources exceeding the recommended resource boundaries", func() {
 			tfEnv := NewTensorFusionEnvBuilder().
 				AddPoolWithNodeCount(1).SetGpuCountPerNode(1).
 				Build()
@@ -184,6 +184,34 @@ var _ = Describe("Autoscaler", func() {
 			}).Should(Succeed())
 		})
 
+		It("should update specific resources based on TargetResource", func() {
+			tfEnv := NewTensorFusionEnvBuilder().
+				AddPoolWithNodeCount(1).SetGpuCountPerNode(1).
+				Build()
+			defer tfEnv.Cleanup()
+			workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1)
+			defer deleteWorkload(workload)
+
+			scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{})
+			scaler.LoadWorkloads(ctx)
+
+			scaler.ResourceRecommender = &FakeOutBoundRecommender{}
+			rr := scaler.ResourceRecommender.GetRecommendedResources(nil)
+
+			workloadState := scaler.WorkloadStates[workload.Name]
+			workloadState.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" // only TFLOPS may be rescaled
+
+			oldRes := workloadState.Resources // baseline for the VRAM comparison below
+			scaler.ProcessWorkloads(ctx)
+			Eventually(func(g Gomega) { // assert through g so a failed poll is retried instead of failing the spec
+				tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0])
+				g.Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops)))
+				g.Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2)))
+				g.Expect(vramRequest.Equal(oldRes.Requests.Vram)).To(BeTrue())
+				g.Expect(vramLimit.Equal(oldRes.Limits.Vram)).To(BeTrue())
+			}).Should(Succeed())
+		})
+
 		It("should return an error if failed to reallocate resources", func() {
 			tfEnv := NewTensorFusionEnvBuilder().
 				AddPoolWithNodeCount(1).SetGpuCountPerNode(1).
@@ -196,7 +224,7 @@ var _ = Describe("Autoscaler", func() {
 			scaler.LoadWorkloads(ctx)
 			scaler.ResourceRecommender = &FakeOutBoundRecommender{}
 			rr := scaler.ResourceRecommender.GetRecommendedResources(nil)
-			err := scaler.updateWorker(ctx, getWorkers(workload)[0], rr)
+			err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.WorkloadStates[workload.Name], getWorkers(workload)[0], rr)
 			Expect(err.Error()).To(ContainSubstring("failed to reallocate resources"))
 		})
 
@@ -388,16 +416,26 @@ func cleanupWorkload(key client.ObjectKey) {
 }
 
 func assertWorkerAnnotations(worker *corev1.Pod, rr *RecommendedResources) {
-	annotations := worker.GetAnnotations()
-	tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation])
+	tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker)
 	Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops)))
-
-	tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation])
 	Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2)))
-
-	vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation])
 	Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram)))
-
-	vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation])
 	Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2)))
 }
+
+// parseResourceAnnotations reads the TFLOPS request/limit and VRAM
+// request/limit annotations of worker and returns them as parsed
+// quantities, in that order.
+//
+// Parsing goes through resource.MustParse, so a missing or malformed
+// annotation panics rather than returning an error.
+func parseResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, vramRequest, vramLimit resource.Quantity) {
+	podAnnotations := worker.GetAnnotations()
+
+	// Parse in the same order as the named results are declared.
+	tflopsRequest = resource.MustParse(podAnnotations[constants.TFLOPSRequestAnnotation])
+	tflopsLimit = resource.MustParse(podAnnotations[constants.TFLOPSLimitAnnotation])
+	vramRequest = resource.MustParse(podAnnotations[constants.VRAMRequestAnnotation])
+	vramLimit = resource.MustParse(podAnnotations[constants.VRAMLimitAnnotation])
+	return tflopsRequest, tflopsLimit, vramRequest, vramLimit
+}