From 125977cb8c4d1dc909d1b8f1470d133526743b23 Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 20 Jun 2025 06:18:54 +0800 Subject: [PATCH 01/27] feat: implement the core auto-scaling functionality --- go.mod | 1 + internal/autoscaler/autoscaler.go | 293 ++++++++++ internal/autoscaler/autoscaler_suite_test.go | 575 +++++++++++++++++++ internal/autoscaler/autoscaler_test.go | 397 +++++++++++++ internal/autoscaler/estimator.go | 166 ++++++ internal/autoscaler/metricsprovider.go | 30 + internal/autoscaler/recommender.go | 84 +++ internal/autoscaler/recommender_test.go | 19 + internal/autoscaler/workerstate.go | 92 +++ internal/autoscaler/workloadstate.go | 64 +++ 10 files changed, 1721 insertions(+) create mode 100644 internal/autoscaler/autoscaler.go create mode 100644 internal/autoscaler/autoscaler_suite_test.go create mode 100644 internal/autoscaler/autoscaler_test.go create mode 100644 internal/autoscaler/estimator.go create mode 100644 internal/autoscaler/metricsprovider.go create mode 100644 internal/autoscaler/recommender.go create mode 100644 internal/autoscaler/recommender_test.go create mode 100644 internal/autoscaler/workerstate.go create mode 100644 internal/autoscaler/workloadstate.go diff --git a/go.mod b/go.mod index bfdc3c41..053d4e75 100644 --- a/go.mod +++ b/go.mod @@ -119,6 +119,7 @@ require ( github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/stretchr/objx v0.5.2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect github.com/x448/float16 v0.8.4 // indirect diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go new file mode 100644 index 00000000..923f6a5a --- /dev/null +++ b/internal/autoscaler/autoscaler.go @@ -0,0 +1,293 @@ +package autoscaler + +import ( + "context" + "errors" + "math/big" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +type Autoscaler struct { + client.Client + Recommender + MetricsProvider + WorkloadStates map[string]*WorkloadState + WorkerStates map[string]*WorkerState +} + +func NewAutoscaler(c client.Client) (*Autoscaler, error) { + if c == nil { + return nil, errors.New("must specify client") + } + + return &Autoscaler{ + Client: c, + Recommender: NewRecommender(), + MetricsProvider: NewMetricsProvider(), + WorkloadStates: map[string]*WorkloadState{}, + WorkerStates: map[string]*WorkerState{}, + }, nil +} + +func (s *Autoscaler) Start(ctx context.Context) error { + log := log.FromContext(ctx) + log.Info("Starting autoscaler") + + s.LoadHistoryMetrics(ctx) // TODO: handle timeout + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.Run(ctx) + case <-ctx.Done(): + log.Info("Stopping autoscaler") + return nil + } + } +} + +func (s *Autoscaler) Run(ctx context.Context) { + log := log.FromContext(ctx) + + log.Info("Autoscaler running") + s.LoadWorkloads(ctx) + s.LoadRealTimeMetrics(ctx) + s.ProcessWorkloads(ctx) +} + +func (s *Autoscaler) LoadWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + + workloadList := tfv1.TensorFusionWorkloadList{} + if err := s.List(ctx, &workloadList); err != nil { + 
log.Error(err, "failed to list workloads") + return + } + + observedWorkloads := map[string]bool{} + for _, workload := range workloadList.Items { + autoScalingConfig := workload.Spec.AutoScalingConfig + // Currently only supports enabling both AutoSetLimits and AutoSetRequests simultaneously + if !workload.DeletionTimestamp.IsZero() || + !(autoScalingConfig.AutoSetLimits.Enable && + autoScalingConfig.AutoSetRequests.Enable) { + continue + } + + workloadName := workload.Name + workloadState, exists := s.WorkloadStates[workloadName] + if !exists { + workloadState = NewWorkloadState(workloadName) + } + workloadState.Namespace = workload.Namespace + workloadState.Resources = workload.Spec.Resources + workloadState.AutoScalingConfig = autoScalingConfig + s.WorkloadStates[workloadName] = workloadState + + observedWorkloads[workloadName] = true + + podList := &corev1.PodList{} + if err := s.List(ctx, podList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { + log.Error(err, "failed to list workers") + continue + } + + observedWorkers := map[string]bool{} + for _, worker := range podList.Items { + if !worker.DeletionTimestamp.IsZero() { + continue + } + if _, exists := s.WorkerStates[worker.Name]; !exists { + s.WorkerStates[worker.Name] = NewWorkerState(worker.Name, workloadName) + } + observedWorkers[worker.Name] = true + } + + s.WorkerStates = lo.OmitBy(s.WorkerStates, func(key string, state *WorkerState) bool { + return state.Workload == workloadName && !observedWorkers[key] + }) + } + + // remove unused workloadStates + s.WorkloadStates = lo.OmitBy(s.WorkloadStates, func(key string, value *WorkloadState) bool { + return !observedWorkloads[key] + }) + + // remove unused workerStates + s.WorkerStates = lo.OmitBy(s.WorkerStates, func(key string, state *WorkerState) bool { + return !observedWorkloads[state.Workload] + }) +} + +func (s *Autoscaler) LoadHistoryMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading historical metrics") + + workersMetrics := s.MetricsProvider.GetHistoryMetrics() + for _, metrics := range workersMetrics { + workloadState, exists := s.WorkloadStates[metrics.Workload] + if !exists { + workloadState = NewWorkloadState(metrics.Workload) + s.WorkloadStates[metrics.Workload] = workloadState + } + workerState, exists := s.WorkerStates[metrics.Worker] + if !exists { + workerState = NewWorkerState(metrics.Worker, metrics.Workload) + s.WorkerStates[metrics.Worker] = workerState + } + + s.addSamples(workloadState, workerState, metrics) + } +} + +func (s *Autoscaler) LoadRealTimeMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading realtime metrics") + + workersMetrics := s.MetricsProvider.GetWorkersMetrics() + for _, metrics := range workersMetrics { + workloadState, workloadExists := s.WorkloadStates[metrics.Workload] + if !workloadExists { + continue + } + workerState, workerExists := s.WorkerStates[metrics.Worker] + if !workerExists { + continue + } + + s.addSamples(workloadState, workerState, metrics) + } +} + +func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("processing workloads") + + for _, workloadState := range s.WorkloadStates { + // TODO: continue if histogram is empty + podList := &corev1.PodList{} + if err := s.List(ctx, podList, + client.InNamespace(workloadState.Namespace), + client.MatchingLabels{constants.WorkloadKey: workloadState.Name}); err != nil { + log.Error(err, "failed to list 
workers") + continue + } + + // TODO: apply config + // asConfig := workloadState.AutoScalingConfig + rr := s.Recommender.GetRecommendedResources(workloadState) + log.Info("Autoscaler processWorkloads", "recommended resources", rr) + + for _, worker := range podList.Items { + if !worker.DeletionTimestamp.IsZero() { + continue + } + + annotations := worker.GetAnnotations() + tflopsRequest, err := resource.ParseQuantity(annotations[constants.TFLOPSRequestAnnotation]) + if err != nil { + log.Error(err, "failed to parse vram request") + continue + } + + newAnnotations := map[string]string{} + if tflopsRequest.Cmp(QuantityFromAmount(rr.LowerBoundTflops)) < 0 || + tflopsRequest.Cmp(QuantityFromAmount(rr.UpperBoundTflops)) > 0 { + targetTflopsRequest := QuantityFromAmount(rr.TargetTflops) + newAnnotations[constants.TFLOPSRequestAnnotation] = targetTflopsRequest.String() + tflopsLimit, err := resource.ParseQuantity(annotations[constants.TFLOPSLimitAnnotation]) + if err != nil { + log.Error(err, "failed to parse tflops limit annotation") + continue + } + targetTflopsLimit := getProportionalLimit(&tflopsLimit, &tflopsRequest, &targetTflopsRequest) + if targetTflopsLimit == nil { + log.Error(err, "failed to get limit for tflops") + continue + } + newAnnotations[constants.TFLOPSLimitAnnotation] = targetTflopsLimit.String() + } + + vramRequest, err := resource.ParseQuantity(annotations[constants.VRAMRequestAnnotation]) + if err != nil { + log.Error(err, "failed to parse vram request") + continue + } + if vramRequest.Cmp(QuantityFromAmount(rr.LowerBoundVram)) < 0 || + vramRequest.Cmp(QuantityFromAmount(rr.UpperBoundVram)) > 0 { + targetVramRequest := QuantityFromAmount(rr.TargetVram) + newAnnotations[constants.VRAMRequestAnnotation] = targetVramRequest.String() + vramLimit, err := resource.ParseQuantity(annotations[constants.VRAMLimitAnnotation]) + if err != nil { + log.Error(err, "failed to parse vram limit annotation") + continue + } + targetVramLimit := getProportionalLimit(&vramLimit, &vramRequest, &targetVramRequest) + if targetVramLimit == nil { + log.Error(err, "failed to get limit for vram") + continue + } + newAnnotations[constants.VRAMLimitAnnotation] = targetVramLimit.String() + } + + if len(newAnnotations) > 0 { + for key, value := range newAnnotations { + worker.Annotations[key] = value + } + + if err := s.Update(ctx, &worker); err != nil { + log.Error(err, "failed to update worker") + } + } + } + } +} + +func (*Autoscaler) addSamples(workloadState *WorkloadState, workerState *WorkerState, metrics *WorkerMetrics) { + workerState.AddTflopsSample(workloadState, metrics) + workerState.AddVramSample(workloadState, metrics) + workloadState.UpdateSampleStats(metrics) +} + +func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { + if (originalLimit == nil || originalLimit.IsZero()) || + (recommendedRequest == nil || recommendedRequest.IsZero()) || + (originalRequest == nil || originalRequest.IsZero()) { + return nil + } + + originalValue := big.NewInt(originalLimit.Value()) + scaleBaseValue := big.NewInt(originalRequest.Value()) + scaleResultValue := big.NewInt(recommendedRequest.Value()) + var scaledOriginal big.Int + scaledOriginal.Mul(originalValue, scaleResultValue) + scaledOriginal.Div(&scaledOriginal, scaleBaseValue) + if scaledOriginal.IsInt64() { + result := resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) + return result + } + + return nil +} + +// Start after manager started +func SetupWithManager(mgr 
ctrl.Manager) error { + autoScaler, err := NewAutoscaler(mgr.GetClient()) + if err != nil { + return err + } + return mgr.Add(autoScaler) +} diff --git a/internal/autoscaler/autoscaler_suite_test.go b/internal/autoscaler/autoscaler_suite_test.go new file mode 100644 index 00000000..63657b0c --- /dev/null +++ b/internal/autoscaler/autoscaler_suite_test.go @@ -0,0 +1,575 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "context" + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "testing" + "time" + + ctrl "sigs.k8s.io/controller-runtime" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + corev1 "k8s.io/api/core/v1" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/config" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/controller" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" + "github.com/NexusGPU/tensor-fusion/internal/utils" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + // +kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var cfg *rest.Config +var k8sClient client.Client +var testEnv *envtest.Environment +var ctx context.Context +var cancel context.CancelFunc +var allocator *gpuallocator.GpuAllocator +var metricsRecorder *metrics.MetricsRecorder + +func TestControllers(t *testing.T) { + RegisterFailHandler(Fail) + SetDefaultEventuallyTimeout(6 * time.Second) + SetDefaultEventuallyPollingInterval(200 * time.Millisecond) + SetDefaultConsistentlyDuration(5 * time.Second) + SetDefaultConsistentlyPollingInterval(200 * time.Millisecond) + RunSpecs(t, "Controller Suite") +} + +var _ = BeforeSuite(func() { + // Expect(os.Setenv("USE_EXISTING_CLUSTER", "true")).Should(Succeed()) + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + + // The BinaryAssetsDirectory is only required if you want to run the tests directly + // without call the makefile target test. 
If not informed it will look for the + // default path defined in controller-runtime which is /usr/local/kubebuilder/. + // Note that you must have the required binaries setup under the bin directory to perform + // the tests directly. When we run make test it will be setup and used automatically. + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + } + + var err error + // cfg is defined in this file globally. + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = tfv1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + err = corev1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + Expect(k8sClient.Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: utils.CurrentNamespace(), + }, + })).NotTo(HaveOccurred()) + + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, + }) + Expect(err).ToNot(HaveOccurred()) + + metricsRecorder = &metrics.MetricsRecorder{ + MetricsOutputPath: "./metrics.log", + HourlyUnitPriceMap: map[string]float64{ + "A100": 10, + }, + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + + err = (&controller.TensorFusionClusterReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: metricsRecorder, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUPoolReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUPool"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUNodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + _ = portAllocator.SetupWithManager(ctx, mgr) + + err = (&controller.GPUNodeClassReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.SchedulingConfigTemplateReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.PodReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + PortAllocator: portAllocator, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.NodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("Node"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.WorkloadProfileReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) + _, err = allocator.SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.TensorFusionConnectionReconciler{ + Client: mgr.GetClient(), + Scheme: 
mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionConnection"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.TensorFusionWorkloadReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), + GpuInfos: config.MockGpuInfo(), + PortAllocator: portAllocator, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).ToNot(HaveOccurred(), "failed to run manager") + }() + +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + allocator.Stop() + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) + // Expect(os.Unsetenv("USE_EXISTING_CLUSTER")).To(Succeed()) +}) + +type TensorFusionEnv struct { + clusterKey client.ObjectKey + poolCount int + poolNodeMap map[int]map[int]int +} + +func (c *TensorFusionEnv) GetCluster() *tfv1.TensorFusionCluster { + GinkgoHelper() + tfc := &tfv1.TensorFusionCluster{} + Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) + return tfc +} + +func (c *TensorFusionEnv) UpdateCluster(tfc *tfv1.TensorFusionCluster) { + GinkgoHelper() + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest := &tfv1.TensorFusionCluster{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tfc), latest); err != nil { + return err + } + latest.Spec = tfc.Spec + return k8sClient.Update(ctx, latest) + }) + Expect(err).Should(Succeed()) +} + +func (c *TensorFusionEnv) Cleanup() { + GinkgoHelper() + for poolIndex, nodeGpuMap := range c.poolNodeMap { + for nodeIndex := range nodeGpuMap { + c.DeleteGPUNode(poolIndex, nodeIndex) + } + } + + tfc := c.GetCluster() + tfcCopy := tfc.DeepCopy() + tfcCopy.Spec.GPUPools = []tfv1.GPUPoolDefinition{} + c.UpdateCluster(tfcCopy) + + for poolIndex := range c.poolNodeMap { + Eventually(func(g Gomega) { + pool := &tfv1.GPUPool{} + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(HaveOccurred()) + }).Should(Succeed()) + delete(c.poolNodeMap, poolIndex) + c.poolCount-- + } + + Expect(k8sClient.Delete(ctx, tfc)).Should(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, c.clusterKey, tfc) + g.Expect(err).Should(HaveOccurred()) + }).Should(Succeed()) +} + +func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { + GinkgoHelper() + poolList := &tfv1.GPUPoolList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, poolList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: c.clusterKey.Name, + }))).Should(Succeed()) + g.Expect(poolList.Items).Should(HaveLen(c.poolCount)) + }).Should(Succeed()) + return poolList +} + +func (c *TensorFusionEnv) GetGPUPool(poolIndex int) *tfv1.GPUPool { + GinkgoHelper() + pool := &tfv1.GPUPool{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(Succeed()) + }).Should(Succeed()) + return pool +} + +func (c *TensorFusionEnv) GetGPUNodeList(poolIndex int) *tfv1.GPUNodeList { + GinkgoHelper() + nodeList := &tfv1.GPUNodeList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, nodeList, client.MatchingLabels(map[string]string{ + fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): 
"true", + }))).Should(Succeed()) + g.Expect(nodeList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) + }).Should(Succeed()) + return nodeList +} + +func (c *TensorFusionEnv) GetGPUNode(poolIndex int, nodeIndex int) *tfv1.GPUNode { + GinkgoHelper() + node := &tfv1.GPUNode{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(Succeed()) + }).Should(Succeed()) + return node +} + +func (c *TensorFusionEnv) DeleteGPUNode(poolIndex int, nodeIndex int) { + GinkgoHelper() + c.DeleteNodeGpuList(poolIndex, nodeIndex) + node := c.GetGPUNode(poolIndex, nodeIndex) + Expect(k8sClient.Delete(ctx, node)).Should(Succeed()) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(HaveOccurred()) + }).Should(Succeed()) + delete(c.poolNodeMap[poolIndex], nodeIndex) +} + +func (c *TensorFusionEnv) GetNodeGpuList(poolIndex int, nodeIndex int) *tfv1.GPUList { + GinkgoHelper() + gpuList := &tfv1.GPUList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, gpuList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex), + }))).Should(Succeed()) + g.Expect(gpuList.Items).Should(HaveLen(c.poolNodeMap[poolIndex][nodeIndex])) + }).Should(Succeed()) + return gpuList +} + +func (c *TensorFusionEnv) DeleteNodeGpuList(poolIndex int, nodeIndex int) { + GinkgoHelper() + Expect(k8sClient.DeleteAllOf(ctx, &tfv1.GPU{}, + client.MatchingLabels{constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex)}, + )).Should(Succeed()) +} + +func (c *TensorFusionEnv) GetPoolGpuList(poolIndex int) *tfv1.GPUList { + GinkgoHelper() + gpuList := &tfv1.GPUList{} + poolGpuCount := 0 + for _, gpuCount := range c.poolNodeMap[poolIndex] { + poolGpuCount += gpuCount + } + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, gpuList, client.MatchingLabels(map[string]string{ + constants.GpuPoolKey: c.getPoolName(poolIndex), + }))).Should(Succeed()) + g.Expect(gpuList.Items).Should(HaveLen(poolGpuCount)) + }).Should(Succeed()) + return gpuList +} + +// https://book.kubebuilder.io/reference/envtest#testing-considerations +// Unless you’re using an existing cluster, keep in mind that no built-in controllers are running in the test context. +// So the checkStatusAndUpdateVirtualCapacity in gpunode_controller.go checking pod status always pending and the gpunode status can't change to running +// When using an existing cluster, the test speed go a lot faster, may change later? 
+func (c *TensorFusionEnv) UpdateHypervisorStatus() { + GinkgoHelper() + if os.Getenv("USE_EXISTING_CLUSTER") != "true" { + for poolIndex := range c.poolNodeMap { + podList := &corev1.PodList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(utils.CurrentNamespace()), + client.MatchingLabels(map[string]string{ + fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", + }), + )).Should(Succeed()) + g.Expect(podList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) + }).Should(Succeed()) + for _, pod := range podList.Items { + pod.Status.Phase = corev1.PodRunning + pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{Type: corev1.PodReady, Status: corev1.ConditionTrue}) + Expect(k8sClient.Status().Update(ctx, &pod)).Should(Succeed()) + } + } + } +} + +func (c *TensorFusionEnv) getPoolName(poolIndex int) string { + return fmt.Sprintf("%s-pool-%d", c.clusterKey.Name, poolIndex) +} + +func (c *TensorFusionEnv) getNodeName(poolIndex int, nodeIndex int) string { + return fmt.Sprintf("%s-pool-%d-node-%d", c.clusterKey.Name, poolIndex, nodeIndex) +} + +func (c *TensorFusionEnv) getGPUName(poolIndex int, nodeIndex int, gpuIndex int) string { + return fmt.Sprintf("%s-pool-%d-node-%d-gpu-%d", c.clusterKey.Name, poolIndex, nodeIndex, gpuIndex) +} + +type TensorFusionEnvBuilder struct { + *TensorFusionEnv +} + +func NewTensorFusionEnvBuilder() *TensorFusionEnvBuilder { + return &TensorFusionEnvBuilder{ + &TensorFusionEnv{ + poolCount: 0, + clusterKey: client.ObjectKey{}, + poolNodeMap: map[int]map[int]int{}, + }, + } +} + +func (b *TensorFusionEnvBuilder) AddPoolWithNodeCount(nodeCount int) *TensorFusionEnvBuilder { + nodeGpuMap := make(map[int]int, nodeCount) + for i := range nodeCount { + nodeGpuMap[i] = 0 + } + b.poolNodeMap[b.poolCount] = nodeGpuMap + b.poolCount++ + return b +} + +func (b *TensorFusionEnvBuilder) SetGpuCountPerNode(gpuCount int) *TensorFusionEnvBuilder { + poolIndex := b.poolCount - 1 + for nodeIndex := range b.poolNodeMap[poolIndex] { + b.poolNodeMap[poolIndex][nodeIndex] = gpuCount + } + return b +} + +func (b *TensorFusionEnvBuilder) SetGpuCountForNode(nodeIndex int, gpuCount int) *TensorFusionEnvBuilder { + poolIndex := b.poolCount - 1 + b.poolNodeMap[poolIndex][nodeIndex] = gpuCount + return b +} + +var testEnvId int = 0 + +func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { + GinkgoHelper() + b.clusterKey = client.ObjectKey{ + Name: fmt.Sprintf("cluster-%d", testEnvId), + Namespace: "default", + } + testEnvId++ + + // generate cluster + tfc := &tfv1.TensorFusionCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: b.clusterKey.Name, + Namespace: b.clusterKey.Namespace, + }, + Spec: tfv1.TensorFusionClusterSpec{ + GPUPools: []tfv1.GPUPoolDefinition{ + { + Name: fmt.Sprintf("pool-%d", b.poolCount), + SpecTemplate: *config.MockGPUPoolSpec, + }, + }, + }, + } + + // construct pools + gpuPools := make([]tfv1.GPUPoolDefinition, b.poolCount) + for i := range b.poolCount { + poolSpec := config.MockGPUPoolSpec.DeepCopy() + poolSpec.NodeManagerConfig.NodeSelector.NodeSelectorTerms[0].MatchExpressions[0].Key = + fmt.Sprintf("%s-label-%d", tfc.Name, i) + gpuPools[i] = tfv1.GPUPoolDefinition{ + Name: fmt.Sprintf("pool-%d", i), + SpecTemplate: *poolSpec, + } + } + + tfc.Spec.GPUPools = gpuPools + Expect(k8sClient.Create(ctx, tfc)).To(Succeed()) + + // wait for pools are created + Eventually(func(g Gomega) { + gpuPoolList := &tfv1.GPUPoolList{} + g.Expect(k8sClient.List(ctx, 
gpuPoolList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: tfc.Name, + }))).Should(Succeed()) + g.Expect(gpuPoolList.Items).Should(HaveLen(b.poolCount)) + }).Should(Succeed()) + + // generate nodes + selectors := strings.Split(constants.InitialGPUNodeSelector, "=") + for poolIndex := range b.poolCount { + nodeCount := len(b.poolNodeMap[poolIndex]) + for nodeIndex := range nodeCount { + coreNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: b.getNodeName(poolIndex, nodeIndex), + Labels: map[string]string{ + selectors[0]: selectors[1], + fmt.Sprintf("%s-label-%d", tfc.Name, poolIndex): "true", + }, + }, + } + Expect(k8sClient.Create(ctx, coreNode)).To(Succeed()) + + // generate gpus for gpunode + gpuNode := b.GetGPUNode(poolIndex, nodeIndex) + if gpuCount := b.poolNodeMap[poolIndex][nodeIndex]; gpuCount > 0 { + for gpuIndex := range gpuCount { + key := client.ObjectKey{ + Name: b.getGPUName(poolIndex, nodeIndex, gpuIndex), + } + gpu := &tfv1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Labels: map[string]string{ + constants.LabelKeyOwner: gpuNode.Name, + }, + }, + } + Expect(controllerutil.SetControllerReference(gpuNode, gpu, scheme.Scheme)).To(Succeed()) + Expect(k8sClient.Create(ctx, gpu)).To(Succeed()) + patch := client.MergeFrom(gpu.DeepCopy()) + gpu.Status = tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + UUID: key.Name, + GPUModel: "mock", + NodeSelector: map[string]string{ + "kubernetes.io/hostname": b.getNodeName(poolIndex, nodeIndex), + }, + Capacity: &tfv1.Resource{ + Tflops: resource.MustParse("2000"), + Vram: resource.MustParse("2000Gi"), + }, + Available: &tfv1.Resource{ + Tflops: resource.MustParse("2000"), + Vram: resource.MustParse("2000Gi"), + }, + Message: "mock message", + } + Expect(k8sClient.Status().Patch(ctx, gpu, patch)).To(Succeed()) + } + } + } + + b.GetPoolGpuList(poolIndex) + } + + b.UpdateHypervisorStatus() + + return b.TensorFusionEnv +} diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go new file mode 100644 index 00000000..9316bc85 --- /dev/null +++ b/internal/autoscaler/autoscaler_test.go @@ -0,0 +1,397 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "fmt" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/aws/smithy-go/ptr" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("Autoscaler", func() { + Context("when creating an autoscaler", func() { + It("should return an error if there is no client", func() { + as, err := NewAutoscaler(nil) + Expect(as).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("must specify client")) + }) + }) + + Context("when loading history metrics", func() { + It("should create the state of workloads and workers based on historical metrics", func() { + scaler, _ := NewAutoscaler(k8sClient) + scaler.MetricsProvider = &FakeMetricsProvider{} + scaler.LoadHistoryMetrics(ctx) + metrics := scaler.MetricsProvider.GetHistoryMetrics() + for _, m := range metrics { + Expect(scaler.WorkloadStates).To(HaveKey(m.Workload)) + Expect(scaler.WorkerStates).To(HaveKey(m.Worker)) + } + }) + }) + + Context("when loading workloads", func() { + It("should keep the state of workloads and workers with auto-scaling enabled", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(3). + Build() + defer tfEnv.Cleanup() + + scaler, _ := NewAutoscaler(k8sClient) + scaler.LoadWorkloads(ctx) + Expect(scaler.WorkloadStates).To(HaveLen(0)) + Expect(scaler.WorkerStates).To(HaveLen(0)) + + // create two workloads + pool := tfEnv.GetGPUPool(0) + // with two replias + workload0 := createWorkload(pool, 0, 2) + workload0Workers := getWorkers(workload0) + // with one replia + workload1 := createWorkload(pool, 1, 1) + workload1Workers := getWorkers(workload1) + + scaler.LoadWorkloads(ctx) + Expect(scaler.WorkloadStates).To(HaveLen(2)) + Expect(scaler.WorkloadStates).To(HaveKey(workload0.Name)) + Expect(scaler.WorkloadStates).To(HaveKey(workload1.Name)) + Expect(scaler.WorkerStates).To(HaveLen(3)) + Expect(scaler.WorkerStates).To(HaveKey(workload0Workers[0].Name)) + Expect(scaler.WorkerStates).To(HaveKey(workload0Workers[1].Name)) + Expect(scaler.WorkerStates).To(HaveKey(workload1Workers[0].Name)) + + updateWorkloadReplicas(workload0, 1) + scaler.LoadWorkloads(ctx) + Expect(scaler.WorkerStates).To(HaveLen(2)) + + deleteWorkload(workload0) + deleteWorkload(workload1) + scaler.LoadWorkloads(ctx) + Expect(scaler.WorkloadStates).NotTo(HaveKey(workload0.Name)) + Expect(scaler.WorkerStates).NotTo(HaveKey(workload0Workers[0].Name)) + Expect(scaler.WorkerStates).NotTo(HaveKey(workload0Workers[1].Name)) + Expect(scaler.WorkloadStates).NotTo(HaveKey(workload1.Name)) + Expect(scaler.WorkerStates).NotTo(HaveKey(workload1Workers[0].Name)) + }) + }) + + Context("when loading real time metrics", func() { + It("should update the state of workloads and workers", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + pool := tfEnv.GetGPUPool(0) + workload := createWorkload(pool, 0, 1) + workers := getWorkers(workload) + defer deleteWorkload(workload) + + worker := workers[0].Name + + scaler, _ := NewAutoscaler(k8sClient) + scaler.LoadWorkloads(ctx) + ws := scaler.WorkloadStates[workload.Name] + metrics := &WorkerMetrics{ + Workload: workload.Name, + Worker: worker, + TflopsUsage: ResourceAmount(12.0), + VramUsage: 9000, + Timestamp: time.Now(), + } + + scaler.MetricsProvider = &FakeMetricsProvider{[]*WorkerMetrics{metrics}} + scaler.LoadRealTimeMetrics(ctx) + + Expect(scaler.WorkerStates[worker].TflopsPeak).To(Equal(metrics.TflopsUsage)) + Expect(scaler.WorkerStates[worker].LastTflopsSampleTime).To(Equal(metrics.Timestamp)) + Expect(ws.TflopsHistogram.IsEmpty()).To(BeFalse()) + Expect(scaler.WorkerStates[worker].VramPeak).To(Equal(metrics.VramUsage)) + Expect(scaler.WorkerStates[worker].LastVramSampleTime).To(Equal(metrics.Timestamp)) + Expect(ws.VramHistogram.IsEmpty()).To(BeFalse()) + }) + }) + + Context("when processing workloads", func() { + It("should update worker annotations if resource out of bounds", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient) + scaler.LoadWorkloads(ctx) + + recommender := &FakeRecommender{ + RecommendedResources: RecommendedResources{ + TargetTflops: 110, + LowerBoundTflops: 100, + UpperBoundTflops: 120, + TargetVram: 110 * 1000 * 1000 * 1000, + LowerBoundVram: 100 * 1000 * 1000 * 1000, + UpperBoundVram: 120 * 1000 * 1000 * 1000, + }, + } + + scaler.Recommender = recommender + rr := recommender.GetRecommendedResources(nil) + + scaler.ProcessWorkloads(ctx) + + Eventually(func(g Gomega) { + workers := getWorkers(workload) + annotations := workers[0].GetAnnotations() + + tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) + g.Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) + + tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) + g.Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) + + vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) + g.Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) + + vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation]) + g.Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) + + }).Should(Succeed()) + }) + + It("should not udpate worker annotations if resources in bounds", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient) + scaler.LoadWorkloads(ctx) + + recommender := &FakeRecommender{ + RecommendedResources: RecommendedResources{ + TargetTflops: 110, + LowerBoundTflops: 10, + UpperBoundTflops: 120, + TargetVram: 110 * 1000 * 1000 * 1000, + LowerBoundVram: 5 * 1000 * 1000 * 1000, + UpperBoundVram: 120 * 1000 * 1000 * 1000, + }, + } + + scaler.Recommender = recommender + + scaler.ProcessWorkloads(ctx) + + Consistently(func(g Gomega) { + workers := getWorkers(workload) + annotations := workers[0].GetAnnotations() + + tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) + g.Expect(tflopsRequest.Equal(workload.Spec.Resources.Requests.Tflops)).To(BeTrue()) + + tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) + g.Expect(tflopsLimit.Equal(workload.Spec.Resources.Limits.Tflops)).To(BeTrue()) + + vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) + g.Expect(vramRequest.Equal(workload.Spec.Resources.Requests.Vram)).To(BeTrue()) + + vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation]) + g.Expect(vramLimit.Equal(workload.Spec.Resources.Limits.Vram)).To(BeTrue()) + + }).Should(Succeed()) + }) + }) +}) + +func createWorkload(pool *tfv1.GPUPool, id int, replicas int) *tfv1.TensorFusionWorkload { + GinkgoHelper() + tflopsRequests := resource.MustParse("10") + vramRequests := resource.MustParse("8Gi") + tflopsLimits := resource.MustParse("20") + vramLimits := resource.MustParse("16Gi") + + poolName := pool.Name + key := client.ObjectKey{Namespace: "default", Name: getWorkloadName(id)} + workload := &tfv1.TensorFusionWorkload{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: map[string]string{ + constants.GpuPoolKey: poolName, + }, + }, + Spec: tfv1.WorkloadProfileSpec{ + Replicas: ptr.Int32(int32(replicas)), + PoolName: poolName, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequests, + Vram: vramRequests, + }, + Limits: tfv1.Resource{ + Tflops: tflopsLimits, + Vram: vramLimits, + }, + }, + Qos: constants.QoSLevelMedium, + AutoScalingConfig: tfv1.AutoScalingConfig{ + AutoSetLimits: tfv1.AutoSetLimits{ + Enable: true, + TargetResource: "", + }, + AutoSetRequests: tfv1.AutoSetRequests{ + Enable: true, + TargetResource: "", + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, workload)).To(Succeed()) + + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + }).Should(Succeed()) + + checkWorkerPodCount(workload) + + return workload +} + +func checkWorkerPodCount(workload *tfv1.TensorFusionWorkload) { + GinkgoHelper() + podList := &corev1.PodList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) + g.Expect(podList.Items).Should(HaveLen(int(*workload.Spec.Replicas))) + }).Should(Succeed()) +} + +func getWorkloadName(index int) string { + return fmt.Sprintf("workload-%d", index) +} + +func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { + GinkgoHelper() + podList := &corev1.PodList{} + Expect(k8sClient.List(ctx, podList, + client.InNamespace("default"), + client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) + return lo.Map(podList.Items, func(pod corev1.Pod, _ int) 
*corev1.Pod { + return &pod + }) +} + +type FakeMetricsProvider struct { + Metrics []*WorkerMetrics +} + +func (f *FakeMetricsProvider) GetWorkersMetrics() []*WorkerMetrics { + return f.Metrics +} + +func (f *FakeMetricsProvider) GetHistoryMetrics() []*WorkerMetrics { + metrics := []*WorkerMetrics{} + startTime := time.Now().Add(-7 * 24 * time.Hour) + for day := 0; day < 7; day++ { + for hour := 0; hour < 24; hour++ { + idx := day*24 + hour + metrics = append(metrics, &WorkerMetrics{ + Workload: "workload-0", + Worker: fmt.Sprintf("worker-%d", idx), + TflopsUsage: ResourceAmount(10.0 + float64(idx%10)), + VramUsage: 1 * 1024 * 1024 * 1024, + Timestamp: startTime.Add(time.Duration(day*24+hour) * time.Hour), + }) + } + } + + return metrics +} + +type FakeRecommender struct { + RecommendedResources +} + +func (f *FakeRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { + return &f.RecommendedResources +} + +func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { + GinkgoHelper() + key := client.ObjectKeyFromObject(workload) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + workload.Spec.Replicas = ptr.Int32(int32(replicas)) + g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) + }).Should(Succeed()) + + checkWorkerPodCount(workload) +} + +func deleteWorkload(workload *tfv1.TensorFusionWorkload) { + cleanupWorkload(client.ObjectKeyFromObject(workload)) +} + +func cleanupWorkload(key client.ObjectKey) { + GinkgoHelper() + workload := &tfv1.TensorFusionWorkload{} + + if err := k8sClient.Get(ctx, key, workload); err != nil { + if errors.IsNotFound(err) { + return + } + Expect(err).To(HaveOccurred()) + } + + // Set replicas to 0 + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + workload.Spec.Replicas = ptr.Int32(0) + g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) + }).Should(Succeed()) + + Eventually(func(g Gomega) { + podList := &corev1.PodList{} + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(key.Namespace), + client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) + g.Expect(podList.Items).Should(BeEmpty()) + }).Should(Succeed()) + + Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, key, workload) + g.Expect(err).Should(HaveOccurred()) + }).Should(Succeed()) +} diff --git a/internal/autoscaler/estimator.go b/internal/autoscaler/estimator.go new file mode 100644 index 00000000..bbd44d7a --- /dev/null +++ b/internal/autoscaler/estimator.go @@ -0,0 +1,166 @@ +package autoscaler + +import ( + "math" + "time" + + "k8s.io/apimachinery/pkg/api/resource" +) + +const ( + // MaxResourceAmount is the maximum allowed value of resource amount. + MaxResourceAmount = ResourceAmount(1e14) +) + +type ResourceAmount int64 + +// ResourceAmountMax returns the larger of two resource amounts. 
+func ResourceAmountMax(amount1, amount2 ResourceAmount) ResourceAmount { + if amount1 > amount2 { + return amount1 + } + return amount2 +} + +func QuantityFromAmount(amount ResourceAmount) resource.Quantity { + return *resource.NewScaledQuantity(int64(amount), 0) +} + +func resourceAmountFromFloat(amount float64) ResourceAmount { + if amount < 0 { + return ResourceAmount(0) + } else if amount > float64(MaxResourceAmount) { + return MaxResourceAmount + } else { + return ResourceAmount(amount) + } +} + +type VramEstimator interface { + GetVramEstimation(s *WorkloadState) ResourceAmount +} + +type percentileVramEstimator struct { + percentile float64 +} + +// NewPercentileVramEstimator returns a new percentileVramEstimator that uses the provided percentile. +func NewPercentileVramEstimator(percentile float64) VramEstimator { + return &percentileVramEstimator{percentile} +} + +func (e *percentileVramEstimator) GetVramEstimation(s *WorkloadState) ResourceAmount { + return resourceAmountFromFloat(float64(s.VramHistogram.Percentile(e.percentile))) +} + +type vramMarginEstimator struct { + marginFraction float64 + baseEstimator VramEstimator +} + +// WithVramMargin returns a VramEstimator that adds a margin to the base estimator. +func WithVramMargin(marginFraction float64, baseEstimator VramEstimator) VramEstimator { + return &vramMarginEstimator{marginFraction: marginFraction, baseEstimator: baseEstimator} +} + +// GetVramEstimation returns the vram estimation for the given WorkloadState. +func (e *vramMarginEstimator) GetVramEstimation(s *WorkloadState) ResourceAmount { + base := e.baseEstimator.GetVramEstimation(s) + margin := resourceAmountFromFloat(float64(base) * e.marginFraction) + return base + margin +} + +type vramConfidenceMultiplier struct { + multiplier float64 + exponent float64 + baseEstimator VramEstimator + confidenceInterval time.Duration +} + +// WithVramConfidenceMultiplier returns a VramEstimator that scales the base estimation by a confidence-dependent multiplier. +func WithVramConfidenceMultiplier(multiplier, exponent float64, baseEstimator VramEstimator, confidenceInterval time.Duration) VramEstimator { + return &vramConfidenceMultiplier{ + multiplier: multiplier, + exponent: exponent, + baseEstimator: baseEstimator, + confidenceInterval: confidenceInterval, + } +} + +func (e *vramConfidenceMultiplier) GetVramEstimation(s *WorkloadState) ResourceAmount { + confidence := getConfidence(s, e.confidenceInterval) + base := e.baseEstimator.GetVramEstimation(s) + return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent)) +} + +type TflopsEstimator interface { + GetTflopsEstimation(s *WorkloadState) ResourceAmount +} + +type percentileTflopsEstimator struct { + percentile float64 +} + +// NewPercentileTflopsEstimator returns a new percentileTflopsEstimator that uses the provided percentile. +func NewPercentileTflopsEstimator(percentile float64) TflopsEstimator { + return &percentileTflopsEstimator{percentile} +} + +func (e *percentileTflopsEstimator) GetTflopsEstimation(s *WorkloadState) ResourceAmount { + return resourceAmountFromFloat(float64(s.TflopsHistogram.Percentile(e.percentile))) +} + +type tflopsMarginEstimator struct { + marginFraction float64 + baseEstimator TflopsEstimator +} + +// WithTflopsMargin returns a TflopsEstimator that adds a margin to the base estimator.
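+// The margin is additive: the returned estimation is base + base*marginFraction.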
+func WithTflopsMargin(marginFraction float64, baseEstimator TflopsEstimator) TflopsEstimator { + return &tflopsMarginEstimator{marginFraction: marginFraction, baseEstimator: baseEstimator} +} + +// GetTflopsEstimation returns the tflops estimation for the given AggregateContainerState. +func (e *tflopsMarginEstimator) GetTflopsEstimation(s *WorkloadState) ResourceAmount { + base := e.baseEstimator.GetTflopsEstimation(s) + margin := resourceAmountFromFloat(float64(base) * e.marginFraction) + return base + margin +} + +type tflopsConfidenceMultiplier struct { + multiplier float64 + exponent float64 + baseEstimator TflopsEstimator + confidenceInterval time.Duration +} + +// WithTflopsConfidenceMultiplier returns a TflopsEstimator that scales the +func WithTflopsConfidenceMultiplier(multiplier, exponent float64, baseEstimator TflopsEstimator, confidenceInterval time.Duration) TflopsEstimator { + return &tflopsConfidenceMultiplier{ + multiplier: multiplier, + exponent: exponent, + baseEstimator: baseEstimator, + confidenceInterval: confidenceInterval, + } +} + +func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(s *WorkloadState) ResourceAmount { + confidence := getConfidence(s, e.confidenceInterval) + base := e.baseEstimator.GetTflopsEstimation(s) + return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent)) +} + +// Returns a non-negative real number that heuristically measures how much +// confidence the history aggregated in the WorkloadState provides. +// For a workload producing a steady stream of samples over N days at the rate +// of 1 sample per minute, this metric is equal to N. +// This implementation is a very simple heuristic which looks at the total count +// of samples and the time between the first and the last sample. +func getConfidence(s *WorkloadState, confidenceInterval time.Duration) float64 { + // Distance between the first and the last observed sample time, measured in days. + lifespanInDays := float64(s.LastSampleStart.Sub(s.FirstSampleStart)) / float64(confidenceInterval) + // Total count of samples normalized such that it equals the number of days for + // frequency of 1 sample/minute. 
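+ // Illustrative example (numbers are hypothetical): with the default 24h confidence interval, samples spanning 3 days give lifespanInDays = 3, and 2880 one-minute samples give samplesAmount = 2880/1440 = 2, so the confidence is min(3, 2) = 2.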
+ samplesAmount := float64(s.TotalSamplesCount) / confidenceInterval.Minutes() + return math.Min(lifespanInDays, samplesAmount) +} diff --git a/internal/autoscaler/metricsprovider.go b/internal/autoscaler/metricsprovider.go new file mode 100644 index 00000000..bccb1d1d --- /dev/null +++ b/internal/autoscaler/metricsprovider.go @@ -0,0 +1,30 @@ +package autoscaler + +import "time" + +type WorkerMetrics struct { + Workload string + Worker string + TflopsUsage ResourceAmount + VramUsage ResourceAmount + Timestamp time.Time +} + +type MetricsProvider interface { + GetWorkersMetrics() []*WorkerMetrics + GetHistoryMetrics() []*WorkerMetrics +} + +func NewMetricsProvider() MetricsProvider { + return &GreptimeDBProvider{} +} + +type GreptimeDBProvider struct{} + +func (*GreptimeDBProvider) GetWorkersMetrics() []*WorkerMetrics { + panic("unimplemented") +} + +func (*GreptimeDBProvider) GetHistoryMetrics() []*WorkerMetrics { + panic("unimplemented") +} diff --git a/internal/autoscaler/recommender.go b/internal/autoscaler/recommender.go new file mode 100644 index 00000000..2cbddb74 --- /dev/null +++ b/internal/autoscaler/recommender.go @@ -0,0 +1,84 @@ +package autoscaler + +import ( + "flag" + "time" +) + +var ( + safetyMarginFraction = flag.Float64("recommendation-margin-fraction", 0.15, `Fraction of usage added as the safety margin to the recommended request`) + targetVramPercentile = flag.Float64("target-vram-percentile", 0.9, "Vram usage percentile that will be used as a base for vram target recommendation. Doesn't affect vram lower bound nor vram upper bound.") + lowerBoundVramPercentile = flag.Float64("recommendation-lower-bound-vram-percentile", 0.5, `Vram usage percentile that will be used for the lower bound on vram recommendation.`) + upperBoundVramPercentile = flag.Float64("recommendation-upper-bound-vram-percentile", 0.95, `Vram usage percentile that will be used for the upper bound on vram recommendation.`) + targetTflopsPercentile = flag.Float64("target-tflops-percentile", 0.9, "Tflops usage percentile that will be used as a base for tflops target recommendation. Doesn't affect tflops lower bound nor tflops upper bound.") + lowerBoundTflopsPercentile = flag.Float64("recommendation-lower-bound-tflops-percentile", 0.5, `Tflops usage percentile that will be used for the lower bound on tflops recommendation.`) + upperBoundTflopsPercentile = flag.Float64("recommendation-upper-bound-tflops-percentile", 0.95, `Tflops usage percentile that will be used for the upper bound on tflops recommendation.`) + confidenceInterval = flag.Duration("confidence-interval", time.Hour*24, "The time interval used for computing the confidence multiplier for the lower and upper bound. 
Default: 24h") +) + +type Recommender interface { + GetRecommendedResources(*WorkloadState) *RecommendedResources +} + +type RecommendedResources struct { + TargetTflops ResourceAmount + LowerBoundTflops ResourceAmount + UpperBoundTflops ResourceAmount + + TargetVram ResourceAmount + LowerBoundVram ResourceAmount + UpperBoundVram ResourceAmount +} + +func NewRecommender() Recommender { + targetTflops := NewPercentileTflopsEstimator(*targetTflopsPercentile) + lowerBoundTflops := NewPercentileTflopsEstimator(*lowerBoundTflopsPercentile) + upperBoundTflops := NewPercentileTflopsEstimator(*upperBoundTflopsPercentile) + + targetTflops = WithTflopsMargin(*safetyMarginFraction, targetTflops) + lowerBoundTflops = WithTflopsMargin(*safetyMarginFraction, lowerBoundTflops) + upperBoundTflops = WithTflopsMargin(*safetyMarginFraction, upperBoundTflops) + + upperBoundTflops = WithTflopsConfidenceMultiplier(1.0, 1.0, upperBoundTflops, *confidenceInterval) + lowerBoundTflops = WithTflopsConfidenceMultiplier(0.001, -2.0, lowerBoundTflops, *confidenceInterval) + + targetVram := NewPercentileVramEstimator(*targetVramPercentile) + lowerBoundVram := NewPercentileVramEstimator(*lowerBoundVramPercentile) + upperBoundVram := NewPercentileVramEstimator(*upperBoundVramPercentile) + + targetVram = WithVramMargin(*safetyMarginFraction, targetVram) + lowerBoundVram = WithVramMargin(*safetyMarginFraction, lowerBoundVram) + upperBoundVram = WithVramMargin(*safetyMarginFraction, upperBoundVram) + + upperBoundVram = WithVramConfidenceMultiplier(1.0, 1.0, upperBoundVram, *confidenceInterval) + lowerBoundVram = WithVramConfidenceMultiplier(0.001, -2.0, lowerBoundVram, *confidenceInterval) + + return &recommender{ + targetTflops: targetTflops, + lowerBoundTflops: lowerBoundTflops, + upperBoundTflops: upperBoundTflops, + targetVram: targetVram, + lowerBoundVram: lowerBoundVram, + upperBoundVram: upperBoundVram, + } +} + +type recommender struct { + targetTflops TflopsEstimator + lowerBoundTflops TflopsEstimator + upperBoundTflops TflopsEstimator + targetVram VramEstimator + lowerBoundVram VramEstimator + upperBoundVram VramEstimator +} + +func (r *recommender) GetRecommendedResources(s *WorkloadState) *RecommendedResources { + return &RecommendedResources{ + TargetTflops: r.targetTflops.GetTflopsEstimation(s), + LowerBoundTflops: r.lowerBoundTflops.GetTflopsEstimation(s), + UpperBoundTflops: r.upperBoundTflops.GetTflopsEstimation(s), + TargetVram: r.targetVram.GetVramEstimation(s), + LowerBoundVram: r.lowerBoundVram.GetVramEstimation(s), + UpperBoundVram: r.upperBoundVram.GetVramEstimation(s), + } +} diff --git a/internal/autoscaler/recommender_test.go b/internal/autoscaler/recommender_test.go new file mode 100644 index 00000000..9dcd52d5 --- /dev/null +++ b/internal/autoscaler/recommender_test.go @@ -0,0 +1,19 @@ +package autoscaler + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Recommender", func() { + Context("when get recommeded resource", func() { + It("should generate recommended resource based on histogram", func() { + recommender := NewRecommender() + Expect(recommender.GetRecommendedResources(nil)).To(BeNil()) + }) + It("should gererate recommended resource with safety margin", func() { + }) + It("should gererate recommended resource with confidence multiplier", func() { + }) + }) +}) diff --git a/internal/autoscaler/workerstate.go b/internal/autoscaler/workerstate.go new file mode 100644 index 00000000..991a4ae2 --- /dev/null +++ b/internal/autoscaler/workerstate.go @@ -0,0 +1,92 @@ +package autoscaler + +import ( + "time" +) + +type WorkerState struct { + Name string + Workload string + TflopsPeak ResourceAmount + LastTflopsSampleTime time.Time + TflopsWindowEnd time.Time + + VramPeak ResourceAmount + LastVramSampleTime time.Time + VramWindowEnd time.Time +} + +func NewWorkerState(name string, workload string) *WorkerState { + return &WorkerState{ + Name: name, + Workload: workload, + LastTflopsSampleTime: time.Time{}, + TflopsWindowEnd: time.Time{}, + LastVramSampleTime: time.Time{}, + VramWindowEnd: time.Time{}, + } +} + +func (w *WorkerState) AddTflopsSample(workload *WorkloadState, metrics *WorkerMetrics) bool { + ts := metrics.Timestamp + if ts.Before(w.LastTflopsSampleTime) { + return false + } + w.LastTflopsSampleTime = ts + if w.TflopsWindowEnd.IsZero() { + w.TflopsWindowEnd = ts + } + + addNewPeak := false + if ts.Before(w.TflopsWindowEnd) { + if w.TflopsPeak != 0 && metrics.TflopsUsage > w.TflopsPeak { + workload.TflopsHistogram.SubtractSample(float64(w.TflopsPeak), 1.0, w.TflopsWindowEnd) + addNewPeak = true + } + } else { + aggregationInteval := DefaultAggregationInterval + shift := ts.Sub(w.TflopsWindowEnd).Truncate(aggregationInteval) + aggregationInteval + w.TflopsWindowEnd = w.TflopsWindowEnd.Add(shift) + w.TflopsPeak = 0 + addNewPeak = true + } + + if addNewPeak { + workload.TflopsHistogram.AddSample(float64(metrics.TflopsUsage), 1.0, metrics.Timestamp) + w.TflopsPeak = metrics.TflopsUsage + } + + return true +} + +func (w *WorkerState) AddVramSample(workload *WorkloadState, metrics *WorkerMetrics) bool { + ts := metrics.Timestamp + if ts.Before(w.LastVramSampleTime) { + return false + } + w.LastVramSampleTime = ts + if w.VramWindowEnd.IsZero() { + w.VramWindowEnd = ts + } + + addNewPeak := false + if ts.Before(w.VramWindowEnd) { + if w.VramPeak != 0 && metrics.VramUsage > w.VramPeak { + workload.VramHistogram.SubtractSample(float64(w.VramPeak), 1.0, w.VramWindowEnd) + addNewPeak = true + } + } else { + aggregationInteval := DefaultAggregationInterval + shift := ts.Sub(w.VramWindowEnd).Truncate(aggregationInteval) + aggregationInteval + w.VramWindowEnd = w.VramWindowEnd.Add(shift) + w.VramPeak = 0 + addNewPeak = true + } + + if addNewPeak { + workload.VramHistogram.AddSample(float64(metrics.VramUsage), 1.0, metrics.Timestamp) + w.VramPeak = metrics.VramUsage + } + + return true +} diff --git a/internal/autoscaler/workloadstate.go b/internal/autoscaler/workloadstate.go new file mode 100644 index 00000000..3649f7c7 --- /dev/null +++ b/internal/autoscaler/workloadstate.go @@ -0,0 +1,64 @@ +package autoscaler + +import ( + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + vpa "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/util" +) + +const ( + // minSampleWeight is the minimal weight of any sample (prior to including decaying factor) + minSampleWeight = 0.1 + // epsilon 
is the minimal weight kept in histograms, it should be small enough that old samples + // (just inside AggregationWindowLength) added with minSampleWeight are still kept + epsilon = 0.001 * minSampleWeight + // DefaultAggregationInterval is the default value for AggregationInterval. + DefaultAggregationInterval = time.Hour * 24 + // DefaultHistogramBucketSizeGrowth is the default value for HistogramBucketSizeGrowth. + DefaultHistogramBucketSizeGrowth = 0.05 // Make each bucket 5% larger than the previous one. + // DefaultVramHistogramDecayHalfLife is the default value for HistogramDecayHalfLife. + DefaultHistogramDecayHalfLife = time.Hour * 24 +) + +type WorkloadState struct { + Namespace string + Name string + Resources tfv1.Resources + AutoScalingConfig tfv1.AutoScalingConfig + + TflopsHistogram vpa.Histogram + VramHistogram vpa.Histogram + + FirstSampleStart time.Time + LastSampleStart time.Time + TotalSamplesCount int + CreationTime time.Time +} + +func NewWorkloadState(name string) *WorkloadState { + return &WorkloadState{ + Name: name, + TflopsHistogram: vpa.NewDecayingHistogram(histogramOptions(10000.0, 0.1), DefaultHistogramDecayHalfLife), + VramHistogram: vpa.NewDecayingHistogram(histogramOptions(1e12, 1e7), DefaultHistogramDecayHalfLife), + CreationTime: time.Now(), + } +} + +func histogramOptions(maxValue, firstBucketSize float64) vpa.HistogramOptions { + options, err := vpa.NewExponentialHistogramOptions(maxValue, firstBucketSize, 1.+DefaultHistogramBucketSizeGrowth, epsilon) + if err != nil { + panic("Invalid histogram options") // Should not happen. + } + return options +} + +func (w *WorkloadState) UpdateSampleStats(metrics *WorkerMetrics) { + if metrics.Timestamp.After(w.LastSampleStart) { + w.LastSampleStart = metrics.Timestamp + } + if w.FirstSampleStart.IsZero() || metrics.Timestamp.Before(w.FirstSampleStart) { + w.FirstSampleStart = metrics.Timestamp + } + w.TotalSamplesCount++ +} From 1de2fe9c132b70d81883ebeb6df0ce303c54dbbe Mon Sep 17 00:00:00 2001 From: knave Date: Sat, 21 Jun 2025 01:58:54 +0800 Subject: [PATCH 02/27] test: refactor test when processing workloads --- internal/autoscaler/autoscaler_test.go | 36 +++++--------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 9316bc85..d68f0119 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -137,7 +137,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when processing workloads", func() { - It("should update worker annotations if resource out of bounds", func() { + It("should update only those resources exceeding the recommended resource boundaries", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -181,32 +181,8 @@ var _ = Describe("Autoscaler", func() { g.Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) }).Should(Succeed()) - }) - - It("should not udpate worker annotations if resources in bounds", func() { - tfEnv := NewTensorFusionEnvBuilder(). - AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
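// ---- editor's aside: illustrative sketch, not part of any patch in this series ----
// NewWorkloadState above builds decaying histograms with exponential buckets: the first
// bucket spans firstBucketSize and each later bucket is 5% larger (1 + DefaultHistogramBucketSizeGrowth).
// The sketch below only mirrors that documented growth rule, not the vpa package's exact
// bucketing, and the helper names are the editor's own; it estimates how many buckets the
// VRAM histogram (maxValue 1e12, firstBucketSize 1e7) ends up with.
package main

import "fmt"

// bucketBoundaries returns cumulative bucket start points for buckets that grow by ratio.
func bucketBoundaries(maxValue, firstBucketSize, ratio float64) []float64 {
	bounds := []float64{0}
	size := firstBucketSize
	for bounds[len(bounds)-1] < maxValue {
		bounds = append(bounds, bounds[len(bounds)-1]+size)
		size *= ratio
	}
	return bounds
}

func main() {
	vram := bucketBoundaries(1e12, 1e7, 1.05)
	// Geometric growth keeps the bucket count small: roughly 175 buckets cover 0 to 1 TB.
	fmt.Printf("VRAM histogram bucket count: %d\n", len(vram)-1)
}
// ---- end editor's aside ----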
- Build() - defer tfEnv.Cleanup() - workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) - defer deleteWorkload(workload) - - scaler, _ := NewAutoscaler(k8sClient) - scaler.LoadWorkloads(ctx) - - recommender := &FakeRecommender{ - RecommendedResources: RecommendedResources{ - TargetTflops: 110, - LowerBoundTflops: 10, - UpperBoundTflops: 120, - TargetVram: 110 * 1000 * 1000 * 1000, - LowerBoundVram: 5 * 1000 * 1000 * 1000, - UpperBoundVram: 120 * 1000 * 1000 * 1000, - }, - } - - scaler.Recommender = recommender + // Upon reprocessing the workload, it should skip resource updates since they are already within the recommended resource boundaries scaler.ProcessWorkloads(ctx) Consistently(func(g Gomega) { @@ -214,16 +190,16 @@ var _ = Describe("Autoscaler", func() { annotations := workers[0].GetAnnotations() tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) - g.Expect(tflopsRequest.Equal(workload.Spec.Resources.Requests.Tflops)).To(BeTrue()) + g.Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) - g.Expect(tflopsLimit.Equal(workload.Spec.Resources.Limits.Tflops)).To(BeTrue()) + g.Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) - g.Expect(vramRequest.Equal(workload.Spec.Resources.Requests.Vram)).To(BeTrue()) + g.Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation]) - g.Expect(vramLimit.Equal(workload.Spec.Resources.Limits.Vram)).To(BeTrue()) + g.Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) }).Should(Succeed()) }) From dbea96d77856fc0d9a5fe124415a36d9a37241fb Mon Sep 17 00:00:00 2001 From: knave Date: Tue, 24 Jun 2025 01:50:44 +0800 Subject: [PATCH 03/27] feat: implement LeaderElectonRunnable explicitly and add compile-time check --- internal/autoscaler/autoscaler.go | 43 +++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 923f6a5a..974659b1 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -14,6 +14,12 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +var ( + _ manager.Runnable = (*Autoscaler)(nil) + _ manager.LeaderElectionRunnable = (*Autoscaler)(nil) ) type Autoscaler struct { @@ -32,7 +38,7 @@ func NewAutoscaler(c client.Client) (*Autoscaler, error) { return &Autoscaler{ Client: c, Recommender: NewRecommender(), - MetricsProvider: NewMetricsProvider(), + MetricsProvider: NewMetricsProvider(nil), WorkloadStates: map[string]*WorkloadState{}, WorkerStates: map[string]*WorkerState{}, }, nil @@ -57,6 +63,10 @@ func (s *Autoscaler) Start(ctx context.Context) error { } } +func (s *Autoscaler) NeedLeaderElection() bool { + return true +} + func (s *Autoscaler) Run(ctx context.Context) { log := log.FromContext(ctx) @@ -80,8 +90,8 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { autoScalingConfig := workload.Spec.AutoScalingConfig // Currently only supports enabling both AutoSetLimits and AutoSetRequests simultaneously if !workload.DeletionTimestamp.IsZero() || - !(autoScalingConfig.AutoSetLimits.Enable && - autoScalingConfig.AutoSetRequests.Enable) { + 
!autoScalingConfig.AutoSetLimits.Enable || + !autoScalingConfig.AutoSetRequests.Enable { continue } @@ -138,15 +148,15 @@ func (s *Autoscaler) LoadHistoryMetrics(ctx context.Context) { workersMetrics := s.MetricsProvider.GetHistoryMetrics() for _, metrics := range workersMetrics { - workloadState, exists := s.WorkloadStates[metrics.Workload] + workloadState, exists := s.WorkloadStates[metrics.WorkloadName] if !exists { - workloadState = NewWorkloadState(metrics.Workload) - s.WorkloadStates[metrics.Workload] = workloadState + workloadState = NewWorkloadState(metrics.WorkloadName) + s.WorkloadStates[metrics.WorkloadName] = workloadState } - workerState, exists := s.WorkerStates[metrics.Worker] + workerState, exists := s.WorkerStates[metrics.WorkerName] if !exists { - workerState = NewWorkerState(metrics.Worker, metrics.Workload) - s.WorkerStates[metrics.Worker] = workerState + workerState = NewWorkerState(metrics.WorkerName, metrics.WorkloadName) + s.WorkerStates[metrics.WorkerName] = workerState } s.addSamples(workloadState, workerState, metrics) @@ -159,11 +169,11 @@ func (s *Autoscaler) LoadRealTimeMetrics(ctx context.Context) { workersMetrics := s.MetricsProvider.GetWorkersMetrics() for _, metrics := range workersMetrics { - workloadState, workloadExists := s.WorkloadStates[metrics.Workload] + workloadState, workloadExists := s.WorkloadStates[metrics.WorkloadName] if !workloadExists { continue } - workerState, workerExists := s.WorkerStates[metrics.Worker] + workerState, workerExists := s.WorkerStates[metrics.WorkerName] if !workerExists { continue } @@ -186,8 +196,12 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { continue } + if len(podList.Items) <= 0 { + continue + } + // TODO: apply config - // asConfig := workloadState.AutoScalingConfig + // asConfig := workloadState.AutoScalingConfig rr := s.Recommender.GetRecommendedResources(workloadState) log.Info("Autoscaler processWorkloads", "recommended resources", rr) @@ -197,13 +211,13 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { } annotations := worker.GetAnnotations() + newAnnotations := map[string]string{} + tflopsRequest, err := resource.ParseQuantity(annotations[constants.TFLOPSRequestAnnotation]) if err != nil { log.Error(err, "failed to parse vram request") continue } - - newAnnotations := map[string]string{} if tflopsRequest.Cmp(QuantityFromAmount(rr.LowerBoundTflops)) < 0 || tflopsRequest.Cmp(QuantityFromAmount(rr.UpperBoundTflops)) > 0 { targetTflopsRequest := QuantityFromAmount(rr.TargetTflops) @@ -248,6 +262,7 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { worker.Annotations[key] = value } + // TODO: replace using the patch method if err := s.Update(ctx, &worker); err != nil { log.Error(err, "failed to update worker") } From d5f3053ba550afaef79943387bb8b92a8a5c57be Mon Sep 17 00:00:00 2001 From: knave Date: Tue, 24 Jun 2025 02:05:40 +0800 Subject: [PATCH 04/27] feat: aggregate samples into histogram per tflops --- internal/autoscaler/autoscaler_test.go | 36 ++++++++++++++++---------- internal/autoscaler/workerstate.go | 32 +++-------------------- 2 files changed, 25 insertions(+), 43 deletions(-) diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index d68f0119..919969d1 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -33,6 +33,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +// tflops add all samples, like cpu in vpa +// Consider gpu allocator, check if enough tflops or 
vram to allocate +// cron scheduler stragegy +// Add AutoSetResources to schedulingconfigtemplate and make it more configurable +// refactor main, setup database may not put in leader election runnable group +// scale to zero when query data if no usage, need carl to support +// add recommendation to workload +// resolve conversation on github, thanks for reviews + var _ = Describe("Autoscaler", func() { Context("when creating an autoscaler", func() { It("should return an error if there is no client", func() { @@ -49,8 +58,8 @@ var _ = Describe("Autoscaler", func() { scaler.LoadHistoryMetrics(ctx) metrics := scaler.MetricsProvider.GetHistoryMetrics() for _, m := range metrics { - Expect(scaler.WorkloadStates).To(HaveKey(m.Workload)) - Expect(scaler.WorkerStates).To(HaveKey(m.Worker)) + Expect(scaler.WorkloadStates).To(HaveKey(m.WorkloadName)) + Expect(scaler.WorkerStates).To(HaveKey(m.WorkerName)) } }) }) @@ -101,7 +110,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when loading real time metrics", func() { - It("should update the state of workloads and workers", func() { + FIt("should update the state of workloads and workers", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -117,17 +126,16 @@ var _ = Describe("Autoscaler", func() { scaler.LoadWorkloads(ctx) ws := scaler.WorkloadStates[workload.Name] metrics := &WorkerMetrics{ - Workload: workload.Name, - Worker: worker, - TflopsUsage: ResourceAmount(12.0), - VramUsage: 9000, - Timestamp: time.Now(), + WorkloadName: workload.Name, + WorkerName: worker, + TflopsUsage: ResourceAmount(12.0), + VramUsage: 9000, + Timestamp: time.Now(), } scaler.MetricsProvider = &FakeMetricsProvider{[]*WorkerMetrics{metrics}} scaler.LoadRealTimeMetrics(ctx) - Expect(scaler.WorkerStates[worker].TflopsPeak).To(Equal(metrics.TflopsUsage)) Expect(scaler.WorkerStates[worker].LastTflopsSampleTime).To(Equal(metrics.Timestamp)) Expect(ws.TflopsHistogram.IsEmpty()).To(BeFalse()) Expect(scaler.WorkerStates[worker].VramPeak).To(Equal(metrics.VramUsage)) @@ -302,11 +310,11 @@ func (f *FakeMetricsProvider) GetHistoryMetrics() []*WorkerMetrics { for hour := 0; hour < 24; hour++ { idx := day*24 + hour metrics = append(metrics, &WorkerMetrics{ - Workload: "workload-0", - Worker: fmt.Sprintf("worker-%d", idx), - TflopsUsage: ResourceAmount(10.0 + float64(idx%10)), - VramUsage: 1 * 1024 * 1024 * 1024, - Timestamp: startTime.Add(time.Duration(day*24+hour) * time.Hour), + WorkloadName: "workload-0", + WorkerName: fmt.Sprintf("worker-%d", idx), + TflopsUsage: ResourceAmount(10.0 + float64(idx%10)), + VramUsage: 1 * 1024 * 1024 * 1024, + Timestamp: startTime.Add(time.Duration(day*24+hour) * time.Hour), }) } } diff --git a/internal/autoscaler/workerstate.go b/internal/autoscaler/workerstate.go index 991a4ae2..356fdfc1 100644 --- a/internal/autoscaler/workerstate.go +++ b/internal/autoscaler/workerstate.go @@ -7,9 +7,7 @@ import ( type WorkerState struct { Name string Workload string - TflopsPeak ResourceAmount LastTflopsSampleTime time.Time - TflopsWindowEnd time.Time VramPeak ResourceAmount LastVramSampleTime time.Time @@ -21,41 +19,17 @@ func NewWorkerState(name string, workload string) *WorkerState { Name: name, Workload: workload, LastTflopsSampleTime: time.Time{}, - TflopsWindowEnd: time.Time{}, LastVramSampleTime: time.Time{}, VramWindowEnd: time.Time{}, } } func (w *WorkerState) AddTflopsSample(workload *WorkloadState, metrics *WorkerMetrics) bool { - ts := metrics.Timestamp - if 
ts.Before(w.LastTflopsSampleTime) { + if metrics.Timestamp.Before(w.LastTflopsSampleTime) { return false } - w.LastTflopsSampleTime = ts - if w.TflopsWindowEnd.IsZero() { - w.TflopsWindowEnd = ts - } - - addNewPeak := false - if ts.Before(w.TflopsWindowEnd) { - if w.TflopsPeak != 0 && metrics.TflopsUsage > w.TflopsPeak { - workload.TflopsHistogram.SubtractSample(float64(w.TflopsPeak), 1.0, w.TflopsWindowEnd) - addNewPeak = true - } - } else { - aggregationInteval := DefaultAggregationInterval - shift := ts.Sub(w.TflopsWindowEnd).Truncate(aggregationInteval) + aggregationInteval - w.TflopsWindowEnd = w.TflopsWindowEnd.Add(shift) - w.TflopsPeak = 0 - addNewPeak = true - } - - if addNewPeak { - workload.TflopsHistogram.AddSample(float64(metrics.TflopsUsage), 1.0, metrics.Timestamp) - w.TflopsPeak = metrics.TflopsUsage - } - + workload.TflopsHistogram.AddSample(float64(metrics.TflopsUsage), minSampleWeight, metrics.Timestamp) + w.LastTflopsSampleTime = metrics.Timestamp return true } From db9c6ce17c407538beda5faa018f5ee08ce06435 Mon Sep 17 00:00:00 2001 From: knave Date: Thu, 26 Jun 2025 07:51:35 +0800 Subject: [PATCH 05/27] feat: implement metrics provider --- internal/autoscaler/metricsprovider.go | 98 ++++++++++++++--- internal/autoscaler/metricsprovider_test.go | 112 ++++++++++++++++++++ 2 files changed, 195 insertions(+), 15 deletions(-) create mode 100644 internal/autoscaler/metricsprovider_test.go diff --git a/internal/autoscaler/metricsprovider.go b/internal/autoscaler/metricsprovider.go index bccb1d1d..7b4e06d6 100644 --- a/internal/autoscaler/metricsprovider.go +++ b/internal/autoscaler/metricsprovider.go @@ -1,30 +1,98 @@ package autoscaler -import "time" +import ( + "time" + + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "gorm.io/gorm" +) type WorkerMetrics struct { - Workload string - Worker string - TflopsUsage ResourceAmount - VramUsage ResourceAmount - Timestamp time.Time + WorkloadName string + WorkerName string + TflopsUsage ResourceAmount + VramUsage ResourceAmount + Timestamp time.Time } type MetricsProvider interface { - GetWorkersMetrics() []*WorkerMetrics - GetHistoryMetrics() []*WorkerMetrics + GetWorkersMetrics() ([]*WorkerMetrics, error) + GetHistoryMetrics() ([]*WorkerMetrics, error) +} + +func NewMetricsProvider(db *gorm.DB) MetricsProvider { + return &greptimeDBProvider{db: db} } -func NewMetricsProvider() MetricsProvider { - return &GreptimeDBProvider{} +type greptimeDBProvider struct { + db *gorm.DB + lastQueryTime time.Time + historyDuration time.Duration } -type GreptimeDBProvider struct{} +func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { + data := []*metrics.HypervisorWorkerUsageMetrics{} + now := time.Now() + // actual meaning: max(avg[10s])[1m] + err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts"). + Where("ts > ? and ts <= ?", g.lastQueryTime.Nanosecond(), now.Nanosecond()). + Group("workload, worker"). + Order("ts asc"). 
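+		// (editor's note) Unlike GetHistoryMetrics below, this chain reaches .Error without a
+		// Find(&data) call, so this statement does not scan any rows into data.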
+ Error + + if err != nil { + return nil, err + } + + g.lastQueryTime = now + + workersMetrics := make([]*WorkerMetrics, 0, len(data)) + for _, row := range data { + workersMetrics = append(workersMetrics, &WorkerMetrics{ + WorkloadName: row.WorkloadName, + WorkerName: row.WorkerName, + TflopsUsage: resourceAmountFromFloat(row.ComputeTflops), + VramUsage: ResourceAmount(row.VRAMBytes), + Timestamp: row.Timestamp, + }) + } -func (*GreptimeDBProvider) GetWorkersMetrics() []*WorkerMetrics { - panic("unimplemented") + return workersMetrics, nil } -func (*GreptimeDBProvider) GetHistoryMetrics() []*WorkerMetrics { - panic("unimplemented") +type hypervisorWorkerUsageMetrics struct { + metrics.HypervisorWorkerUsageMetrics + TimeWindow time.Time `gorm:"column:time_window;index:,class:TIME"` +} + +func (g *greptimeDBProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { + data := []*hypervisorWorkerUsageMetrics{} + now := time.Now() + // TODO: replace using iteration for handling large datasets efficiently + // TODO: supply history resolution to config time window + err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window"). + Where("ts > ? and ts <= ?", now.Add(-g.historyDuration), now.Nanosecond()). + Group("workload, worker, time_window"). + Order("time_window asc"). + Find(&data). + Error + + if err != nil { + return nil, err + } + + g.lastQueryTime = now + + workersMetrics := make([]*WorkerMetrics, 0, len(data)) + for _, row := range data { + workersMetrics = append(workersMetrics, &WorkerMetrics{ + WorkloadName: row.WorkloadName, + WorkerName: row.WorkerName, + TflopsUsage: resourceAmountFromFloat(row.ComputeTflops), + VramUsage: ResourceAmount(row.VRAMBytes), + Timestamp: row.TimeWindow, + }) + } + + return workersMetrics, nil } diff --git a/internal/autoscaler/metricsprovider_test.go b/internal/autoscaler/metricsprovider_test.go new file mode 100644 index 00000000..9d7dae04 --- /dev/null +++ b/internal/autoscaler/metricsprovider_test.go @@ -0,0 +1,112 @@ +package autoscaler + +import ( + "regexp" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "gorm.io/driver/mysql" + "gorm.io/gorm" +) + +var _ = Describe("MetricsProvider", func() { + Context("when getting real time workers metrics", func() { + It("should return slices", func() { + db, mock := NewMockDB() + now := time.Now() + fakeMetrics := []metrics.HypervisorWorkerUsageMetrics{ + { + WorkloadName: "workload-0", + WorkerName: "worker-0", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + { + WorkloadName: "workload-1", + WorkerName: "worker-1", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + } + + rows := sqlmock.NewRows([]string{"workload", "worker", "compute_tflops", "memory_bytes", "ts"}) + for _, row := range fakeMetrics { + rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.Timestamp) + } + + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts FROM `tf_worker_usage` WHERE ts > ? GROUP BY workload, worker")). 
+ WillReturnRows(rows) + provider := &greptimeDBProvider{db: db} + got, _ := provider.GetWorkersMetrics() + Expect(got).To(HaveLen(2)) + Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) + Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) + Expect(got[0].VramUsage).To(Equal(ResourceAmount(fakeMetrics[0].VRAMBytes))) + Expect(got[0].TflopsUsage).To(Equal(resourceAmountFromFloat(fakeMetrics[0].ComputeTflops))) + Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].Timestamp)) + }) + }) + + Context("when getting history workers metrics", func() { + FIt("should return slices", func() { + db, mock := NewMockDB() + now := time.Now() + fakeMetrics := []hypervisorWorkerUsageMetrics{ + { + HypervisorWorkerUsageMetrics: metrics.HypervisorWorkerUsageMetrics{ + WorkloadName: "workload-0", + WorkerName: "worker-0", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + TimeWindow: now, + }, + { + HypervisorWorkerUsageMetrics: metrics.HypervisorWorkerUsageMetrics{ + WorkloadName: "workload-1", + WorkerName: "worker-1", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + TimeWindow: now, + }, + } + + rows := sqlmock.NewRows([]string{"workload", "worker", "compute_tflops", "memory_bytes", "time_window"}) + for _, row := range fakeMetrics { + rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.TimeWindow) + } + + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window FROM `tf_worker_usage` WHERE ts > ? and ts <= ? GROUP BY workload, worker, time_window ORDER BY time_window asc")). + WillReturnRows(rows) + provider := &greptimeDBProvider{db: db} + got, _ := provider.GetHistoryMetrics() + Expect(got).To(HaveLen(2)) + Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) + Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) + Expect(got[0].VramUsage).To(Equal(ResourceAmount(fakeMetrics[0].VRAMBytes))) + Expect(got[0].TflopsUsage).To(Equal(resourceAmountFromFloat(fakeMetrics[0].ComputeTflops))) + Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].TimeWindow)) + }) + }) +}) + +func NewMockDB() (*gorm.DB, sqlmock.Sqlmock) { + GinkgoHelper() + db, mock, err := sqlmock.New() + Expect(err).ToNot(HaveOccurred()) + gormDB, err := gorm.Open(mysql.New(mysql.Config{ + Conn: db, + SkipInitializeWithVersion: true, + }), &gorm.Config{}) + Expect(err).ToNot(HaveOccurred()) + + return gormDB, mock +} From 4142a36b90079f7f7932f32251eaee186454454c Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 27 Jun 2025 08:29:51 +0800 Subject: [PATCH 06/27] feat: add allocator logic --- internal/autoscaler/autoscaler.go | 146 ++++++++++++--------- internal/autoscaler/autoscaler_test.go | 170 ++++++++++++++----------- 2 files changed, 180 insertions(+), 136 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 974659b1..96e6cb5a 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -3,11 +3,13 @@ package autoscaler import ( "context" "errors" + "fmt" "math/big" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" @@ -24,19 +26,29 @@ var ( type Autoscaler struct { client.Client + Allocator 
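+	// (editor's note) The embedded Allocator above lets a reallocation be attempted before
+	// worker annotations are rewritten; on error the worker is left untouched, and tests can
+	// substitute a fake implementation.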
Recommender MetricsProvider WorkloadStates map[string]*WorkloadState WorkerStates map[string]*WorkerState } -func NewAutoscaler(c client.Client) (*Autoscaler, error) { +type Allocator interface { + Realloc(ctx context.Context, req gpuallocator.AllocRequest) error +} + +func NewAutoscaler(c client.Client, allocator Allocator) (*Autoscaler, error) { if c == nil { return nil, errors.New("must specify client") } + if allocator == nil { + return nil, errors.New("must specify reallocator") + } + return &Autoscaler{ Client: c, + Allocator: allocator, Recommender: NewRecommender(), MetricsProvider: NewMetricsProvider(nil), WorkloadStates: map[string]*WorkloadState{}, @@ -89,6 +101,7 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { for _, workload := range workloadList.Items { autoScalingConfig := workload.Spec.AutoScalingConfig // Currently only supports enabling both AutoSetLimits and AutoSetRequests simultaneously + // TODO: when recommending, need to observe all workload if !workload.DeletionTimestamp.IsZero() || !autoScalingConfig.AutoSetLimits.Enable || !autoScalingConfig.AutoSetRequests.Enable { @@ -132,12 +145,12 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { } // remove unused workloadStates - s.WorkloadStates = lo.OmitBy(s.WorkloadStates, func(key string, value *WorkloadState) bool { + s.WorkloadStates = lo.OmitBy(s.WorkloadStates, func(key string, _ *WorkloadState) bool { return !observedWorkloads[key] }) // remove unused workerStates - s.WorkerStates = lo.OmitBy(s.WorkerStates, func(key string, state *WorkerState) bool { + s.WorkerStates = lo.OmitBy(s.WorkerStates, func(_ string, state *WorkerState) bool { return !observedWorkloads[state.Workload] }) } @@ -146,7 +159,11 @@ func (s *Autoscaler) LoadHistoryMetrics(ctx context.Context) { log := log.FromContext(ctx) log.Info("loading historical metrics") - workersMetrics := s.MetricsProvider.GetHistoryMetrics() + workersMetrics, err := s.MetricsProvider.GetHistoryMetrics() + if err != nil { + log.Error(err, "failed to get history metrics") + return + } for _, metrics := range workersMetrics { workloadState, exists := s.WorkloadStates[metrics.WorkloadName] if !exists { @@ -167,7 +184,12 @@ func (s *Autoscaler) LoadRealTimeMetrics(ctx context.Context) { log := log.FromContext(ctx) log.Info("loading realtime metrics") - workersMetrics := s.MetricsProvider.GetWorkersMetrics() + workersMetrics, err := s.MetricsProvider.GetWorkersMetrics() + if err != nil { + log.Error(err, "failed to get workers metrics") + return + } + for _, metrics := range workersMetrics { workloadState, workloadExists := s.WorkloadStates[metrics.WorkloadName] if !workloadExists { @@ -210,65 +232,71 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { continue } - annotations := worker.GetAnnotations() - newAnnotations := map[string]string{} - - tflopsRequest, err := resource.ParseQuantity(annotations[constants.TFLOPSRequestAnnotation]) - if err != nil { - log.Error(err, "failed to parse vram request") - continue - } - if tflopsRequest.Cmp(QuantityFromAmount(rr.LowerBoundTflops)) < 0 || - tflopsRequest.Cmp(QuantityFromAmount(rr.UpperBoundTflops)) > 0 { - targetTflopsRequest := QuantityFromAmount(rr.TargetTflops) - newAnnotations[constants.TFLOPSRequestAnnotation] = targetTflopsRequest.String() - tflopsLimit, err := resource.ParseQuantity(annotations[constants.TFLOPSLimitAnnotation]) - if err != nil { - log.Error(err, "failed to parse tflops limit annotation") - continue - } - targetTflopsLimit := getProportionalLimit(&tflopsLimit, 
&tflopsRequest, &targetTflopsRequest) - if targetTflopsLimit == nil { - log.Error(err, "failed to get limit for tflops") - continue - } - newAnnotations[constants.TFLOPSLimitAnnotation] = targetTflopsLimit.String() + if err := s.updateWorker(ctx, &worker, rr); err != nil { + log.Error(err, "failed to update worker") } + } + } +} - vramRequest, err := resource.ParseQuantity(annotations[constants.VRAMRequestAnnotation]) - if err != nil { - log.Error(err, "failed to parse vram request") - continue - } - if vramRequest.Cmp(QuantityFromAmount(rr.LowerBoundVram)) < 0 || - vramRequest.Cmp(QuantityFromAmount(rr.UpperBoundVram)) > 0 { - targetVramRequest := QuantityFromAmount(rr.TargetVram) - newAnnotations[constants.VRAMRequestAnnotation] = targetVramRequest.String() - vramLimit, err := resource.ParseQuantity(annotations[constants.VRAMLimitAnnotation]) - if err != nil { - log.Error(err, "failed to parse vram limit annotation") - continue - } - targetVramLimit := getProportionalLimit(&vramLimit, &vramRequest, &targetVramRequest) - if targetVramLimit == nil { - log.Error(err, "failed to get limit for vram") - continue - } - newAnnotations[constants.VRAMLimitAnnotation] = targetVramLimit.String() - } +func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *RecommendedResources) error { + annotations := worker.GetAnnotations() + newAnnotations := map[string]string{} - if len(newAnnotations) > 0 { - for key, value := range newAnnotations { - worker.Annotations[key] = value - } + tflopsRequest, err := resource.ParseQuantity(annotations[constants.TFLOPSRequestAnnotation]) + if err != nil { + return fmt.Errorf("failed to parse tflops request: %v", err) + } + if tflopsRequest.Cmp(QuantityFromAmount(rr.LowerBoundTflops)) < 0 || + tflopsRequest.Cmp(QuantityFromAmount(rr.UpperBoundTflops)) > 0 { + targetTflopsRequest := QuantityFromAmount(rr.TargetTflops) + newAnnotations[constants.TFLOPSRequestAnnotation] = targetTflopsRequest.String() + tflopsLimit, err := resource.ParseQuantity(annotations[constants.TFLOPSLimitAnnotation]) + if err != nil { + return fmt.Errorf("failed to parse tflops limit annotation: %v", err) + } + targetTflopsLimit := getProportionalLimit(&tflopsLimit, &tflopsRequest, &targetTflopsRequest) + if targetTflopsLimit == nil { + return fmt.Errorf("failed to get limit for tflops") + } + newAnnotations[constants.TFLOPSLimitAnnotation] = targetTflopsLimit.String() + } - // TODO: replace using the patch method - if err := s.Update(ctx, &worker); err != nil { - log.Error(err, "failed to update worker") - } - } + vramRequest, err := resource.ParseQuantity(annotations[constants.VRAMRequestAnnotation]) + if err != nil { + return fmt.Errorf("failed to parse vram request: %v", err) + } + if vramRequest.Cmp(QuantityFromAmount(rr.LowerBoundVram)) < 0 || + vramRequest.Cmp(QuantityFromAmount(rr.UpperBoundVram)) > 0 { + targetVramRequest := QuantityFromAmount(rr.TargetVram) + newAnnotations[constants.VRAMRequestAnnotation] = targetVramRequest.String() + vramLimit, err := resource.ParseQuantity(annotations[constants.VRAMLimitAnnotation]) + if err != nil { + return fmt.Errorf("failed to parse vram limit annotation: %v", err) + } + targetVramLimit := getProportionalLimit(&vramLimit, &vramRequest, &targetVramRequest) + if targetVramLimit == nil { + return fmt.Errorf("failed to get limit for vram") } + newAnnotations[constants.VRAMLimitAnnotation] = targetVramLimit.String() } + + if len(newAnnotations) > 0 { + if err := s.Allocator.Realloc(ctx, gpuallocator.AllocRequest{}); err != nil { + 
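// (editor's note) Reallocation failed, so return before any annotation is written and leave
// the worker at its current resources.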
return fmt.Errorf("failed to reallocate resources: %v", err) + } + + for key, value := range newAnnotations { + worker.Annotations[key] = value + } + + // TODO: replace using the patch method + if err := s.Update(ctx, worker); err != nil { + return fmt.Errorf("failed to update worker: %v", err) + } + } + + return nil } func (*Autoscaler) addSamples(workloadState *WorkloadState, workerState *WorkerState, metrics *WorkerMetrics) { @@ -300,7 +328,7 @@ func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *re // Start after manager started func SetupWithManager(mgr ctrl.Manager) error { - autoScaler, err := NewAutoscaler(mgr.GetClient()) + autoScaler, err := NewAutoscaler(mgr.GetClient(), nil) if err != nil { return err } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 919969d1..fb97271b 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -17,11 +17,13 @@ limitations under the License. package autoscaler import ( + "context" "fmt" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/aws/smithy-go/ptr" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -33,30 +35,40 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// tflops add all samples, like cpu in vpa // Consider gpu allocator, check if enough tflops or vram to allocate -// cron scheduler stragegy +// Add tests for recommender +// Add logs for key events +// [x] tflops add all samples, like cpu in vpa +// Implement gc for cleaning outdated data // Add AutoSetResources to schedulingconfigtemplate and make it more configurable -// refactor main, setup database may not put in leader election runnable group -// scale to zero when query data if no usage, need carl to support -// add recommendation to workload +// Scale to zero if no usage, need carl to support +// Add recommendation to workload +// Write some documents +// cron scheduler stragegy, parallisam ? 
+// Refactor main, setup database may not put in leader election runnable group // resolve conversation on github, thanks for reviews var _ = Describe("Autoscaler", func() { Context("when creating an autoscaler", func() { It("should return an error if there is no client", func() { - as, err := NewAutoscaler(nil) + as, err := NewAutoscaler(nil, nil) Expect(as).To(BeNil()) Expect(err.Error()).To(ContainSubstring("must specify client")) }) + + It("should return an error if there is no reallocator", func() { + as, err := NewAutoscaler(k8sClient, nil) + Expect(as).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("must specify reallocator")) + }) }) Context("when loading history metrics", func() { It("should create the state of workloads and workers based on historical metrics", func() { - scaler, _ := NewAutoscaler(k8sClient) + scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) scaler.MetricsProvider = &FakeMetricsProvider{} scaler.LoadHistoryMetrics(ctx) - metrics := scaler.MetricsProvider.GetHistoryMetrics() + metrics, _ := scaler.MetricsProvider.GetHistoryMetrics() for _, m := range metrics { Expect(scaler.WorkloadStates).To(HaveKey(m.WorkloadName)) Expect(scaler.WorkerStates).To(HaveKey(m.WorkerName)) @@ -71,7 +83,7 @@ var _ = Describe("Autoscaler", func() { Build() defer tfEnv.Cleanup() - scaler, _ := NewAutoscaler(k8sClient) + scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) scaler.LoadWorkloads(ctx) Expect(scaler.WorkloadStates).To(HaveLen(0)) Expect(scaler.WorkerStates).To(HaveLen(0)) @@ -110,7 +122,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when loading real time metrics", func() { - FIt("should update the state of workloads and workers", func() { + It("should update the state of workloads and workers", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
Build() @@ -122,15 +134,16 @@ var _ = Describe("Autoscaler", func() { worker := workers[0].Name - scaler, _ := NewAutoscaler(k8sClient) + scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) scaler.LoadWorkloads(ctx) ws := scaler.WorkloadStates[workload.Name] + now := time.Now() metrics := &WorkerMetrics{ WorkloadName: workload.Name, WorkerName: worker, TflopsUsage: ResourceAmount(12.0), VramUsage: 9000, - Timestamp: time.Now(), + Timestamp: now, } scaler.MetricsProvider = &FakeMetricsProvider{[]*WorkerMetrics{metrics}} @@ -153,63 +166,38 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient) + scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) scaler.LoadWorkloads(ctx) - recommender := &FakeRecommender{ - RecommendedResources: RecommendedResources{ - TargetTflops: 110, - LowerBoundTflops: 100, - UpperBoundTflops: 120, - TargetVram: 110 * 1000 * 1000 * 1000, - LowerBoundVram: 100 * 1000 * 1000 * 1000, - UpperBoundVram: 120 * 1000 * 1000 * 1000, - }, - } - - scaler.Recommender = recommender - rr := recommender.GetRecommendedResources(nil) + scaler.Recommender = &FakeOutBoundRecommender{} + rr := scaler.Recommender.GetRecommendedResources(nil) scaler.ProcessWorkloads(ctx) - Eventually(func(g Gomega) { - workers := getWorkers(workload) - annotations := workers[0].GetAnnotations() - - tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) - g.Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) - - tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) - g.Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) - - vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) - g.Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) - - vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation]) - g.Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) - + assertWorkerAnnotations(getWorkers(workload)[0], rr) }).Should(Succeed()) // Upon reprocessing the workload, it should skip resource updates since they are already within the recommended resource boundaries scaler.ProcessWorkloads(ctx) - Consistently(func(g Gomega) { - workers := getWorkers(workload) - annotations := workers[0].GetAnnotations() - - tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) - g.Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) - - tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) - g.Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) - - vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) - g.Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) + assertWorkerAnnotations(getWorkers(workload)[0], rr) + }).Should(Succeed()) + }) - vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation]) - g.Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) + It("should return an error if failed to reallocate resources", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) - }).Should(Succeed()) + scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler.LoadWorkloads(ctx) + scaler.Recommender = &FakeOutBoundRecommender{} + rr := scaler.Recommender.GetRecommendedResources(nil) + err := scaler.updateWorker(ctx, getWorkers(workload)[0], rr) + Expect(err.Error()).To(ContainSubstring("failed to reallocate resources")) }) }) }) @@ -295,39 +283,52 @@ func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { }) } +type FakeAllocator struct{} + +func (*FakeAllocator) Realloc(ctx context.Context, req gpuallocator.AllocRequest) error { + return fmt.Errorf("failed to reallocate resources") +} + type FakeMetricsProvider struct { Metrics []*WorkerMetrics } -func (f *FakeMetricsProvider) GetWorkersMetrics() []*WorkerMetrics { - return f.Metrics +func (f *FakeMetricsProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { + return f.Metrics, nil } -func (f *FakeMetricsProvider) GetHistoryMetrics() []*WorkerMetrics { +func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { metrics := []*WorkerMetrics{} - startTime := time.Now().Add(-7 * 24 * time.Hour) - for day := 0; day < 7; day++ { - for hour := 0; hour < 24; hour++ { - idx := day*24 + hour - metrics = append(metrics, &WorkerMetrics{ - WorkloadName: "workload-0", - WorkerName: fmt.Sprintf("worker-%d", idx), - TflopsUsage: ResourceAmount(10.0 + float64(idx%10)), - VramUsage: 1 * 1024 * 1024 * 1024, - Timestamp: startTime.Add(time.Duration(day*24+hour) * time.Hour), - }) + startTime := time.Now().Add(-8 * 24 * time.Hour) + for day := 0; day < 8; day++ { + for hour := 0; hour < 1; hour++ { + for minute := 0; minute < 60; minute++ { + // idx := day*24 + hour + metrics = append(metrics, &WorkerMetrics{ + WorkloadName: "workload-0", + WorkerName: fmt.Sprintf("worker-%d", 1), + TflopsUsage: ResourceAmount(100.0), + VramUsage: 1 * 1000 * 1000 * 1000, + Timestamp: startTime.Add(time.Duration(day*24+hour)*time.Hour + time.Duration(minute)*time.Minute), + }) + } } } - return metrics + return metrics, nil } -type FakeRecommender struct { - RecommendedResources -} +type FakeOutBoundRecommender struct{} -func (f *FakeRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { - return &f.RecommendedResources +func (f *FakeOutBoundRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { + return &RecommendedResources{ + TargetTflops: 110, + LowerBoundTflops: 100, + UpperBoundTflops: 120, + TargetVram: 110 * 1000 * 1000 * 1000, + LowerBoundVram: 100 * 1000 * 1000 * 1000, + UpperBoundVram: 120 * 1000 * 1000 * 1000, + } } func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { @@ -379,3 +380,18 @@ func cleanupWorkload(key client.ObjectKey) { g.Expect(err).Should(HaveOccurred()) }).Should(Succeed()) } + +func assertWorkerAnnotations(worker *corev1.Pod, rr *RecommendedResources) { + annotations := worker.GetAnnotations() + tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) + Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) + + tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) + Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) + + vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) + Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) + + vramLimit := 
resource.MustParse(annotations[constants.VRAMLimitAnnotation]) + Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) +} From 3e8076a52d58c4956375e42ce0cf61f2bd7424a6 Mon Sep 17 00:00:00 2001 From: knave Date: Sat, 28 Jun 2025 04:37:36 +0800 Subject: [PATCH 07/27] refactor: optimize update worker method --- internal/autoscaler/autoscaler.go | 119 ++++++++++++++----------- internal/autoscaler/autoscaler_test.go | 44 +++++---- 2 files changed, 92 insertions(+), 71 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 96e6cb5a..c242ac4d 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -27,7 +27,7 @@ var ( type Autoscaler struct { client.Client Allocator - Recommender + ResourceRecommender MetricsProvider WorkloadStates map[string]*WorkloadState WorkerStates map[string]*WorkerState @@ -47,12 +47,12 @@ func NewAutoscaler(c client.Client, allocator Allocator) (*Autoscaler, error) { } return &Autoscaler{ - Client: c, - Allocator: allocator, - Recommender: NewRecommender(), - MetricsProvider: NewMetricsProvider(nil), - WorkloadStates: map[string]*WorkloadState{}, - WorkerStates: map[string]*WorkerState{}, + Client: c, + Allocator: allocator, + ResourceRecommender: NewResourceRecommender(), + MetricsProvider: NewMetricsProvider(nil), + WorkloadStates: map[string]*WorkloadState{}, + WorkerStates: map[string]*WorkerState{}, }, nil } @@ -100,11 +100,7 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { observedWorkloads := map[string]bool{} for _, workload := range workloadList.Items { autoScalingConfig := workload.Spec.AutoScalingConfig - // Currently only supports enabling both AutoSetLimits and AutoSetRequests simultaneously - // TODO: when recommending, need to observe all workload - if !workload.DeletionTimestamp.IsZero() || - !autoScalingConfig.AutoSetLimits.Enable || - !autoScalingConfig.AutoSetRequests.Enable { + if !workload.DeletionTimestamp.IsZero() { continue } @@ -223,8 +219,10 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { } // TODO: apply config - // asConfig := workloadState.AutoScalingConfig - rr := s.Recommender.GetRecommendedResources(workloadState) + // asConfig := workloadState.AutoScalingConfig + // NewResourceRecommenderFromAutoScalingConfig(ResouceRecomenderConfig{ + // }).GetRecommendedResources(workloadState) + rr := s.ResourceRecommender.GetRecommendedResources(workloadState) log.Info("Autoscaler processWorkloads", "recommended resources", rr) for _, worker := range podList.Items { @@ -243,57 +241,74 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R annotations := worker.GetAnnotations() newAnnotations := map[string]string{} - tflopsRequest, err := resource.ParseQuantity(annotations[constants.TFLOPSRequestAnnotation]) - if err != nil { - return fmt.Errorf("failed to parse tflops request: %v", err) - } - if tflopsRequest.Cmp(QuantityFromAmount(rr.LowerBoundTflops)) < 0 || - tflopsRequest.Cmp(QuantityFromAmount(rr.UpperBoundTflops)) > 0 { - targetTflopsRequest := QuantityFromAmount(rr.TargetTflops) - newAnnotations[constants.TFLOPSRequestAnnotation] = targetTflopsRequest.String() - tflopsLimit, err := resource.ParseQuantity(annotations[constants.TFLOPSLimitAnnotation]) - if err != nil { - return fmt.Errorf("failed to parse tflops limit annotation: %v", err) - } - targetTflopsLimit := getProportionalLimit(&tflopsLimit, &tflopsRequest, &targetTflopsRequest) - if targetTflopsLimit == nil { - return fmt.Errorf("failed to get 
limit for tflops") - } - newAnnotations[constants.TFLOPSLimitAnnotation] = targetTflopsLimit.String() + resourcesInfo := []struct { + requestKey string + limitKey string + lowerBound ResourceAmount + upperBound ResourceAmount + target ResourceAmount + }{ + { + requestKey: constants.TFLOPSRequestAnnotation, + limitKey: constants.TFLOPSLimitAnnotation, + lowerBound: rr.LowerBoundTflops, + upperBound: rr.UpperBoundTflops, + target: rr.TargetTflops, + }, + { + requestKey: constants.VRAMRequestAnnotation, + limitKey: constants.VRAMLimitAnnotation, + lowerBound: rr.LowerBoundVram, + upperBound: rr.UpperBoundVram, + target: rr.TargetVram, + }, } - vramRequest, err := resource.ParseQuantity(annotations[constants.VRAMRequestAnnotation]) - if err != nil { - return fmt.Errorf("failed to parse vram request: %v", err) - } - if vramRequest.Cmp(QuantityFromAmount(rr.LowerBoundVram)) < 0 || - vramRequest.Cmp(QuantityFromAmount(rr.UpperBoundVram)) > 0 { - targetVramRequest := QuantityFromAmount(rr.TargetVram) - newAnnotations[constants.VRAMRequestAnnotation] = targetVramRequest.String() - vramLimit, err := resource.ParseQuantity(annotations[constants.VRAMLimitAnnotation]) - if err != nil { - return fmt.Errorf("failed to parse vram limit annotation: %v", err) + for _, resInfo := range resourcesInfo { + if err := updateResource( + annotations, newAnnotations, + resInfo.requestKey, resInfo.limitKey, + resInfo.lowerBound, resInfo.upperBound, resInfo.target, + ); err != nil { + return err } - targetVramLimit := getProportionalLimit(&vramLimit, &vramRequest, &targetVramRequest) - if targetVramLimit == nil { - return fmt.Errorf("failed to get limit for vram") - } - newAnnotations[constants.VRAMLimitAnnotation] = targetVramLimit.String() } if len(newAnnotations) > 0 { if err := s.Allocator.Realloc(ctx, gpuallocator.AllocRequest{}); err != nil { return fmt.Errorf("failed to reallocate resources: %v", err) } - + // Patch the worker with updated annotations + patch := client.MergeFrom(worker.DeepCopy()) for key, value := range newAnnotations { worker.Annotations[key] = value } + if err := s.Patch(ctx, worker, patch); err != nil { + return fmt.Errorf("failed to patch worker: %v", err) + } + } + + return nil +} - // TODO: replace using the patch method - if err := s.Update(ctx, worker); err != nil { - return fmt.Errorf("failed to update worker: %v", err) +func updateResource(annotations, newAnnotations map[string]string, requestKey, limitKey string, lowerBound, upperBound, target ResourceAmount) error { + currentRequest, err := resource.ParseQuantity(annotations[requestKey]) + if err != nil { + return fmt.Errorf("failed to parse %s: %v", requestKey, err) + } + if currentRequest.Cmp(QuantityFromAmount(lowerBound)) < 0 || + currentRequest.Cmp(QuantityFromAmount(upperBound)) > 0 { + targetRequest := QuantityFromAmount(target) + newAnnotations[requestKey] = targetRequest.String() + currentLimit, err := resource.ParseQuantity(annotations[limitKey]) + if err != nil { + return fmt.Errorf("failed to parse %s: %v", limitKey, err) + } + targetLimit := getProportionalLimit(¤tLimit, ¤tRequest, &targetRequest) + if targetLimit == nil { + return fmt.Errorf("failed to get limit for %s", requestKey) } + newAnnotations[limitKey] = targetLimit.String() } return nil @@ -327,7 +342,7 @@ func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *re } // Start after manager started -func SetupWithManager(mgr ctrl.Manager) error { +func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { 
autoScaler, err := NewAutoscaler(mgr.GetClient(), nil) if err != nil { return err diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index fb97271b..36476501 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -35,18 +35,18 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// Consider gpu allocator, check if enough tflops or vram to allocate -// Add tests for recommender -// Add logs for key events // [x] tflops add all samples, like cpu in vpa -// Implement gc for cleaning outdated data -// Add AutoSetResources to schedulingconfigtemplate and make it more configurable +// [x] Reallocate resources before update annotation +// Add AutoSetResources, make it more configurable +// Implement Realloc method on GpuAllocator +// Add tests for recommender +// Log key events // Scale to zero if no usage, need carl to support -// Add recommendation to workload +// Add recommendation to workload status // Write some documents // cron scheduler stragegy, parallisam ? // Refactor main, setup database may not put in leader election runnable group -// resolve conversation on github, thanks for reviews +// Resolve conversation on github, thanks for reviews var _ = Describe("Autoscaler", func() { Context("when creating an autoscaler", func() { @@ -158,7 +158,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when processing workloads", func() { - It("should update only those resources exceeding the recommended resource boundaries", func() { + FIt("should update only those resources exceeding the recommended resource boundaries", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -169,8 +169,8 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) scaler.LoadWorkloads(ctx) - scaler.Recommender = &FakeOutBoundRecommender{} - rr := scaler.Recommender.GetRecommendedResources(nil) + scaler.ResourceRecommender = &FakeOutBoundRecommender{} + rr := scaler.ResourceRecommender.GetRecommendedResources(nil) scaler.ProcessWorkloads(ctx) Eventually(func(g Gomega) { @@ -192,13 +192,17 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, &FakeFailedAllocator{}) scaler.LoadWorkloads(ctx) - scaler.Recommender = &FakeOutBoundRecommender{} - rr := scaler.Recommender.GetRecommendedResources(nil) + scaler.ResourceRecommender = &FakeOutBoundRecommender{} + rr := scaler.ResourceRecommender.GetRecommendedResources(nil) err := scaler.updateWorker(ctx, getWorkers(workload)[0], rr) Expect(err.Error()).To(ContainSubstring("failed to reallocate resources")) }) + + It("should update scaleToZero annotation if recommended resource closer to zero", func() { + + }) }) }) @@ -234,11 +238,7 @@ func createWorkload(pool *tfv1.GPUPool, id int, replicas int) *tfv1.TensorFusion }, Qos: constants.QoSLevelMedium, AutoScalingConfig: tfv1.AutoScalingConfig{ - AutoSetLimits: tfv1.AutoSetLimits{ - Enable: true, - TargetResource: "", - }, - AutoSetRequests: tfv1.AutoSetRequests{ + AutoSetResources: tfv1.AutoSetResources{ Enable: true, TargetResource: "", }, @@ -286,7 +286,13 @@ func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { type FakeAllocator struct{} func (*FakeAllocator) Realloc(ctx context.Context, req gpuallocator.AllocRequest) error { - return 
fmt.Errorf("failed to reallocate resources") + return nil +} + +type FakeFailedAllocator struct{} + +func (*FakeFailedAllocator) Realloc(ctx context.Context, req gpuallocator.AllocRequest) error { + return fmt.Errorf("not enough resources") } type FakeMetricsProvider struct { From 36b76a53d6c6e766456a7170d6aa503349c16455 Mon Sep 17 00:00:00 2001 From: knave Date: Sun, 29 Jun 2025 05:51:46 +0800 Subject: [PATCH 08/27] feat: add config parsing --- internal/autoscaler/workloadstate.go | 42 +++++++++- internal/autoscaler/workloadstate_test.go | 97 +++++++++++++++++++++++ 2 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 internal/autoscaler/workloadstate_test.go diff --git a/internal/autoscaler/workloadstate.go b/internal/autoscaler/workloadstate.go index 3649f7c7..f3f56316 100644 --- a/internal/autoscaler/workloadstate.go +++ b/internal/autoscaler/workloadstate.go @@ -1,6 +1,7 @@ package autoscaler import ( + "strconv" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -17,7 +18,7 @@ const ( DefaultAggregationInterval = time.Hour * 24 // DefaultHistogramBucketSizeGrowth is the default value for HistogramBucketSizeGrowth. DefaultHistogramBucketSizeGrowth = 0.05 // Make each bucket 5% larger than the previous one. - // DefaultVramHistogramDecayHalfLife is the default value for HistogramDecayHalfLife. + // DefaultHistogramDecayHalfLife is the default value for HistogramDecayHalfLife. DefaultHistogramDecayHalfLife = time.Hour * 24 ) @@ -62,3 +63,42 @@ func (w *WorkloadState) UpdateSampleStats(metrics *WorkerMetrics) { } w.TotalSamplesCount++ } + +func (w *WorkloadState) GetResourceRecommenderConfig() *ResourceRecommenderConfig { + cfg := DefaultResourceRecommenderConfig + + asr := w.AutoScalingConfig.AutoSetResources + fields := []struct { + val string + dst *float64 + }{ + {asr.TargetTflopsPercentile, &cfg.TargetTflopsPercentile}, + {asr.LowerBoundTflopsPercentile, &cfg.LowerBoundTflopsPercentile}, + {asr.UpperBoundTflopsPercentile, &cfg.UpperBoundTflopsPercentile}, + {asr.TargetVramPercentile, &cfg.TargetVramPercentile}, + {asr.LowerBoundVramPercentile, &cfg.LowerBoundVramPercentile}, + {asr.UpperBoundVramPercentile, &cfg.UpperBoundVramPercentile}, + {asr.RequestMarginFraction, &cfg.RequestMarginFraction}, + } + for _, f := range fields { + if f.val == "" { + continue + } + if v, err := strconv.ParseFloat(f.val, 64); err == nil { + *f.dst = v + } + } + + if asr.ConfidenceInterval != "" { + if d, err := time.ParseDuration(asr.ConfidenceInterval); err == nil { + cfg.ConfidenceInterval = d + } + } + + return &cfg +} + +func (w *WorkloadState) IsTargetResource(resourceName string) bool { + target := w.AutoScalingConfig.AutoSetResources.TargetResource + return target == "" || resourceName == target +} diff --git a/internal/autoscaler/workloadstate_test.go b/internal/autoscaler/workloadstate_test.go new file mode 100644 index 00000000..21cc21f3 --- /dev/null +++ b/internal/autoscaler/workloadstate_test.go @@ -0,0 +1,97 @@ +package autoscaler + +import ( + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Workload State", func() { + It("should return default config when no AutoScalingConfig is set", func() { + ws := NewWorkloadState("test") + cfg := ws.GetResourceRecommenderConfig() + Expect(cfg).ToNot(BeNil()) + Expect(*cfg).To(Equal(DefaultResourceRecommenderConfig)) + }) + + It("should parse float fields from AutoSetResources", func() { + ws := NewWorkloadState("test") + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + TargetTflopsPercentile: "0.8", + LowerBoundTflopsPercentile: "0.1", + UpperBoundTflopsPercentile: "0.95", + TargetVramPercentile: "0.7", + LowerBoundVramPercentile: "0.2", + UpperBoundVramPercentile: "0.9", + RequestMarginFraction: "0.15", + }, + } + cfg := ws.GetResourceRecommenderConfig() + Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) + Expect(cfg.TargetVramPercentile).To(Equal(0.7)) + Expect(cfg.LowerBoundVramPercentile).To(Equal(0.2)) + Expect(cfg.UpperBoundVramPercentile).To(Equal(0.9)) + Expect(cfg.RequestMarginFraction).To(Equal(0.15)) + }) + + It("should ignore invalid float fields and keep defaults", func() { + ws := NewWorkloadState("test") + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + TargetTflopsPercentile: "not-a-float", + LowerBoundTflopsPercentile: "", + UpperBoundTflopsPercentile: "0.99", + }, + } + cfg := ws.GetResourceRecommenderConfig() + Expect(cfg.TargetTflopsPercentile).To(Equal(DefaultResourceRecommenderConfig.TargetTflopsPercentile)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(DefaultResourceRecommenderConfig.LowerBoundTflopsPercentile)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) + }) + + It("should parse ConfidenceInterval if valid", func() { + ws := NewWorkloadState("test") + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + ConfidenceInterval: "30m", + }, + } + cfg := ws.GetResourceRecommenderConfig() + Expect(cfg.ConfidenceInterval).To(Equal(time.Duration(30 * time.Minute))) + }) + + It("should ignore invalid ConfidenceInterval and keep default", func() { + ws := NewWorkloadState("test") + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + ConfidenceInterval: "not-a-duration", + }, + } + cfg := ws.GetResourceRecommenderConfig() + Expect(cfg.ConfidenceInterval).To(Equal(DefaultResourceRecommenderConfig.ConfidenceInterval)) + }) + + It("should correctly determine if a resource is the target based on config", func() { + ws := NewWorkloadState("test") + + Expect(ws.IsTargetResource("tflops")).To(BeTrue()) + Expect(ws.IsTargetResource("vram")).To(BeTrue()) + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"}, + } + Expect(ws.IsTargetResource("tflops")).To(BeTrue()) + Expect(ws.IsTargetResource("vram")).To(BeFalse()) + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"}, + } + Expect(ws.IsTargetResource("tflops")).To(BeFalse()) + Expect(ws.IsTargetResource("vram")).To(BeTrue()) + }) +}) From c748df6ed49e93e15ec2dc3ece77b76ac153a14b Mon Sep 17 00:00:00 2001 From: knave Date: Sun, 29 Jun 2025 05:58:52 +0800 Subject: [PATCH 09/27] feat: apply updates to specified target resources --- internal/autoscaler/autoscaler.go | 28 ++++++------- internal/autoscaler/autoscaler_test.go 
| 58 +++++++++++++++++++++----- 2 files changed, 62 insertions(+), 24 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index c242ac4d..a0d6a2da 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -99,7 +99,6 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { observedWorkloads := map[string]bool{} for _, workload := range workloadList.Items { - autoScalingConfig := workload.Spec.AutoScalingConfig if !workload.DeletionTimestamp.IsZero() { continue } @@ -111,7 +110,7 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { } workloadState.Namespace = workload.Namespace workloadState.Resources = workload.Spec.Resources - workloadState.AutoScalingConfig = autoScalingConfig + workloadState.AutoScalingConfig = workload.Spec.AutoScalingConfig s.WorkloadStates[workloadName] = workloadState observedWorkloads[workloadName] = true @@ -218,30 +217,24 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { continue } - // TODO: apply config - // asConfig := workloadState.AutoScalingConfig - // NewResourceRecommenderFromAutoScalingConfig(ResouceRecomenderConfig{ - // }).GetRecommendedResources(workloadState) rr := s.ResourceRecommender.GetRecommendedResources(workloadState) - log.Info("Autoscaler processWorkloads", "recommended resources", rr) + log.Info("recommend resources", "workload", workloadState.Name, "resources", rr) for _, worker := range podList.Items { if !worker.DeletionTimestamp.IsZero() { continue } - if err := s.updateWorker(ctx, &worker, rr); err != nil { + if err := s.updateWorkerResourcesIfNeeded(ctx, workloadState, &worker, rr); err != nil { log.Error(err, "failed to update worker") } } } } -func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *RecommendedResources) error { - annotations := worker.GetAnnotations() - newAnnotations := map[string]string{} - +func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workloadState *WorkloadState, worker *corev1.Pod, rr *RecommendedResources) error { resourcesInfo := []struct { + name string requestKey string limitKey string lowerBound ResourceAmount @@ -249,6 +242,7 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R target ResourceAmount }{ { + name: "tflops", requestKey: constants.TFLOPSRequestAnnotation, limitKey: constants.TFLOPSLimitAnnotation, lowerBound: rr.LowerBoundTflops, @@ -256,6 +250,7 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R target: rr.TargetTflops, }, { + name: "vram", requestKey: constants.VRAMRequestAnnotation, limitKey: constants.VRAMLimitAnnotation, lowerBound: rr.LowerBoundVram, @@ -264,8 +259,13 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R }, } + annotations := worker.GetAnnotations() + newAnnotations := map[string]string{} for _, resInfo := range resourcesInfo { - if err := updateResource( + if !workloadState.IsTargetResource(resInfo.name) { + continue + } + if err := detectResourceChanges( annotations, newAnnotations, resInfo.requestKey, resInfo.limitKey, resInfo.lowerBound, resInfo.upperBound, resInfo.target, @@ -291,7 +291,7 @@ func (s *Autoscaler) updateWorker(ctx context.Context, worker *corev1.Pod, rr *R return nil } -func updateResource(annotations, newAnnotations map[string]string, requestKey, limitKey string, lowerBound, upperBound, target ResourceAmount) error { +func detectResourceChanges(annotations, newAnnotations map[string]string, requestKey, 
limitKey string, lowerBound, upperBound, target ResourceAmount) error { currentRequest, err := resource.ParseQuantity(annotations[requestKey]) if err != nil { return fmt.Errorf("failed to parse %s: %v", requestKey, err) diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 36476501..5da840f4 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -158,7 +158,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when processing workloads", func() { - FIt("should update only those resources exceeding the recommended resource boundaries", func() { + It("should update only those resources exceeding the recommended resource boundaries", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -184,6 +184,34 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) }) + It("should update specific resources based on TargetResource", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler.LoadWorkloads(ctx) + + scaler.ResourceRecommender = &FakeOutBoundRecommender{} + rr := scaler.ResourceRecommender.GetRecommendedResources(nil) + + workloadState := scaler.WorkloadStates[workload.Name] + workloadState.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" + + oldRes := workloadState.Resources + scaler.ProcessWorkloads(ctx) + Eventually(func(g Gomega) { + tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) + Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) + Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) + Expect(vramRequest.Equal(oldRes.Requests.Vram)).To(BeTrue()) + Expect(vramLimit.Equal(oldRes.Limits.Vram)).To(BeTrue()) + }).Should(Succeed()) + }) + It("should return an error if failed to reallocate resources", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
@@ -196,7 +224,7 @@ var _ = Describe("Autoscaler", func() { scaler.LoadWorkloads(ctx) scaler.ResourceRecommender = &FakeOutBoundRecommender{} rr := scaler.ResourceRecommender.GetRecommendedResources(nil) - err := scaler.updateWorker(ctx, getWorkers(workload)[0], rr) + err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.WorkloadStates[workload.Name], getWorkers(workload)[0], rr) Expect(err.Error()).To(ContainSubstring("failed to reallocate resources")) }) @@ -388,16 +416,26 @@ func cleanupWorkload(key client.ObjectKey) { } func assertWorkerAnnotations(worker *corev1.Pod, rr *RecommendedResources) { - annotations := worker.GetAnnotations() - tflopsRequest := resource.MustParse(annotations[constants.TFLOPSRequestAnnotation]) + tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker) Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) - - tflopsLimit := resource.MustParse(annotations[constants.TFLOPSLimitAnnotation]) Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) - - vramRequest := resource.MustParse(annotations[constants.VRAMRequestAnnotation]) Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) - - vramLimit := resource.MustParse(annotations[constants.VRAMLimitAnnotation]) Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) } + +func parseResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, vramRequest, vramLimit resource.Quantity) { + annotations := worker.GetAnnotations() + keys := []struct { + key string + dst *resource.Quantity + }{ + {constants.TFLOPSRequestAnnotation, &tflopsRequest}, + {constants.TFLOPSLimitAnnotation, &tflopsLimit}, + {constants.VRAMRequestAnnotation, &vramRequest}, + {constants.VRAMLimitAnnotation, &vramLimit}, + } + for _, k := range keys { + *k.dst = resource.MustParse(annotations[k.key]) + } + return +} From e22b847c304eecbe15cb167296fb4f3ac6e6b9a8 Mon Sep 17 00:00:00 2001 From: knave Date: Sun, 29 Jun 2025 09:58:59 +0800 Subject: [PATCH 10/27] feat: add auto-scaling switch config parsing and apply, TargetResource support value all --- internal/autoscaler/autoscaler.go | 6 ++++++ internal/autoscaler/autoscaler_test.go | 19 ++++++++++++++++--- internal/autoscaler/workloadstate.go | 7 ++++++- internal/autoscaler/workloadstate_test.go | 20 ++++++++++++++++++++ 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index a0d6a2da..c08a68b2 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -220,6 +220,12 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { rr := s.ResourceRecommender.GetRecommendedResources(workloadState) log.Info("recommend resources", "workload", workloadState.Name, "resources", rr) + // TODO: update recommmendation status of workload + + if !workloadState.IsAutoScalingEnabled() { + continue + } + for _, worker := range podList.Items { if !worker.DeletionTimestamp.IsZero() { continue diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 5da840f4..a3453800 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -184,7 +184,7 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) }) - It("should update specific resources based on TargetResource", func() { + It("should update resources based on auto scaling config", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
Build() @@ -199,9 +199,22 @@ var _ = Describe("Autoscaler", func() { rr := scaler.ResourceRecommender.GetRecommendedResources(nil) workloadState := scaler.WorkloadStates[workload.Name] - workloadState.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" - oldRes := workloadState.Resources + + // verify IsAutoScalingEnabled + workloadState.AutoScalingConfig.AutoSetResources.Enable = false + scaler.ProcessWorkloads(ctx) + Eventually(func(g Gomega) { + tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) + Expect(tflopsRequest.Equal(oldRes.Requests.Tflops)).To(BeTrue()) + Expect(tflopsLimit.Equal(oldRes.Limits.Tflops)).To(BeTrue()) + Expect(vramRequest.Equal(oldRes.Requests.Vram)).To(BeTrue()) + Expect(vramLimit.Equal(oldRes.Limits.Vram)).To(BeTrue()) + }).Should(Succeed()) + + // verify IsTargetResource + workloadState.AutoScalingConfig.AutoSetResources.Enable = true + workloadState.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" scaler.ProcessWorkloads(ctx) Eventually(func(g Gomega) { tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) diff --git a/internal/autoscaler/workloadstate.go b/internal/autoscaler/workloadstate.go index f3f56316..fa2ff0b4 100644 --- a/internal/autoscaler/workloadstate.go +++ b/internal/autoscaler/workloadstate.go @@ -2,6 +2,7 @@ package autoscaler import ( "strconv" + "strings" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -100,5 +101,9 @@ func (w *WorkloadState) GetResourceRecommenderConfig() *ResourceRecommenderConfi func (w *WorkloadState) IsTargetResource(resourceName string) bool { target := w.AutoScalingConfig.AutoSetResources.TargetResource - return target == "" || resourceName == target + return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(resourceName, target) +} + +func (w *WorkloadState) IsAutoScalingEnabled() bool { + return w.AutoScalingConfig.AutoSetResources.Enable } diff --git a/internal/autoscaler/workloadstate_test.go b/internal/autoscaler/workloadstate_test.go index 21cc21f3..ac4035ae 100644 --- a/internal/autoscaler/workloadstate_test.go +++ b/internal/autoscaler/workloadstate_test.go @@ -82,6 +82,13 @@ var _ = Describe("Workload State", func() { Expect(ws.IsTargetResource("tflops")).To(BeTrue()) Expect(ws.IsTargetResource("vram")).To(BeTrue()) + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"}, + } + + Expect(ws.IsTargetResource("tflops")).To(BeTrue()) + Expect(ws.IsTargetResource("vram")).To(BeTrue()) + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"}, } @@ -94,4 +101,17 @@ var _ = Describe("Workload State", func() { Expect(ws.IsTargetResource("tflops")).To(BeFalse()) Expect(ws.IsTargetResource("vram")).To(BeTrue()) }) + + It("should correctly determine if auto scaling is enabled based on config", func() { + ws := NewWorkloadState("test") + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{Enable: true}, + } + Expect(ws.IsAutoScalingEnabled()).To(BeTrue()) + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{Enable: false}, + } + Expect(ws.IsAutoScalingEnabled()).To(BeFalse()) + }) }) From 03a326777179d7f7b9ac8c40e4f8a5652045fcc3 Mon Sep 17 00:00:00 2001 From: knave Date: Thu, 3 Jul 2025 22:07:46 +0800 Subject: [PATCH 11/27] feat: merge AutoSetLimits and AutoSetRequests into 
AutoSetResources --- api/v1/schedulingconfigtemplate_types.go | 45 +++++- api/v1/zz_generated.deepcopy.go | 22 ++- ...r-fusion.ai_schedulingconfigtemplates.yaml | 97 +++++------- ...ensor-fusion.ai_tensorfusionworkloads.yaml | 97 +++++------- .../tensor-fusion.ai_workloadprofiles.yaml | 97 +++++------- ...r-fusion.ai_schedulingconfigtemplates.yaml | 97 +++++------- ...ensor-fusion.ai_tensorfusionworkloads.yaml | 97 +++++------- .../tensor-fusion.ai_workloadprofiles.yaml | 97 +++++------- go.mod | 1 + internal/autoscaler/autoscaler_test.go | 7 - internal/autoscaler/metricsprovider.go | 10 +- internal/autoscaler/metricsprovider_test.go | 6 +- internal/autoscaler/recommender.go | 139 +++++++++++------- internal/autoscaler/recommender_test.go | 14 +- internal/constants/constants.go | 5 +- internal/webhook/v1/tf_parser.go | 10 +- 16 files changed, 404 insertions(+), 437 deletions(-) diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 1407b79f..a16fb6ad 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -86,17 +86,50 @@ type GPUFilter struct { } type AutoScalingConfig struct { - // layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - // VPA-like, aggregate metrics data <1m - AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"` + // layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode + // Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks + AutoSetResources AutoSetResources `json:"autoSetResources,omitempty"` // layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit // HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works) AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"` +} - // layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet - // Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks - AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"` +type AutoSetResources struct { + Enable bool `json:"enable,omitempty"` + + // Target resource to scale, such as "tflops", "vram", or "all" by default + TargetResource string `json:"targetResource,omitempty"` + + // Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9 + TargetTflopsPercentile string `json:"targettflopspercentile,omitempty"` + + // Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5 + LowerBoundTflopsPercentile string `json:"lowerboundtflopspercentile,omitempty"` + + // Tflops usage percentile that will be used for the upper bound on tflops recommendation. Default: 0.95 + UpperBoundTflopsPercentile string `json:"upperboundtflopspercentile,omitempty"` + + // Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9 + TargetVramPercentile string `json:"targetvrampercentile,omitempty"` + + // Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5 + LowerBoundVramPercentile string `json:"lowerboundvrampercentile,omitempty"` + + // Vram usage percentile that will be used for the upper bound on vram recommendation. 
Default: 0.95 + UpperBoundVramPercentile string `json:"upperboundvrampercentile,omitempty"` + + // Fraction of usage added as the safety margin to the recommended request. Default: 0.15 + RequestMarginFraction string `json:"requestMarginFraction,omitempty"` + + // The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h + ConfidenceInterval string `json:"confidenceInterval,omitempty"` + + // How much time back TSDB have to be queried to get historical metrics. Default: 1d + HistoryLength string `json:"historyLength,omitempty"` + + // Resolution at which TSDB is queried for historical metrics. Default: 1m + HistoryResolution string `json:"historyResolution,omitempty"` } // A typical autoLimits algorithm could be checking every 5m, look back 1 day data, diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 2f1bf367..7dc5882e 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -138,9 +138,8 @@ func (in *AutoFreezeAndResume) DeepCopy() *AutoFreezeAndResume { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig) { *out = *in - in.AutoSetLimits.DeepCopyInto(&out.AutoSetLimits) + out.AutoSetResources = in.AutoSetResources out.AutoSetReplicas = in.AutoSetReplicas - in.AutoSetRequests.DeepCopyInto(&out.AutoSetRequests) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoScalingConfig. @@ -204,6 +203,21 @@ func (in *AutoSetRequests) DeepCopy() *AutoSetRequests { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoSetResources) DeepCopyInto(out *AutoSetResources) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetResources. +func (in *AutoSetResources) DeepCopy() *AutoSetResources { + if in == nil { + return nil + } + out := new(AutoSetResources) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig) { *out = *in @@ -1957,7 +1971,7 @@ func (in *SchedulingConfigTemplateSpec) DeepCopyInto(out *SchedulingConfigTempla if in.AutoScaling != nil { in, out := &in.AutoScaling, &out.AutoScaling *out = new(AutoScalingConfig) - (*in).DeepCopyInto(*out) + **out = **in } if in.ReBalancer != nil { in, out := &in.ReBalancer, &out.ReBalancer @@ -2481,7 +2495,7 @@ func (in *WorkloadProfileSpec) DeepCopyInto(out *WorkloadProfileSpec) { **out = **in } in.Resources.DeepCopyInto(&out.Resources) - in.AutoScalingConfig.DeepCopyInto(&out.AutoScalingConfig) + out.AutoScalingConfig = in.AutoScalingConfig if in.NodeAffinity != nil { in, out := &in.NodeAffinity, &out.NodeAffinity *out = new(corev1.NodeAffinity) diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 69a12b27..65092ff0 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -50,41 +50,6 @@ spec: autoScaling: description: scale the workload based on the usage and traffic properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -105,38 +70,56 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. 
Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object type: object diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml index fc7818d3..95c4c5dc 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -46,41 +46,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -101,38 +66,56 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' + type: string + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. 
Default: 1m' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' type: string - percentileForAutoRequests: + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object type: object diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml index 19b9fd2e..d8e57ee9 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml @@ -45,41 +45,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -100,38 +65,56 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks 
properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' + type: string + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' type: string - percentileForAutoRequests: + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. 
Default: 0.95' type: string type: object type: object diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 69a12b27..65092ff0 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -50,41 +50,6 @@ spec: autoScaling: description: scale the workload based on the usage and traffic properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -105,38 +70,56 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. 
Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object type: object diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml index fc7818d3..95c4c5dc 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -46,41 +46,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -101,38 +66,56 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' + type: string + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' type: string - percentileForAutoRequests: + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. 
Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object type: object diff --git a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml index 19b9fd2e..d8e57ee9 100644 --- a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml +++ b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml @@ -45,41 +45,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -100,38 +65,56 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' + type: string + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. 
Default: 1m' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' type: string - percentileForAutoRequests: + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object type: object diff --git a/go.mod b/go.mod index 053d4e75..420a88fd 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ require ( gorm.io/gorm v1.30.0 k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 + k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0 k8s.io/client-go v0.33.2 k8s.io/component-base v0.32.5 k8s.io/component-helpers v0.33.2 diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index a3453800..f59ada52 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -38,10 +38,7 @@ import ( // [x] tflops add all samples, like cpu in vpa // [x] Reallocate resources before update annotation // Add AutoSetResources, make it more configurable -// Implement Realloc method on GpuAllocator -// Add tests for recommender // Log key events -// Scale to zero if no usage, need carl to support // Add recommendation to workload status // Write some documents // cron scheduler stragegy, parallisam ? 
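For reference, the autoSetResources stanza documented in the CRDs above corresponds field-for-field to the new AutoSetResources Go type introduced in this patch. A minimal, illustrative sketch of populating it in code follows; the values simply mirror the documented defaults and are not set automatically by the controller:

// Illustrative only: how the autoSetResources fields documented above map onto
// the AutoSetResources Go type added in this patch. Values mirror the documented
// defaults; nothing here is populated for you by the controller.
package main

import (
	"fmt"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
)

func main() {
	autoScaling := tfv1.AutoScalingConfig{
		AutoSetResources: tfv1.AutoSetResources{
			Enable:                     true,
			TargetResource:             "all", // "tflops", "vram", or "all" (default)
			TargetTflopsPercentile:     "0.9",
			LowerBoundTflopsPercentile: "0.5",
			UpperBoundTflopsPercentile: "0.95",
			TargetVramPercentile:       "0.9",
			LowerBoundVramPercentile:   "0.5",
			UpperBoundVramPercentile:   "0.95",
			RequestMarginFraction:      "0.15",
			ConfidenceInterval:         "24h",
			HistoryLength:              "1d",
			HistoryResolution:          "1m",
		},
	}
	fmt.Printf("%+v\n", autoScaling.AutoSetResources)
}

The webhook change at the end of this patch additionally wires the same Enable switch to the constants.AutoScaleResourcesAnnotation pod annotation.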
@@ -240,10 +237,6 @@ var _ = Describe("Autoscaler", func() { err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.WorkloadStates[workload.Name], getWorkers(workload)[0], rr) Expect(err.Error()).To(ContainSubstring("failed to reallocate resources")) }) - - It("should update scaleToZero annotation if recommended resource closer to zero", func() { - - }) }) }) diff --git a/internal/autoscaler/metricsprovider.go b/internal/autoscaler/metricsprovider.go index 7b4e06d6..5334ff4b 100644 --- a/internal/autoscaler/metricsprovider.go +++ b/internal/autoscaler/metricsprovider.go @@ -25,9 +25,10 @@ func NewMetricsProvider(db *gorm.DB) MetricsProvider { } type greptimeDBProvider struct { - db *gorm.DB - lastQueryTime time.Time - historyDuration time.Duration + db *gorm.DB + lastQueryTime time.Time + historyLength time.Duration + historyResolution time.Duration } func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { @@ -38,6 +39,7 @@ func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { Where("ts > ? and ts <= ?", g.lastQueryTime.Nanosecond(), now.Nanosecond()). Group("workload, worker"). Order("ts asc"). + Find(&data). Error if err != nil { @@ -71,7 +73,7 @@ func (g *greptimeDBProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { // TODO: replace using iteration for handling large datasets efficiently // TODO: supply history resolution to config time window err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window"). - Where("ts > ? and ts <= ?", now.Add(-g.historyDuration), now.Nanosecond()). + Where("ts > ? and ts <= ?", now.Add(-time.Hour*24).Nanosecond(), now.Nanosecond()). Group("workload, worker, time_window"). Order("time_window asc"). Find(&data). diff --git a/internal/autoscaler/metricsprovider_test.go b/internal/autoscaler/metricsprovider_test.go index 9d7dae04..dbd6f968 100644 --- a/internal/autoscaler/metricsprovider_test.go +++ b/internal/autoscaler/metricsprovider_test.go @@ -14,7 +14,7 @@ import ( var _ = Describe("MetricsProvider", func() { Context("when getting real time workers metrics", func() { - It("should return slices", func() { + It("should return metrics for every worker", func() { db, mock := NewMockDB() now := time.Now() fakeMetrics := []metrics.HypervisorWorkerUsageMetrics{ @@ -39,7 +39,7 @@ var _ = Describe("MetricsProvider", func() { rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.Timestamp) } - mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts FROM `tf_worker_usage` WHERE ts > ? GROUP BY workload, worker")). + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts FROM `tf_worker_usage` WHERE ts > ? and ts <= ? GROUP BY workload, worker ORDER BY ts asc")). 
WillReturnRows(rows) provider := &greptimeDBProvider{db: db} got, _ := provider.GetWorkersMetrics() @@ -53,7 +53,7 @@ var _ = Describe("MetricsProvider", func() { }) Context("when getting history workers metrics", func() { - FIt("should return slices", func() { + It("should return metrics based on history length", func() { db, mock := NewMockDB() now := time.Now() fakeMetrics := []hypervisorWorkerUsageMetrics{ diff --git a/internal/autoscaler/recommender.go b/internal/autoscaler/recommender.go index 2cbddb74..98160ddf 100644 --- a/internal/autoscaler/recommender.go +++ b/internal/autoscaler/recommender.go @@ -1,84 +1,119 @@ package autoscaler import ( - "flag" "time" ) -var ( - safetyMarginFraction = flag.Float64("recommendation-margin-fraction", 0.15, `Fraction of usage added as the safety margin to the recommended request`) - targetVramPercentile = flag.Float64("target-vram-percentile", 0.9, "Vram usage percentile that will be used as a base for vram target recommendation. Doesn't affect vram lower bound nor vram upper bound.") - lowerBoundVramPercentile = flag.Float64("recommendation-lower-bound-vram-percentile", 0.5, `Vram usage percentile that will be used for the lower bound on vram recommendation.`) - upperBoundVramPercentile = flag.Float64("recommendation-upper-bound-vram-percentile", 0.95, `Vram usage percentile that will be used for the upper bound on vram recommendation.`) - targetTflopsPercentile = flag.Float64("target-tflops-percentile", 0.9, "Tflops usage percentile that will be used as a base for tflops target recommendation. Doesn't affect tflops lower bound nor tflops upper bound.") - lowerBoundTflopsPercentile = flag.Float64("recommendation-lower-bound-tflops-percentile", 0.5, `Tflops usage percentile that will be used for the lower bound on tflops recommendation.`) - upperBoundTflopsPercentile = flag.Float64("recommendation-upper-bound-tflops-percentile", 0.95, `Tflops usage percentile that will be used for the upper bound on tflops recommendation.`) - confidenceInterval = flag.Duration("confidence-interval", time.Hour*24, "The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h") +const ( + // Fraction of usage added as the safety margin to the recommended request + defaultRequestMarginFraction = 0.15 + // Vram usage percentile that will be used as a base for vram target recommendation. Doesn't affect vram lower bound nor vram upper bound. + defaultTargetVramPercentile = 0.9 + // Vram usage percentile that will be used for the lower bound on vram recommendation. + defaultLowerBoundVramPercentile = 0.5 + // Vram usage percentile that will be used for the upper bound on vram recommendation. + defaultUpperBoundVramPercentile = 0.95 + // Tflops usage percentile that will be used as a base for tflops target recommendation. Doesn't affect tflops lower bound nor tflops upper bound. + defaultTargetTflopsPercentile = 0.9 + // Tflops usage percentile that will be used for the lower bound on tflops recommendation. + defaultLowerBoundTflopsPercentile = 0.5 + // Tflops usage percentile that will be used for the upper bound on tflops recommendation. + defaultUpperBoundTflopsPercentile = 0.95 + // The time interval used for computing the confidence multiplier for the lower and upper bound. 
Default: 24h + defaultConfidenceInterval = time.Hour * 24 ) -type Recommender interface { +var DefaultResourceRecommenderConfig = ResourceRecommenderConfig{ + TargetTflopsPercentile: defaultTargetTflopsPercentile, + LowerBoundTflopsPercentile: defaultLowerBoundTflopsPercentile, + UpperBoundTflopsPercentile: defaultUpperBoundTflopsPercentile, + TargetVramPercentile: defaultTargetVramPercentile, + LowerBoundVramPercentile: defaultLowerBoundVramPercentile, + UpperBoundVramPercentile: defaultUpperBoundVramPercentile, + RequestMarginFraction: defaultRequestMarginFraction, + ConfidenceInterval: defaultConfidenceInterval, +} + +type ResourceRecommender interface { GetRecommendedResources(*WorkloadState) *RecommendedResources } type RecommendedResources struct { - TargetTflops ResourceAmount LowerBoundTflops ResourceAmount + TargetTflops ResourceAmount UpperBoundTflops ResourceAmount - - TargetVram ResourceAmount - LowerBoundVram ResourceAmount - UpperBoundVram ResourceAmount + LowerBoundVram ResourceAmount + TargetVram ResourceAmount + UpperBoundVram ResourceAmount } -func NewRecommender() Recommender { - targetTflops := NewPercentileTflopsEstimator(*targetTflopsPercentile) - lowerBoundTflops := NewPercentileTflopsEstimator(*lowerBoundTflopsPercentile) - upperBoundTflops := NewPercentileTflopsEstimator(*upperBoundTflopsPercentile) - - targetTflops = WithTflopsMargin(*safetyMarginFraction, targetTflops) - lowerBoundTflops = WithTflopsMargin(*safetyMarginFraction, lowerBoundTflops) - upperBoundTflops = WithTflopsMargin(*safetyMarginFraction, upperBoundTflops) - - upperBoundTflops = WithTflopsConfidenceMultiplier(1.0, 1.0, upperBoundTflops, *confidenceInterval) - lowerBoundTflops = WithTflopsConfidenceMultiplier(0.001, -2.0, lowerBoundTflops, *confidenceInterval) - - targetVram := NewPercentileVramEstimator(*targetVramPercentile) - lowerBoundVram := NewPercentileVramEstimator(*lowerBoundVramPercentile) - upperBoundVram := NewPercentileVramEstimator(*upperBoundVramPercentile) - - targetVram = WithVramMargin(*safetyMarginFraction, targetVram) - lowerBoundVram = WithVramMargin(*safetyMarginFraction, lowerBoundVram) - upperBoundVram = WithVramMargin(*safetyMarginFraction, upperBoundVram) - - upperBoundVram = WithVramConfidenceMultiplier(1.0, 1.0, upperBoundVram, *confidenceInterval) - lowerBoundVram = WithVramConfidenceMultiplier(0.001, -2.0, lowerBoundVram, *confidenceInterval) +type ResourceRecommenderConfig struct { + TargetTflopsPercentile float64 + LowerBoundTflopsPercentile float64 + UpperBoundTflopsPercentile float64 + TargetVramPercentile float64 + LowerBoundVramPercentile float64 + UpperBoundVramPercentile float64 + RequestMarginFraction float64 + ConfidenceInterval time.Duration +} - return &recommender{ - targetTflops: targetTflops, - lowerBoundTflops: lowerBoundTflops, - upperBoundTflops: upperBoundTflops, - targetVram: targetVram, - lowerBoundVram: lowerBoundVram, - upperBoundVram: upperBoundVram, - } +func NewResourceRecommender() ResourceRecommender { + return &resourceRecommender{} } -type recommender struct { - targetTflops TflopsEstimator +type resourceRecommender struct { lowerBoundTflops TflopsEstimator + targetTflops TflopsEstimator upperBoundTflops TflopsEstimator - targetVram VramEstimator lowerBoundVram VramEstimator + targetVram VramEstimator upperBoundVram VramEstimator } -func (r *recommender) GetRecommendedResources(s *WorkloadState) *RecommendedResources { +func (r *resourceRecommender) GetRecommendedResources(s *WorkloadState) *RecommendedResources { + + 
r.createEstimatorsFromConfig(s.GetResourceRecommenderConfig()) + return &RecommendedResources{ - TargetTflops: r.targetTflops.GetTflopsEstimation(s), LowerBoundTflops: r.lowerBoundTflops.GetTflopsEstimation(s), + TargetTflops: r.targetTflops.GetTflopsEstimation(s), UpperBoundTflops: r.upperBoundTflops.GetTflopsEstimation(s), - TargetVram: r.targetVram.GetVramEstimation(s), LowerBoundVram: r.lowerBoundVram.GetVramEstimation(s), + TargetVram: r.targetVram.GetVramEstimation(s), UpperBoundVram: r.upperBoundVram.GetVramEstimation(s), } } + +func (r *resourceRecommender) createEstimatorsFromConfig(config *ResourceRecommenderConfig) { + targetTflops := NewPercentileTflopsEstimator(config.TargetTflopsPercentile) + lowerBoundTflops := NewPercentileTflopsEstimator(config.LowerBoundTflopsPercentile) + upperBoundTflops := NewPercentileTflopsEstimator(config.UpperBoundTflopsPercentile) + + targetTflops = WithTflopsMargin(config.RequestMarginFraction, targetTflops) + lowerBoundTflops = WithTflopsMargin(config.RequestMarginFraction, lowerBoundTflops) + upperBoundTflops = WithTflopsMargin(config.RequestMarginFraction, upperBoundTflops) + + upperBoundTflops = WithTflopsConfidenceMultiplier(1.0, 1.0, upperBoundTflops, config.ConfidenceInterval) + lowerBoundTflops = WithTflopsConfidenceMultiplier(0.001, -2.0, lowerBoundTflops, config.ConfidenceInterval) + + targetVram := NewPercentileVramEstimator(config.TargetVramPercentile) + lowerBoundVram := NewPercentileVramEstimator(config.LowerBoundVramPercentile) + upperBoundVram := NewPercentileVramEstimator(config.UpperBoundVramPercentile) + + targetVram = WithVramMargin(config.RequestMarginFraction, targetVram) + lowerBoundVram = WithVramMargin(config.RequestMarginFraction, lowerBoundVram) + upperBoundVram = WithVramMargin(config.RequestMarginFraction, upperBoundVram) + + upperBoundVram = WithVramConfidenceMultiplier(1.0, 1.0, upperBoundVram, config.ConfidenceInterval) + lowerBoundVram = WithVramConfidenceMultiplier(0.001, -2.0, lowerBoundVram, config.ConfidenceInterval) + + *r = resourceRecommender{ + lowerBoundTflops: lowerBoundTflops, + targetTflops: targetTflops, + upperBoundTflops: upperBoundTflops, + lowerBoundVram: lowerBoundVram, + targetVram: targetVram, + upperBoundVram: upperBoundVram, + } +} diff --git a/internal/autoscaler/recommender_test.go b/internal/autoscaler/recommender_test.go index 9dcd52d5..12f5cb17 100644 --- a/internal/autoscaler/recommender_test.go +++ b/internal/autoscaler/recommender_test.go @@ -5,15 +5,11 @@ import ( . 
"github.com/onsi/gomega" ) -var _ = Describe("Recommender", func() { - Context("when get recommeded resource", func() { - It("should generate recommended resource based on histogram", func() { - recommender := NewRecommender() - Expect(recommender.GetRecommendedResources(nil)).To(BeNil()) - }) - It("should gererate recommended resource with safety margin", func() { - }) - It("should gererate recommended resource with confidence multiplier", func() { +var _ = Describe("Resource Recommender", func() { + Context("when getting recommended resource", func() { + It("should return correct RecommendedResources based on WorkloadState and config", func() { + ws := NewWorkloadState("test") + rr := resourceRecommender{} }) }) }) diff --git a/internal/constants/constants.go b/internal/constants/constants.go index dd2810b3..3e0128e5 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -73,9 +73,8 @@ const ( GenPortNumberAnnotation = Domain + "/port-number" TensorFusionWorkerPortNumber = 8000 - AutoScaleLimitsAnnotation = Domain + "/auto-limits" - AutoScaleRequestsAnnotation = Domain + "/auto-requests" - AutoScaleReplicasAnnotation = Domain + "/auto-replicas" + AutoScaleResourcesAnnotation = Domain + "/auto-resources" + AutoScaleReplicasAnnotation = Domain + "/auto-replicas" GpuReleasedAnnotation = Domain + "/gpu-released" diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index 23c9104e..9fb8d6e6 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -134,13 +134,9 @@ func ParseTensorFusionInfo( } func parseAutoScalingAnnotations(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) { - autoLimits, ok := pod.Annotations[constants.AutoScaleLimitsAnnotation] - if ok && autoLimits == constants.TrueStringValue { - workloadProfile.Spec.AutoScalingConfig.AutoSetLimits.Enable = true - } - autoRequests, ok := pod.Annotations[constants.AutoScaleRequestsAnnotation] - if ok && autoRequests == constants.TrueStringValue { - workloadProfile.Spec.AutoScalingConfig.AutoSetRequests.Enable = true + autoResources, ok := pod.Annotations[constants.AutoScaleResourcesAnnotation] + if ok && autoResources == constants.TrueStringValue { + workloadProfile.Spec.AutoScalingConfig.AutoSetResources.Enable = true } autoReplicas, ok := pod.Annotations[constants.AutoScaleReplicasAnnotation] if ok && autoReplicas == constants.TrueStringValue { From f73e0cbd28ee3e879c24e1941523aee70c9623bf Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 4 Jul 2025 15:25:20 +0800 Subject: [PATCH 12/27] feat: implement adjust allocation --- go.mod | 7 +- go.sum | 3 + internal/autoscaler/autoscaler.go | 116 +++++++------ internal/autoscaler/autoscaler_suite_test.go | 38 +++-- internal/autoscaler/autoscaler_test.go | 164 ++++++++++++++++--- internal/autoscaler/recommender_test.go | 5 +- internal/autoscaler/workloadstate.go | 11 +- 7 files changed, 248 insertions(+), 96 deletions(-) diff --git a/go.mod b/go.mod index 420a88fd..4092514e 100644 --- a/go.mod +++ b/go.mod @@ -16,10 +16,11 @@ require ( github.com/lithammer/shortuuid/v4 v4.2.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 + github.com/pkg/errors v0.9.1 github.com/samber/lo v1.51.0 github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.10.0 - go.etcd.io/etcd/client/v2 v2.305.16 + golang.org/x/time v0.9.0 gomodules.xyz/jsonpatch/v2 v2.5.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/mysql v1.6.0 @@ -31,7 +32,7 @@ require ( k8s.io/component-base v0.32.5 
k8s.io/component-helpers v0.33.2 k8s.io/klog/v2 v2.130.1 - k8s.io/kubernetes v1.32.5 + k8s.io/kubernetes v1.32.6 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.20.4 sigs.k8s.io/scheduler-plugins v0.31.8 @@ -111,7 +112,6 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect @@ -150,7 +150,6 @@ require ( golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.32.0 // indirect golang.org/x/text v0.25.0 // indirect - golang.org/x/time v0.9.0 // indirect golang.org/x/tools v0.33.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241223144023-3abc09e42ca8 // indirect diff --git a/go.sum b/go.sum index 822b5c80..57f35cd3 100644 --- a/go.sum +++ b/go.sum @@ -270,6 +270,7 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -458,6 +459,8 @@ k8s.io/apimachinery v0.32.5 h1:6We3aJ6crC0ap8EhsEXcgX3LpI6SEjubpiOMXLROwPM= k8s.io/apimachinery v0.32.5/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= k8s.io/apiserver v0.32.5 h1:phmm2EOUVFI+cLiq8Grtuh166fTt/qgvkGPkpgzp5uY= k8s.io/apiserver v0.32.5/go.mod h1:5bfueS1tgARVWVXRJBMI5mHoCmev0jOvbxebai/kiqc= +k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0 h1:oVv4QrTPKM7vWyQRRzCDgDgi00NWo4Rjle5/nujP/dI= +k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0/go.mod h1:W4k7qGP8A9Xqp+UK+lM49AfsWkAdXzE80F/s8kxwWVI= k8s.io/client-go v0.32.5 h1:huFmQMzgWu0z4kbWsuZci+Gt4Fo72I4CcrvhToZ/Qp0= k8s.io/client-go v0.32.5/go.mod h1:Qchw6f9WIVrur7DKojAHpRgGLcANT0RLIvF39Jz58xA= k8s.io/cloud-provider v0.32.5 h1:KzO0mpXYArWxQH91+a4WLLrhTaO5RGWmQn4lzUXY6ak= diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index c08a68b2..08ad533d 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -26,29 +26,25 @@ var ( type Autoscaler struct { client.Client - Allocator + allocator *gpuallocator.GpuAllocator ResourceRecommender MetricsProvider WorkloadStates map[string]*WorkloadState WorkerStates map[string]*WorkerState } -type Allocator interface { - Realloc(ctx context.Context, req gpuallocator.AllocRequest) error -} - -func NewAutoscaler(c client.Client, allocator Allocator) (*Autoscaler, error) { +func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { if c == nil { return nil, errors.New("must specify client") } if allocator == nil { - return nil, errors.New("must specify reallocator") + return nil, errors.New("must specify allocator") } 
return &Autoscaler{ Client: c, - Allocator: allocator, + allocator: allocator, ResourceRecommender: NewResourceRecommender(), MetricsProvider: NewMetricsProvider(nil), WorkloadStates: map[string]*WorkloadState{}, @@ -239,51 +235,71 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { } func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workloadState *WorkloadState, worker *corev1.Pod, rr *RecommendedResources) error { + log := log.FromContext(ctx) + + adjustRequest, err := getInitialWorkerResourceRequest(worker) + if err != nil { + return fmt.Errorf("failed to get initial worker resource request, %v", err) + } resourcesInfo := []struct { - name string + name ResourceName requestKey string limitKey string + request *resource.Quantity + limit *resource.Quantity lowerBound ResourceAmount upperBound ResourceAmount target ResourceAmount }{ { - name: "tflops", + name: ResourceTflops, requestKey: constants.TFLOPSRequestAnnotation, limitKey: constants.TFLOPSLimitAnnotation, + request: &adjustRequest.NewRequest.Tflops, + limit: &adjustRequest.NewLimit.Tflops, lowerBound: rr.LowerBoundTflops, upperBound: rr.UpperBoundTflops, target: rr.TargetTflops, }, { - name: "vram", + name: ResourceVram, requestKey: constants.VRAMRequestAnnotation, limitKey: constants.VRAMLimitAnnotation, + request: &adjustRequest.NewRequest.Vram, + limit: &adjustRequest.NewLimit.Vram, lowerBound: rr.LowerBoundVram, upperBound: rr.UpperBoundVram, target: rr.TargetVram, }, } - annotations := worker.GetAnnotations() newAnnotations := map[string]string{} + var upScaling, downScaling bool for _, resInfo := range resourcesInfo { if !workloadState.IsTargetResource(resInfo.name) { continue } - if err := detectResourceChanges( - annotations, newAnnotations, - resInfo.requestKey, resInfo.limitKey, - resInfo.lowerBound, resInfo.upperBound, resInfo.target, - ); err != nil { - return err + upScaling = resInfo.request.Cmp(QuantityFromAmount(resInfo.lowerBound)) < 0 + downScaling = resInfo.request.Cmp(QuantityFromAmount(resInfo.upperBound)) > 0 + if upScaling || downScaling { + targetRequest := QuantityFromAmount(resInfo.target) + targetLimit := getProportionalLimit(resInfo.limit, resInfo.request, &targetRequest) + if targetLimit == nil { + return fmt.Errorf("failed to get limit for %s", resInfo.requestKey) + } + newAnnotations[resInfo.requestKey] = targetRequest.String() + newAnnotations[resInfo.limitKey] = targetLimit.String() + *resInfo.request = targetRequest + *resInfo.limit = *targetLimit } } if len(newAnnotations) > 0 { - if err := s.Allocator.Realloc(ctx, gpuallocator.AllocRequest{}); err != nil { - return fmt.Errorf("failed to reallocate resources: %v", err) + adjustRequest.IsScaleUp = upScaling + if _, err := s.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { + return fmt.Errorf("failed to adjust allocation: %v", err) } + log.Info("adjust allocation successfully", "adjustRequest", adjustRequest) // Patch the worker with updated annotations patch := client.MergeFrom(worker.DeepCopy()) for key, value := range newAnnotations { @@ -297,29 +313,6 @@ func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workload return nil } -func detectResourceChanges(annotations, newAnnotations map[string]string, requestKey, limitKey string, lowerBound, upperBound, target ResourceAmount) error { - currentRequest, err := resource.ParseQuantity(annotations[requestKey]) - if err != nil { - return fmt.Errorf("failed to parse %s: %v", requestKey, err) - } - if 
currentRequest.Cmp(QuantityFromAmount(lowerBound)) < 0 || - currentRequest.Cmp(QuantityFromAmount(upperBound)) > 0 { - targetRequest := QuantityFromAmount(target) - newAnnotations[requestKey] = targetRequest.String() - currentLimit, err := resource.ParseQuantity(annotations[limitKey]) - if err != nil { - return fmt.Errorf("failed to parse %s: %v", limitKey, err) - } - targetLimit := getProportionalLimit(¤tLimit, ¤tRequest, &targetRequest) - if targetLimit == nil { - return fmt.Errorf("failed to get limit for %s", requestKey) - } - newAnnotations[limitKey] = targetLimit.String() - } - - return nil -} - func (*Autoscaler) addSamples(workloadState *WorkloadState, workerState *WorkerState, metrics *WorkerMetrics) { workerState.AddTflopsSample(workloadState, metrics) workerState.AddVramSample(workloadState, metrics) @@ -327,9 +320,9 @@ func (*Autoscaler) addSamples(workloadState *WorkloadState, workerState *WorkerS } func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { - if (originalLimit == nil || originalLimit.IsZero()) || - (recommendedRequest == nil || recommendedRequest.IsZero()) || - (originalRequest == nil || originalRequest.IsZero()) { + if originalLimit == nil || originalLimit.IsZero() || + originalRequest == nil || originalRequest.IsZero() || + recommendedRequest == nil || recommendedRequest.IsZero() { return nil } @@ -340,13 +333,40 @@ func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *re scaledOriginal.Mul(originalValue, scaleResultValue) scaledOriginal.Div(&scaledOriginal, scaleBaseValue) if scaledOriginal.IsInt64() { - result := resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) - return result + return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) } return nil } +func getInitialWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, error) { + adjustRequest := tfv1.AdjustRequest{ + PodUID: string(worker.UID), + IsScaleUp: false, + NewRequest: tfv1.Resource{}, + NewLimit: tfv1.Resource{}, + } + annotations := worker.GetAnnotations() + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.TFLOPSRequestAnnotation, &adjustRequest.NewRequest.Tflops}, + {constants.TFLOPSLimitAnnotation, &adjustRequest.NewLimit.Tflops}, + {constants.VRAMRequestAnnotation, &adjustRequest.NewRequest.Vram}, + {constants.VRAMLimitAnnotation, &adjustRequest.NewLimit.Vram}, + } + for _, info := range resInfo { + q, err := resource.ParseQuantity(annotations[info.key]) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + return &adjustRequest, nil +} + // Start after manager started func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { autoScaler, err := NewAutoscaler(mgr.GetClient(), nil) diff --git a/internal/autoscaler/autoscaler_suite_test.go b/internal/autoscaler/autoscaler_suite_test.go index 63657b0c..6eb9d869 100644 --- a/internal/autoscaler/autoscaler_suite_test.go +++ b/internal/autoscaler/autoscaler_suite_test.go @@ -69,10 +69,15 @@ var metricsRecorder *metrics.MetricsRecorder func TestControllers(t *testing.T) { RegisterFailHandler(Fail) - SetDefaultEventuallyTimeout(6 * time.Second) + + if os.Getenv("DEBUG_MODE") == constants.TrueStringValue { + SetDefaultEventuallyTimeout(10 * time.Minute) + } else { + SetDefaultEventuallyTimeout(7 * time.Second) + } SetDefaultEventuallyPollingInterval(200 * time.Millisecond) SetDefaultConsistentlyDuration(5 * 
time.Second) - SetDefaultConsistentlyPollingInterval(200 * time.Millisecond) + SetDefaultConsistentlyPollingInterval(250 * time.Millisecond) RunSpecs(t, "Controller Suite") } @@ -126,6 +131,7 @@ var _ = BeforeSuite(func() { BindAddress: "0", }, }) + Expect(err).ToNot(HaveOccurred()) metricsRecorder = &metrics.MetricsRecorder{ @@ -136,6 +142,16 @@ var _ = BeforeSuite(func() { WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), } + allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) + _, err = allocator.SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + _ = portAllocator.SetupWithManager(ctx, mgr) + err = (&controller.TensorFusionClusterReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -158,12 +174,6 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) - portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") - if err != nil { - Expect(err).ToNot(HaveOccurred()) - } - _ = portAllocator.SetupWithManager(ctx, mgr) - err = (&controller.GPUNodeClassReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -179,6 +189,7 @@ var _ = BeforeSuite(func() { err = (&controller.PodReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), + Allocator: allocator, PortAllocator: portAllocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -196,10 +207,6 @@ var _ = BeforeSuite(func() { }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) - allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) - _, err = allocator.SetupWithManager(ctx, mgr) - Expect(err).ToNot(HaveOccurred()) - err = (&controller.TensorFusionConnectionReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -216,9 +223,7 @@ var _ = BeforeSuite(func() { err = (&controller.TensorFusionWorkloadReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), - Allocator: allocator, Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), - GpuInfos: config.MockGpuInfo(), PortAllocator: portAllocator, }).SetupWithManager(mgr) Expect(err).ToNot(HaveOccurred()) @@ -422,6 +427,10 @@ func (c *TensorFusionEnv) getGPUName(poolIndex int, nodeIndex int, gpuIndex int) return fmt.Sprintf("%s-pool-%d-node-%d-gpu-%d", c.clusterKey.Name, poolIndex, nodeIndex, gpuIndex) } +func (c *TensorFusionEnv) GetConfig() *rest.Config { + return cfg +} + type TensorFusionEnvBuilder struct { *TensorFusionEnv } @@ -538,6 +547,7 @@ func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { Name: key.Name, Labels: map[string]string{ constants.LabelKeyOwner: gpuNode.Name, + constants.GpuPoolKey: b.getPoolName(poolIndex), }, }, } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index f59ada52..fabfd51c 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -19,11 +19,11 @@ package autoscaler import ( "context" "fmt" + "strings" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" - "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/aws/smithy-go/ptr" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -32,6 +32,9 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -53,16 +56,16 @@ var _ = Describe("Autoscaler", func() { Expect(err.Error()).To(ContainSubstring("must specify client")) }) - It("should return an error if there is no reallocator", func() { + It("should return an error if there is no allocator", func() { as, err := NewAutoscaler(k8sClient, nil) Expect(as).To(BeNil()) - Expect(err.Error()).To(ContainSubstring("must specify reallocator")) + Expect(err.Error()).To(ContainSubstring("must specify allocator")) }) }) Context("when loading history metrics", func() { It("should create the state of workloads and workers based on historical metrics", func() { - scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.MetricsProvider = &FakeMetricsProvider{} scaler.LoadHistoryMetrics(ctx) metrics, _ := scaler.MetricsProvider.GetHistoryMetrics() @@ -80,7 +83,7 @@ var _ = Describe("Autoscaler", func() { Build() defer tfEnv.Cleanup() - scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) Expect(scaler.WorkloadStates).To(HaveLen(0)) Expect(scaler.WorkerStates).To(HaveLen(0)) @@ -131,7 +134,7 @@ var _ = Describe("Autoscaler", func() { worker := workers[0].Name - scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) ws := scaler.WorkloadStates[workload.Name] now := time.Now() @@ -160,13 +163,14 @@ var _ = Describe("Autoscaler", func() { AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) - scaler.ResourceRecommender = &FakeOutBoundRecommender{} + scaler.ResourceRecommender = &FakeUpScalingRecommender{} rr := scaler.ResourceRecommender.GetRecommendedResources(nil) scaler.ProcessWorkloads(ctx) @@ -186,13 +190,14 @@ var _ = Describe("Autoscaler", func() { AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, &FakeAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) - scaler.ResourceRecommender = &FakeOutBoundRecommender{} + scaler.ResourceRecommender = &FakeUpScalingRecommender{} rr := scaler.ResourceRecommender.GetRecommendedResources(nil) workloadState := scaler.WorkloadStates[workload.Name] @@ -227,15 +232,16 @@ var _ = Describe("Autoscaler", func() { AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
Build() defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, &FakeFailedAllocator{}) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) - scaler.ResourceRecommender = &FakeOutBoundRecommender{} + scaler.ResourceRecommender = &FakeQuotaExceededRecommender{} rr := scaler.ResourceRecommender.GetRecommendedResources(nil) err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.WorkloadStates[workload.Name], getWorkers(workload)[0], rr) - Expect(err.Error()).To(ContainSubstring("failed to reallocate resources")) + Expect(err.Error()).To(ContainSubstring("failed to adjust allocation: scaling quota exceeded")) }) }) }) @@ -319,16 +325,6 @@ func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { type FakeAllocator struct{} -func (*FakeAllocator) Realloc(ctx context.Context, req gpuallocator.AllocRequest) error { - return nil -} - -type FakeFailedAllocator struct{} - -func (*FakeFailedAllocator) Realloc(ctx context.Context, req gpuallocator.AllocRequest) error { - return fmt.Errorf("not enough resources") -} - type FakeMetricsProvider struct { Metrics []*WorkerMetrics } @@ -358,9 +354,9 @@ func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { return metrics, nil } -type FakeOutBoundRecommender struct{} +type FakeUpScalingRecommender struct{} -func (f *FakeOutBoundRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { +func (f *FakeUpScalingRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { return &RecommendedResources{ TargetTflops: 110, LowerBoundTflops: 100, @@ -371,6 +367,19 @@ func (f *FakeOutBoundRecommender) GetRecommendedResources(_ *WorkloadState) *Rec } } +type FakeQuotaExceededRecommender struct{} + +func (f *FakeQuotaExceededRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { + return &RecommendedResources{ + TargetTflops: 9999, + LowerBoundTflops: 9999, + UpperBoundTflops: 9999, + TargetVram: 9999 * 1000 * 1000 * 1000, + LowerBoundVram: 9999 * 1000 * 1000 * 1000, + UpperBoundVram: 9999 * 1000 * 1000 * 1000, + } +} + func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { GinkgoHelper() key := client.ObjectKeyFromObject(workload) @@ -422,6 +431,7 @@ func cleanupWorkload(key client.ObjectKey) { } func assertWorkerAnnotations(worker *corev1.Pod, rr *RecommendedResources) { + GinkgoHelper() tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker) Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) @@ -445,3 +455,107 @@ func parseResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, v } return } + +func mockSchedulerLoop(ctx context.Context, cfg *rest.Config) { + ticker := time.NewTicker(50 * time.Millisecond) + clientset, err := kubernetes.NewForConfig(cfg) + if err != nil { + Expect(err).To(Succeed()) + } + for range ticker.C { + select { + case <-ctx.Done(): + return + default: + podList := &corev1.PodList{} + _ = k8sClient.List(ctx, podList) + for _, pod := range podList.Items { + if pod.Spec.NodeName != "" { + continue + } + go scheduleAndStartPod(&pod, clientset) + } + } + } +} + +func scheduleAndStartPod(pod *corev1.Pod, clientset *kubernetes.Clientset) { + // simulate scheduling cycle Filter and Reserve + allocRequest, _, err := allocator.ComposeAllocationRequest(pod) + if 
errors.IsNotFound(err) { + return + } + Expect(err).To(Succeed()) + gpus, err := allocator.Alloc(&allocRequest) + if err != nil { + // some test cases are expected to fail, just continue + return + } + Expect(gpus).To(HaveLen(int(allocRequest.Count))) + allocator.SyncGPUsToK8s() + + // update pod annotation + Eventually(func(g Gomega) { + latestPod := &corev1.Pod{} + err := k8sClient.Get(ctx, types.NamespacedName{ + Name: pod.Name, + Namespace: pod.Namespace, + }, latestPod) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + + if latestPod.Annotations == nil { + latestPod.Annotations = map[string]string{} + } + latestPod.Annotations[constants.GpuKey] = strings.Join( + lo.Map(gpus, func(gpu *tfv1.GPU, _ int) string { + return gpu.Name + }), ",") + err = k8sClient.Status().Update(ctx, latestPod) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + + // update pod node name + latestPod.Spec.NodeName = gpus[0].Status.NodeSelector[constants.KubernetesHostNameLabel] + + // simulate k8s scheduler binding cycle Bind function + binding := &corev1.Binding{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + Target: corev1.ObjectReference{ + Kind: "Node", + Name: latestPod.Spec.NodeName, + }, + } + + err = clientset.CoreV1().Pods(latestPod.Namespace).Bind(ctx, binding, metav1.CreateOptions{}) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + }).Should(Succeed()) + + // simulate kubelet start the pod successfully + patchPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + } + patchPod.Status.Phase = corev1.PodRunning + patchPod.Status.Conditions = append(patchPod.Status.Conditions, corev1.PodCondition{ + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }) + err = k8sClient.Status().Patch(ctx, patchPod, client.MergeFrom(&corev1.Pod{})) + if errors.IsNotFound(err) { + return + } + Expect(err).To(Succeed()) +} diff --git a/internal/autoscaler/recommender_test.go b/internal/autoscaler/recommender_test.go index 12f5cb17..a9ee1ff6 100644 --- a/internal/autoscaler/recommender_test.go +++ b/internal/autoscaler/recommender_test.go @@ -2,14 +2,13 @@ package autoscaler import ( . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" ) var _ = Describe("Resource Recommender", func() { Context("when getting recommended resource", func() { It("should return correct RecommendedResources based on WorkloadState and config", func() { - ws := NewWorkloadState("test") - rr := resourceRecommender{} + // ws := NewWorkloadState("test") + // rr := resourceRecommender{} }) }) }) diff --git a/internal/autoscaler/workloadstate.go b/internal/autoscaler/workloadstate.go index fa2ff0b4..d1a5d04d 100644 --- a/internal/autoscaler/workloadstate.go +++ b/internal/autoscaler/workloadstate.go @@ -23,6 +23,13 @@ const ( DefaultHistogramDecayHalfLife = time.Hour * 24 ) +type ResourceName string + +const ( + ResourceTflops ResourceName = "tflops" + ResourceVram ResourceName = "vram" +) + type WorkloadState struct { Namespace string Name string @@ -99,9 +106,9 @@ func (w *WorkloadState) GetResourceRecommenderConfig() *ResourceRecommenderConfi return &cfg } -func (w *WorkloadState) IsTargetResource(resourceName string) bool { +func (w *WorkloadState) IsTargetResource(name ResourceName) bool { target := w.AutoScalingConfig.AutoSetResources.TargetResource - return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(resourceName, target) + return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) } func (w *WorkloadState) IsAutoScalingEnabled() bool { From c25d01319cb35d9341bb4b115816da5da9cf32ea Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 4 Jul 2025 15:36:25 +0800 Subject: [PATCH 13/27] fix: linter issues --- internal/autoscaler/autoscaler.go | 6 +++--- internal/autoscaler/autoscaler_test.go | 10 +++++----- internal/autoscaler/metricsprovider.go | 8 ++++---- internal/autoscaler/workloadstate_test.go | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 08ad533d..650c9cbb 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -150,7 +150,7 @@ func (s *Autoscaler) LoadHistoryMetrics(ctx context.Context) { log := log.FromContext(ctx) log.Info("loading historical metrics") - workersMetrics, err := s.MetricsProvider.GetHistoryMetrics() + workersMetrics, err := s.GetHistoryMetrics() if err != nil { log.Error(err, "failed to get history metrics") return @@ -175,7 +175,7 @@ func (s *Autoscaler) LoadRealTimeMetrics(ctx context.Context) { log := log.FromContext(ctx) log.Info("loading realtime metrics") - workersMetrics, err := s.MetricsProvider.GetWorkersMetrics() + workersMetrics, err := s.GetWorkersMetrics() if err != nil { log.Error(err, "failed to get workers metrics") return @@ -213,7 +213,7 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { continue } - rr := s.ResourceRecommender.GetRecommendedResources(workloadState) + rr := s.GetRecommendedResources(workloadState) log.Info("recommend resources", "workload", workloadState.Name, "resources", rr) // TODO: update recommmendation status of workload diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index fabfd51c..18351d6b 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -68,7 +68,7 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.MetricsProvider = &FakeMetricsProvider{} scaler.LoadHistoryMetrics(ctx) - metrics, _ := scaler.MetricsProvider.GetHistoryMetrics() + metrics, _ := scaler.GetHistoryMetrics() for _, m := range metrics { 
Expect(scaler.WorkloadStates).To(HaveKey(m.WorkloadName)) Expect(scaler.WorkerStates).To(HaveKey(m.WorkerName)) @@ -85,8 +85,8 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) - Expect(scaler.WorkloadStates).To(HaveLen(0)) - Expect(scaler.WorkerStates).To(HaveLen(0)) + Expect(scaler.WorkloadStates).To(BeEmpty()) + Expect(scaler.WorkerStates).To(BeEmpty()) // create two workloads pool := tfEnv.GetGPUPool(0) @@ -171,7 +171,7 @@ var _ = Describe("Autoscaler", func() { scaler.LoadWorkloads(ctx) scaler.ResourceRecommender = &FakeUpScalingRecommender{} - rr := scaler.ResourceRecommender.GetRecommendedResources(nil) + rr := scaler.GetRecommendedResources(nil) scaler.ProcessWorkloads(ctx) Eventually(func(g Gomega) { @@ -198,7 +198,7 @@ var _ = Describe("Autoscaler", func() { scaler.LoadWorkloads(ctx) scaler.ResourceRecommender = &FakeUpScalingRecommender{} - rr := scaler.ResourceRecommender.GetRecommendedResources(nil) + rr := scaler.GetRecommendedResources(nil) workloadState := scaler.WorkloadStates[workload.Name] oldRes := workloadState.Resources diff --git a/internal/autoscaler/metricsprovider.go b/internal/autoscaler/metricsprovider.go index 5334ff4b..f2fa9573 100644 --- a/internal/autoscaler/metricsprovider.go +++ b/internal/autoscaler/metricsprovider.go @@ -25,10 +25,10 @@ func NewMetricsProvider(db *gorm.DB) MetricsProvider { } type greptimeDBProvider struct { - db *gorm.DB - lastQueryTime time.Time - historyLength time.Duration - historyResolution time.Duration + db *gorm.DB + lastQueryTime time.Time + // historyLength time.Duration + // historyResolution time.Duration } func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { diff --git a/internal/autoscaler/workloadstate_test.go b/internal/autoscaler/workloadstate_test.go index ac4035ae..1bfb7562 100644 --- a/internal/autoscaler/workloadstate_test.go +++ b/internal/autoscaler/workloadstate_test.go @@ -62,7 +62,7 @@ var _ = Describe("Workload State", func() { }, } cfg := ws.GetResourceRecommenderConfig() - Expect(cfg.ConfidenceInterval).To(Equal(time.Duration(30 * time.Minute))) + Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) }) It("should ignore invalid ConfidenceInterval and keep default", func() { From 3bca869bb914e959406dd3f59615c0d426119786 Mon Sep 17 00:00:00 2001 From: knave Date: Sat, 5 Jul 2025 12:53:07 +0800 Subject: [PATCH 14/27] fix: linter issues --- internal/autoscaler/autoscaler.go | 2 +- internal/autoscaler/autoscaler_test.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 650c9cbb..a456a548 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -369,7 +369,7 @@ func getInitialWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, e // Start after manager started func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { - autoScaler, err := NewAutoscaler(mgr.GetClient(), nil) + autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) if err != nil { return err } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 18351d6b..0d890a00 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -227,7 +227,7 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) }) - It("should return an error if failed to reallocate resources", func() { + It("should return an 
error if recommended resources exceeded quota", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -239,7 +239,7 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.LoadWorkloads(ctx) scaler.ResourceRecommender = &FakeQuotaExceededRecommender{} - rr := scaler.ResourceRecommender.GetRecommendedResources(nil) + rr := scaler.GetRecommendedResources(nil) err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.WorkloadStates[workload.Name], getWorkers(workload)[0], rr) Expect(err.Error()).To(ContainSubstring("failed to adjust allocation: scaling quota exceeded")) }) From 9be998c75d065bfcea013d6cbbfccc733814bbf4 Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 11 Jul 2025 09:12:53 +0800 Subject: [PATCH 15/27] refactor: support multiple recommenders --- api/v1/tensorfusionconnection_types.go | 7 + internal/autoscaler/recommender_test.go | 14 -- internal/autoscaler/workloadstate_test.go | 117 ------------- .../autoscaler/autoscaler.go | 115 +++++++------ .../autoscaler/autoscaler_suite_test.go | 4 +- .../autoscaler/autoscaler_test.go | 157 +++++++++--------- .../metrics}/metricsprovider.go | 36 ++-- .../metrics}/metricsprovider_test.go | 10 +- internal/autoscaling/recommender/cron/cron.go | 23 +++ .../recommender/percentile}/estimator.go | 23 +-- .../recommender/percentile/percentile.go} | 92 ++++++---- .../recommender/percentile/percentile_test.go | 73 ++++++++ .../autoscaling/recommender/recommender.go | 28 ++++ .../workerstate.go | 10 +- .../workloadstate.go | 71 +++----- internal/autoscaling/workloadstate_test.go | 49 ++++++ 16 files changed, 447 insertions(+), 382 deletions(-) delete mode 100644 internal/autoscaler/recommender_test.go delete mode 100644 internal/autoscaler/workloadstate_test.go rename internal/{ => autoscaling}/autoscaler/autoscaler.go (70%) rename internal/{ => autoscaling}/autoscaler/autoscaler_suite_test.go (99%) rename internal/{ => autoscaling}/autoscaler/autoscaler_test.go (76%) rename internal/{autoscaler => autoscaling/metrics}/metricsprovider.go (68%) rename internal/{autoscaler => autoscaling/metrics}/metricsprovider_test.go (90%) create mode 100644 internal/autoscaling/recommender/cron/cron.go rename internal/{autoscaler => autoscaling/recommender/percentile}/estimator.go (84%) rename internal/{autoscaler/recommender.go => autoscaling/recommender/percentile/percentile.go} (63%) create mode 100644 internal/autoscaling/recommender/percentile/percentile_test.go create mode 100644 internal/autoscaling/recommender/recommender.go rename internal/{autoscaler => autoscaling}/workerstate.go (89%) rename internal/{autoscaler => autoscaling}/workloadstate.go (58%) create mode 100644 internal/autoscaling/workloadstate_test.go diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 1b304eca..874ed4de 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -21,6 +21,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +type ResourceName string + +const ( + ResourceTflops ResourceName = "tflops" + ResourceVram ResourceName = "vram" +) + type Resource struct { Tflops resource.Quantity `json:"tflops"` Vram resource.Quantity `json:"vram"` diff --git a/internal/autoscaler/recommender_test.go b/internal/autoscaler/recommender_test.go deleted file mode 100644 index a9ee1ff6..00000000 --- a/internal/autoscaler/recommender_test.go +++ /dev/null @@ -1,14 +0,0 @@ -package autoscaler - -import ( - 
. "github.com/onsi/ginkgo/v2" -) - -var _ = Describe("Resource Recommender", func() { - Context("when getting recommended resource", func() { - It("should return correct RecommendedResources based on WorkloadState and config", func() { - // ws := NewWorkloadState("test") - // rr := resourceRecommender{} - }) - }) -}) diff --git a/internal/autoscaler/workloadstate_test.go b/internal/autoscaler/workloadstate_test.go deleted file mode 100644 index 1bfb7562..00000000 --- a/internal/autoscaler/workloadstate_test.go +++ /dev/null @@ -1,117 +0,0 @@ -package autoscaler - -import ( - "time" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -var _ = Describe("Workload State", func() { - It("should return default config when no AutoScalingConfig is set", func() { - ws := NewWorkloadState("test") - cfg := ws.GetResourceRecommenderConfig() - Expect(cfg).ToNot(BeNil()) - Expect(*cfg).To(Equal(DefaultResourceRecommenderConfig)) - }) - - It("should parse float fields from AutoSetResources", func() { - ws := NewWorkloadState("test") - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - TargetTflopsPercentile: "0.8", - LowerBoundTflopsPercentile: "0.1", - UpperBoundTflopsPercentile: "0.95", - TargetVramPercentile: "0.7", - LowerBoundVramPercentile: "0.2", - UpperBoundVramPercentile: "0.9", - RequestMarginFraction: "0.15", - }, - } - cfg := ws.GetResourceRecommenderConfig() - Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) - Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) - Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) - Expect(cfg.TargetVramPercentile).To(Equal(0.7)) - Expect(cfg.LowerBoundVramPercentile).To(Equal(0.2)) - Expect(cfg.UpperBoundVramPercentile).To(Equal(0.9)) - Expect(cfg.RequestMarginFraction).To(Equal(0.15)) - }) - - It("should ignore invalid float fields and keep defaults", func() { - ws := NewWorkloadState("test") - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - TargetTflopsPercentile: "not-a-float", - LowerBoundTflopsPercentile: "", - UpperBoundTflopsPercentile: "0.99", - }, - } - cfg := ws.GetResourceRecommenderConfig() - Expect(cfg.TargetTflopsPercentile).To(Equal(DefaultResourceRecommenderConfig.TargetTflopsPercentile)) - Expect(cfg.LowerBoundTflopsPercentile).To(Equal(DefaultResourceRecommenderConfig.LowerBoundTflopsPercentile)) - Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) - }) - - It("should parse ConfidenceInterval if valid", func() { - ws := NewWorkloadState("test") - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - ConfidenceInterval: "30m", - }, - } - cfg := ws.GetResourceRecommenderConfig() - Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) - }) - - It("should ignore invalid ConfidenceInterval and keep default", func() { - ws := NewWorkloadState("test") - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - ConfidenceInterval: "not-a-duration", - }, - } - cfg := ws.GetResourceRecommenderConfig() - Expect(cfg.ConfidenceInterval).To(Equal(DefaultResourceRecommenderConfig.ConfidenceInterval)) - }) - - It("should correctly determine if a resource is the target based on config", func() { - ws := NewWorkloadState("test") - - Expect(ws.IsTargetResource("tflops")).To(BeTrue()) - Expect(ws.IsTargetResource("vram")).To(BeTrue()) - - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: 
tfv1.AutoSetResources{TargetResource: "all"}, - } - - Expect(ws.IsTargetResource("tflops")).To(BeTrue()) - Expect(ws.IsTargetResource("vram")).To(BeTrue()) - - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"}, - } - Expect(ws.IsTargetResource("tflops")).To(BeTrue()) - Expect(ws.IsTargetResource("vram")).To(BeFalse()) - - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"}, - } - Expect(ws.IsTargetResource("tflops")).To(BeFalse()) - Expect(ws.IsTargetResource("vram")).To(BeTrue()) - }) - - It("should correctly determine if auto scaling is enabled based on config", func() { - ws := NewWorkloadState("test") - - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{Enable: true}, - } - Expect(ws.IsAutoScalingEnabled()).To(BeTrue()) - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{Enable: false}, - } - Expect(ws.IsAutoScalingEnabled()).To(BeFalse()) - }) -}) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaling/autoscaler/autoscaler.go similarity index 70% rename from internal/autoscaler/autoscaler.go rename to internal/autoscaling/autoscaler/autoscaler.go index a456a548..6a1e498b 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaling/autoscaler/autoscaler.go @@ -8,6 +8,9 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" "github.com/samber/lo" @@ -26,14 +29,14 @@ var ( type Autoscaler struct { client.Client - allocator *gpuallocator.GpuAllocator - ResourceRecommender - MetricsProvider - WorkloadStates map[string]*WorkloadState - WorkerStates map[string]*WorkerState + allocator *gpuallocator.GpuAllocator + recommenders []recommender.Interface + metricsProvider metrics.Provider + workloadStates map[string]*autoscaling.WorkloadState + workerStates map[string]*autoscaling.WorkerState } -func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { +func New(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { if c == nil { return nil, errors.New("must specify client") } @@ -42,13 +45,18 @@ func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Auto return nil, errors.New("must specify allocator") } + recommenders := []recommender.Interface{ + recommender.New(recommender.PercentileRecommender), + recommender.New(recommender.CronRecommender), + } + return &Autoscaler{ - Client: c, - allocator: allocator, - ResourceRecommender: NewResourceRecommender(), - MetricsProvider: NewMetricsProvider(nil), - WorkloadStates: map[string]*WorkloadState{}, - WorkerStates: map[string]*WorkerState{}, + Client: c, + allocator: allocator, + recommenders: recommenders, + metricsProvider: metrics.NewProvider(nil), + workloadStates: map[string]*autoscaling.WorkloadState{}, + workerStates: map[string]*autoscaling.WorkerState{}, }, nil } @@ -100,14 +108,14 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { } workloadName := workload.Name - workloadState, exists := s.WorkloadStates[workloadName] + workloadState, exists := s.workloadStates[workloadName] if !exists { - workloadState 
= NewWorkloadState(workloadName) + workloadState = autoscaling.NewWorkloadState(workloadName) } workloadState.Namespace = workload.Namespace workloadState.Resources = workload.Spec.Resources workloadState.AutoScalingConfig = workload.Spec.AutoScalingConfig - s.WorkloadStates[workloadName] = workloadState + s.workloadStates[workloadName] = workloadState observedWorkloads[workloadName] = true @@ -124,24 +132,24 @@ func (s *Autoscaler) LoadWorkloads(ctx context.Context) { if !worker.DeletionTimestamp.IsZero() { continue } - if _, exists := s.WorkerStates[worker.Name]; !exists { - s.WorkerStates[worker.Name] = NewWorkerState(worker.Name, workloadName) + if _, exists := s.workerStates[worker.Name]; !exists { + s.workerStates[worker.Name] = autoscaling.NewWorkerState(worker.Name, workloadName) } observedWorkers[worker.Name] = true } - s.WorkerStates = lo.OmitBy(s.WorkerStates, func(key string, state *WorkerState) bool { + s.workerStates = lo.OmitBy(s.workerStates, func(key string, state *autoscaling.WorkerState) bool { return state.Workload == workloadName && !observedWorkers[key] }) } // remove unused workloadStates - s.WorkloadStates = lo.OmitBy(s.WorkloadStates, func(key string, _ *WorkloadState) bool { + s.workloadStates = lo.OmitBy(s.workloadStates, func(key string, _ *autoscaling.WorkloadState) bool { return !observedWorkloads[key] }) // remove unused workerStates - s.WorkerStates = lo.OmitBy(s.WorkerStates, func(_ string, state *WorkerState) bool { + s.workerStates = lo.OmitBy(s.workerStates, func(_ string, state *autoscaling.WorkerState) bool { return !observedWorkloads[state.Workload] }) } @@ -150,21 +158,21 @@ func (s *Autoscaler) LoadHistoryMetrics(ctx context.Context) { log := log.FromContext(ctx) log.Info("loading historical metrics") - workersMetrics, err := s.GetHistoryMetrics() + workersMetrics, err := s.metricsProvider.GetHistoryMetrics() if err != nil { log.Error(err, "failed to get history metrics") return } for _, metrics := range workersMetrics { - workloadState, exists := s.WorkloadStates[metrics.WorkloadName] + workloadState, exists := s.workloadStates[metrics.WorkloadName] if !exists { - workloadState = NewWorkloadState(metrics.WorkloadName) - s.WorkloadStates[metrics.WorkloadName] = workloadState + workloadState = autoscaling.NewWorkloadState(metrics.WorkloadName) + s.workloadStates[metrics.WorkloadName] = workloadState } - workerState, exists := s.WorkerStates[metrics.WorkerName] + workerState, exists := s.workerStates[metrics.WorkerName] if !exists { - workerState = NewWorkerState(metrics.WorkerName, metrics.WorkloadName) - s.WorkerStates[metrics.WorkerName] = workerState + workerState = autoscaling.NewWorkerState(metrics.WorkerName, metrics.WorkloadName) + s.workerStates[metrics.WorkerName] = workerState } s.addSamples(workloadState, workerState, metrics) @@ -175,18 +183,18 @@ func (s *Autoscaler) LoadRealTimeMetrics(ctx context.Context) { log := log.FromContext(ctx) log.Info("loading realtime metrics") - workersMetrics, err := s.GetWorkersMetrics() + workersMetrics, err := s.metricsProvider.GetWorkersMetrics() if err != nil { log.Error(err, "failed to get workers metrics") return } for _, metrics := range workersMetrics { - workloadState, workloadExists := s.WorkloadStates[metrics.WorkloadName] + workloadState, workloadExists := s.workloadStates[metrics.WorkloadName] if !workloadExists { continue } - workerState, workerExists := s.WorkerStates[metrics.WorkerName] + workerState, workerExists := s.workerStates[metrics.WorkerName] if !workerExists { continue } @@ -199,8 
+207,7 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { log := log.FromContext(ctx) log.Info("processing workloads") - for _, workloadState := range s.WorkloadStates { - // TODO: continue if histogram is empty + for _, workloadState := range s.workloadStates { podList := &corev1.PodList{} if err := s.List(ctx, podList, client.InNamespace(workloadState.Namespace), @@ -213,8 +220,8 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { continue } - rr := s.GetRecommendedResources(workloadState) - log.Info("recommend resources", "workload", workloadState.Name, "resources", rr) + s.recommenders[0].Recommend(workloadState) + log.Info("recommended resources", "workload", workloadState.Name, "resources", workloadState.Recommendation) // TODO: update recommmendation status of workload @@ -227,32 +234,34 @@ func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { continue } - if err := s.updateWorkerResourcesIfNeeded(ctx, workloadState, &worker, rr); err != nil { + if err := s.updateWorkerResourcesIfNeeded(ctx, workloadState, &worker); err != nil { log.Error(err, "failed to update worker") } } } } -func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workloadState *WorkloadState, worker *corev1.Pod, rr *RecommendedResources) error { +func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workloadState *autoscaling.WorkloadState, worker *corev1.Pod) error { log := log.FromContext(ctx) - adjustRequest, err := getInitialWorkerResourceRequest(worker) + adjustRequest, err := getCurrentWorkerResourceRequest(worker) if err != nil { - return fmt.Errorf("failed to get initial worker resource request, %v", err) + return fmt.Errorf("failed to get current worker resource request, %v", err) } + + rr := &workloadState.Recommendation resourcesInfo := []struct { - name ResourceName + name tfv1.ResourceName requestKey string limitKey string request *resource.Quantity limit *resource.Quantity - lowerBound ResourceAmount - upperBound ResourceAmount - target ResourceAmount + lowerBound resource.Quantity + upperBound resource.Quantity + target resource.Quantity }{ { - name: ResourceTflops, + name: tfv1.ResourceTflops, requestKey: constants.TFLOPSRequestAnnotation, limitKey: constants.TFLOPSLimitAnnotation, request: &adjustRequest.NewRequest.Tflops, @@ -262,7 +271,7 @@ func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workload target: rr.TargetTflops, }, { - name: ResourceVram, + name: tfv1.ResourceVram, requestKey: constants.VRAMRequestAnnotation, limitKey: constants.VRAMLimitAnnotation, request: &adjustRequest.NewRequest.Vram, @@ -279,10 +288,10 @@ func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workload if !workloadState.IsTargetResource(resInfo.name) { continue } - upScaling = resInfo.request.Cmp(QuantityFromAmount(resInfo.lowerBound)) < 0 - downScaling = resInfo.request.Cmp(QuantityFromAmount(resInfo.upperBound)) > 0 + upScaling = resInfo.request.Cmp(resInfo.lowerBound) < 0 + downScaling = resInfo.request.Cmp(resInfo.upperBound) > 0 if upScaling || downScaling { - targetRequest := QuantityFromAmount(resInfo.target) + targetRequest := resInfo.target targetLimit := getProportionalLimit(resInfo.limit, resInfo.request, &targetRequest) if targetLimit == nil { return fmt.Errorf("failed to get limit for %s", resInfo.requestKey) @@ -313,10 +322,10 @@ func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workload return nil } -func (*Autoscaler) addSamples(workloadState *WorkloadState, workerState 
*WorkerState, metrics *WorkerMetrics) { - workerState.AddTflopsSample(workloadState, metrics) - workerState.AddVramSample(workloadState, metrics) - workloadState.UpdateSampleStats(metrics) +func (*Autoscaler) addSamples(workloadState *autoscaling.WorkloadState, workerState *autoscaling.WorkerState, sample *metrics.WorkerUsage) { + workerState.AddTflopsSample(workloadState, sample) + workerState.AddVramSample(workloadState, sample) + workloadState.UpdateSampleStats(sample) } func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { @@ -339,7 +348,7 @@ func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *re return nil } -func getInitialWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, error) { +func getCurrentWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, error) { adjustRequest := tfv1.AdjustRequest{ PodUID: string(worker.UID), IsScaleUp: false, @@ -369,7 +378,7 @@ func getInitialWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, e // Start after manager started func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { - autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) + autoScaler, err := New(mgr.GetClient(), allocator) if err != nil { return err } diff --git a/internal/autoscaler/autoscaler_suite_test.go b/internal/autoscaling/autoscaler/autoscaler_suite_test.go similarity index 99% rename from internal/autoscaler/autoscaler_suite_test.go rename to internal/autoscaling/autoscaler/autoscaler_suite_test.go index 6eb9d869..7cdc52ea 100644 --- a/internal/autoscaler/autoscaler_suite_test.go +++ b/internal/autoscaling/autoscaler/autoscaler_suite_test.go @@ -89,7 +89,7 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, ErrorIfCRDPathMissing: true, // The BinaryAssetsDirectory is only required if you want to run the tests directly @@ -97,7 +97,7 @@ var _ = BeforeSuite(func() { // default path defined in controller-runtime which is /usr/local/kubebuilder/. // Note that you must have the required binaries setup under the bin directory to perform // the tests directly. When we run make test it will be setup and used automatically. - BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + BinaryAssetsDirectory: filepath.Join("..", "..", "..", "bin", "k8s", fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaling/autoscaler/autoscaler_test.go similarity index 76% rename from internal/autoscaler/autoscaler_test.go rename to internal/autoscaling/autoscaler/autoscaler_test.go index 0d890a00..7cd76db7 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaling/autoscaler/autoscaler_test.go @@ -23,6 +23,9 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/aws/smithy-go/ptr" . 
"github.com/onsi/ginkgo/v2" @@ -51,13 +54,13 @@ import ( var _ = Describe("Autoscaler", func() { Context("when creating an autoscaler", func() { It("should return an error if there is no client", func() { - as, err := NewAutoscaler(nil, nil) + as, err := New(nil, nil) Expect(as).To(BeNil()) Expect(err.Error()).To(ContainSubstring("must specify client")) }) It("should return an error if there is no allocator", func() { - as, err := NewAutoscaler(k8sClient, nil) + as, err := New(k8sClient, nil) Expect(as).To(BeNil()) Expect(err.Error()).To(ContainSubstring("must specify allocator")) }) @@ -65,13 +68,13 @@ var _ = Describe("Autoscaler", func() { Context("when loading history metrics", func() { It("should create the state of workloads and workers based on historical metrics", func() { - scaler, _ := NewAutoscaler(k8sClient, allocator) - scaler.MetricsProvider = &FakeMetricsProvider{} + scaler, _ := New(k8sClient, allocator) + scaler.metricsProvider = &FakeMetricsProvider{} scaler.LoadHistoryMetrics(ctx) - metrics, _ := scaler.GetHistoryMetrics() + metrics, _ := scaler.metricsProvider.GetHistoryMetrics() for _, m := range metrics { - Expect(scaler.WorkloadStates).To(HaveKey(m.WorkloadName)) - Expect(scaler.WorkerStates).To(HaveKey(m.WorkerName)) + Expect(scaler.workloadStates).To(HaveKey(m.WorkloadName)) + Expect(scaler.workerStates).To(HaveKey(m.WorkerName)) } }) }) @@ -83,10 +86,10 @@ var _ = Describe("Autoscaler", func() { Build() defer tfEnv.Cleanup() - scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler, _ := New(k8sClient, allocator) scaler.LoadWorkloads(ctx) - Expect(scaler.WorkloadStates).To(BeEmpty()) - Expect(scaler.WorkerStates).To(BeEmpty()) + Expect(scaler.workloadStates).To(BeEmpty()) + Expect(scaler.workerStates).To(BeEmpty()) // create two workloads pool := tfEnv.GetGPUPool(0) @@ -98,26 +101,26 @@ var _ = Describe("Autoscaler", func() { workload1Workers := getWorkers(workload1) scaler.LoadWorkloads(ctx) - Expect(scaler.WorkloadStates).To(HaveLen(2)) - Expect(scaler.WorkloadStates).To(HaveKey(workload0.Name)) - Expect(scaler.WorkloadStates).To(HaveKey(workload1.Name)) - Expect(scaler.WorkerStates).To(HaveLen(3)) - Expect(scaler.WorkerStates).To(HaveKey(workload0Workers[0].Name)) - Expect(scaler.WorkerStates).To(HaveKey(workload0Workers[1].Name)) - Expect(scaler.WorkerStates).To(HaveKey(workload1Workers[0].Name)) + Expect(scaler.workloadStates).To(HaveLen(2)) + Expect(scaler.workloadStates).To(HaveKey(workload0.Name)) + Expect(scaler.workloadStates).To(HaveKey(workload1.Name)) + Expect(scaler.workerStates).To(HaveLen(3)) + Expect(scaler.workerStates).To(HaveKey(workload0Workers[0].Name)) + Expect(scaler.workerStates).To(HaveKey(workload0Workers[1].Name)) + Expect(scaler.workerStates).To(HaveKey(workload1Workers[0].Name)) updateWorkloadReplicas(workload0, 1) scaler.LoadWorkloads(ctx) - Expect(scaler.WorkerStates).To(HaveLen(2)) + Expect(scaler.workerStates).To(HaveLen(2)) deleteWorkload(workload0) deleteWorkload(workload1) scaler.LoadWorkloads(ctx) - Expect(scaler.WorkloadStates).NotTo(HaveKey(workload0.Name)) - Expect(scaler.WorkerStates).NotTo(HaveKey(workload0Workers[0].Name)) - Expect(scaler.WorkerStates).NotTo(HaveKey(workload0Workers[1].Name)) - Expect(scaler.WorkloadStates).NotTo(HaveKey(workload1.Name)) - Expect(scaler.WorkerStates).NotTo(HaveKey(workload1Workers[0].Name)) + Expect(scaler.workloadStates).NotTo(HaveKey(workload0.Name)) + Expect(scaler.workerStates).NotTo(HaveKey(workload0Workers[0].Name)) + 
Expect(scaler.workerStates).NotTo(HaveKey(workload0Workers[1].Name)) + Expect(scaler.workloadStates).NotTo(HaveKey(workload1.Name)) + Expect(scaler.workerStates).NotTo(HaveKey(workload1Workers[0].Name)) }) }) @@ -134,25 +137,25 @@ var _ = Describe("Autoscaler", func() { worker := workers[0].Name - scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler, _ := New(k8sClient, allocator) scaler.LoadWorkloads(ctx) - ws := scaler.WorkloadStates[workload.Name] + ws := scaler.workloadStates[workload.Name] now := time.Now() - metrics := &WorkerMetrics{ + usage := &metrics.WorkerUsage{ WorkloadName: workload.Name, WorkerName: worker, - TflopsUsage: ResourceAmount(12.0), + TflopsUsage: 12.0, VramUsage: 9000, Timestamp: now, } - scaler.MetricsProvider = &FakeMetricsProvider{[]*WorkerMetrics{metrics}} + scaler.metricsProvider = &FakeMetricsProvider{[]*metrics.WorkerUsage{usage}} scaler.LoadRealTimeMetrics(ctx) - Expect(scaler.WorkerStates[worker].LastTflopsSampleTime).To(Equal(metrics.Timestamp)) + Expect(scaler.workerStates[worker].LastTflopsSampleTime).To(Equal(usage.Timestamp)) Expect(ws.TflopsHistogram.IsEmpty()).To(BeFalse()) - Expect(scaler.WorkerStates[worker].VramPeak).To(Equal(metrics.VramUsage)) - Expect(scaler.WorkerStates[worker].LastVramSampleTime).To(Equal(metrics.Timestamp)) + Expect(scaler.workerStates[worker].VramPeak).To(Equal(usage.VramUsage)) + Expect(scaler.workerStates[worker].LastVramSampleTime).To(Equal(usage.Timestamp)) Expect(ws.VramHistogram.IsEmpty()).To(BeFalse()) }) }) @@ -167,13 +170,13 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler, _ := New(k8sClient, allocator) scaler.LoadWorkloads(ctx) - scaler.ResourceRecommender = &FakeUpScalingRecommender{} - rr := scaler.GetRecommendedResources(nil) - + scaler.recommenders[0] = &FakeUpScalingRecommender{} scaler.ProcessWorkloads(ctx) + + rr := scaler.workloadStates[workload.Name].Recommendation Eventually(func(g Gomega) { assertWorkerAnnotations(getWorkers(workload)[0], rr) }).Should(Succeed()) @@ -194,13 +197,12 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler, _ := New(k8sClient, allocator) scaler.LoadWorkloads(ctx) - scaler.ResourceRecommender = &FakeUpScalingRecommender{} - rr := scaler.GetRecommendedResources(nil) + scaler.recommenders[0] = &FakeUpScalingRecommender{} - workloadState := scaler.WorkloadStates[workload.Name] + workloadState := scaler.workloadStates[workload.Name] oldRes := workloadState.Resources // verify IsAutoScalingEnabled @@ -218,10 +220,11 @@ var _ = Describe("Autoscaler", func() { workloadState.AutoScalingConfig.AutoSetResources.Enable = true workloadState.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" scaler.ProcessWorkloads(ctx) + rr := scaler.workloadStates[workload.Name].Recommendation Eventually(func(g Gomega) { tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) - Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) - Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) + Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) + Expect(tflopsLimit.Value()).To(Equal(rr.TargetTflops.Value() * 2)) Expect(vramRequest.Equal(oldRes.Requests.Vram)).To(BeTrue()) Expect(vramLimit.Equal(oldRes.Limits.Vram)).To(BeTrue()) 
}).Should(Succeed()) @@ -236,11 +239,11 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler, _ := New(k8sClient, allocator) scaler.LoadWorkloads(ctx) - scaler.ResourceRecommender = &FakeQuotaExceededRecommender{} - rr := scaler.GetRecommendedResources(nil) - err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.WorkloadStates[workload.Name], getWorkers(workload)[0], rr) + scaler.recommenders[0] = &FakeQuotaExceededRecommender{} + scaler.ProcessWorkloads(ctx) + err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.workloadStates[workload.Name], getWorkers(workload)[0]) Expect(err.Error()).To(ContainSubstring("failed to adjust allocation: scaling quota exceeded")) }) }) @@ -326,24 +329,24 @@ func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { type FakeAllocator struct{} type FakeMetricsProvider struct { - Metrics []*WorkerMetrics + Metrics []*metrics.WorkerUsage } -func (f *FakeMetricsProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { +func (f *FakeMetricsProvider) GetWorkersMetrics() ([]*metrics.WorkerUsage, error) { return f.Metrics, nil } -func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { - metrics := []*WorkerMetrics{} +func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*metrics.WorkerUsage, error) { + sample := []*metrics.WorkerUsage{} startTime := time.Now().Add(-8 * 24 * time.Hour) for day := 0; day < 8; day++ { for hour := 0; hour < 1; hour++ { for minute := 0; minute < 60; minute++ { // idx := day*24 + hour - metrics = append(metrics, &WorkerMetrics{ + sample = append(sample, &metrics.WorkerUsage{ WorkloadName: "workload-0", WorkerName: fmt.Sprintf("worker-%d", 1), - TflopsUsage: ResourceAmount(100.0), + TflopsUsage: 100.0, VramUsage: 1 * 1000 * 1000 * 1000, Timestamp: startTime.Add(time.Duration(day*24+hour)*time.Hour + time.Duration(minute)*time.Minute), }) @@ -351,32 +354,36 @@ func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { } } - return metrics, nil + return sample, nil } -type FakeUpScalingRecommender struct{} +type FakeUpScalingRecommender struct { + recommender.Interface +} -func (f *FakeUpScalingRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { - return &RecommendedResources{ - TargetTflops: 110, - LowerBoundTflops: 100, - UpperBoundTflops: 120, - TargetVram: 110 * 1000 * 1000 * 1000, - LowerBoundVram: 100 * 1000 * 1000 * 1000, - UpperBoundVram: 120 * 1000 * 1000 * 1000, +func (f *FakeUpScalingRecommender) Recommend(w *autoscaling.WorkloadState) { + w.Recommendation = autoscaling.RecommendedResources{ + TargetTflops: resource.MustParse("110"), + LowerBoundTflops: resource.MustParse("100"), + UpperBoundTflops: resource.MustParse("120"), + TargetVram: resource.MustParse("110Gi"), + LowerBoundVram: resource.MustParse("100Gi"), + UpperBoundVram: resource.MustParse("120Gi"), } } -type FakeQuotaExceededRecommender struct{} +type FakeQuotaExceededRecommender struct { + recommender.Interface +} -func (f *FakeQuotaExceededRecommender) GetRecommendedResources(_ *WorkloadState) *RecommendedResources { - return &RecommendedResources{ - TargetTflops: 9999, - LowerBoundTflops: 9999, - UpperBoundTflops: 9999, - TargetVram: 9999 * 1000 * 1000 * 1000, - LowerBoundVram: 9999 * 1000 * 1000 * 1000, - UpperBoundVram: 9999 * 1000 * 1000 * 1000, +func (f *FakeQuotaExceededRecommender) Recommend(w *autoscaling.WorkloadState) { + w.Recommendation = 
autoscaling.RecommendedResources{ + TargetTflops: resource.MustParse("9999"), + LowerBoundTflops: resource.MustParse("9999"), + UpperBoundTflops: resource.MustParse("9999"), + TargetVram: resource.MustParse("999Gi"), + LowerBoundVram: resource.MustParse("999Gi"), + UpperBoundVram: resource.MustParse("999Gi"), } } @@ -430,13 +437,13 @@ func cleanupWorkload(key client.ObjectKey) { }).Should(Succeed()) } -func assertWorkerAnnotations(worker *corev1.Pod, rr *RecommendedResources) { +func assertWorkerAnnotations(worker *corev1.Pod, rr autoscaling.RecommendedResources) { GinkgoHelper() tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker) - Expect(tflopsRequest.Value()).To(Equal(int64(rr.TargetTflops))) - Expect(tflopsLimit.Value()).To(Equal(int64(rr.TargetTflops * 2))) - Expect(vramRequest.Value()).To(Equal(int64(rr.TargetVram))) - Expect(vramLimit.Value()).To(Equal(int64(rr.TargetVram * 2))) + Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) + Expect(tflopsLimit.Value()).To(Equal(rr.TargetTflops.Value() * 2)) + Expect(vramRequest.Value()).To(Equal(rr.TargetVram.Value())) + Expect(vramLimit.Value()).To(Equal(rr.TargetVram.Value() * 2)) } func parseResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, vramRequest, vramLimit resource.Quantity) { diff --git a/internal/autoscaler/metricsprovider.go b/internal/autoscaling/metrics/metricsprovider.go similarity index 68% rename from internal/autoscaler/metricsprovider.go rename to internal/autoscaling/metrics/metricsprovider.go index f2fa9573..e35f6911 100644 --- a/internal/autoscaler/metricsprovider.go +++ b/internal/autoscaling/metrics/metricsprovider.go @@ -1,4 +1,4 @@ -package autoscaler +package metrics import ( "time" @@ -7,20 +7,20 @@ import ( "gorm.io/gorm" ) -type WorkerMetrics struct { +type WorkerUsage struct { WorkloadName string WorkerName string - TflopsUsage ResourceAmount - VramUsage ResourceAmount + TflopsUsage float64 + VramUsage uint64 Timestamp time.Time } -type MetricsProvider interface { - GetWorkersMetrics() ([]*WorkerMetrics, error) - GetHistoryMetrics() ([]*WorkerMetrics, error) +type Provider interface { + GetWorkersMetrics() ([]*WorkerUsage, error) + GetHistoryMetrics() ([]*WorkerUsage, error) } -func NewMetricsProvider(db *gorm.DB) MetricsProvider { +func NewProvider(db *gorm.DB) Provider { return &greptimeDBProvider{db: db} } @@ -31,7 +31,7 @@ type greptimeDBProvider struct { // historyResolution time.Duration } -func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { +func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerUsage, error) { data := []*metrics.HypervisorWorkerUsageMetrics{} now := time.Now() // actual meaning: max(avg[10s])[1m] @@ -48,13 +48,13 @@ func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerMetrics, error) { g.lastQueryTime = now - workersMetrics := make([]*WorkerMetrics, 0, len(data)) + workersMetrics := make([]*WorkerUsage, 0, len(data)) for _, row := range data { - workersMetrics = append(workersMetrics, &WorkerMetrics{ + workersMetrics = append(workersMetrics, &WorkerUsage{ WorkloadName: row.WorkloadName, WorkerName: row.WorkerName, - TflopsUsage: resourceAmountFromFloat(row.ComputeTflops), - VramUsage: ResourceAmount(row.VRAMBytes), + TflopsUsage: row.ComputeTflops, + VramUsage: row.VRAMBytes, Timestamp: row.Timestamp, }) } @@ -67,7 +67,7 @@ type hypervisorWorkerUsageMetrics struct { TimeWindow time.Time `gorm:"column:time_window;index:,class:TIME"` } -func (g *greptimeDBProvider) GetHistoryMetrics() 
([]*WorkerMetrics, error) { +func (g *greptimeDBProvider) GetHistoryMetrics() ([]*WorkerUsage, error) { data := []*hypervisorWorkerUsageMetrics{} now := time.Now() // TODO: replace using iteration for handling large datasets efficiently @@ -85,13 +85,13 @@ func (g *greptimeDBProvider) GetHistoryMetrics() ([]*WorkerMetrics, error) { g.lastQueryTime = now - workersMetrics := make([]*WorkerMetrics, 0, len(data)) + workersMetrics := make([]*WorkerUsage, 0, len(data)) for _, row := range data { - workersMetrics = append(workersMetrics, &WorkerMetrics{ + workersMetrics = append(workersMetrics, &WorkerUsage{ WorkloadName: row.WorkloadName, WorkerName: row.WorkerName, - TflopsUsage: resourceAmountFromFloat(row.ComputeTflops), - VramUsage: ResourceAmount(row.VRAMBytes), + TflopsUsage: row.ComputeTflops, + VramUsage: row.VRAMBytes, Timestamp: row.TimeWindow, }) } diff --git a/internal/autoscaler/metricsprovider_test.go b/internal/autoscaling/metrics/metricsprovider_test.go similarity index 90% rename from internal/autoscaler/metricsprovider_test.go rename to internal/autoscaling/metrics/metricsprovider_test.go index dbd6f968..916c050d 100644 --- a/internal/autoscaler/metricsprovider_test.go +++ b/internal/autoscaling/metrics/metricsprovider_test.go @@ -1,4 +1,4 @@ -package autoscaler +package metrics import ( "regexp" @@ -46,8 +46,8 @@ var _ = Describe("MetricsProvider", func() { Expect(got).To(HaveLen(2)) Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) - Expect(got[0].VramUsage).To(Equal(ResourceAmount(fakeMetrics[0].VRAMBytes))) - Expect(got[0].TflopsUsage).To(Equal(resourceAmountFromFloat(fakeMetrics[0].ComputeTflops))) + Expect(got[0].VramUsage).To(Equal(fakeMetrics[0].VRAMBytes)) + Expect(got[0].TflopsUsage).To(Equal(fakeMetrics[0].ComputeTflops)) Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].Timestamp)) }) }) @@ -91,8 +91,8 @@ var _ = Describe("MetricsProvider", func() { Expect(got).To(HaveLen(2)) Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) - Expect(got[0].VramUsage).To(Equal(ResourceAmount(fakeMetrics[0].VRAMBytes))) - Expect(got[0].TflopsUsage).To(Equal(resourceAmountFromFloat(fakeMetrics[0].ComputeTflops))) + Expect(got[0].VramUsage).To(Equal(fakeMetrics[0].VRAMBytes)) + Expect(got[0].TflopsUsage).To(Equal(fakeMetrics[0].ComputeTflops)) Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].TimeWindow)) }) }) diff --git a/internal/autoscaling/recommender/cron/cron.go b/internal/autoscaling/recommender/cron/cron.go new file mode 100644 index 00000000..70185959 --- /dev/null +++ b/internal/autoscaling/recommender/cron/cron.go @@ -0,0 +1,23 @@ +package cron + +import ( + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling" +) + +type CronRecommender struct{} + +func New() *CronRecommender { + return &CronRecommender{} +} + +func (c *CronRecommender) Name() string { + return "cron" +} + +func (c *CronRecommender) Recommend(w *autoscaling.WorkloadState) { + c.getCronConfig(&w.AutoScalingConfig) +} + +func (c *CronRecommender) getCronConfig(asc *tfv1.AutoScalingConfig) { +} diff --git a/internal/autoscaler/estimator.go b/internal/autoscaling/recommender/percentile/estimator.go similarity index 84% rename from internal/autoscaler/estimator.go rename to internal/autoscaling/recommender/percentile/estimator.go index bbd44d7a..14ef67d9 100644 --- a/internal/autoscaler/estimator.go +++ 
b/internal/autoscaling/recommender/percentile/estimator.go @@ -1,9 +1,10 @@ -package autoscaler +package percentile import ( "math" "time" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling" "k8s.io/apimachinery/pkg/api/resource" ) @@ -37,7 +38,7 @@ func resourceAmountFromFloat(amount float64) ResourceAmount { } type VramEstimator interface { - GetVramEstimation(s *WorkloadState) ResourceAmount + GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount } type percentileVramEstimator struct { @@ -49,7 +50,7 @@ func NewPercentileVramEstimator(percentile float64) VramEstimator { return &percentileVramEstimator{percentile} } -func (e *percentileVramEstimator) GetVramEstimation(s *WorkloadState) ResourceAmount { +func (e *percentileVramEstimator) GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount { return resourceAmountFromFloat(float64(s.VramHistogram.Percentile(e.percentile))) } @@ -64,7 +65,7 @@ func WithVramMargin(marginFraction float64, baseEstimator VramEstimator) VramEst } // GetvramEstimation returns the vram estimation for the given AggregateContainerState. -func (e *vramMarginEstimator) GetVramEstimation(s *WorkloadState) ResourceAmount { +func (e *vramMarginEstimator) GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount { base := e.baseEstimator.GetVramEstimation(s) margin := resourceAmountFromFloat(float64(base) * e.marginFraction) return base + margin @@ -87,14 +88,14 @@ func WithVramConfidenceMultiplier(multiplier, exponent float64, baseEstimator Vr } } -func (e *vramConfidenceMultiplier) GetVramEstimation(s *WorkloadState) ResourceAmount { +func (e *vramConfidenceMultiplier) GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount { confidence := getConfidence(s, e.confidenceInterval) base := e.baseEstimator.GetVramEstimation(s) return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent)) } type TflopsEstimator interface { - GetTflopsEstimation(s *WorkloadState) ResourceAmount + GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount } type percentileTflopsEstimator struct { @@ -106,7 +107,7 @@ func NewPercentileTflopsEstimator(percentile float64) TflopsEstimator { return &percentileTflopsEstimator{percentile} } -func (e *percentileTflopsEstimator) GetTflopsEstimation(s *WorkloadState) ResourceAmount { +func (e *percentileTflopsEstimator) GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount { return resourceAmountFromFloat(float64(s.TflopsHistogram.Percentile(e.percentile))) } @@ -121,7 +122,7 @@ func WithTflopsMargin(marginFraction float64, baseEstimator TflopsEstimator) Tfl } // GetTflopsEstimation returns the tflops estimation for the given AggregateContainerState. 
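// Editor's note (illustrative, not part of the patch): these estimators compose as decorators
// around a percentile base. Assuming a 100-TFLOPS percentile sample, a margin fraction of 0.15,
// a confidence of 2 (see getConfidence below), and reusing the (1.0, 1.0) / (0.001, -2.0)
// multiplier parameters shown for the VRAM bounds in createEstimatorsFromConfig:
//   withMargin = 100 * (1 + 0.15)          = 115
//   upperBound = 115 * (1 + 1.0/2)^1.0     = 172.5
//   lowerBound = 115 * (1 + 0.001/2)^-2.0  ≈ 114.9
// so both bounds converge toward the margin-adjusted percentile as more history accumulates.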
-func (e *tflopsMarginEstimator) GetTflopsEstimation(s *WorkloadState) ResourceAmount { +func (e *tflopsMarginEstimator) GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount { base := e.baseEstimator.GetTflopsEstimation(s) margin := resourceAmountFromFloat(float64(base) * e.marginFraction) return base + margin @@ -144,19 +145,19 @@ func WithTflopsConfidenceMultiplier(multiplier, exponent float64, baseEstimator } } -func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(s *WorkloadState) ResourceAmount { +func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount { confidence := getConfidence(s, e.confidenceInterval) base := e.baseEstimator.GetTflopsEstimation(s) return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent)) } // Returns a non-negative real number that heuristically measures how much -// confidence the history aggregated in the WorkloadState provides. +// confidence the history aggregated in the AggregateState provides. // For a workload producing a steady stream of samples over N days at the rate // of 1 sample per minute, this metric is equal to N. // This implementation is a very simple heuristic which looks at the total count // of samples and the time between the first and the last sample. -func getConfidence(s *WorkloadState, confidenceInterval time.Duration) float64 { +func getConfidence(s *autoscaling.WorkloadState, confidenceInterval time.Duration) float64 { // Distance between the first and the last observed sample time, measured in days. lifespanInDays := float64(s.LastSampleStart.Sub(s.FirstSampleStart)) / float64(confidenceInterval) // Total count of samples normalized such that it equals the number of days for diff --git a/internal/autoscaler/recommender.go b/internal/autoscaling/recommender/percentile/percentile.go similarity index 63% rename from internal/autoscaler/recommender.go rename to internal/autoscaling/recommender/percentile/percentile.go index 98160ddf..fd7df185 100644 --- a/internal/autoscaler/recommender.go +++ b/internal/autoscaling/recommender/percentile/percentile.go @@ -1,7 +1,11 @@ -package autoscaler +package percentile import ( + "strconv" "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling" ) const ( @@ -23,7 +27,7 @@ const ( defaultConfidenceInterval = time.Hour * 24 ) -var DefaultResourceRecommenderConfig = ResourceRecommenderConfig{ +var DefaultPercentileConfig = PercentileConfig{ TargetTflopsPercentile: defaultTargetTflopsPercentile, LowerBoundTflopsPercentile: defaultLowerBoundTflopsPercentile, UpperBoundTflopsPercentile: defaultUpperBoundTflopsPercentile, @@ -34,20 +38,7 @@ var DefaultResourceRecommenderConfig = ResourceRecommenderConfig{ ConfidenceInterval: defaultConfidenceInterval, } -type ResourceRecommender interface { - GetRecommendedResources(*WorkloadState) *RecommendedResources -} - -type RecommendedResources struct { - LowerBoundTflops ResourceAmount - TargetTflops ResourceAmount - UpperBoundTflops ResourceAmount - LowerBoundVram ResourceAmount - TargetVram ResourceAmount - UpperBoundVram ResourceAmount -} - -type ResourceRecommenderConfig struct { +type PercentileConfig struct { TargetTflopsPercentile float64 LowerBoundTflopsPercentile float64 UpperBoundTflopsPercentile float64 @@ -58,11 +49,7 @@ type ResourceRecommenderConfig struct { ConfidenceInterval time.Duration } -func NewResourceRecommender() ResourceRecommender { - return &resourceRecommender{} -} - -type 
resourceRecommender struct { +type PercentileRecommender struct { lowerBoundTflops TflopsEstimator targetTflops TflopsEstimator upperBoundTflops TflopsEstimator @@ -71,21 +58,62 @@ type resourceRecommender struct { upperBoundVram VramEstimator } -func (r *resourceRecommender) GetRecommendedResources(s *WorkloadState) *RecommendedResources { +func NewRecommender() *PercentileRecommender { + return &PercentileRecommender{} +} + +func (p *PercentileRecommender) Name() string { + return "percentile" +} + +func (p *PercentileRecommender) Recommend(w *autoscaling.WorkloadState) { + // TODO: cache config + p.createEstimatorsFromConfig(p.getPercentileConfig(&w.AutoScalingConfig)) + w.Recommendation = autoscaling.RecommendedResources{ + LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(w)), + TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(w)), + UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(w)), + LowerBoundVram: QuantityFromAmount(p.lowerBoundVram.GetVramEstimation(w)), + TargetVram: QuantityFromAmount(p.targetVram.GetVramEstimation(w)), + UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(w)), + } +} - r.createEstimatorsFromConfig(s.GetResourceRecommenderConfig()) +func (p *PercentileRecommender) getPercentileConfig(asc *tfv1.AutoScalingConfig) *PercentileConfig { + cfg := DefaultPercentileConfig + + asr := asc.AutoSetResources + fields := []struct { + val string + dst *float64 + }{ + {asr.TargetTflopsPercentile, &cfg.TargetTflopsPercentile}, + {asr.LowerBoundTflopsPercentile, &cfg.LowerBoundTflopsPercentile}, + {asr.UpperBoundTflopsPercentile, &cfg.UpperBoundTflopsPercentile}, + {asr.TargetVramPercentile, &cfg.TargetVramPercentile}, + {asr.LowerBoundVramPercentile, &cfg.LowerBoundVramPercentile}, + {asr.UpperBoundVramPercentile, &cfg.UpperBoundVramPercentile}, + {asr.RequestMarginFraction, &cfg.RequestMarginFraction}, + } + for _, f := range fields { + if f.val == "" { + continue + } + if v, err := strconv.ParseFloat(f.val, 64); err == nil { + *f.dst = v + } + } - return &RecommendedResources{ - LowerBoundTflops: r.lowerBoundTflops.GetTflopsEstimation(s), - TargetTflops: r.targetTflops.GetTflopsEstimation(s), - UpperBoundTflops: r.upperBoundTflops.GetTflopsEstimation(s), - LowerBoundVram: r.lowerBoundVram.GetVramEstimation(s), - TargetVram: r.targetVram.GetVramEstimation(s), - UpperBoundVram: r.upperBoundVram.GetVramEstimation(s), + if asr.ConfidenceInterval != "" { + if d, err := time.ParseDuration(asr.ConfidenceInterval); err == nil { + cfg.ConfidenceInterval = d + } } + + return &cfg } -func (r *resourceRecommender) createEstimatorsFromConfig(config *ResourceRecommenderConfig) { +func (p *PercentileRecommender) createEstimatorsFromConfig(config *PercentileConfig) { targetTflops := NewPercentileTflopsEstimator(config.TargetTflopsPercentile) lowerBoundTflops := NewPercentileTflopsEstimator(config.LowerBoundTflopsPercentile) upperBoundTflops := NewPercentileTflopsEstimator(config.UpperBoundTflopsPercentile) @@ -108,7 +136,7 @@ func (r *resourceRecommender) createEstimatorsFromConfig(config *ResourceRecomme upperBoundVram = WithVramConfidenceMultiplier(1.0, 1.0, upperBoundVram, config.ConfidenceInterval) lowerBoundVram = WithVramConfidenceMultiplier(0.001, -2.0, lowerBoundVram, config.ConfidenceInterval) - *r = resourceRecommender{ + *p = PercentileRecommender{ lowerBoundTflops: lowerBoundTflops, targetTflops: targetTflops, upperBoundTflops: upperBoundTflops, diff --git 
a/internal/autoscaling/recommender/percentile/percentile_test.go b/internal/autoscaling/recommender/percentile/percentile_test.go new file mode 100644 index 00000000..c2afcc54 --- /dev/null +++ b/internal/autoscaling/recommender/percentile/percentile_test.go @@ -0,0 +1,73 @@ +package percentile + +import ( + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Percentile Recommender", func() { + It("should return default config when no AutoScalingConfig is set", func() { + cfg := NewRecommender().getPercentileConfig(nil) + Expect(cfg).ToNot(BeNil()) + Expect(*cfg).To(Equal(DefaultPercentileConfig)) + }) + + It("should parse float fields from AutoSetResources", func() { + asc := &tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + TargetTflopsPercentile: "0.8", + LowerBoundTflopsPercentile: "0.1", + UpperBoundTflopsPercentile: "0.95", + TargetVramPercentile: "0.7", + LowerBoundVramPercentile: "0.2", + UpperBoundVramPercentile: "0.9", + RequestMarginFraction: "0.15", + }, + } + cfg := NewRecommender().getPercentileConfig(asc) + Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) + Expect(cfg.TargetVramPercentile).To(Equal(0.7)) + Expect(cfg.LowerBoundVramPercentile).To(Equal(0.2)) + Expect(cfg.UpperBoundVramPercentile).To(Equal(0.9)) + Expect(cfg.RequestMarginFraction).To(Equal(0.15)) + }) + + It("should ignore invalid float fields and keep defaults", func() { + asc := &tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + TargetTflopsPercentile: "not-a-float", + LowerBoundTflopsPercentile: "", + UpperBoundTflopsPercentile: "0.99", + }, + } + cfg := NewRecommender().getPercentileConfig(asc) + Expect(cfg.TargetTflopsPercentile).To(Equal(DefaultPercentileConfig.TargetTflopsPercentile)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(DefaultPercentileConfig.LowerBoundTflopsPercentile)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) + }) + + It("should parse ConfidenceInterval if valid", func() { + asc := &tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + ConfidenceInterval: "30m", + }, + } + cfg := NewRecommender().getPercentileConfig(asc) + Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) + }) + + It("should ignore invalid ConfidenceInterval and keep default", func() { + asc := &tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + ConfidenceInterval: "not-a-duration", + }, + } + cfg := NewRecommender().getPercentileConfig(asc) + Expect(cfg.ConfidenceInterval).To(Equal(DefaultPercentileConfig.ConfidenceInterval)) + }) +}) diff --git a/internal/autoscaling/recommender/recommender.go b/internal/autoscaling/recommender/recommender.go new file mode 100644 index 00000000..a6470860 --- /dev/null +++ b/internal/autoscaling/recommender/recommender.go @@ -0,0 +1,28 @@ +package recommender + +import ( + "github.com/NexusGPU/tensor-fusion/internal/autoscaling" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender/percentile" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender/cron" +) + +const ( + PercentileRecommender = "percentile" + CronRecommender = "cron" +) + +type Interface interface { + Name() string + Recommend(*autoscaling.WorkloadState) +} + +func New(name string) Interface { + switch name { + case PercentileRecommender: + return percentile.NewRecommender() + case 
CronRecommender: + return cron.New() + default: + return nil + } +} diff --git a/internal/autoscaler/workerstate.go b/internal/autoscaling/workerstate.go similarity index 89% rename from internal/autoscaler/workerstate.go rename to internal/autoscaling/workerstate.go index 356fdfc1..f826ca95 100644 --- a/internal/autoscaler/workerstate.go +++ b/internal/autoscaling/workerstate.go @@ -1,7 +1,9 @@ -package autoscaler +package autoscaling import ( "time" + + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" ) type WorkerState struct { @@ -9,7 +11,7 @@ type WorkerState struct { Workload string LastTflopsSampleTime time.Time - VramPeak ResourceAmount + VramPeak uint64 LastVramSampleTime time.Time VramWindowEnd time.Time } @@ -24,7 +26,7 @@ func NewWorkerState(name string, workload string) *WorkerState { } } -func (w *WorkerState) AddTflopsSample(workload *WorkloadState, metrics *WorkerMetrics) bool { +func (w *WorkerState) AddTflopsSample(workload *WorkloadState, metrics *metrics.WorkerUsage) bool { if metrics.Timestamp.Before(w.LastTflopsSampleTime) { return false } @@ -33,7 +35,7 @@ func (w *WorkerState) AddTflopsSample(workload *WorkloadState, metrics *WorkerMe return true } -func (w *WorkerState) AddVramSample(workload *WorkloadState, metrics *WorkerMetrics) bool { +func (w *WorkerState) AddVramSample(workload *WorkloadState, metrics *metrics.WorkerUsage) bool { ts := metrics.Timestamp if ts.Before(w.LastVramSampleTime) { return false diff --git a/internal/autoscaler/workloadstate.go b/internal/autoscaling/workloadstate.go similarity index 58% rename from internal/autoscaler/workloadstate.go rename to internal/autoscaling/workloadstate.go index d1a5d04d..ff75849d 100644 --- a/internal/autoscaler/workloadstate.go +++ b/internal/autoscaling/workloadstate.go @@ -1,11 +1,12 @@ -package autoscaler +package autoscaling import ( - "strconv" "strings" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" + "k8s.io/apimachinery/pkg/api/resource" vpa "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/util" ) @@ -23,22 +24,24 @@ const ( DefaultHistogramDecayHalfLife = time.Hour * 24 ) -type ResourceName string - -const ( - ResourceTflops ResourceName = "tflops" - ResourceVram ResourceName = "vram" -) +type RecommendedResources struct { + LowerBoundTflops resource.Quantity + TargetTflops resource.Quantity + UpperBoundTflops resource.Quantity + LowerBoundVram resource.Quantity + TargetVram resource.Quantity + UpperBoundVram resource.Quantity +} type WorkloadState struct { Namespace string Name string Resources tfv1.Resources AutoScalingConfig tfv1.AutoScalingConfig + Recommendation RecommendedResources - TflopsHistogram vpa.Histogram - VramHistogram vpa.Histogram - + TflopsHistogram vpa.Histogram + VramHistogram vpa.Histogram FirstSampleStart time.Time LastSampleStart time.Time TotalSamplesCount int @@ -62,51 +65,17 @@ func histogramOptions(maxValue, firstBucketSize float64) vpa.HistogramOptions { return options } -func (w *WorkloadState) UpdateSampleStats(metrics *WorkerMetrics) { - if metrics.Timestamp.After(w.LastSampleStart) { - w.LastSampleStart = metrics.Timestamp +func (w *WorkloadState) UpdateSampleStats(sample *metrics.WorkerUsage) { + if sample.Timestamp.After(w.LastSampleStart) { + w.LastSampleStart = sample.Timestamp } - if w.FirstSampleStart.IsZero() || metrics.Timestamp.Before(w.FirstSampleStart) { - w.FirstSampleStart = metrics.Timestamp + if w.FirstSampleStart.IsZero() || 
sample.Timestamp.Before(w.FirstSampleStart) { + w.FirstSampleStart = sample.Timestamp } w.TotalSamplesCount++ } -func (w *WorkloadState) GetResourceRecommenderConfig() *ResourceRecommenderConfig { - cfg := DefaultResourceRecommenderConfig - - asr := w.AutoScalingConfig.AutoSetResources - fields := []struct { - val string - dst *float64 - }{ - {asr.TargetTflopsPercentile, &cfg.TargetTflopsPercentile}, - {asr.LowerBoundTflopsPercentile, &cfg.LowerBoundTflopsPercentile}, - {asr.UpperBoundTflopsPercentile, &cfg.UpperBoundTflopsPercentile}, - {asr.TargetVramPercentile, &cfg.TargetVramPercentile}, - {asr.LowerBoundVramPercentile, &cfg.LowerBoundVramPercentile}, - {asr.UpperBoundVramPercentile, &cfg.UpperBoundVramPercentile}, - {asr.RequestMarginFraction, &cfg.RequestMarginFraction}, - } - for _, f := range fields { - if f.val == "" { - continue - } - if v, err := strconv.ParseFloat(f.val, 64); err == nil { - *f.dst = v - } - } - - if asr.ConfidenceInterval != "" { - if d, err := time.ParseDuration(asr.ConfidenceInterval); err == nil { - cfg.ConfidenceInterval = d - } - } - - return &cfg -} - -func (w *WorkloadState) IsTargetResource(name ResourceName) bool { +func (w *WorkloadState) IsTargetResource(name tfv1.ResourceName) bool { target := w.AutoScalingConfig.AutoSetResources.TargetResource return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) } diff --git a/internal/autoscaling/workloadstate_test.go b/internal/autoscaling/workloadstate_test.go new file mode 100644 index 00000000..773cc7f4 --- /dev/null +++ b/internal/autoscaling/workloadstate_test.go @@ -0,0 +1,49 @@ +package autoscaling + +import ( + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Workload State", func() { + + It("should correctly determine if a resource is the target based on config", func() { + ws := NewWorkloadState("test") + + Expect(ws.IsTargetResource("tflops")).To(BeTrue()) + Expect(ws.IsTargetResource("vram")).To(BeTrue()) + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"}, + } + + Expect(ws.IsTargetResource("tflops")).To(BeTrue()) + Expect(ws.IsTargetResource("vram")).To(BeTrue()) + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"}, + } + Expect(ws.IsTargetResource("tflops")).To(BeTrue()) + Expect(ws.IsTargetResource("vram")).To(BeFalse()) + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"}, + } + Expect(ws.IsTargetResource("tflops")).To(BeFalse()) + Expect(ws.IsTargetResource("vram")).To(BeTrue()) + }) + + It("should correctly determine if auto scaling is enabled based on config", func() { + ws := NewWorkloadState("test") + + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{Enable: true}, + } + Expect(ws.IsAutoScalingEnabled()).To(BeTrue()) + ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{Enable: false}, + } + Expect(ws.IsAutoScalingEnabled()).To(BeFalse()) + }) +}) From e326a418f6c5837918a5b5430925880e62f43387 Mon Sep 17 00:00:00 2001 From: knave Date: Sat, 19 Jul 2025 19:14:14 +0800 Subject: [PATCH 16/27] refactor: code organization --- internal/autoscaler/autoscaler.go | 193 +++++++++ .../autoscaler/autoscaler_suite_test.go | 4 +- .../autoscaler/autoscaler_test.go | 127 +++--- .../metrics/metrics_aggregator.go} | 
61 +-- .../metrics/metrics_provider.go} | 0 .../metrics/metrics_provider_test.go} | 0 .../recommender/cron_recommender.go | 24 ++ .../recommender}/estimator.go | 42 +- .../recommender/percentile_recommender.go} | 16 +- .../percentile_recommender_test.go} | 20 +- .../autoscaler/recommender/recommender.go | 40 ++ .../recommender/recommender_suite_test.go | 13 + internal/autoscaler/workload/handler.go | 191 +++++++++ internal/autoscaler/workload/worker.go | 74 ++++ internal/autoscaler/workload/workload.go | 68 +++ .../workload/workload_suite_test.go | 13 + .../workload/workload_test.go} | 30 +- internal/autoscaling/autoscaler/autoscaler.go | 386 ------------------ internal/autoscaling/recommender/cron/cron.go | 23 -- .../autoscaling/recommender/recommender.go | 28 -- internal/autoscaling/workerstate.go | 68 --- 21 files changed, 757 insertions(+), 664 deletions(-) create mode 100644 internal/autoscaler/autoscaler.go rename internal/{autoscaling => }/autoscaler/autoscaler_suite_test.go (99%) rename internal/{autoscaling => }/autoscaler/autoscaler_test.go (81%) rename internal/{autoscaling/workloadstate.go => autoscaler/metrics/metrics_aggregator.go} (59%) rename internal/{autoscaling/metrics/metricsprovider.go => autoscaler/metrics/metrics_provider.go} (100%) rename internal/{autoscaling/metrics/metricsprovider_test.go => autoscaler/metrics/metrics_provider_test.go} (100%) create mode 100644 internal/autoscaler/recommender/cron_recommender.go rename internal/{autoscaling/recommender/percentile => autoscaler/recommender}/estimator.go (75%) rename internal/{autoscaling/recommender/percentile/percentile.go => autoscaler/recommender/percentile_recommender.go} (92%) rename internal/{autoscaling/recommender/percentile/percentile_test.go => autoscaler/recommender/percentile_recommender_test.go} (77%) create mode 100644 internal/autoscaler/recommender/recommender.go create mode 100644 internal/autoscaler/recommender/recommender_suite_test.go create mode 100644 internal/autoscaler/workload/handler.go create mode 100644 internal/autoscaler/workload/worker.go create mode 100644 internal/autoscaler/workload/workload.go create mode 100644 internal/autoscaler/workload/workload_suite_test.go rename internal/{autoscaling/workloadstate_test.go => autoscaler/workload/workload_test.go} (51%) delete mode 100644 internal/autoscaling/autoscaler/autoscaler.go delete mode 100644 internal/autoscaling/recommender/cron/cron.go delete mode 100644 internal/autoscaling/recommender/recommender.go delete mode 100644 internal/autoscaling/workerstate.go diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go new file mode 100644 index 00000000..87a6a5bb --- /dev/null +++ b/internal/autoscaler/autoscaler.go @@ -0,0 +1,193 @@ +package autoscaler + +import ( + "context" + "errors" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +var ( + _ manager.Runnable = (*Autoscaler)(nil) + _ manager.LeaderElectionRunnable = (*Autoscaler)(nil) +) + +type Autoscaler struct { + client.Client + allocator *gpuallocator.GpuAllocator + metricsProvider metrics.Provider + 
recommenders []recommender.Interface + workloadHandler *workload.Handler + workloads map[string]*workload.WorkloadState +} + +func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { + if c == nil { + return nil, errors.New("must specify client") + } + + if allocator == nil { + return nil, errors.New("must specify allocator") + } + + recommenders := []recommender.Interface{ + recommender.NewPercentileRecommender(), + recommender.NewCronRecommender(), + } + + return &Autoscaler{ + Client: c, + allocator: allocator, + metricsProvider: metrics.NewProvider(nil), + recommenders: recommenders, + workloadHandler: workload.NewHandler(c, allocator), + workloads: map[string]*workload.WorkloadState{}, + }, nil +} + +func (s *Autoscaler) Start(ctx context.Context) error { + log := log.FromContext(ctx) + log.Info("Starting autoscaler") + + // Handle timeout for loading historical metrics + historyCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + s.loadHistoryMetrics(historyCtx) + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.Run(ctx) + case <-ctx.Done(): + log.Info("Stopping autoscaler") + return nil + } + } +} + +func (s *Autoscaler) NeedLeaderElection() bool { + return true +} + +func (s *Autoscaler) Run(ctx context.Context) { + log := log.FromContext(ctx) + + log.Info("Autoscaler running") + s.loadWorkloads(ctx) + s.loadRealTimeMetrics(ctx) + s.processWorkloads(ctx) +} + +func (s *Autoscaler) loadWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + + workloadList := tfv1.TensorFusionWorkloadList{} + if err := s.List(ctx, &workloadList); err != nil { + log.Error(err, "failed to list workloads") + return + } + + observedWorkloads := map[string]bool{} + for _, workload := range workloadList.Items { + if !workload.DeletionTimestamp.IsZero() { + continue + } + workloadState := s.findOrCreateWorkload(workload.Name) + workloadState.Namespace = workload.Namespace + workloadState.Spec = workload.Spec + observedWorkloads[workload.Name] = true + + s.workloadHandler.UpdateWorkers(ctx, workloadState) + } + + // remove non-existent workloads + for key := range s.workloads { + if !observedWorkloads[key] { + delete(s.workloads, key) + } + } +} + +func (s *Autoscaler) loadHistoryMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading historical metrics") + + workersMetrics, err := s.metricsProvider.GetHistoryMetrics() + if err != nil { + log.Error(err, "failed to get history metrics") + return + } + for _, sample := range workersMetrics { + workload := s.findOrCreateWorkload(sample.WorkloadName) + workload.AddSample(sample) + } +} + +func (s *Autoscaler) loadRealTimeMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading realtime metrics") + + workersMetrics, err := s.metricsProvider.GetWorkersMetrics() + if err != nil { + log.Error(err, "failed to get workers metrics") + return + } + + for _, sample := range workersMetrics { + if workload, exists := s.workloads[sample.WorkloadName]; exists { + workload.AddSample(sample) + } + } +} + +func (s *Autoscaler) processWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("processing workloads") + + for _, workload := range s.workloads { + recommendations := map[string]recommender.RecommendedResources{} + for _, recommender := range s.recommenders { + name := recommender.Name() + recommendations[name] = recommender.Recommend(&workload.Spec.AutoScalingConfig, 
workload.WorkerUsageAggregator) + log.Info("recommendation", "recommender", name, "workload", workload.Name, "resources", recommendations[name]) + } + + // var finalRecommendation recommender.RecommendedResources + // for _, recommendation := range recommendations { + // if recommendation.TargetTflops.IsZero() + // } + + // TODO: Implement updating the recommendation status of the workload CRD when the API is ready. + workload.UpdateRecommendation(recommendations["percentile"]) + s.workloadHandler.ProcessWorkload(ctx, workload) + } +} + +func (s *Autoscaler) findOrCreateWorkload(name string) *workload.WorkloadState { + w, ok := s.workloads[name] + if !ok { + w = workload.NewWorkloadState(name) + s.workloads[name] = w + } + return w +} + +// Start after manager started +func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { + autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) + if err != nil { + return err + } + return mgr.Add(autoScaler) +} diff --git a/internal/autoscaling/autoscaler/autoscaler_suite_test.go b/internal/autoscaler/autoscaler_suite_test.go similarity index 99% rename from internal/autoscaling/autoscaler/autoscaler_suite_test.go rename to internal/autoscaler/autoscaler_suite_test.go index 7cdc52ea..6eb9d869 100644 --- a/internal/autoscaling/autoscaler/autoscaler_suite_test.go +++ b/internal/autoscaler/autoscaler_suite_test.go @@ -89,7 +89,7 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, ErrorIfCRDPathMissing: true, // The BinaryAssetsDirectory is only required if you want to run the tests directly @@ -97,7 +97,7 @@ var _ = BeforeSuite(func() { // default path defined in controller-runtime which is /usr/local/kubebuilder/. // Note that you must have the required binaries setup under the bin directory to perform // the tests directly. When we run make test it will be setup and used automatically. - BinaryAssetsDirectory: filepath.Join("..", "..", "..", "bin", "k8s", + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), } diff --git a/internal/autoscaling/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go similarity index 81% rename from internal/autoscaling/autoscaler/autoscaler_test.go rename to internal/autoscaler/autoscaler_test.go index 7cd76db7..4ed653a6 100644 --- a/internal/autoscaling/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -23,9 +23,8 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/aws/smithy-go/ptr" . 
"github.com/onsi/ginkgo/v2" @@ -54,13 +53,13 @@ import ( var _ = Describe("Autoscaler", func() { Context("when creating an autoscaler", func() { It("should return an error if there is no client", func() { - as, err := New(nil, nil) + as, err := NewAutoscaler(nil, nil) Expect(as).To(BeNil()) Expect(err.Error()).To(ContainSubstring("must specify client")) }) It("should return an error if there is no allocator", func() { - as, err := New(k8sClient, nil) + as, err := NewAutoscaler(k8sClient, nil) Expect(as).To(BeNil()) Expect(err.Error()).To(ContainSubstring("must specify allocator")) }) @@ -68,13 +67,13 @@ var _ = Describe("Autoscaler", func() { Context("when loading history metrics", func() { It("should create the state of workloads and workers based on historical metrics", func() { - scaler, _ := New(k8sClient, allocator) + scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.metricsProvider = &FakeMetricsProvider{} - scaler.LoadHistoryMetrics(ctx) + scaler.loadHistoryMetrics(ctx) metrics, _ := scaler.metricsProvider.GetHistoryMetrics() for _, m := range metrics { - Expect(scaler.workloadStates).To(HaveKey(m.WorkloadName)) - Expect(scaler.workerStates).To(HaveKey(m.WorkerName)) + Expect(scaler.workloads).To(HaveKey(m.WorkloadName)) + Expect(scaler.workloads[m.WorkloadName].Workers).To(HaveKey(m.WorkerName)) } }) }) @@ -86,10 +85,9 @@ var _ = Describe("Autoscaler", func() { Build() defer tfEnv.Cleanup() - scaler, _ := New(k8sClient, allocator) - scaler.LoadWorkloads(ctx) - Expect(scaler.workloadStates).To(BeEmpty()) - Expect(scaler.workerStates).To(BeEmpty()) + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).To(BeEmpty()) // create two workloads pool := tfEnv.GetGPUPool(0) @@ -100,27 +98,29 @@ var _ = Describe("Autoscaler", func() { workload1 := createWorkload(pool, 1, 1) workload1Workers := getWorkers(workload1) - scaler.LoadWorkloads(ctx) - Expect(scaler.workloadStates).To(HaveLen(2)) - Expect(scaler.workloadStates).To(HaveKey(workload0.Name)) - Expect(scaler.workloadStates).To(HaveKey(workload1.Name)) - Expect(scaler.workerStates).To(HaveLen(3)) - Expect(scaler.workerStates).To(HaveKey(workload0Workers[0].Name)) - Expect(scaler.workerStates).To(HaveKey(workload0Workers[1].Name)) - Expect(scaler.workerStates).To(HaveKey(workload1Workers[0].Name)) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).To(HaveLen(2)) + Expect(scaler.workloads).To(HaveKey(workload0.Name)) + Expect(scaler.workloads).To(HaveKey(workload1.Name)) + workers := scaler.workloads[workload0.Name].Workers + Expect(workers).To(HaveLen(2)) + Expect(workers).To(HaveKey(workload0Workers[0].Name)) + Expect(workers).To(HaveKey(workload0Workers[1].Name)) + Expect(scaler.workloads[workload1.Name].Workers).To(HaveKey(workload1Workers[0].Name)) updateWorkloadReplicas(workload0, 1) - scaler.LoadWorkloads(ctx) - Expect(scaler.workerStates).To(HaveLen(2)) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads[workload0.Name].Workers).To(HaveLen(2)) deleteWorkload(workload0) deleteWorkload(workload1) - scaler.LoadWorkloads(ctx) - Expect(scaler.workloadStates).NotTo(HaveKey(workload0.Name)) - Expect(scaler.workerStates).NotTo(HaveKey(workload0Workers[0].Name)) - Expect(scaler.workerStates).NotTo(HaveKey(workload0Workers[1].Name)) - Expect(scaler.workloadStates).NotTo(HaveKey(workload1.Name)) - Expect(scaler.workerStates).NotTo(HaveKey(workload1Workers[0].Name)) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).NotTo(HaveKey(workload0.Name)) + workers = 
scaler.workloads[workload0.Name].Workers + Expect(workers).NotTo(HaveKey(workload0Workers[0].Name)) + Expect(workers).NotTo(HaveKey(workload0Workers[1].Name)) + Expect(scaler.workloads).NotTo(HaveKey(workload1.Name)) + Expect(scaler.workloads[workload1.Name].Workers).NotTo(HaveKey(workload1Workers[0].Name)) }) }) @@ -137,9 +137,9 @@ var _ = Describe("Autoscaler", func() { worker := workers[0].Name - scaler, _ := New(k8sClient, allocator) - scaler.LoadWorkloads(ctx) - ws := scaler.workloadStates[workload.Name] + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + ws := scaler.workloads[workload.Name] now := time.Now() usage := &metrics.WorkerUsage{ WorkloadName: workload.Name, @@ -150,13 +150,14 @@ var _ = Describe("Autoscaler", func() { } scaler.metricsProvider = &FakeMetricsProvider{[]*metrics.WorkerUsage{usage}} - scaler.LoadRealTimeMetrics(ctx) - - Expect(scaler.workerStates[worker].LastTflopsSampleTime).To(Equal(usage.Timestamp)) - Expect(ws.TflopsHistogram.IsEmpty()).To(BeFalse()) - Expect(scaler.workerStates[worker].VramPeak).To(Equal(usage.VramUsage)) - Expect(scaler.workerStates[worker].LastVramSampleTime).To(Equal(usage.Timestamp)) - Expect(ws.VramHistogram.IsEmpty()).To(BeFalse()) + scaler.loadRealTimeMetrics(ctx) + + scalerWorkers := scaler.workloads[workload.Name].Workers + Expect(scalerWorkers[worker].LastTflopsSampleTime).To(Equal(usage.Timestamp)) + Expect(ws.WorkerUsageAggregator.TflopsHistogram.IsEmpty()).To(BeFalse()) + Expect(scalerWorkers[worker].VramPeak).To(Equal(usage.VramUsage)) + Expect(scalerWorkers[worker].LastVramSampleTime).To(Equal(usage.Timestamp)) + Expect(ws.WorkerUsageAggregator.VramHistogram.IsEmpty()).To(BeFalse()) }) }) @@ -170,19 +171,19 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := New(k8sClient, allocator) - scaler.LoadWorkloads(ctx) + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) scaler.recommenders[0] = &FakeUpScalingRecommender{} - scaler.ProcessWorkloads(ctx) + scaler.processWorkloads(ctx) - rr := scaler.workloadStates[workload.Name].Recommendation + rr := scaler.workloads[workload.Name].Recommendation Eventually(func(g Gomega) { assertWorkerAnnotations(getWorkers(workload)[0], rr) }).Should(Succeed()) // Upon reprocessing the workload, it should skip resource updates since they are already within the recommended resource boundaries - scaler.ProcessWorkloads(ctx) + scaler.processWorkloads(ctx) Consistently(func(g Gomega) { assertWorkerAnnotations(getWorkers(workload)[0], rr) }).Should(Succeed()) @@ -197,17 +198,17 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := New(k8sClient, allocator) - scaler.LoadWorkloads(ctx) + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) scaler.recommenders[0] = &FakeUpScalingRecommender{} - workloadState := scaler.workloadStates[workload.Name] - oldRes := workloadState.Resources + workloadState := scaler.workloads[workload.Name] + oldRes := workloadState.Spec.Resources // verify IsAutoScalingEnabled - workloadState.AutoScalingConfig.AutoSetResources.Enable = false - scaler.ProcessWorkloads(ctx) + workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = false + scaler.processWorkloads(ctx) Eventually(func(g Gomega) { tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) 
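				// Editor's note: with AutoSetResources.Enable set to false the autoscaler should leave
				// the worker untouched, so the parsed annotations must still match the requests and
				// limits from the original workload spec.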
Expect(tflopsRequest.Equal(oldRes.Requests.Tflops)).To(BeTrue()) @@ -217,10 +218,10 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) // verify IsTargetResource - workloadState.AutoScalingConfig.AutoSetResources.Enable = true - workloadState.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" - scaler.ProcessWorkloads(ctx) - rr := scaler.workloadStates[workload.Name].Recommendation + workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = true + workloadState.Spec.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" + scaler.processWorkloads(ctx) + rr := scaler.workloads[workload.Name].Recommendation Eventually(func(g Gomega) { tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) @@ -239,11 +240,11 @@ var _ = Describe("Autoscaler", func() { workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) defer deleteWorkload(workload) - scaler, _ := New(k8sClient, allocator) - scaler.LoadWorkloads(ctx) + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) scaler.recommenders[0] = &FakeQuotaExceededRecommender{} - scaler.ProcessWorkloads(ctx) - err := scaler.updateWorkerResourcesIfNeeded(ctx, scaler.workloadStates[workload.Name], getWorkers(workload)[0]) + scaler.processWorkloads(ctx) + err := scaler.workloadHandler.UpdateWorkerResourcesIfNeeded(ctx, scaler.workloads[workload.Name], getWorkers(workload)[0]) Expect(err.Error()).To(ContainSubstring("failed to adjust allocation: scaling quota exceeded")) }) }) @@ -361,8 +362,8 @@ type FakeUpScalingRecommender struct { recommender.Interface } -func (f *FakeUpScalingRecommender) Recommend(w *autoscaling.WorkloadState) { - w.Recommendation = autoscaling.RecommendedResources{ +func (f *FakeUpScalingRecommender) Recommend(_ *tfv1.AutoScalingConfig, _ *metrics.WorkerUsageAggregator) recommender.RecommendedResources { + return recommender.RecommendedResources{ TargetTflops: resource.MustParse("110"), LowerBoundTflops: resource.MustParse("100"), UpperBoundTflops: resource.MustParse("120"), @@ -376,8 +377,8 @@ type FakeQuotaExceededRecommender struct { recommender.Interface } -func (f *FakeQuotaExceededRecommender) Recommend(w *autoscaling.WorkloadState) { - w.Recommendation = autoscaling.RecommendedResources{ +func (f *FakeQuotaExceededRecommender) Recommend(_ *tfv1.AutoScalingConfig, _ *metrics.WorkerUsageAggregator) recommender.RecommendedResources { + return recommender.RecommendedResources{ TargetTflops: resource.MustParse("9999"), LowerBoundTflops: resource.MustParse("9999"), UpperBoundTflops: resource.MustParse("9999"), @@ -437,7 +438,7 @@ func cleanupWorkload(key client.ObjectKey) { }).Should(Succeed()) } -func assertWorkerAnnotations(worker *corev1.Pod, rr autoscaling.RecommendedResources) { +func assertWorkerAnnotations(worker *corev1.Pod, rr recommender.RecommendedResources) { GinkgoHelper() tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker) Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) diff --git a/internal/autoscaling/workloadstate.go b/internal/autoscaler/metrics/metrics_aggregator.go similarity index 59% rename from internal/autoscaling/workloadstate.go rename to internal/autoscaler/metrics/metrics_aggregator.go index ff75849d..91ac5f88 100644 --- a/internal/autoscaling/workloadstate.go +++ b/internal/autoscaler/metrics/metrics_aggregator.go @@ -1,12 +1,8 @@ -package autoscaling +package metrics import ( - "strings" 
"time" - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" - "k8s.io/apimachinery/pkg/api/resource" vpa "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/util" ) @@ -24,48 +20,23 @@ const ( DefaultHistogramDecayHalfLife = time.Hour * 24 ) -type RecommendedResources struct { - LowerBoundTflops resource.Quantity - TargetTflops resource.Quantity - UpperBoundTflops resource.Quantity - LowerBoundVram resource.Quantity - TargetVram resource.Quantity - UpperBoundVram resource.Quantity -} - -type WorkloadState struct { - Namespace string - Name string - Resources tfv1.Resources - AutoScalingConfig tfv1.AutoScalingConfig - Recommendation RecommendedResources - +type WorkerUsageAggregator struct { TflopsHistogram vpa.Histogram VramHistogram vpa.Histogram FirstSampleStart time.Time LastSampleStart time.Time TotalSamplesCount int - CreationTime time.Time } -func NewWorkloadState(name string) *WorkloadState { - return &WorkloadState{ - Name: name, +func NewWorkerUsageAggregator() *WorkerUsageAggregator { + return &WorkerUsageAggregator{ TflopsHistogram: vpa.NewDecayingHistogram(histogramOptions(10000.0, 0.1), DefaultHistogramDecayHalfLife), VramHistogram: vpa.NewDecayingHistogram(histogramOptions(1e12, 1e7), DefaultHistogramDecayHalfLife), - CreationTime: time.Now(), } } -func histogramOptions(maxValue, firstBucketSize float64) vpa.HistogramOptions { - options, err := vpa.NewExponentialHistogramOptions(maxValue, firstBucketSize, 1.+DefaultHistogramBucketSizeGrowth, epsilon) - if err != nil { - panic("Invalid histogram options") // Should not happen. - } - return options -} - -func (w *WorkloadState) UpdateSampleStats(sample *metrics.WorkerUsage) { +func (w *WorkerUsageAggregator) AddTflopsSample(sample *WorkerUsage) bool { + w.TflopsHistogram.AddSample(float64(sample.TflopsUsage), minSampleWeight, sample.Timestamp) if sample.Timestamp.After(w.LastSampleStart) { w.LastSampleStart = sample.Timestamp } @@ -73,13 +44,23 @@ func (w *WorkloadState) UpdateSampleStats(sample *metrics.WorkerUsage) { w.FirstSampleStart = sample.Timestamp } w.TotalSamplesCount++ + return true +} + +func (w *WorkerUsageAggregator) AddVramSample(sample *WorkerUsage) bool { + w.VramHistogram.AddSample(float64(sample.VramUsage), 1.0, sample.Timestamp) + return true } -func (w *WorkloadState) IsTargetResource(name tfv1.ResourceName) bool { - target := w.AutoScalingConfig.AutoSetResources.TargetResource - return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) +func (w *WorkerUsageAggregator) SubtractVramSample(usage float64, time time.Time) bool { + w.VramHistogram.SubtractSample(usage, 1.0, time) + return true } -func (w *WorkloadState) IsAutoScalingEnabled() bool { - return w.AutoScalingConfig.AutoSetResources.Enable +func histogramOptions(maxValue, firstBucketSize float64) vpa.HistogramOptions { + options, err := vpa.NewExponentialHistogramOptions(maxValue, firstBucketSize, 1.+DefaultHistogramBucketSizeGrowth, epsilon) + if err != nil { + panic("Invalid histogram options") // Should not happen. 
+ } + return options } diff --git a/internal/autoscaling/metrics/metricsprovider.go b/internal/autoscaler/metrics/metrics_provider.go similarity index 100% rename from internal/autoscaling/metrics/metricsprovider.go rename to internal/autoscaler/metrics/metrics_provider.go diff --git a/internal/autoscaling/metrics/metricsprovider_test.go b/internal/autoscaler/metrics/metrics_provider_test.go similarity index 100% rename from internal/autoscaling/metrics/metricsprovider_test.go rename to internal/autoscaler/metrics/metrics_provider_test.go diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go new file mode 100644 index 00000000..7f900b1c --- /dev/null +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -0,0 +1,24 @@ +package recommender + +import ( + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" +) + +type CronRecommender struct{} + +func NewCronRecommender() *CronRecommender { + return &CronRecommender{} +} + +func (c *CronRecommender) Name() string { + return "cron" +} + +func (p *CronRecommender) Recommend(config *tfv1.AutoScalingConfig, w *metrics.WorkerUsageAggregator) RecommendedResources { + return RecommendedResources{} +} + +func (c *CronRecommender) getCronConfig(asc *tfv1.AutoScalingConfig) { + +} diff --git a/internal/autoscaling/recommender/percentile/estimator.go b/internal/autoscaler/recommender/estimator.go similarity index 75% rename from internal/autoscaling/recommender/percentile/estimator.go rename to internal/autoscaler/recommender/estimator.go index 14ef67d9..f1daa06b 100644 --- a/internal/autoscaling/recommender/percentile/estimator.go +++ b/internal/autoscaler/recommender/estimator.go @@ -1,10 +1,10 @@ -package percentile +package recommender import ( "math" "time" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" "k8s.io/apimachinery/pkg/api/resource" ) @@ -38,7 +38,7 @@ func resourceAmountFromFloat(amount float64) ResourceAmount { } type VramEstimator interface { - GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount + GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount } type percentileVramEstimator struct { @@ -50,8 +50,8 @@ func NewPercentileVramEstimator(percentile float64) VramEstimator { return &percentileVramEstimator{percentile} } -func (e *percentileVramEstimator) GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount { - return resourceAmountFromFloat(float64(s.VramHistogram.Percentile(e.percentile))) +func (e *percentileVramEstimator) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + return resourceAmountFromFloat(float64(w.VramHistogram.Percentile(e.percentile))) } type vramMarginEstimator struct { @@ -65,8 +65,8 @@ func WithVramMargin(marginFraction float64, baseEstimator VramEstimator) VramEst } // GetvramEstimation returns the vram estimation for the given AggregateContainerState. 
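// Editor's note: a minimal usage sketch, not part of the patch, showing how the aggregator and
// the estimator decorators in this package are meant to fit together after the refactor. It
// assumes this file's imports ("time" and the metrics package) and invents all numbers; with
// only one sample the confidence multiplier would inflate the resulting estimate considerably.
func exampleVramEstimation() ResourceAmount {
	agg := metrics.NewWorkerUsageAggregator()
	agg.AddVramSample(&metrics.WorkerUsage{
		WorkloadName: "workload-0",
		WorkerName:   "worker-0",
		VramUsage:    8 << 30, // one observed 8Gi sample
		Timestamp:    time.Now(),
	})

	var est VramEstimator = NewPercentileVramEstimator(0.9) // 90th-percentile base
	est = WithVramMargin(0.15, est)                         // +15% safety margin
	est = WithVramConfidenceMultiplier(1.0, 1.0, est, 24*time.Hour)
	return est.GetVramEstimation(agg)
}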
-func (e *vramMarginEstimator) GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount { - base := e.baseEstimator.GetVramEstimation(s) +func (e *vramMarginEstimator) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + base := e.baseEstimator.GetVramEstimation(w) margin := resourceAmountFromFloat(float64(base) * e.marginFraction) return base + margin } @@ -88,14 +88,14 @@ func WithVramConfidenceMultiplier(multiplier, exponent float64, baseEstimator Vr } } -func (e *vramConfidenceMultiplier) GetVramEstimation(s *autoscaling.WorkloadState) ResourceAmount { - confidence := getConfidence(s, e.confidenceInterval) - base := e.baseEstimator.GetVramEstimation(s) +func (e *vramConfidenceMultiplier) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + confidence := getConfidence(w, e.confidenceInterval) + base := e.baseEstimator.GetVramEstimation(w) return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent)) } type TflopsEstimator interface { - GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount + GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount } type percentileTflopsEstimator struct { @@ -107,8 +107,8 @@ func NewPercentileTflopsEstimator(percentile float64) TflopsEstimator { return &percentileTflopsEstimator{percentile} } -func (e *percentileTflopsEstimator) GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount { - return resourceAmountFromFloat(float64(s.TflopsHistogram.Percentile(e.percentile))) +func (e *percentileTflopsEstimator) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + return resourceAmountFromFloat(float64(w.TflopsHistogram.Percentile(e.percentile))) } type tflopsMarginEstimator struct { @@ -122,8 +122,8 @@ func WithTflopsMargin(marginFraction float64, baseEstimator TflopsEstimator) Tfl } // GetTflopsEstimation returns the tflops estimation for the given AggregateContainerState. -func (e *tflopsMarginEstimator) GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount { - base := e.baseEstimator.GetTflopsEstimation(s) +func (e *tflopsMarginEstimator) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + base := e.baseEstimator.GetTflopsEstimation(w) margin := resourceAmountFromFloat(float64(base) * e.marginFraction) return base + margin } @@ -145,9 +145,9 @@ func WithTflopsConfidenceMultiplier(multiplier, exponent float64, baseEstimator } } -func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(s *autoscaling.WorkloadState) ResourceAmount { - confidence := getConfidence(s, e.confidenceInterval) - base := e.baseEstimator.GetTflopsEstimation(s) +func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + confidence := getConfidence(w, e.confidenceInterval) + base := e.baseEstimator.GetTflopsEstimation(w) return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent)) } @@ -157,11 +157,11 @@ func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(s *autoscaling.Workload // of 1 sample per minute, this metric is equal to N. // This implementation is a very simple heuristic which looks at the total count // of samples and the time between the first and the last sample. 
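To make the confidence heuristic concrete (the figures below are illustrative, not defaults taken from this patch): with a 24h confidence interval, six hours of samples collected at one per minute give lifespanInDays = 6h/24h = 0.25 and samplesAmount = 360/1440 = 0.25, so getConfidence returns 0.25. A confidence multiplier configured with multiplier = 1 and exponent = 1 would then widen a base estimate by a factor of (1 + 1/0.25) = 5, whereas after 30 days of continuous sampling the factor is only (1 + 1/30) ≈ 1.03, so recommended bounds start wide and tighten as usage history accumulates.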
-func getConfidence(s *autoscaling.WorkloadState, confidenceInterval time.Duration) float64 { +func getConfidence(w *metrics.WorkerUsageAggregator, confidenceInterval time.Duration) float64 { // Distance between the first and the last observed sample time, measured in days. - lifespanInDays := float64(s.LastSampleStart.Sub(s.FirstSampleStart)) / float64(confidenceInterval) + lifespanInDays := float64(w.LastSampleStart.Sub(w.FirstSampleStart)) / float64(confidenceInterval) // Total count of samples normalized such that it equals the number of days for // frequency of 1 sample/minute. - samplesAmount := float64(s.TotalSamplesCount) / confidenceInterval.Minutes() + samplesAmount := float64(w.TotalSamplesCount) / confidenceInterval.Minutes() return math.Min(lifespanInDays, samplesAmount) } diff --git a/internal/autoscaling/recommender/percentile/percentile.go b/internal/autoscaler/recommender/percentile_recommender.go similarity index 92% rename from internal/autoscaling/recommender/percentile/percentile.go rename to internal/autoscaler/recommender/percentile_recommender.go index fd7df185..f4d3ed51 100644 --- a/internal/autoscaling/recommender/percentile/percentile.go +++ b/internal/autoscaler/recommender/percentile_recommender.go @@ -1,11 +1,11 @@ -package percentile +package recommender import ( "strconv" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" ) const ( @@ -27,7 +27,7 @@ const ( defaultConfidenceInterval = time.Hour * 24 ) -var DefaultPercentileConfig = PercentileConfig{ +var defaultPercentileConfig = PercentileConfig{ TargetTflopsPercentile: defaultTargetTflopsPercentile, LowerBoundTflopsPercentile: defaultLowerBoundTflopsPercentile, UpperBoundTflopsPercentile: defaultUpperBoundTflopsPercentile, @@ -58,7 +58,7 @@ type PercentileRecommender struct { upperBoundVram VramEstimator } -func NewRecommender() *PercentileRecommender { +func NewPercentileRecommender() *PercentileRecommender { return &PercentileRecommender{} } @@ -66,10 +66,10 @@ func (p *PercentileRecommender) Name() string { return "percentile" } -func (p *PercentileRecommender) Recommend(w *autoscaling.WorkloadState) { +func (p *PercentileRecommender) Recommend(config *tfv1.AutoScalingConfig, w *metrics.WorkerUsageAggregator) RecommendedResources { // TODO: cache config - p.createEstimatorsFromConfig(p.getPercentileConfig(&w.AutoScalingConfig)) - w.Recommendation = autoscaling.RecommendedResources{ + p.createEstimatorsFromConfig(p.getPercentileConfig(config)) + return RecommendedResources{ LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(w)), TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(w)), UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(w)), @@ -80,7 +80,7 @@ func (p *PercentileRecommender) Recommend(w *autoscaling.WorkloadState) { } func (p *PercentileRecommender) getPercentileConfig(asc *tfv1.AutoScalingConfig) *PercentileConfig { - cfg := DefaultPercentileConfig + cfg := defaultPercentileConfig asr := asc.AutoSetResources fields := []struct { diff --git a/internal/autoscaling/recommender/percentile/percentile_test.go b/internal/autoscaler/recommender/percentile_recommender_test.go similarity index 77% rename from internal/autoscaling/recommender/percentile/percentile_test.go rename to internal/autoscaler/recommender/percentile_recommender_test.go index c2afcc54..b4782296 100644 --- 
a/internal/autoscaling/recommender/percentile/percentile_test.go +++ b/internal/autoscaler/recommender/percentile_recommender_test.go @@ -1,4 +1,4 @@ -package percentile +package recommender import ( "time" @@ -10,9 +10,9 @@ import ( var _ = Describe("Percentile Recommender", func() { It("should return default config when no AutoScalingConfig is set", func() { - cfg := NewRecommender().getPercentileConfig(nil) + cfg := NewPercentileRecommender().getPercentileConfig(nil) Expect(cfg).ToNot(BeNil()) - Expect(*cfg).To(Equal(DefaultPercentileConfig)) + Expect(*cfg).To(Equal(defaultPercentileConfig)) }) It("should parse float fields from AutoSetResources", func() { @@ -27,7 +27,7 @@ var _ = Describe("Percentile Recommender", func() { RequestMarginFraction: "0.15", }, } - cfg := NewRecommender().getPercentileConfig(asc) + cfg := NewPercentileRecommender().getPercentileConfig(asc) Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) @@ -45,9 +45,9 @@ var _ = Describe("Percentile Recommender", func() { UpperBoundTflopsPercentile: "0.99", }, } - cfg := NewRecommender().getPercentileConfig(asc) - Expect(cfg.TargetTflopsPercentile).To(Equal(DefaultPercentileConfig.TargetTflopsPercentile)) - Expect(cfg.LowerBoundTflopsPercentile).To(Equal(DefaultPercentileConfig.LowerBoundTflopsPercentile)) + cfg := NewPercentileRecommender().getPercentileConfig(asc) + Expect(cfg.TargetTflopsPercentile).To(Equal(defaultPercentileConfig.TargetTflopsPercentile)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(defaultPercentileConfig.LowerBoundTflopsPercentile)) Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) }) @@ -57,7 +57,7 @@ var _ = Describe("Percentile Recommender", func() { ConfidenceInterval: "30m", }, } - cfg := NewRecommender().getPercentileConfig(asc) + cfg := NewPercentileRecommender().getPercentileConfig(asc) Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) }) @@ -67,7 +67,7 @@ var _ = Describe("Percentile Recommender", func() { ConfidenceInterval: "not-a-duration", }, } - cfg := NewRecommender().getPercentileConfig(asc) - Expect(cfg.ConfidenceInterval).To(Equal(DefaultPercentileConfig.ConfidenceInterval)) + cfg := NewPercentileRecommender().getPercentileConfig(asc) + Expect(cfg.ConfidenceInterval).To(Equal(defaultPercentileConfig.ConfidenceInterval)) }) }) diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go new file mode 100644 index 00000000..7f4cb3ca --- /dev/null +++ b/internal/autoscaler/recommender/recommender.go @@ -0,0 +1,40 @@ +package recommender + +import ( + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "k8s.io/apimachinery/pkg/api/resource" +) + +const ( + RecommenderPercentile = "percentile" + RecommenderCron = "cron" +) + +type RecommendedResources struct { + LowerBoundTflops resource.Quantity + TargetTflops resource.Quantity + UpperBoundTflops resource.Quantity + LowerBoundVram resource.Quantity + TargetVram resource.Quantity + UpperBoundVram resource.Quantity +} + +// Interface defines the contract for resource recommendation strategies used by the autoscaler. 
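A caller is expected to pick a strategy by name through the factory defined below and feed it the per-workload usage aggregator. A minimal sketch of that usage, assuming ws is the *workload.WorkloadState introduced later in this patch and with error handling abbreviated:

    rec, err := recommender.New(recommender.RecommenderPercentile)
    if err != nil {
        return err
    }
    // Recommend reads the workload's AutoScalingConfig and the aggregated
    // usage histograms, returning lower/target/upper bounds for TFLOPS and VRAM.
    res := rec.Recommend(&ws.Spec.AutoScalingConfig, ws.WorkerUsageAggregator)
    ws.UpdateRecommendation(res)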
+type Interface interface { + Name() string + Recommend(*tfv1.AutoScalingConfig, *metrics.WorkerUsageAggregator) RecommendedResources +} + +func New(name string) (Interface, error) { + switch name { + case RecommenderPercentile: + return NewPercentileRecommender(), nil + case RecommenderCron: + return NewCronRecommender(), nil + default: + return nil, fmt.Errorf("unknown recommender name: %s", name) + } +} diff --git a/internal/autoscaler/recommender/recommender_suite_test.go b/internal/autoscaler/recommender/recommender_suite_test.go new file mode 100644 index 00000000..7177cf1d --- /dev/null +++ b/internal/autoscaler/recommender/recommender_suite_test.go @@ -0,0 +1,13 @@ +package recommender_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestRecommender(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Recommender Suite") +} diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go new file mode 100644 index 00000000..41a78738 --- /dev/null +++ b/internal/autoscaler/workload/handler.go @@ -0,0 +1,191 @@ +package workload + +import ( + "context" + "fmt" + "math/big" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +type Handler struct { + client.Client + allocator *gpuallocator.GpuAllocator +} + +func NewHandler(client client.Client, allocator *gpuallocator.GpuAllocator) *Handler { + return &Handler{ + Client: client, + allocator: allocator, + } +} + +func (h *Handler) UpdateWorkers(ctx context.Context, workload *WorkloadState) { + workerList := &corev1.PodList{} + if err := h.List(ctx, workerList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { + log.FromContext(ctx).Error(err, "failed to list workers") + return + } + workload.UpdateWorkers(workerList) +} + +func (h *Handler) ProcessWorkload(ctx context.Context, workload *WorkloadState) { + log := log.FromContext(ctx) + workerList := &corev1.PodList{} + if err := h.List(ctx, workerList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { + log.Error(err, "failed to list workers") + } + + if !workload.IsAutoScalingEnabled() { + return + } + + for _, worker := range workerList.Items { + if !worker.DeletionTimestamp.IsZero() { + continue + } + + if err := h.UpdateWorkerResourcesIfNeeded(ctx, workload, &worker); err != nil { + log.Error(err, "failed to update worker") + } + } +} + +func (h *Handler) UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *WorkloadState, worker *corev1.Pod) error { + log := log.FromContext(ctx) + + adjustRequest, err := getCurrentWorkerResourceRequest(worker) + if err != nil { + return fmt.Errorf("failed to get current worker resource request, %v", err) + } + + rr := &workload.Recommendation + resourcesInfo := []struct { + name tfv1.ResourceName + requestKey string + limitKey string + request *resource.Quantity + limit *resource.Quantity + lowerBound resource.Quantity + upperBound resource.Quantity + target resource.Quantity + }{ + { + name: tfv1.ResourceTflops, + requestKey: constants.TFLOPSRequestAnnotation, + limitKey: constants.TFLOPSLimitAnnotation, + request: 
&adjustRequest.NewRequest.Tflops, + limit: &adjustRequest.NewLimit.Tflops, + lowerBound: rr.LowerBoundTflops, + upperBound: rr.UpperBoundTflops, + target: rr.TargetTflops, + }, + { + name: tfv1.ResourceVram, + requestKey: constants.VRAMRequestAnnotation, + limitKey: constants.VRAMLimitAnnotation, + request: &adjustRequest.NewRequest.Vram, + limit: &adjustRequest.NewLimit.Vram, + lowerBound: rr.LowerBoundVram, + upperBound: rr.UpperBoundVram, + target: rr.TargetVram, + }, + } + + newAnnotations := map[string]string{} + var upScaling, downScaling bool + for _, resInfo := range resourcesInfo { + if !workload.ShouldScaleResource(resInfo.name) { + continue + } + upScaling = resInfo.request.Cmp(resInfo.lowerBound) < 0 + downScaling = resInfo.request.Cmp(resInfo.upperBound) > 0 + if upScaling || downScaling { + targetRequest := resInfo.target + targetLimit := getProportionalLimit(resInfo.limit, resInfo.request, &targetRequest) + if targetLimit == nil { + return fmt.Errorf("failed to get limit for %s", resInfo.requestKey) + } + newAnnotations[resInfo.requestKey] = targetRequest.String() + newAnnotations[resInfo.limitKey] = targetLimit.String() + *resInfo.request = targetRequest + *resInfo.limit = *targetLimit + } + } + + if len(newAnnotations) > 0 { + adjustRequest.IsScaleUp = upScaling + if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { + return fmt.Errorf("failed to adjust allocation: %v", err) + } + log.Info("adjust allocation successfully", "adjustRequest", adjustRequest) + // Patch the worker with updated annotations + patch := client.MergeFrom(worker.DeepCopy()) + for key, value := range newAnnotations { + worker.Annotations[key] = value + } + if err := h.Patch(ctx, worker, patch); err != nil { + return fmt.Errorf("failed to patch worker: %v", err) + } + } + + return nil +} + +func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { + if originalLimit == nil || originalLimit.IsZero() || + originalRequest == nil || originalRequest.IsZero() || + recommendedRequest == nil || recommendedRequest.IsZero() { + return nil + } + + originalValue := big.NewInt(originalLimit.Value()) + scaleBaseValue := big.NewInt(originalRequest.Value()) + scaleResultValue := big.NewInt(recommendedRequest.Value()) + var scaledOriginal big.Int + scaledOriginal.Mul(originalValue, scaleResultValue) + scaledOriginal.Div(&scaledOriginal, scaleBaseValue) + if scaledOriginal.IsInt64() { + return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) + } + + return nil +} + +func getCurrentWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, error) { + adjustRequest := tfv1.AdjustRequest{ + PodUID: string(worker.UID), + IsScaleUp: false, + NewRequest: tfv1.Resource{}, + NewLimit: tfv1.Resource{}, + } + annotations := worker.GetAnnotations() + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.TFLOPSRequestAnnotation, &adjustRequest.NewRequest.Tflops}, + {constants.TFLOPSLimitAnnotation, &adjustRequest.NewLimit.Tflops}, + {constants.VRAMRequestAnnotation, &adjustRequest.NewRequest.Vram}, + {constants.VRAMLimitAnnotation, &adjustRequest.NewLimit.Vram}, + } + for _, info := range resInfo { + q, err := resource.ParseQuantity(annotations[info.key]) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + return &adjustRequest, nil +} diff --git a/internal/autoscaler/workload/worker.go b/internal/autoscaler/workload/worker.go new file 
mode 100644 index 00000000..8ad57ec3 --- /dev/null +++ b/internal/autoscaler/workload/worker.go @@ -0,0 +1,74 @@ +package workload + +import ( + "time" + + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" +) + +type WorkerState struct { + Name string + WorkloadName string + LastTflopsSampleTime time.Time + + VramPeak uint64 + LastVramSampleTime time.Time + VramWindowEnd time.Time +} + +func NewWorkerState(name string, workloadName string) *WorkerState { + return &WorkerState{ + Name: name, + WorkloadName: workloadName, + LastTflopsSampleTime: time.Time{}, + LastVramSampleTime: time.Time{}, + VramWindowEnd: time.Time{}, + } +} + +func (w *WorkerState) AddSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool { + w.AddTflopsSample(aggregator, sample) + w.AddVramSample(aggregator, sample) + return true +} + +func (w *WorkerState) AddTflopsSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool { + if sample.Timestamp.Before(w.LastTflopsSampleTime) { + return false + } + aggregator.AddTflopsSample(sample) + w.LastTflopsSampleTime = sample.Timestamp + return true +} + +func (w *WorkerState) AddVramSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool { + ts := sample.Timestamp + if ts.Before(w.LastVramSampleTime) { + return false + } + w.LastVramSampleTime = ts + if w.VramWindowEnd.IsZero() { + w.VramWindowEnd = ts + } + + addNewPeak := false + if ts.Before(w.VramWindowEnd) { + if w.VramPeak != 0 && sample.VramUsage > w.VramPeak { + aggregator.SubtractVramSample(float64(w.VramPeak), w.VramWindowEnd) + addNewPeak = true + } + } else { + aggregationInteval := metrics.DefaultAggregationInterval + shift := ts.Sub(w.VramWindowEnd).Truncate(aggregationInteval) + aggregationInteval + w.VramWindowEnd = w.VramWindowEnd.Add(shift) + w.VramPeak = 0 + addNewPeak = true + } + + if addNewPeak { + aggregator.AddVramSample(sample) + w.VramPeak = sample.VramUsage + } + + return true +} diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go new file mode 100644 index 00000000..5c6e32a0 --- /dev/null +++ b/internal/autoscaler/workload/workload.go @@ -0,0 +1,68 @@ +package workload + +import ( + "strings" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" + corev1 "k8s.io/api/core/v1" +) + +type WorkloadState struct { + Namespace string + Name string + Spec tfv1.WorkloadProfileSpec + Recommendation recommender.RecommendedResources + Workers map[string]*WorkerState + WorkerUsageAggregator *metrics.WorkerUsageAggregator +} + +func NewWorkloadState(name string) *WorkloadState { + return &WorkloadState{ + Name: name, + Workers: make(map[string]*WorkerState), + WorkerUsageAggregator: metrics.NewWorkerUsageAggregator(), + } +} + +func (w *WorkloadState) UpdateRecommendation(recommendation recommender.RecommendedResources) { + w.Recommendation = recommendation +} + +func (w *WorkloadState) IsAutoScalingEnabled() bool { + return w.Spec.AutoScalingConfig.AutoSetResources.Enable +} + +func (w *WorkloadState) ShouldScaleResource(name tfv1.ResourceName) bool { + target := w.Spec.AutoScalingConfig.AutoSetResources.TargetResource + return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) +} + +func (w *WorkloadState) UpdateWorkers(podList *corev1.PodList) { + observedWorkers := map[string]bool{} + for _, worker 
:= range podList.Items { + if !worker.DeletionTimestamp.IsZero() { + continue + } + if _, exists := w.Workers[worker.Name]; !exists { + w.Workers[worker.Name] = NewWorkerState(worker.Name, w.Name) + } + observedWorkers[worker.Name] = true + } + + for key, worker := range w.Workers { + if worker.WorkloadName == w.Name && !observedWorkers[key] { + delete(w.Workers, key) + } + } +} + +func (w *WorkloadState) AddSample(sample *metrics.WorkerUsage) { + worker, exists := w.Workers[sample.WorkerName] + if !exists { + worker = NewWorkerState(sample.WorkerName, sample.WorkloadName) + w.Workers[sample.WorkerName] = worker + } + worker.AddSample(w.WorkerUsageAggregator, sample) +} diff --git a/internal/autoscaler/workload/workload_suite_test.go b/internal/autoscaler/workload/workload_suite_test.go new file mode 100644 index 00000000..cd3451b6 --- /dev/null +++ b/internal/autoscaler/workload/workload_suite_test.go @@ -0,0 +1,13 @@ +package workload_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestWorkload(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Workload Suite") +} diff --git a/internal/autoscaling/workloadstate_test.go b/internal/autoscaler/workload/workload_test.go similarity index 51% rename from internal/autoscaling/workloadstate_test.go rename to internal/autoscaler/workload/workload_test.go index 773cc7f4..2e36adbe 100644 --- a/internal/autoscaling/workloadstate_test.go +++ b/internal/autoscaler/workload/workload_test.go @@ -1,4 +1,4 @@ -package autoscaling +package workload import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -6,42 +6,42 @@ import ( . "github.com/onsi/gomega" ) -var _ = Describe("Workload State", func() { +var _ = Describe("Workload", func() { It("should correctly determine if a resource is the target based on config", func() { ws := NewWorkloadState("test") - Expect(ws.IsTargetResource("tflops")).To(BeTrue()) - Expect(ws.IsTargetResource("vram")).To(BeTrue()) + Expect(ws.ShouldScaleResource("tflops")).To(BeTrue()) + Expect(ws.ShouldScaleResource("vram")).To(BeTrue()) - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"}, } - Expect(ws.IsTargetResource("tflops")).To(BeTrue()) - Expect(ws.IsTargetResource("vram")).To(BeTrue()) + Expect(ws.ShouldScaleResource("tflops")).To(BeTrue()) + Expect(ws.ShouldScaleResource("vram")).To(BeTrue()) - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"}, } - Expect(ws.IsTargetResource("tflops")).To(BeTrue()) - Expect(ws.IsTargetResource("vram")).To(BeFalse()) + Expect(ws.ShouldScaleResource("tflops")).To(BeTrue()) + Expect(ws.ShouldScaleResource("vram")).To(BeFalse()) - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"}, } - Expect(ws.IsTargetResource("tflops")).To(BeFalse()) - Expect(ws.IsTargetResource("vram")).To(BeTrue()) + Expect(ws.ShouldScaleResource("tflops")).To(BeFalse()) + Expect(ws.ShouldScaleResource("vram")).To(BeTrue()) }) It("should correctly determine if auto scaling is enabled based on config", func() { ws := NewWorkloadState("test") - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{Enable: true}, } 
Expect(ws.IsAutoScalingEnabled()).To(BeTrue()) - ws.AutoScalingConfig = tfv1.AutoScalingConfig{ + ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{Enable: false}, } Expect(ws.IsAutoScalingEnabled()).To(BeFalse()) diff --git a/internal/autoscaling/autoscaler/autoscaler.go b/internal/autoscaling/autoscaler/autoscaler.go deleted file mode 100644 index 6a1e498b..00000000 --- a/internal/autoscaling/autoscaler/autoscaler.go +++ /dev/null @@ -1,386 +0,0 @@ -package autoscaler - -import ( - "context" - "errors" - "fmt" - "math/big" - "time" - - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender" - "github.com/NexusGPU/tensor-fusion/internal/constants" - "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" - "github.com/samber/lo" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/manager" -) - -var ( - _ manager.Runnable = (*Autoscaler)(nil) - _ manager.LeaderElectionRunnable = (*Autoscaler)(nil) -) - -type Autoscaler struct { - client.Client - allocator *gpuallocator.GpuAllocator - recommenders []recommender.Interface - metricsProvider metrics.Provider - workloadStates map[string]*autoscaling.WorkloadState - workerStates map[string]*autoscaling.WorkerState -} - -func New(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { - if c == nil { - return nil, errors.New("must specify client") - } - - if allocator == nil { - return nil, errors.New("must specify allocator") - } - - recommenders := []recommender.Interface{ - recommender.New(recommender.PercentileRecommender), - recommender.New(recommender.CronRecommender), - } - - return &Autoscaler{ - Client: c, - allocator: allocator, - recommenders: recommenders, - metricsProvider: metrics.NewProvider(nil), - workloadStates: map[string]*autoscaling.WorkloadState{}, - workerStates: map[string]*autoscaling.WorkerState{}, - }, nil -} - -func (s *Autoscaler) Start(ctx context.Context) error { - log := log.FromContext(ctx) - log.Info("Starting autoscaler") - - s.LoadHistoryMetrics(ctx) // TODO: handle timeout - - ticker := time.NewTicker(time.Minute) - defer ticker.Stop() - for { - select { - case <-ticker.C: - s.Run(ctx) - case <-ctx.Done(): - log.Info("Stopping autoscaler") - return nil - } - } -} - -func (s *Autoscaler) NeedLeaderElection() bool { - return true -} - -func (s *Autoscaler) Run(ctx context.Context) { - log := log.FromContext(ctx) - - log.Info("Autoscaler running") - s.LoadWorkloads(ctx) - s.LoadRealTimeMetrics(ctx) - s.ProcessWorkloads(ctx) -} - -func (s *Autoscaler) LoadWorkloads(ctx context.Context) { - log := log.FromContext(ctx) - - workloadList := tfv1.TensorFusionWorkloadList{} - if err := s.List(ctx, &workloadList); err != nil { - log.Error(err, "failed to list workloads") - return - } - - observedWorkloads := map[string]bool{} - for _, workload := range workloadList.Items { - if !workload.DeletionTimestamp.IsZero() { - continue - } - - workloadName := workload.Name - workloadState, exists := s.workloadStates[workloadName] - if !exists { - workloadState = autoscaling.NewWorkloadState(workloadName) - } - workloadState.Namespace = workload.Namespace - workloadState.Resources = 
workload.Spec.Resources - workloadState.AutoScalingConfig = workload.Spec.AutoScalingConfig - s.workloadStates[workloadName] = workloadState - - observedWorkloads[workloadName] = true - - podList := &corev1.PodList{} - if err := s.List(ctx, podList, - client.InNamespace(workload.Namespace), - client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { - log.Error(err, "failed to list workers") - continue - } - - observedWorkers := map[string]bool{} - for _, worker := range podList.Items { - if !worker.DeletionTimestamp.IsZero() { - continue - } - if _, exists := s.workerStates[worker.Name]; !exists { - s.workerStates[worker.Name] = autoscaling.NewWorkerState(worker.Name, workloadName) - } - observedWorkers[worker.Name] = true - } - - s.workerStates = lo.OmitBy(s.workerStates, func(key string, state *autoscaling.WorkerState) bool { - return state.Workload == workloadName && !observedWorkers[key] - }) - } - - // remove unused workloadStates - s.workloadStates = lo.OmitBy(s.workloadStates, func(key string, _ *autoscaling.WorkloadState) bool { - return !observedWorkloads[key] - }) - - // remove unused workerStates - s.workerStates = lo.OmitBy(s.workerStates, func(_ string, state *autoscaling.WorkerState) bool { - return !observedWorkloads[state.Workload] - }) -} - -func (s *Autoscaler) LoadHistoryMetrics(ctx context.Context) { - log := log.FromContext(ctx) - log.Info("loading historical metrics") - - workersMetrics, err := s.metricsProvider.GetHistoryMetrics() - if err != nil { - log.Error(err, "failed to get history metrics") - return - } - for _, metrics := range workersMetrics { - workloadState, exists := s.workloadStates[metrics.WorkloadName] - if !exists { - workloadState = autoscaling.NewWorkloadState(metrics.WorkloadName) - s.workloadStates[metrics.WorkloadName] = workloadState - } - workerState, exists := s.workerStates[metrics.WorkerName] - if !exists { - workerState = autoscaling.NewWorkerState(metrics.WorkerName, metrics.WorkloadName) - s.workerStates[metrics.WorkerName] = workerState - } - - s.addSamples(workloadState, workerState, metrics) - } -} - -func (s *Autoscaler) LoadRealTimeMetrics(ctx context.Context) { - log := log.FromContext(ctx) - log.Info("loading realtime metrics") - - workersMetrics, err := s.metricsProvider.GetWorkersMetrics() - if err != nil { - log.Error(err, "failed to get workers metrics") - return - } - - for _, metrics := range workersMetrics { - workloadState, workloadExists := s.workloadStates[metrics.WorkloadName] - if !workloadExists { - continue - } - workerState, workerExists := s.workerStates[metrics.WorkerName] - if !workerExists { - continue - } - - s.addSamples(workloadState, workerState, metrics) - } -} - -func (s *Autoscaler) ProcessWorkloads(ctx context.Context) { - log := log.FromContext(ctx) - log.Info("processing workloads") - - for _, workloadState := range s.workloadStates { - podList := &corev1.PodList{} - if err := s.List(ctx, podList, - client.InNamespace(workloadState.Namespace), - client.MatchingLabels{constants.WorkloadKey: workloadState.Name}); err != nil { - log.Error(err, "failed to list workers") - continue - } - - if len(podList.Items) <= 0 { - continue - } - - s.recommenders[0].Recommend(workloadState) - log.Info("recommended resources", "workload", workloadState.Name, "resources", workloadState.Recommendation) - - // TODO: update recommmendation status of workload - - if !workloadState.IsAutoScalingEnabled() { - continue - } - - for _, worker := range podList.Items { - if !worker.DeletionTimestamp.IsZero() { - 
continue - } - - if err := s.updateWorkerResourcesIfNeeded(ctx, workloadState, &worker); err != nil { - log.Error(err, "failed to update worker") - } - } - } -} - -func (s *Autoscaler) updateWorkerResourcesIfNeeded(ctx context.Context, workloadState *autoscaling.WorkloadState, worker *corev1.Pod) error { - log := log.FromContext(ctx) - - adjustRequest, err := getCurrentWorkerResourceRequest(worker) - if err != nil { - return fmt.Errorf("failed to get current worker resource request, %v", err) - } - - rr := &workloadState.Recommendation - resourcesInfo := []struct { - name tfv1.ResourceName - requestKey string - limitKey string - request *resource.Quantity - limit *resource.Quantity - lowerBound resource.Quantity - upperBound resource.Quantity - target resource.Quantity - }{ - { - name: tfv1.ResourceTflops, - requestKey: constants.TFLOPSRequestAnnotation, - limitKey: constants.TFLOPSLimitAnnotation, - request: &adjustRequest.NewRequest.Tflops, - limit: &adjustRequest.NewLimit.Tflops, - lowerBound: rr.LowerBoundTflops, - upperBound: rr.UpperBoundTflops, - target: rr.TargetTflops, - }, - { - name: tfv1.ResourceVram, - requestKey: constants.VRAMRequestAnnotation, - limitKey: constants.VRAMLimitAnnotation, - request: &adjustRequest.NewRequest.Vram, - limit: &adjustRequest.NewLimit.Vram, - lowerBound: rr.LowerBoundVram, - upperBound: rr.UpperBoundVram, - target: rr.TargetVram, - }, - } - - newAnnotations := map[string]string{} - var upScaling, downScaling bool - for _, resInfo := range resourcesInfo { - if !workloadState.IsTargetResource(resInfo.name) { - continue - } - upScaling = resInfo.request.Cmp(resInfo.lowerBound) < 0 - downScaling = resInfo.request.Cmp(resInfo.upperBound) > 0 - if upScaling || downScaling { - targetRequest := resInfo.target - targetLimit := getProportionalLimit(resInfo.limit, resInfo.request, &targetRequest) - if targetLimit == nil { - return fmt.Errorf("failed to get limit for %s", resInfo.requestKey) - } - newAnnotations[resInfo.requestKey] = targetRequest.String() - newAnnotations[resInfo.limitKey] = targetLimit.String() - *resInfo.request = targetRequest - *resInfo.limit = *targetLimit - } - } - - if len(newAnnotations) > 0 { - adjustRequest.IsScaleUp = upScaling - if _, err := s.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { - return fmt.Errorf("failed to adjust allocation: %v", err) - } - log.Info("adjust allocation successfully", "adjustRequest", adjustRequest) - // Patch the worker with updated annotations - patch := client.MergeFrom(worker.DeepCopy()) - for key, value := range newAnnotations { - worker.Annotations[key] = value - } - if err := s.Patch(ctx, worker, patch); err != nil { - return fmt.Errorf("failed to patch worker: %v", err) - } - } - - return nil -} - -func (*Autoscaler) addSamples(workloadState *autoscaling.WorkloadState, workerState *autoscaling.WorkerState, sample *metrics.WorkerUsage) { - workerState.AddTflopsSample(workloadState, sample) - workerState.AddVramSample(workloadState, sample) - workloadState.UpdateSampleStats(sample) -} - -func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { - if originalLimit == nil || originalLimit.IsZero() || - originalRequest == nil || originalRequest.IsZero() || - recommendedRequest == nil || recommendedRequest.IsZero() { - return nil - } - - originalValue := big.NewInt(originalLimit.Value()) - scaleBaseValue := big.NewInt(originalRequest.Value()) - scaleResultValue := big.NewInt(recommendedRequest.Value()) - var 
scaledOriginal big.Int - scaledOriginal.Mul(originalValue, scaleResultValue) - scaledOriginal.Div(&scaledOriginal, scaleBaseValue) - if scaledOriginal.IsInt64() { - return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) - } - - return nil -} - -func getCurrentWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, error) { - adjustRequest := tfv1.AdjustRequest{ - PodUID: string(worker.UID), - IsScaleUp: false, - NewRequest: tfv1.Resource{}, - NewLimit: tfv1.Resource{}, - } - annotations := worker.GetAnnotations() - resInfo := []struct { - key string - dst *resource.Quantity - }{ - {constants.TFLOPSRequestAnnotation, &adjustRequest.NewRequest.Tflops}, - {constants.TFLOPSLimitAnnotation, &adjustRequest.NewLimit.Tflops}, - {constants.VRAMRequestAnnotation, &adjustRequest.NewRequest.Vram}, - {constants.VRAMLimitAnnotation, &adjustRequest.NewLimit.Vram}, - } - for _, info := range resInfo { - q, err := resource.ParseQuantity(annotations[info.key]) - if err != nil { - return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) - } - *info.dst = q - } - - return &adjustRequest, nil -} - -// Start after manager started -func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { - autoScaler, err := New(mgr.GetClient(), allocator) - if err != nil { - return err - } - return mgr.Add(autoScaler) -} diff --git a/internal/autoscaling/recommender/cron/cron.go b/internal/autoscaling/recommender/cron/cron.go deleted file mode 100644 index 70185959..00000000 --- a/internal/autoscaling/recommender/cron/cron.go +++ /dev/null @@ -1,23 +0,0 @@ -package cron - -import ( - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling" -) - -type CronRecommender struct{} - -func New() *CronRecommender { - return &CronRecommender{} -} - -func (c *CronRecommender) Name() string { - return "cron" -} - -func (c *CronRecommender) Recommend(w *autoscaling.WorkloadState) { - c.getCronConfig(&w.AutoScalingConfig) -} - -func (c *CronRecommender) getCronConfig(asc *tfv1.AutoScalingConfig) { -} diff --git a/internal/autoscaling/recommender/recommender.go b/internal/autoscaling/recommender/recommender.go deleted file mode 100644 index a6470860..00000000 --- a/internal/autoscaling/recommender/recommender.go +++ /dev/null @@ -1,28 +0,0 @@ -package recommender - -import ( - "github.com/NexusGPU/tensor-fusion/internal/autoscaling" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender/percentile" - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/recommender/cron" -) - -const ( - PercentileRecommender = "percentile" - CronRecommender = "cron" -) - -type Interface interface { - Name() string - Recommend(*autoscaling.WorkloadState) -} - -func New(name string) Interface { - switch name { - case PercentileRecommender: - return percentile.NewRecommender() - case CronRecommender: - return cron.New() - default: - return nil - } -} diff --git a/internal/autoscaling/workerstate.go b/internal/autoscaling/workerstate.go deleted file mode 100644 index f826ca95..00000000 --- a/internal/autoscaling/workerstate.go +++ /dev/null @@ -1,68 +0,0 @@ -package autoscaling - -import ( - "time" - - "github.com/NexusGPU/tensor-fusion/internal/autoscaling/metrics" -) - -type WorkerState struct { - Name string - Workload string - LastTflopsSampleTime time.Time - - VramPeak uint64 - LastVramSampleTime time.Time - VramWindowEnd time.Time -} - -func NewWorkerState(name string, workload string) *WorkerState { - return &WorkerState{ - 
Name: name, - Workload: workload, - LastTflopsSampleTime: time.Time{}, - LastVramSampleTime: time.Time{}, - VramWindowEnd: time.Time{}, - } -} - -func (w *WorkerState) AddTflopsSample(workload *WorkloadState, metrics *metrics.WorkerUsage) bool { - if metrics.Timestamp.Before(w.LastTflopsSampleTime) { - return false - } - workload.TflopsHistogram.AddSample(float64(metrics.TflopsUsage), minSampleWeight, metrics.Timestamp) - w.LastTflopsSampleTime = metrics.Timestamp - return true -} - -func (w *WorkerState) AddVramSample(workload *WorkloadState, metrics *metrics.WorkerUsage) bool { - ts := metrics.Timestamp - if ts.Before(w.LastVramSampleTime) { - return false - } - w.LastVramSampleTime = ts - if w.VramWindowEnd.IsZero() { - w.VramWindowEnd = ts - } - - addNewPeak := false - if ts.Before(w.VramWindowEnd) { - if w.VramPeak != 0 && metrics.VramUsage > w.VramPeak { - workload.VramHistogram.SubtractSample(float64(w.VramPeak), 1.0, w.VramWindowEnd) - addNewPeak = true - } - } else { - aggregationInteval := DefaultAggregationInterval - shift := ts.Sub(w.VramWindowEnd).Truncate(aggregationInteval) + aggregationInteval - w.VramWindowEnd = w.VramWindowEnd.Add(shift) - w.VramPeak = 0 - addNewPeak = true - } - - if addNewPeak { - workload.VramHistogram.AddSample(float64(metrics.VramUsage), 1.0, metrics.Timestamp) - w.VramPeak = metrics.VramUsage - } - - return true -} From acca957c8282b2e515af6da8a6742b131b1cd3a1 Mon Sep 17 00:00:00 2001 From: knave Date: Mon, 21 Jul 2025 09:51:15 +0800 Subject: [PATCH 17/27] feat: define cron scaler crd --- api/v1/schedulingconfigtemplate_types.go | 25 ++++++ api/v1/zz_generated.deepcopy.go | 37 ++++++++- ...r-fusion.ai_schedulingconfigtemplates.yaml | 83 +++++++++++++++++++ ...ensor-fusion.ai_tensorfusionworkloads.yaml | 83 +++++++++++++++++++ .../tensor-fusion.ai_workloadprofiles.yaml | 83 +++++++++++++++++++ ...r-fusion.ai_schedulingconfigtemplates.yaml | 83 +++++++++++++++++++ ...ensor-fusion.ai_tensorfusionworkloads.yaml | 83 +++++++++++++++++++ .../tensor-fusion.ai_workloadprofiles.yaml | 83 +++++++++++++++++++ 8 files changed, 558 insertions(+), 2 deletions(-) diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index a16fb6ad..320c7678 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -93,6 +93,31 @@ type AutoScalingConfig struct { // layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit // HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works) AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"` + + // CronScalers defines a list of CronScaler configurations used to schedule scaling actions based on cron expressions. + CronScalers []CronScaler `json:"cronScalers,omitempty"` +} + +// CronScaler defines the configuration for scaling resources based on a cron schedule. +// It allows enabling/disabling the scaler, specifying the time window for scaling, +// and configuring the desired resources and replicas during the scheduled period. +type CronScaler struct { + // Enable specifies whether the cron scaler is enabled. + Enable *bool `json:"enable,omitempty"` + // Name is the identifier for the cron scaler. + Name string `json:"name,omitempty"` + // Start is the start time for the scaling schedule, in cron format. 
+ Start string `json:"start,omitempty"` + // End is the end time for the scaling schedule, in cron format. + End string `json:"end,omitempty"` + // DesiredResources specifies the target resources to scale to during the schedule. + DesiredResources Resources `json:"desiredResources,omitempty"` + // ResourceMultiplier is a string representing the multiplier to apply to resources. + ResourceMultiplier string `json:"resourceMultiplier,omitempty"` + // DesiredReplicas is the target number of replicas during the schedule. + DesiredReplicas *int32 `json:"desiredReplicas,omitempty"` + // ReplicasMultiplier is a string representing the multiplier to apply to replicas. + ReplicasMultiplier string `json:"replicasMultiplier,omitempty"` } type AutoSetResources struct { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 7dc5882e..4ebd264e 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -140,6 +140,13 @@ func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig) { *out = *in out.AutoSetResources = in.AutoSetResources out.AutoSetReplicas = in.AutoSetReplicas + if in.CronScalers != nil { + in, out := &in.CronScalers, &out.CronScalers + *out = make([]CronScaler, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoScalingConfig. @@ -361,6 +368,32 @@ func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CronScaler) DeepCopyInto(out *CronScaler) { + *out = *in + if in.Enable != nil { + in, out := &in.Enable, &out.Enable + *out = new(bool) + **out = **in + } + in.DesiredResources.DeepCopyInto(&out.DesiredResources) + if in.DesiredReplicas != nil { + in, out := &in.DesiredReplicas, &out.DesiredReplicas + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CronScaler. +func (in *CronScaler) DeepCopy() *CronScaler { + if in == nil { + return nil + } + out := new(CronScaler) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
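Put together, a cronScalers entry in a workload's autoScalingConfig could look like the sketch below; the cron strings, resource figures, and the exact nesting under autoScalingConfig are assumptions for illustration rather than an example shipped with this change:

    autoScalingConfig:
      cronScalers:
        - name: business-hours
          enable: true
          start: "0 9 * * 1-5"    # assumed standard 5-field cron syntax
          end: "0 18 * * 1-5"
          desiredResources:
            requests:
              tflops: "20"
              vram: 16Gi
            limits:
              tflops: "40"
              vram: 16Gi
          desiredReplicas: 4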
func (in *DataPipeline4ResourcesConfig) DeepCopyInto(out *DataPipeline4ResourcesConfig) { *out = *in @@ -1971,7 +2004,7 @@ func (in *SchedulingConfigTemplateSpec) DeepCopyInto(out *SchedulingConfigTempla if in.AutoScaling != nil { in, out := &in.AutoScaling, &out.AutoScaling *out = new(AutoScalingConfig) - **out = **in + (*in).DeepCopyInto(*out) } if in.ReBalancer != nil { in, out := &in.ReBalancer, &out.ReBalancer @@ -2495,7 +2528,7 @@ func (in *WorkloadProfileSpec) DeepCopyInto(out *WorkloadProfileSpec) { **out = **in } in.Resources.DeepCopyInto(&out.Resources) - out.AutoScalingConfig = in.AutoScalingConfig + in.AutoScalingConfig.DeepCopyInto(&out.AutoScalingConfig) if in.NodeAffinity != nil { in, out := &in.NodeAffinity, &out.NodeAffinity *out = new(corev1.NodeAffinity) diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 65092ff0..81599c21 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -122,6 +122,89 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalers: + description: CronScalers defines a list of CronScaler configurations + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScaler defines the configuration for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. + properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. 
+ type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml index 95c4c5dc..57498b44 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -118,6 +118,89 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalers: + description: CronScalers defines a list of CronScaler configurations + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScaler defines the configuration for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. + properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. 
+ type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml index d8e57ee9..33b4a65a 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml @@ -117,6 +117,89 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalers: + description: CronScalers defines a list of CronScaler configurations + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScaler defines the configuration for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. + properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 65092ff0..81599c21 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -122,6 +122,89 @@ spec: the upper bound on vram recommendation. 
Default: 0.95' type: string type: object + cronScalers: + description: CronScalers defines a list of CronScaler configurations + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScaler defines the configuration for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. + properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml index 95c4c5dc..57498b44 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -118,6 +118,89 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalers: + description: CronScalers defines a list of CronScaler configurations + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScaler defines the configuration for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. 
+ format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. + properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml index d8e57ee9..33b4a65a 100644 --- a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml +++ b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml @@ -117,6 +117,89 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalers: + description: CronScalers defines a list of CronScaler configurations + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScaler defines the configuration for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default From 1d16f925d2caeef089def9fac43f94d8b91e312d Mon Sep 17 00:00:00 2001 From: knave Date: Mon, 28 Jul 2025 09:37:35 +0800 Subject: [PATCH 18/27] feat: implement cron scaling --- internal/autoscaler/autoscaler.go | 58 +++++--- internal/autoscaler/autoscaler_test.go | 126 +++++++++++++--- internal/autoscaler/recommendation.go | 19 +++ .../recommender/cron_recommender.go | 96 +++++++++++- .../recommender/cron_recommender_test.go | 140 ++++++++++++++++++ .../recommender/percentile_recommender.go | 32 ++-- .../percentile_recommender_test.go | 48 +++--- .../autoscaler/recommender/recommender.go | 22 +-- .../recommender/recommender_test.go | 1 + internal/autoscaler/workload/handler.go | 130 +++++++++++----- internal/autoscaler/workload/workload.go | 54 +++++-- internal/autoscaler/workload/workload_test.go | 50 +++++-- 12 files changed, 614 insertions(+), 162 deletions(-) create mode 100644 internal/autoscaler/recommendation.go create mode 100644 internal/autoscaler/recommender/cron_recommender_test.go create mode 100644 internal/autoscaler/recommender/recommender_test.go diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 87a6a5bb..852d9357 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -26,8 +26,8 @@ type Autoscaler struct { allocator *gpuallocator.GpuAllocator metricsProvider metrics.Provider recommenders []recommender.Interface - workloadHandler *workload.Handler - workloads map[string]*workload.WorkloadState + workloadHandler workload.Handler + workloads map[string]*workload.State } func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { @@ -50,7 +50,7 @@ func 
NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Auto metricsProvider: metrics.NewProvider(nil), recommenders: recommenders, workloadHandler: workload.NewHandler(c, allocator), - workloads: map[string]*workload.WorkloadState{}, + workloads: map[string]*workload.State{}, }, nil } @@ -103,18 +103,16 @@ func (s *Autoscaler) loadWorkloads(ctx context.Context) { if !workload.DeletionTimestamp.IsZero() { continue } - workloadState := s.findOrCreateWorkload(workload.Name) - workloadState.Namespace = workload.Namespace - workloadState.Spec = workload.Spec - observedWorkloads[workload.Name] = true - s.workloadHandler.UpdateWorkers(ctx, workloadState) + workloadState := s.findOrCreateWorkloadState(workload.Name) + s.workloadHandler.UpdateWorkloadState(ctx, workloadState, &workload) + observedWorkloads[workload.Name] = true } // remove non-existent workloads - for key := range s.workloads { - if !observedWorkloads[key] { - delete(s.workloads, key) + for name := range s.workloads { + if !observedWorkloads[name] { + delete(s.workloads, name) } } } @@ -129,8 +127,7 @@ func (s *Autoscaler) loadHistoryMetrics(ctx context.Context) { return } for _, sample := range workersMetrics { - workload := s.findOrCreateWorkload(sample.WorkloadName) - workload.AddSample(sample) + s.findOrCreateWorkloadState(sample.WorkloadName).AddSample(sample) } } @@ -156,27 +153,42 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { log.Info("processing workloads") for _, workload := range s.workloads { - recommendations := map[string]recommender.RecommendedResources{} + recommendations := map[string]*tfv1.RecommendedResources{} for _, recommender := range s.recommenders { name := recommender.Name() - recommendations[name] = recommender.Recommend(&workload.Spec.AutoScalingConfig, workload.WorkerUsageAggregator) + recommendation, err := recommender.Recommend(workload) + if err != nil { + log.Error(err, "failed to recommend resources", "recommender", name) + continue + } + if recommendation == nil { + continue + } + + recommendations[name] = recommendation log.Info("recommendation", "recommender", name, "workload", workload.Name, "resources", recommendations[name]) } - // var finalRecommendation recommender.RecommendedResources + if len(recommendations) == 0 { + continue + } + + var finalRecommendation *tfv1.RecommendedResources // for _, recommendation := range recommendations { - // if recommendation.TargetTflops.IsZero() + // finalRecommendation = recommendation // } + // process cron recommendation + if recommendation, ok := recommendations[recommender.Cron]; ok { + finalRecommendation = recommendation + } - // TODO: Implement updating the recommendation status of the workload CRD when the API is ready. 
- workload.UpdateRecommendation(recommendations["percentile"]) - s.workloadHandler.ProcessWorkload(ctx, workload) + s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation) } } -func (s *Autoscaler) findOrCreateWorkload(name string) *workload.WorkloadState { - w, ok := s.workloads[name] - if !ok { +func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { + w, exists := s.workloads[name] + if !exists { w = workload.NewWorkloadState(name) s.workloads[name] = w } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 4ed653a6..67c53064 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -24,7 +24,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" - "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/aws/smithy-go/ptr" . "github.com/onsi/ginkgo/v2" @@ -79,7 +79,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when loading workloads", func() { - It("should keep the state of workloads and workers with auto-scaling enabled", func() { + It("should keep the state of workloads", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(3). Build() @@ -110,17 +110,13 @@ var _ = Describe("Autoscaler", func() { updateWorkloadReplicas(workload0, 1) scaler.loadWorkloads(ctx) - Expect(scaler.workloads[workload0.Name].Workers).To(HaveLen(2)) + Expect(scaler.workloads[workload0.Name].Workers).To(HaveLen(1)) deleteWorkload(workload0) deleteWorkload(workload1) scaler.loadWorkloads(ctx) Expect(scaler.workloads).NotTo(HaveKey(workload0.Name)) - workers = scaler.workloads[workload0.Name].Workers - Expect(workers).NotTo(HaveKey(workload0Workers[0].Name)) - Expect(workers).NotTo(HaveKey(workload0Workers[1].Name)) Expect(scaler.workloads).NotTo(HaveKey(workload1.Name)) - Expect(scaler.workloads[workload1.Name].Workers).NotTo(HaveKey(workload1Workers[0].Name)) }) }) @@ -231,6 +227,79 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) }) + It("should update resources based on cron auto scaling config", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + workloadState := scaler.workloads[workload.Name] + lastResources := workloadState.Spec.Resources + + tflopsRequestInRule := resource.MustParse("20") + vramRequestInRule := resource.MustParse("16Gi") + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequestInRule, + Vram: vramRequestInRule, + }, + }, + }, + } + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + tflopsRequest, _, vramRequest, _ := parseResourceAnnotations(getWorkers(workload)[0]) + Expect(tflopsRequest.Equal(tflopsRequestInRule)).To(BeTrue()) + Expect(vramRequest.Equal(vramRequestInRule)).To(BeTrue()) + tflopsRequest, _, vramRequest, _ = parseLastResourceAnnotations(getWorkers(workload)[0]) + Expect(tflopsRequest.Equal(lastResources.Requests.Tflops)).To(BeTrue()) + Expect(vramRequest.Equal(lastResources.Requests.Vram)).To(BeTrue()) + }).Should(Succeed()) + + // invalidate the rule by updating start and end fields + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequestInRule, + Vram: vramRequestInRule, + }, + }, + }, + } + + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + tflopsRequest, _, vramRequest, _ := parseResourceAnnotations(getWorkers(workload)[0]) + Expect(tflopsRequest.Equal(lastResources.Requests.Tflops)).To(BeTrue()) + Expect(vramRequest.Equal(lastResources.Requests.Vram)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should merge multiple recommendations", func() { + + }) + + It("should not update resource if resource is zero", func() { + + }) + It("should return an error if recommended resources exceeded quota", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
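Note on the assertions above: the cron scaling test relies on a save/restore convention for the resource annotations. Before the autoscaler writes new request values, the values currently in effect are copied into last-* annotations so they can be restored once no cron rule is active (this is what handler.ApplyRecommendationToWorkload does later in this patch, and what GetLastResourcesFromAnnotations reads back). A minimal, self-contained sketch of that convention follows; the key names and the saveAndSet helper are illustrative stand-ins, not code from this series.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// saveAndSet copies the value currently stored under key into lastKey
// (falling back to the spec'd default when the key is absent) and then
// writes newValue, mirroring how the handler records "last-*" annotations
// before overwriting the live ones.
func saveAndSet(annotations map[string]string, key, lastKey, newValue, fallback string) {
	if cur, ok := annotations[key]; ok {
		annotations[lastKey] = cur
	} else {
		annotations[lastKey] = fallback
	}
	annotations[key] = newValue
}

func main() {
	// Illustrative keys; the real ones are built from constants.Domain.
	const reqKey, lastReqKey = "tflops-request", "last-tflops-request"

	ann := map[string]string{reqKey: "10"}
	saveAndSet(ann, reqKey, lastReqKey, resource.MustParse("20").String(), "10")
	fmt.Println(ann[reqKey], ann[lastReqKey]) // 20 10

	// Restoring parses the last-* value back into a Quantity, which is what
	// GetLastResourcesFromAnnotations does for all four request/limit keys.
	restored := resource.MustParse(ann[lastReqKey])
	fmt.Println(restored.String()) // 10
}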
@@ -358,34 +427,38 @@ func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*metrics.WorkerUsage, error return sample, nil } -type FakeUpScalingRecommender struct { - recommender.Interface +type FakeUpScalingRecommender struct{} + +func (f *FakeUpScalingRecommender) Name() string { + return "FakeUpScaling" } -func (f *FakeUpScalingRecommender) Recommend(_ *tfv1.AutoScalingConfig, _ *metrics.WorkerUsageAggregator) recommender.RecommendedResources { - return recommender.RecommendedResources{ +func (f *FakeUpScalingRecommender) Recommend(workoad *workload.State) (*tfv1.RecommendedResources, error) { + return &tfv1.RecommendedResources{ TargetTflops: resource.MustParse("110"), LowerBoundTflops: resource.MustParse("100"), UpperBoundTflops: resource.MustParse("120"), TargetVram: resource.MustParse("110Gi"), LowerBoundVram: resource.MustParse("100Gi"), UpperBoundVram: resource.MustParse("120Gi"), - } + }, nil } -type FakeQuotaExceededRecommender struct { - recommender.Interface +type FakeQuotaExceededRecommender struct{} + +func (f *FakeQuotaExceededRecommender) Name() string { + return "FakeQuotaExceeded" } -func (f *FakeQuotaExceededRecommender) Recommend(_ *tfv1.AutoScalingConfig, _ *metrics.WorkerUsageAggregator) recommender.RecommendedResources { - return recommender.RecommendedResources{ +func (f *FakeQuotaExceededRecommender) Recommend(workoad *workload.State) (*tfv1.RecommendedResources, error) { + return &tfv1.RecommendedResources{ TargetTflops: resource.MustParse("9999"), LowerBoundTflops: resource.MustParse("9999"), UpperBoundTflops: resource.MustParse("9999"), TargetVram: resource.MustParse("999Gi"), LowerBoundVram: resource.MustParse("999Gi"), UpperBoundVram: resource.MustParse("999Gi"), - } + }, nil } func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { @@ -438,7 +511,7 @@ func cleanupWorkload(key client.ObjectKey) { }).Should(Succeed()) } -func assertWorkerAnnotations(worker *corev1.Pod, rr recommender.RecommendedResources) { +func assertWorkerAnnotations(worker *corev1.Pod, rr tfv1.RecommendedResources) { GinkgoHelper() tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker) Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) @@ -464,6 +537,23 @@ func parseResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, v return } +func parseLastResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, vramRequest, vramLimit resource.Quantity) { + annotations := worker.GetAnnotations() + keys := []struct { + key string + dst *resource.Quantity + }{ + {constants.LastTFLOPSRequestAnnotation, &tflopsRequest}, + {constants.LastTFLOPSLimitAnnotation, &tflopsLimit}, + {constants.LastVRAMRequestAnnotation, &vramRequest}, + {constants.LastVRAMLimitAnnotation, &vramLimit}, + } + for _, k := range keys { + *k.dst = resource.MustParse(annotations[k.key]) + } + return +} + func mockSchedulerLoop(ctx context.Context, cfg *rest.Config) { ticker := time.NewTicker(50 * time.Millisecond) clientset, err := kubernetes.NewForConfig(cfg) diff --git a/internal/autoscaler/recommendation.go b/internal/autoscaler/recommendation.go new file mode 100644 index 00000000..2a2993c4 --- /dev/null +++ b/internal/autoscaler/recommendation.go @@ -0,0 +1,19 @@ +package autoscaler + +import ( + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" +) + +type RecommendationProcessor interface { + Process() +} + +type CronRecommendationProcessor struct{} + +func (c *CronRecommendationProcessor) Process() { + +} + +func 
MergeRecommendations() *tfv1.RecommendedResources { + return &tfv1.RecommendedResources{} +} diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index 7f900b1c..6266cdfe 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -1,24 +1,108 @@ package recommender import ( + "fmt" + "time" + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/robfig/cron/v3" + "k8s.io/apimachinery/pkg/api/resource" ) -type CronRecommender struct{} +type CronRecommender struct { + parser cron.Parser +} func NewCronRecommender() *CronRecommender { - return &CronRecommender{} + return &CronRecommender{ + parser: cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow), + } } func (c *CronRecommender) Name() string { return "cron" } -func (p *CronRecommender) Recommend(config *tfv1.AutoScalingConfig, w *metrics.WorkerUsageAggregator) RecommendedResources { - return RecommendedResources{} +func (c *CronRecommender) Recommend(w *workload.State) (*tfv1.RecommendedResources, error) { + activeRule, err := c.getActiveCronScalingRule(&w.Spec.AutoScalingConfig) + if err != nil { + return nil, fmt.Errorf("failed to get active cron scaling rule %w", err) + } + + var tflopsRequest, vramRequest resource.Quantity + if activeRule == nil { + // if no active rule, return last resources if annotations exists + resources, err := w.GetLastResourcesFromAnnotations() + if err != nil { + return nil, fmt.Errorf("failed to get last resources: %w", err) + } + // no annotations + if resources == nil { + return nil, nil + } + tflopsRequest = resources.Requests.Tflops + vramRequest = resources.Requests.Vram + } else { + tflopsRequest = activeRule.DesiredResources.Requests.Tflops + vramRequest = activeRule.DesiredResources.Requests.Vram + } + + return &tfv1.RecommendedResources{ + LowerBoundTflops: tflopsRequest, + TargetTflops: tflopsRequest, + UpperBoundTflops: tflopsRequest, + LowerBoundVram: vramRequest, + TargetVram: vramRequest, + UpperBoundVram: vramRequest, + }, nil } -func (c *CronRecommender) getCronConfig(asc *tfv1.AutoScalingConfig) { +func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfig) (*tfv1.CronScalingRule, error) { + activeRules := []*tfv1.CronScalingRule{} + + currentTime := time.Now() + + for _, rule := range config.CronScalingRules { + if !rule.Enable || rule.Start == "" || rule.End == "" { + continue + } + + if rule.Start == rule.End { + return nil, fmt.Errorf("start and end can not same") + } + + startSchedule, err := c.parser.Parse(rule.Start) + if err != nil { + return nil, fmt.Errorf("failed to parse start: %w", err) + } + endSchedule, err := c.parser.Parse(rule.End) + if err != nil { + return nil, fmt.Errorf("failed to parse end: %w", err) + } + + nextStartTime := startSchedule.Next(time.Now()) + nextEndTime := endSchedule.Next(time.Now()) + + isActive := false + if nextStartTime.Before(nextEndTime) { + isActive = currentTime.After(nextStartTime) && currentTime.Before(nextEndTime) + } else { + isActive = currentTime.After(nextStartTime) || currentTime.Before(nextEndTime) + } + + if isActive { + activeRules = append(activeRules, &rule) + } + } + + if len(activeRules) > 1 { + return nil, fmt.Errorf("only one active cron scaling rule is permitted at any given time") + } + + if len(activeRules) == 
0 { + return nil, nil + } + return activeRules[0], nil } diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go new file mode 100644 index 00000000..dffce1be --- /dev/null +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -0,0 +1,140 @@ +package recommender + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/constants" +) + +var _ = Describe("CronRecommender", func() { + It("should return recommended resource based on active cron scaling rule", func() { + tflopsRequest := resource.MustParse("10") + vramRequest := resource.MustParse("8Gi") + tflopsLimit := resource.MustParse("20") + vramLimit := resource.MustParse("16Gi") + + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequest, + Vram: vramRequest, + }, + Limits: tfv1.Resource{ + Tflops: tflopsLimit, + Vram: vramLimit, + }, + }, + }, + }, + } + + recommender := NewCronRecommender() + recommendation, _ := recommender.Recommend(workload) + Expect(*recommendation).To(Equal(tfv1.RecommendedResources{ + LowerBoundTflops: tflopsRequest, + TargetTflops: tflopsRequest, + UpperBoundTflops: tflopsRequest, + LowerBoundVram: vramRequest, + TargetVram: vramRequest, + UpperBoundVram: vramRequest, + })) + }) + + FIt("should return recommended resource based on last resource annotations", func() { + tflopsRequest := resource.MustParse("10") + vramRequest := resource.MustParse("8Gi") + // tflopsLimit := resource.MustParse("20") + // vramLimit := resource.MustParse("16Gi") + + workload := workload.NewWorkloadState("test") + workload.Annotations = map[string]string{ + constants.LastTFLOPSRequestAnnotation: tflopsRequest.String(), + constants.LastVRAMRequestAnnotation: vramRequest.String(), + // constants.LastTFLOPSLimitAnnotation: tflopsLimit.String(), + // constants.LastVRAMLimitAnnotation: vramLimit.String(), + } + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + }, + }, + } + + recommender := NewCronRecommender() + recommendation, _ := recommender.Recommend(workload) + Expect(recommendation.Equal(&tfv1.RecommendedResources{ + LowerBoundTflops: tflopsRequest, + TargetTflops: tflopsRequest, + UpperBoundTflops: tflopsRequest, + LowerBoundVram: vramRequest, + TargetVram: vramRequest, + UpperBoundVram: vramRequest, + })).To(BeTrue()) + }) + + It("should return error if getting multiple active rules", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + }, + } + recommender := NewCronRecommender() + _, err := recommender.Recommend(workload) + Expect(err).To(HaveOccurred()) + }) + + It("should not return cron scaling rule if no config or disable", func() { + asc := tfv1.AutoScalingConfig{ + 
CronScalingRules: []tfv1.CronScalingRule{}, + } + Expect(NewCronRecommender().getActiveCronScalingRule(&asc)).To(BeNil()) + asc = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + {Enable: false}, + }, + } + Expect(NewCronRecommender().getActiveCronScalingRule(&asc)).To(BeNil()) + }) + + It("should return the active cron scaling rule if the current time falls within its scheduled interval", func() { + asc := tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + }, + } + rule, _ := NewCronRecommender().getActiveCronScalingRule(&asc) + Expect(rule).NotTo(BeNil()) + }) +}) diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go index f4d3ed51..35be2503 100644 --- a/internal/autoscaler/recommender/percentile_recommender.go +++ b/internal/autoscaler/recommender/percentile_recommender.go @@ -5,7 +5,7 @@ import ( "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" ) const ( @@ -66,23 +66,31 @@ func (p *PercentileRecommender) Name() string { return "percentile" } -func (p *PercentileRecommender) Recommend(config *tfv1.AutoScalingConfig, w *metrics.WorkerUsageAggregator) RecommendedResources { +func (p *PercentileRecommender) Recommend(workload *workload.State) (*tfv1.RecommendedResources, error) { // TODO: cache config - p.createEstimatorsFromConfig(p.getPercentileConfig(config)) - return RecommendedResources{ - LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(w)), - TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(w)), - UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(w)), - LowerBoundVram: QuantityFromAmount(p.lowerBoundVram.GetVramEstimation(w)), - TargetVram: QuantityFromAmount(p.targetVram.GetVramEstimation(w)), - UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(w)), + aggregator := workload.WorkerUsageAggregator + if aggregator.TflopsHistogram.IsEmpty() && aggregator.VramHistogram.IsEmpty() { + return nil, nil } + + p.createEstimatorsFromConfig(p.getPercentileConfig(&workload.Spec.AutoScalingConfig.AutoSetResources)) + return &tfv1.RecommendedResources{ + LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(aggregator)), + TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(aggregator)), + UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(aggregator)), + LowerBoundVram: QuantityFromAmount(p.lowerBoundVram.GetVramEstimation(aggregator)), + TargetVram: QuantityFromAmount(p.targetVram.GetVramEstimation(aggregator)), + UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(aggregator)), + }, nil } -func (p *PercentileRecommender) getPercentileConfig(asc *tfv1.AutoScalingConfig) *PercentileConfig { +func (p *PercentileRecommender) getPercentileConfig(asr *tfv1.AutoSetResources) *PercentileConfig { cfg := defaultPercentileConfig - asr := asc.AutoSetResources + if asr == nil { + return &cfg + } + fields := []struct { val string dst *float64 diff --git a/internal/autoscaler/recommender/percentile_recommender_test.go b/internal/autoscaler/recommender/percentile_recommender_test.go index b4782296..fd6fe8a4 100644 --- a/internal/autoscaler/recommender/percentile_recommender_test.go +++ 
b/internal/autoscaler/recommender/percentile_recommender_test.go @@ -16,18 +16,16 @@ var _ = Describe("Percentile Recommender", func() { }) It("should parse float fields from AutoSetResources", func() { - asc := &tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - TargetTflopsPercentile: "0.8", - LowerBoundTflopsPercentile: "0.1", - UpperBoundTflopsPercentile: "0.95", - TargetVramPercentile: "0.7", - LowerBoundVramPercentile: "0.2", - UpperBoundVramPercentile: "0.9", - RequestMarginFraction: "0.15", - }, + asr := &tfv1.AutoSetResources{ + TargetTflopsPercentile: "0.8", + LowerBoundTflopsPercentile: "0.1", + UpperBoundTflopsPercentile: "0.95", + TargetVramPercentile: "0.7", + LowerBoundVramPercentile: "0.2", + UpperBoundVramPercentile: "0.9", + RequestMarginFraction: "0.15", } - cfg := NewPercentileRecommender().getPercentileConfig(asc) + cfg := NewPercentileRecommender().getPercentileConfig(asr) Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) @@ -38,36 +36,30 @@ var _ = Describe("Percentile Recommender", func() { }) It("should ignore invalid float fields and keep defaults", func() { - asc := &tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - TargetTflopsPercentile: "not-a-float", - LowerBoundTflopsPercentile: "", - UpperBoundTflopsPercentile: "0.99", - }, + asr := &tfv1.AutoSetResources{ + TargetTflopsPercentile: "not-a-float", + LowerBoundTflopsPercentile: "", + UpperBoundTflopsPercentile: "0.99", } - cfg := NewPercentileRecommender().getPercentileConfig(asc) + cfg := NewPercentileRecommender().getPercentileConfig(asr) Expect(cfg.TargetTflopsPercentile).To(Equal(defaultPercentileConfig.TargetTflopsPercentile)) Expect(cfg.LowerBoundTflopsPercentile).To(Equal(defaultPercentileConfig.LowerBoundTflopsPercentile)) Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) }) It("should parse ConfidenceInterval if valid", func() { - asc := &tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - ConfidenceInterval: "30m", - }, + asr := &tfv1.AutoSetResources{ + ConfidenceInterval: "30m", } - cfg := NewPercentileRecommender().getPercentileConfig(asc) + cfg := NewPercentileRecommender().getPercentileConfig(asr) Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) }) It("should ignore invalid ConfidenceInterval and keep default", func() { - asc := &tfv1.AutoScalingConfig{ - AutoSetResources: tfv1.AutoSetResources{ - ConfidenceInterval: "not-a-duration", - }, + asr := &tfv1.AutoSetResources{ + ConfidenceInterval: "not-a-duration", } - cfg := NewPercentileRecommender().getPercentileConfig(asc) + cfg := NewPercentileRecommender().getPercentileConfig(asr) Expect(cfg.ConfidenceInterval).To(Equal(defaultPercentileConfig.ConfidenceInterval)) }) }) diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go index 7f4cb3ca..c500fa7c 100644 --- a/internal/autoscaler/recommender/recommender.go +++ b/internal/autoscaler/recommender/recommender.go @@ -4,35 +4,25 @@ import ( "fmt" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" - "k8s.io/apimachinery/pkg/api/resource" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" ) const ( - RecommenderPercentile = "percentile" - RecommenderCron = "cron" + Percentile = "percentile" + Cron = "cron" ) -type RecommendedResources struct { - LowerBoundTflops resource.Quantity - TargetTflops 
resource.Quantity - UpperBoundTflops resource.Quantity - LowerBoundVram resource.Quantity - TargetVram resource.Quantity - UpperBoundVram resource.Quantity -} - // Interface defines the contract for resource recommendation strategies used by the autoscaler. type Interface interface { Name() string - Recommend(*tfv1.AutoScalingConfig, *metrics.WorkerUsageAggregator) RecommendedResources + Recommend(workload *workload.State) (*tfv1.RecommendedResources, error) } func New(name string) (Interface, error) { switch name { - case RecommenderPercentile: + case Percentile: return NewPercentileRecommender(), nil - case RecommenderCron: + case Cron: return NewCronRecommender(), nil default: return nil, fmt.Errorf("unknown recommender name: %s", name) diff --git a/internal/autoscaler/recommender/recommender_test.go b/internal/autoscaler/recommender/recommender_test.go new file mode 100644 index 00000000..e0645922 --- /dev/null +++ b/internal/autoscaler/recommender/recommender_test.go @@ -0,0 +1 @@ +package recommender diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index 41a78738..b16537b7 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -14,30 +14,78 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" ) -type Handler struct { +type Handler interface { + UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) + ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.RecommendedResources) error + UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *State, worker *corev1.Pod) error +} + +type handler struct { client.Client allocator *gpuallocator.GpuAllocator } -func NewHandler(client client.Client, allocator *gpuallocator.GpuAllocator) *Handler { - return &Handler{ +func NewHandler(client client.Client, allocator *gpuallocator.GpuAllocator) Handler { + return &handler{ Client: client, allocator: allocator, } } -func (h *Handler) UpdateWorkers(ctx context.Context, workload *WorkloadState) { +func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) { + workloadState.Namespace = workload.Namespace + workloadState.Spec = workload.Spec + workloadState.Annotations = workload.Annotations + workerList := &corev1.PodList{} if err := h.List(ctx, workerList, - client.InNamespace(workload.Namespace), - client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { + client.InNamespace(workloadState.Namespace), + client.MatchingLabels{constants.WorkloadKey: workloadState.Name}); err != nil { log.FromContext(ctx).Error(err, "failed to list workers") return } - workload.UpdateWorkers(workerList) + workloadState.updateWorkers(workerList) } -func (h *Handler) ProcessWorkload(ctx context.Context, workload *WorkloadState) { +func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.RecommendedResources) error { + workload := &tfv1.TensorFusionWorkload{} + if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, workload); err != nil { + return fmt.Errorf("failed to get workload: %v", err) + } + + // record current and last resources by annotations + patch := client.MergeFrom(workload.DeepCopy()) + if workload.Annotations == nil { + workload.Annotations = map[string]string{} + } + if tflopsRequest, ok := workload.Annotations[constants.TFLOPSRequestAnnotation]; ok { + 
workload.Annotations[constants.LastTFLOPSRequestAnnotation] = tflopsRequest + } else { + workload.Annotations[constants.LastTFLOPSRequestAnnotation] = workload.Spec.Resources.Requests.Tflops.String() + } + if vramRequest, ok := workload.Annotations[constants.VRAMRequestAnnotation]; ok { + workload.Annotations[constants.LastVRAMRequestAnnotation] = vramRequest + } else { + workload.Annotations[constants.LastVRAMRequestAnnotation] = workload.Spec.Resources.Requests.Vram.String() + } + workload.Annotations[constants.TFLOPSRequestAnnotation] = recommendation.TargetTflops.String() + workload.Annotations[constants.VRAMRequestAnnotation] = recommendation.TargetVram.String() + + if err := h.Patch(ctx, workload, patch); err != nil { + return fmt.Errorf("failed to patch workload: %v", err) + } + + state.Annotations = workload.Annotations + state.Recommendation = *recommendation + + if err := h.ApplyRecommendationToWorkers(ctx, state); err != nil { + return fmt.Errorf("failed to apply recommendation to workers: %v", err) + } + + return nil +} + +func (h *handler) ApplyRecommendationToWorkers(ctx context.Context, workload *State) error { log := log.FromContext(ctx) workerList := &corev1.PodList{} if err := h.List(ctx, workerList, @@ -46,8 +94,8 @@ func (h *Handler) ProcessWorkload(ctx context.Context, workload *WorkloadState) log.Error(err, "failed to list workers") } - if !workload.IsAutoScalingEnabled() { - return + if !workload.IsAutoSetResourcesEnabled() { + return nil } for _, worker := range workerList.Items { @@ -59,9 +107,11 @@ func (h *Handler) ProcessWorkload(ctx context.Context, workload *WorkloadState) log.Error(err, "failed to update worker") } } + + return nil } -func (h *Handler) UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *WorkloadState, worker *corev1.Pod) error { +func (h *handler) UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *State, worker *corev1.Pod) error { log := log.FromContext(ctx) adjustRequest, err := getCurrentWorkerResourceRequest(worker) @@ -69,36 +119,42 @@ func (h *Handler) UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *W return fmt.Errorf("failed to get current worker resource request, %v", err) } - rr := &workload.Recommendation + recommendation := &workload.Recommendation resourcesInfo := []struct { - name tfv1.ResourceName - requestKey string - limitKey string - request *resource.Quantity - limit *resource.Quantity - lowerBound resource.Quantity - upperBound resource.Quantity - target resource.Quantity + name tfv1.ResourceName + requestKey string + limitKey string + lastRequestKey string + lastLimitKey string + request *resource.Quantity + limit *resource.Quantity + lowerBound resource.Quantity + upperBound resource.Quantity + target resource.Quantity }{ { - name: tfv1.ResourceTflops, - requestKey: constants.TFLOPSRequestAnnotation, - limitKey: constants.TFLOPSLimitAnnotation, - request: &adjustRequest.NewRequest.Tflops, - limit: &adjustRequest.NewLimit.Tflops, - lowerBound: rr.LowerBoundTflops, - upperBound: rr.UpperBoundTflops, - target: rr.TargetTflops, + name: tfv1.ResourceTflops, + requestKey: constants.TFLOPSRequestAnnotation, + limitKey: constants.TFLOPSLimitAnnotation, + lastRequestKey: constants.LastTFLOPSRequestAnnotation, + lastLimitKey: constants.LastTFLOPSLimitAnnotation, + request: &adjustRequest.NewRequest.Tflops, + limit: &adjustRequest.NewLimit.Tflops, + lowerBound: recommendation.LowerBoundTflops, + upperBound: recommendation.UpperBoundTflops, + target: recommendation.TargetTflops, }, { - name: 
tfv1.ResourceVram, - requestKey: constants.VRAMRequestAnnotation, - limitKey: constants.VRAMLimitAnnotation, - request: &adjustRequest.NewRequest.Vram, - limit: &adjustRequest.NewLimit.Vram, - lowerBound: rr.LowerBoundVram, - upperBound: rr.UpperBoundVram, - target: rr.TargetVram, + name: tfv1.ResourceVram, + requestKey: constants.VRAMRequestAnnotation, + limitKey: constants.VRAMLimitAnnotation, + lastRequestKey: constants.LastVRAMRequestAnnotation, + lastLimitKey: constants.LastVRAMLimitAnnotation, + request: &adjustRequest.NewRequest.Vram, + limit: &adjustRequest.NewLimit.Vram, + lowerBound: recommendation.LowerBoundVram, + upperBound: recommendation.UpperBoundVram, + target: recommendation.TargetVram, }, } @@ -116,6 +172,8 @@ func (h *Handler) UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *W if targetLimit == nil { return fmt.Errorf("failed to get limit for %s", resInfo.requestKey) } + newAnnotations[resInfo.lastRequestKey] = resInfo.request.String() + newAnnotations[resInfo.lastLimitKey] = resInfo.limit.String() newAnnotations[resInfo.requestKey] = targetRequest.String() newAnnotations[resInfo.limitKey] = targetLimit.String() *resInfo.request = targetRequest diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go index 5c6e32a0..5d30ca24 100644 --- a/internal/autoscaler/workload/workload.go +++ b/internal/autoscaler/workload/workload.go @@ -1,45 +1,77 @@ package workload import ( + "fmt" "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" - "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" + "github.com/NexusGPU/tensor-fusion/internal/constants" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" ) -type WorkloadState struct { +type State struct { Namespace string Name string + Annotations map[string]string Spec tfv1.WorkloadProfileSpec - Recommendation recommender.RecommendedResources + Recommendation tfv1.RecommendedResources Workers map[string]*WorkerState WorkerUsageAggregator *metrics.WorkerUsageAggregator } -func NewWorkloadState(name string) *WorkloadState { - return &WorkloadState{ +func NewWorkloadState(name string) *State { + return &State{ Name: name, Workers: make(map[string]*WorkerState), WorkerUsageAggregator: metrics.NewWorkerUsageAggregator(), } } -func (w *WorkloadState) UpdateRecommendation(recommendation recommender.RecommendedResources) { - w.Recommendation = recommendation +func (w *State) GetLastResourcesFromAnnotations() (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.LastTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.LastTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.LastVRAMRequestAnnotation, &result.Requests.Vram}, + {constants.LastVRAMLimitAnnotation, &result.Limits.Vram}, + } + annotations := w.Annotations + hasAnnotation := false + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + hasAnnotation = true + } + + if !hasAnnotation { + return nil, nil + } + + return &result, nil } -func (w *WorkloadState) IsAutoScalingEnabled() bool { +func (w *State) IsAutoSetResourcesEnabled() bool { return w.Spec.AutoScalingConfig.AutoSetResources.Enable } -func (w *WorkloadState) ShouldScaleResource(name 
tfv1.ResourceName) bool { +func (w *State) ShouldScaleResource(name tfv1.ResourceName) bool { target := w.Spec.AutoScalingConfig.AutoSetResources.TargetResource return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) } -func (w *WorkloadState) UpdateWorkers(podList *corev1.PodList) { +func (w *State) updateWorkers(podList *corev1.PodList) { observedWorkers := map[string]bool{} for _, worker := range podList.Items { if !worker.DeletionTimestamp.IsZero() { @@ -58,7 +90,7 @@ func (w *WorkloadState) UpdateWorkers(podList *corev1.PodList) { } } -func (w *WorkloadState) AddSample(sample *metrics.WorkerUsage) { +func (w *State) AddSample(sample *metrics.WorkerUsage) { worker, exists := w.Workers[sample.WorkerName] if !exists { worker = NewWorkerState(sample.WorkerName, sample.WorkloadName) diff --git a/internal/autoscaler/workload/workload_test.go b/internal/autoscaler/workload/workload_test.go index 2e36adbe..1de776c3 100644 --- a/internal/autoscaler/workload/workload_test.go +++ b/internal/autoscaler/workload/workload_test.go @@ -2,48 +2,74 @@ package workload import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" ) var _ = Describe("Workload", func() { - It("should correctly determine if a resource is the target based on config", func() { ws := NewWorkloadState("test") - Expect(ws.ShouldScaleResource("tflops")).To(BeTrue()) - Expect(ws.ShouldScaleResource("vram")).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue()) ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"}, } - Expect(ws.ShouldScaleResource("tflops")).To(BeTrue()) - Expect(ws.ShouldScaleResource("vram")).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue()) ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"}, } - Expect(ws.ShouldScaleResource("tflops")).To(BeTrue()) - Expect(ws.ShouldScaleResource("vram")).To(BeFalse()) + Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeFalse()) ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"}, } - Expect(ws.ShouldScaleResource("tflops")).To(BeFalse()) - Expect(ws.ShouldScaleResource("vram")).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeFalse()) + Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue()) }) - It("should correctly determine if auto scaling is enabled based on config", func() { + It("should correctly determine if auto set resources is enabled based on config", func() { ws := NewWorkloadState("test") ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{Enable: true}, } - Expect(ws.IsAutoScalingEnabled()).To(BeTrue()) + Expect(ws.IsAutoSetResourcesEnabled()).To(BeTrue()) ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{Enable: false}, } - Expect(ws.IsAutoScalingEnabled()).To(BeFalse()) + Expect(ws.IsAutoSetResourcesEnabled()).To(BeFalse()) + }) + + It("should return last resources request from the annotations", func() { + ws := 
NewWorkloadState("test") + tflopsRequest := resource.MustParse("10") + vramRequest := resource.MustParse("8Gi") + tflopsLimit := resource.MustParse("20") + vramLimit := resource.MustParse("16Gi") + ws.Annotations = map[string]string{ + constants.LastTFLOPSRequestAnnotation: tflopsRequest.String(), + constants.LastVRAMRequestAnnotation: vramRequest.String(), + constants.LastTFLOPSLimitAnnotation: tflopsLimit.String(), + constants.LastVRAMLimitAnnotation: vramLimit.String(), + } + resources, _ := ws.GetLastResourcesFromAnnotations() + Expect(resources.Requests.Tflops.Equal(tflopsRequest)).To(BeTrue()) + Expect(resources.Requests.Vram.Equal(vramRequest)).To(BeTrue()) + Expect(resources.Limits.Tflops.Equal(tflopsLimit)).To(BeTrue()) + Expect(resources.Limits.Vram.Equal(vramLimit)).To(BeTrue()) + + ws.Annotations = map[string]string{ + constants.LastVRAMLimitAnnotation: vramLimit.String(), + } + _, err := ws.GetLastResourcesFromAnnotations() + Expect(err).To(HaveOccurred()) }) }) From 714284b57be704885730cd0d296a418f961c568e Mon Sep 17 00:00:00 2001 From: knave Date: Mon, 28 Jul 2025 10:09:52 +0800 Subject: [PATCH 19/27] feat: implement cron scaling --- api/v1/schedulingconfigtemplate_types.go | 10 ++--- api/v1/tensorfusionworkload_types.go | 23 +++++++++++ api/v1/zz_generated.deepcopy.go | 40 +++++++++++++------ ...r-fusion.ai_schedulingconfigtemplates.yaml | 6 +-- ...ensor-fusion.ai_tensorfusionworkloads.yaml | 6 +-- .../tensor-fusion.ai_workloadprofiles.yaml | 6 +-- ...r-fusion.ai_schedulingconfigtemplates.yaml | 6 +-- ...ensor-fusion.ai_tensorfusionworkloads.yaml | 6 +-- .../tensor-fusion.ai_workloadprofiles.yaml | 6 +-- go.mod | 1 + go.sum | 2 + internal/autoscaler/autoscaler_test.go | 2 +- internal/constants/constants.go | 4 ++ 13 files changed, 82 insertions(+), 36 deletions(-) diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 320c7678..307fcffb 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -94,16 +94,16 @@ type AutoScalingConfig struct { // HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works) AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"` - // CronScalers defines a list of CronScaler configurations used to schedule scaling actions based on cron expressions. - CronScalers []CronScaler `json:"cronScalers,omitempty"` + // CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. + CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"` } -// CronScaler defines the configuration for scaling resources based on a cron schedule. +// CronScalingRule defines the rule for scaling resources based on a cron schedule. // It allows enabling/disabling the scaler, specifying the time window for scaling, // and configuring the desired resources and replicas during the scheduled period. -type CronScaler struct { +type CronScalingRule struct { // Enable specifies whether the cron scaler is enabled. - Enable *bool `json:"enable,omitempty"` + Enable bool `json:"enable,omitempty"` // Name is the identifier for the cron scaler. Name string `json:"name,omitempty"` // Start is the start time for the scaling schedule, in cron format. 
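For reference, a compact, self-contained sketch of the active-window check the cron recommender builds on: with the five-field parser used above, a rule is active exactly when its end expression fires next before its start expression does. The isActive helper, the sample schedules, and the main function are assumptions for illustration; only the robfig/cron/v3 parser options and the next-start/next-end comparison come from this series.

package main

import (
	"fmt"
	"time"

	"github.com/robfig/cron/v3"
)

// isActive reports whether the window described by the start and end cron
// expressions currently contains now. While inside the window, the next start
// firing lies beyond the upcoming end firing, so comparing the two is enough.
func isActive(parser cron.Parser, start, end string, now time.Time) (bool, error) {
	startSched, err := parser.Parse(start)
	if err != nil {
		return false, fmt.Errorf("parse start: %w", err)
	}
	endSched, err := parser.Parse(end)
	if err != nil {
		return false, fmt.Errorf("parse end: %w", err)
	}
	return !startSched.Next(now).Before(endSched.Next(now)), nil
}

func main() {
	// Same field set as NewCronRecommender: minute, hour, day of month, month, day of week.
	parser := cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

	active, err := isActive(parser, "0 9 * * *", "0 18 * * *", time.Now())
	if err != nil {
		panic(err)
	}
	fmt.Println("rule active:", active) // true between 09:00 and 18:00 local time
}

Under this check, the rule used in the tests (start "0 0 * * *", end "59 23 * * *") stays active for the whole day except the final minute, which is why those tests expect the cron recommendation to take effect.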
diff --git a/api/v1/tensorfusionworkload_types.go b/api/v1/tensorfusionworkload_types.go index f8a62e1d..209addf2 100644 --- a/api/v1/tensorfusionworkload_types.go +++ b/api/v1/tensorfusionworkload_types.go @@ -17,6 +17,7 @@ limitations under the License. package v1 import ( + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -48,6 +49,28 @@ const ( TensorFusionWorkloadPhaseFailed TensorFusionWorkloadPhase = "Failed" ) +type RecommendedResources struct { + LowerBoundTflops resource.Quantity + TargetTflops resource.Quantity + UpperBoundTflops resource.Quantity + LowerBoundVram resource.Quantity + TargetVram resource.Quantity + UpperBoundVram resource.Quantity +} + +func (r RecommendedResources) Merge(target *RecommendedResources) *RecommendedResources { + return target +} + +func (r *RecommendedResources) Equal(t *RecommendedResources) bool { + return r.LowerBoundTflops.Equal(t.LowerBoundTflops) && + r.TargetTflops.Equal(t.TargetTflops) && + r.UpperBoundTflops.Equal(t.UpperBoundTflops) && + r.LowerBoundVram.Equal(t.LowerBoundVram) && + r.TargetVram.Equal(t.TargetVram) && + r.UpperBoundVram.Equal(t.UpperBoundVram) +} + // TensorFusionWorkloadStatus defines the observed state of TensorFusionWorkload. type TensorFusionWorkloadStatus struct { // +kubebuilder:default=Pending diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 4ebd264e..73c40d28 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -140,9 +140,9 @@ func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig) { *out = *in out.AutoSetResources = in.AutoSetResources out.AutoSetReplicas = in.AutoSetReplicas - if in.CronScalers != nil { - in, out := &in.CronScalers, &out.CronScalers - *out = make([]CronScaler, len(*in)) + if in.CronScalingRules != nil { + in, out := &in.CronScalingRules, &out.CronScalingRules + *out = make([]CronScalingRule, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -369,13 +369,8 @@ func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *CronScaler) DeepCopyInto(out *CronScaler) { +func (in *CronScalingRule) DeepCopyInto(out *CronScalingRule) { *out = *in - if in.Enable != nil { - in, out := &in.Enable, &out.Enable - *out = new(bool) - **out = **in - } in.DesiredResources.DeepCopyInto(&out.DesiredResources) if in.DesiredReplicas != nil { in, out := &in.DesiredReplicas, &out.DesiredReplicas @@ -384,12 +379,12 @@ func (in *CronScaler) DeepCopyInto(out *CronScaler) { } } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CronScaler. -func (in *CronScaler) DeepCopy() *CronScaler { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CronScalingRule. +func (in *CronScalingRule) DeepCopy() *CronScalingRule { if in == nil { return nil } - out := new(CronScaler) + out := new(CronScalingRule) in.DeepCopyInto(out) return out } @@ -1848,6 +1843,27 @@ func (in *ReBalancerConfig) DeepCopy() *ReBalancerConfig { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *RecommendedResources) DeepCopyInto(out *RecommendedResources) { + *out = *in + out.LowerBoundTflops = in.LowerBoundTflops.DeepCopy() + out.TargetTflops = in.TargetTflops.DeepCopy() + out.UpperBoundTflops = in.UpperBoundTflops.DeepCopy() + out.LowerBoundVram = in.LowerBoundVram.DeepCopy() + out.TargetVram = in.TargetVram.DeepCopy() + out.UpperBoundVram = in.UpperBoundVram.DeepCopy() +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecommendedResources. +func (in *RecommendedResources) DeepCopy() *RecommendedResources { + if in == nil { + return nil + } + out := new(RecommendedResources) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RemoteWriteConfig) DeepCopyInto(out *RemoteWriteConfig) { *out = *in diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 81599c21..bb8dd068 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -122,12 +122,12 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object - cronScalers: - description: CronScalers defines a list of CronScaler configurations + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. items: description: |- - CronScaler defines the configuration for scaling resources based on a cron schedule. + CronScalingRule defines the rule for scaling resources based on a cron schedule. It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period. properties: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml index 57498b44..1661ae5b 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -118,12 +118,12 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object - cronScalers: - description: CronScalers defines a list of CronScaler configurations + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. items: description: |- - CronScaler defines the configuration for scaling resources based on a cron schedule. + CronScalingRule defines the rule for scaling resources based on a cron schedule. It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period. properties: diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml index 33b4a65a..01005b7c 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml @@ -117,12 +117,12 @@ spec: the upper bound on vram recommendation. 
Default: 0.95' type: string type: object - cronScalers: - description: CronScalers defines a list of CronScaler configurations + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. items: description: |- - CronScaler defines the configuration for scaling resources based on a cron schedule. + CronScalingRule defines the rule for scaling resources based on a cron schedule. It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period. properties: diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 81599c21..bb8dd068 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -122,12 +122,12 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object - cronScalers: - description: CronScalers defines a list of CronScaler configurations + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. items: description: |- - CronScaler defines the configuration for scaling resources based on a cron schedule. + CronScalingRule defines the rule for scaling resources based on a cron schedule. It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period. properties: diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml index 57498b44..1661ae5b 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -118,12 +118,12 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object - cronScalers: - description: CronScalers defines a list of CronScaler configurations + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. items: description: |- - CronScaler defines the configuration for scaling resources based on a cron schedule. + CronScalingRule defines the rule for scaling resources based on a cron schedule. It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period. properties: diff --git a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml index 33b4a65a..01005b7c 100644 --- a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml +++ b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml @@ -117,12 +117,12 @@ spec: the upper bound on vram recommendation. Default: 0.95' type: string type: object - cronScalers: - description: CronScalers defines a list of CronScaler configurations + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. items: description: |- - CronScaler defines the configuration for scaling resources based on a cron schedule. + CronScalingRule defines the rule for scaling resources based on a cron schedule. 
It allows enabling/disabling the scaler, specifying the time window for scaling, and configuring the desired resources and replicas during the scheduled period. properties: diff --git a/go.mod b/go.mod index 4092514e..69a98eeb 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/pkg/errors v0.9.1 + github.com/robfig/cron/v3 v3.0.1 github.com/samber/lo v1.51.0 github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.10.0 diff --git a/go.sum b/go.sum index 57f35cd3..4f7607ff 100644 --- a/go.sum +++ b/go.sum @@ -250,6 +250,8 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 67c53064..07581531 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -300,7 +300,7 @@ var _ = Describe("Autoscaler", func() { }) - It("should return an error if recommended resources exceeded quota", func() { + FIt("should return an error if recommended resources exceeded quota", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
Build() diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 3e0128e5..2183f852 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -55,6 +55,10 @@ const ( VRAMRequestAnnotation = Domain + "/vram-request" TFLOPSLimitAnnotation = Domain + "/tflops-limit" VRAMLimitAnnotation = Domain + "/vram-limit" + LastTFLOPSRequestAnnotation = Domain + "/last-tflops-request" + LastVRAMRequestAnnotation = Domain + "/last-vram-request" + LastTFLOPSLimitAnnotation = Domain + "/last-tflops-limit" + LastVRAMLimitAnnotation = Domain + "/last-vram-limit" WorkloadProfileAnnotation = Domain + "/client-profile" InjectContainerAnnotation = Domain + "/inject-container" IsLocalGPUAnnotation = Domain + "/is-local-gpu" From 22cd81e823c9069600d5540c2eb2e06707dfa67c Mon Sep 17 00:00:00 2001 From: knave Date: Wed, 30 Jul 2025 18:09:03 +0800 Subject: [PATCH 20/27] feat: implement merging recommendations --- api/v1/tensorfusionconnection_types.go | 14 + api/v1/tensorfusionworkload_types.go | 23 -- api/v1/zz_generated.deepcopy.go | 21 -- internal/autoscaler/autoscaler.go | 38 ++- internal/autoscaler/autoscaler_test.go | 302 ++++++++++-------- .../autoscaler/metrics/metrics_aggregator.go | 7 + internal/autoscaler/recommendation.go | 19 -- .../recommender/cron_recommender.go | 35 +- .../recommender/cron_recommender_test.go | 82 ++--- .../recommender/percentile_recommender.go | 79 ++++- .../autoscaler/recommender/recommender.go | 3 +- internal/autoscaler/workload/handler.go | 196 ++++-------- internal/autoscaler/workload/workload.go | 43 +-- internal/autoscaler/workload/workload_test.go | 53 +-- internal/utils/resource.go | 87 +++++ 15 files changed, 510 insertions(+), 492 deletions(-) delete mode 100644 internal/autoscaler/recommendation.go create mode 100644 internal/utils/resource.go diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 874ed4de..11075bbf 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -38,6 +38,20 @@ type Resources struct { Limits Resource `json:"limits"` } +func (r *Resources) Equal(t *Resources) bool { + return r.Requests.Tflops.Equal(t.Requests.Tflops) && + r.Requests.Vram.Equal(t.Requests.Vram) && + r.Limits.Tflops.Equal(t.Limits.Tflops) && + r.Limits.Vram.Equal(t.Limits.Vram) +} + +func (r *Resources) IsZero() bool { + return r.Requests.Tflops.IsZero() && + r.Requests.Vram.IsZero() && + r.Limits.Tflops.IsZero() && + r.Limits.Vram.IsZero() +} + // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection. type TensorFusionConnectionSpec struct { WorkloadName string `json:"workloadName"` diff --git a/api/v1/tensorfusionworkload_types.go b/api/v1/tensorfusionworkload_types.go index 209addf2..f8a62e1d 100644 --- a/api/v1/tensorfusionworkload_types.go +++ b/api/v1/tensorfusionworkload_types.go @@ -17,7 +17,6 @@ limitations under the License. 
package v1 import ( - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -49,28 +48,6 @@ const ( TensorFusionWorkloadPhaseFailed TensorFusionWorkloadPhase = "Failed" ) -type RecommendedResources struct { - LowerBoundTflops resource.Quantity - TargetTflops resource.Quantity - UpperBoundTflops resource.Quantity - LowerBoundVram resource.Quantity - TargetVram resource.Quantity - UpperBoundVram resource.Quantity -} - -func (r RecommendedResources) Merge(target *RecommendedResources) *RecommendedResources { - return target -} - -func (r *RecommendedResources) Equal(t *RecommendedResources) bool { - return r.LowerBoundTflops.Equal(t.LowerBoundTflops) && - r.TargetTflops.Equal(t.TargetTflops) && - r.UpperBoundTflops.Equal(t.UpperBoundTflops) && - r.LowerBoundVram.Equal(t.LowerBoundVram) && - r.TargetVram.Equal(t.TargetVram) && - r.UpperBoundVram.Equal(t.UpperBoundVram) -} - // TensorFusionWorkloadStatus defines the observed state of TensorFusionWorkload. type TensorFusionWorkloadStatus struct { // +kubebuilder:default=Pending diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 73c40d28..579a9c25 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -1843,27 +1843,6 @@ func (in *ReBalancerConfig) DeepCopy() *ReBalancerConfig { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *RecommendedResources) DeepCopyInto(out *RecommendedResources) { - *out = *in - out.LowerBoundTflops = in.LowerBoundTflops.DeepCopy() - out.TargetTflops = in.TargetTflops.DeepCopy() - out.UpperBoundTflops = in.UpperBoundTflops.DeepCopy() - out.LowerBoundVram = in.LowerBoundVram.DeepCopy() - out.TargetVram = in.TargetVram.DeepCopy() - out.UpperBoundVram = in.UpperBoundVram.DeepCopy() -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RecommendedResources. -func (in *RecommendedResources) DeepCopy() *RecommendedResources { - if in == nil { - return nil - } - out := new(RecommendedResources) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RemoteWriteConfig) DeepCopyInto(out *RemoteWriteConfig) { *out = *in diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 852d9357..2bfd349b 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -3,6 +3,7 @@ package autoscaler import ( "context" "errors" + "fmt" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -153,10 +154,10 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { log.Info("processing workloads") for _, workload := range s.workloads { - recommendations := map[string]*tfv1.RecommendedResources{} + recommendations := map[string]*tfv1.Resources{} for _, recommender := range s.recommenders { name := recommender.Name() - recommendation, err := recommender.Recommend(workload) + recommendation, err := recommender.Recommend(ctx, workload) if err != nil { log.Error(err, "failed to recommend resources", "recommender", name) continue @@ -166,23 +167,19 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { } recommendations[name] = recommendation - log.Info("recommendation", "recommender", name, "workload", workload.Name, "resources", recommendations[name]) + log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendations[name]) } if len(recommendations) == 0 { continue } - var finalRecommendation *tfv1.RecommendedResources - // for _, recommendation := range recommendations { - // finalRecommendation = recommendation - // } - // process cron recommendation - if recommendation, ok := recommendations[recommender.Cron]; ok { - finalRecommendation = recommendation - } + finalRecommendation := mergeRecommendations(recommendations) + log.Info("final recommendation", "workload", workload.Name, "resources", finalRecommendation) - s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation) + if err := s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation); err != nil { + log.Error(err, "failed to apply recommendation to workload %s", workload.Name) + } } } @@ -195,11 +192,26 @@ func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { return w } +func mergeRecommendations(recommendations map[string]*tfv1.Resources) *tfv1.Resources { + result := &tfv1.Resources{} + for _, rec := range recommendations { + if result.Requests.Tflops.Cmp(rec.Requests.Tflops) < 0 { + result.Requests.Tflops = rec.Requests.Tflops + result.Limits.Tflops = rec.Limits.Tflops + } + if result.Requests.Vram.Cmp(rec.Requests.Vram) < 0 { + result.Requests.Vram = rec.Requests.Vram + result.Limits.Vram = rec.Limits.Vram + } + } + return result +} + // Start after manager started func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) if err != nil { - return err + return fmt.Errorf("failed to create auto scaler: %v", err) } return mgr.Add(autoScaler) } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 07581531..72163593 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -26,6 +26,7 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" "github.com/aws/smithy-go/ptr" . "github.com/onsi/ginkgo/v2" . 
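For illustration only, and not part of the patch series: a minimal standalone sketch of the policy that mergeRecommendations above applies. For each dimension (TFLOPS and VRAM independently) the recommendation with the larger request wins, and its limit is carried along with it. The pair type and sample values below are hypothetical; only the k8s.io/apimachinery resource.Quantity API is assumed.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// pair is a hypothetical stand-in for one dimension of tfv1.Resources.
type pair struct {
	request resource.Quantity
	limit   resource.Quantity
}

// mergeDimension keeps the pair with the larger request and carries its limit,
// mirroring how mergeRecommendations treats TFLOPS and VRAM separately.
func mergeDimension(a, b pair) pair {
	if a.request.Cmp(b.request) < 0 {
		return b
	}
	return a
}

func main() {
	cron := pair{request: resource.MustParse("10"), limit: resource.MustParse("15")}
	percentile := pair{request: resource.MustParse("5"), limit: resource.MustParse("20")}
	merged := mergeDimension(cron, percentile)
	fmt.Println(merged.request.String(), merged.limit.String()) // 10 15
}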
"github.com/onsi/gomega" @@ -158,7 +159,7 @@ var _ = Describe("Autoscaler", func() { }) Context("when processing workloads", func() { - It("should update only those resources exceeding the recommended resource boundaries", func() { + It("should scale up when the recommended resources exceed the current allocation", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -170,18 +171,32 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.loadWorkloads(ctx) - scaler.recommenders[0] = &FakeUpScalingRecommender{} - scaler.processWorkloads(ctx) + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } - rr := scaler.workloads[workload.Name].Recommendation + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + scaler.processWorkloads(ctx) Eventually(func(g Gomega) { - assertWorkerAnnotations(getWorkers(workload)[0], rr) + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&rec)).To(BeTrue()) }).Should(Succeed()) - // Upon reprocessing the workload, it should skip resource updates since they are already within the recommended resource boundaries + // Upon reprocessing the workload, it should skip resource updates scaler.processWorkloads(ctx) Consistently(func(g Gomega) { - assertWorkerAnnotations(getWorkers(workload)[0], rr) + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&rec)).To(BeTrue()) }).Should(Succeed()) }) @@ -197,7 +212,20 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.loadWorkloads(ctx) - scaler.recommenders[0] = &FakeUpScalingRecommender{} + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } workloadState := scaler.workloads[workload.Name] oldRes := workloadState.Spec.Resources @@ -206,28 +234,67 @@ var _ = Describe("Autoscaler", func() { workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = false scaler.processWorkloads(ctx) Eventually(func(g Gomega) { - tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) - Expect(tflopsRequest.Equal(oldRes.Requests.Tflops)).To(BeTrue()) - Expect(tflopsLimit.Equal(oldRes.Limits.Tflops)).To(BeTrue()) - Expect(vramRequest.Equal(oldRes.Requests.Vram)).To(BeTrue()) - Expect(vramLimit.Equal(oldRes.Limits.Vram)).To(BeTrue()) + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&oldRes)).To(BeTrue()) }).Should(Succeed()) // verify IsTargetResource workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = true workloadState.Spec.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" scaler.processWorkloads(ctx) - rr := scaler.workloads[workload.Name].Recommendation + expect := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("16Gi"), + }, + } Eventually(func(g Gomega) { - 
tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(getWorkers(workload)[0]) - Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) - Expect(tflopsLimit.Value()).To(Equal(rr.TargetTflops.Value() * 2)) - Expect(vramRequest.Equal(oldRes.Requests.Vram)).To(BeTrue()) - Expect(vramLimit.Equal(oldRes.Limits.Vram)).To(BeTrue()) + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&expect)).To(BeTrue()) }).Should(Succeed()) }) - It("should update resources based on cron auto scaling config", func() { + It("should not update resources if recommended resources exceeded quota", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("9999"), + Vram: resource.MustParse("9999Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("9999"), + Vram: resource.MustParse("9999Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + workloadState := scaler.workloads[workload.Name] + oldRes := workloadState.Spec.Resources + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&oldRes)).To(BeTrue()) + }).Should(Succeed()) + }) + + FIt("should update resources based on cron scaling rule", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
Build() @@ -240,81 +307,97 @@ var _ = Describe("Autoscaler", func() { scaler.loadWorkloads(ctx) workloadState := scaler.workloads[workload.Name] - lastResources := workloadState.Spec.Resources - tflopsRequestInRule := resource.MustParse("20") - vramRequestInRule := resource.MustParse("16Gi") + resourcesInRule := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ { - Enable: true, - Name: "test", - Start: "0 0 * * *", - End: "59 23 * * *", - DesiredResources: tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: tflopsRequestInRule, - Vram: vramRequestInRule, - }, - }, + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: resourcesInRule, }, } scaler.processWorkloads(ctx) Eventually(func(g Gomega) { - tflopsRequest, _, vramRequest, _ := parseResourceAnnotations(getWorkers(workload)[0]) - Expect(tflopsRequest.Equal(tflopsRequestInRule)).To(BeTrue()) - Expect(vramRequest.Equal(vramRequestInRule)).To(BeTrue()) - tflopsRequest, _, vramRequest, _ = parseLastResourceAnnotations(getWorkers(workload)[0]) - Expect(tflopsRequest.Equal(lastResources.Requests.Tflops)).To(BeTrue()) - Expect(vramRequest.Equal(lastResources.Requests.Vram)).To(BeTrue()) + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&resourcesInRule)).To(BeTrue()) }).Should(Succeed()) // invalidate the rule by updating start and end fields workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ { - Enable: true, - Name: "test", - Start: "", - End: "", - DesiredResources: tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: tflopsRequestInRule, - Vram: vramRequestInRule, - }, - }, + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: resourcesInRule, }, } scaler.processWorkloads(ctx) + originalResources := workloadState.Spec.Resources Eventually(func(g Gomega) { - tflopsRequest, _, vramRequest, _ := parseResourceAnnotations(getWorkers(workload)[0]) - Expect(tflopsRequest.Equal(lastResources.Requests.Tflops)).To(BeTrue()) - Expect(vramRequest.Equal(lastResources.Requests.Vram)).To(BeTrue()) + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&originalResources)).To(BeTrue()) }).Should(Succeed()) - }) - - It("should merge multiple recommendations", func() { + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&originalResources)).To(BeTrue()) + }).Should(Succeed()) }) - It("should not update resource if resource is zero", func() { + It("should merge recomendations based on a larger request value", func() { + recommendations := map[string]*tfv1.Resources{ + "rec1": { + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("15Gi"), + }, + }, + "rec2": { + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("20Gi"), + }, + }, + } + final := mergeRecommendations(recommendations) + 
Expect(final.Equal(&tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("20Gi"), + }, + })).To(BeTrue()) }) - FIt("should return an error if recommended resources exceeded quota", func() { - tfEnv := NewTensorFusionEnvBuilder(). - AddPoolWithNodeCount(1).SetGpuCountPerNode(1). - Build() - defer tfEnv.Cleanup() - go mockSchedulerLoop(ctx, cfg) - workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) - defer deleteWorkload(workload) + It("should not update resource if resource is zero", func() { - scaler, _ := NewAutoscaler(k8sClient, allocator) - scaler.loadWorkloads(ctx) - scaler.recommenders[0] = &FakeQuotaExceededRecommender{} - scaler.processWorkloads(ctx) - err := scaler.workloadHandler.UpdateWorkerResourcesIfNeeded(ctx, scaler.workloads[workload.Name], getWorkers(workload)[0]) - Expect(err.Error()).To(ContainSubstring("failed to adjust allocation: scaling quota exceeded")) }) }) }) @@ -353,7 +436,7 @@ func createWorkload(pool *tfv1.GPUPool, id int, replicas int) *tfv1.TensorFusion AutoScalingConfig: tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{ Enable: true, - TargetResource: "", + TargetResource: "all", }, }, }, @@ -427,38 +510,16 @@ func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*metrics.WorkerUsage, error return sample, nil } -type FakeUpScalingRecommender struct{} - -func (f *FakeUpScalingRecommender) Name() string { - return "FakeUpScaling" -} - -func (f *FakeUpScalingRecommender) Recommend(workoad *workload.State) (*tfv1.RecommendedResources, error) { - return &tfv1.RecommendedResources{ - TargetTflops: resource.MustParse("110"), - LowerBoundTflops: resource.MustParse("100"), - UpperBoundTflops: resource.MustParse("120"), - TargetVram: resource.MustParse("110Gi"), - LowerBoundVram: resource.MustParse("100Gi"), - UpperBoundVram: resource.MustParse("120Gi"), - }, nil +type FakeRecommender struct { + *tfv1.Resources } -type FakeQuotaExceededRecommender struct{} - -func (f *FakeQuotaExceededRecommender) Name() string { - return "FakeQuotaExceeded" +func (f *FakeRecommender) Name() string { + return "Fake" } -func (f *FakeQuotaExceededRecommender) Recommend(workoad *workload.State) (*tfv1.RecommendedResources, error) { - return &tfv1.RecommendedResources{ - TargetTflops: resource.MustParse("9999"), - LowerBoundTflops: resource.MustParse("9999"), - UpperBoundTflops: resource.MustParse("9999"), - TargetVram: resource.MustParse("999Gi"), - LowerBoundVram: resource.MustParse("999Gi"), - UpperBoundVram: resource.MustParse("999Gi"), - }, nil +func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*tfv1.Resources, error) { + return f.Resources, nil } func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { @@ -511,49 +572,6 @@ func cleanupWorkload(key client.ObjectKey) { }).Should(Succeed()) } -func assertWorkerAnnotations(worker *corev1.Pod, rr tfv1.RecommendedResources) { - GinkgoHelper() - tflopsRequest, tflopsLimit, vramRequest, vramLimit := parseResourceAnnotations(worker) - Expect(tflopsRequest.Value()).To(Equal(rr.TargetTflops.Value())) - Expect(tflopsLimit.Value()).To(Equal(rr.TargetTflops.Value() * 2)) - Expect(vramRequest.Value()).To(Equal(rr.TargetVram.Value())) - Expect(vramLimit.Value()).To(Equal(rr.TargetVram.Value() * 2)) -} - -func parseResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, vramRequest, vramLimit resource.Quantity) { 
- annotations := worker.GetAnnotations() - keys := []struct { - key string - dst *resource.Quantity - }{ - {constants.TFLOPSRequestAnnotation, &tflopsRequest}, - {constants.TFLOPSLimitAnnotation, &tflopsLimit}, - {constants.VRAMRequestAnnotation, &vramRequest}, - {constants.VRAMLimitAnnotation, &vramLimit}, - } - for _, k := range keys { - *k.dst = resource.MustParse(annotations[k.key]) - } - return -} - -func parseLastResourceAnnotations(worker *corev1.Pod) (tflopsRequest, tflopsLimit, vramRequest, vramLimit resource.Quantity) { - annotations := worker.GetAnnotations() - keys := []struct { - key string - dst *resource.Quantity - }{ - {constants.LastTFLOPSRequestAnnotation, &tflopsRequest}, - {constants.LastTFLOPSLimitAnnotation, &tflopsLimit}, - {constants.LastVRAMRequestAnnotation, &vramRequest}, - {constants.LastVRAMLimitAnnotation, &vramLimit}, - } - for _, k := range keys { - *k.dst = resource.MustParse(annotations[k.key]) - } - return -} - func mockSchedulerLoop(ctx context.Context, cfg *rest.Config) { ticker := time.NewTicker(50 * time.Millisecond) clientset, err := kubernetes.NewForConfig(cfg) diff --git a/internal/autoscaler/metrics/metrics_aggregator.go b/internal/autoscaler/metrics/metrics_aggregator.go index 91ac5f88..5ffe51d9 100644 --- a/internal/autoscaler/metrics/metrics_aggregator.go +++ b/internal/autoscaler/metrics/metrics_aggregator.go @@ -35,6 +35,13 @@ func NewWorkerUsageAggregator() *WorkerUsageAggregator { } } +func (w *WorkerUsageAggregator) IsEmpty() bool { + if w.TflopsHistogram.IsEmpty() && w.VramHistogram.IsEmpty() { + return true + } + return false +} + func (w *WorkerUsageAggregator) AddTflopsSample(sample *WorkerUsage) bool { w.TflopsHistogram.AddSample(float64(sample.TflopsUsage), minSampleWeight, sample.Timestamp) if sample.Timestamp.After(w.LastSampleStart) { diff --git a/internal/autoscaler/recommendation.go b/internal/autoscaler/recommendation.go deleted file mode 100644 index 2a2993c4..00000000 --- a/internal/autoscaler/recommendation.go +++ /dev/null @@ -1,19 +0,0 @@ -package autoscaler - -import ( - tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" -) - -type RecommendationProcessor interface { - Process() -} - -type CronRecommendationProcessor struct{} - -func (c *CronRecommendationProcessor) Process() { - -} - -func MergeRecommendations() *tfv1.RecommendedResources { - return &tfv1.RecommendedResources{} -} diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index 6266cdfe..ecb085ab 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -1,13 +1,14 @@ package recommender import ( + "context" "fmt" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" "github.com/robfig/cron/v3" - "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/log" ) type CronRecommender struct { @@ -24,38 +25,33 @@ func (c *CronRecommender) Name() string { return "cron" } -func (c *CronRecommender) Recommend(w *workload.State) (*tfv1.RecommendedResources, error) { +func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tfv1.Resources, error) { + log := log.FromContext(ctx) activeRule, err := c.getActiveCronScalingRule(&w.Spec.AutoScalingConfig) if err != nil { return nil, fmt.Errorf("failed to get active cron scaling rule %w", err) } - var tflopsRequest, vramRequest resource.Quantity + var result tfv1.Resources if activeRule == nil 
{ // if no active rule, return last resources if annotations exists - resources, err := w.GetLastResourcesFromAnnotations() + resources, err := w.GetLastResourcesSpec() if err != nil { return nil, fmt.Errorf("failed to get last resources: %w", err) } + // TODO: need to find a way to determine if triggered by cron recommender // no annotations if resources == nil { return nil, nil } - tflopsRequest = resources.Requests.Tflops - vramRequest = resources.Requests.Vram + result = *resources + log.Info("restore last resources", "workload", w.Name, "resources", result) } else { - tflopsRequest = activeRule.DesiredResources.Requests.Tflops - vramRequest = activeRule.DesiredResources.Requests.Vram + result = activeRule.DesiredResources + log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", result) } - return &tfv1.RecommendedResources{ - LowerBoundTflops: tflopsRequest, - TargetTflops: tflopsRequest, - UpperBoundTflops: tflopsRequest, - LowerBoundVram: vramRequest, - TargetVram: vramRequest, - UpperBoundVram: vramRequest, - }, nil + return &result, nil } func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfig) (*tfv1.CronScalingRule, error) { @@ -64,7 +60,8 @@ func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfi currentTime := time.Now() for _, rule := range config.CronScalingRules { - if !rule.Enable || rule.Start == "" || rule.End == "" { + if !rule.Enable || rule.Name == "" || + rule.Start == "" || rule.End == "" { continue } @@ -74,11 +71,11 @@ func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfi startSchedule, err := c.parser.Parse(rule.Start) if err != nil { - return nil, fmt.Errorf("failed to parse start: %w", err) + return nil, fmt.Errorf("failed to parse cron rule %s start: %w", rule.Name, err) } endSchedule, err := c.parser.Parse(rule.End) if err != nil { - return nil, fmt.Errorf("failed to parse end: %w", err) + return nil, fmt.Errorf("failed to parse cron rule %s end: %w", rule.Name, err) } nextStartTime := startSchedule.Next(time.Now()) diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go index dffce1be..6c7d7446 100644 --- a/internal/autoscaler/recommender/cron_recommender_test.go +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -1,69 +1,56 @@ package recommender import ( + "context" + . "github.com/onsi/ginkgo/v2" . 
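As a usage sketch, assuming only the tfv1 field names visible in these patches (the rule name and quantities are made up): a CronScalingRule intended to be active during weekday business hours, with the cron recommender above restoring the last recorded resources once the window ends.

package main

import (
	"fmt"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	rule := tfv1.CronScalingRule{
		Enable: true,
		Name:   "business-hours",
		// Five-field cron expressions, as in the existing tests:
		// the window opens at 09:00 and closes at 18:00, Monday through Friday.
		Start: "0 9 * * 1-5",
		End:   "0 18 * * 1-5",
		DesiredResources: tfv1.Resources{
			Requests: tfv1.Resource{Tflops: resource.MustParse("20"), Vram: resource.MustParse("16Gi")},
			Limits:   tfv1.Resource{Tflops: resource.MustParse("40"), Vram: resource.MustParse("32Gi")},
		},
	}
	fmt.Printf("%+v\n", rule)
}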
"github.com/onsi/gomega" "k8s.io/apimachinery/pkg/api/resource" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" - "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" ) var _ = Describe("CronRecommender", func() { - It("should return recommended resource based on active cron scaling rule", func() { - tflopsRequest := resource.MustParse("10") - vramRequest := resource.MustParse("8Gi") - tflopsLimit := resource.MustParse("20") - vramLimit := resource.MustParse("16Gi") + ctx := context.TODO() + tflopsRequest := resource.MustParse("10") + vramRequest := resource.MustParse("8Gi") + tflopsLimit := resource.MustParse("20") + vramLimit := resource.MustParse("16Gi") + res := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequest, + Vram: vramRequest, + }, + Limits: tfv1.Resource{ + Tflops: tflopsLimit, + Vram: vramLimit, + }, + } + It("should return recommended resource based on active cron scaling rule", func() { workload := workload.NewWorkloadState("test") workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ { - Enable: true, - Name: "test", - Start: "0 0 * * *", - End: "59 23 * * *", - DesiredResources: tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: tflopsRequest, - Vram: vramRequest, - }, - Limits: tfv1.Resource{ - Tflops: tflopsLimit, - Vram: vramLimit, - }, - }, + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, }, }, } recommender := NewCronRecommender() - recommendation, _ := recommender.Recommend(workload) - Expect(*recommendation).To(Equal(tfv1.RecommendedResources{ - LowerBoundTflops: tflopsRequest, - TargetTflops: tflopsRequest, - UpperBoundTflops: tflopsRequest, - LowerBoundVram: vramRequest, - TargetVram: vramRequest, - UpperBoundVram: vramRequest, - })) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) }) - FIt("should return recommended resource based on last resource annotations", func() { - tflopsRequest := resource.MustParse("10") - vramRequest := resource.MustParse("8Gi") - // tflopsLimit := resource.MustParse("20") - // vramLimit := resource.MustParse("16Gi") - + It("should return recommended resource based on last resources spec", func() { workload := workload.NewWorkloadState("test") - workload.Annotations = map[string]string{ - constants.LastTFLOPSRequestAnnotation: tflopsRequest.String(), - constants.LastVRAMRequestAnnotation: vramRequest.String(), - // constants.LastTFLOPSLimitAnnotation: tflopsLimit.String(), - // constants.LastVRAMLimitAnnotation: vramLimit.String(), - } + workload.Annotations = utils.LastResourcesToAnnotations(&res) workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ { @@ -76,15 +63,8 @@ var _ = Describe("CronRecommender", func() { } recommender := NewCronRecommender() - recommendation, _ := recommender.Recommend(workload) - Expect(recommendation.Equal(&tfv1.RecommendedResources{ - LowerBoundTflops: tflopsRequest, - TargetTflops: tflopsRequest, - UpperBoundTflops: tflopsRequest, - LowerBoundVram: vramRequest, - TargetVram: vramRequest, - UpperBoundVram: vramRequest, - })).To(BeTrue()) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) }) It("should return error if getting multiple active rules", func() { @@ -106,7 +86,7 @@ var _ = Describe("CronRecommender", 
func() { }, } recommender := NewCronRecommender() - _, err := recommender.Recommend(workload) + _, err := recommender.Recommend(ctx, workload) Expect(err).To(HaveOccurred()) }) diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go index 35be2503..43df93b1 100644 --- a/internal/autoscaler/recommender/percentile_recommender.go +++ b/internal/autoscaler/recommender/percentile_recommender.go @@ -1,11 +1,16 @@ package recommender import ( + "context" + "fmt" + "math/big" "strconv" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/log" ) const ( @@ -38,6 +43,15 @@ var defaultPercentileConfig = PercentileConfig{ ConfidenceInterval: defaultConfidenceInterval, } +type RecommendedResources struct { + LowerBoundTflops resource.Quantity + TargetTflops resource.Quantity + UpperBoundTflops resource.Quantity + LowerBoundVram resource.Quantity + TargetVram resource.Quantity + UpperBoundVram resource.Quantity +} + type PercentileConfig struct { TargetTflopsPercentile float64 LowerBoundTflopsPercentile float64 @@ -66,22 +80,59 @@ func (p *PercentileRecommender) Name() string { return "percentile" } -func (p *PercentileRecommender) Recommend(workload *workload.State) (*tfv1.RecommendedResources, error) { +func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) { + log := log.FromContext(ctx) // TODO: cache config aggregator := workload.WorkerUsageAggregator - if aggregator.TflopsHistogram.IsEmpty() && aggregator.VramHistogram.IsEmpty() { + if aggregator.IsEmpty() { return nil, nil } + curRes, err := workload.GetCurrentResourcesSpec() + if err != nil { + return nil, fmt.Errorf("failed to get current resources from workload %s: %v", workload.Name, err) + } + p.createEstimatorsFromConfig(p.getPercentileConfig(&workload.Spec.AutoScalingConfig.AutoSetResources)) - return &tfv1.RecommendedResources{ + rr := &RecommendedResources{ LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(aggregator)), TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(aggregator)), UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(aggregator)), LowerBoundVram: QuantityFromAmount(p.lowerBoundVram.GetVramEstimation(aggregator)), TargetVram: QuantityFromAmount(p.targetVram.GetVramEstimation(aggregator)), UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(aggregator)), - }, nil + } + + log.Info("recommendation", "workload", workload.Name, "recommender", p.Name(), "resources", rr) + + var result tfv1.Resources + if curRes.Requests.Tflops.Cmp(rr.LowerBoundTflops) < 0 || + curRes.Requests.Tflops.Cmp(rr.UpperBoundTflops) > 0 { + result.Requests.Tflops = rr.TargetTflops + targetLimit := getProportionalLimit(&curRes.Limits.Tflops, &curRes.Requests.Tflops, &rr.TargetTflops) + if targetLimit == nil { + return nil, fmt.Errorf("failed to get tflops limit from workload %s", workload.Name) + } + result.Limits.Tflops = *targetLimit + } + + if curRes.Requests.Vram.Cmp(rr.LowerBoundVram) < 0 || + curRes.Requests.Vram.Cmp(rr.UpperBoundVram) > 0 { + result.Requests.Vram = rr.TargetVram + targetLimit := getProportionalLimit(&curRes.Limits.Vram, &curRes.Requests.Vram, &rr.TargetVram) + if targetLimit == nil { + return nil, fmt.Errorf("failed to get vram limit from workload %s", workload.Name) + } 
+ result.Limits.Vram = *targetLimit + } + + if result.Equal(curRes) { + return nil, nil + } + + // TODO: handle tflops or vram should recommend + + return &result, nil } func (p *PercentileRecommender) getPercentileConfig(asr *tfv1.AutoSetResources) *PercentileConfig { @@ -153,3 +204,23 @@ func (p *PercentileRecommender) createEstimatorsFromConfig(config *PercentileCon upperBoundVram: upperBoundVram, } } + +func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { + if originalLimit == nil || originalLimit.IsZero() || + originalRequest == nil || originalRequest.IsZero() || + recommendedRequest == nil || recommendedRequest.IsZero() { + return nil + } + + originalValue := big.NewInt(originalLimit.Value()) + scaleBaseValue := big.NewInt(originalRequest.Value()) + scaleResultValue := big.NewInt(recommendedRequest.Value()) + var scaledOriginal big.Int + scaledOriginal.Mul(originalValue, scaleResultValue) + scaledOriginal.Div(&scaledOriginal, scaleBaseValue) + if scaledOriginal.IsInt64() { + return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) + } + + return nil +} diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go index c500fa7c..43127746 100644 --- a/internal/autoscaler/recommender/recommender.go +++ b/internal/autoscaler/recommender/recommender.go @@ -1,6 +1,7 @@ package recommender import ( + "context" "fmt" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -15,7 +16,7 @@ const ( // Interface defines the contract for resource recommendation strategies used by the autoscaler. type Interface interface { Name() string - Recommend(workload *workload.State) (*tfv1.RecommendedResources, error) + Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) } func New(name string) (Interface, error) { diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index b16537b7..7ce9cdfb 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -3,21 +3,20 @@ package workload import ( "context" "fmt" - "math/big" + "maps" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) type Handler interface { UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) - ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.RecommendedResources) error - UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *State, worker *corev1.Pod) error + ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.Resources) error } type handler struct { @@ -47,7 +46,7 @@ func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workloadState.updateWorkers(workerList) } -func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.RecommendedResources) error { +func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.Resources) error { workload := &tfv1.TensorFusionWorkload{} if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, 
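A rough worked example of the arithmetic in getProportionalLimit above, which preserves the original limit-to-request ratio when a request is moved to the recommended target (the numbers are hypothetical):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// Current spec: request 10 TFLOPS with limit 20 TFLOPS, a 2x ratio.
	originalRequest := big.NewInt(10)
	originalLimit := big.NewInt(20)
	// Target request produced by the percentile estimator.
	recommendedRequest := big.NewInt(15)

	// newLimit = originalLimit * recommendedRequest / originalRequest, keeping the 2x ratio.
	newLimit := new(big.Int).Mul(originalLimit, recommendedRequest)
	newLimit.Div(newLimit, originalRequest)

	fmt.Println(newLimit) // 30
}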
workload); err != nil { return fmt.Errorf("failed to get workload: %v", err) @@ -58,40 +57,37 @@ func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, state *Stat if workload.Annotations == nil { workload.Annotations = map[string]string{} } - if tflopsRequest, ok := workload.Annotations[constants.TFLOPSRequestAnnotation]; ok { - workload.Annotations[constants.LastTFLOPSRequestAnnotation] = tflopsRequest - } else { - workload.Annotations[constants.LastTFLOPSRequestAnnotation] = workload.Spec.Resources.Requests.Tflops.String() + curRes, err := utils.CurrentResourcesFromAnnotations(workload.Annotations) + if err != nil { + return fmt.Errorf("failed to get current workload resources: %v", err) } - if vramRequest, ok := workload.Annotations[constants.VRAMRequestAnnotation]; ok { - workload.Annotations[constants.LastVRAMRequestAnnotation] = vramRequest + maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) + if curRes == nil { + maps.Copy(workload.Annotations, utils.LastResourcesToAnnotations(&workload.Spec.Resources)) } else { - workload.Annotations[constants.LastVRAMRequestAnnotation] = workload.Spec.Resources.Requests.Vram.String() + maps.Copy(workload.Annotations, utils.LastResourcesToAnnotations(curRes)) } - workload.Annotations[constants.TFLOPSRequestAnnotation] = recommendation.TargetTflops.String() - workload.Annotations[constants.VRAMRequestAnnotation] = recommendation.TargetVram.String() if err := h.Patch(ctx, workload, patch); err != nil { - return fmt.Errorf("failed to patch workload: %v", err) + return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err) } state.Annotations = workload.Annotations state.Recommendation = *recommendation - if err := h.ApplyRecommendationToWorkers(ctx, state); err != nil { + if err := h.applyRecommendationToWorkers(ctx, state); err != nil { return fmt.Errorf("failed to apply recommendation to workers: %v", err) } return nil } -func (h *handler) ApplyRecommendationToWorkers(ctx context.Context, workload *State) error { - log := log.FromContext(ctx) +func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *State) error { workerList := &corev1.PodList{} if err := h.List(ctx, workerList, client.InNamespace(workload.Namespace), client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { - log.Error(err, "failed to list workers") + return fmt.Errorf("failed to list workers: %v", err) } if !workload.IsAutoSetResourcesEnabled() { @@ -103,147 +99,63 @@ func (h *handler) ApplyRecommendationToWorkers(ctx context.Context, workload *St continue } - if err := h.UpdateWorkerResourcesIfNeeded(ctx, workload, &worker); err != nil { - log.Error(err, "failed to update worker") + if err := h.updateWorkerResources(ctx, workload, &worker); err != nil { + return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) } } return nil } -func (h *handler) UpdateWorkerResourcesIfNeeded(ctx context.Context, workload *State, worker *corev1.Pod) error { +func (h *handler) updateWorkerResources(ctx context.Context, workload *State, worker *corev1.Pod) error { log := log.FromContext(ctx) - adjustRequest, err := getCurrentWorkerResourceRequest(worker) - if err != nil { - return fmt.Errorf("failed to get current worker resource request, %v", err) - } - - recommendation := &workload.Recommendation - resourcesInfo := []struct { - name tfv1.ResourceName - requestKey string - limitKey string - lastRequestKey string - lastLimitKey string - request *resource.Quantity - limit 
*resource.Quantity - lowerBound resource.Quantity - upperBound resource.Quantity - target resource.Quantity - }{ - { - name: tfv1.ResourceTflops, - requestKey: constants.TFLOPSRequestAnnotation, - limitKey: constants.TFLOPSLimitAnnotation, - lastRequestKey: constants.LastTFLOPSRequestAnnotation, - lastLimitKey: constants.LastTFLOPSLimitAnnotation, - request: &adjustRequest.NewRequest.Tflops, - limit: &adjustRequest.NewLimit.Tflops, - lowerBound: recommendation.LowerBoundTflops, - upperBound: recommendation.UpperBoundTflops, - target: recommendation.TargetTflops, - }, - { - name: tfv1.ResourceVram, - requestKey: constants.VRAMRequestAnnotation, - limitKey: constants.VRAMLimitAnnotation, - lastRequestKey: constants.LastVRAMRequestAnnotation, - lastLimitKey: constants.LastVRAMLimitAnnotation, - request: &adjustRequest.NewRequest.Vram, - limit: &adjustRequest.NewLimit.Vram, - lowerBound: recommendation.LowerBoundVram, - upperBound: recommendation.UpperBoundVram, - target: recommendation.TargetVram, - }, - } - - newAnnotations := map[string]string{} - var upScaling, downScaling bool - for _, resInfo := range resourcesInfo { - if !workload.ShouldScaleResource(resInfo.name) { - continue - } - upScaling = resInfo.request.Cmp(resInfo.lowerBound) < 0 - downScaling = resInfo.request.Cmp(resInfo.upperBound) > 0 - if upScaling || downScaling { - targetRequest := resInfo.target - targetLimit := getProportionalLimit(resInfo.limit, resInfo.request, &targetRequest) - if targetLimit == nil { - return fmt.Errorf("failed to get limit for %s", resInfo.requestKey) - } - newAnnotations[resInfo.lastRequestKey] = resInfo.request.String() - newAnnotations[resInfo.lastLimitKey] = resInfo.limit.String() - newAnnotations[resInfo.requestKey] = targetRequest.String() - newAnnotations[resInfo.limitKey] = targetLimit.String() - *resInfo.request = targetRequest - *resInfo.limit = *targetLimit - } + rec := workload.Recommendation + annotationsToUpdate := utils.CurrentResourcesToAnnotations(&rec) + if !workload.ShouldScaleResource(tfv1.ResourceTflops) { + delete(annotationsToUpdate, constants.TFLOPSRequestAnnotation) + delete(annotationsToUpdate, constants.TFLOPSLimitAnnotation) } - - if len(newAnnotations) > 0 { - adjustRequest.IsScaleUp = upScaling - if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { - return fmt.Errorf("failed to adjust allocation: %v", err) - } - log.Info("adjust allocation successfully", "adjustRequest", adjustRequest) - // Patch the worker with updated annotations - patch := client.MergeFrom(worker.DeepCopy()) - for key, value := range newAnnotations { - worker.Annotations[key] = value - } - if err := h.Patch(ctx, worker, patch); err != nil { - return fmt.Errorf("failed to patch worker: %v", err) - } + if !workload.ShouldScaleResource(tfv1.ResourceVram) { + delete(annotationsToUpdate, constants.VRAMRequestAnnotation) + delete(annotationsToUpdate, constants.VRAMLimitAnnotation) } - return nil -} + if len(annotationsToUpdate) <= 0 { + return nil + } -func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { - if originalLimit == nil || originalLimit.IsZero() || - originalRequest == nil || originalRequest.IsZero() || - recommendedRequest == nil || recommendedRequest.IsZero() { + curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) + if err != nil { + return fmt.Errorf("failed to get current worker resources: %v", err) + } + if curRes.Equal(&rec) { return nil } - originalValue := 
big.NewInt(originalLimit.Value()) - scaleBaseValue := big.NewInt(originalRequest.Value()) - scaleResultValue := big.NewInt(recommendedRequest.Value()) - var scaledOriginal big.Int - scaledOriginal.Mul(originalValue, scaleResultValue) - scaledOriginal.Div(&scaledOriginal, scaleBaseValue) - if scaledOriginal.IsInt64() { - return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) + adjustRequest := &tfv1.AdjustRequest{ + PodUID: string(worker.UID), + IsScaleUp: rec.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0, // TODO: handle vram? + NewRequest: rec.Requests, + NewLimit: rec.Limits, } - return nil -} + if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { + return fmt.Errorf("failed to adjust allocation: %v", err) + } + log.Info("adjust allocation successfully", "adjustRequest", adjustRequest) -func getCurrentWorkerResourceRequest(worker *corev1.Pod) (*tfv1.AdjustRequest, error) { - adjustRequest := tfv1.AdjustRequest{ - PodUID: string(worker.UID), - IsScaleUp: false, - NewRequest: tfv1.Resource{}, - NewLimit: tfv1.Resource{}, - } - annotations := worker.GetAnnotations() - resInfo := []struct { - key string - dst *resource.Quantity - }{ - {constants.TFLOPSRequestAnnotation, &adjustRequest.NewRequest.Tflops}, - {constants.TFLOPSLimitAnnotation, &adjustRequest.NewLimit.Tflops}, - {constants.VRAMRequestAnnotation, &adjustRequest.NewRequest.Vram}, - {constants.VRAMLimitAnnotation, &adjustRequest.NewLimit.Vram}, - } - for _, info := range resInfo { - q, err := resource.ParseQuantity(annotations[info.key]) - if err != nil { - return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) - } - *info.dst = q + patch := client.MergeFrom(worker.DeepCopy()) + + for key, value := range annotationsToUpdate { + worker.Annotations[key] = value } - return &adjustRequest, nil + if err := h.Patch(ctx, worker, patch); err != nil { + return fmt.Errorf("failed to patch worker %s: %v", worker.Name, err) + } + + log.Info("apply recommendation successfully", "worker", worker.Name, "recommendation", rec, "currentResources", curRes) + + return nil } diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go index 5d30ca24..1006517a 100644 --- a/internal/autoscaler/workload/workload.go +++ b/internal/autoscaler/workload/workload.go @@ -1,14 +1,12 @@ package workload import ( - "fmt" "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" - "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" ) type State struct { @@ -16,7 +14,7 @@ type State struct { Name string Annotations map[string]string Spec tfv1.WorkloadProfileSpec - Recommendation tfv1.RecommendedResources + Recommendation tfv1.Resources Workers map[string]*WorkerState WorkerUsageAggregator *metrics.WorkerUsageAggregator } @@ -29,37 +27,12 @@ func NewWorkloadState(name string) *State { } } -func (w *State) GetLastResourcesFromAnnotations() (*tfv1.Resources, error) { - result := tfv1.Resources{} - resInfo := []struct { - key string - dst *resource.Quantity - }{ - {constants.LastTFLOPSRequestAnnotation, &result.Requests.Tflops}, - {constants.LastTFLOPSLimitAnnotation, &result.Limits.Tflops}, - {constants.LastVRAMRequestAnnotation, &result.Requests.Vram}, - {constants.LastVRAMLimitAnnotation, &result.Limits.Vram}, - } - annotations := w.Annotations - hasAnnotation := false - for _, info := 
range resInfo { - annotation, ok := annotations[info.key] - if !ok { - continue - } - q, err := resource.ParseQuantity(annotation) - if err != nil { - return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) - } - *info.dst = q - hasAnnotation = true - } - - if !hasAnnotation { - return nil, nil - } +func (w *State) GetLastResourcesSpec() (*tfv1.Resources, error) { + return utils.LastResourcesFromAnnotations(w.Annotations) +} - return &result, nil +func (w *State) GetCurrentResourcesSpec() (*tfv1.Resources, error) { + return utils.CurrentResourcesFromAnnotations(w.Annotations) } func (w *State) IsAutoSetResourcesEnabled() bool { @@ -68,7 +41,7 @@ func (w *State) IsAutoSetResourcesEnabled() bool { func (w *State) ShouldScaleResource(name tfv1.ResourceName) bool { target := w.Spec.AutoScalingConfig.AutoSetResources.TargetResource - return target == "" || strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) + return strings.EqualFold(target, "all") || strings.EqualFold(string(name), target) } func (w *State) updateWorkers(podList *corev1.PodList) { diff --git a/internal/autoscaler/workload/workload_test.go b/internal/autoscaler/workload/workload_test.go index 1de776c3..bd18e9f7 100644 --- a/internal/autoscaler/workload/workload_test.go +++ b/internal/autoscaler/workload/workload_test.go @@ -2,7 +2,7 @@ package workload import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" - "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "k8s.io/apimachinery/pkg/api/resource" @@ -12,8 +12,8 @@ var _ = Describe("Workload", func() { It("should correctly determine if a resource is the target based on config", func() { ws := NewWorkloadState("test") - Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue()) - Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue()) + Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeFalse()) + Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeFalse()) ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"}, @@ -48,28 +48,37 @@ var _ = Describe("Workload", func() { Expect(ws.IsAutoSetResourcesEnabled()).To(BeFalse()) }) - It("should return last resources request from the annotations", func() { + It("should return last resources spec from the annotations", func() { ws := NewWorkloadState("test") - tflopsRequest := resource.MustParse("10") - vramRequest := resource.MustParse("8Gi") - tflopsLimit := resource.MustParse("20") - vramLimit := resource.MustParse("16Gi") - ws.Annotations = map[string]string{ - constants.LastTFLOPSRequestAnnotation: tflopsRequest.String(), - constants.LastVRAMRequestAnnotation: vramRequest.String(), - constants.LastTFLOPSLimitAnnotation: tflopsLimit.String(), - constants.LastVRAMLimitAnnotation: vramLimit.String(), + expect := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, } - resources, _ := ws.GetLastResourcesFromAnnotations() - Expect(resources.Requests.Tflops.Equal(tflopsRequest)).To(BeTrue()) - Expect(resources.Requests.Vram.Equal(vramRequest)).To(BeTrue()) - Expect(resources.Limits.Tflops.Equal(tflopsLimit)).To(BeTrue()) - Expect(resources.Limits.Vram.Equal(vramLimit)).To(BeTrue()) + ws.Annotations = utils.LastResourcesToAnnotations(&expect) + got, _ 
:= ws.GetLastResourcesSpec() + Expect(got.Equal(&expect)) + }) - ws.Annotations = map[string]string{ - constants.LastVRAMLimitAnnotation: vramLimit.String(), + It("should return current resources spec from the annotations", func() { + ws := NewWorkloadState("test") + expect := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, } - _, err := ws.GetLastResourcesFromAnnotations() - Expect(err).To(HaveOccurred()) + ws.Annotations = utils.CurrentResourcesToAnnotations(&expect) + got, _ := ws.GetCurrentResourcesSpec() + Expect(got.Equal(&expect)) }) }) diff --git a/internal/utils/resource.go b/internal/utils/resource.go new file mode 100644 index 00000000..855d3ce3 --- /dev/null +++ b/internal/utils/resource.go @@ -0,0 +1,87 @@ +package utils + +import ( + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "k8s.io/apimachinery/pkg/api/resource" +) + +func CurrentResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.TFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.TFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.VRAMRequestAnnotation, &result.Requests.Vram}, + {constants.VRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func LastResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.LastTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.LastTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.LastVRAMRequestAnnotation, &result.Requests.Vram}, + {constants.LastVRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func CurrentResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + constants.TFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + constants.TFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + constants.VRAMRequestAnnotation: resources.Requests.Vram.String(), + constants.VRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} + +func LastResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + constants.LastTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + constants.LastTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + constants.LastVRAMRequestAnnotation: resources.Requests.Vram.String(), + constants.LastVRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} From a8a3a7cce8fe63d45854714b477172fe79a00a83 Mon Sep 17 00:00:00 2001 From: knave Date: Thu, 31 Jul 2025 
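To show how the new internal/utils helpers fit together, a sketch of a round trip that could live as an example test inside the module (internal packages cannot be imported from outside it; the example function name is hypothetical):

package utils

import (
	"fmt"

	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// ExampleCurrentResourcesToAnnotations would belong in a _test.go file next to resource.go.
func ExampleCurrentResourcesToAnnotations() {
	res := &tfv1.Resources{
		Requests: tfv1.Resource{Tflops: resource.MustParse("10"), Vram: resource.MustParse("8Gi")},
		Limits:   tfv1.Resource{Tflops: resource.MustParse("20"), Vram: resource.MustParse("16Gi")},	}
	// Encode the spec into the request/limit annotations, then decode it back.
	annotations := CurrentResourcesToAnnotations(res)
	decoded, err := CurrentResourcesFromAnnotations(annotations)
	fmt.Println(err == nil, decoded.Equal(res))
	// Output: true true
}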
00:03:56 +0800 Subject: [PATCH 21/27] feat: implement restoring resources upon cron scaling termination --- internal/autoscaler/autoscaler.go | 4 +-- internal/autoscaler/autoscaler_test.go | 10 -------- .../recommender/cron_recommender.go | 25 +++++++++++++------ .../recommender/recommender_test.go | 1 - internal/autoscaler/workload/handler.go | 22 ++++++++-------- internal/autoscaler/workload/workload.go | 3 ++- 6 files changed, 31 insertions(+), 34 deletions(-) delete mode 100644 internal/autoscaler/recommender/recommender_test.go diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 2bfd349b..8b0322b2 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -167,7 +167,7 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { } recommendations[name] = recommendation - log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendations[name]) + log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendation) } if len(recommendations) == 0 { @@ -178,7 +178,7 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { log.Info("final recommendation", "workload", workload.Name, "resources", finalRecommendation) if err := s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation); err != nil { - log.Error(err, "failed to apply recommendation to workload %s", workload.Name) + log.Error(err, "failed to apply recommendation", "workload", workload.Name, "recommendation", finalRecommendation) } } } diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 72163593..3f9ce0cd 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -41,16 +41,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) -// [x] tflops add all samples, like cpu in vpa -// [x] Reallocate resources before update annotation -// Add AutoSetResources, make it more configurable -// Log key events -// Add recommendation to workload status -// Write some documents -// cron scheduler stragegy, parallisam ? 
-// Refactor main, setup database may not put in leader election runnable group -// Resolve conversation on github, thanks for reviews - var _ = Describe("Autoscaler", func() { Context("when creating an autoscaler", func() { It("should return an error if there is no client", func() { diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index ecb085ab..a2c8c91b 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -7,10 +7,15 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/robfig/cron/v3" "sigs.k8s.io/controller-runtime/pkg/log" ) +const ( + CronScalingAnnotation = constants.Domain + "/cron-scaling" +) + type CronRecommender struct { parser cron.Parser } @@ -32,26 +37,30 @@ func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tf return nil, fmt.Errorf("failed to get active cron scaling rule %w", err) } - var result tfv1.Resources + var result *tfv1.Resources if activeRule == nil { - // if no active rule, return last resources if annotations exists + // if no active ruleName, return last resources if cron scaling not finish + ruleName, ok := w.Annotations[CronScalingAnnotation] + if !ok || ruleName == "" { + return nil, nil + } resources, err := w.GetLastResourcesSpec() if err != nil { return nil, fmt.Errorf("failed to get last resources: %w", err) } - // TODO: need to find a way to determine if triggered by cron recommender - // no annotations if resources == nil { return nil, nil } - result = *resources - log.Info("restore last resources", "workload", w.Name, "resources", result) + result = resources + w.ScalingAnnotations[CronScalingAnnotation] = "" + log.Info("cron scaling rule finished and restore last resources", "workload", w.Name, "rule", ruleName, "resources", result) } else { - result = activeRule.DesiredResources + result = &activeRule.DesiredResources + w.ScalingAnnotations[CronScalingAnnotation] = activeRule.Name log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", result) } - return &result, nil + return result, nil } func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfig) (*tfv1.CronScalingRule, error) { diff --git a/internal/autoscaler/recommender/recommender_test.go b/internal/autoscaler/recommender/recommender_test.go deleted file mode 100644 index e0645922..00000000 --- a/internal/autoscaler/recommender/recommender_test.go +++ /dev/null @@ -1 +0,0 @@ -package recommender diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index 7ce9cdfb..7d386500 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -61,28 +61,27 @@ func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, state *Stat if err != nil { return fmt.Errorf("failed to get current workload resources: %v", err) } - maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) if curRes == nil { - maps.Copy(workload.Annotations, utils.LastResourcesToAnnotations(&workload.Spec.Resources)) - } else { - maps.Copy(workload.Annotations, utils.LastResourcesToAnnotations(curRes)) + curRes = &workload.Spec.Resources } + maps.Copy(workload.Annotations, utils.LastResourcesToAnnotations(curRes)) + maps.Copy(workload.Annotations, 
utils.CurrentResourcesToAnnotations(recommendation)) + maps.Copy(workload.Annotations, state.ScalingAnnotations) if err := h.Patch(ctx, workload, patch); err != nil { return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err) } state.Annotations = workload.Annotations - state.Recommendation = *recommendation - if err := h.applyRecommendationToWorkers(ctx, state); err != nil { + if err := h.applyRecommendationToWorkers(ctx, state, recommendation); err != nil { return fmt.Errorf("failed to apply recommendation to workers: %v", err) } return nil } -func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *State) error { +func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { workerList := &corev1.PodList{} if err := h.List(ctx, workerList, client.InNamespace(workload.Namespace), @@ -99,7 +98,7 @@ func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *St continue } - if err := h.updateWorkerResources(ctx, workload, &worker); err != nil { + if err := h.updateWorkerResources(ctx, workload, &worker, recommendation); err != nil { return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) } } @@ -107,11 +106,10 @@ func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *St return nil } -func (h *handler) updateWorkerResources(ctx context.Context, workload *State, worker *corev1.Pod) error { +func (h *handler) updateWorkerResources(ctx context.Context, workload *State, worker *corev1.Pod, rec *tfv1.Resources) error { log := log.FromContext(ctx) - rec := workload.Recommendation - annotationsToUpdate := utils.CurrentResourcesToAnnotations(&rec) + annotationsToUpdate := utils.CurrentResourcesToAnnotations(rec) if !workload.ShouldScaleResource(tfv1.ResourceTflops) { delete(annotationsToUpdate, constants.TFLOPSRequestAnnotation) delete(annotationsToUpdate, constants.TFLOPSLimitAnnotation) @@ -129,7 +127,7 @@ func (h *handler) updateWorkerResources(ctx context.Context, workload *State, wo if err != nil { return fmt.Errorf("failed to get current worker resources: %v", err) } - if curRes.Equal(&rec) { + if curRes.Equal(rec) { return nil } diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go index 1006517a..fb697335 100644 --- a/internal/autoscaler/workload/workload.go +++ b/internal/autoscaler/workload/workload.go @@ -13,8 +13,8 @@ type State struct { Namespace string Name string Annotations map[string]string + ScalingAnnotations map[string]string Spec tfv1.WorkloadProfileSpec - Recommendation tfv1.Resources Workers map[string]*WorkerState WorkerUsageAggregator *metrics.WorkerUsageAggregator } @@ -23,6 +23,7 @@ func NewWorkloadState(name string) *State { return &State{ Name: name, Workers: make(map[string]*WorkerState), + ScalingAnnotations: make(map[string]string), WorkerUsageAggregator: metrics.NewWorkerUsageAggregator(), } } From 84a9a46108013dbf35c6007e88d0579bf6ad2727 Mon Sep 17 00:00:00 2001 From: knave Date: Thu, 31 Jul 2025 00:34:05 +0800 Subject: [PATCH 22/27] fix: properly handle the isScaleUp --- internal/autoscaler/autoscaler_test.go | 3 +- internal/autoscaler/workload/handler.go | 40 +++++++++++++------------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 3f9ce0cd..49299e63 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -284,7 
+284,7 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) }) - FIt("should update resources based on cron scaling rule", func() { + It("should update resources based on cron scaling rule", func() { tfEnv := NewTensorFusionEnvBuilder(). AddPoolWithNodeCount(1).SetGpuCountPerNode(1). Build() @@ -342,6 +342,7 @@ var _ = Describe("Autoscaler", func() { g.Expect(res.Equal(&originalResources)).To(BeTrue()) }).Should(Succeed()) + // should not change after cron scaling finish scaler.processWorkloads(ctx) Eventually(func(g Gomega) { res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index 7d386500..33dd6cda 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -106,10 +106,18 @@ func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *St return nil } -func (h *handler) updateWorkerResources(ctx context.Context, workload *State, worker *corev1.Pod, rec *tfv1.Resources) error { +func (h *handler) updateWorkerResources(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { log := log.FromContext(ctx) - annotationsToUpdate := utils.CurrentResourcesToAnnotations(rec) + curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) + if err != nil { + return fmt.Errorf("failed to get current worker resources: %v", err) + } + if curRes.Equal(recommendation) { + return nil + } + + annotationsToUpdate := utils.CurrentResourcesToAnnotations(recommendation) if !workload.ShouldScaleResource(tfv1.ResourceTflops) { delete(annotationsToUpdate, constants.TFLOPSRequestAnnotation) delete(annotationsToUpdate, constants.TFLOPSLimitAnnotation) @@ -123,37 +131,31 @@ func (h *handler) updateWorkerResources(ctx context.Context, workload *State, wo return nil } - curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) - if err != nil { - return fmt.Errorf("failed to get current worker resources: %v", err) - } - if curRes.Equal(rec) { - return nil + isScaleUp := false + if _, ok := annotationsToUpdate[constants.TFLOPSRequestAnnotation]; ok { + isScaleUp = recommendation.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0 + } else { + isScaleUp = recommendation.Requests.Vram.Cmp(curRes.Requests.Vram) > 0 } adjustRequest := &tfv1.AdjustRequest{ PodUID: string(worker.UID), - IsScaleUp: rec.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0, // TODO: handle vram? 
- NewRequest: rec.Requests, - NewLimit: rec.Limits, + IsScaleUp: isScaleUp, + NewRequest: recommendation.Requests, + NewLimit: recommendation.Limits, } - if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { return fmt.Errorf("failed to adjust allocation: %v", err) } - log.Info("adjust allocation successfully", "adjustRequest", adjustRequest) + log.Info("adjust allocation successfully", "worker", worker.Name, "adjustRequest", adjustRequest) patch := client.MergeFrom(worker.DeepCopy()) - - for key, value := range annotationsToUpdate { - worker.Annotations[key] = value - } - + maps.Copy(worker.Annotations, annotationsToUpdate) if err := h.Patch(ctx, worker, patch); err != nil { return fmt.Errorf("failed to patch worker %s: %v", worker.Name, err) } - log.Info("apply recommendation successfully", "worker", worker.Name, "recommendation", rec, "currentResources", curRes) + log.Info("apply recommendation successfully", "worker", worker.Name, "recommendation", recommendation, "currentResources", curRes) return nil } From 23b5c030754f3424912b5cea22c27ca162a99861 Mon Sep 17 00:00:00 2001 From: knave Date: Thu, 31 Jul 2025 19:43:15 +0800 Subject: [PATCH 23/27] refactor: each recommender is responsible for managing its own annotations --- internal/autoscaler/autoscaler.go | 8 +- .../recommender/cron_recommender.go | 81 ++++++++++--- .../recommender/cron_recommender_test.go | 112 ++++++++++++++---- .../recommender/percentile_recommender.go | 8 +- .../autoscaler/recommender/recommender.go | 12 -- internal/autoscaler/workload/handler.go | 74 ++++++------ internal/autoscaler/workload/workload.go | 18 ++- 7 files changed, 211 insertions(+), 102 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 8b0322b2..ac5e4662 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -42,7 +42,7 @@ func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Auto recommenders := []recommender.Interface{ recommender.NewPercentileRecommender(), - recommender.NewCronRecommender(), + recommender.NewCronRecommender(c), } return &Autoscaler{ @@ -165,16 +165,14 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { if recommendation == nil { continue } - recommendations[name] = recommendation log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendation) } - if len(recommendations) == 0 { + finalRecommendation := mergeRecommendations(recommendations) + if finalRecommendation.IsZero() { continue } - - finalRecommendation := mergeRecommendations(recommendations) log.Info("final recommendation", "workload", workload.Name, "resources", finalRecommendation) if err := s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation); err != nil { diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index a2c8c91b..694dc649 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -3,25 +3,34 @@ package recommender import ( "context" "fmt" + "maps" "time" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/robfig/cron/v3" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) +// Utilize these annotations to 
determine if the configuration has changed const ( - CronScalingAnnotation = constants.Domain + "/cron-scaling" + CronScalingTFLOPSRequestAnnotation = constants.Domain + "/cron-scaling-tflops-request" + CronScalingVRAMRequestAnnotation = constants.Domain + "/cron-scaling-vram-request" + CronScalingTFLOPSLimitAnnotation = constants.Domain + "/cron-scaling-tflops-limit" + CronScalingVRAMLimitAnnotation = constants.Domain + "/cron-scaling-vram-limit" ) type CronRecommender struct { + client.Client parser cron.Parser } -func NewCronRecommender() *CronRecommender { +func NewCronRecommender(c client.Client) *CronRecommender { return &CronRecommender{ + Client: c, parser: cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow), } } @@ -37,32 +46,72 @@ func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tf return nil, fmt.Errorf("failed to get active cron scaling rule %w", err) } + curRes, err := cronScalingResourcesFromAnnotations(w.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to get current resources from workload %s: %v", w.Name, err) + } + var result *tfv1.Resources if activeRule == nil { - // if no active ruleName, return last resources if cron scaling not finish - ruleName, ok := w.Annotations[CronScalingAnnotation] - if !ok || ruleName == "" { - return nil, nil - } - resources, err := w.GetLastResourcesSpec() - if err != nil { - return nil, fmt.Errorf("failed to get last resources: %w", err) - } - if resources == nil { + if curRes == nil { return nil, nil } - result = resources - w.ScalingAnnotations[CronScalingAnnotation] = "" - log.Info("cron scaling rule finished and restore last resources", "workload", w.Name, "rule", ruleName, "resources", result) + // revert the resources to those specified in the workload spec + result = w.GetResourcesSpec() + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(&tfv1.Resources{})) + log.Info("cron scaling finished", "workload", w.Name, "resources", result) } else { result = &activeRule.DesiredResources - w.ScalingAnnotations[CronScalingAnnotation] = activeRule.Name + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(result)) log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", result) } + if curRes != nil && result.Equal(curRes) { + return nil, nil + } + return result, nil } +func cronScalingResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + CronScalingTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + CronScalingTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + CronScalingVRAMRequestAnnotation: resources.Requests.Vram.String(), + CronScalingVRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} + +func cronScalingResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {CronScalingTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {CronScalingTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {CronScalingVRAMRequestAnnotation, &result.Requests.Vram}, + {CronScalingVRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return 
&result, nil +} + func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfig) (*tfv1.CronScalingRule, error) { activeRules := []*tfv1.CronScalingRule{} diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go index 6c7d7446..5825e309 100644 --- a/internal/autoscaler/recommender/cron_recommender_test.go +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -9,27 +9,22 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" - "github.com/NexusGPU/tensor-fusion/internal/utils" ) var _ = Describe("CronRecommender", func() { ctx := context.TODO() - tflopsRequest := resource.MustParse("10") - vramRequest := resource.MustParse("8Gi") - tflopsLimit := resource.MustParse("20") - vramLimit := resource.MustParse("16Gi") res := tfv1.Resources{ Requests: tfv1.Resource{ - Tflops: tflopsRequest, - Vram: vramRequest, + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), }, Limits: tfv1.Resource{ - Tflops: tflopsLimit, - Vram: vramLimit, + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), }, } - It("should return recommended resource based on active cron scaling rule", func() { + It("should return recommendation based on the active cron scaling rule", func() { workload := workload.NewWorkloadState("test") workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ @@ -43,28 +38,101 @@ var _ = Describe("CronRecommender", func() { }, } - recommender := NewCronRecommender() + recommender := NewCronRecommender(nil) recommendation, _ := recommender.Recommend(ctx, workload) Expect(recommendation.Equal(&res)).To(BeTrue()) }) - It("should return recommended resource based on last resources spec", func() { + It("should not return recommendation if there is no active cron scaling rule", func() { workload := workload.NewWorkloadState("test") - workload.Annotations = utils.LastResourcesToAnnotations(&res) workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ { - Enable: true, - Name: "test", - Start: "", - End: "", + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) + }) + + It("should not return recommendation if the active cron scaling rule remains unchanged", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, }, }, } - recommender := NewCronRecommender() + recommender := NewCronRecommender(nil) recommendation, _ := recommender.Recommend(ctx, workload) Expect(recommendation.Equal(&res)).To(BeTrue()) + + workload.Annotations = cronScalingResourcesToAnnotations(&res) + + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) + }) + + It("should revert the resources to those specified in the workload spec if the active cron scaling finished", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.Resources = tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("4Gi"), + }, + Limits: tfv1.Resource{ + Tflops: 
resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + } + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: res, + }, + }, + } + + workload.Annotations = cronScalingResourcesToAnnotations(&res) + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&workload.Spec.Resources)).To(BeTrue()) + + workload.Annotations = cronScalingResourcesToAnnotations(&tfv1.Resources{}) + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) }) It("should return error if getting multiple active rules", func() { @@ -85,7 +153,7 @@ var _ = Describe("CronRecommender", func() { }, }, } - recommender := NewCronRecommender() + recommender := NewCronRecommender(nil) _, err := recommender.Recommend(ctx, workload) Expect(err).To(HaveOccurred()) }) @@ -94,13 +162,13 @@ var _ = Describe("CronRecommender", func() { asc := tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{}, } - Expect(NewCronRecommender().getActiveCronScalingRule(&asc)).To(BeNil()) + Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) asc = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ {Enable: false}, }, } - Expect(NewCronRecommender().getActiveCronScalingRule(&asc)).To(BeNil()) + Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) }) It("should return the active cron scaling rule if the current time falls within its scheduled interval", func() { @@ -114,7 +182,7 @@ var _ = Describe("CronRecommender", func() { }, }, } - rule, _ := NewCronRecommender().getActiveCronScalingRule(&asc) + rule, _ := NewCronRecommender(nil).getActiveCronScalingRule(&asc) Expect(rule).NotTo(BeNil()) }) }) diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go index 43df93b1..b08113e5 100644 --- a/internal/autoscaler/recommender/percentile_recommender.go +++ b/internal/autoscaler/recommender/percentile_recommender.go @@ -82,7 +82,6 @@ func (p *PercentileRecommender) Name() string { func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) { log := log.FromContext(ctx) - // TODO: cache config aggregator := workload.WorkerUsageAggregator if aggregator.IsEmpty() { return nil, nil @@ -93,6 +92,7 @@ func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workloa return nil, fmt.Errorf("failed to get current resources from workload %s: %v", workload.Name, err) } + // TODO: cache config p.createEstimatorsFromConfig(p.getPercentileConfig(&workload.Spec.AutoScalingConfig.AutoSetResources)) rr := &RecommendedResources{ LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(aggregator)), @@ -105,7 +105,7 @@ func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workloa log.Info("recommendation", "workload", workload.Name, "recommender", p.Name(), "resources", rr) - var result tfv1.Resources + result := &tfv1.Resources{} if 
curRes.Requests.Tflops.Cmp(rr.LowerBoundTflops) < 0 || curRes.Requests.Tflops.Cmp(rr.UpperBoundTflops) > 0 { result.Requests.Tflops = rr.TargetTflops @@ -130,9 +130,7 @@ func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workloa return nil, nil } - // TODO: handle tflops or vram should recommend - - return &result, nil + return result, nil } func (p *PercentileRecommender) getPercentileConfig(asr *tfv1.AutoSetResources) *PercentileConfig { diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go index 43127746..3248ad6c 100644 --- a/internal/autoscaler/recommender/recommender.go +++ b/internal/autoscaler/recommender/recommender.go @@ -2,7 +2,6 @@ package recommender import ( "context" - "fmt" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" @@ -18,14 +17,3 @@ type Interface interface { Name() string Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) } - -func New(name string) (Interface, error) { - switch name { - case Percentile: - return NewPercentileRecommender(), nil - case Cron: - return NewCronRecommender(), nil - default: - return nil, fmt.Errorf("unknown recommender name: %s", name) - } -} diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index 33dd6cda..bda6768d 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -16,7 +16,7 @@ import ( type Handler interface { UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) - ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.Resources) error + ApplyRecommendationToWorkload(ctx context.Context, workloadState *State, recommendation *tfv1.Resources) error } type handler struct { @@ -46,42 +46,15 @@ func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workloadState.updateWorkers(workerList) } -func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, state *State, recommendation *tfv1.Resources) error { - workload := &tfv1.TensorFusionWorkload{} - if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, workload); err != nil { - return fmt.Errorf("failed to get workload: %v", err) - } - - // record current and last resources by annotations - patch := client.MergeFrom(workload.DeepCopy()) - if workload.Annotations == nil { - workload.Annotations = map[string]string{} - } - curRes, err := utils.CurrentResourcesFromAnnotations(workload.Annotations) - if err != nil { - return fmt.Errorf("failed to get current workload resources: %v", err) - } - if curRes == nil { - curRes = &workload.Spec.Resources - } - maps.Copy(workload.Annotations, utils.LastResourcesToAnnotations(curRes)) - maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) - maps.Copy(workload.Annotations, state.ScalingAnnotations) - - if err := h.Patch(ctx, workload, patch); err != nil { - return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err) +func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { + if err := h.updateAutoScalingAnnotations(ctx, workload, recommendation); err != nil { + return fmt.Errorf("failed to update auto scaling annotations: %v", err) } - state.Annotations = workload.Annotations - - if err := h.applyRecommendationToWorkers(ctx, state, 
recommendation); err != nil { - return fmt.Errorf("failed to apply recommendation to workers: %v", err) + if !workload.IsAutoSetResourcesEnabled() { + return nil } - return nil -} - -func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { workerList := &corev1.PodList{} if err := h.List(ctx, workerList, client.InNamespace(workload.Namespace), @@ -89,16 +62,11 @@ func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *St return fmt.Errorf("failed to list workers: %v", err) } - if !workload.IsAutoSetResourcesEnabled() { - return nil - } - for _, worker := range workerList.Items { if !worker.DeletionTimestamp.IsZero() { continue } - - if err := h.updateWorkerResources(ctx, workload, &worker, recommendation); err != nil { + if err := h.applyRecommendationToWorker(ctx, workload, &worker, recommendation); err != nil { return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) } } @@ -106,14 +74,38 @@ func (h *handler) applyRecommendationToWorkers(ctx context.Context, workload *St return nil } -func (h *handler) updateWorkerResources(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { +func (h *handler) updateAutoScalingAnnotations( + ctx context.Context, + state *State, + recommendation *tfv1.Resources) error { + workload := &tfv1.TensorFusionWorkload{} + if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, workload); err != nil { + return fmt.Errorf("failed to get workload: %v", err) + } + + // record current and last resources + if workload.Annotations == nil { + workload.Annotations = map[string]string{} + } + patch := client.MergeFrom(workload.DeepCopy()) + maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) + maps.Copy(workload.Annotations, state.ScalingAnnotations) + if err := h.Patch(ctx, workload, patch); err != nil { + return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err) + } + + state.Annotations = workload.Annotations + return nil +} + +func (h *handler) applyRecommendationToWorker(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { log := log.FromContext(ctx) curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) if err != nil { return fmt.Errorf("failed to get current worker resources: %v", err) } - if curRes.Equal(recommendation) { + if curRes != nil && curRes.Equal(recommendation) { return nil } diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go index fb697335..4e2063f9 100644 --- a/internal/autoscaler/workload/workload.go +++ b/internal/autoscaler/workload/workload.go @@ -1,6 +1,7 @@ package workload import ( + "fmt" "strings" tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" @@ -32,8 +33,23 @@ func (w *State) GetLastResourcesSpec() (*tfv1.Resources, error) { return utils.LastResourcesFromAnnotations(w.Annotations) } +func (w *State) GetResourcesSpec() *tfv1.Resources { + return &w.Spec.Resources +} + func (w *State) GetCurrentResourcesSpec() (*tfv1.Resources, error) { - return utils.CurrentResourcesFromAnnotations(w.Annotations) + resources, err := utils.CurrentResourcesFromAnnotations(w.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to get resources from annotations: %v", err) + } + if resources == nil { + return &w.Spec.Resources, nil + } + return resources, nil +} + +func (w *State) SetScalingAnnotation(key string, 
value string) { + w.ScalingAnnotations[key] = value } func (w *State) IsAutoSetResourcesEnabled() bool { From 31a5a36967a0b998fb30d7436ce4becd3a5fb53a Mon Sep 17 00:00:00 2001 From: knave Date: Thu, 31 Jul 2025 23:50:26 +0800 Subject: [PATCH 24/27] refactor: remove unused functions and params --- internal/autoscaler/autoscaler.go | 2 +- .../recommender/cron_recommender.go | 5 +- .../recommender/cron_recommender_test.go | 63 +++++++++++++------ internal/autoscaler/workload/handler.go | 1 - internal/autoscaler/workload/workload.go | 4 -- internal/autoscaler/workload/workload_test.go | 17 ----- internal/constants/constants.go | 4 -- internal/utils/resource.go | 39 ------------ 8 files changed, 46 insertions(+), 89 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index ac5e4662..3e089f2f 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -42,7 +42,7 @@ func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Auto recommenders := []recommender.Interface{ recommender.NewPercentileRecommender(), - recommender.NewCronRecommender(c), + recommender.NewCronRecommender(), } return &Autoscaler{ diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index 694dc649..9e3cf6e7 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -11,7 +11,6 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/robfig/cron/v3" "k8s.io/apimachinery/pkg/api/resource" - "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -24,13 +23,11 @@ const ( ) type CronRecommender struct { - client.Client parser cron.Parser } -func NewCronRecommender(c client.Client) *CronRecommender { +func NewCronRecommender() *CronRecommender { return &CronRecommender{ - Client: c, parser: cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow), } } diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go index 5825e309..82051aeb 100644 --- a/internal/autoscaler/recommender/cron_recommender_test.go +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -13,7 +13,7 @@ import ( var _ = Describe("CronRecommender", func() { ctx := context.TODO() - res := tfv1.Resources{ + defaultRes := tfv1.Resources{ Requests: tfv1.Resource{ Tflops: resource.MustParse("10"), Vram: resource.MustParse("8Gi"), @@ -33,14 +33,39 @@ var _ = Describe("CronRecommender", func() { Name: "test", Start: "0 0 * * *", End: "59 23 * * *", - DesiredResources: res, + DesiredResources: defaultRes, }, }, } - recommender := NewCronRecommender(nil) + recommender := NewCronRecommender() recommendation, _ := recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&res)).To(BeTrue()) + Expect(recommendation.Equal(&defaultRes)).To(BeTrue()) + newRes := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("4Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + } + + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: newRes, + }, + }, + } + + recommendation, _ = recommender.Recommend(ctx, workload) + 
Expect(recommendation.Equal(&newRes)).To(BeTrue()) }) It("should not return recommendation if there is no active cron scaling rule", func() { @@ -52,12 +77,12 @@ var _ = Describe("CronRecommender", func() { Name: "test", Start: "", End: "", - DesiredResources: res, + DesiredResources: defaultRes, }, }, } - recommender := NewCronRecommender(nil) + recommender := NewCronRecommender() recommendation, _ := recommender.Recommend(ctx, workload) Expect(recommendation).To(BeNil()) }) @@ -71,16 +96,16 @@ var _ = Describe("CronRecommender", func() { Name: "test", Start: "0 0 * * *", End: "59 23 * * *", - DesiredResources: res, + DesiredResources: defaultRes, }, }, } - recommender := NewCronRecommender(nil) + recommender := NewCronRecommender() recommendation, _ := recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&res)).To(BeTrue()) + Expect(recommendation.Equal(&defaultRes)).To(BeTrue()) - workload.Annotations = cronScalingResourcesToAnnotations(&res) + workload.Annotations = cronScalingResourcesToAnnotations(&defaultRes) recommendation, _ = recommender.Recommend(ctx, workload) Expect(recommendation).To(BeNil()) @@ -105,14 +130,14 @@ var _ = Describe("CronRecommender", func() { Name: "test", Start: "0 0 * * *", End: "59 23 * * *", - DesiredResources: res, + DesiredResources: defaultRes, }, }, } - recommender := NewCronRecommender(nil) + recommender := NewCronRecommender() recommendation, _ := recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&res)).To(BeTrue()) + Expect(recommendation.Equal(&defaultRes)).To(BeTrue()) workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ @@ -121,12 +146,12 @@ var _ = Describe("CronRecommender", func() { Name: "test", Start: "", End: "", - DesiredResources: res, + DesiredResources: defaultRes, }, }, } - workload.Annotations = cronScalingResourcesToAnnotations(&res) + workload.Annotations = cronScalingResourcesToAnnotations(&defaultRes) recommendation, _ = recommender.Recommend(ctx, workload) Expect(recommendation.Equal(&workload.Spec.Resources)).To(BeTrue()) @@ -153,7 +178,7 @@ var _ = Describe("CronRecommender", func() { }, }, } - recommender := NewCronRecommender(nil) + recommender := NewCronRecommender() _, err := recommender.Recommend(ctx, workload) Expect(err).To(HaveOccurred()) }) @@ -162,13 +187,13 @@ var _ = Describe("CronRecommender", func() { asc := tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{}, } - Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) + Expect(NewCronRecommender().getActiveCronScalingRule(&asc)).To(BeNil()) asc = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ {Enable: false}, }, } - Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) + Expect(NewCronRecommender().getActiveCronScalingRule(&asc)).To(BeNil()) }) It("should return the active cron scaling rule if the current time falls within its scheduled interval", func() { @@ -182,7 +207,7 @@ var _ = Describe("CronRecommender", func() { }, }, } - rule, _ := NewCronRecommender(nil).getActiveCronScalingRule(&asc) + rule, _ := NewCronRecommender().getActiveCronScalingRule(&asc) Expect(rule).NotTo(BeNil()) }) }) diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index bda6768d..03415465 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -83,7 +83,6 @@ func (h *handler) updateAutoScalingAnnotations( return fmt.Errorf("failed to get workload: 
%v", err) } - // record current and last resources if workload.Annotations == nil { workload.Annotations = map[string]string{} } diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go index 4e2063f9..38466106 100644 --- a/internal/autoscaler/workload/workload.go +++ b/internal/autoscaler/workload/workload.go @@ -29,10 +29,6 @@ func NewWorkloadState(name string) *State { } } -func (w *State) GetLastResourcesSpec() (*tfv1.Resources, error) { - return utils.LastResourcesFromAnnotations(w.Annotations) -} - func (w *State) GetResourcesSpec() *tfv1.Resources { return &w.Spec.Resources } diff --git a/internal/autoscaler/workload/workload_test.go b/internal/autoscaler/workload/workload_test.go index bd18e9f7..f1768014 100644 --- a/internal/autoscaler/workload/workload_test.go +++ b/internal/autoscaler/workload/workload_test.go @@ -48,23 +48,6 @@ var _ = Describe("Workload", func() { Expect(ws.IsAutoSetResourcesEnabled()).To(BeFalse()) }) - It("should return last resources spec from the annotations", func() { - ws := NewWorkloadState("test") - expect := tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: resource.MustParse("10"), - Vram: resource.MustParse("8Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("20"), - Vram: resource.MustParse("16Gi"), - }, - } - ws.Annotations = utils.LastResourcesToAnnotations(&expect) - got, _ := ws.GetLastResourcesSpec() - Expect(got.Equal(&expect)) - }) - It("should return current resources spec from the annotations", func() { ws := NewWorkloadState("test") expect := tfv1.Resources{ diff --git a/internal/constants/constants.go b/internal/constants/constants.go index 2183f852..3e0128e5 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -55,10 +55,6 @@ const ( VRAMRequestAnnotation = Domain + "/vram-request" TFLOPSLimitAnnotation = Domain + "/tflops-limit" VRAMLimitAnnotation = Domain + "/vram-limit" - LastTFLOPSRequestAnnotation = Domain + "/last-tflops-request" - LastVRAMRequestAnnotation = Domain + "/last-vram-request" - LastTFLOPSLimitAnnotation = Domain + "/last-tflops-limit" - LastVRAMLimitAnnotation = Domain + "/last-vram-limit" WorkloadProfileAnnotation = Domain + "/client-profile" InjectContainerAnnotation = Domain + "/inject-container" IsLocalGPUAnnotation = Domain + "/is-local-gpu" diff --git a/internal/utils/resource.go b/internal/utils/resource.go index 855d3ce3..444da3c3 100644 --- a/internal/utils/resource.go +++ b/internal/utils/resource.go @@ -38,36 +38,6 @@ func CurrentResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resou return &result, nil } -func LastResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { - result := tfv1.Resources{} - resInfo := []struct { - key string - dst *resource.Quantity - }{ - {constants.LastTFLOPSRequestAnnotation, &result.Requests.Tflops}, - {constants.LastTFLOPSLimitAnnotation, &result.Limits.Tflops}, - {constants.LastVRAMRequestAnnotation, &result.Requests.Vram}, - {constants.LastVRAMLimitAnnotation, &result.Limits.Vram}, - } - for _, info := range resInfo { - annotation, ok := annotations[info.key] - if !ok { - continue - } - q, err := resource.ParseQuantity(annotation) - if err != nil { - return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) - } - *info.dst = q - } - - if result.IsZero() { - return nil, nil - } - - return &result, nil -} - func CurrentResourcesToAnnotations(resources *tfv1.Resources) map[string]string { return map[string]string{ 
constants.TFLOPSRequestAnnotation: resources.Requests.Tflops.String(), @@ -76,12 +46,3 @@ func CurrentResourcesToAnnotations(resources *tfv1.Resources) map[string]string constants.VRAMLimitAnnotation: resources.Limits.Vram.String(), } } - -func LastResourcesToAnnotations(resources *tfv1.Resources) map[string]string { - return map[string]string{ - constants.LastTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), - constants.LastTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), - constants.LastVRAMRequestAnnotation: resources.Requests.Vram.String(), - constants.LastVRAMLimitAnnotation: resources.Limits.Vram.String(), - } -} From 817a2831cd7c4b9b783ceced2e191277dc32ee10 Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 8 Aug 2025 12:08:57 +0800 Subject: [PATCH 25/27] feat: implement scale-down lock --- internal/autoscaler/autoscaler.go | 58 +++---- internal/autoscaler/autoscaler_test.go | 83 ++++++---- .../recommender/cron_recommender.go | 24 +-- .../recommender/cron_recommender_test.go | 26 ++-- .../recommender/percentile_recommender.go | 22 +-- .../autoscaler/recommender/recommender.go | 31 +++- .../recommender/recommender_test.go | 145 ++++++++++++++++++ internal/autoscaler/workload/handler.go | 8 +- 8 files changed, 287 insertions(+), 110 deletions(-) create mode 100644 internal/autoscaler/recommender/recommender_test.go diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 3e089f2f..333a445e 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -154,31 +154,36 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { log.Info("processing workloads") for _, workload := range s.workloads { - recommendations := map[string]*tfv1.Resources{} - for _, recommender := range s.recommenders { - name := recommender.Name() - recommendation, err := recommender.Recommend(ctx, workload) - if err != nil { - log.Error(err, "failed to recommend resources", "recommender", name) - continue - } - if recommendation == nil { - continue - } - recommendations[name] = recommendation - log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendation) + targetRes := s.calcWorkloadTargetResources(ctx, workload) + if targetRes.IsZero() { + continue } - finalRecommendation := mergeRecommendations(recommendations) - if finalRecommendation.IsZero() { - continue + log.Info("recommended target resources", "workload", workload.Name, "resources", targetRes) + + if err := s.workloadHandler.ApplyResourcesToWorkload(ctx, workload, targetRes); err != nil { + log.Error(err, "failed to apply resources", "workload", workload.Name, "resources", targetRes) } - log.Info("final recommendation", "workload", workload.Name, "resources", finalRecommendation) + } +} - if err := s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation); err != nil { - log.Error(err, "failed to apply recommendation", "workload", workload.Name, "recommendation", finalRecommendation) +func (s *Autoscaler) calcWorkloadTargetResources(ctx context.Context, workload *workload.State) *tfv1.Resources { + log := log.FromContext(ctx) + recommendations := map[string]*recommender.Recommendation{} + for _, recommender := range s.recommenders { + name := recommender.Name() + rec, err := recommender.Recommend(ctx, workload) + if err != nil { + log.Error(err, "failed to get recommendation", "recommender", name) + continue + } + if rec != nil { + recommendations[name] = rec + log.Info("recommendation", "workload", 
workload.Name, "recommender", name, "recommendation", rec) } } + + return recommender.MergeRecommendations(recommendations) } func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { @@ -190,21 +195,6 @@ func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { return w } -func mergeRecommendations(recommendations map[string]*tfv1.Resources) *tfv1.Resources { - result := &tfv1.Resources{} - for _, rec := range recommendations { - if result.Requests.Tflops.Cmp(rec.Requests.Tflops) < 0 { - result.Requests.Tflops = rec.Requests.Tflops - result.Limits.Tflops = rec.Limits.Tflops - } - if result.Requests.Vram.Cmp(rec.Requests.Vram) < 0 { - result.Requests.Vram = rec.Requests.Vram - result.Limits.Vram = rec.Limits.Vram - } - } - return result -} - // Start after manager started func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 49299e63..9c0c127f 100644 --- a/internal/autoscaler/autoscaler_test.go +++ b/internal/autoscaler/autoscaler_test.go @@ -24,6 +24,7 @@ import ( tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" "github.com/NexusGPU/tensor-fusion/internal/constants" "github.com/NexusGPU/tensor-fusion/internal/utils" @@ -350,45 +351,63 @@ var _ = Describe("Autoscaler", func() { }).Should(Succeed()) }) - It("should merge recomendations based on a larger request value", func() { - recommendations := map[string]*tfv1.Resources{ - "rec1": { - Requests: tfv1.Resource{ - Tflops: resource.MustParse("10"), - Vram: resource.MustParse("10Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("15"), - Vram: resource.MustParse("15Gi"), - }, + It("should not scale down when merging recommendations during active cron scaling progress", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + workloadState := scaler.workloads[workload.Name] + resourcesInRule := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), }, - "rec2": { - Requests: tfv1.Resource{ - Tflops: resource.MustParse("5"), - Vram: resource.MustParse("15Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("20"), - Vram: resource.MustParse("20Gi"), - }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), }, } + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: resourcesInRule, + }, + } + + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&resourcesInRule)).To(BeTrue()) + }).Should(Succeed()) - final := mergeRecommendations(recommendations) - Expect(final.Equal(&tfv1.Resources{ + fakeRec := tfv1.Resources{ Requests: tfv1.Resource{ - Tflops: resource.MustParse("10"), - Vram: resource.MustParse("15Gi"), + Tflops: resource.MustParse("1"), + Vram: resource.MustParse("1Gi"), }, Limits: tfv1.Resource{ - Tflops: resource.MustParse("15"), - Vram: resource.MustParse("20Gi"), + Tflops: resource.MustParse("1"), + Vram: resource.MustParse("1Gi"), }, - })).To(BeTrue()) - }) + } - It("should not update resource if resource is zero", func() { + scaler.recommenders = append(scaler.recommenders, &FakeRecommender{Resources: &fakeRec}) + scaler.processWorkloads(ctx) + Consistently(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&resourcesInRule)).To(BeTrue()) + }).Should(Succeed()) }) }) }) @@ -509,8 +528,10 @@ func (f *FakeRecommender) Name() string { return "Fake" } -func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*tfv1.Resources, error) { - return f.Resources, nil +func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*recommender.Recommendation, error) { + return &recommender.Recommendation{ + Resources: *f.Resources, + }, nil } func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index 9e3cf6e7..3e9a6eb6 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -36,7 +36,7 @@ func (c *CronRecommender) Name() string { return "cron" } -func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tfv1.Resources, error) { +func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*Recommendation, error) { log := log.FromContext(ctx) activeRule, err := c.getActiveCronScalingRule(&w.Spec.AutoScalingConfig) if err != nil { @@ -48,26 +48,26 @@ func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tf return nil, fmt.Errorf("failed to get current resources from workload %s: %v", w.Name, err) } - var result *tfv1.Resources + var targetRes *tfv1.Resources if activeRule == nil { if curRes == nil { return nil, nil } // revert the 
resources to those specified in the workload spec - result = w.GetResourcesSpec() + targetRes = w.GetResourcesSpec() maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(&tfv1.Resources{})) - log.Info("cron scaling finished", "workload", w.Name, "resources", result) + log.Info("cron scaling finished", "workload", w.Name, "resources", targetRes) } else { - result = &activeRule.DesiredResources - maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(result)) - log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", result) + targetRes = &activeRule.DesiredResources + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(targetRes)) + log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", targetRes) } - if curRes != nil && result.Equal(curRes) { - return nil, nil - } - - return result, nil + return &Recommendation{ + Resources: *targetRes, + Applied: curRes != nil && targetRes.Equal(curRes), + ScaleDownLocking: true, + }, nil } func cronScalingResourcesToAnnotations(resources *tfv1.Resources) map[string]string { diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go index 82051aeb..d64cbb6f 100644 --- a/internal/autoscaler/recommender/cron_recommender_test.go +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -39,8 +39,8 @@ var _ = Describe("CronRecommender", func() { } recommender := NewCronRecommender() - recommendation, _ := recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&defaultRes)).To(BeTrue()) + rec, _ := recommender.Recommend(ctx, workload) + Expect(rec.Resources.Equal(&defaultRes)).To(BeTrue()) newRes := tfv1.Resources{ Requests: tfv1.Resource{ Tflops: resource.MustParse("5"), @@ -64,8 +64,8 @@ var _ = Describe("CronRecommender", func() { }, } - recommendation, _ = recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&newRes)).To(BeTrue()) + rec, _ = recommender.Recommend(ctx, workload) + Expect(rec.Resources.Equal(&newRes)).To(BeTrue()) }) It("should not return recommendation if there is no active cron scaling rule", func() { @@ -103,12 +103,14 @@ var _ = Describe("CronRecommender", func() { recommender := NewCronRecommender() recommendation, _ := recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&defaultRes)).To(BeTrue()) + Expect(recommendation.Resources.Equal(&defaultRes)).To(BeTrue()) workload.Annotations = cronScalingResourcesToAnnotations(&defaultRes) recommendation, _ = recommender.Recommend(ctx, workload) - Expect(recommendation).To(BeNil()) + Expect(recommendation).ToNot(BeNil()) + Expect(recommendation.ScaleDownLocking).To(BeTrue()) + Expect(recommendation.Resources.Equal(&defaultRes)).To(BeTrue()) }) It("should revert the resources to those specified in the workload spec if the active cron scaling finished", func() { @@ -136,8 +138,8 @@ var _ = Describe("CronRecommender", func() { } recommender := NewCronRecommender() - recommendation, _ := recommender.Recommend(ctx, workload) - Expect(recommendation.Equal(&defaultRes)).To(BeTrue()) + rec, _ := recommender.Recommend(ctx, workload) + Expect(rec.Resources.Equal(&defaultRes)).To(BeTrue()) workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ CronScalingRules: []tfv1.CronScalingRule{ @@ -152,12 +154,12 @@ var _ = Describe("CronRecommender", func() { } workload.Annotations = cronScalingResourcesToAnnotations(&defaultRes) - recommendation, _ = recommender.Recommend(ctx, 
workload) - Expect(recommendation.Equal(&workload.Spec.Resources)).To(BeTrue()) + rec, _ = recommender.Recommend(ctx, workload) + Expect(rec.Resources.Equal(&workload.Spec.Resources)).To(BeTrue()) workload.Annotations = cronScalingResourcesToAnnotations(&tfv1.Resources{}) - recommendation, _ = recommender.Recommend(ctx, workload) - Expect(recommendation).To(BeNil()) + rec, _ = recommender.Recommend(ctx, workload) + Expect(rec).To(BeNil()) }) It("should return error if getting multiple active rules", func() { diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go index b08113e5..23ebaf47 100644 --- a/internal/autoscaler/recommender/percentile_recommender.go +++ b/internal/autoscaler/recommender/percentile_recommender.go @@ -80,7 +80,7 @@ func (p *PercentileRecommender) Name() string { return "percentile" } -func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) { +func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workload.State) (*Recommendation, error) { log := log.FromContext(ctx) aggregator := workload.WorkerUsageAggregator if aggregator.IsEmpty() { @@ -105,32 +105,32 @@ func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workloa log.Info("recommendation", "workload", workload.Name, "recommender", p.Name(), "resources", rr) - result := &tfv1.Resources{} + targetRes := &tfv1.Resources{} if curRes.Requests.Tflops.Cmp(rr.LowerBoundTflops) < 0 || curRes.Requests.Tflops.Cmp(rr.UpperBoundTflops) > 0 { - result.Requests.Tflops = rr.TargetTflops + targetRes.Requests.Tflops = rr.TargetTflops targetLimit := getProportionalLimit(&curRes.Limits.Tflops, &curRes.Requests.Tflops, &rr.TargetTflops) if targetLimit == nil { return nil, fmt.Errorf("failed to get tflops limit from workload %s", workload.Name) } - result.Limits.Tflops = *targetLimit + targetRes.Limits.Tflops = *targetLimit } if curRes.Requests.Vram.Cmp(rr.LowerBoundVram) < 0 || curRes.Requests.Vram.Cmp(rr.UpperBoundVram) > 0 { - result.Requests.Vram = rr.TargetVram + targetRes.Requests.Vram = rr.TargetVram targetLimit := getProportionalLimit(&curRes.Limits.Vram, &curRes.Requests.Vram, &rr.TargetVram) if targetLimit == nil { return nil, fmt.Errorf("failed to get vram limit from workload %s", workload.Name) } - result.Limits.Vram = *targetLimit + targetRes.Limits.Vram = *targetLimit } - if result.Equal(curRes) { - return nil, nil - } - - return result, nil + return &Recommendation{ + Resources: *targetRes, + Applied: targetRes.Equal(curRes), + ScaleDownLocking: false, + }, nil } func (p *PercentileRecommender) getPercentileConfig(asr *tfv1.AutoSetResources) *PercentileConfig { diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go index 3248ad6c..60161f56 100644 --- a/internal/autoscaler/recommender/recommender.go +++ b/internal/autoscaler/recommender/recommender.go @@ -7,13 +7,32 @@ import ( "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" ) -const ( - Percentile = "percentile" - Cron = "cron" -) - // Interface defines the contract for resource recommendation strategies used by the autoscaler. 
type Interface interface { Name() string - Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) + Recommend(ctx context.Context, workload *workload.State) (*Recommendation, error) +} + +type Recommendation struct { + Resources tfv1.Resources + Applied bool + ScaleDownLocking bool +} + +func MergeRecommendations(recommendations map[string]*Recommendation) *tfv1.Resources { + result := &tfv1.Resources{} + for _, rec := range recommendations { + if !rec.ScaleDownLocking && rec.Applied { + continue + } + if result.Requests.Tflops.Cmp(rec.Resources.Requests.Tflops) < 0 { + result.Requests.Tflops = rec.Resources.Requests.Tflops + result.Limits.Tflops = rec.Resources.Limits.Tflops + } + if result.Requests.Vram.Cmp(rec.Resources.Requests.Vram) < 0 { + result.Requests.Vram = rec.Resources.Requests.Vram + result.Limits.Vram = rec.Resources.Limits.Vram + } + } + return result } diff --git a/internal/autoscaler/recommender/recommender_test.go b/internal/autoscaler/recommender/recommender_test.go new file mode 100644 index 00000000..8c10c6e7 --- /dev/null +++ b/internal/autoscaler/recommender/recommender_test.go @@ -0,0 +1,145 @@ +package recommender + +import ( + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" +) + +var _ = Describe("Recommender", func() { + It("should merge recomendations based on a larger request value", func() { + recs := map[string]*Recommendation{ + "rec1": { + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("15Gi"), + }, + }, + Applied: false, + ScaleDownLocking: false, + }, + "rec2": { + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("20Gi"), + }, + }, + Applied: false, + ScaleDownLocking: false, + }, + } + + final := MergeRecommendations(recs) + Expect(final.Equal(&tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("20Gi"), + }, + })).To(BeTrue()) + }) + + It("should be excluded from megring operations if recommendations have been applied", func() { + recs := map[string]*Recommendation{ + "rec1": { + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("15Gi"), + }, + }, + Applied: false, + ScaleDownLocking: false, + }, + "rec2": { + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("100"), + Vram: resource.MustParse("150Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("100"), + Vram: resource.MustParse("150Gi"), + }, + }, + Applied: true, + ScaleDownLocking: false, + }, + } + + final := MergeRecommendations(recs) + Expect(final.Equal(&tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("15Gi"), + }, + })).To(BeTrue()) + }) + + It("should not scale down when merging recomendations if scale down 
is locked", func() { + recs := map[string]*Recommendation{ + "rec1": { + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("50"), + Vram: resource.MustParse("50Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("50"), + Vram: resource.MustParse("50Gi"), + }, + }, + Applied: true, + ScaleDownLocking: true, + }, + "rec2": { + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + }, + Applied: false, + ScaleDownLocking: false, + }, + } + + final := MergeRecommendations(recs) + Expect(final.Equal(&tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("50"), + Vram: resource.MustParse("50Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("50"), + Vram: resource.MustParse("50Gi"), + }, + })).To(BeTrue()) + }) +}) diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index 03415465..982912f1 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -16,7 +16,7 @@ import ( type Handler interface { UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) - ApplyRecommendationToWorkload(ctx context.Context, workloadState *State, recommendation *tfv1.Resources) error + ApplyResourcesToWorkload(ctx context.Context, workloadState *State, recommendation *tfv1.Resources) error } type handler struct { @@ -46,7 +46,7 @@ func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workloadState.updateWorkers(workerList) } -func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { +func (h *handler) ApplyResourcesToWorkload(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { if err := h.updateAutoScalingAnnotations(ctx, workload, recommendation); err != nil { return fmt.Errorf("failed to update auto scaling annotations: %v", err) } @@ -66,7 +66,7 @@ func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, workload *S if !worker.DeletionTimestamp.IsZero() { continue } - if err := h.applyRecommendationToWorker(ctx, workload, &worker, recommendation); err != nil { + if err := h.applyResourcesToWorker(ctx, workload, &worker, recommendation); err != nil { return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) } } @@ -97,7 +97,7 @@ func (h *handler) updateAutoScalingAnnotations( return nil } -func (h *handler) applyRecommendationToWorker(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { +func (h *handler) applyResourcesToWorker(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { log := log.FromContext(ctx) curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) From 12b42193af4143b96eb9291f369588e99a237096 Mon Sep 17 00:00:00 2001 From: knave Date: Fri, 8 Aug 2025 13:45:59 +0800 Subject: [PATCH 26/27] refactor: improve naming --- internal/autoscaler/autoscaler_test.go | 16 ++++++++-------- internal/autoscaler/workload/handler.go | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go index 9c0c127f..02500acf 100644 --- a/internal/autoscaler/autoscaler_test.go 
+++ b/internal/autoscaler/autoscaler_test.go @@ -162,7 +162,7 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.loadWorkloads(ctx) - rec := tfv1.Resources{ + targetRes := tfv1.Resources{ Requests: tfv1.Resource{ Tflops: resource.MustParse("110"), Vram: resource.MustParse("110Gi"), @@ -174,20 +174,20 @@ var _ = Describe("Autoscaler", func() { } scaler.recommenders[0] = &FakeRecommender{ - Resources: &rec, + Resources: &targetRes, } scaler.processWorkloads(ctx) Eventually(func(g Gomega) { res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) - g.Expect(res.Equal(&rec)).To(BeTrue()) + g.Expect(res.Equal(&targetRes)).To(BeTrue()) }).Should(Succeed()) // Upon reprocessing the workload, it should skip resource updates scaler.processWorkloads(ctx) Consistently(func(g Gomega) { res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) - g.Expect(res.Equal(&rec)).To(BeTrue()) + g.Expect(res.Equal(&targetRes)).To(BeTrue()) }).Should(Succeed()) }) @@ -203,7 +203,7 @@ var _ = Describe("Autoscaler", func() { scaler, _ := NewAutoscaler(k8sClient, allocator) scaler.loadWorkloads(ctx) - rec := tfv1.Resources{ + targetRes := tfv1.Resources{ Requests: tfv1.Resource{ Tflops: resource.MustParse("110"), Vram: resource.MustParse("110Gi"), @@ -215,7 +215,7 @@ var _ = Describe("Autoscaler", func() { } scaler.recommenders[0] = &FakeRecommender{ - Resources: &rec, + Resources: &targetRes, } workloadState := scaler.workloads[workload.Name] @@ -390,7 +390,7 @@ var _ = Describe("Autoscaler", func() { g.Expect(res.Equal(&resourcesInRule)).To(BeTrue()) }).Should(Succeed()) - fakeRec := tfv1.Resources{ + fakeRes := tfv1.Resources{ Requests: tfv1.Resource{ Tflops: resource.MustParse("1"), Vram: resource.MustParse("1Gi"), @@ -401,7 +401,7 @@ var _ = Describe("Autoscaler", func() { }, } - scaler.recommenders = append(scaler.recommenders, &FakeRecommender{Resources: &fakeRec}) + scaler.recommenders = append(scaler.recommenders, &FakeRecommender{Resources: &fakeRes}) scaler.processWorkloads(ctx) Consistently(func(g Gomega) { diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index 982912f1..a5352ff6 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -16,7 +16,7 @@ import ( type Handler interface { UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) - ApplyResourcesToWorkload(ctx context.Context, workloadState *State, recommendation *tfv1.Resources) error + ApplyResourcesToWorkload(ctx context.Context, workloadState *State, targetRes *tfv1.Resources) error } type handler struct { @@ -46,8 +46,8 @@ func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workloadState.updateWorkers(workerList) } -func (h *handler) ApplyResourcesToWorkload(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { - if err := h.updateAutoScalingAnnotations(ctx, workload, recommendation); err != nil { +func (h *handler) ApplyResourcesToWorkload(ctx context.Context, workload *State, targetRes *tfv1.Resources) error { + if err := h.updateAutoScalingAnnotations(ctx, workload, targetRes); err != nil { return fmt.Errorf("failed to update auto scaling annotations: %v", err) } @@ -66,7 +66,7 @@ func (h *handler) ApplyResourcesToWorkload(ctx context.Context, workload *State, if !worker.DeletionTimestamp.IsZero() { continue } - if err := h.applyResourcesToWorker(ctx, 
workload, &worker, recommendation); err != nil { + if err := h.applyResourcesToWorker(ctx, workload, &worker, targetRes); err != nil { return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) } } @@ -97,18 +97,18 @@ func (h *handler) updateAutoScalingAnnotations( return nil } -func (h *handler) applyResourcesToWorker(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { +func (h *handler) applyResourcesToWorker(ctx context.Context, workload *State, worker *corev1.Pod, targetRes *tfv1.Resources) error { log := log.FromContext(ctx) curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) if err != nil { return fmt.Errorf("failed to get current worker resources: %v", err) } - if curRes != nil && curRes.Equal(recommendation) { + if curRes != nil && curRes.Equal(targetRes) { return nil } - annotationsToUpdate := utils.CurrentResourcesToAnnotations(recommendation) + annotationsToUpdate := utils.CurrentResourcesToAnnotations(targetRes) if !workload.ShouldScaleResource(tfv1.ResourceTflops) { delete(annotationsToUpdate, constants.TFLOPSRequestAnnotation) delete(annotationsToUpdate, constants.TFLOPSLimitAnnotation) @@ -124,16 +124,16 @@ func (h *handler) applyResourcesToWorker(ctx context.Context, workload *State, w isScaleUp := false if _, ok := annotationsToUpdate[constants.TFLOPSRequestAnnotation]; ok { - isScaleUp = recommendation.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0 + isScaleUp = targetRes.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0 } else { - isScaleUp = recommendation.Requests.Vram.Cmp(curRes.Requests.Vram) > 0 + isScaleUp = targetRes.Requests.Vram.Cmp(curRes.Requests.Vram) > 0 } adjustRequest := &tfv1.AdjustRequest{ PodUID: string(worker.UID), IsScaleUp: isScaleUp, - NewRequest: recommendation.Requests, - NewLimit: recommendation.Limits, + NewRequest: targetRes.Requests, + NewLimit: targetRes.Limits, } if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { return fmt.Errorf("failed to adjust allocation: %v", err) @@ -146,7 +146,7 @@ func (h *handler) applyResourcesToWorker(ctx context.Context, workload *State, w return fmt.Errorf("failed to patch worker %s: %v", worker.Name, err) } - log.Info("apply recommendation successfully", "worker", worker.Name, "recommendation", recommendation, "currentResources", curRes) + log.Info("apply resources successfully", "worker", worker.Name, "targetResources", targetRes, "currentResources", curRes) return nil } From abd4522bed59988882be4c47c2a66e23897a71c9 Mon Sep 17 00:00:00 2001 From: knave Date: Sun, 10 Aug 2025 04:22:24 +0800 Subject: [PATCH 27/27] fix: scale down issue --- internal/autoscaler/autoscaler.go | 33 ++------- .../autoscaler/metrics/metrics_aggregator.go | 5 +- .../recommender/cron_recommender.go | 2 +- .../recommender/percentile_recommender.go | 2 +- .../autoscaler/recommender/recommender.go | 56 ++++++++++++--- .../recommender/recommender_test.go | 69 ++----------------- internal/autoscaler/workload/handler.go | 4 +- 7 files changed, 64 insertions(+), 107 deletions(-) diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go index 333a445e..24d98474 100644 --- a/internal/autoscaler/autoscaler.go +++ b/internal/autoscaler/autoscaler.go @@ -154,36 +154,17 @@ func (s *Autoscaler) processWorkloads(ctx context.Context) { log.Info("processing workloads") for _, workload := range s.workloads { - targetRes := s.calcWorkloadTargetResources(ctx, workload) - if targetRes.IsZero() { - continue - } - - 
log.Info("recommended target resources", "workload", workload.Name, "resources", targetRes) - - if err := s.workloadHandler.ApplyResourcesToWorkload(ctx, workload, targetRes); err != nil { - log.Error(err, "failed to apply resources", "workload", workload.Name, "resources", targetRes) - } - } -} - -func (s *Autoscaler) calcWorkloadTargetResources(ctx context.Context, workload *workload.State) *tfv1.Resources { - log := log.FromContext(ctx) - recommendations := map[string]*recommender.Recommendation{} - for _, recommender := range s.recommenders { - name := recommender.Name() - rec, err := recommender.Recommend(ctx, workload) + resources, err := recommender.GetResourcesFromRecommenders(ctx, workload, s.recommenders) if err != nil { - log.Error(err, "failed to get recommendation", "recommender", name) - continue + log.Error(err, "failed to get resources from recommenders") } - if rec != nil { - recommendations[name] = rec - log.Info("recommendation", "workload", workload.Name, "recommender", name, "recommendation", rec) + if resources != nil { + log.Info("recommended resources", "workload", workload.Name, "resources", resources) + if err := s.workloadHandler.ApplyResourcesToWorkload(ctx, workload, resources); err != nil { + log.Error(err, "failed to apply resources", "workload", workload.Name, "resources", resources) + } } } - - return recommender.MergeRecommendations(recommendations) } func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { diff --git a/internal/autoscaler/metrics/metrics_aggregator.go b/internal/autoscaler/metrics/metrics_aggregator.go index 5ffe51d9..7c11edfb 100644 --- a/internal/autoscaler/metrics/metrics_aggregator.go +++ b/internal/autoscaler/metrics/metrics_aggregator.go @@ -36,10 +36,7 @@ func NewWorkerUsageAggregator() *WorkerUsageAggregator { } func (w *WorkerUsageAggregator) IsEmpty() bool { - if w.TflopsHistogram.IsEmpty() && w.VramHistogram.IsEmpty() { - return true - } - return false + return w.TflopsHistogram.IsEmpty() && w.VramHistogram.IsEmpty() } func (w *WorkerUsageAggregator) AddTflopsSample(sample *WorkerUsage) bool { diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go index 3e9a6eb6..2829c9f1 100644 --- a/internal/autoscaler/recommender/cron_recommender.go +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -65,7 +65,7 @@ func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*Re return &Recommendation{ Resources: *targetRes, - Applied: curRes != nil && targetRes.Equal(curRes), + HasApplied: curRes != nil && targetRes.Equal(curRes), ScaleDownLocking: true, }, nil } diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go index 23ebaf47..f285506d 100644 --- a/internal/autoscaler/recommender/percentile_recommender.go +++ b/internal/autoscaler/recommender/percentile_recommender.go @@ -128,7 +128,7 @@ func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workloa return &Recommendation{ Resources: *targetRes, - Applied: targetRes.Equal(curRes), + HasApplied: targetRes.Equal(curRes), ScaleDownLocking: false, }, nil } diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go index 60161f56..cde3ff9d 100644 --- a/internal/autoscaler/recommender/recommender.go +++ b/internal/autoscaler/recommender/recommender.go @@ -2,6 +2,7 @@ package recommender import ( "context" + "fmt" tfv1 
"github.com/NexusGPU/tensor-fusion/api/v1" "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" @@ -15,24 +16,57 @@ type Interface interface { type Recommendation struct { Resources tfv1.Resources - Applied bool + HasApplied bool ScaleDownLocking bool } -func MergeRecommendations(recommendations map[string]*Recommendation) *tfv1.Resources { +func GetResourcesFromRecommenders(ctx context.Context, workload *workload.State, recommenders []Interface) (*tfv1.Resources, error) { + recommendations := map[string]*Recommendation{} + for _, recommender := range recommenders { + rec, err := recommender.Recommend(ctx, workload) + if err != nil { + return nil, fmt.Errorf("failed to get recommendation from %s: %v", recommender.Name(), err) + } + if rec != nil { + recommendations[recommender.Name()] = rec + } + } + + if len(recommendations) <= 0 { + return nil, nil + } + + return getResourcesFromRecommendations(recommendations), nil +} + +func getResourcesFromRecommendations(recommendations map[string]*Recommendation) *tfv1.Resources { result := &tfv1.Resources{} + minRes := &tfv1.Resources{} for _, rec := range recommendations { - if !rec.ScaleDownLocking && rec.Applied { - continue + if !rec.HasApplied { + mergeResourcesByLargerRequests(result, &rec.Resources) } - if result.Requests.Tflops.Cmp(rec.Resources.Requests.Tflops) < 0 { - result.Requests.Tflops = rec.Resources.Requests.Tflops - result.Limits.Tflops = rec.Resources.Limits.Tflops - } - if result.Requests.Vram.Cmp(rec.Resources.Requests.Vram) < 0 { - result.Requests.Vram = rec.Resources.Requests.Vram - result.Limits.Vram = rec.Resources.Limits.Vram + if rec.ScaleDownLocking { + mergeResourcesByLargerRequests(minRes, &rec.Resources) } } + + if result.IsZero() || + result.Requests.Tflops.Cmp(minRes.Requests.Tflops) < 0 || + result.Requests.Vram.Cmp(minRes.Requests.Vram) < 0 { + return nil + } + return result } + +func mergeResourcesByLargerRequests(s *tfv1.Resources, t *tfv1.Resources) { + if s.Requests.Tflops.Cmp(t.Requests.Tflops) < 0 { + s.Requests.Tflops = t.Requests.Tflops + s.Limits.Tflops = t.Limits.Tflops + } + if s.Requests.Vram.Cmp(t.Requests.Vram) < 0 { + s.Requests.Vram = t.Requests.Vram + s.Limits.Vram = t.Limits.Vram + } +} diff --git a/internal/autoscaler/recommender/recommender_test.go b/internal/autoscaler/recommender/recommender_test.go index 8c10c6e7..ac770962 100644 --- a/internal/autoscaler/recommender/recommender_test.go +++ b/internal/autoscaler/recommender/recommender_test.go @@ -21,7 +21,7 @@ var _ = Describe("Recommender", func() { Vram: resource.MustParse("15Gi"), }, }, - Applied: false, + HasApplied: false, ScaleDownLocking: false, }, "rec2": { @@ -35,12 +35,12 @@ var _ = Describe("Recommender", func() { Vram: resource.MustParse("20Gi"), }, }, - Applied: false, + HasApplied: false, ScaleDownLocking: false, }, } - final := MergeRecommendations(recs) + final := getResourcesFromRecommendations(recs) Expect(final.Equal(&tfv1.Resources{ Requests: tfv1.Resource{ Tflops: resource.MustParse("10"), @@ -53,52 +53,7 @@ var _ = Describe("Recommender", func() { })).To(BeTrue()) }) - It("should be excluded from megring operations if recommendations have been applied", func() { - recs := map[string]*Recommendation{ - "rec1": { - Resources: tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: resource.MustParse("10"), - Vram: resource.MustParse("10Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("15"), - Vram: resource.MustParse("15Gi"), - }, - }, - Applied: false, - ScaleDownLocking: false, - }, - "rec2": { 
- Resources: tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: resource.MustParse("100"), - Vram: resource.MustParse("150Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("100"), - Vram: resource.MustParse("150Gi"), - }, - }, - Applied: true, - ScaleDownLocking: false, - }, - } - - final := MergeRecommendations(recs) - Expect(final.Equal(&tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: resource.MustParse("10"), - Vram: resource.MustParse("10Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("15"), - Vram: resource.MustParse("15Gi"), - }, - })).To(BeTrue()) - }) - - It("should not scale down when merging recomendations if scale down is locked", func() { + It("should not reduce resources if scale down is locked", func() { recs := map[string]*Recommendation{ "rec1": { Resources: tfv1.Resources{ @@ -111,7 +66,7 @@ var _ = Describe("Recommender", func() { Vram: resource.MustParse("50Gi"), }, }, - Applied: true, + HasApplied: true, ScaleDownLocking: true, }, "rec2": { @@ -125,21 +80,11 @@ var _ = Describe("Recommender", func() { Vram: resource.MustParse("10Gi"), }, }, - Applied: false, + HasApplied: false, ScaleDownLocking: false, }, } - final := MergeRecommendations(recs) - Expect(final.Equal(&tfv1.Resources{ - Requests: tfv1.Resource{ - Tflops: resource.MustParse("50"), - Vram: resource.MustParse("50Gi"), - }, - Limits: tfv1.Resource{ - Tflops: resource.MustParse("50"), - Vram: resource.MustParse("50Gi"), - }, - })).To(BeTrue()) + Expect(getResourcesFromRecommendations(recs)).To(BeNil()) }) }) diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go index a5352ff6..68ac0fb3 100644 --- a/internal/autoscaler/workload/handler.go +++ b/internal/autoscaler/workload/handler.go @@ -77,7 +77,7 @@ func (h *handler) ApplyResourcesToWorkload(ctx context.Context, workload *State, func (h *handler) updateAutoScalingAnnotations( ctx context.Context, state *State, - recommendation *tfv1.Resources) error { + targetRes *tfv1.Resources) error { workload := &tfv1.TensorFusionWorkload{} if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, workload); err != nil { return fmt.Errorf("failed to get workload: %v", err) @@ -87,7 +87,7 @@ func (h *handler) updateAutoScalingAnnotations( workload.Annotations = map[string]string{} } patch := client.MergeFrom(workload.DeepCopy()) - maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) + maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(targetRes)) maps.Copy(workload.Annotations, state.ScalingAnnotations) if err := h.Patch(ctx, workload, patch); err != nil { return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err)
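
The net effect of this last patch is that a recommendation is no longer discarded merely because it has already been applied; instead, any ScaleDownLocking recommendation (currently only the cron recommender produces one) acts as a floor that the merged target may not undercut, and returning nil now means "leave the workload as it is". The sketch below restates the logic of getResourcesFromRecommendations and mergeResourcesByLargerRequests with plain integers in place of resource.Quantity so the guard is easy to trace; the names Resources, resolve, and mergeByLargerRequests are simplified stand-ins for illustration, not the exact API.

// Simplified sketch of the scale-down guard introduced by "fix: scale down issue".
// Resource quantities are reduced to int64 for readability; the real code uses
// tfv1.Resources with resource.Quantity fields and the same comparison structure.
package main

import "fmt"

type Resources struct {
	RequestsTflops, LimitsTflops int64
	RequestsVram, LimitsVram     int64
}

func (r Resources) IsZero() bool {
	return r == Resources{}
}

type Recommendation struct {
	Resources        Resources
	HasApplied       bool // the workload already runs with these resources
	ScaleDownLocking bool // acts as a floor the merged target may not drop below
}

// mergeByLargerRequests keeps, per resource dimension, the larger request and
// its matching limit, mirroring mergeResourcesByLargerRequests.
func mergeByLargerRequests(dst *Resources, src Resources) {
	if dst.RequestsTflops < src.RequestsTflops {
		dst.RequestsTflops, dst.LimitsTflops = src.RequestsTflops, src.LimitsTflops
	}
	if dst.RequestsVram < src.RequestsVram {
		dst.RequestsVram, dst.LimitsVram = src.RequestsVram, src.LimitsVram
	}
}

// resolve mirrors getResourcesFromRecommendations: recommendations that have
// not been applied compete for the target, locking recommendations define a
// floor, and a nil result means no resources should be applied.
func resolve(recs map[string]*Recommendation) *Resources {
	target, floor := &Resources{}, &Resources{}
	for _, rec := range recs {
		if !rec.HasApplied {
			mergeByLargerRequests(target, rec.Resources)
		}
		if rec.ScaleDownLocking {
			mergeByLargerRequests(floor, rec.Resources)
		}
	}
	if target.IsZero() ||
		target.RequestsTflops < floor.RequestsTflops ||
		target.RequestsVram < floor.RequestsVram {
		return nil
	}
	return target
}

func main() {
	// An active cron rule already holds the workload at 50/50 (applied, locked),
	// while the percentile recommender would lower it to 10/10: nothing is applied.
	recs := map[string]*Recommendation{
		"cron":       {Resources: Resources{50, 50, 50, 50}, HasApplied: true, ScaleDownLocking: true},
		"percentile": {Resources: Resources{10, 10, 10, 10}},
	}
	fmt.Println(resolve(recs)) // <nil>
}

Run against the scenario in the reworked recommender test, a locked cron recommendation at 50 TFLOPS / 50Gi together with an unapplied percentile recommendation at 10 TFLOPS / 10Gi resolves to nil, so processWorkloads skips ApplyResourcesToWorkload rather than scaling the workload down underneath an active cron rule.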