diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 1407b79f..307fcffb 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -86,17 +86,75 @@ type GPUFilter struct { } type AutoScalingConfig struct { - // layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - // VPA-like, aggregate metrics data <1m - AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"` + // layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode + // Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks + AutoSetResources AutoSetResources `json:"autoSetResources,omitempty"` // layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit // HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works) AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"` - // layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet - // Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks - AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"` + // CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. + CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"` +} + +// CronScalingRule defines the rule for scaling resources based on a cron schedule. +// It allows enabling/disabling the scaler, specifying the time window for scaling, +// and configuring the desired resources and replicas during the scheduled period. +type CronScalingRule struct { + // Enable specifies whether the cron scaler is enabled. + Enable bool `json:"enable,omitempty"` + // Name is the identifier for the cron scaler. + Name string `json:"name,omitempty"` + // Start is the start time for the scaling schedule, in cron format. + Start string `json:"start,omitempty"` + // End is the end time for the scaling schedule, in cron format. + End string `json:"end,omitempty"` + // DesiredResources specifies the target resources to scale to during the schedule. + DesiredResources Resources `json:"desiredResources,omitempty"` + // ResourceMultiplier is a string representing the multiplier to apply to resources. + ResourceMultiplier string `json:"resourceMultiplier,omitempty"` + // DesiredReplicas is the target number of replicas during the schedule. + DesiredReplicas *int32 `json:"desiredReplicas,omitempty"` + // ReplicasMultiplier is a string representing the multiplier to apply to replicas. + ReplicasMultiplier string `json:"replicasMultiplier,omitempty"` +} + +type AutoSetResources struct { + Enable bool `json:"enable,omitempty"` + + // Target resource to scale, such as "tflops", "vram", or "all" by default + TargetResource string `json:"targetResource,omitempty"` + + // Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9 + TargetTflopsPercentile string `json:"targettflopspercentile,omitempty"` + + // Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5 + LowerBoundTflopsPercentile string `json:"lowerboundtflopspercentile,omitempty"` + + // Tflops usage percentile that will be used for the upper bound on tflops recommendation. 
Default: 0.95 + UpperBoundTflopsPercentile string `json:"upperboundtflopspercentile,omitempty"` + + // Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9 + TargetVramPercentile string `json:"targetvrampercentile,omitempty"` + + // Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5 + LowerBoundVramPercentile string `json:"lowerboundvrampercentile,omitempty"` + + // Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.95 + UpperBoundVramPercentile string `json:"upperboundvrampercentile,omitempty"` + + // Fraction of usage added as the safety margin to the recommended request. Default: 0.15 + RequestMarginFraction string `json:"requestMarginFraction,omitempty"` + + // The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h + ConfidenceInterval string `json:"confidenceInterval,omitempty"` + + // How much time back TSDB have to be queried to get historical metrics. Default: 1d + HistoryLength string `json:"historyLength,omitempty"` + + // Resolution at which TSDB is queried for historical metrics. Default: 1m + HistoryResolution string `json:"historyResolution,omitempty"` } // A typical autoLimits algorithm could be checking every 5m, look back 1 day data, diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 1b304eca..11075bbf 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -21,6 +21,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +type ResourceName string + +const ( + ResourceTflops ResourceName = "tflops" + ResourceVram ResourceName = "vram" +) + type Resource struct { Tflops resource.Quantity `json:"tflops"` Vram resource.Quantity `json:"vram"` @@ -31,6 +38,20 @@ type Resources struct { Limits Resource `json:"limits"` } +func (r *Resources) Equal(t *Resources) bool { + return r.Requests.Tflops.Equal(t.Requests.Tflops) && + r.Requests.Vram.Equal(t.Requests.Vram) && + r.Limits.Tflops.Equal(t.Limits.Tflops) && + r.Limits.Vram.Equal(t.Limits.Vram) +} + +func (r *Resources) IsZero() bool { + return r.Requests.Tflops.IsZero() && + r.Requests.Vram.IsZero() && + r.Limits.Tflops.IsZero() && + r.Limits.Vram.IsZero() +} + // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection. type TensorFusionConnectionSpec struct { WorkloadName string `json:"workloadName"` diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 2f1bf367..579a9c25 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -138,9 +138,15 @@ func (in *AutoFreezeAndResume) DeepCopy() *AutoFreezeAndResume { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig) { *out = *in - in.AutoSetLimits.DeepCopyInto(&out.AutoSetLimits) + out.AutoSetResources = in.AutoSetResources out.AutoSetReplicas = in.AutoSetReplicas - in.AutoSetRequests.DeepCopyInto(&out.AutoSetRequests) + if in.CronScalingRules != nil { + in, out := &in.CronScalingRules, &out.CronScalingRules + *out = make([]CronScalingRule, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoScalingConfig. 
@@ -204,6 +210,21 @@ func (in *AutoSetRequests) DeepCopy() *AutoSetRequests { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoSetResources) DeepCopyInto(out *AutoSetResources) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetResources. +func (in *AutoSetResources) DeepCopy() *AutoSetResources { + if in == nil { + return nil + } + out := new(AutoSetResources) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig) { *out = *in @@ -347,6 +368,27 @@ func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CronScalingRule) DeepCopyInto(out *CronScalingRule) { + *out = *in + in.DesiredResources.DeepCopyInto(&out.DesiredResources) + if in.DesiredReplicas != nil { + in, out := &in.DesiredReplicas, &out.DesiredReplicas + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CronScalingRule. +func (in *CronScalingRule) DeepCopy() *CronScalingRule { + if in == nil { + return nil + } + out := new(CronScalingRule) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DataPipeline4ResourcesConfig) DeepCopyInto(out *DataPipeline4ResourcesConfig) { *out = *in diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 69a12b27..bb8dd068 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -50,41 +50,6 @@ spec: autoScaling: description: scale the workload based on the usage and traffic properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -105,40 +70,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + 
description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml index fc7818d3..1661ae5b 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -46,41 +46,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -101,40 +66,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual 
usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml index 19b9fd2e..01005b7c 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml @@ -45,41 +45,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -100,40 +65,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only 
for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 69a12b27..bb8dd068 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -50,41 +50,6 @@ spec: autoScaling: description: scale the workload based on the usage and traffic properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -105,40 +70,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, 
such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml index fc7818d3..1661ae5b 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -46,41 +46,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -101,40 +66,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M 
remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml index 19b9fd2e..01005b7c 100644 --- a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml +++ b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml @@ -45,41 +45,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -100,40 +65,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust 
baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/go.mod b/go.mod index bfdc3c41..69a98eeb 100644 --- a/go.mod +++ b/go.mod @@ -16,21 +16,24 @@ require ( github.com/lithammer/shortuuid/v4 v4.2.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 + github.com/pkg/errors v0.9.1 + github.com/robfig/cron/v3 v3.0.1 github.com/samber/lo v1.51.0 github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.10.0 - go.etcd.io/etcd/client/v2 v2.305.16 + golang.org/x/time v0.9.0 gomodules.xyz/jsonpatch/v2 v2.5.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/mysql v1.6.0 gorm.io/gorm v1.30.0 k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 + k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0 k8s.io/client-go v0.33.2 k8s.io/component-base v0.32.5 k8s.io/component-helpers v0.33.2 k8s.io/klog/v2 v2.130.1 - k8s.io/kubernetes v1.32.5 + k8s.io/kubernetes v1.32.6 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.20.4 sigs.k8s.io/scheduler-plugins v0.31.8 @@ -110,7 +113,6 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect @@ -119,6 +121,7 @@ require ( github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/stretchr/objx v0.5.2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect 
github.com/ugorji/go/codec v1.2.12 // indirect github.com/x448/float16 v0.8.4 // indirect @@ -148,7 +151,6 @@ require ( golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.32.0 // indirect golang.org/x/text v0.25.0 // indirect - golang.org/x/time v0.9.0 // indirect golang.org/x/tools v0.33.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241223144023-3abc09e42ca8 // indirect diff --git a/go.sum b/go.sum index 822b5c80..4f7607ff 100644 --- a/go.sum +++ b/go.sum @@ -250,6 +250,8 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -270,6 +272,7 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -458,6 +461,8 @@ k8s.io/apimachinery v0.32.5 h1:6We3aJ6crC0ap8EhsEXcgX3LpI6SEjubpiOMXLROwPM= k8s.io/apimachinery v0.32.5/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= k8s.io/apiserver v0.32.5 h1:phmm2EOUVFI+cLiq8Grtuh166fTt/qgvkGPkpgzp5uY= k8s.io/apiserver v0.32.5/go.mod h1:5bfueS1tgARVWVXRJBMI5mHoCmev0jOvbxebai/kiqc= +k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0 h1:oVv4QrTPKM7vWyQRRzCDgDgi00NWo4Rjle5/nujP/dI= +k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0/go.mod h1:W4k7qGP8A9Xqp+UK+lM49AfsWkAdXzE80F/s8kxwWVI= k8s.io/client-go v0.32.5 h1:huFmQMzgWu0z4kbWsuZci+Gt4Fo72I4CcrvhToZ/Qp0= k8s.io/client-go v0.32.5/go.mod h1:Qchw6f9WIVrur7DKojAHpRgGLcANT0RLIvF39Jz58xA= k8s.io/cloud-provider v0.32.5 h1:KzO0mpXYArWxQH91+a4WLLrhTaO5RGWmQn4lzUXY6ak= diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go new file mode 100644 index 00000000..ac5e4662 --- /dev/null +++ b/internal/autoscaler/autoscaler.go @@ -0,0 +1,215 @@ +package autoscaler + +import ( + "context" + "errors" + "fmt" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + 
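+	// controller-runtime's log and manager packages: the autoscaler logs via log.FromContext and registers itself as a leader-elected manager.Runnable.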
"sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +var ( + _ manager.Runnable = (*Autoscaler)(nil) + _ manager.LeaderElectionRunnable = (*Autoscaler)(nil) +) + +type Autoscaler struct { + client.Client + allocator *gpuallocator.GpuAllocator + metricsProvider metrics.Provider + recommenders []recommender.Interface + workloadHandler workload.Handler + workloads map[string]*workload.State +} + +func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { + if c == nil { + return nil, errors.New("must specify client") + } + + if allocator == nil { + return nil, errors.New("must specify allocator") + } + + recommenders := []recommender.Interface{ + recommender.NewPercentileRecommender(), + recommender.NewCronRecommender(c), + } + + return &Autoscaler{ + Client: c, + allocator: allocator, + metricsProvider: metrics.NewProvider(nil), + recommenders: recommenders, + workloadHandler: workload.NewHandler(c, allocator), + workloads: map[string]*workload.State{}, + }, nil +} + +func (s *Autoscaler) Start(ctx context.Context) error { + log := log.FromContext(ctx) + log.Info("Starting autoscaler") + + // Handle timeout for loading historical metrics + historyCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + s.loadHistoryMetrics(historyCtx) + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.Run(ctx) + case <-ctx.Done(): + log.Info("Stopping autoscaler") + return nil + } + } +} + +func (s *Autoscaler) NeedLeaderElection() bool { + return true +} + +func (s *Autoscaler) Run(ctx context.Context) { + log := log.FromContext(ctx) + + log.Info("Autoscaler running") + s.loadWorkloads(ctx) + s.loadRealTimeMetrics(ctx) + s.processWorkloads(ctx) +} + +func (s *Autoscaler) loadWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + + workloadList := tfv1.TensorFusionWorkloadList{} + if err := s.List(ctx, &workloadList); err != nil { + log.Error(err, "failed to list workloads") + return + } + + observedWorkloads := map[string]bool{} + for _, workload := range workloadList.Items { + if !workload.DeletionTimestamp.IsZero() { + continue + } + + workloadState := s.findOrCreateWorkloadState(workload.Name) + s.workloadHandler.UpdateWorkloadState(ctx, workloadState, &workload) + observedWorkloads[workload.Name] = true + } + + // remove non-existent workloads + for name := range s.workloads { + if !observedWorkloads[name] { + delete(s.workloads, name) + } + } +} + +func (s *Autoscaler) loadHistoryMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading historical metrics") + + workersMetrics, err := s.metricsProvider.GetHistoryMetrics() + if err != nil { + log.Error(err, "failed to get history metrics") + return + } + for _, sample := range workersMetrics { + s.findOrCreateWorkloadState(sample.WorkloadName).AddSample(sample) + } +} + +func (s *Autoscaler) loadRealTimeMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading realtime metrics") + + workersMetrics, err := s.metricsProvider.GetWorkersMetrics() + if err != nil { + log.Error(err, "failed to get workers metrics") + return + } + + for _, sample := range workersMetrics { + if workload, exists := s.workloads[sample.WorkloadName]; exists { + workload.AddSample(sample) + } + } +} + +func (s *Autoscaler) processWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("processing workloads") + + for _, workload := range s.workloads { + 
recommendations := map[string]*tfv1.Resources{} + for _, recommender := range s.recommenders { + name := recommender.Name() + recommendation, err := recommender.Recommend(ctx, workload) + if err != nil { + log.Error(err, "failed to recommend resources", "recommender", name) + continue + } + if recommendation == nil { + continue + } + recommendations[name] = recommendation + log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendation) + } + + finalRecommendation := mergeRecommendations(recommendations) + if finalRecommendation.IsZero() { + continue + } + log.Info("final recommendation", "workload", workload.Name, "resources", finalRecommendation) + + if err := s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation); err != nil { + log.Error(err, "failed to apply recommendation", "workload", workload.Name, "recommendation", finalRecommendation) + } + } +} + +func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { + w, exists := s.workloads[name] + if !exists { + w = workload.NewWorkloadState(name) + s.workloads[name] = w + } + return w +} + +func mergeRecommendations(recommendations map[string]*tfv1.Resources) *tfv1.Resources { + result := &tfv1.Resources{} + for _, rec := range recommendations { + if result.Requests.Tflops.Cmp(rec.Requests.Tflops) < 0 { + result.Requests.Tflops = rec.Requests.Tflops + result.Limits.Tflops = rec.Limits.Tflops + } + if result.Requests.Vram.Cmp(rec.Requests.Vram) < 0 { + result.Requests.Vram = rec.Requests.Vram + result.Limits.Vram = rec.Limits.Vram + } + } + return result +} + +// Start after manager started +func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { + autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) + if err != nil { + return fmt.Errorf("failed to create auto scaler: %v", err) + } + return mgr.Add(autoScaler) +} diff --git a/internal/autoscaler/autoscaler_suite_test.go b/internal/autoscaler/autoscaler_suite_test.go new file mode 100644 index 00000000..6eb9d869 --- /dev/null +++ b/internal/autoscaler/autoscaler_suite_test.go @@ -0,0 +1,585 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "context" + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "testing" + "time" + + ctrl "sigs.k8s.io/controller-runtime" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + corev1 "k8s.io/api/core/v1" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/config" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/controller" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" + "github.com/NexusGPU/tensor-fusion/internal/utils" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + // +kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var cfg *rest.Config +var k8sClient client.Client +var testEnv *envtest.Environment +var ctx context.Context +var cancel context.CancelFunc +var allocator *gpuallocator.GpuAllocator +var metricsRecorder *metrics.MetricsRecorder + +func TestControllers(t *testing.T) { + RegisterFailHandler(Fail) + + if os.Getenv("DEBUG_MODE") == constants.TrueStringValue { + SetDefaultEventuallyTimeout(10 * time.Minute) + } else { + SetDefaultEventuallyTimeout(7 * time.Second) + } + SetDefaultEventuallyPollingInterval(200 * time.Millisecond) + SetDefaultConsistentlyDuration(5 * time.Second) + SetDefaultConsistentlyPollingInterval(250 * time.Millisecond) + RunSpecs(t, "Controller Suite") +} + +var _ = BeforeSuite(func() { + // Expect(os.Setenv("USE_EXISTING_CLUSTER", "true")).Should(Succeed()) + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + + // The BinaryAssetsDirectory is only required if you want to run the tests directly + // without call the makefile target test. If not informed it will look for the + // default path defined in controller-runtime which is /usr/local/kubebuilder/. + // Note that you must have the required binaries setup under the bin directory to perform + // the tests directly. When we run make test it will be setup and used automatically. + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + } + + var err error + // cfg is defined in this file globally. 
+ cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = tfv1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + err = corev1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + Expect(k8sClient.Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: utils.CurrentNamespace(), + }, + })).NotTo(HaveOccurred()) + + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, + }) + + Expect(err).ToNot(HaveOccurred()) + + metricsRecorder = &metrics.MetricsRecorder{ + MetricsOutputPath: "./metrics.log", + HourlyUnitPriceMap: map[string]float64{ + "A100": 10, + }, + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + + allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) + _, err = allocator.SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + _ = portAllocator.SetupWithManager(ctx, mgr) + + err = (&controller.TensorFusionClusterReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: metricsRecorder, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUPoolReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUPool"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUNodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUNodeClassReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.SchedulingConfigTemplateReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.PodReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + PortAllocator: portAllocator, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.NodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("Node"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.WorkloadProfileReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.TensorFusionConnectionReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionConnection"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.TensorFusionWorkloadReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), + 
PortAllocator: portAllocator, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).ToNot(HaveOccurred(), "failed to run manager") + }() + +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + allocator.Stop() + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) + // Expect(os.Unsetenv("USE_EXISTING_CLUSTER")).To(Succeed()) +}) + +type TensorFusionEnv struct { + clusterKey client.ObjectKey + poolCount int + poolNodeMap map[int]map[int]int +} + +func (c *TensorFusionEnv) GetCluster() *tfv1.TensorFusionCluster { + GinkgoHelper() + tfc := &tfv1.TensorFusionCluster{} + Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) + return tfc +} + +func (c *TensorFusionEnv) UpdateCluster(tfc *tfv1.TensorFusionCluster) { + GinkgoHelper() + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest := &tfv1.TensorFusionCluster{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tfc), latest); err != nil { + return err + } + latest.Spec = tfc.Spec + return k8sClient.Update(ctx, latest) + }) + Expect(err).Should(Succeed()) +} + +func (c *TensorFusionEnv) Cleanup() { + GinkgoHelper() + for poolIndex, nodeGpuMap := range c.poolNodeMap { + for nodeIndex := range nodeGpuMap { + c.DeleteGPUNode(poolIndex, nodeIndex) + } + } + + tfc := c.GetCluster() + tfcCopy := tfc.DeepCopy() + tfcCopy.Spec.GPUPools = []tfv1.GPUPoolDefinition{} + c.UpdateCluster(tfcCopy) + + for poolIndex := range c.poolNodeMap { + Eventually(func(g Gomega) { + pool := &tfv1.GPUPool{} + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(HaveOccurred()) + }).Should(Succeed()) + delete(c.poolNodeMap, poolIndex) + c.poolCount-- + } + + Expect(k8sClient.Delete(ctx, tfc)).Should(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, c.clusterKey, tfc) + g.Expect(err).Should(HaveOccurred()) + }).Should(Succeed()) +} + +func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { + GinkgoHelper() + poolList := &tfv1.GPUPoolList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, poolList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: c.clusterKey.Name, + }))).Should(Succeed()) + g.Expect(poolList.Items).Should(HaveLen(c.poolCount)) + }).Should(Succeed()) + return poolList +} + +func (c *TensorFusionEnv) GetGPUPool(poolIndex int) *tfv1.GPUPool { + GinkgoHelper() + pool := &tfv1.GPUPool{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(Succeed()) + }).Should(Succeed()) + return pool +} + +func (c *TensorFusionEnv) GetGPUNodeList(poolIndex int) *tfv1.GPUNodeList { + GinkgoHelper() + nodeList := &tfv1.GPUNodeList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, nodeList, client.MatchingLabels(map[string]string{ + fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", + }))).Should(Succeed()) + g.Expect(nodeList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) + }).Should(Succeed()) + return nodeList +} + +func (c *TensorFusionEnv) GetGPUNode(poolIndex int, nodeIndex int) *tfv1.GPUNode { + GinkgoHelper() + node := &tfv1.GPUNode{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(Succeed()) + }).Should(Succeed()) + return node +} + +func (c *TensorFusionEnv) DeleteGPUNode(poolIndex int, 
nodeIndex int) {
+	GinkgoHelper()
+	c.DeleteNodeGpuList(poolIndex, nodeIndex)
+	node := c.GetGPUNode(poolIndex, nodeIndex)
+	Expect(k8sClient.Delete(ctx, node)).Should(Succeed())
+	Eventually(func(g Gomega) {
+		g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(HaveOccurred())
+	}).Should(Succeed())
+	delete(c.poolNodeMap[poolIndex], nodeIndex)
+}
+
+func (c *TensorFusionEnv) GetNodeGpuList(poolIndex int, nodeIndex int) *tfv1.GPUList {
+	GinkgoHelper()
+	gpuList := &tfv1.GPUList{}
+	Eventually(func(g Gomega) {
+		g.Expect(k8sClient.List(ctx, gpuList, client.MatchingLabels(map[string]string{
+			constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex),
+		}))).Should(Succeed())
+		g.Expect(gpuList.Items).Should(HaveLen(c.poolNodeMap[poolIndex][nodeIndex]))
+	}).Should(Succeed())
+	return gpuList
+}
+
+func (c *TensorFusionEnv) DeleteNodeGpuList(poolIndex int, nodeIndex int) {
+	GinkgoHelper()
+	Expect(k8sClient.DeleteAllOf(ctx, &tfv1.GPU{},
+		client.MatchingLabels{constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex)},
+	)).Should(Succeed())
+}
+
+func (c *TensorFusionEnv) GetPoolGpuList(poolIndex int) *tfv1.GPUList {
+	GinkgoHelper()
+	gpuList := &tfv1.GPUList{}
+	poolGpuCount := 0
+	for _, gpuCount := range c.poolNodeMap[poolIndex] {
+		poolGpuCount += gpuCount
+	}
+	Eventually(func(g Gomega) {
+		g.Expect(k8sClient.List(ctx, gpuList, client.MatchingLabels(map[string]string{
+			constants.GpuPoolKey: c.getPoolName(poolIndex),
+		}))).Should(Succeed())
+		g.Expect(gpuList.Items).Should(HaveLen(poolGpuCount))
+	}).Should(Succeed())
+	return gpuList
+}
+
+// https://book.kubebuilder.io/reference/envtest#testing-considerations
+// Unless you’re using an existing cluster, keep in mind that no built-in controllers are running in the test context.
+// So checkStatusAndUpdateVirtualCapacity in gpunode_controller.go always sees the pod status as Pending, and the GPUNode status can't change to Running.
+// When using an existing cluster the tests run a lot faster, so this may change later.
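+// That is why UpdateHypervisorStatus below patches the hypervisor pods to Running/Ready manually;
+// when USE_EXISTING_CLUSTER=true (see the commented-out Setenv in BeforeSuite) this manual patching is skipped.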
+func (c *TensorFusionEnv) UpdateHypervisorStatus() { + GinkgoHelper() + if os.Getenv("USE_EXISTING_CLUSTER") != "true" { + for poolIndex := range c.poolNodeMap { + podList := &corev1.PodList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(utils.CurrentNamespace()), + client.MatchingLabels(map[string]string{ + fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", + }), + )).Should(Succeed()) + g.Expect(podList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) + }).Should(Succeed()) + for _, pod := range podList.Items { + pod.Status.Phase = corev1.PodRunning + pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{Type: corev1.PodReady, Status: corev1.ConditionTrue}) + Expect(k8sClient.Status().Update(ctx, &pod)).Should(Succeed()) + } + } + } +} + +func (c *TensorFusionEnv) getPoolName(poolIndex int) string { + return fmt.Sprintf("%s-pool-%d", c.clusterKey.Name, poolIndex) +} + +func (c *TensorFusionEnv) getNodeName(poolIndex int, nodeIndex int) string { + return fmt.Sprintf("%s-pool-%d-node-%d", c.clusterKey.Name, poolIndex, nodeIndex) +} + +func (c *TensorFusionEnv) getGPUName(poolIndex int, nodeIndex int, gpuIndex int) string { + return fmt.Sprintf("%s-pool-%d-node-%d-gpu-%d", c.clusterKey.Name, poolIndex, nodeIndex, gpuIndex) +} + +func (c *TensorFusionEnv) GetConfig() *rest.Config { + return cfg +} + +type TensorFusionEnvBuilder struct { + *TensorFusionEnv +} + +func NewTensorFusionEnvBuilder() *TensorFusionEnvBuilder { + return &TensorFusionEnvBuilder{ + &TensorFusionEnv{ + poolCount: 0, + clusterKey: client.ObjectKey{}, + poolNodeMap: map[int]map[int]int{}, + }, + } +} + +func (b *TensorFusionEnvBuilder) AddPoolWithNodeCount(nodeCount int) *TensorFusionEnvBuilder { + nodeGpuMap := make(map[int]int, nodeCount) + for i := range nodeCount { + nodeGpuMap[i] = 0 + } + b.poolNodeMap[b.poolCount] = nodeGpuMap + b.poolCount++ + return b +} + +func (b *TensorFusionEnvBuilder) SetGpuCountPerNode(gpuCount int) *TensorFusionEnvBuilder { + poolIndex := b.poolCount - 1 + for nodeIndex := range b.poolNodeMap[poolIndex] { + b.poolNodeMap[poolIndex][nodeIndex] = gpuCount + } + return b +} + +func (b *TensorFusionEnvBuilder) SetGpuCountForNode(nodeIndex int, gpuCount int) *TensorFusionEnvBuilder { + poolIndex := b.poolCount - 1 + b.poolNodeMap[poolIndex][nodeIndex] = gpuCount + return b +} + +var testEnvId int = 0 + +func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { + GinkgoHelper() + b.clusterKey = client.ObjectKey{ + Name: fmt.Sprintf("cluster-%d", testEnvId), + Namespace: "default", + } + testEnvId++ + + // generate cluster + tfc := &tfv1.TensorFusionCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: b.clusterKey.Name, + Namespace: b.clusterKey.Namespace, + }, + Spec: tfv1.TensorFusionClusterSpec{ + GPUPools: []tfv1.GPUPoolDefinition{ + { + Name: fmt.Sprintf("pool-%d", b.poolCount), + SpecTemplate: *config.MockGPUPoolSpec, + }, + }, + }, + } + + // construct pools + gpuPools := make([]tfv1.GPUPoolDefinition, b.poolCount) + for i := range b.poolCount { + poolSpec := config.MockGPUPoolSpec.DeepCopy() + poolSpec.NodeManagerConfig.NodeSelector.NodeSelectorTerms[0].MatchExpressions[0].Key = + fmt.Sprintf("%s-label-%d", tfc.Name, i) + gpuPools[i] = tfv1.GPUPoolDefinition{ + Name: fmt.Sprintf("pool-%d", i), + SpecTemplate: *poolSpec, + } + } + + tfc.Spec.GPUPools = gpuPools + Expect(k8sClient.Create(ctx, tfc)).To(Succeed()) + + // wait for pools are created + Eventually(func(g Gomega) { + 
gpuPoolList := &tfv1.GPUPoolList{} + g.Expect(k8sClient.List(ctx, gpuPoolList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: tfc.Name, + }))).Should(Succeed()) + g.Expect(gpuPoolList.Items).Should(HaveLen(b.poolCount)) + }).Should(Succeed()) + + // generate nodes + selectors := strings.Split(constants.InitialGPUNodeSelector, "=") + for poolIndex := range b.poolCount { + nodeCount := len(b.poolNodeMap[poolIndex]) + for nodeIndex := range nodeCount { + coreNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: b.getNodeName(poolIndex, nodeIndex), + Labels: map[string]string{ + selectors[0]: selectors[1], + fmt.Sprintf("%s-label-%d", tfc.Name, poolIndex): "true", + }, + }, + } + Expect(k8sClient.Create(ctx, coreNode)).To(Succeed()) + + // generate gpus for gpunode + gpuNode := b.GetGPUNode(poolIndex, nodeIndex) + if gpuCount := b.poolNodeMap[poolIndex][nodeIndex]; gpuCount > 0 { + for gpuIndex := range gpuCount { + key := client.ObjectKey{ + Name: b.getGPUName(poolIndex, nodeIndex, gpuIndex), + } + gpu := &tfv1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Labels: map[string]string{ + constants.LabelKeyOwner: gpuNode.Name, + constants.GpuPoolKey: b.getPoolName(poolIndex), + }, + }, + } + Expect(controllerutil.SetControllerReference(gpuNode, gpu, scheme.Scheme)).To(Succeed()) + Expect(k8sClient.Create(ctx, gpu)).To(Succeed()) + patch := client.MergeFrom(gpu.DeepCopy()) + gpu.Status = tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + UUID: key.Name, + GPUModel: "mock", + NodeSelector: map[string]string{ + "kubernetes.io/hostname": b.getNodeName(poolIndex, nodeIndex), + }, + Capacity: &tfv1.Resource{ + Tflops: resource.MustParse("2000"), + Vram: resource.MustParse("2000Gi"), + }, + Available: &tfv1.Resource{ + Tflops: resource.MustParse("2000"), + Vram: resource.MustParse("2000Gi"), + }, + Message: "mock message", + } + Expect(k8sClient.Status().Patch(ctx, gpu, patch)).To(Succeed()) + } + } + } + + b.GetPoolGpuList(poolIndex) + } + + b.UpdateHypervisorStatus() + + return b.TensorFusionEnv +} diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go new file mode 100644 index 00000000..49299e63 --- /dev/null +++ b/internal/autoscaler/autoscaler_test.go @@ -0,0 +1,668 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "context" + "fmt" + "strings" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" + "github.com/aws/smithy-go/ptr" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("Autoscaler", func() { + Context("when creating an autoscaler", func() { + It("should return an error if there is no client", func() { + as, err := NewAutoscaler(nil, nil) + Expect(as).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("must specify client")) + }) + + It("should return an error if there is no allocator", func() { + as, err := NewAutoscaler(k8sClient, nil) + Expect(as).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("must specify allocator")) + }) + }) + + Context("when loading history metrics", func() { + It("should create the state of workloads and workers based on historical metrics", func() { + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.metricsProvider = &FakeMetricsProvider{} + scaler.loadHistoryMetrics(ctx) + metrics, _ := scaler.metricsProvider.GetHistoryMetrics() + for _, m := range metrics { + Expect(scaler.workloads).To(HaveKey(m.WorkloadName)) + Expect(scaler.workloads[m.WorkloadName].Workers).To(HaveKey(m.WorkerName)) + } + }) + }) + + Context("when loading workloads", func() { + It("should keep the state of workloads", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(3). + Build() + defer tfEnv.Cleanup() + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).To(BeEmpty()) + + // create two workloads + pool := tfEnv.GetGPUPool(0) + // with two replias + workload0 := createWorkload(pool, 0, 2) + workload0Workers := getWorkers(workload0) + // with one replia + workload1 := createWorkload(pool, 1, 1) + workload1Workers := getWorkers(workload1) + + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).To(HaveLen(2)) + Expect(scaler.workloads).To(HaveKey(workload0.Name)) + Expect(scaler.workloads).To(HaveKey(workload1.Name)) + workers := scaler.workloads[workload0.Name].Workers + Expect(workers).To(HaveLen(2)) + Expect(workers).To(HaveKey(workload0Workers[0].Name)) + Expect(workers).To(HaveKey(workload0Workers[1].Name)) + Expect(scaler.workloads[workload1.Name].Workers).To(HaveKey(workload1Workers[0].Name)) + + updateWorkloadReplicas(workload0, 1) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads[workload0.Name].Workers).To(HaveLen(1)) + + deleteWorkload(workload0) + deleteWorkload(workload1) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).NotTo(HaveKey(workload0.Name)) + Expect(scaler.workloads).NotTo(HaveKey(workload1.Name)) + }) + }) + + Context("when loading real time metrics", func() { + It("should update the state of workloads and workers", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + pool := tfEnv.GetGPUPool(0) + workload := createWorkload(pool, 0, 1) + workers := getWorkers(workload) + defer deleteWorkload(workload) + + worker := workers[0].Name + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + ws := scaler.workloads[workload.Name] + now := time.Now() + usage := &metrics.WorkerUsage{ + WorkloadName: workload.Name, + WorkerName: worker, + TflopsUsage: 12.0, + VramUsage: 9000, + Timestamp: now, + } + + scaler.metricsProvider = &FakeMetricsProvider{[]*metrics.WorkerUsage{usage}} + scaler.loadRealTimeMetrics(ctx) + + scalerWorkers := scaler.workloads[workload.Name].Workers + Expect(scalerWorkers[worker].LastTflopsSampleTime).To(Equal(usage.Timestamp)) + Expect(ws.WorkerUsageAggregator.TflopsHistogram.IsEmpty()).To(BeFalse()) + Expect(scalerWorkers[worker].VramPeak).To(Equal(usage.VramUsage)) + Expect(scalerWorkers[worker].LastVramSampleTime).To(Equal(usage.Timestamp)) + Expect(ws.WorkerUsageAggregator.VramHistogram.IsEmpty()).To(BeFalse()) + }) + }) + + Context("when processing workloads", func() { + It("should scale up when the recommended resources exceed the current allocation", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&rec)).To(BeTrue()) + }).Should(Succeed()) + + // Upon reprocessing the workload, it should skip resource updates + scaler.processWorkloads(ctx) + Consistently(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&rec)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should update resources based on auto scaling config", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + workloadState := scaler.workloads[workload.Name] + oldRes := workloadState.Spec.Resources + + // verify IsAutoScalingEnabled + workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = false + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&oldRes)).To(BeTrue()) + }).Should(Succeed()) + + // verify IsTargetResource + workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = true + workloadState.Spec.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" + scaler.processWorkloads(ctx) + expect := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("16Gi"), + }, + } + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&expect)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should not update resources if recommended resources exceeded quota", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("9999"), + Vram: resource.MustParse("9999Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("9999"), + Vram: resource.MustParse("9999Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + workloadState := scaler.workloads[workload.Name] + oldRes := workloadState.Spec.Resources + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&oldRes)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should update resources based on cron scaling rule", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + workloadState := scaler.workloads[workload.Name] + + resourcesInRule := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: resourcesInRule, + }, + } + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&resourcesInRule)).To(BeTrue()) + }).Should(Succeed()) + + // invalidate the rule by updating start and end fields + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: resourcesInRule, + }, + } + + scaler.processWorkloads(ctx) + originalResources := workloadState.Spec.Resources + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&originalResources)).To(BeTrue()) + }).Should(Succeed()) + + // should not change after cron scaling finish + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&originalResources)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should merge recomendations based on a larger request value", func() { + recommendations := map[string]*tfv1.Resources{ + "rec1": { + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("15Gi"), + }, + }, + "rec2": { + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("20Gi"), + }, + }, + } + + final := mergeRecommendations(recommendations) + Expect(final.Equal(&tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("20Gi"), + }, + })).To(BeTrue()) + }) + + It("should not update resource if resource is zero", func() { + + }) + }) +}) + +func createWorkload(pool *tfv1.GPUPool, id int, replicas int) *tfv1.TensorFusionWorkload { + GinkgoHelper() + tflopsRequests := resource.MustParse("10") + vramRequests := resource.MustParse("8Gi") + tflopsLimits := resource.MustParse("20") + vramLimits := resource.MustParse("16Gi") + + poolName := pool.Name + key := client.ObjectKey{Namespace: "default", Name: getWorkloadName(id)} + workload := &tfv1.TensorFusionWorkload{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: map[string]string{ + constants.GpuPoolKey: poolName, + }, + }, + Spec: tfv1.WorkloadProfileSpec{ + Replicas: ptr.Int32(int32(replicas)), + PoolName: poolName, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequests, + Vram: vramRequests, + }, + 
Limits: tfv1.Resource{ + Tflops: tflopsLimits, + Vram: vramLimits, + }, + }, + Qos: constants.QoSLevelMedium, + AutoScalingConfig: tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + Enable: true, + TargetResource: "all", + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, workload)).To(Succeed()) + + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + }).Should(Succeed()) + + checkWorkerPodCount(workload) + + return workload +} + +func checkWorkerPodCount(workload *tfv1.TensorFusionWorkload) { + GinkgoHelper() + podList := &corev1.PodList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) + g.Expect(podList.Items).Should(HaveLen(int(*workload.Spec.Replicas))) + }).Should(Succeed()) +} + +func getWorkloadName(index int) string { + return fmt.Sprintf("workload-%d", index) +} + +func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { + GinkgoHelper() + podList := &corev1.PodList{} + Expect(k8sClient.List(ctx, podList, + client.InNamespace("default"), + client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) + return lo.Map(podList.Items, func(pod corev1.Pod, _ int) *corev1.Pod { + return &pod + }) +} + +type FakeAllocator struct{} + +type FakeMetricsProvider struct { + Metrics []*metrics.WorkerUsage +} + +func (f *FakeMetricsProvider) GetWorkersMetrics() ([]*metrics.WorkerUsage, error) { + return f.Metrics, nil +} + +func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*metrics.WorkerUsage, error) { + sample := []*metrics.WorkerUsage{} + startTime := time.Now().Add(-8 * 24 * time.Hour) + for day := 0; day < 8; day++ { + for hour := 0; hour < 1; hour++ { + for minute := 0; minute < 60; minute++ { + // idx := day*24 + hour + sample = append(sample, &metrics.WorkerUsage{ + WorkloadName: "workload-0", + WorkerName: fmt.Sprintf("worker-%d", 1), + TflopsUsage: 100.0, + VramUsage: 1 * 1000 * 1000 * 1000, + Timestamp: startTime.Add(time.Duration(day*24+hour)*time.Hour + time.Duration(minute)*time.Minute), + }) + } + } + } + + return sample, nil +} + +type FakeRecommender struct { + *tfv1.Resources +} + +func (f *FakeRecommender) Name() string { + return "Fake" +} + +func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*tfv1.Resources, error) { + return f.Resources, nil +} + +func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { + GinkgoHelper() + key := client.ObjectKeyFromObject(workload) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + workload.Spec.Replicas = ptr.Int32(int32(replicas)) + g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) + }).Should(Succeed()) + + checkWorkerPodCount(workload) +} + +func deleteWorkload(workload *tfv1.TensorFusionWorkload) { + cleanupWorkload(client.ObjectKeyFromObject(workload)) +} + +func cleanupWorkload(key client.ObjectKey) { + GinkgoHelper() + workload := &tfv1.TensorFusionWorkload{} + + if err := k8sClient.Get(ctx, key, workload); err != nil { + if errors.IsNotFound(err) { + return + } + Expect(err).To(HaveOccurred()) + } + + // Set replicas to 0 + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + workload.Spec.Replicas = ptr.Int32(0) + g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) + }).Should(Succeed()) + + Eventually(func(g Gomega) { + podList := 
&corev1.PodList{} + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(key.Namespace), + client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) + g.Expect(podList.Items).Should(BeEmpty()) + }).Should(Succeed()) + + Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, key, workload) + g.Expect(err).Should(HaveOccurred()) + }).Should(Succeed()) +} + +func mockSchedulerLoop(ctx context.Context, cfg *rest.Config) { + ticker := time.NewTicker(50 * time.Millisecond) + clientset, err := kubernetes.NewForConfig(cfg) + if err != nil { + Expect(err).To(Succeed()) + } + for range ticker.C { + select { + case <-ctx.Done(): + return + default: + podList := &corev1.PodList{} + _ = k8sClient.List(ctx, podList) + for _, pod := range podList.Items { + if pod.Spec.NodeName != "" { + continue + } + go scheduleAndStartPod(&pod, clientset) + } + } + } +} + +func scheduleAndStartPod(pod *corev1.Pod, clientset *kubernetes.Clientset) { + // simulate scheduling cycle Filter and Reserve + allocRequest, _, err := allocator.ComposeAllocationRequest(pod) + if errors.IsNotFound(err) { + return + } + Expect(err).To(Succeed()) + gpus, err := allocator.Alloc(&allocRequest) + if err != nil { + // some test cases are expected to fail, just continue + return + } + Expect(gpus).To(HaveLen(int(allocRequest.Count))) + allocator.SyncGPUsToK8s() + + // update pod annotation + Eventually(func(g Gomega) { + latestPod := &corev1.Pod{} + err := k8sClient.Get(ctx, types.NamespacedName{ + Name: pod.Name, + Namespace: pod.Namespace, + }, latestPod) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + + if latestPod.Annotations == nil { + latestPod.Annotations = map[string]string{} + } + latestPod.Annotations[constants.GpuKey] = strings.Join( + lo.Map(gpus, func(gpu *tfv1.GPU, _ int) string { + return gpu.Name + }), ",") + err = k8sClient.Status().Update(ctx, latestPod) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + + // update pod node name + latestPod.Spec.NodeName = gpus[0].Status.NodeSelector[constants.KubernetesHostNameLabel] + + // simulate k8s scheduler binding cycle Bind function + binding := &corev1.Binding{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + Target: corev1.ObjectReference{ + Kind: "Node", + Name: latestPod.Spec.NodeName, + }, + } + + err = clientset.CoreV1().Pods(latestPod.Namespace).Bind(ctx, binding, metav1.CreateOptions{}) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + }).Should(Succeed()) + + // simulate kubelet start the pod successfully + patchPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + } + patchPod.Status.Phase = corev1.PodRunning + patchPod.Status.Conditions = append(patchPod.Status.Conditions, corev1.PodCondition{ + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }) + err = k8sClient.Status().Patch(ctx, patchPod, client.MergeFrom(&corev1.Pod{})) + if errors.IsNotFound(err) { + return + } + Expect(err).To(Succeed()) +} diff --git a/internal/autoscaler/metrics/metrics_aggregator.go b/internal/autoscaler/metrics/metrics_aggregator.go new file mode 100644 index 00000000..5ffe51d9 --- /dev/null +++ b/internal/autoscaler/metrics/metrics_aggregator.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "time" + + vpa "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/util" +) + 
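+// The aggregator below mirrors the VPA recommender's approach: every usage
+// sample is folded into an exponentially-bucketed, exponentially-decaying
+// histogram, so recent samples dominate the percentile estimates while old
+// ones fade out with the configured half-life. A rough, illustrative sketch of
+// how a caller is expected to feed it (the sample fields are the ones defined
+// in this package; the concrete numbers are placeholders):
+//
+//	agg := NewWorkerUsageAggregator()
+//	agg.AddTflopsSample(&WorkerUsage{TflopsUsage: 42, Timestamp: time.Now()})
+//	agg.AddVramSample(&WorkerUsage{VramUsage: 8 << 30, Timestamp: time.Now()})
+//	p95 := agg.TflopsHistogram.Percentile(0.95) // later consumed by the estimators
+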
+const ( + // minSampleWeight is the minimal weight of any sample (prior to including decaying factor) + minSampleWeight = 0.1 + // epsilon is the minimal weight kept in histograms, it should be small enough that old samples + // (just inside AggregationWindowLength) added with minSampleWeight are still kept + epsilon = 0.001 * minSampleWeight + // DefaultAggregationInterval is the default value for AggregationInterval. + DefaultAggregationInterval = time.Hour * 24 + // DefaultHistogramBucketSizeGrowth is the default value for HistogramBucketSizeGrowth. + DefaultHistogramBucketSizeGrowth = 0.05 // Make each bucket 5% larger than the previous one. + // DefaultHistogramDecayHalfLife is the default value for HistogramDecayHalfLife. + DefaultHistogramDecayHalfLife = time.Hour * 24 +) + +type WorkerUsageAggregator struct { + TflopsHistogram vpa.Histogram + VramHistogram vpa.Histogram + FirstSampleStart time.Time + LastSampleStart time.Time + TotalSamplesCount int +} + +func NewWorkerUsageAggregator() *WorkerUsageAggregator { + return &WorkerUsageAggregator{ + TflopsHistogram: vpa.NewDecayingHistogram(histogramOptions(10000.0, 0.1), DefaultHistogramDecayHalfLife), + VramHistogram: vpa.NewDecayingHistogram(histogramOptions(1e12, 1e7), DefaultHistogramDecayHalfLife), + } +} + +func (w *WorkerUsageAggregator) IsEmpty() bool { + if w.TflopsHistogram.IsEmpty() && w.VramHistogram.IsEmpty() { + return true + } + return false +} + +func (w *WorkerUsageAggregator) AddTflopsSample(sample *WorkerUsage) bool { + w.TflopsHistogram.AddSample(float64(sample.TflopsUsage), minSampleWeight, sample.Timestamp) + if sample.Timestamp.After(w.LastSampleStart) { + w.LastSampleStart = sample.Timestamp + } + if w.FirstSampleStart.IsZero() || sample.Timestamp.Before(w.FirstSampleStart) { + w.FirstSampleStart = sample.Timestamp + } + w.TotalSamplesCount++ + return true +} + +func (w *WorkerUsageAggregator) AddVramSample(sample *WorkerUsage) bool { + w.VramHistogram.AddSample(float64(sample.VramUsage), 1.0, sample.Timestamp) + return true +} + +func (w *WorkerUsageAggregator) SubtractVramSample(usage float64, time time.Time) bool { + w.VramHistogram.SubtractSample(usage, 1.0, time) + return true +} + +func histogramOptions(maxValue, firstBucketSize float64) vpa.HistogramOptions { + options, err := vpa.NewExponentialHistogramOptions(maxValue, firstBucketSize, 1.+DefaultHistogramBucketSizeGrowth, epsilon) + if err != nil { + panic("Invalid histogram options") // Should not happen. 
+ } + return options +} diff --git a/internal/autoscaler/metrics/metrics_provider.go b/internal/autoscaler/metrics/metrics_provider.go new file mode 100644 index 00000000..e35f6911 --- /dev/null +++ b/internal/autoscaler/metrics/metrics_provider.go @@ -0,0 +1,100 @@ +package metrics + +import ( + "time" + + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "gorm.io/gorm" +) + +type WorkerUsage struct { + WorkloadName string + WorkerName string + TflopsUsage float64 + VramUsage uint64 + Timestamp time.Time +} + +type Provider interface { + GetWorkersMetrics() ([]*WorkerUsage, error) + GetHistoryMetrics() ([]*WorkerUsage, error) +} + +func NewProvider(db *gorm.DB) Provider { + return &greptimeDBProvider{db: db} +} + +type greptimeDBProvider struct { + db *gorm.DB + lastQueryTime time.Time + // historyLength time.Duration + // historyResolution time.Duration +} + +func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerUsage, error) { + data := []*metrics.HypervisorWorkerUsageMetrics{} + now := time.Now() + // actual meaning: max(avg[10s])[1m] + err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts"). + Where("ts > ? and ts <= ?", g.lastQueryTime.Nanosecond(), now.Nanosecond()). + Group("workload, worker"). + Order("ts asc"). + Find(&data). + Error + + if err != nil { + return nil, err + } + + g.lastQueryTime = now + + workersMetrics := make([]*WorkerUsage, 0, len(data)) + for _, row := range data { + workersMetrics = append(workersMetrics, &WorkerUsage{ + WorkloadName: row.WorkloadName, + WorkerName: row.WorkerName, + TflopsUsage: row.ComputeTflops, + VramUsage: row.VRAMBytes, + Timestamp: row.Timestamp, + }) + } + + return workersMetrics, nil +} + +type hypervisorWorkerUsageMetrics struct { + metrics.HypervisorWorkerUsageMetrics + TimeWindow time.Time `gorm:"column:time_window;index:,class:TIME"` +} + +func (g *greptimeDBProvider) GetHistoryMetrics() ([]*WorkerUsage, error) { + data := []*hypervisorWorkerUsageMetrics{} + now := time.Now() + // TODO: replace using iteration for handling large datasets efficiently + // TODO: supply history resolution to config time window + err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window"). + Where("ts > ? and ts <= ?", now.Add(-time.Hour*24).Nanosecond(), now.Nanosecond()). + Group("workload, worker, time_window"). + Order("time_window asc"). + Find(&data). + Error + + if err != nil { + return nil, err + } + + g.lastQueryTime = now + + workersMetrics := make([]*WorkerUsage, 0, len(data)) + for _, row := range data { + workersMetrics = append(workersMetrics, &WorkerUsage{ + WorkloadName: row.WorkloadName, + WorkerName: row.WorkerName, + TflopsUsage: row.ComputeTflops, + VramUsage: row.VRAMBytes, + Timestamp: row.TimeWindow, + }) + } + + return workersMetrics, nil +} diff --git a/internal/autoscaler/metrics/metrics_provider_test.go b/internal/autoscaler/metrics/metrics_provider_test.go new file mode 100644 index 00000000..916c050d --- /dev/null +++ b/internal/autoscaler/metrics/metrics_provider_test.go @@ -0,0 +1,112 @@ +package metrics + +import ( + "regexp" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "gorm.io/driver/mysql" + "gorm.io/gorm" +) + +var _ = Describe("MetricsProvider", func() { + Context("when getting real time workers metrics", func() { + It("should return metrics for every worker", func() { + db, mock := NewMockDB() + now := time.Now() + fakeMetrics := []metrics.HypervisorWorkerUsageMetrics{ + { + WorkloadName: "workload-0", + WorkerName: "worker-0", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + { + WorkloadName: "workload-1", + WorkerName: "worker-1", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + } + + rows := sqlmock.NewRows([]string{"workload", "worker", "compute_tflops", "memory_bytes", "ts"}) + for _, row := range fakeMetrics { + rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.Timestamp) + } + + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts FROM `tf_worker_usage` WHERE ts > ? and ts <= ? GROUP BY workload, worker ORDER BY ts asc")). + WillReturnRows(rows) + provider := &greptimeDBProvider{db: db} + got, _ := provider.GetWorkersMetrics() + Expect(got).To(HaveLen(2)) + Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) + Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) + Expect(got[0].VramUsage).To(Equal(fakeMetrics[0].VRAMBytes)) + Expect(got[0].TflopsUsage).To(Equal(fakeMetrics[0].ComputeTflops)) + Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].Timestamp)) + }) + }) + + Context("when getting history workers metrics", func() { + It("should return metrics based on history length", func() { + db, mock := NewMockDB() + now := time.Now() + fakeMetrics := []hypervisorWorkerUsageMetrics{ + { + HypervisorWorkerUsageMetrics: metrics.HypervisorWorkerUsageMetrics{ + WorkloadName: "workload-0", + WorkerName: "worker-0", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + TimeWindow: now, + }, + { + HypervisorWorkerUsageMetrics: metrics.HypervisorWorkerUsageMetrics{ + WorkloadName: "workload-1", + WorkerName: "worker-1", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + TimeWindow: now, + }, + } + + rows := sqlmock.NewRows([]string{"workload", "worker", "compute_tflops", "memory_bytes", "time_window"}) + for _, row := range fakeMetrics { + rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.TimeWindow) + } + + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window FROM `tf_worker_usage` WHERE ts > ? and ts <= ? GROUP BY workload, worker, time_window ORDER BY time_window asc")). 
+ WillReturnRows(rows) + provider := &greptimeDBProvider{db: db} + got, _ := provider.GetHistoryMetrics() + Expect(got).To(HaveLen(2)) + Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) + Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) + Expect(got[0].VramUsage).To(Equal(fakeMetrics[0].VRAMBytes)) + Expect(got[0].TflopsUsage).To(Equal(fakeMetrics[0].ComputeTflops)) + Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].TimeWindow)) + }) + }) +}) + +func NewMockDB() (*gorm.DB, sqlmock.Sqlmock) { + GinkgoHelper() + db, mock, err := sqlmock.New() + Expect(err).ToNot(HaveOccurred()) + gormDB, err := gorm.Open(mysql.New(mysql.Config{ + Conn: db, + SkipInitializeWithVersion: true, + }), &gorm.Config{}) + Expect(err).ToNot(HaveOccurred()) + + return gormDB, mock +} diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go new file mode 100644 index 00000000..694dc649 --- /dev/null +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -0,0 +1,163 @@ +package recommender + +import ( + "context" + "fmt" + "maps" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/robfig/cron/v3" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// Utilize these annotations to determine if the configuration has changed +const ( + CronScalingTFLOPSRequestAnnotation = constants.Domain + "/cron-scaling-tflops-request" + CronScalingVRAMRequestAnnotation = constants.Domain + "/cron-scaling-vram-request" + CronScalingTFLOPSLimitAnnotation = constants.Domain + "/cron-scaling-tflops-limit" + CronScalingVRAMLimitAnnotation = constants.Domain + "/cron-scaling-vram-limit" +) + +type CronRecommender struct { + client.Client + parser cron.Parser +} + +func NewCronRecommender(c client.Client) *CronRecommender { + return &CronRecommender{ + Client: c, + parser: cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow), + } +} + +func (c *CronRecommender) Name() string { + return "cron" +} + +func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tfv1.Resources, error) { + log := log.FromContext(ctx) + activeRule, err := c.getActiveCronScalingRule(&w.Spec.AutoScalingConfig) + if err != nil { + return nil, fmt.Errorf("failed to get active cron scaling rule %w", err) + } + + curRes, err := cronScalingResourcesFromAnnotations(w.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to get current resources from workload %s: %v", w.Name, err) + } + + var result *tfv1.Resources + if activeRule == nil { + if curRes == nil { + return nil, nil + } + // revert the resources to those specified in the workload spec + result = w.GetResourcesSpec() + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(&tfv1.Resources{})) + log.Info("cron scaling finished", "workload", w.Name, "resources", result) + } else { + result = &activeRule.DesiredResources + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(result)) + log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", result) + } + + if curRes != nil && result.Equal(curRes) { + return nil, nil + } + + return result, nil +} + +func cronScalingResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + 
CronScalingTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + CronScalingTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + CronScalingVRAMRequestAnnotation: resources.Requests.Vram.String(), + CronScalingVRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} + +func cronScalingResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {CronScalingTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {CronScalingTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {CronScalingVRAMRequestAnnotation, &result.Requests.Vram}, + {CronScalingVRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfig) (*tfv1.CronScalingRule, error) { + activeRules := []*tfv1.CronScalingRule{} + + currentTime := time.Now() + + for _, rule := range config.CronScalingRules { + if !rule.Enable || rule.Name == "" || + rule.Start == "" || rule.End == "" { + continue + } + + if rule.Start == rule.End { + return nil, fmt.Errorf("start and end can not same") + } + + startSchedule, err := c.parser.Parse(rule.Start) + if err != nil { + return nil, fmt.Errorf("failed to parse cron rule %s start: %w", rule.Name, err) + } + endSchedule, err := c.parser.Parse(rule.End) + if err != nil { + return nil, fmt.Errorf("failed to parse cron rule %s end: %w", rule.Name, err) + } + + nextStartTime := startSchedule.Next(time.Now()) + nextEndTime := endSchedule.Next(time.Now()) + + isActive := false + if nextStartTime.Before(nextEndTime) { + isActive = currentTime.After(nextStartTime) && currentTime.Before(nextEndTime) + } else { + isActive = currentTime.After(nextStartTime) || currentTime.Before(nextEndTime) + } + + if isActive { + activeRules = append(activeRules, &rule) + } + } + + if len(activeRules) > 1 { + return nil, fmt.Errorf("only one active cron scaling rule is permitted at any given time") + } + + if len(activeRules) == 0 { + return nil, nil + } + + return activeRules[0], nil +} diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go new file mode 100644 index 00000000..5825e309 --- /dev/null +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -0,0 +1,188 @@ +package recommender + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" +) + +var _ = Describe("CronRecommender", func() { + ctx := context.TODO() + res := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, + } + + It("should return recommendation based on the active cron scaling rule", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + }) + + It("should not return recommendation if there is no active cron scaling rule", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) + }) + + It("should not return recommendation if the active cron scaling rule remains unchanged", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + + workload.Annotations = cronScalingResourcesToAnnotations(&res) + + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) + }) + + It("should revert the resources to those specified in the workload spec if the active cron scaling finished", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.Resources = tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("4Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + } + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: res, + }, + }, + } + + workload.Annotations = cronScalingResourcesToAnnotations(&res) + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&workload.Spec.Resources)).To(BeTrue()) + + workload.Annotations = cronScalingResourcesToAnnotations(&tfv1.Resources{}) + recommendation, _ = recommender.Recommend(ctx, workload) + 
Expect(recommendation).To(BeNil()) + }) + + It("should return error if getting multiple active rules", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + }, + } + recommender := NewCronRecommender(nil) + _, err := recommender.Recommend(ctx, workload) + Expect(err).To(HaveOccurred()) + }) + + It("should not return cron scaling rule if no config or disable", func() { + asc := tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{}, + } + Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) + asc = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + {Enable: false}, + }, + } + Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) + }) + + It("should return the active cron scaling rule if the current time falls within its scheduled interval", func() { + asc := tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + }, + } + rule, _ := NewCronRecommender(nil).getActiveCronScalingRule(&asc) + Expect(rule).NotTo(BeNil()) + }) +}) diff --git a/internal/autoscaler/recommender/estimator.go b/internal/autoscaler/recommender/estimator.go new file mode 100644 index 00000000..f1daa06b --- /dev/null +++ b/internal/autoscaler/recommender/estimator.go @@ -0,0 +1,167 @@ +package recommender + +import ( + "math" + "time" + + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "k8s.io/apimachinery/pkg/api/resource" +) + +const ( + // MaxResourceAmount is the maximum allowed value of resource amount. + MaxResourceAmount = ResourceAmount(1e14) +) + +type ResourceAmount int64 + +// ResourceAmountMax returns the larger of two resource amounts. +func ResourceAmountMax(amount1, amount2 ResourceAmount) ResourceAmount { + if amount1 > amount2 { + return amount1 + } + return amount2 +} + +func QuantityFromAmount(amount ResourceAmount) resource.Quantity { + return *resource.NewScaledQuantity(int64(amount), 0) +} + +func resourceAmountFromFloat(amount float64) ResourceAmount { + if amount < 0 { + return ResourceAmount(0) + } else if amount > float64(MaxResourceAmount) { + return MaxResourceAmount + } else { + return ResourceAmount(amount) + } +} + +type VramEstimator interface { + GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount +} + +type percentileVramEstimator struct { + percentile float64 +} + +// NewPercentileVramEstimator returns a new percentileVramEstimator that uses provided percentile. +func NewPercentileVramEstimator(percentile float64) VramEstimator { + return &percentileVramEstimator{percentile} +} + +func (e *percentileVramEstimator) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + return resourceAmountFromFloat(float64(w.VramHistogram.Percentile(e.percentile))) +} + +type vramMarginEstimator struct { + marginFraction float64 + baseEstimator VramEstimator +} + +// WithvramMargin returns a vramEstimator that adds a margin to the base estimator. 
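+// The percentile, margin and confidence-multiplier estimators are meant to be
+// stacked as decorators. An illustrative upper-bound chain (the concrete
+// multiplier, exponent and percentile values are placeholders here, not
+// necessarily the ones the recommender uses) could look like:
+//
+//	WithVramConfidenceMultiplier(1.0, 1.0,
+//		WithVramMargin(0.15, NewPercentileVramEstimator(0.95)),
+//		24*time.Hour)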
+func WithVramMargin(marginFraction float64, baseEstimator VramEstimator) VramEstimator {
+	return &vramMarginEstimator{marginFraction: marginFraction, baseEstimator: baseEstimator}
+}
+
+// GetVramEstimation returns the vram estimation for the given WorkerUsageAggregator.
+func (e *vramMarginEstimator) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	base := e.baseEstimator.GetVramEstimation(w)
+	margin := resourceAmountFromFloat(float64(base) * e.marginFraction)
+	return base + margin
+}
+
+type vramConfidenceMultiplier struct {
+	multiplier         float64
+	exponent           float64
+	baseEstimator      VramEstimator
+	confidenceInterval time.Duration
+}
+
+// WithVramConfidenceMultiplier returns a VramEstimator that scales the base estimate by a confidence-dependent multiplier.
+func WithVramConfidenceMultiplier(multiplier, exponent float64, baseEstimator VramEstimator, confidenceInterval time.Duration) VramEstimator {
+	return &vramConfidenceMultiplier{
+		multiplier:         multiplier,
+		exponent:           exponent,
+		baseEstimator:      baseEstimator,
+		confidenceInterval: confidenceInterval,
+	}
+}
+
+func (e *vramConfidenceMultiplier) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	confidence := getConfidence(w, e.confidenceInterval)
+	base := e.baseEstimator.GetVramEstimation(w)
+	return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent))
+}
+
+type TflopsEstimator interface {
+	GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount
+}
+
+type percentileTflopsEstimator struct {
+	percentile float64
+}
+
+// NewPercentileTflopsEstimator returns a new percentileTflopsEstimator that uses the provided percentile.
+func NewPercentileTflopsEstimator(percentile float64) TflopsEstimator {
+	return &percentileTflopsEstimator{percentile}
+}
+
+func (e *percentileTflopsEstimator) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	return resourceAmountFromFloat(float64(w.TflopsHistogram.Percentile(e.percentile)))
+}
+
+type tflopsMarginEstimator struct {
+	marginFraction float64
+	baseEstimator  TflopsEstimator
+}
+
+// WithTflopsMargin returns a TflopsEstimator that adds a margin to the base estimator.
+func WithTflopsMargin(marginFraction float64, baseEstimator TflopsEstimator) TflopsEstimator {
+	return &tflopsMarginEstimator{marginFraction: marginFraction, baseEstimator: baseEstimator}
+}
+
+// GetTflopsEstimation returns the tflops estimation for the given WorkerUsageAggregator.
+func (e *tflopsMarginEstimator) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	base := e.baseEstimator.GetTflopsEstimation(w)
+	margin := resourceAmountFromFloat(float64(base) * e.marginFraction)
+	return base + margin
+}
+
+type tflopsConfidenceMultiplier struct {
+	multiplier         float64
+	exponent           float64
+	baseEstimator      TflopsEstimator
+	confidenceInterval time.Duration
+}
+
+// WithTflopsConfidenceMultiplier returns a TflopsEstimator that scales the base estimate by a confidence-dependent multiplier.
+func WithTflopsConfidenceMultiplier(multiplier, exponent float64, baseEstimator TflopsEstimator, confidenceInterval time.Duration) TflopsEstimator {
+	return &tflopsConfidenceMultiplier{
+		multiplier:         multiplier,
+		exponent:           exponent,
+		baseEstimator:      baseEstimator,
+		confidenceInterval: confidenceInterval,
+	}
+}
+
+func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	confidence := getConfidence(w, e.confidenceInterval)
+	base := e.baseEstimator.GetTflopsEstimation(w)
+	return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent))
+}
+
+// getConfidence returns a non-negative real number that heuristically measures how much
+// confidence the history aggregated in the WorkerUsageAggregator provides.
+// For a workload producing a steady stream of samples at a rate of 1 sample per minute
+// over N confidence intervals (24h by default), this metric is equal to N.
+// This implementation is a simple heuristic that looks at the total sample count
+// and at the time between the first and the last sample.
+func getConfidence(w *metrics.WorkerUsageAggregator, confidenceInterval time.Duration) float64 {
+	// Distance between the first and the last observed sample time, measured in confidence intervals.
+	lifespanInIntervals := float64(w.LastSampleStart.Sub(w.FirstSampleStart)) / float64(confidenceInterval)
+	// Total count of samples normalized such that it equals the number of confidence intervals
+	// for a frequency of 1 sample/minute.
+	samplesAmount := float64(w.TotalSamplesCount) / confidenceInterval.Minutes()
+	return math.Min(lifespanInIntervals, samplesAmount)
+}
diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go
new file mode 100644
index 00000000..b08113e5
--- /dev/null
+++ b/internal/autoscaler/recommender/percentile_recommender.go
@@ -0,0 +1,224 @@
+package recommender
+
+import (
+	"context"
+	"fmt"
+	"math/big"
+	"strconv"
+	"time"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+const (
+	// Fraction of usage added as the safety margin to the recommended request
+	defaultRequestMarginFraction = 0.15
+	// Vram usage percentile that will be used as a base for the vram target recommendation. Doesn't affect the vram lower or upper bound.
+	defaultTargetVramPercentile = 0.9
+	// Vram usage percentile that will be used for the lower bound on the vram recommendation.
+	defaultLowerBoundVramPercentile = 0.5
+	// Vram usage percentile that will be used for the upper bound on the vram recommendation.
+	defaultUpperBoundVramPercentile = 0.95
+	// Tflops usage percentile that will be used as a base for the tflops target recommendation. Doesn't affect the tflops lower or upper bound.
+	defaultTargetTflopsPercentile = 0.9
+	// Tflops usage percentile that will be used for the lower bound on the tflops recommendation.
+ defaultLowerBoundTflopsPercentile = 0.5 + // Tflops usage percentile that will be used for the upper bound on tflops recommendation. + defaultUpperBoundTflopsPercentile = 0.95 + // The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h + defaultConfidenceInterval = time.Hour * 24 +) + +var defaultPercentileConfig = PercentileConfig{ + TargetTflopsPercentile: defaultTargetTflopsPercentile, + LowerBoundTflopsPercentile: defaultLowerBoundTflopsPercentile, + UpperBoundTflopsPercentile: defaultUpperBoundTflopsPercentile, + TargetVramPercentile: defaultTargetVramPercentile, + LowerBoundVramPercentile: defaultLowerBoundVramPercentile, + UpperBoundVramPercentile: defaultUpperBoundVramPercentile, + RequestMarginFraction: defaultRequestMarginFraction, + ConfidenceInterval: defaultConfidenceInterval, +} + +type RecommendedResources struct { + LowerBoundTflops resource.Quantity + TargetTflops resource.Quantity + UpperBoundTflops resource.Quantity + LowerBoundVram resource.Quantity + TargetVram resource.Quantity + UpperBoundVram resource.Quantity +} + +type PercentileConfig struct { + TargetTflopsPercentile float64 + LowerBoundTflopsPercentile float64 + UpperBoundTflopsPercentile float64 + TargetVramPercentile float64 + LowerBoundVramPercentile float64 + UpperBoundVramPercentile float64 + RequestMarginFraction float64 + ConfidenceInterval time.Duration +} + +type PercentileRecommender struct { + lowerBoundTflops TflopsEstimator + targetTflops TflopsEstimator + upperBoundTflops TflopsEstimator + lowerBoundVram VramEstimator + targetVram VramEstimator + upperBoundVram VramEstimator +} + +func NewPercentileRecommender() *PercentileRecommender { + return &PercentileRecommender{} +} + +func (p *PercentileRecommender) Name() string { + return "percentile" +} + +func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) { + log := log.FromContext(ctx) + aggregator := workload.WorkerUsageAggregator + if aggregator.IsEmpty() { + return nil, nil + } + + curRes, err := workload.GetCurrentResourcesSpec() + if err != nil { + return nil, fmt.Errorf("failed to get current resources from workload %s: %v", workload.Name, err) + } + + // TODO: cache config + p.createEstimatorsFromConfig(p.getPercentileConfig(&workload.Spec.AutoScalingConfig.AutoSetResources)) + rr := &RecommendedResources{ + LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(aggregator)), + TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(aggregator)), + UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(aggregator)), + LowerBoundVram: QuantityFromAmount(p.lowerBoundVram.GetVramEstimation(aggregator)), + TargetVram: QuantityFromAmount(p.targetVram.GetVramEstimation(aggregator)), + UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(aggregator)), + } + + log.Info("recommendation", "workload", workload.Name, "recommender", p.Name(), "resources", rr) + + result := &tfv1.Resources{} + if curRes.Requests.Tflops.Cmp(rr.LowerBoundTflops) < 0 || + curRes.Requests.Tflops.Cmp(rr.UpperBoundTflops) > 0 { + result.Requests.Tflops = rr.TargetTflops + targetLimit := getProportionalLimit(&curRes.Limits.Tflops, &curRes.Requests.Tflops, &rr.TargetTflops) + if targetLimit == nil { + return nil, fmt.Errorf("failed to get tflops limit from workload %s", workload.Name) + } + result.Limits.Tflops = *targetLimit + } + + if curRes.Requests.Vram.Cmp(rr.LowerBoundVram) < 0 || + 
curRes.Requests.Vram.Cmp(rr.UpperBoundVram) > 0 { + result.Requests.Vram = rr.TargetVram + targetLimit := getProportionalLimit(&curRes.Limits.Vram, &curRes.Requests.Vram, &rr.TargetVram) + if targetLimit == nil { + return nil, fmt.Errorf("failed to get vram limit from workload %s", workload.Name) + } + result.Limits.Vram = *targetLimit + } + + if result.Equal(curRes) { + return nil, nil + } + + return result, nil +} + +func (p *PercentileRecommender) getPercentileConfig(asr *tfv1.AutoSetResources) *PercentileConfig { + cfg := defaultPercentileConfig + + if asr == nil { + return &cfg + } + + fields := []struct { + val string + dst *float64 + }{ + {asr.TargetTflopsPercentile, &cfg.TargetTflopsPercentile}, + {asr.LowerBoundTflopsPercentile, &cfg.LowerBoundTflopsPercentile}, + {asr.UpperBoundTflopsPercentile, &cfg.UpperBoundTflopsPercentile}, + {asr.TargetVramPercentile, &cfg.TargetVramPercentile}, + {asr.LowerBoundVramPercentile, &cfg.LowerBoundVramPercentile}, + {asr.UpperBoundVramPercentile, &cfg.UpperBoundVramPercentile}, + {asr.RequestMarginFraction, &cfg.RequestMarginFraction}, + } + for _, f := range fields { + if f.val == "" { + continue + } + if v, err := strconv.ParseFloat(f.val, 64); err == nil { + *f.dst = v + } + } + + if asr.ConfidenceInterval != "" { + if d, err := time.ParseDuration(asr.ConfidenceInterval); err == nil { + cfg.ConfidenceInterval = d + } + } + + return &cfg +} + +func (p *PercentileRecommender) createEstimatorsFromConfig(config *PercentileConfig) { + targetTflops := NewPercentileTflopsEstimator(config.TargetTflopsPercentile) + lowerBoundTflops := NewPercentileTflopsEstimator(config.LowerBoundTflopsPercentile) + upperBoundTflops := NewPercentileTflopsEstimator(config.UpperBoundTflopsPercentile) + + targetTflops = WithTflopsMargin(config.RequestMarginFraction, targetTflops) + lowerBoundTflops = WithTflopsMargin(config.RequestMarginFraction, lowerBoundTflops) + upperBoundTflops = WithTflopsMargin(config.RequestMarginFraction, upperBoundTflops) + + upperBoundTflops = WithTflopsConfidenceMultiplier(1.0, 1.0, upperBoundTflops, config.ConfidenceInterval) + lowerBoundTflops = WithTflopsConfidenceMultiplier(0.001, -2.0, lowerBoundTflops, config.ConfidenceInterval) + + targetVram := NewPercentileVramEstimator(config.TargetVramPercentile) + lowerBoundVram := NewPercentileVramEstimator(config.LowerBoundVramPercentile) + upperBoundVram := NewPercentileVramEstimator(config.UpperBoundVramPercentile) + + targetVram = WithVramMargin(config.RequestMarginFraction, targetVram) + lowerBoundVram = WithVramMargin(config.RequestMarginFraction, lowerBoundVram) + upperBoundVram = WithVramMargin(config.RequestMarginFraction, upperBoundVram) + + upperBoundVram = WithVramConfidenceMultiplier(1.0, 1.0, upperBoundVram, config.ConfidenceInterval) + lowerBoundVram = WithVramConfidenceMultiplier(0.001, -2.0, lowerBoundVram, config.ConfidenceInterval) + + *p = PercentileRecommender{ + lowerBoundTflops: lowerBoundTflops, + targetTflops: targetTflops, + upperBoundTflops: upperBoundTflops, + lowerBoundVram: lowerBoundVram, + targetVram: targetVram, + upperBoundVram: upperBoundVram, + } +} + +func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { + if originalLimit == nil || originalLimit.IsZero() || + originalRequest == nil || originalRequest.IsZero() || + recommendedRequest == nil || recommendedRequest.IsZero() { + return nil + } + + originalValue := big.NewInt(originalLimit.Value()) + scaleBaseValue := 
big.NewInt(originalRequest.Value()) + scaleResultValue := big.NewInt(recommendedRequest.Value()) + var scaledOriginal big.Int + scaledOriginal.Mul(originalValue, scaleResultValue) + scaledOriginal.Div(&scaledOriginal, scaleBaseValue) + if scaledOriginal.IsInt64() { + return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) + } + + return nil +} diff --git a/internal/autoscaler/recommender/percentile_recommender_test.go b/internal/autoscaler/recommender/percentile_recommender_test.go new file mode 100644 index 00000000..fd6fe8a4 --- /dev/null +++ b/internal/autoscaler/recommender/percentile_recommender_test.go @@ -0,0 +1,65 @@ +package recommender + +import ( + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Percentile Recommender", func() { + It("should return default config when no AutoScalingConfig is set", func() { + cfg := NewPercentileRecommender().getPercentileConfig(nil) + Expect(cfg).ToNot(BeNil()) + Expect(*cfg).To(Equal(defaultPercentileConfig)) + }) + + It("should parse float fields from AutoSetResources", func() { + asr := &tfv1.AutoSetResources{ + TargetTflopsPercentile: "0.8", + LowerBoundTflopsPercentile: "0.1", + UpperBoundTflopsPercentile: "0.95", + TargetVramPercentile: "0.7", + LowerBoundVramPercentile: "0.2", + UpperBoundVramPercentile: "0.9", + RequestMarginFraction: "0.15", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) + Expect(cfg.TargetVramPercentile).To(Equal(0.7)) + Expect(cfg.LowerBoundVramPercentile).To(Equal(0.2)) + Expect(cfg.UpperBoundVramPercentile).To(Equal(0.9)) + Expect(cfg.RequestMarginFraction).To(Equal(0.15)) + }) + + It("should ignore invalid float fields and keep defaults", func() { + asr := &tfv1.AutoSetResources{ + TargetTflopsPercentile: "not-a-float", + LowerBoundTflopsPercentile: "", + UpperBoundTflopsPercentile: "0.99", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.TargetTflopsPercentile).To(Equal(defaultPercentileConfig.TargetTflopsPercentile)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(defaultPercentileConfig.LowerBoundTflopsPercentile)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) + }) + + It("should parse ConfidenceInterval if valid", func() { + asr := &tfv1.AutoSetResources{ + ConfidenceInterval: "30m", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) + }) + + It("should ignore invalid ConfidenceInterval and keep default", func() { + asr := &tfv1.AutoSetResources{ + ConfidenceInterval: "not-a-duration", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.ConfidenceInterval).To(Equal(defaultPercentileConfig.ConfidenceInterval)) + }) +}) diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go new file mode 100644 index 00000000..3248ad6c --- /dev/null +++ b/internal/autoscaler/recommender/recommender.go @@ -0,0 +1,19 @@ +package recommender + +import ( + "context" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" +) + +const ( + Percentile = "percentile" + Cron = "cron" +) + +// Interface defines the contract for resource recommendation strategies used by the autoscaler. 
+type Interface interface { + Name() string + Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) +} diff --git a/internal/autoscaler/recommender/recommender_suite_test.go b/internal/autoscaler/recommender/recommender_suite_test.go new file mode 100644 index 00000000..7177cf1d --- /dev/null +++ b/internal/autoscaler/recommender/recommender_suite_test.go @@ -0,0 +1,13 @@ +package recommender_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestRecommender(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Recommender Suite") +} diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go new file mode 100644 index 00000000..bda6768d --- /dev/null +++ b/internal/autoscaler/workload/handler.go @@ -0,0 +1,153 @@ +package workload + +import ( + "context" + "fmt" + "maps" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/utils" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +type Handler interface { + UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) + ApplyRecommendationToWorkload(ctx context.Context, workloadState *State, recommendation *tfv1.Resources) error +} + +type handler struct { + client.Client + allocator *gpuallocator.GpuAllocator +} + +func NewHandler(client client.Client, allocator *gpuallocator.GpuAllocator) Handler { + return &handler{ + Client: client, + allocator: allocator, + } +} + +func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) { + workloadState.Namespace = workload.Namespace + workloadState.Spec = workload.Spec + workloadState.Annotations = workload.Annotations + + workerList := &corev1.PodList{} + if err := h.List(ctx, workerList, + client.InNamespace(workloadState.Namespace), + client.MatchingLabels{constants.WorkloadKey: workloadState.Name}); err != nil { + log.FromContext(ctx).Error(err, "failed to list workers") + return + } + workloadState.updateWorkers(workerList) +} + +func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { + if err := h.updateAutoScalingAnnotations(ctx, workload, recommendation); err != nil { + return fmt.Errorf("failed to update auto scaling annotations: %v", err) + } + + if !workload.IsAutoSetResourcesEnabled() { + return nil + } + + workerList := &corev1.PodList{} + if err := h.List(ctx, workerList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { + return fmt.Errorf("failed to list workers: %v", err) + } + + for _, worker := range workerList.Items { + if !worker.DeletionTimestamp.IsZero() { + continue + } + if err := h.applyRecommendationToWorker(ctx, workload, &worker, recommendation); err != nil { + return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) + } + } + + return nil +} + +func (h *handler) updateAutoScalingAnnotations( + ctx context.Context, + state *State, + recommendation *tfv1.Resources) error { + workload := &tfv1.TensorFusionWorkload{} + if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, workload); err != nil { + return fmt.Errorf("failed to get 
workload: %v", err) + } + + // record current and last resources + if workload.Annotations == nil { + workload.Annotations = map[string]string{} + } + patch := client.MergeFrom(workload.DeepCopy()) + maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) + maps.Copy(workload.Annotations, state.ScalingAnnotations) + if err := h.Patch(ctx, workload, patch); err != nil { + return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err) + } + + state.Annotations = workload.Annotations + return nil +} + +func (h *handler) applyRecommendationToWorker(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { + log := log.FromContext(ctx) + + curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) + if err != nil { + return fmt.Errorf("failed to get current worker resources: %v", err) + } + if curRes != nil && curRes.Equal(recommendation) { + return nil + } + + annotationsToUpdate := utils.CurrentResourcesToAnnotations(recommendation) + if !workload.ShouldScaleResource(tfv1.ResourceTflops) { + delete(annotationsToUpdate, constants.TFLOPSRequestAnnotation) + delete(annotationsToUpdate, constants.TFLOPSLimitAnnotation) + } + if !workload.ShouldScaleResource(tfv1.ResourceVram) { + delete(annotationsToUpdate, constants.VRAMRequestAnnotation) + delete(annotationsToUpdate, constants.VRAMLimitAnnotation) + } + + if len(annotationsToUpdate) <= 0 { + return nil + } + + isScaleUp := false + if _, ok := annotationsToUpdate[constants.TFLOPSRequestAnnotation]; ok { + isScaleUp = recommendation.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0 + } else { + isScaleUp = recommendation.Requests.Vram.Cmp(curRes.Requests.Vram) > 0 + } + + adjustRequest := &tfv1.AdjustRequest{ + PodUID: string(worker.UID), + IsScaleUp: isScaleUp, + NewRequest: recommendation.Requests, + NewLimit: recommendation.Limits, + } + if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { + return fmt.Errorf("failed to adjust allocation: %v", err) + } + log.Info("adjust allocation successfully", "worker", worker.Name, "adjustRequest", adjustRequest) + + patch := client.MergeFrom(worker.DeepCopy()) + maps.Copy(worker.Annotations, annotationsToUpdate) + if err := h.Patch(ctx, worker, patch); err != nil { + return fmt.Errorf("failed to patch worker %s: %v", worker.Name, err) + } + + log.Info("apply recommendation successfully", "worker", worker.Name, "recommendation", recommendation, "currentResources", curRes) + + return nil +} diff --git a/internal/autoscaler/workload/worker.go b/internal/autoscaler/workload/worker.go new file mode 100644 index 00000000..8ad57ec3 --- /dev/null +++ b/internal/autoscaler/workload/worker.go @@ -0,0 +1,74 @@ +package workload + +import ( + "time" + + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" +) + +type WorkerState struct { + Name string + WorkloadName string + LastTflopsSampleTime time.Time + + VramPeak uint64 + LastVramSampleTime time.Time + VramWindowEnd time.Time +} + +func NewWorkerState(name string, workloadName string) *WorkerState { + return &WorkerState{ + Name: name, + WorkloadName: workloadName, + LastTflopsSampleTime: time.Time{}, + LastVramSampleTime: time.Time{}, + VramWindowEnd: time.Time{}, + } +} + +func (w *WorkerState) AddSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool { + w.AddTflopsSample(aggregator, sample) + w.AddVramSample(aggregator, sample) + return true +} + +func (w *WorkerState) AddTflopsSample(aggregator 
*metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool {
+	if sample.Timestamp.Before(w.LastTflopsSampleTime) {
+		return false
+	}
+	aggregator.AddTflopsSample(sample)
+	w.LastTflopsSampleTime = sample.Timestamp
+	return true
+}
+
+func (w *WorkerState) AddVramSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool {
+	ts := sample.Timestamp
+	if ts.Before(w.LastVramSampleTime) {
+		return false
+	}
+	w.LastVramSampleTime = ts
+	if w.VramWindowEnd.IsZero() {
+		w.VramWindowEnd = ts
+	}
+
+	// Only the peak vram usage of each aggregation window is fed into the aggregator:
+	// a higher peak inside the current window replaces the previously recorded one,
+	// while a sample past the window end starts a new window with a fresh peak.
+	addNewPeak := false
+	if ts.Before(w.VramWindowEnd) {
+		if w.VramPeak != 0 && sample.VramUsage > w.VramPeak {
+			aggregator.SubtractVramSample(float64(w.VramPeak), w.VramWindowEnd)
+			addNewPeak = true
+		}
+	} else {
+		aggregationInterval := metrics.DefaultAggregationInterval
+		shift := ts.Sub(w.VramWindowEnd).Truncate(aggregationInterval) + aggregationInterval
+		w.VramWindowEnd = w.VramWindowEnd.Add(shift)
+		w.VramPeak = 0
+		addNewPeak = true
+	}
+
+	if addNewPeak {
+		aggregator.AddVramSample(sample)
+		w.VramPeak = sample.VramUsage
+	}
+
+	return true
+}
diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go
new file mode 100644
index 00000000..4e2063f9
--- /dev/null
+++ b/internal/autoscaler/workload/workload.go
@@ -0,0 +1,90 @@
+package workload
+
+import (
+	"fmt"
+	"strings"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
+	corev1 "k8s.io/api/core/v1"
+)
+
+type State struct {
+	Namespace             string
+	Name                  string
+	Annotations           map[string]string
+	ScalingAnnotations    map[string]string
+	Spec                  tfv1.WorkloadProfileSpec
+	Workers               map[string]*WorkerState
+	WorkerUsageAggregator *metrics.WorkerUsageAggregator
+}
+
+func NewWorkloadState(name string) *State {
+	return &State{
+		Name:                  name,
+		Workers:               make(map[string]*WorkerState),
+		ScalingAnnotations:    make(map[string]string),
+		WorkerUsageAggregator: metrics.NewWorkerUsageAggregator(),
+	}
+}
+
+func (w *State) GetLastResourcesSpec() (*tfv1.Resources, error) {
+	return utils.LastResourcesFromAnnotations(w.Annotations)
+}
+
+func (w *State) GetResourcesSpec() *tfv1.Resources {
+	return &w.Spec.Resources
+}
+
+func (w *State) GetCurrentResourcesSpec() (*tfv1.Resources, error) {
+	resources, err := utils.CurrentResourcesFromAnnotations(w.Annotations)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get resources from annotations: %v", err)
+	}
+	if resources == nil {
+		return &w.Spec.Resources, nil
+	}
+	return resources, nil
+}
+
+func (w *State) SetScalingAnnotation(key string, value string) {
+	w.ScalingAnnotations[key] = value
+}
+
+func (w *State) IsAutoSetResourcesEnabled() bool {
+	return w.Spec.AutoScalingConfig.AutoSetResources.Enable
+}
+
+func (w *State) ShouldScaleResource(name tfv1.ResourceName) bool {
+	target := w.Spec.AutoScalingConfig.AutoSetResources.TargetResource
+	return strings.EqualFold(target, "all") || strings.EqualFold(string(name), target)
+}
+
+func (w *State) updateWorkers(podList *corev1.PodList) {
+	observedWorkers := map[string]bool{}
+	for _, worker := range podList.Items {
+		if !worker.DeletionTimestamp.IsZero() {
+			continue
+		}
+		if _, exists := w.Workers[worker.Name]; !exists {
+			w.Workers[worker.Name] = NewWorkerState(worker.Name, w.Name)
+		}
+		observedWorkers[worker.Name] = true
+	}
+
+	for key, worker := range w.Workers {
+		if worker.WorkloadName == w.Name && !observedWorkers[key] {
+			delete(w.Workers, key)
+		}
+	}
+}
+
+func 
(w *State) AddSample(sample *metrics.WorkerUsage) {
+	worker, exists := w.Workers[sample.WorkerName]
+	if !exists {
+		worker = NewWorkerState(sample.WorkerName, sample.WorkloadName)
+		w.Workers[sample.WorkerName] = worker
+	}
+	worker.AddSample(w.WorkerUsageAggregator, sample)
+}
diff --git a/internal/autoscaler/workload/workload_suite_test.go b/internal/autoscaler/workload/workload_suite_test.go
new file mode 100644
index 00000000..cd3451b6
--- /dev/null
+++ b/internal/autoscaler/workload/workload_suite_test.go
@@ -0,0 +1,13 @@
+package workload_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestWorkload(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Workload Suite")
+}
diff --git a/internal/autoscaler/workload/workload_test.go b/internal/autoscaler/workload/workload_test.go
new file mode 100644
index 00000000..bd18e9f7
--- /dev/null
+++ b/internal/autoscaler/workload/workload_test.go
@@ -0,0 +1,84 @@
+package workload
+
+import (
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"k8s.io/apimachinery/pkg/api/resource"
+)
+
+var _ = Describe("Workload", func() {
+	It("should correctly determine if a resource is the target based on config", func() {
+		ws := NewWorkloadState("test")
+
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeFalse())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeFalse())
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"},
+		}
+
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue())
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"},
+		}
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeFalse())
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"},
+		}
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeFalse())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue())
+	})
+
+	It("should correctly determine if auto set resources is enabled based on config", func() {
+		ws := NewWorkloadState("test")
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{Enable: true},
+		}
+		Expect(ws.IsAutoSetResourcesEnabled()).To(BeTrue())
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{Enable: false},
+		}
+		Expect(ws.IsAutoSetResourcesEnabled()).To(BeFalse())
+	})
+
+	It("should return last resources spec from the annotations", func() {
+		ws := NewWorkloadState("test")
+		expect := tfv1.Resources{
+			Requests: tfv1.Resource{
+				Tflops: resource.MustParse("10"),
+				Vram:   resource.MustParse("8Gi"),
+			},
+			Limits: tfv1.Resource{
+				Tflops: resource.MustParse("20"),
+				Vram:   resource.MustParse("16Gi"),
+			},
+		}
+		ws.Annotations = utils.LastResourcesToAnnotations(&expect)
+		got, _ := ws.GetLastResourcesSpec()
+		Expect(got.Equal(&expect)).To(BeTrue())
+	})
+
+	It("should return current resources spec from the annotations", func() {
+		ws := NewWorkloadState("test")
+		expect := tfv1.Resources{
+			Requests: tfv1.Resource{
+				Tflops: resource.MustParse("10"),
+				Vram:   resource.MustParse("8Gi"),
+			},
+			Limits: tfv1.Resource{
+				Tflops: 
resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, + } + ws.Annotations = utils.CurrentResourcesToAnnotations(&expect) + got, _ := ws.GetCurrentResourcesSpec() + Expect(got.Equal(&expect)) + }) +}) diff --git a/internal/constants/constants.go b/internal/constants/constants.go index dd2810b3..2183f852 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -55,6 +55,10 @@ const ( VRAMRequestAnnotation = Domain + "/vram-request" TFLOPSLimitAnnotation = Domain + "/tflops-limit" VRAMLimitAnnotation = Domain + "/vram-limit" + LastTFLOPSRequestAnnotation = Domain + "/last-tflops-request" + LastVRAMRequestAnnotation = Domain + "/last-vram-request" + LastTFLOPSLimitAnnotation = Domain + "/last-tflops-limit" + LastVRAMLimitAnnotation = Domain + "/last-vram-limit" WorkloadProfileAnnotation = Domain + "/client-profile" InjectContainerAnnotation = Domain + "/inject-container" IsLocalGPUAnnotation = Domain + "/is-local-gpu" @@ -73,9 +77,8 @@ const ( GenPortNumberAnnotation = Domain + "/port-number" TensorFusionWorkerPortNumber = 8000 - AutoScaleLimitsAnnotation = Domain + "/auto-limits" - AutoScaleRequestsAnnotation = Domain + "/auto-requests" - AutoScaleReplicasAnnotation = Domain + "/auto-replicas" + AutoScaleResourcesAnnotation = Domain + "/auto-resources" + AutoScaleReplicasAnnotation = Domain + "/auto-replicas" GpuReleasedAnnotation = Domain + "/gpu-released" diff --git a/internal/utils/resource.go b/internal/utils/resource.go new file mode 100644 index 00000000..855d3ce3 --- /dev/null +++ b/internal/utils/resource.go @@ -0,0 +1,87 @@ +package utils + +import ( + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "k8s.io/apimachinery/pkg/api/resource" +) + +func CurrentResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.TFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.TFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.VRAMRequestAnnotation, &result.Requests.Vram}, + {constants.VRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func LastResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.LastTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.LastTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.LastVRAMRequestAnnotation, &result.Requests.Vram}, + {constants.LastVRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func CurrentResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + constants.TFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + 
constants.TFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + constants.VRAMRequestAnnotation: resources.Requests.Vram.String(), + constants.VRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} + +func LastResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + constants.LastTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + constants.LastTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + constants.LastVRAMRequestAnnotation: resources.Requests.Vram.String(), + constants.LastVRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index 23c9104e..9fb8d6e6 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -134,13 +134,9 @@ func ParseTensorFusionInfo( } func parseAutoScalingAnnotations(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) { - autoLimits, ok := pod.Annotations[constants.AutoScaleLimitsAnnotation] - if ok && autoLimits == constants.TrueStringValue { - workloadProfile.Spec.AutoScalingConfig.AutoSetLimits.Enable = true - } - autoRequests, ok := pod.Annotations[constants.AutoScaleRequestsAnnotation] - if ok && autoRequests == constants.TrueStringValue { - workloadProfile.Spec.AutoScalingConfig.AutoSetRequests.Enable = true + autoResources, ok := pod.Annotations[constants.AutoScaleResourcesAnnotation] + if ok && autoResources == constants.TrueStringValue { + workloadProfile.Spec.AutoScalingConfig.AutoSetResources.Enable = true } autoReplicas, ok := pod.Annotations[constants.AutoScaleReplicasAnnotation] if ok && autoReplicas == constants.TrueStringValue {
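
For context on how the lower/upper bounds produced by the new estimators behave, here is a standalone sketch (not part of the patch). It reuses the formulas from estimator.go and the multiplier/exponent constants wired up in createEstimatorsFromConfig (upper bound 1.0/1.0, lower bound 0.001/-2.0, 24h confidence interval); the `target` value and the one-sample-per-minute rate are made-up inputs for illustration only.

package main

import (
	"fmt"
	"math"
	"time"
)

// confidence mirrors getConfidence in estimator.go: both the observed lifespan and the
// sample count are normalized by the confidence interval, and the smaller value wins.
func confidence(lifespan time.Duration, samples int, interval time.Duration) float64 {
	lifespanInIntervals := float64(lifespan) / float64(interval)
	samplesInIntervals := float64(samples) / interval.Minutes()
	return math.Min(lifespanInIntervals, samplesInIntervals)
}

// bound mirrors the confidence multiplier: base * (1 + multiplier/confidence)^exponent.
func bound(base, multiplier, exponent, conf float64) float64 {
	return base * math.Pow(1.+multiplier/conf, exponent)
}

func main() {
	const target = 100.0       // hypothetical p90 tflops estimate for a worker
	interval := 24 * time.Hour // defaultConfidenceInterval

	for _, hours := range []int{1, 6, 24, 168} {
		lifespan := time.Duration(hours) * time.Hour
		samples := int(lifespan.Minutes()) // assume one sample per minute
		conf := confidence(lifespan, samples, interval)
		upper := bound(target, 1.0, 1.0, conf)    // same constants as createEstimatorsFromConfig
		lower := bound(target, 0.001, -2.0, conf) // lower bound constants
		fmt.Printf("history=%4dh confidence=%6.3f lower=%7.2f upper=%8.2f\n", hours, conf, lower, upper)
	}
}

With roughly a day of per-minute samples the confidence reaches 1.0, so the upper bound sits at about twice the raw percentile while the lower bound is nearly equal to it; after a week of history both bounds converge toward the target, which is what lets Recommend treat current requests outside [lower, upper] as worth adjusting.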