Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
8fd0509
feat: implement the core auto-scaling functionality
knave Jun 19, 2025
fe4c1c7
test: refactor test when processing workloads
knave Jun 20, 2025
f4840f5
feat: implement LeaderElectionRunnable explicitly and add compile-time…
knave Jun 23, 2025
d5c4100
feat: aggregate samples into histogram per tflops
knave Jun 23, 2025
95754a8
feat: implement metrics provider
knave Jun 25, 2025
3ac9191
feat: add allocator logic
knave Jun 27, 2025
5ec5ed8
refactor: optimize update worker method
knave Jun 27, 2025
c3a38b3
feat: add config parsing
knave Jun 28, 2025
c027d52
feat: apply updates to specified target resources
knave Jun 28, 2025
b13bbbd
feat: add auto-scaling switch config parsing and apply, TargetResourc…
knave Jun 29, 2025
f802843
feat: merge AutoSetLimits and AutoSetRequests into AutoSetResources
knave Jul 3, 2025
32d1cf2
feat: implement adjust allocation
knave Jul 4, 2025
6883801
fix: linter issues
knave Jul 4, 2025
8411dae
fix: linter issues
knave Jul 5, 2025
d7eff33
refactor: support multiple recommenders
knave Jul 11, 2025
2c6286c
refactor: code organization
knave Jul 19, 2025
ffafe04
feat: define cron scaler crd
knave Jul 21, 2025
0d9cb03
feat: implement cron scaling
knave Jul 28, 2025
a40c530
feat: implement cron scaling
knave Jul 28, 2025
8f91a61
feat: implement merging recommendations
knave Jul 30, 2025
48e16c4
feat: implement restoring resources upon cron scaling termination
knave Jul 30, 2025
bb2f53f
fix: properly handle the isScaleUp
knave Jul 30, 2025
bef7c05
refactor: each recommender is responsible for managing its own annota…
knave Jul 31, 2025
0874e05
refactor: remove unused functions and params
knave Jul 31, 2025
2fa4b1c
feat: implement scale-down lock
knave Aug 8, 2025
51e571b
refactor: improve naming
knave Aug 8, 2025
dba549f
fix: scale down issue
knave Aug 9, 2025
4d4ea4a
feat: add a recommendation field to the status of the workload and im…
knave Aug 28, 2025
ae29b88
test: refactor tests
knave Aug 28, 2025
dddef69
feat: integrate the autoscaler into the main function
knave Aug 29, 2025
7d197aa
fix: timestamp field issue
knave Aug 30, 2025
6cb9798
refactor: inject the metrics provider dependency
knave Aug 30, 2025
1bf9222
refactor: add namespace to identify the workload and process the name…
knave Aug 30, 2025
0d2a485
fix: handle zero resource value properly
knave Aug 31, 2025
4434a9b
feat: add condition type RecommendationProvided to workload
knave Aug 31, 2025
bb3d7e9
refactor: make percentile recommender more testable
knave Sep 2, 2025
2551111
refactor: improve status conditions
knave Sep 6, 2025
2ca6d3c
feat: add appliedRecommendedReplicas field to status and refactor
knave Sep 7, 2025
31d8850
fix: handle activeCronScalingRule and applied replicas properly
Code2Life Sep 8, 2025
c7ce48e
fix: applied recommended replicas issue
knave Sep 13, 2025
03fb19d
fix: add vpa package
knave Sep 13, 2025
09fe945
fix: linter issues
knave Sep 13, 2025
4dbb6d2
test: wrong suite name
knave Sep 13, 2025
c492225
fix: vram peak zero issue and fix query issue caused by time zone
knave Sep 15, 2025
88db809
feat: only recommend but do not apply recommendation if worker has a …
knave Sep 15, 2025
9f4efdc
fix: missing autoScalingConfig
knave Sep 17, 2025
5a76361
feat: implement max allowed resources recommendation processor
knave Sep 20, 2025
06b133e
fix: linter issue
knave Sep 20, 2025
4582e4a
fix: only get the max allowed resources when scaling up
knave Sep 21, 2025
abb1b1d
fix: test bug
knave Sep 21, 2025
cf07dd3
feat: add targetResource annotation
knave Sep 22, 2025
9c7efc3
refactor: improve logs
knave Sep 22, 2025
cbf19d3
fix: skipping preemption tests and linter issue
knave Sep 22, 2025
05914e1
fix: handle gpu status properly
knave Sep 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 60 additions & 6 deletions api/v1/schedulingconfigtemplate_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,71 @@ type GPUFilter struct {
}

// AutoScalingConfig groups the auto-scaling settings: the per-layer
// AutoSet* configurations and the list of cron-based scaling rules.
// Every field is optional in JSON (omitempty).
type AutoScalingConfig struct {
// Layer 1 vertical auto-scaling: turbo burst onto the existing GPU cards quickly.
// VPA-like, aggregates metrics data over <1m.
AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"`
// Layer 1 adjusting, to match the actual usage in the long run; only for N:M remote vGPU mode.
// Adjusts baseline requests to match the actual usage over a longer period, such as 1 day - 2 weeks.
AutoSetResources AutoSetResources `json:"autoSetResources,omitempty"`

// Layer 2 horizontal auto-scaling: scale up to more GPU cards if the max limits threshold is hit.
// HPA-like, aggregates metrics data over 1m-1h (when tf-worker is scaled up, this should also trigger
// a replica increase on the client pod's owner [Deployment etc.]; check if KNative works).
AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"`

// Layer 3 adjusting, to match the actual usage in the long run; only for N:M remote vGPU mode, not implemented yet.
// Adjusts baseline requests to match the actual usage over a longer period, such as 1 day - 2 weeks.
AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"`
// CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions.
CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"`
}

// CronScalingRule defines a rule for scaling resources on a cron schedule.
// A rule can be switched on or off, carries a name, bounds its active window
// with a start and an end cron expression, and states the resources and
// replica count desired while the window is active.
// Every field is optional in JSON (omitempty).
type CronScalingRule struct {
// Enable specifies whether the cron scaler is enabled.
Enable bool `json:"enable,omitempty"`
// Name is the identifier for the cron scaler.
Name string `json:"name,omitempty"`
// Start is the start time for the scaling schedule, in cron format.
Start string `json:"start,omitempty"`
// End is the end time for the scaling schedule, in cron format.
End string `json:"end,omitempty"`
// DesiredResources specifies the target resources to scale to during the schedule.
DesiredResources Resources `json:"desiredResources,omitempty"`
// DesiredReplicas is the target number of replicas during the schedule.
// It is a pointer, so an unset value (nil) can be told apart from an explicit 0.
DesiredReplicas *int32 `json:"desiredReplicas,omitempty"`
}

// AutoSetResources configures automatic adjustment of tflops/vram resources
// from historical usage. Percentiles, fractions and durations are all carried
// as strings; the defaults quoted below come from the field comments, not
// from code visible in this file.
//
// NOTE(review): the six percentile fields use all-lowercase JSON keys
// (e.g. "targettflopspercentile") while the remaining fields use camelCase
// (e.g. "requestMarginFraction"). The keys are part of the serialized API,
// so they are left as-is here — confirm whether the casing is intentional.
type AutoSetResources struct {
// Enable specifies whether automatic resource adjustment is turned on.
Enable bool `json:"enable,omitempty"`

// Target resource to scale, such as "tflops", "vram", or "all" by default
TargetResource string `json:"targetResource,omitempty"`

// Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9
TargetTflopsPercentile string `json:"targettflopspercentile,omitempty"`

// Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5
LowerBoundTflopsPercentile string `json:"lowerboundtflopspercentile,omitempty"`

// Tflops usage percentile that will be used for the upper bound on tflops recommendation. Default: 0.95
UpperBoundTflopsPercentile string `json:"upperboundtflopspercentile,omitempty"`

// Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9
TargetVramPercentile string `json:"targetvrampercentile,omitempty"`

// Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5
LowerBoundVramPercentile string `json:"lowerboundvrampercentile,omitempty"`

// Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.95
UpperBoundVramPercentile string `json:"upperboundvrampercentile,omitempty"`

// Fraction of usage added as the safety margin to the recommended request. Default: 0.15
RequestMarginFraction string `json:"requestMarginFraction,omitempty"`

// The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h
ConfidenceInterval string `json:"confidenceInterval,omitempty"`

// How far back the TSDB has to be queried to get historical metrics. Default: 1d
HistoryLength string `json:"historyLength,omitempty"`

// Resolution at which the TSDB is queried for historical metrics. Default: 1m
HistoryResolution string `json:"historyResolution,omitempty"`
}

// A typical autoLimits algorithm could be checking every 5m, look back 1 day data,
Expand Down
24 changes: 24 additions & 0 deletions api/v1/tensorfusionconnection_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ResourceName is the string identifier of a GPU resource dimension.
type ResourceName string

// Known resource names; the values match the JSON keys used by the
// Resource struct's fields ("tflops" and "vram").
const (
// ResourceTflops names the tflops resource.
ResourceTflops ResourceName = "tflops"
// ResourceVram names the vram resource.
ResourceVram ResourceName = "vram"
)

type Resource struct {
Tflops resource.Quantity `json:"tflops"`
Vram resource.Quantity `json:"vram"`
Expand All @@ -31,6 +38,23 @@ type Resources struct {
Limits Resource `json:"limits"`
}

// Equal reports whether r and target carry the same tflops and vram
// quantities in both Requests and Limits. A nil target is never equal.
func (r Resources) Equal(target *Resources) bool {
	if target == nil {
		return false
	}

	// Requests are compared first, then limits; the first mismatch decides.
	if !r.Requests.Tflops.Equal(target.Requests.Tflops) {
		return false
	}
	if !r.Requests.Vram.Equal(target.Requests.Vram) {
		return false
	}
	if !r.Limits.Tflops.Equal(target.Limits.Tflops) {
		return false
	}
	return r.Limits.Vram.Equal(target.Limits.Vram)
}

// IsZero reports whether every quantity in r — tflops and vram, for both
// Requests and Limits — is zero.
func (r Resources) IsZero() bool {
	requestsEmpty := r.Requests.Tflops.IsZero() && r.Requests.Vram.IsZero()
	if !requestsEmpty {
		return false
	}
	limitsEmpty := r.Limits.Tflops.IsZero() && r.Limits.Vram.IsZero()
	return limitsEmpty
}

// TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
type TensorFusionConnectionSpec struct {
WorkloadName string `json:"workloadName"`
Expand Down
12 changes: 12 additions & 0 deletions api/v1/tensorfusionworkload_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ type TensorFusionWorkloadStatus struct {

// Hash of the pod template used to create worker pods
PodTemplateHash string `json:"podTemplateHash,omitempty"`

// The GPU resources most recently recommended by the autoscaler
// +optional
Recommendation *Resources `json:"recommendation,omitempty"`

// The number of replicas currently applied based on the latest recommendation
// +optional
AppliedRecommendedReplicas int32 `json:"appliedRecommendedReplicas,omitempty"`

// The currently active cron scaling rule
// +optional
ActiveCronScalingRule *CronScalingRule `json:"activeCronScalingRule,omitempty"`
}

// +kubebuilder:object:root=true
Expand Down
2 changes: 1 addition & 1 deletion api/v1/workloadprofile_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ type WorkloadProfileSpec struct {
// +optional
// AutoScalingConfig configured here will override Pool's schedulingConfig
// This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation,
// user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true'
// user can set tensor-fusion.ai/auto-resources|replicas: 'true'
AutoScalingConfig AutoScalingConfig `json:"autoScalingConfig,omitempty"`

// +optional
Expand Down
56 changes: 54 additions & 2 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading