diff --git a/api/v1/schedulingconfigtemplate_types.go b/api/v1/schedulingconfigtemplate_types.go index 1407b79f..307fcffb 100644 --- a/api/v1/schedulingconfigtemplate_types.go +++ b/api/v1/schedulingconfigtemplate_types.go @@ -86,17 +86,75 @@ type GPUFilter struct { } type AutoScalingConfig struct { - // layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - // VPA-like, aggregate metrics data <1m - AutoSetLimits AutoSetLimits `json:"autoSetLimits,omitempty"` + // layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode + // Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks + AutoSetResources AutoSetResources `json:"autoSetResources,omitempty"` // layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit // HPA-like, aggregate metrics data 1m-1h (when tf-worker scaled-up, should also trigger client pod's owner[Deployment etc.]'s replica increasing, check if KNative works) AutoSetReplicas AutoSetReplicas `json:"autoSetReplicas,omitempty"` - // layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet - // Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks - AutoSetRequests AutoSetRequests `json:"autoSetRequests,omitempty"` + // CronScalingRules defines a list of CronScaling rules used to schedule scaling actions based on cron expressions. + CronScalingRules []CronScalingRule `json:"cronScalingRules,omitempty"` +} + +// CronScalingRule defines the rule for scaling resources based on a cron schedule. +// It allows enabling/disabling the scaler, specifying the time window for scaling, +// and configuring the desired resources and replicas during the scheduled period. +type CronScalingRule struct { + // Enable specifies whether the cron scaler is enabled. + Enable bool `json:"enable,omitempty"` + // Name is the identifier for the cron scaler. + Name string `json:"name,omitempty"` + // Start is the start time for the scaling schedule, in cron format. + Start string `json:"start,omitempty"` + // End is the end time for the scaling schedule, in cron format. + End string `json:"end,omitempty"` + // DesiredResources specifies the target resources to scale to during the schedule. + DesiredResources Resources `json:"desiredResources,omitempty"` + // ResourceMultiplier is a string representing the multiplier to apply to resources. + ResourceMultiplier string `json:"resourceMultiplier,omitempty"` + // DesiredReplicas is the target number of replicas during the schedule. + DesiredReplicas *int32 `json:"desiredReplicas,omitempty"` + // ReplicasMultiplier is a string representing the multiplier to apply to replicas. + ReplicasMultiplier string `json:"replicasMultiplier,omitempty"` +} + +type AutoSetResources struct { + Enable bool `json:"enable,omitempty"` + + // Target resource to scale, such as "tflops", "vram", or "all" by default + TargetResource string `json:"targetResource,omitempty"` + + // Tflops usage percentile that will be used as a base for tflops target recommendation. Default: 0.9 + TargetTflopsPercentile string `json:"targettflopspercentile,omitempty"` + + // Tflops usage percentile that will be used for the lower bound on tflops recommendation. Default: 0.5 + LowerBoundTflopsPercentile string `json:"lowerboundtflopspercentile,omitempty"` + + // Tflops usage percentile that will be used for the upper bound on tflops recommendation. 
Default: 0.95 + UpperBoundTflopsPercentile string `json:"upperboundtflopspercentile,omitempty"` + + // Vram usage percentile that will be used as a base for vram target recommendation. Default: 0.9 + TargetVramPercentile string `json:"targetvrampercentile,omitempty"` + + // Vram usage percentile that will be used for the lower bound on vram recommendation. Default: 0.5 + LowerBoundVramPercentile string `json:"lowerboundvrampercentile,omitempty"` + + // Vram usage percentile that will be used for the upper bound on vram recommendation. Default: 0.95 + UpperBoundVramPercentile string `json:"upperboundvrampercentile,omitempty"` + + // Fraction of usage added as the safety margin to the recommended request. Default: 0.15 + RequestMarginFraction string `json:"requestMarginFraction,omitempty"` + + // The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h + ConfidenceInterval string `json:"confidenceInterval,omitempty"` + + // How much time back TSDB have to be queried to get historical metrics. Default: 1d + HistoryLength string `json:"historyLength,omitempty"` + + // Resolution at which TSDB is queried for historical metrics. Default: 1m + HistoryResolution string `json:"historyResolution,omitempty"` } // A typical autoLimits algorithm could be checking every 5m, look back 1 day data, diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index 1b304eca..11075bbf 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -21,6 +21,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +type ResourceName string + +const ( + ResourceTflops ResourceName = "tflops" + ResourceVram ResourceName = "vram" +) + type Resource struct { Tflops resource.Quantity `json:"tflops"` Vram resource.Quantity `json:"vram"` @@ -31,6 +38,20 @@ type Resources struct { Limits Resource `json:"limits"` } +func (r *Resources) Equal(t *Resources) bool { + return r.Requests.Tflops.Equal(t.Requests.Tflops) && + r.Requests.Vram.Equal(t.Requests.Vram) && + r.Limits.Tflops.Equal(t.Limits.Tflops) && + r.Limits.Vram.Equal(t.Limits.Vram) +} + +func (r *Resources) IsZero() bool { + return r.Requests.Tflops.IsZero() && + r.Requests.Vram.IsZero() && + r.Limits.Tflops.IsZero() && + r.Limits.Vram.IsZero() +} + // TensorFusionConnectionSpec defines the desired state of TensorFusionConnection. type TensorFusionConnectionSpec struct { WorkloadName string `json:"workloadName"` diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 2f1bf367..579a9c25 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -138,9 +138,15 @@ func (in *AutoFreezeAndResume) DeepCopy() *AutoFreezeAndResume { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AutoScalingConfig) DeepCopyInto(out *AutoScalingConfig) { *out = *in - in.AutoSetLimits.DeepCopyInto(&out.AutoSetLimits) + out.AutoSetResources = in.AutoSetResources out.AutoSetReplicas = in.AutoSetReplicas - in.AutoSetRequests.DeepCopyInto(&out.AutoSetRequests) + if in.CronScalingRules != nil { + in, out := &in.CronScalingRules, &out.CronScalingRules + *out = make([]CronScalingRule, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoScalingConfig. 
@@ -204,6 +210,21 @@ func (in *AutoSetRequests) DeepCopy() *AutoSetRequests { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AutoSetResources) DeepCopyInto(out *AutoSetResources) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AutoSetResources. +func (in *AutoSetResources) DeepCopy() *AutoSetResources { + if in == nil { + return nil + } + out := new(AutoSetResources) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *CapacityConfig) DeepCopyInto(out *CapacityConfig) { *out = *in @@ -347,6 +368,27 @@ func (in *ComputingVendorParams) DeepCopy() *ComputingVendorParams { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CronScalingRule) DeepCopyInto(out *CronScalingRule) { + *out = *in + in.DesiredResources.DeepCopyInto(&out.DesiredResources) + if in.DesiredReplicas != nil { + in, out := &in.DesiredReplicas, &out.DesiredReplicas + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CronScalingRule. +func (in *CronScalingRule) DeepCopy() *CronScalingRule { + if in == nil { + return nil + } + out := new(CronScalingRule) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DataPipeline4ResourcesConfig) DeepCopyInto(out *DataPipeline4ResourcesConfig) { *out = *in diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 69a12b27..bb8dd068 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -50,41 +50,6 @@ spec: autoScaling: description: scale the workload based on the usage and traffic properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -105,40 +70,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + 
description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml index fc7818d3..1661ae5b 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -46,41 +46,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -101,40 +66,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual 
usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml index 19b9fd2e..01005b7c 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml @@ -45,41 +45,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -100,40 +65,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only 
for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 69a12b27..bb8dd068 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -50,41 +50,6 @@ spec: autoScaling: description: scale the workload based on the usage and traffic properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -105,40 +70,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust baseline requests to match the actual usage in longer period, 
such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object hypervisor: description: single GPU device multi-process queuing and fair scheduling diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml index fc7818d3..1661ae5b 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -46,41 +46,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -101,40 +66,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M 
remote vGPU mode Adjust baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml index 19b9fd2e..01005b7c 100644 --- a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml +++ b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml @@ -45,41 +45,6 @@ spec: This field can not be fully supported in annotation, if user want to enable auto-scaling in annotation, user can set tensor-fusion.ai/auto-limits|requests|replicas: 'true' properties: - autoSetLimits: - description: |- - layer 1 vertical auto-scaling, turbo burst to existing GPU cards quickly - VPA-like, aggregate metrics data <1m - properties: - enable: - type: boolean - evaluationPeriod: - type: string - extraTFlopsBufferRatio: - type: string - ignoredDeltaRange: - type: string - maxRatioToRequests: - description: the multiplier of requests, to avoid limit set - too high, like 5.0 - type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object - scaleUpStep: - type: string - targetResource: - description: target resource to scale limits, such as "tflops", - "vram", or "all" by default - type: string - type: object autoSetReplicas: description: |- layer 2 horizontal auto-scaling, scale up to more GPU cards if max limits threshold hit @@ -100,40 +65,141 @@ spec: targetTFlopsOfLimits: type: string type: object - autoSetRequests: + autoSetResources: description: |- - layer 3 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode, not impl yet + layer 1 adjusting, to match the actual usage in the long run, only for N:M remote vGPU mode Adjust 
baseline requests to match the actual usage in longer period, such as 1day - 2weeks properties: - aggregationPeriod: + confidenceInterval: + description: 'The time interval used for computing the confidence + multiplier for the lower and upper bound. Default: 24h' type: string enable: type: boolean - evaluationPeriod: + historyLength: + description: 'How much time back TSDB have to be queried to + get historical metrics. Default: 1d' type: string - extraBufferRatio: - description: the request buffer ratio, for example actual - usage is 1.0, 10% buffer will be 1.1 as final preferred - requests + historyResolution: + description: 'Resolution at which TSDB is queried for historical + metrics. Default: 1m' type: string - percentileForAutoRequests: + lowerboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the lower bound on tflops recommendation. Default: 0.5' + type: string + lowerboundvrampercentile: + description: 'Vram usage percentile that will be used for + the lower bound on vram recommendation. Default: 0.5' + type: string + requestMarginFraction: + description: 'Fraction of usage added as the safety margin + to the recommended request. Default: 0.15' type: string - prediction: - properties: - enable: - type: boolean - historyDataPeriod: - type: string - model: - type: string - predictionPeriod: - type: string - type: object targetResource: - description: target resource to scale requests, such as "tflops", - "vram", or "all" by default + description: Target resource to scale, such as "tflops", "vram", + or "all" by default + type: string + targettflopspercentile: + description: 'Tflops usage percentile that will be used as + a base for tflops target recommendation. Default: 0.9' + type: string + targetvrampercentile: + description: 'Vram usage percentile that will be used as a + base for vram target recommendation. Default: 0.9' + type: string + upperboundtflopspercentile: + description: 'Tflops usage percentile that will be used for + the upper bound on tflops recommendation. Default: 0.95' + type: string + upperboundvrampercentile: + description: 'Vram usage percentile that will be used for + the upper bound on vram recommendation. Default: 0.95' type: string type: object + cronScalingRules: + description: CronScalingRules defines a list of CronScaling rules + used to schedule scaling actions based on cron expressions. + items: + description: |- + CronScalingRule defines the rule for scaling resources based on a cron schedule. + It allows enabling/disabling the scaler, specifying the time window for scaling, + and configuring the desired resources and replicas during the scheduled period. + properties: + desiredReplicas: + description: DesiredReplicas is the target number of replicas + during the schedule. + format: int32 + type: integer + desiredResources: + description: DesiredResources specifies the target resources + to scale to during the schedule. 
+ properties: + limits: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + requests: + properties: + tflops: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + vram: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + required: + - tflops + - vram + type: object + required: + - limits + - requests + type: object + enable: + description: Enable specifies whether the cron scaler is + enabled. + type: boolean + end: + description: End is the end time for the scaling schedule, + in cron format. + type: string + name: + description: Name is the identifier for the cron scaler. + type: string + replicasMultiplier: + description: ReplicasMultiplier is a string representing + the multiplier to apply to replicas. + type: string + resourceMultiplier: + description: ResourceMultiplier is a string representing + the multiplier to apply to resources. + type: string + start: + description: Start is the start time for the scaling schedule, + in cron format. + type: string + type: object + type: array type: object gpuCount: description: The number of GPUs to be used by the workload, default diff --git a/go.mod b/go.mod index bfdc3c41..69a98eeb 100644 --- a/go.mod +++ b/go.mod @@ -16,21 +16,24 @@ require ( github.com/lithammer/shortuuid/v4 v4.2.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 + github.com/pkg/errors v0.9.1 + github.com/robfig/cron/v3 v3.0.1 github.com/samber/lo v1.51.0 github.com/shirou/gopsutil v3.21.11+incompatible github.com/stretchr/testify v1.10.0 - go.etcd.io/etcd/client/v2 v2.305.16 + golang.org/x/time v0.9.0 gomodules.xyz/jsonpatch/v2 v2.5.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gorm.io/driver/mysql v1.6.0 gorm.io/gorm v1.30.0 k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 + k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0 k8s.io/client-go v0.33.2 k8s.io/component-base v0.32.5 k8s.io/component-helpers v0.33.2 k8s.io/klog/v2 v2.130.1 - k8s.io/kubernetes v1.32.5 + k8s.io/kubernetes v1.32.6 k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 sigs.k8s.io/controller-runtime v0.20.4 sigs.k8s.io/scheduler-plugins v0.31.8 @@ -110,7 +113,6 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect @@ -119,6 +121,7 @@ require ( github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect + github.com/stretchr/objx v0.5.2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect 
github.com/ugorji/go/codec v1.2.12 // indirect github.com/x448/float16 v0.8.4 // indirect @@ -148,7 +151,6 @@ require ( golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.32.0 // indirect golang.org/x/text v0.25.0 // indirect - golang.org/x/time v0.9.0 // indirect golang.org/x/tools v0.33.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241223144023-3abc09e42ca8 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20241223144023-3abc09e42ca8 // indirect diff --git a/go.sum b/go.sum index 822b5c80..4f7607ff 100644 --- a/go.sum +++ b/go.sum @@ -250,6 +250,8 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= +github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= @@ -270,6 +272,7 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -458,6 +461,8 @@ k8s.io/apimachinery v0.32.5 h1:6We3aJ6crC0ap8EhsEXcgX3LpI6SEjubpiOMXLROwPM= k8s.io/apimachinery v0.32.5/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= k8s.io/apiserver v0.32.5 h1:phmm2EOUVFI+cLiq8Grtuh166fTt/qgvkGPkpgzp5uY= k8s.io/apiserver v0.32.5/go.mod h1:5bfueS1tgARVWVXRJBMI5mHoCmev0jOvbxebai/kiqc= +k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0 h1:oVv4QrTPKM7vWyQRRzCDgDgi00NWo4Rjle5/nujP/dI= +k8s.io/autoscaler/vertical-pod-autoscaler v1.3.0/go.mod h1:W4k7qGP8A9Xqp+UK+lM49AfsWkAdXzE80F/s8kxwWVI= k8s.io/client-go v0.32.5 h1:huFmQMzgWu0z4kbWsuZci+Gt4Fo72I4CcrvhToZ/Qp0= k8s.io/client-go v0.32.5/go.mod h1:Qchw6f9WIVrur7DKojAHpRgGLcANT0RLIvF39Jz58xA= k8s.io/cloud-provider v0.32.5 h1:KzO0mpXYArWxQH91+a4WLLrhTaO5RGWmQn4lzUXY6ak= diff --git a/internal/autoscaler/autoscaler.go b/internal/autoscaler/autoscaler.go new file mode 100644 index 00000000..ac5e4662 --- /dev/null +++ b/internal/autoscaler/autoscaler.go @@ -0,0 +1,215 @@ +package autoscaler + +import ( + "context" + "errors" + "fmt" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/recommender" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + 
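+	// controller-runtime's log and manager packages: the autoscaler logs via log.FromContext and registers itself as a leader-elected manager.Runnable.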
"sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +var ( + _ manager.Runnable = (*Autoscaler)(nil) + _ manager.LeaderElectionRunnable = (*Autoscaler)(nil) +) + +type Autoscaler struct { + client.Client + allocator *gpuallocator.GpuAllocator + metricsProvider metrics.Provider + recommenders []recommender.Interface + workloadHandler workload.Handler + workloads map[string]*workload.State +} + +func NewAutoscaler(c client.Client, allocator *gpuallocator.GpuAllocator) (*Autoscaler, error) { + if c == nil { + return nil, errors.New("must specify client") + } + + if allocator == nil { + return nil, errors.New("must specify allocator") + } + + recommenders := []recommender.Interface{ + recommender.NewPercentileRecommender(), + recommender.NewCronRecommender(c), + } + + return &Autoscaler{ + Client: c, + allocator: allocator, + metricsProvider: metrics.NewProvider(nil), + recommenders: recommenders, + workloadHandler: workload.NewHandler(c, allocator), + workloads: map[string]*workload.State{}, + }, nil +} + +func (s *Autoscaler) Start(ctx context.Context) error { + log := log.FromContext(ctx) + log.Info("Starting autoscaler") + + // Handle timeout for loading historical metrics + historyCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + s.loadHistoryMetrics(historyCtx) + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + for { + select { + case <-ticker.C: + s.Run(ctx) + case <-ctx.Done(): + log.Info("Stopping autoscaler") + return nil + } + } +} + +func (s *Autoscaler) NeedLeaderElection() bool { + return true +} + +func (s *Autoscaler) Run(ctx context.Context) { + log := log.FromContext(ctx) + + log.Info("Autoscaler running") + s.loadWorkloads(ctx) + s.loadRealTimeMetrics(ctx) + s.processWorkloads(ctx) +} + +func (s *Autoscaler) loadWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + + workloadList := tfv1.TensorFusionWorkloadList{} + if err := s.List(ctx, &workloadList); err != nil { + log.Error(err, "failed to list workloads") + return + } + + observedWorkloads := map[string]bool{} + for _, workload := range workloadList.Items { + if !workload.DeletionTimestamp.IsZero() { + continue + } + + workloadState := s.findOrCreateWorkloadState(workload.Name) + s.workloadHandler.UpdateWorkloadState(ctx, workloadState, &workload) + observedWorkloads[workload.Name] = true + } + + // remove non-existent workloads + for name := range s.workloads { + if !observedWorkloads[name] { + delete(s.workloads, name) + } + } +} + +func (s *Autoscaler) loadHistoryMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading historical metrics") + + workersMetrics, err := s.metricsProvider.GetHistoryMetrics() + if err != nil { + log.Error(err, "failed to get history metrics") + return + } + for _, sample := range workersMetrics { + s.findOrCreateWorkloadState(sample.WorkloadName).AddSample(sample) + } +} + +func (s *Autoscaler) loadRealTimeMetrics(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("loading realtime metrics") + + workersMetrics, err := s.metricsProvider.GetWorkersMetrics() + if err != nil { + log.Error(err, "failed to get workers metrics") + return + } + + for _, sample := range workersMetrics { + if workload, exists := s.workloads[sample.WorkloadName]; exists { + workload.AddSample(sample) + } + } +} + +func (s *Autoscaler) processWorkloads(ctx context.Context) { + log := log.FromContext(ctx) + log.Info("processing workloads") + + for _, workload := range s.workloads { + 
recommendations := map[string]*tfv1.Resources{} + for _, recommender := range s.recommenders { + name := recommender.Name() + recommendation, err := recommender.Recommend(ctx, workload) + if err != nil { + log.Error(err, "failed to recommend resources", "recommender", name) + continue + } + if recommendation == nil { + continue + } + recommendations[name] = recommendation + log.Info("recommendation", "workload", workload.Name, "recommender", name, "resources", recommendation) + } + + finalRecommendation := mergeRecommendations(recommendations) + if finalRecommendation.IsZero() { + continue + } + log.Info("final recommendation", "workload", workload.Name, "resources", finalRecommendation) + + if err := s.workloadHandler.ApplyRecommendationToWorkload(ctx, workload, finalRecommendation); err != nil { + log.Error(err, "failed to apply recommendation", "workload", workload.Name, "recommendation", finalRecommendation) + } + } +} + +func (s *Autoscaler) findOrCreateWorkloadState(name string) *workload.State { + w, exists := s.workloads[name] + if !exists { + w = workload.NewWorkloadState(name) + s.workloads[name] = w + } + return w +} + +func mergeRecommendations(recommendations map[string]*tfv1.Resources) *tfv1.Resources { + result := &tfv1.Resources{} + for _, rec := range recommendations { + if result.Requests.Tflops.Cmp(rec.Requests.Tflops) < 0 { + result.Requests.Tflops = rec.Requests.Tflops + result.Limits.Tflops = rec.Limits.Tflops + } + if result.Requests.Vram.Cmp(rec.Requests.Vram) < 0 { + result.Requests.Vram = rec.Requests.Vram + result.Limits.Vram = rec.Limits.Vram + } + } + return result +} + +// Start after manager started +func SetupWithManager(mgr ctrl.Manager, allocator *gpuallocator.GpuAllocator) error { + autoScaler, err := NewAutoscaler(mgr.GetClient(), allocator) + if err != nil { + return fmt.Errorf("failed to create auto scaler: %v", err) + } + return mgr.Add(autoScaler) +} diff --git a/internal/autoscaler/autoscaler_suite_test.go b/internal/autoscaler/autoscaler_suite_test.go new file mode 100644 index 00000000..6eb9d869 --- /dev/null +++ b/internal/autoscaler/autoscaler_suite_test.go @@ -0,0 +1,585 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "context" + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + "testing" + "time" + + ctrl "sigs.k8s.io/controller-runtime" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + corev1 "k8s.io/api/core/v1" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/config" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/controller" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "github.com/NexusGPU/tensor-fusion/internal/portallocator" + "github.com/NexusGPU/tensor-fusion/internal/utils" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + // +kubebuilder:scaffold:imports +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var cfg *rest.Config +var k8sClient client.Client +var testEnv *envtest.Environment +var ctx context.Context +var cancel context.CancelFunc +var allocator *gpuallocator.GpuAllocator +var metricsRecorder *metrics.MetricsRecorder + +func TestControllers(t *testing.T) { + RegisterFailHandler(Fail) + + if os.Getenv("DEBUG_MODE") == constants.TrueStringValue { + SetDefaultEventuallyTimeout(10 * time.Minute) + } else { + SetDefaultEventuallyTimeout(7 * time.Second) + } + SetDefaultEventuallyPollingInterval(200 * time.Millisecond) + SetDefaultConsistentlyDuration(5 * time.Second) + SetDefaultConsistentlyPollingInterval(250 * time.Millisecond) + RunSpecs(t, "Controller Suite") +} + +var _ = BeforeSuite(func() { + // Expect(os.Setenv("USE_EXISTING_CLUSTER", "true")).Should(Succeed()) + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + + // The BinaryAssetsDirectory is only required if you want to run the tests directly + // without call the makefile target test. If not informed it will look for the + // default path defined in controller-runtime which is /usr/local/kubebuilder/. + // Note that you must have the required binaries setup under the bin directory to perform + // the tests directly. When we run make test it will be setup and used automatically. + BinaryAssetsDirectory: filepath.Join("..", "..", "bin", "k8s", + fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), + } + + var err error + // cfg is defined in this file globally. 
+ cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + err = tfv1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + err = corev1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) + + Expect(k8sClient.Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: utils.CurrentNamespace(), + }, + })).NotTo(HaveOccurred()) + + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + Metrics: metricsserver.Options{ + BindAddress: "0", + }, + }) + + Expect(err).ToNot(HaveOccurred()) + + metricsRecorder = &metrics.MetricsRecorder{ + MetricsOutputPath: "./metrics.log", + HourlyUnitPriceMap: map[string]float64{ + "A100": 10, + }, + WorkerUnitPriceMap: make(map[string]map[string]metrics.RawBillingPricing), + } + + allocator = gpuallocator.NewGpuAllocator(ctx, mgr.GetClient(), 150*time.Millisecond) + _, err = allocator.SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + portAllocator, err := portallocator.NewPortAllocator(ctx, mgr.GetClient(), "40000-42000", "42001-60000") + if err != nil { + Expect(err).ToNot(HaveOccurred()) + } + _ = portAllocator.SetupWithManager(ctx, mgr) + + err = (&controller.TensorFusionClusterReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionCluster"), + MetricsRecorder: metricsRecorder, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUPoolReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUPool"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUNodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("GPUNode"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUNodeClassReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.SchedulingConfigTemplateReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.PodReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Allocator: allocator, + PortAllocator: portAllocator, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.NodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("Node"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.WorkloadProfileReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.TensorFusionConnectionReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionConnection"), + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.GPUReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(ctx, mgr) + Expect(err).ToNot(HaveOccurred()) + + err = (&controller.TensorFusionWorkloadReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("TensorFusionWorkload"), + 
PortAllocator: portAllocator, + }).SetupWithManager(mgr) + Expect(err).ToNot(HaveOccurred()) + + go func() { + defer GinkgoRecover() + err = mgr.Start(ctx) + Expect(err).ToNot(HaveOccurred(), "failed to run manager") + }() + +}) + +var _ = AfterSuite(func() { + By("tearing down the test environment") + allocator.Stop() + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) + // Expect(os.Unsetenv("USE_EXISTING_CLUSTER")).To(Succeed()) +}) + +type TensorFusionEnv struct { + clusterKey client.ObjectKey + poolCount int + poolNodeMap map[int]map[int]int +} + +func (c *TensorFusionEnv) GetCluster() *tfv1.TensorFusionCluster { + GinkgoHelper() + tfc := &tfv1.TensorFusionCluster{} + Expect(k8sClient.Get(ctx, c.clusterKey, tfc)).Should(Succeed()) + return tfc +} + +func (c *TensorFusionEnv) UpdateCluster(tfc *tfv1.TensorFusionCluster) { + GinkgoHelper() + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest := &tfv1.TensorFusionCluster{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tfc), latest); err != nil { + return err + } + latest.Spec = tfc.Spec + return k8sClient.Update(ctx, latest) + }) + Expect(err).Should(Succeed()) +} + +func (c *TensorFusionEnv) Cleanup() { + GinkgoHelper() + for poolIndex, nodeGpuMap := range c.poolNodeMap { + for nodeIndex := range nodeGpuMap { + c.DeleteGPUNode(poolIndex, nodeIndex) + } + } + + tfc := c.GetCluster() + tfcCopy := tfc.DeepCopy() + tfcCopy.Spec.GPUPools = []tfv1.GPUPoolDefinition{} + c.UpdateCluster(tfcCopy) + + for poolIndex := range c.poolNodeMap { + Eventually(func(g Gomega) { + pool := &tfv1.GPUPool{} + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(HaveOccurred()) + }).Should(Succeed()) + delete(c.poolNodeMap, poolIndex) + c.poolCount-- + } + + Expect(k8sClient.Delete(ctx, tfc)).Should(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, c.clusterKey, tfc) + g.Expect(err).Should(HaveOccurred()) + }).Should(Succeed()) +} + +func (c *TensorFusionEnv) GetGPUPoolList() *tfv1.GPUPoolList { + GinkgoHelper() + poolList := &tfv1.GPUPoolList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, poolList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: c.clusterKey.Name, + }))).Should(Succeed()) + g.Expect(poolList.Items).Should(HaveLen(c.poolCount)) + }).Should(Succeed()) + return poolList +} + +func (c *TensorFusionEnv) GetGPUPool(poolIndex int) *tfv1.GPUPool { + GinkgoHelper() + pool := &tfv1.GPUPool{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getPoolName(poolIndex)}, pool)).Should(Succeed()) + }).Should(Succeed()) + return pool +} + +func (c *TensorFusionEnv) GetGPUNodeList(poolIndex int) *tfv1.GPUNodeList { + GinkgoHelper() + nodeList := &tfv1.GPUNodeList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, nodeList, client.MatchingLabels(map[string]string{ + fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", + }))).Should(Succeed()) + g.Expect(nodeList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) + }).Should(Succeed()) + return nodeList +} + +func (c *TensorFusionEnv) GetGPUNode(poolIndex int, nodeIndex int) *tfv1.GPUNode { + GinkgoHelper() + node := &tfv1.GPUNode{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(Succeed()) + }).Should(Succeed()) + return node +} + +func (c *TensorFusionEnv) DeleteGPUNode(poolIndex int, 
nodeIndex int) {
+	GinkgoHelper()
+	c.DeleteNodeGpuList(poolIndex, nodeIndex)
+	node := c.GetGPUNode(poolIndex, nodeIndex)
+	Expect(k8sClient.Delete(ctx, node)).Should(Succeed())
+	Eventually(func(g Gomega) {
+		g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: c.getNodeName(poolIndex, nodeIndex)}, node)).Should(HaveOccurred())
+	}).Should(Succeed())
+	delete(c.poolNodeMap[poolIndex], nodeIndex)
+}
+
+func (c *TensorFusionEnv) GetNodeGpuList(poolIndex int, nodeIndex int) *tfv1.GPUList {
+	GinkgoHelper()
+	gpuList := &tfv1.GPUList{}
+	Eventually(func(g Gomega) {
+		g.Expect(k8sClient.List(ctx, gpuList, client.MatchingLabels(map[string]string{
+			constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex),
+		}))).Should(Succeed())
+		g.Expect(gpuList.Items).Should(HaveLen(c.poolNodeMap[poolIndex][nodeIndex]))
+	}).Should(Succeed())
+	return gpuList
+}
+
+func (c *TensorFusionEnv) DeleteNodeGpuList(poolIndex int, nodeIndex int) {
+	GinkgoHelper()
+	Expect(k8sClient.DeleteAllOf(ctx, &tfv1.GPU{},
+		client.MatchingLabels{constants.LabelKeyOwner: c.getNodeName(poolIndex, nodeIndex)},
+	)).Should(Succeed())
+}
+
+func (c *TensorFusionEnv) GetPoolGpuList(poolIndex int) *tfv1.GPUList {
+	GinkgoHelper()
+	gpuList := &tfv1.GPUList{}
+	poolGpuCount := 0
+	for _, gpuCount := range c.poolNodeMap[poolIndex] {
+		poolGpuCount += gpuCount
+	}
+	Eventually(func(g Gomega) {
+		g.Expect(k8sClient.List(ctx, gpuList, client.MatchingLabels(map[string]string{
+			constants.GpuPoolKey: c.getPoolName(poolIndex),
+		}))).Should(Succeed())
+		g.Expect(gpuList.Items).Should(HaveLen(poolGpuCount))
+	}).Should(Succeed())
+	return gpuList
+}
+
+// https://book.kubebuilder.io/reference/envtest#testing-considerations
+// Unless you’re using an existing cluster, keep in mind that no built-in controllers are running in the test context.
+// So checkStatusAndUpdateVirtualCapacity in gpunode_controller.go always sees the pod status as Pending, and the GPUNode status can't change to Running.
+// When using an existing cluster the tests run a lot faster, so this may change later.
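+// That is why UpdateHypervisorStatus below patches the hypervisor pods to Running/Ready manually;
+// when USE_EXISTING_CLUSTER=true (see the commented-out Setenv in BeforeSuite) this manual patching is skipped.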
+func (c *TensorFusionEnv) UpdateHypervisorStatus() { + GinkgoHelper() + if os.Getenv("USE_EXISTING_CLUSTER") != "true" { + for poolIndex := range c.poolNodeMap { + podList := &corev1.PodList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(utils.CurrentNamespace()), + client.MatchingLabels(map[string]string{ + fmt.Sprintf(constants.GPUNodePoolIdentifierLabelFormat, c.getPoolName(poolIndex)): "true", + }), + )).Should(Succeed()) + g.Expect(podList.Items).Should(HaveLen(len(c.poolNodeMap[poolIndex]))) + }).Should(Succeed()) + for _, pod := range podList.Items { + pod.Status.Phase = corev1.PodRunning + pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{Type: corev1.PodReady, Status: corev1.ConditionTrue}) + Expect(k8sClient.Status().Update(ctx, &pod)).Should(Succeed()) + } + } + } +} + +func (c *TensorFusionEnv) getPoolName(poolIndex int) string { + return fmt.Sprintf("%s-pool-%d", c.clusterKey.Name, poolIndex) +} + +func (c *TensorFusionEnv) getNodeName(poolIndex int, nodeIndex int) string { + return fmt.Sprintf("%s-pool-%d-node-%d", c.clusterKey.Name, poolIndex, nodeIndex) +} + +func (c *TensorFusionEnv) getGPUName(poolIndex int, nodeIndex int, gpuIndex int) string { + return fmt.Sprintf("%s-pool-%d-node-%d-gpu-%d", c.clusterKey.Name, poolIndex, nodeIndex, gpuIndex) +} + +func (c *TensorFusionEnv) GetConfig() *rest.Config { + return cfg +} + +type TensorFusionEnvBuilder struct { + *TensorFusionEnv +} + +func NewTensorFusionEnvBuilder() *TensorFusionEnvBuilder { + return &TensorFusionEnvBuilder{ + &TensorFusionEnv{ + poolCount: 0, + clusterKey: client.ObjectKey{}, + poolNodeMap: map[int]map[int]int{}, + }, + } +} + +func (b *TensorFusionEnvBuilder) AddPoolWithNodeCount(nodeCount int) *TensorFusionEnvBuilder { + nodeGpuMap := make(map[int]int, nodeCount) + for i := range nodeCount { + nodeGpuMap[i] = 0 + } + b.poolNodeMap[b.poolCount] = nodeGpuMap + b.poolCount++ + return b +} + +func (b *TensorFusionEnvBuilder) SetGpuCountPerNode(gpuCount int) *TensorFusionEnvBuilder { + poolIndex := b.poolCount - 1 + for nodeIndex := range b.poolNodeMap[poolIndex] { + b.poolNodeMap[poolIndex][nodeIndex] = gpuCount + } + return b +} + +func (b *TensorFusionEnvBuilder) SetGpuCountForNode(nodeIndex int, gpuCount int) *TensorFusionEnvBuilder { + poolIndex := b.poolCount - 1 + b.poolNodeMap[poolIndex][nodeIndex] = gpuCount + return b +} + +var testEnvId int = 0 + +func (b *TensorFusionEnvBuilder) Build() *TensorFusionEnv { + GinkgoHelper() + b.clusterKey = client.ObjectKey{ + Name: fmt.Sprintf("cluster-%d", testEnvId), + Namespace: "default", + } + testEnvId++ + + // generate cluster + tfc := &tfv1.TensorFusionCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: b.clusterKey.Name, + Namespace: b.clusterKey.Namespace, + }, + Spec: tfv1.TensorFusionClusterSpec{ + GPUPools: []tfv1.GPUPoolDefinition{ + { + Name: fmt.Sprintf("pool-%d", b.poolCount), + SpecTemplate: *config.MockGPUPoolSpec, + }, + }, + }, + } + + // construct pools + gpuPools := make([]tfv1.GPUPoolDefinition, b.poolCount) + for i := range b.poolCount { + poolSpec := config.MockGPUPoolSpec.DeepCopy() + poolSpec.NodeManagerConfig.NodeSelector.NodeSelectorTerms[0].MatchExpressions[0].Key = + fmt.Sprintf("%s-label-%d", tfc.Name, i) + gpuPools[i] = tfv1.GPUPoolDefinition{ + Name: fmt.Sprintf("pool-%d", i), + SpecTemplate: *poolSpec, + } + } + + tfc.Spec.GPUPools = gpuPools + Expect(k8sClient.Create(ctx, tfc)).To(Succeed()) + + // wait for pools are created + Eventually(func(g Gomega) { + 
gpuPoolList := &tfv1.GPUPoolList{} + g.Expect(k8sClient.List(ctx, gpuPoolList, client.MatchingLabels(map[string]string{ + constants.LabelKeyOwner: tfc.Name, + }))).Should(Succeed()) + g.Expect(gpuPoolList.Items).Should(HaveLen(b.poolCount)) + }).Should(Succeed()) + + // generate nodes + selectors := strings.Split(constants.InitialGPUNodeSelector, "=") + for poolIndex := range b.poolCount { + nodeCount := len(b.poolNodeMap[poolIndex]) + for nodeIndex := range nodeCount { + coreNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: b.getNodeName(poolIndex, nodeIndex), + Labels: map[string]string{ + selectors[0]: selectors[1], + fmt.Sprintf("%s-label-%d", tfc.Name, poolIndex): "true", + }, + }, + } + Expect(k8sClient.Create(ctx, coreNode)).To(Succeed()) + + // generate gpus for gpunode + gpuNode := b.GetGPUNode(poolIndex, nodeIndex) + if gpuCount := b.poolNodeMap[poolIndex][nodeIndex]; gpuCount > 0 { + for gpuIndex := range gpuCount { + key := client.ObjectKey{ + Name: b.getGPUName(poolIndex, nodeIndex, gpuIndex), + } + gpu := &tfv1.GPU{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Labels: map[string]string{ + constants.LabelKeyOwner: gpuNode.Name, + constants.GpuPoolKey: b.getPoolName(poolIndex), + }, + }, + } + Expect(controllerutil.SetControllerReference(gpuNode, gpu, scheme.Scheme)).To(Succeed()) + Expect(k8sClient.Create(ctx, gpu)).To(Succeed()) + patch := client.MergeFrom(gpu.DeepCopy()) + gpu.Status = tfv1.GPUStatus{ + Phase: tfv1.TensorFusionGPUPhaseRunning, + UUID: key.Name, + GPUModel: "mock", + NodeSelector: map[string]string{ + "kubernetes.io/hostname": b.getNodeName(poolIndex, nodeIndex), + }, + Capacity: &tfv1.Resource{ + Tflops: resource.MustParse("2000"), + Vram: resource.MustParse("2000Gi"), + }, + Available: &tfv1.Resource{ + Tflops: resource.MustParse("2000"), + Vram: resource.MustParse("2000Gi"), + }, + Message: "mock message", + } + Expect(k8sClient.Status().Patch(ctx, gpu, patch)).To(Succeed()) + } + } + } + + b.GetPoolGpuList(poolIndex) + } + + b.UpdateHypervisorStatus() + + return b.TensorFusionEnv +} diff --git a/internal/autoscaler/autoscaler_test.go b/internal/autoscaler/autoscaler_test.go new file mode 100644 index 00000000..49299e63 --- /dev/null +++ b/internal/autoscaler/autoscaler_test.go @@ -0,0 +1,668 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package autoscaler + +import ( + "context" + "fmt" + "strings" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/utils" + "github.com/aws/smithy-go/ptr" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("Autoscaler", func() { + Context("when creating an autoscaler", func() { + It("should return an error if there is no client", func() { + as, err := NewAutoscaler(nil, nil) + Expect(as).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("must specify client")) + }) + + It("should return an error if there is no allocator", func() { + as, err := NewAutoscaler(k8sClient, nil) + Expect(as).To(BeNil()) + Expect(err.Error()).To(ContainSubstring("must specify allocator")) + }) + }) + + Context("when loading history metrics", func() { + It("should create the state of workloads and workers based on historical metrics", func() { + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.metricsProvider = &FakeMetricsProvider{} + scaler.loadHistoryMetrics(ctx) + metrics, _ := scaler.metricsProvider.GetHistoryMetrics() + for _, m := range metrics { + Expect(scaler.workloads).To(HaveKey(m.WorkloadName)) + Expect(scaler.workloads[m.WorkloadName].Workers).To(HaveKey(m.WorkerName)) + } + }) + }) + + Context("when loading workloads", func() { + It("should keep the state of workloads", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(3). + Build() + defer tfEnv.Cleanup() + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).To(BeEmpty()) + + // create two workloads + pool := tfEnv.GetGPUPool(0) + // with two replias + workload0 := createWorkload(pool, 0, 2) + workload0Workers := getWorkers(workload0) + // with one replia + workload1 := createWorkload(pool, 1, 1) + workload1Workers := getWorkers(workload1) + + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).To(HaveLen(2)) + Expect(scaler.workloads).To(HaveKey(workload0.Name)) + Expect(scaler.workloads).To(HaveKey(workload1.Name)) + workers := scaler.workloads[workload0.Name].Workers + Expect(workers).To(HaveLen(2)) + Expect(workers).To(HaveKey(workload0Workers[0].Name)) + Expect(workers).To(HaveKey(workload0Workers[1].Name)) + Expect(scaler.workloads[workload1.Name].Workers).To(HaveKey(workload1Workers[0].Name)) + + updateWorkloadReplicas(workload0, 1) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads[workload0.Name].Workers).To(HaveLen(1)) + + deleteWorkload(workload0) + deleteWorkload(workload1) + scaler.loadWorkloads(ctx) + Expect(scaler.workloads).NotTo(HaveKey(workload0.Name)) + Expect(scaler.workloads).NotTo(HaveKey(workload1.Name)) + }) + }) + + Context("when loading real time metrics", func() { + It("should update the state of workloads and workers", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + pool := tfEnv.GetGPUPool(0) + workload := createWorkload(pool, 0, 1) + workers := getWorkers(workload) + defer deleteWorkload(workload) + + worker := workers[0].Name + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + ws := scaler.workloads[workload.Name] + now := time.Now() + usage := &metrics.WorkerUsage{ + WorkloadName: workload.Name, + WorkerName: worker, + TflopsUsage: 12.0, + VramUsage: 9000, + Timestamp: now, + } + + scaler.metricsProvider = &FakeMetricsProvider{[]*metrics.WorkerUsage{usage}} + scaler.loadRealTimeMetrics(ctx) + + scalerWorkers := scaler.workloads[workload.Name].Workers + Expect(scalerWorkers[worker].LastTflopsSampleTime).To(Equal(usage.Timestamp)) + Expect(ws.WorkerUsageAggregator.TflopsHistogram.IsEmpty()).To(BeFalse()) + Expect(scalerWorkers[worker].VramPeak).To(Equal(usage.VramUsage)) + Expect(scalerWorkers[worker].LastVramSampleTime).To(Equal(usage.Timestamp)) + Expect(ws.WorkerUsageAggregator.VramHistogram.IsEmpty()).To(BeFalse()) + }) + }) + + Context("when processing workloads", func() { + It("should scale up when the recommended resources exceed the current allocation", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&rec)).To(BeTrue()) + }).Should(Succeed()) + + // Upon reprocessing the workload, it should skip resource updates + scaler.processWorkloads(ctx) + Consistently(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&rec)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should update resources based on auto scaling config", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + workloadState := scaler.workloads[workload.Name] + oldRes := workloadState.Spec.Resources + + // verify IsAutoScalingEnabled + workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = false + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&oldRes)).To(BeTrue()) + }).Should(Succeed()) + + // verify IsTargetResource + workloadState.Spec.AutoScalingConfig.AutoSetResources.Enable = true + workloadState.Spec.AutoScalingConfig.AutoSetResources.TargetResource = "tflops" + scaler.processWorkloads(ctx) + expect := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("16Gi"), + }, + } + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&expect)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should not update resources if recommended resources exceeded quota", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). + Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + rec := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("9999"), + Vram: resource.MustParse("9999Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("9999"), + Vram: resource.MustParse("9999Gi"), + }, + } + + scaler.recommenders[0] = &FakeRecommender{ + Resources: &rec, + } + + workloadState := scaler.workloads[workload.Name] + oldRes := workloadState.Spec.Resources + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&oldRes)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should update resources based on cron scaling rule", func() { + tfEnv := NewTensorFusionEnvBuilder(). + AddPoolWithNodeCount(1).SetGpuCountPerNode(1). 
+ Build() + defer tfEnv.Cleanup() + go mockSchedulerLoop(ctx, cfg) + workload := createWorkload(tfEnv.GetGPUPool(0), 0, 1) + defer deleteWorkload(workload) + + scaler, _ := NewAutoscaler(k8sClient, allocator) + scaler.loadWorkloads(ctx) + + workloadState := scaler.workloads[workload.Name] + + resourcesInRule := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("110"), + Vram: resource.MustParse("110Gi"), + }, + } + + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: resourcesInRule, + }, + } + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&resourcesInRule)).To(BeTrue()) + }).Should(Succeed()) + + // invalidate the rule by updating start and end fields + workloadState.Spec.AutoScalingConfig.CronScalingRules = []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: resourcesInRule, + }, + } + + scaler.processWorkloads(ctx) + originalResources := workloadState.Spec.Resources + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&originalResources)).To(BeTrue()) + }).Should(Succeed()) + + // should not change after cron scaling finish + scaler.processWorkloads(ctx) + Eventually(func(g Gomega) { + res, _ := utils.CurrentResourcesFromAnnotations(getWorkers(workload)[0].Annotations) + g.Expect(res.Equal(&originalResources)).To(BeTrue()) + }).Should(Succeed()) + }) + + It("should merge recomendations based on a larger request value", func() { + recommendations := map[string]*tfv1.Resources{ + "rec1": { + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("10Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("15Gi"), + }, + }, + "rec2": { + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("20Gi"), + }, + }, + } + + final := mergeRecommendations(recommendations) + Expect(final.Equal(&tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("15Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("15"), + Vram: resource.MustParse("20Gi"), + }, + })).To(BeTrue()) + }) + + It("should not update resource if resource is zero", func() { + + }) + }) +}) + +func createWorkload(pool *tfv1.GPUPool, id int, replicas int) *tfv1.TensorFusionWorkload { + GinkgoHelper() + tflopsRequests := resource.MustParse("10") + vramRequests := resource.MustParse("8Gi") + tflopsLimits := resource.MustParse("20") + vramLimits := resource.MustParse("16Gi") + + poolName := pool.Name + key := client.ObjectKey{Namespace: "default", Name: getWorkloadName(id)} + workload := &tfv1.TensorFusionWorkload{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: map[string]string{ + constants.GpuPoolKey: poolName, + }, + }, + Spec: tfv1.WorkloadProfileSpec{ + Replicas: ptr.Int32(int32(replicas)), + PoolName: poolName, + Resources: tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: tflopsRequests, + Vram: vramRequests, + }, + 
Limits: tfv1.Resource{ + Tflops: tflopsLimits, + Vram: vramLimits, + }, + }, + Qos: constants.QoSLevelMedium, + AutoScalingConfig: tfv1.AutoScalingConfig{ + AutoSetResources: tfv1.AutoSetResources{ + Enable: true, + TargetResource: "all", + }, + }, + }, + } + + Expect(k8sClient.Create(ctx, workload)).To(Succeed()) + + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + }).Should(Succeed()) + + checkWorkerPodCount(workload) + + return workload +} + +func checkWorkerPodCount(workload *tfv1.TensorFusionWorkload) { + GinkgoHelper() + podList := &corev1.PodList{} + Eventually(func(g Gomega) { + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) + g.Expect(podList.Items).Should(HaveLen(int(*workload.Spec.Replicas))) + }).Should(Succeed()) +} + +func getWorkloadName(index int) string { + return fmt.Sprintf("workload-%d", index) +} + +func getWorkers(workload *tfv1.TensorFusionWorkload) []*corev1.Pod { + GinkgoHelper() + podList := &corev1.PodList{} + Expect(k8sClient.List(ctx, podList, + client.InNamespace("default"), + client.MatchingLabels{constants.WorkloadKey: workload.Name})).Should(Succeed()) + return lo.Map(podList.Items, func(pod corev1.Pod, _ int) *corev1.Pod { + return &pod + }) +} + +type FakeAllocator struct{} + +type FakeMetricsProvider struct { + Metrics []*metrics.WorkerUsage +} + +func (f *FakeMetricsProvider) GetWorkersMetrics() ([]*metrics.WorkerUsage, error) { + return f.Metrics, nil +} + +func (f *FakeMetricsProvider) GetHistoryMetrics() ([]*metrics.WorkerUsage, error) { + sample := []*metrics.WorkerUsage{} + startTime := time.Now().Add(-8 * 24 * time.Hour) + for day := 0; day < 8; day++ { + for hour := 0; hour < 1; hour++ { + for minute := 0; minute < 60; minute++ { + // idx := day*24 + hour + sample = append(sample, &metrics.WorkerUsage{ + WorkloadName: "workload-0", + WorkerName: fmt.Sprintf("worker-%d", 1), + TflopsUsage: 100.0, + VramUsage: 1 * 1000 * 1000 * 1000, + Timestamp: startTime.Add(time.Duration(day*24+hour)*time.Hour + time.Duration(minute)*time.Minute), + }) + } + } + } + + return sample, nil +} + +type FakeRecommender struct { + *tfv1.Resources +} + +func (f *FakeRecommender) Name() string { + return "Fake" +} + +func (f *FakeRecommender) Recommend(ctx context.Context, workoad *workload.State) (*tfv1.Resources, error) { + return f.Resources, nil +} + +func updateWorkloadReplicas(workload *tfv1.TensorFusionWorkload, replicas int) { + GinkgoHelper() + key := client.ObjectKeyFromObject(workload) + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + workload.Spec.Replicas = ptr.Int32(int32(replicas)) + g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) + }).Should(Succeed()) + + checkWorkerPodCount(workload) +} + +func deleteWorkload(workload *tfv1.TensorFusionWorkload) { + cleanupWorkload(client.ObjectKeyFromObject(workload)) +} + +func cleanupWorkload(key client.ObjectKey) { + GinkgoHelper() + workload := &tfv1.TensorFusionWorkload{} + + if err := k8sClient.Get(ctx, key, workload); err != nil { + if errors.IsNotFound(err) { + return + } + Expect(err).To(HaveOccurred()) + } + + // Set replicas to 0 + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + workload.Spec.Replicas = ptr.Int32(0) + g.Expect(k8sClient.Update(ctx, workload)).To(Succeed()) + }).Should(Succeed()) + + Eventually(func(g Gomega) { + podList := 
&corev1.PodList{} + g.Expect(k8sClient.List(ctx, podList, + client.InNamespace(key.Namespace), + client.MatchingLabels{constants.WorkloadKey: key.Name})).To(Succeed()) + g.Expect(podList.Items).Should(BeEmpty()) + }).Should(Succeed()) + + Expect(k8sClient.Get(ctx, key, workload)).Should(Succeed()) + Expect(k8sClient.Delete(ctx, workload)).To(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, key, workload) + g.Expect(err).Should(HaveOccurred()) + }).Should(Succeed()) +} + +func mockSchedulerLoop(ctx context.Context, cfg *rest.Config) { + ticker := time.NewTicker(50 * time.Millisecond) + clientset, err := kubernetes.NewForConfig(cfg) + if err != nil { + Expect(err).To(Succeed()) + } + for range ticker.C { + select { + case <-ctx.Done(): + return + default: + podList := &corev1.PodList{} + _ = k8sClient.List(ctx, podList) + for _, pod := range podList.Items { + if pod.Spec.NodeName != "" { + continue + } + go scheduleAndStartPod(&pod, clientset) + } + } + } +} + +func scheduleAndStartPod(pod *corev1.Pod, clientset *kubernetes.Clientset) { + // simulate scheduling cycle Filter and Reserve + allocRequest, _, err := allocator.ComposeAllocationRequest(pod) + if errors.IsNotFound(err) { + return + } + Expect(err).To(Succeed()) + gpus, err := allocator.Alloc(&allocRequest) + if err != nil { + // some test cases are expected to fail, just continue + return + } + Expect(gpus).To(HaveLen(int(allocRequest.Count))) + allocator.SyncGPUsToK8s() + + // update pod annotation + Eventually(func(g Gomega) { + latestPod := &corev1.Pod{} + err := k8sClient.Get(ctx, types.NamespacedName{ + Name: pod.Name, + Namespace: pod.Namespace, + }, latestPod) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + + if latestPod.Annotations == nil { + latestPod.Annotations = map[string]string{} + } + latestPod.Annotations[constants.GpuKey] = strings.Join( + lo.Map(gpus, func(gpu *tfv1.GPU, _ int) string { + return gpu.Name + }), ",") + err = k8sClient.Status().Update(ctx, latestPod) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + + // update pod node name + latestPod.Spec.NodeName = gpus[0].Status.NodeSelector[constants.KubernetesHostNameLabel] + + // simulate k8s scheduler binding cycle Bind function + binding := &corev1.Binding{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + Target: corev1.ObjectReference{ + Kind: "Node", + Name: latestPod.Spec.NodeName, + }, + } + + err = clientset.CoreV1().Pods(latestPod.Namespace).Bind(ctx, binding, metav1.CreateOptions{}) + if errors.IsNotFound(err) { + return + } + g.Expect(err).To(Succeed()) + }).Should(Succeed()) + + // simulate kubelet start the pod successfully + patchPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + } + patchPod.Status.Phase = corev1.PodRunning + patchPod.Status.Conditions = append(patchPod.Status.Conditions, corev1.PodCondition{ + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }) + err = k8sClient.Status().Patch(ctx, patchPod, client.MergeFrom(&corev1.Pod{})) + if errors.IsNotFound(err) { + return + } + Expect(err).To(Succeed()) +} diff --git a/internal/autoscaler/metrics/metrics_aggregator.go b/internal/autoscaler/metrics/metrics_aggregator.go new file mode 100644 index 00000000..5ffe51d9 --- /dev/null +++ b/internal/autoscaler/metrics/metrics_aggregator.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "time" + + vpa "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/recommender/util" +) + 
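+// The aggregator below mirrors the VPA recommender's approach: every usage
+// sample is folded into an exponentially-bucketed, exponentially-decaying
+// histogram, so recent samples dominate the percentile estimates while old
+// ones fade out with the configured half-life. A rough, illustrative sketch of
+// how a caller is expected to feed it (the sample fields are the ones defined
+// in this package; the concrete numbers are placeholders):
+//
+//	agg := NewWorkerUsageAggregator()
+//	agg.AddTflopsSample(&WorkerUsage{TflopsUsage: 42, Timestamp: time.Now()})
+//	agg.AddVramSample(&WorkerUsage{VramUsage: 8 << 30, Timestamp: time.Now()})
+//	p95 := agg.TflopsHistogram.Percentile(0.95) // later consumed by the estimators
+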
+const ( + // minSampleWeight is the minimal weight of any sample (prior to including decaying factor) + minSampleWeight = 0.1 + // epsilon is the minimal weight kept in histograms, it should be small enough that old samples + // (just inside AggregationWindowLength) added with minSampleWeight are still kept + epsilon = 0.001 * minSampleWeight + // DefaultAggregationInterval is the default value for AggregationInterval. + DefaultAggregationInterval = time.Hour * 24 + // DefaultHistogramBucketSizeGrowth is the default value for HistogramBucketSizeGrowth. + DefaultHistogramBucketSizeGrowth = 0.05 // Make each bucket 5% larger than the previous one. + // DefaultHistogramDecayHalfLife is the default value for HistogramDecayHalfLife. + DefaultHistogramDecayHalfLife = time.Hour * 24 +) + +type WorkerUsageAggregator struct { + TflopsHistogram vpa.Histogram + VramHistogram vpa.Histogram + FirstSampleStart time.Time + LastSampleStart time.Time + TotalSamplesCount int +} + +func NewWorkerUsageAggregator() *WorkerUsageAggregator { + return &WorkerUsageAggregator{ + TflopsHistogram: vpa.NewDecayingHistogram(histogramOptions(10000.0, 0.1), DefaultHistogramDecayHalfLife), + VramHistogram: vpa.NewDecayingHistogram(histogramOptions(1e12, 1e7), DefaultHistogramDecayHalfLife), + } +} + +func (w *WorkerUsageAggregator) IsEmpty() bool { + if w.TflopsHistogram.IsEmpty() && w.VramHistogram.IsEmpty() { + return true + } + return false +} + +func (w *WorkerUsageAggregator) AddTflopsSample(sample *WorkerUsage) bool { + w.TflopsHistogram.AddSample(float64(sample.TflopsUsage), minSampleWeight, sample.Timestamp) + if sample.Timestamp.After(w.LastSampleStart) { + w.LastSampleStart = sample.Timestamp + } + if w.FirstSampleStart.IsZero() || sample.Timestamp.Before(w.FirstSampleStart) { + w.FirstSampleStart = sample.Timestamp + } + w.TotalSamplesCount++ + return true +} + +func (w *WorkerUsageAggregator) AddVramSample(sample *WorkerUsage) bool { + w.VramHistogram.AddSample(float64(sample.VramUsage), 1.0, sample.Timestamp) + return true +} + +func (w *WorkerUsageAggregator) SubtractVramSample(usage float64, time time.Time) bool { + w.VramHistogram.SubtractSample(usage, 1.0, time) + return true +} + +func histogramOptions(maxValue, firstBucketSize float64) vpa.HistogramOptions { + options, err := vpa.NewExponentialHistogramOptions(maxValue, firstBucketSize, 1.+DefaultHistogramBucketSizeGrowth, epsilon) + if err != nil { + panic("Invalid histogram options") // Should not happen. 
+ } + return options +} diff --git a/internal/autoscaler/metrics/metrics_provider.go b/internal/autoscaler/metrics/metrics_provider.go new file mode 100644 index 00000000..e35f6911 --- /dev/null +++ b/internal/autoscaler/metrics/metrics_provider.go @@ -0,0 +1,100 @@ +package metrics + +import ( + "time" + + "github.com/NexusGPU/tensor-fusion/internal/metrics" + "gorm.io/gorm" +) + +type WorkerUsage struct { + WorkloadName string + WorkerName string + TflopsUsage float64 + VramUsage uint64 + Timestamp time.Time +} + +type Provider interface { + GetWorkersMetrics() ([]*WorkerUsage, error) + GetHistoryMetrics() ([]*WorkerUsage, error) +} + +func NewProvider(db *gorm.DB) Provider { + return &greptimeDBProvider{db: db} +} + +type greptimeDBProvider struct { + db *gorm.DB + lastQueryTime time.Time + // historyLength time.Duration + // historyResolution time.Duration +} + +func (g *greptimeDBProvider) GetWorkersMetrics() ([]*WorkerUsage, error) { + data := []*metrics.HypervisorWorkerUsageMetrics{} + now := time.Now() + // actual meaning: max(avg[10s])[1m] + err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts"). + Where("ts > ? and ts <= ?", g.lastQueryTime.Nanosecond(), now.Nanosecond()). + Group("workload, worker"). + Order("ts asc"). + Find(&data). + Error + + if err != nil { + return nil, err + } + + g.lastQueryTime = now + + workersMetrics := make([]*WorkerUsage, 0, len(data)) + for _, row := range data { + workersMetrics = append(workersMetrics, &WorkerUsage{ + WorkloadName: row.WorkloadName, + WorkerName: row.WorkerName, + TflopsUsage: row.ComputeTflops, + VramUsage: row.VRAMBytes, + Timestamp: row.Timestamp, + }) + } + + return workersMetrics, nil +} + +type hypervisorWorkerUsageMetrics struct { + metrics.HypervisorWorkerUsageMetrics + TimeWindow time.Time `gorm:"column:time_window;index:,class:TIME"` +} + +func (g *greptimeDBProvider) GetHistoryMetrics() ([]*WorkerUsage, error) { + data := []*hypervisorWorkerUsageMetrics{} + now := time.Now() + // TODO: replace using iteration for handling large datasets efficiently + // TODO: supply history resolution to config time window + err := g.db.Select("workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window"). + Where("ts > ? and ts <= ?", now.Add(-time.Hour*24).Nanosecond(), now.Nanosecond()). + Group("workload, worker, time_window"). + Order("time_window asc"). + Find(&data). + Error + + if err != nil { + return nil, err + } + + g.lastQueryTime = now + + workersMetrics := make([]*WorkerUsage, 0, len(data)) + for _, row := range data { + workersMetrics = append(workersMetrics, &WorkerUsage{ + WorkloadName: row.WorkloadName, + WorkerName: row.WorkerName, + TflopsUsage: row.ComputeTflops, + VramUsage: row.VRAMBytes, + Timestamp: row.TimeWindow, + }) + } + + return workersMetrics, nil +} diff --git a/internal/autoscaler/metrics/metrics_provider_test.go b/internal/autoscaler/metrics/metrics_provider_test.go new file mode 100644 index 00000000..916c050d --- /dev/null +++ b/internal/autoscaler/metrics/metrics_provider_test.go @@ -0,0 +1,112 @@ +package metrics + +import ( + "regexp" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/NexusGPU/tensor-fusion/internal/metrics" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "gorm.io/driver/mysql" + "gorm.io/gorm" +) + +var _ = Describe("MetricsProvider", func() { + Context("when getting real time workers metrics", func() { + It("should return metrics for every worker", func() { + db, mock := NewMockDB() + now := time.Now() + fakeMetrics := []metrics.HypervisorWorkerUsageMetrics{ + { + WorkloadName: "workload-0", + WorkerName: "worker-0", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + { + WorkloadName: "workload-1", + WorkerName: "worker-1", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + } + + rows := sqlmock.NewRows([]string{"workload", "worker", "compute_tflops", "memory_bytes", "ts"}) + for _, row := range fakeMetrics { + rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.Timestamp) + } + + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, max(ts) as ts FROM `tf_worker_usage` WHERE ts > ? and ts <= ? GROUP BY workload, worker ORDER BY ts asc")). + WillReturnRows(rows) + provider := &greptimeDBProvider{db: db} + got, _ := provider.GetWorkersMetrics() + Expect(got).To(HaveLen(2)) + Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) + Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) + Expect(got[0].VramUsage).To(Equal(fakeMetrics[0].VRAMBytes)) + Expect(got[0].TflopsUsage).To(Equal(fakeMetrics[0].ComputeTflops)) + Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].Timestamp)) + }) + }) + + Context("when getting history workers metrics", func() { + It("should return metrics based on history length", func() { + db, mock := NewMockDB() + now := time.Now() + fakeMetrics := []hypervisorWorkerUsageMetrics{ + { + HypervisorWorkerUsageMetrics: metrics.HypervisorWorkerUsageMetrics{ + WorkloadName: "workload-0", + WorkerName: "worker-0", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + TimeWindow: now, + }, + { + HypervisorWorkerUsageMetrics: metrics.HypervisorWorkerUsageMetrics{ + WorkloadName: "workload-1", + WorkerName: "worker-1", + ComputeTflops: 10.3, + VRAMBytes: 1 * 1000 * 1000 * 1000, + Timestamp: now, + }, + TimeWindow: now, + }, + } + + rows := sqlmock.NewRows([]string{"workload", "worker", "compute_tflops", "memory_bytes", "time_window"}) + for _, row := range fakeMetrics { + rows.AddRow(row.WorkloadName, row.WorkerName, row.ComputeTflops, row.VRAMBytes, row.TimeWindow) + } + + mock.ExpectQuery(regexp.QuoteMeta("SELECT workload, worker, max(compute_tflops) as compute_tflops, max(memory_bytes) as memory_bytes, date_bin('1 minute'::INTERVAL, ts) as time_window FROM `tf_worker_usage` WHERE ts > ? and ts <= ? GROUP BY workload, worker, time_window ORDER BY time_window asc")). 
+ WillReturnRows(rows) + provider := &greptimeDBProvider{db: db} + got, _ := provider.GetHistoryMetrics() + Expect(got).To(HaveLen(2)) + Expect(got[0].WorkloadName).To(Equal(fakeMetrics[0].WorkloadName)) + Expect(got[0].WorkerName).To(Equal(fakeMetrics[0].WorkerName)) + Expect(got[0].VramUsage).To(Equal(fakeMetrics[0].VRAMBytes)) + Expect(got[0].TflopsUsage).To(Equal(fakeMetrics[0].ComputeTflops)) + Expect(got[0].Timestamp).To(Equal(fakeMetrics[0].TimeWindow)) + }) + }) +}) + +func NewMockDB() (*gorm.DB, sqlmock.Sqlmock) { + GinkgoHelper() + db, mock, err := sqlmock.New() + Expect(err).ToNot(HaveOccurred()) + gormDB, err := gorm.Open(mysql.New(mysql.Config{ + Conn: db, + SkipInitializeWithVersion: true, + }), &gorm.Config{}) + Expect(err).ToNot(HaveOccurred()) + + return gormDB, mock +} diff --git a/internal/autoscaler/recommender/cron_recommender.go b/internal/autoscaler/recommender/cron_recommender.go new file mode 100644 index 00000000..694dc649 --- /dev/null +++ b/internal/autoscaler/recommender/cron_recommender.go @@ -0,0 +1,163 @@ +package recommender + +import ( + "context" + "fmt" + "maps" + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/robfig/cron/v3" + "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +// Utilize these annotations to determine if the configuration has changed +const ( + CronScalingTFLOPSRequestAnnotation = constants.Domain + "/cron-scaling-tflops-request" + CronScalingVRAMRequestAnnotation = constants.Domain + "/cron-scaling-vram-request" + CronScalingTFLOPSLimitAnnotation = constants.Domain + "/cron-scaling-tflops-limit" + CronScalingVRAMLimitAnnotation = constants.Domain + "/cron-scaling-vram-limit" +) + +type CronRecommender struct { + client.Client + parser cron.Parser +} + +func NewCronRecommender(c client.Client) *CronRecommender { + return &CronRecommender{ + Client: c, + parser: cron.NewParser(cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow), + } +} + +func (c *CronRecommender) Name() string { + return "cron" +} + +func (c *CronRecommender) Recommend(ctx context.Context, w *workload.State) (*tfv1.Resources, error) { + log := log.FromContext(ctx) + activeRule, err := c.getActiveCronScalingRule(&w.Spec.AutoScalingConfig) + if err != nil { + return nil, fmt.Errorf("failed to get active cron scaling rule %w", err) + } + + curRes, err := cronScalingResourcesFromAnnotations(w.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to get current resources from workload %s: %v", w.Name, err) + } + + var result *tfv1.Resources + if activeRule == nil { + if curRes == nil { + return nil, nil + } + // revert the resources to those specified in the workload spec + result = w.GetResourcesSpec() + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(&tfv1.Resources{})) + log.Info("cron scaling finished", "workload", w.Name, "resources", result) + } else { + result = &activeRule.DesiredResources + maps.Copy(w.ScalingAnnotations, cronScalingResourcesToAnnotations(result)) + log.Info("cron scaling rule matched", "workload", w.Name, "rule", activeRule.Name, "resources", result) + } + + if curRes != nil && result.Equal(curRes) { + return nil, nil + } + + return result, nil +} + +func cronScalingResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + 
CronScalingTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + CronScalingTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + CronScalingVRAMRequestAnnotation: resources.Requests.Vram.String(), + CronScalingVRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} + +func cronScalingResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {CronScalingTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {CronScalingTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {CronScalingVRAMRequestAnnotation, &result.Requests.Vram}, + {CronScalingVRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func (c *CronRecommender) getActiveCronScalingRule(config *tfv1.AutoScalingConfig) (*tfv1.CronScalingRule, error) { + activeRules := []*tfv1.CronScalingRule{} + + currentTime := time.Now() + + for _, rule := range config.CronScalingRules { + if !rule.Enable || rule.Name == "" || + rule.Start == "" || rule.End == "" { + continue + } + + if rule.Start == rule.End { + return nil, fmt.Errorf("start and end can not same") + } + + startSchedule, err := c.parser.Parse(rule.Start) + if err != nil { + return nil, fmt.Errorf("failed to parse cron rule %s start: %w", rule.Name, err) + } + endSchedule, err := c.parser.Parse(rule.End) + if err != nil { + return nil, fmt.Errorf("failed to parse cron rule %s end: %w", rule.Name, err) + } + + nextStartTime := startSchedule.Next(time.Now()) + nextEndTime := endSchedule.Next(time.Now()) + + isActive := false + if nextStartTime.Before(nextEndTime) { + isActive = currentTime.After(nextStartTime) && currentTime.Before(nextEndTime) + } else { + isActive = currentTime.After(nextStartTime) || currentTime.Before(nextEndTime) + } + + if isActive { + activeRules = append(activeRules, &rule) + } + } + + if len(activeRules) > 1 { + return nil, fmt.Errorf("only one active cron scaling rule is permitted at any given time") + } + + if len(activeRules) == 0 { + return nil, nil + } + + return activeRules[0], nil +} diff --git a/internal/autoscaler/recommender/cron_recommender_test.go b/internal/autoscaler/recommender/cron_recommender_test.go new file mode 100644 index 00000000..5825e309 --- /dev/null +++ b/internal/autoscaler/recommender/cron_recommender_test.go @@ -0,0 +1,188 @@ +package recommender + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" +) + +var _ = Describe("CronRecommender", func() { + ctx := context.TODO() + res := tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, + } + + It("should return recommendation based on the active cron scaling rule", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + }) + + It("should not return recommendation if there is no active cron scaling rule", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) + }) + + It("should not return recommendation if the active cron scaling rule remains unchanged", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + + workload.Annotations = cronScalingResourcesToAnnotations(&res) + + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation).To(BeNil()) + }) + + It("should revert the resources to those specified in the workload spec if the active cron scaling finished", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.Resources = tfv1.Resources{ + Requests: tfv1.Resource{ + Tflops: resource.MustParse("5"), + Vram: resource.MustParse("4Gi"), + }, + Limits: tfv1.Resource{ + Tflops: resource.MustParse("10"), + Vram: resource.MustParse("8Gi"), + }, + } + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + DesiredResources: res, + }, + }, + } + + recommender := NewCronRecommender(nil) + recommendation, _ := recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&res)).To(BeTrue()) + + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "", + End: "", + DesiredResources: res, + }, + }, + } + + workload.Annotations = cronScalingResourcesToAnnotations(&res) + recommendation, _ = recommender.Recommend(ctx, workload) + Expect(recommendation.Equal(&workload.Spec.Resources)).To(BeTrue()) + + workload.Annotations = cronScalingResourcesToAnnotations(&tfv1.Resources{}) + recommendation, _ = recommender.Recommend(ctx, workload) + 
Expect(recommendation).To(BeNil()) + }) + + It("should return error if getting multiple active rules", func() { + workload := workload.NewWorkloadState("test") + workload.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + }, + } + recommender := NewCronRecommender(nil) + _, err := recommender.Recommend(ctx, workload) + Expect(err).To(HaveOccurred()) + }) + + It("should not return cron scaling rule if no config or disable", func() { + asc := tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{}, + } + Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) + asc = tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + {Enable: false}, + }, + } + Expect(NewCronRecommender(nil).getActiveCronScalingRule(&asc)).To(BeNil()) + }) + + It("should return the active cron scaling rule if the current time falls within its scheduled interval", func() { + asc := tfv1.AutoScalingConfig{ + CronScalingRules: []tfv1.CronScalingRule{ + { + Enable: true, + Name: "test", + Start: "0 0 * * *", + End: "59 23 * * *", + }, + }, + } + rule, _ := NewCronRecommender(nil).getActiveCronScalingRule(&asc) + Expect(rule).NotTo(BeNil()) + }) +}) diff --git a/internal/autoscaler/recommender/estimator.go b/internal/autoscaler/recommender/estimator.go new file mode 100644 index 00000000..f1daa06b --- /dev/null +++ b/internal/autoscaler/recommender/estimator.go @@ -0,0 +1,167 @@ +package recommender + +import ( + "math" + "time" + + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" + "k8s.io/apimachinery/pkg/api/resource" +) + +const ( + // MaxResourceAmount is the maximum allowed value of resource amount. + MaxResourceAmount = ResourceAmount(1e14) +) + +type ResourceAmount int64 + +// ResourceAmountMax returns the larger of two resource amounts. +func ResourceAmountMax(amount1, amount2 ResourceAmount) ResourceAmount { + if amount1 > amount2 { + return amount1 + } + return amount2 +} + +func QuantityFromAmount(amount ResourceAmount) resource.Quantity { + return *resource.NewScaledQuantity(int64(amount), 0) +} + +func resourceAmountFromFloat(amount float64) ResourceAmount { + if amount < 0 { + return ResourceAmount(0) + } else if amount > float64(MaxResourceAmount) { + return MaxResourceAmount + } else { + return ResourceAmount(amount) + } +} + +type VramEstimator interface { + GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount +} + +type percentileVramEstimator struct { + percentile float64 +} + +// NewPercentileVramEstimator returns a new percentileVramEstimator that uses provided percentile. +func NewPercentileVramEstimator(percentile float64) VramEstimator { + return &percentileVramEstimator{percentile} +} + +func (e *percentileVramEstimator) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount { + return resourceAmountFromFloat(float64(w.VramHistogram.Percentile(e.percentile))) +} + +type vramMarginEstimator struct { + marginFraction float64 + baseEstimator VramEstimator +} + +// WithvramMargin returns a vramEstimator that adds a margin to the base estimator. 
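+// The percentile, margin and confidence-multiplier estimators are meant to be
+// stacked as decorators. An illustrative upper-bound chain (the concrete
+// multiplier, exponent and percentile values are placeholders here, not
+// necessarily the ones the recommender uses) could look like:
+//
+//	WithVramConfidenceMultiplier(1.0, 1.0,
+//		WithVramMargin(0.15, NewPercentileVramEstimator(0.95)),
+//		24*time.Hour)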
+func WithVramMargin(marginFraction float64, baseEstimator VramEstimator) VramEstimator {
+	return &vramMarginEstimator{marginFraction: marginFraction, baseEstimator: baseEstimator}
+}
+
+// GetVramEstimation returns the vram estimation for the given WorkerUsageAggregator.
+func (e *vramMarginEstimator) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	base := e.baseEstimator.GetVramEstimation(w)
+	margin := resourceAmountFromFloat(float64(base) * e.marginFraction)
+	return base + margin
+}
+
+type vramConfidenceMultiplier struct {
+	multiplier         float64
+	exponent           float64
+	baseEstimator      VramEstimator
+	confidenceInterval time.Duration
+}
+
+// WithVramConfidenceMultiplier returns a VramEstimator that scales the base estimate by a confidence-dependent multiplier.
+func WithVramConfidenceMultiplier(multiplier, exponent float64, baseEstimator VramEstimator, confidenceInterval time.Duration) VramEstimator {
+	return &vramConfidenceMultiplier{
+		multiplier:         multiplier,
+		exponent:           exponent,
+		baseEstimator:      baseEstimator,
+		confidenceInterval: confidenceInterval,
+	}
+}
+
+func (e *vramConfidenceMultiplier) GetVramEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	confidence := getConfidence(w, e.confidenceInterval)
+	base := e.baseEstimator.GetVramEstimation(w)
+	return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent))
+}
+
+type TflopsEstimator interface {
+	GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount
+}
+
+type percentileTflopsEstimator struct {
+	percentile float64
+}
+
+// NewPercentileTflopsEstimator returns a new percentileTflopsEstimator that uses the provided percentile.
+func NewPercentileTflopsEstimator(percentile float64) TflopsEstimator {
+	return &percentileTflopsEstimator{percentile}
+}
+
+func (e *percentileTflopsEstimator) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	return resourceAmountFromFloat(float64(w.TflopsHistogram.Percentile(e.percentile)))
+}
+
+type tflopsMarginEstimator struct {
+	marginFraction float64
+	baseEstimator  TflopsEstimator
+}
+
+// WithTflopsMargin returns a TflopsEstimator that adds a margin to the base estimator.
+func WithTflopsMargin(marginFraction float64, baseEstimator TflopsEstimator) TflopsEstimator {
+	return &tflopsMarginEstimator{marginFraction: marginFraction, baseEstimator: baseEstimator}
+}
+
+// GetTflopsEstimation returns the tflops estimation for the given WorkerUsageAggregator.
+func (e *tflopsMarginEstimator) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	base := e.baseEstimator.GetTflopsEstimation(w)
+	margin := resourceAmountFromFloat(float64(base) * e.marginFraction)
+	return base + margin
+}
+
+type tflopsConfidenceMultiplier struct {
+	multiplier         float64
+	exponent           float64
+	baseEstimator      TflopsEstimator
+	confidenceInterval time.Duration
+}
+
+// WithTflopsConfidenceMultiplier returns a TflopsEstimator that scales the base estimate by a confidence-dependent multiplier.
+func WithTflopsConfidenceMultiplier(multiplier, exponent float64, baseEstimator TflopsEstimator, confidenceInterval time.Duration) TflopsEstimator {
+	return &tflopsConfidenceMultiplier{
+		multiplier:         multiplier,
+		exponent:           exponent,
+		baseEstimator:      baseEstimator,
+		confidenceInterval: confidenceInterval,
+	}
+}
+
+func (e *tflopsConfidenceMultiplier) GetTflopsEstimation(w *metrics.WorkerUsageAggregator) ResourceAmount {
+	confidence := getConfidence(w, e.confidenceInterval)
+	base := e.baseEstimator.GetTflopsEstimation(w)
+	return resourceAmountFromFloat(float64(base) * math.Pow(1.+e.multiplier/confidence, e.exponent))
+}
+
+// getConfidence returns a non-negative real number that heuristically measures how much
+// confidence the history aggregated in the WorkerUsageAggregator provides.
+// For a workload producing a steady stream of samples at a rate of 1 sample per minute
+// over N confidence intervals (24h by default), this metric is equal to N.
+// This implementation is a simple heuristic that looks at the total sample count
+// and at the time between the first and the last sample.
+func getConfidence(w *metrics.WorkerUsageAggregator, confidenceInterval time.Duration) float64 {
+	// Distance between the first and the last observed sample time, measured in confidence intervals.
+	lifespanInIntervals := float64(w.LastSampleStart.Sub(w.FirstSampleStart)) / float64(confidenceInterval)
+	// Total count of samples normalized such that it equals the number of confidence intervals
+	// for a frequency of 1 sample/minute.
+	samplesAmount := float64(w.TotalSamplesCount) / confidenceInterval.Minutes()
+	return math.Min(lifespanInIntervals, samplesAmount)
+}
diff --git a/internal/autoscaler/recommender/percentile_recommender.go b/internal/autoscaler/recommender/percentile_recommender.go
new file mode 100644
index 00000000..b08113e5
--- /dev/null
+++ b/internal/autoscaler/recommender/percentile_recommender.go
@@ -0,0 +1,224 @@
+package recommender
+
+import (
+	"context"
+	"fmt"
+	"math/big"
+	"strconv"
+	"time"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload"
+	"k8s.io/apimachinery/pkg/api/resource"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+const (
+	// Fraction of usage added as the safety margin to the recommended request
+	defaultRequestMarginFraction = 0.15
+	// Vram usage percentile that will be used as a base for the vram target recommendation. Doesn't affect the vram lower or upper bound.
+	defaultTargetVramPercentile = 0.9
+	// Vram usage percentile that will be used for the lower bound on the vram recommendation.
+	defaultLowerBoundVramPercentile = 0.5
+	// Vram usage percentile that will be used for the upper bound on the vram recommendation.
+	defaultUpperBoundVramPercentile = 0.95
+	// Tflops usage percentile that will be used as a base for the tflops target recommendation. Doesn't affect the tflops lower or upper bound.
+	defaultTargetTflopsPercentile = 0.9
+	// Tflops usage percentile that will be used for the lower bound on the tflops recommendation.
+ defaultLowerBoundTflopsPercentile = 0.5 + // Tflops usage percentile that will be used for the upper bound on tflops recommendation. + defaultUpperBoundTflopsPercentile = 0.95 + // The time interval used for computing the confidence multiplier for the lower and upper bound. Default: 24h + defaultConfidenceInterval = time.Hour * 24 +) + +var defaultPercentileConfig = PercentileConfig{ + TargetTflopsPercentile: defaultTargetTflopsPercentile, + LowerBoundTflopsPercentile: defaultLowerBoundTflopsPercentile, + UpperBoundTflopsPercentile: defaultUpperBoundTflopsPercentile, + TargetVramPercentile: defaultTargetVramPercentile, + LowerBoundVramPercentile: defaultLowerBoundVramPercentile, + UpperBoundVramPercentile: defaultUpperBoundVramPercentile, + RequestMarginFraction: defaultRequestMarginFraction, + ConfidenceInterval: defaultConfidenceInterval, +} + +type RecommendedResources struct { + LowerBoundTflops resource.Quantity + TargetTflops resource.Quantity + UpperBoundTflops resource.Quantity + LowerBoundVram resource.Quantity + TargetVram resource.Quantity + UpperBoundVram resource.Quantity +} + +type PercentileConfig struct { + TargetTflopsPercentile float64 + LowerBoundTflopsPercentile float64 + UpperBoundTflopsPercentile float64 + TargetVramPercentile float64 + LowerBoundVramPercentile float64 + UpperBoundVramPercentile float64 + RequestMarginFraction float64 + ConfidenceInterval time.Duration +} + +type PercentileRecommender struct { + lowerBoundTflops TflopsEstimator + targetTflops TflopsEstimator + upperBoundTflops TflopsEstimator + lowerBoundVram VramEstimator + targetVram VramEstimator + upperBoundVram VramEstimator +} + +func NewPercentileRecommender() *PercentileRecommender { + return &PercentileRecommender{} +} + +func (p *PercentileRecommender) Name() string { + return "percentile" +} + +func (p *PercentileRecommender) Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) { + log := log.FromContext(ctx) + aggregator := workload.WorkerUsageAggregator + if aggregator.IsEmpty() { + return nil, nil + } + + curRes, err := workload.GetCurrentResourcesSpec() + if err != nil { + return nil, fmt.Errorf("failed to get current resources from workload %s: %v", workload.Name, err) + } + + // TODO: cache config + p.createEstimatorsFromConfig(p.getPercentileConfig(&workload.Spec.AutoScalingConfig.AutoSetResources)) + rr := &RecommendedResources{ + LowerBoundTflops: QuantityFromAmount(p.lowerBoundTflops.GetTflopsEstimation(aggregator)), + TargetTflops: QuantityFromAmount(p.targetTflops.GetTflopsEstimation(aggregator)), + UpperBoundTflops: QuantityFromAmount(p.upperBoundTflops.GetTflopsEstimation(aggregator)), + LowerBoundVram: QuantityFromAmount(p.lowerBoundVram.GetVramEstimation(aggregator)), + TargetVram: QuantityFromAmount(p.targetVram.GetVramEstimation(aggregator)), + UpperBoundVram: QuantityFromAmount(p.upperBoundVram.GetVramEstimation(aggregator)), + } + + log.Info("recommendation", "workload", workload.Name, "recommender", p.Name(), "resources", rr) + + result := &tfv1.Resources{} + if curRes.Requests.Tflops.Cmp(rr.LowerBoundTflops) < 0 || + curRes.Requests.Tflops.Cmp(rr.UpperBoundTflops) > 0 { + result.Requests.Tflops = rr.TargetTflops + targetLimit := getProportionalLimit(&curRes.Limits.Tflops, &curRes.Requests.Tflops, &rr.TargetTflops) + if targetLimit == nil { + return nil, fmt.Errorf("failed to get tflops limit from workload %s", workload.Name) + } + result.Limits.Tflops = *targetLimit + } + + if curRes.Requests.Vram.Cmp(rr.LowerBoundVram) < 0 || + 
curRes.Requests.Vram.Cmp(rr.UpperBoundVram) > 0 { + result.Requests.Vram = rr.TargetVram + targetLimit := getProportionalLimit(&curRes.Limits.Vram, &curRes.Requests.Vram, &rr.TargetVram) + if targetLimit == nil { + return nil, fmt.Errorf("failed to get vram limit from workload %s", workload.Name) + } + result.Limits.Vram = *targetLimit + } + + if result.Equal(curRes) { + return nil, nil + } + + return result, nil +} + +func (p *PercentileRecommender) getPercentileConfig(asr *tfv1.AutoSetResources) *PercentileConfig { + cfg := defaultPercentileConfig + + if asr == nil { + return &cfg + } + + fields := []struct { + val string + dst *float64 + }{ + {asr.TargetTflopsPercentile, &cfg.TargetTflopsPercentile}, + {asr.LowerBoundTflopsPercentile, &cfg.LowerBoundTflopsPercentile}, + {asr.UpperBoundTflopsPercentile, &cfg.UpperBoundTflopsPercentile}, + {asr.TargetVramPercentile, &cfg.TargetVramPercentile}, + {asr.LowerBoundVramPercentile, &cfg.LowerBoundVramPercentile}, + {asr.UpperBoundVramPercentile, &cfg.UpperBoundVramPercentile}, + {asr.RequestMarginFraction, &cfg.RequestMarginFraction}, + } + for _, f := range fields { + if f.val == "" { + continue + } + if v, err := strconv.ParseFloat(f.val, 64); err == nil { + *f.dst = v + } + } + + if asr.ConfidenceInterval != "" { + if d, err := time.ParseDuration(asr.ConfidenceInterval); err == nil { + cfg.ConfidenceInterval = d + } + } + + return &cfg +} + +func (p *PercentileRecommender) createEstimatorsFromConfig(config *PercentileConfig) { + targetTflops := NewPercentileTflopsEstimator(config.TargetTflopsPercentile) + lowerBoundTflops := NewPercentileTflopsEstimator(config.LowerBoundTflopsPercentile) + upperBoundTflops := NewPercentileTflopsEstimator(config.UpperBoundTflopsPercentile) + + targetTflops = WithTflopsMargin(config.RequestMarginFraction, targetTflops) + lowerBoundTflops = WithTflopsMargin(config.RequestMarginFraction, lowerBoundTflops) + upperBoundTflops = WithTflopsMargin(config.RequestMarginFraction, upperBoundTflops) + + upperBoundTflops = WithTflopsConfidenceMultiplier(1.0, 1.0, upperBoundTflops, config.ConfidenceInterval) + lowerBoundTflops = WithTflopsConfidenceMultiplier(0.001, -2.0, lowerBoundTflops, config.ConfidenceInterval) + + targetVram := NewPercentileVramEstimator(config.TargetVramPercentile) + lowerBoundVram := NewPercentileVramEstimator(config.LowerBoundVramPercentile) + upperBoundVram := NewPercentileVramEstimator(config.UpperBoundVramPercentile) + + targetVram = WithVramMargin(config.RequestMarginFraction, targetVram) + lowerBoundVram = WithVramMargin(config.RequestMarginFraction, lowerBoundVram) + upperBoundVram = WithVramMargin(config.RequestMarginFraction, upperBoundVram) + + upperBoundVram = WithVramConfidenceMultiplier(1.0, 1.0, upperBoundVram, config.ConfidenceInterval) + lowerBoundVram = WithVramConfidenceMultiplier(0.001, -2.0, lowerBoundVram, config.ConfidenceInterval) + + *p = PercentileRecommender{ + lowerBoundTflops: lowerBoundTflops, + targetTflops: targetTflops, + upperBoundTflops: upperBoundTflops, + lowerBoundVram: lowerBoundVram, + targetVram: targetVram, + upperBoundVram: upperBoundVram, + } +} + +func getProportionalLimit(originalLimit, originalRequest, recommendedRequest *resource.Quantity) *resource.Quantity { + if originalLimit == nil || originalLimit.IsZero() || + originalRequest == nil || originalRequest.IsZero() || + recommendedRequest == nil || recommendedRequest.IsZero() { + return nil + } + + originalValue := big.NewInt(originalLimit.Value()) + scaleBaseValue := 
big.NewInt(originalRequest.Value()) + scaleResultValue := big.NewInt(recommendedRequest.Value()) + var scaledOriginal big.Int + scaledOriginal.Mul(originalValue, scaleResultValue) + scaledOriginal.Div(&scaledOriginal, scaleBaseValue) + if scaledOriginal.IsInt64() { + return resource.NewQuantity(scaledOriginal.Int64(), originalLimit.Format) + } + + return nil +} diff --git a/internal/autoscaler/recommender/percentile_recommender_test.go b/internal/autoscaler/recommender/percentile_recommender_test.go new file mode 100644 index 00000000..fd6fe8a4 --- /dev/null +++ b/internal/autoscaler/recommender/percentile_recommender_test.go @@ -0,0 +1,65 @@ +package recommender + +import ( + "time" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Percentile Recommender", func() { + It("should return default config when no AutoScalingConfig is set", func() { + cfg := NewPercentileRecommender().getPercentileConfig(nil) + Expect(cfg).ToNot(BeNil()) + Expect(*cfg).To(Equal(defaultPercentileConfig)) + }) + + It("should parse float fields from AutoSetResources", func() { + asr := &tfv1.AutoSetResources{ + TargetTflopsPercentile: "0.8", + LowerBoundTflopsPercentile: "0.1", + UpperBoundTflopsPercentile: "0.95", + TargetVramPercentile: "0.7", + LowerBoundVramPercentile: "0.2", + UpperBoundVramPercentile: "0.9", + RequestMarginFraction: "0.15", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.TargetTflopsPercentile).To(Equal(0.8)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(0.1)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.95)) + Expect(cfg.TargetVramPercentile).To(Equal(0.7)) + Expect(cfg.LowerBoundVramPercentile).To(Equal(0.2)) + Expect(cfg.UpperBoundVramPercentile).To(Equal(0.9)) + Expect(cfg.RequestMarginFraction).To(Equal(0.15)) + }) + + It("should ignore invalid float fields and keep defaults", func() { + asr := &tfv1.AutoSetResources{ + TargetTflopsPercentile: "not-a-float", + LowerBoundTflopsPercentile: "", + UpperBoundTflopsPercentile: "0.99", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.TargetTflopsPercentile).To(Equal(defaultPercentileConfig.TargetTflopsPercentile)) + Expect(cfg.LowerBoundTflopsPercentile).To(Equal(defaultPercentileConfig.LowerBoundTflopsPercentile)) + Expect(cfg.UpperBoundTflopsPercentile).To(Equal(0.99)) + }) + + It("should parse ConfidenceInterval if valid", func() { + asr := &tfv1.AutoSetResources{ + ConfidenceInterval: "30m", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.ConfidenceInterval).To(Equal(30 * time.Minute)) + }) + + It("should ignore invalid ConfidenceInterval and keep default", func() { + asr := &tfv1.AutoSetResources{ + ConfidenceInterval: "not-a-duration", + } + cfg := NewPercentileRecommender().getPercentileConfig(asr) + Expect(cfg.ConfidenceInterval).To(Equal(defaultPercentileConfig.ConfidenceInterval)) + }) +}) diff --git a/internal/autoscaler/recommender/recommender.go b/internal/autoscaler/recommender/recommender.go new file mode 100644 index 00000000..3248ad6c --- /dev/null +++ b/internal/autoscaler/recommender/recommender.go @@ -0,0 +1,19 @@ +package recommender + +import ( + "context" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/workload" +) + +const ( + Percentile = "percentile" + Cron = "cron" +) + +// Interface defines the contract for resource recommendation strategies used by the autoscaler. 
+type Interface interface { + Name() string + Recommend(ctx context.Context, workload *workload.State) (*tfv1.Resources, error) +} diff --git a/internal/autoscaler/recommender/recommender_suite_test.go b/internal/autoscaler/recommender/recommender_suite_test.go new file mode 100644 index 00000000..7177cf1d --- /dev/null +++ b/internal/autoscaler/recommender/recommender_suite_test.go @@ -0,0 +1,13 @@ +package recommender_test + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestRecommender(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Recommender Suite") +} diff --git a/internal/autoscaler/workload/handler.go b/internal/autoscaler/workload/handler.go new file mode 100644 index 00000000..bda6768d --- /dev/null +++ b/internal/autoscaler/workload/handler.go @@ -0,0 +1,153 @@ +package workload + +import ( + "context" + "fmt" + "maps" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "github.com/NexusGPU/tensor-fusion/internal/gpuallocator" + "github.com/NexusGPU/tensor-fusion/internal/utils" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +type Handler interface { + UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) + ApplyRecommendationToWorkload(ctx context.Context, workloadState *State, recommendation *tfv1.Resources) error +} + +type handler struct { + client.Client + allocator *gpuallocator.GpuAllocator +} + +func NewHandler(client client.Client, allocator *gpuallocator.GpuAllocator) Handler { + return &handler{ + Client: client, + allocator: allocator, + } +} + +func (h *handler) UpdateWorkloadState(ctx context.Context, workloadState *State, workload *tfv1.TensorFusionWorkload) { + workloadState.Namespace = workload.Namespace + workloadState.Spec = workload.Spec + workloadState.Annotations = workload.Annotations + + workerList := &corev1.PodList{} + if err := h.List(ctx, workerList, + client.InNamespace(workloadState.Namespace), + client.MatchingLabels{constants.WorkloadKey: workloadState.Name}); err != nil { + log.FromContext(ctx).Error(err, "failed to list workers") + return + } + workloadState.updateWorkers(workerList) +} + +func (h *handler) ApplyRecommendationToWorkload(ctx context.Context, workload *State, recommendation *tfv1.Resources) error { + if err := h.updateAutoScalingAnnotations(ctx, workload, recommendation); err != nil { + return fmt.Errorf("failed to update auto scaling annotations: %v", err) + } + + if !workload.IsAutoSetResourcesEnabled() { + return nil + } + + workerList := &corev1.PodList{} + if err := h.List(ctx, workerList, + client.InNamespace(workload.Namespace), + client.MatchingLabels{constants.WorkloadKey: workload.Name}); err != nil { + return fmt.Errorf("failed to list workers: %v", err) + } + + for _, worker := range workerList.Items { + if !worker.DeletionTimestamp.IsZero() { + continue + } + if err := h.applyRecommendationToWorker(ctx, workload, &worker, recommendation); err != nil { + return fmt.Errorf("failed to update worker %s resources: %v", worker.Name, err) + } + } + + return nil +} + +func (h *handler) updateAutoScalingAnnotations( + ctx context.Context, + state *State, + recommendation *tfv1.Resources) error { + workload := &tfv1.TensorFusionWorkload{} + if err := h.Get(ctx, client.ObjectKey{Namespace: state.Namespace, Name: state.Name}, workload); err != nil { + return fmt.Errorf("failed to get 
workload: %v", err) + } + + // record current and last resources + if workload.Annotations == nil { + workload.Annotations = map[string]string{} + } + patch := client.MergeFrom(workload.DeepCopy()) + maps.Copy(workload.Annotations, utils.CurrentResourcesToAnnotations(recommendation)) + maps.Copy(workload.Annotations, state.ScalingAnnotations) + if err := h.Patch(ctx, workload, patch); err != nil { + return fmt.Errorf("failed to patch workload %s: %v", workload.Name, err) + } + + state.Annotations = workload.Annotations + return nil +} + +func (h *handler) applyRecommendationToWorker(ctx context.Context, workload *State, worker *corev1.Pod, recommendation *tfv1.Resources) error { + log := log.FromContext(ctx) + + curRes, err := utils.CurrentResourcesFromAnnotations(worker.Annotations) + if err != nil { + return fmt.Errorf("failed to get current worker resources: %v", err) + } + if curRes != nil && curRes.Equal(recommendation) { + return nil + } + + annotationsToUpdate := utils.CurrentResourcesToAnnotations(recommendation) + if !workload.ShouldScaleResource(tfv1.ResourceTflops) { + delete(annotationsToUpdate, constants.TFLOPSRequestAnnotation) + delete(annotationsToUpdate, constants.TFLOPSLimitAnnotation) + } + if !workload.ShouldScaleResource(tfv1.ResourceVram) { + delete(annotationsToUpdate, constants.VRAMRequestAnnotation) + delete(annotationsToUpdate, constants.VRAMLimitAnnotation) + } + + if len(annotationsToUpdate) <= 0 { + return nil + } + + isScaleUp := false + if _, ok := annotationsToUpdate[constants.TFLOPSRequestAnnotation]; ok { + isScaleUp = recommendation.Requests.Tflops.Cmp(curRes.Requests.Tflops) > 0 + } else { + isScaleUp = recommendation.Requests.Vram.Cmp(curRes.Requests.Vram) > 0 + } + + adjustRequest := &tfv1.AdjustRequest{ + PodUID: string(worker.UID), + IsScaleUp: isScaleUp, + NewRequest: recommendation.Requests, + NewLimit: recommendation.Limits, + } + if _, err := h.allocator.AdjustAllocation(ctx, *adjustRequest, true); err != nil { + return fmt.Errorf("failed to adjust allocation: %v", err) + } + log.Info("adjust allocation successfully", "worker", worker.Name, "adjustRequest", adjustRequest) + + patch := client.MergeFrom(worker.DeepCopy()) + maps.Copy(worker.Annotations, annotationsToUpdate) + if err := h.Patch(ctx, worker, patch); err != nil { + return fmt.Errorf("failed to patch worker %s: %v", worker.Name, err) + } + + log.Info("apply recommendation successfully", "worker", worker.Name, "recommendation", recommendation, "currentResources", curRes) + + return nil +} diff --git a/internal/autoscaler/workload/worker.go b/internal/autoscaler/workload/worker.go new file mode 100644 index 00000000..8ad57ec3 --- /dev/null +++ b/internal/autoscaler/workload/worker.go @@ -0,0 +1,74 @@ +package workload + +import ( + "time" + + "github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics" +) + +type WorkerState struct { + Name string + WorkloadName string + LastTflopsSampleTime time.Time + + VramPeak uint64 + LastVramSampleTime time.Time + VramWindowEnd time.Time +} + +func NewWorkerState(name string, workloadName string) *WorkerState { + return &WorkerState{ + Name: name, + WorkloadName: workloadName, + LastTflopsSampleTime: time.Time{}, + LastVramSampleTime: time.Time{}, + VramWindowEnd: time.Time{}, + } +} + +func (w *WorkerState) AddSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool { + w.AddTflopsSample(aggregator, sample) + w.AddVramSample(aggregator, sample) + return true +} + +func (w *WorkerState) AddTflopsSample(aggregator 
*metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool {
+	if sample.Timestamp.Before(w.LastTflopsSampleTime) {
+		return false
+	}
+	aggregator.AddTflopsSample(sample)
+	w.LastTflopsSampleTime = sample.Timestamp
+	return true
+}
+
+func (w *WorkerState) AddVramSample(aggregator *metrics.WorkerUsageAggregator, sample *metrics.WorkerUsage) bool {
+	ts := sample.Timestamp
+	if ts.Before(w.LastVramSampleTime) {
+		return false
+	}
+	w.LastVramSampleTime = ts
+	if w.VramWindowEnd.IsZero() {
+		w.VramWindowEnd = ts
+	}
+
+	// Only the peak vram usage of each aggregation window is fed into the aggregator:
+	// a higher peak inside the current window replaces the previously recorded one,
+	// while a sample past the window end starts a new window with a fresh peak.
+	addNewPeak := false
+	if ts.Before(w.VramWindowEnd) {
+		if w.VramPeak != 0 && sample.VramUsage > w.VramPeak {
+			aggregator.SubtractVramSample(float64(w.VramPeak), w.VramWindowEnd)
+			addNewPeak = true
+		}
+	} else {
+		aggregationInterval := metrics.DefaultAggregationInterval
+		shift := ts.Sub(w.VramWindowEnd).Truncate(aggregationInterval) + aggregationInterval
+		w.VramWindowEnd = w.VramWindowEnd.Add(shift)
+		w.VramPeak = 0
+		addNewPeak = true
+	}
+
+	if addNewPeak {
+		aggregator.AddVramSample(sample)
+		w.VramPeak = sample.VramUsage
+	}
+
+	return true
+}
diff --git a/internal/autoscaler/workload/workload.go b/internal/autoscaler/workload/workload.go
new file mode 100644
index 00000000..4e2063f9
--- /dev/null
+++ b/internal/autoscaler/workload/workload.go
@@ -0,0 +1,90 @@
+package workload
+
+import (
+	"fmt"
+	"strings"
+
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/autoscaler/metrics"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
+	corev1 "k8s.io/api/core/v1"
+)
+
+type State struct {
+	Namespace             string
+	Name                  string
+	Annotations           map[string]string
+	ScalingAnnotations    map[string]string
+	Spec                  tfv1.WorkloadProfileSpec
+	Workers               map[string]*WorkerState
+	WorkerUsageAggregator *metrics.WorkerUsageAggregator
+}
+
+func NewWorkloadState(name string) *State {
+	return &State{
+		Name:                  name,
+		Workers:               make(map[string]*WorkerState),
+		ScalingAnnotations:    make(map[string]string),
+		WorkerUsageAggregator: metrics.NewWorkerUsageAggregator(),
+	}
+}
+
+func (w *State) GetLastResourcesSpec() (*tfv1.Resources, error) {
+	return utils.LastResourcesFromAnnotations(w.Annotations)
+}
+
+func (w *State) GetResourcesSpec() *tfv1.Resources {
+	return &w.Spec.Resources
+}
+
+func (w *State) GetCurrentResourcesSpec() (*tfv1.Resources, error) {
+	resources, err := utils.CurrentResourcesFromAnnotations(w.Annotations)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get resources from annotations: %v", err)
+	}
+	if resources == nil {
+		return &w.Spec.Resources, nil
+	}
+	return resources, nil
+}
+
+func (w *State) SetScalingAnnotation(key string, value string) {
+	w.ScalingAnnotations[key] = value
+}
+
+func (w *State) IsAutoSetResourcesEnabled() bool {
+	return w.Spec.AutoScalingConfig.AutoSetResources.Enable
+}
+
+func (w *State) ShouldScaleResource(name tfv1.ResourceName) bool {
+	target := w.Spec.AutoScalingConfig.AutoSetResources.TargetResource
+	return strings.EqualFold(target, "all") || strings.EqualFold(string(name), target)
+}
+
+func (w *State) updateWorkers(podList *corev1.PodList) {
+	observedWorkers := map[string]bool{}
+	for _, worker := range podList.Items {
+		if !worker.DeletionTimestamp.IsZero() {
+			continue
+		}
+		if _, exists := w.Workers[worker.Name]; !exists {
+			w.Workers[worker.Name] = NewWorkerState(worker.Name, w.Name)
+		}
+		observedWorkers[worker.Name] = true
+	}
+
+	for key, worker := range w.Workers {
+		if worker.WorkloadName == w.Name && !observedWorkers[key] {
+			delete(w.Workers, key)
+		}
+	}
+}
+
+func 
(w *State) AddSample(sample *metrics.WorkerUsage) {
+	worker, exists := w.Workers[sample.WorkerName]
+	if !exists {
+		worker = NewWorkerState(sample.WorkerName, sample.WorkloadName)
+		w.Workers[sample.WorkerName] = worker
+	}
+	worker.AddSample(w.WorkerUsageAggregator, sample)
+}
diff --git a/internal/autoscaler/workload/workload_suite_test.go b/internal/autoscaler/workload/workload_suite_test.go
new file mode 100644
index 00000000..cd3451b6
--- /dev/null
+++ b/internal/autoscaler/workload/workload_suite_test.go
@@ -0,0 +1,13 @@
+package workload_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestWorkload(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Workload Suite")
+}
diff --git a/internal/autoscaler/workload/workload_test.go b/internal/autoscaler/workload/workload_test.go
new file mode 100644
index 00000000..bd18e9f7
--- /dev/null
+++ b/internal/autoscaler/workload/workload_test.go
@@ -0,0 +1,84 @@
+package workload
+
+import (
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/utils"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"k8s.io/apimachinery/pkg/api/resource"
+)
+
+var _ = Describe("Workload", func() {
+	It("should correctly determine if a resource is the target based on config", func() {
+		ws := NewWorkloadState("test")
+
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeFalse())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeFalse())
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{TargetResource: "all"},
+		}
+
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue())
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{TargetResource: "tflops"},
+		}
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeTrue())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeFalse())
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{TargetResource: "vram"},
+		}
+		Expect(ws.ShouldScaleResource(tfv1.ResourceTflops)).To(BeFalse())
+		Expect(ws.ShouldScaleResource(tfv1.ResourceVram)).To(BeTrue())
+	})
+
+	It("should correctly determine if auto set resources is enabled based on config", func() {
+		ws := NewWorkloadState("test")
+
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{Enable: true},
+		}
+		Expect(ws.IsAutoSetResourcesEnabled()).To(BeTrue())
+		ws.Spec.AutoScalingConfig = tfv1.AutoScalingConfig{
+			AutoSetResources: tfv1.AutoSetResources{Enable: false},
+		}
+		Expect(ws.IsAutoSetResourcesEnabled()).To(BeFalse())
+	})
+
+	It("should return last resources spec from the annotations", func() {
+		ws := NewWorkloadState("test")
+		expect := tfv1.Resources{
+			Requests: tfv1.Resource{
+				Tflops: resource.MustParse("10"),
+				Vram:   resource.MustParse("8Gi"),
+			},
+			Limits: tfv1.Resource{
+				Tflops: resource.MustParse("20"),
+				Vram:   resource.MustParse("16Gi"),
+			},
+		}
+		ws.Annotations = utils.LastResourcesToAnnotations(&expect)
+		got, _ := ws.GetLastResourcesSpec()
+		Expect(got.Equal(&expect)).To(BeTrue())
+	})
+
+	It("should return current resources spec from the annotations", func() {
+		ws := NewWorkloadState("test")
+		expect := tfv1.Resources{
+			Requests: tfv1.Resource{
+				Tflops: resource.MustParse("10"),
+				Vram:   resource.MustParse("8Gi"),
+			},
+			Limits: tfv1.Resource{
+				Tflops: 
resource.MustParse("20"), + Vram: resource.MustParse("16Gi"), + }, + } + ws.Annotations = utils.CurrentResourcesToAnnotations(&expect) + got, _ := ws.GetCurrentResourcesSpec() + Expect(got.Equal(&expect)) + }) +}) diff --git a/internal/constants/constants.go b/internal/constants/constants.go index dd2810b3..2183f852 100644 --- a/internal/constants/constants.go +++ b/internal/constants/constants.go @@ -55,6 +55,10 @@ const ( VRAMRequestAnnotation = Domain + "/vram-request" TFLOPSLimitAnnotation = Domain + "/tflops-limit" VRAMLimitAnnotation = Domain + "/vram-limit" + LastTFLOPSRequestAnnotation = Domain + "/last-tflops-request" + LastVRAMRequestAnnotation = Domain + "/last-vram-request" + LastTFLOPSLimitAnnotation = Domain + "/last-tflops-limit" + LastVRAMLimitAnnotation = Domain + "/last-vram-limit" WorkloadProfileAnnotation = Domain + "/client-profile" InjectContainerAnnotation = Domain + "/inject-container" IsLocalGPUAnnotation = Domain + "/is-local-gpu" @@ -73,9 +77,8 @@ const ( GenPortNumberAnnotation = Domain + "/port-number" TensorFusionWorkerPortNumber = 8000 - AutoScaleLimitsAnnotation = Domain + "/auto-limits" - AutoScaleRequestsAnnotation = Domain + "/auto-requests" - AutoScaleReplicasAnnotation = Domain + "/auto-replicas" + AutoScaleResourcesAnnotation = Domain + "/auto-resources" + AutoScaleReplicasAnnotation = Domain + "/auto-replicas" GpuReleasedAnnotation = Domain + "/gpu-released" diff --git a/internal/utils/resource.go b/internal/utils/resource.go new file mode 100644 index 00000000..855d3ce3 --- /dev/null +++ b/internal/utils/resource.go @@ -0,0 +1,87 @@ +package utils + +import ( + "fmt" + + tfv1 "github.com/NexusGPU/tensor-fusion/api/v1" + "github.com/NexusGPU/tensor-fusion/internal/constants" + "k8s.io/apimachinery/pkg/api/resource" +) + +func CurrentResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.TFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.TFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.VRAMRequestAnnotation, &result.Requests.Vram}, + {constants.VRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func LastResourcesFromAnnotations(annotations map[string]string) (*tfv1.Resources, error) { + result := tfv1.Resources{} + resInfo := []struct { + key string + dst *resource.Quantity + }{ + {constants.LastTFLOPSRequestAnnotation, &result.Requests.Tflops}, + {constants.LastTFLOPSLimitAnnotation, &result.Limits.Tflops}, + {constants.LastVRAMRequestAnnotation, &result.Requests.Vram}, + {constants.LastVRAMLimitAnnotation, &result.Limits.Vram}, + } + for _, info := range resInfo { + annotation, ok := annotations[info.key] + if !ok { + continue + } + q, err := resource.ParseQuantity(annotation) + if err != nil { + return nil, fmt.Errorf("failed to parse %s: %v", info.key, err) + } + *info.dst = q + } + + if result.IsZero() { + return nil, nil + } + + return &result, nil +} + +func CurrentResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + constants.TFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + 
constants.TFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + constants.VRAMRequestAnnotation: resources.Requests.Vram.String(), + constants.VRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} + +func LastResourcesToAnnotations(resources *tfv1.Resources) map[string]string { + return map[string]string{ + constants.LastTFLOPSRequestAnnotation: resources.Requests.Tflops.String(), + constants.LastTFLOPSLimitAnnotation: resources.Limits.Tflops.String(), + constants.LastVRAMRequestAnnotation: resources.Requests.Vram.String(), + constants.LastVRAMLimitAnnotation: resources.Limits.Vram.String(), + } +} diff --git a/internal/webhook/v1/tf_parser.go b/internal/webhook/v1/tf_parser.go index 23c9104e..9fb8d6e6 100644 --- a/internal/webhook/v1/tf_parser.go +++ b/internal/webhook/v1/tf_parser.go @@ -134,13 +134,9 @@ func ParseTensorFusionInfo( } func parseAutoScalingAnnotations(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) { - autoLimits, ok := pod.Annotations[constants.AutoScaleLimitsAnnotation] - if ok && autoLimits == constants.TrueStringValue { - workloadProfile.Spec.AutoScalingConfig.AutoSetLimits.Enable = true - } - autoRequests, ok := pod.Annotations[constants.AutoScaleRequestsAnnotation] - if ok && autoRequests == constants.TrueStringValue { - workloadProfile.Spec.AutoScalingConfig.AutoSetRequests.Enable = true + autoResources, ok := pod.Annotations[constants.AutoScaleResourcesAnnotation] + if ok && autoResources == constants.TrueStringValue { + workloadProfile.Spec.AutoScalingConfig.AutoSetResources.Enable = true } autoReplicas, ok := pod.Annotations[constants.AutoScaleReplicasAnnotation] if ok && autoReplicas == constants.TrueStringValue {
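
For context on how the lower/upper bounds produced by the new estimators behave, here is a standalone sketch (not part of the patch). It reuses the formulas from estimator.go and the multiplier/exponent constants wired up in createEstimatorsFromConfig (upper bound 1.0/1.0, lower bound 0.001/-2.0, 24h confidence interval); the `target` value and the one-sample-per-minute rate are made-up inputs for illustration only.

package main

import (
	"fmt"
	"math"
	"time"
)

// confidence mirrors getConfidence in estimator.go: both the observed lifespan and the
// sample count are normalized by the confidence interval, and the smaller value wins.
func confidence(lifespan time.Duration, samples int, interval time.Duration) float64 {
	lifespanInIntervals := float64(lifespan) / float64(interval)
	samplesInIntervals := float64(samples) / interval.Minutes()
	return math.Min(lifespanInIntervals, samplesInIntervals)
}

// bound mirrors the confidence multiplier: base * (1 + multiplier/confidence)^exponent.
func bound(base, multiplier, exponent, conf float64) float64 {
	return base * math.Pow(1.+multiplier/conf, exponent)
}

func main() {
	const target = 100.0       // hypothetical p90 tflops estimate for a worker
	interval := 24 * time.Hour // defaultConfidenceInterval

	for _, hours := range []int{1, 6, 24, 168} {
		lifespan := time.Duration(hours) * time.Hour
		samples := int(lifespan.Minutes()) // assume one sample per minute
		conf := confidence(lifespan, samples, interval)
		upper := bound(target, 1.0, 1.0, conf)    // same constants as createEstimatorsFromConfig
		lower := bound(target, 0.001, -2.0, conf) // lower bound constants
		fmt.Printf("history=%4dh confidence=%6.3f lower=%7.2f upper=%8.2f\n", hours, conf, lower, upper)
	}
}

With roughly a day of per-minute samples the confidence reaches 1.0, so the upper bound sits at about twice the raw percentile while the lower bound is nearly equal to it; after a week of history both bounds converge toward the target, which is what lets Recommend treat current requests outside [lower, upper] as worth adjusting.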