Commit 98ef48b

Fix: perform validation in the right place
1. Remove the validation logic from GetWorkerGroupDesiredReplicas (util.go) and move it into ValidateRayClusterSpec.
2. Add further validation logic for worker group specs.
3. Remove the GetWorkerGroupDesiredReplicas test cases that are now unnecessary.

Fix: MinReplicas/MaxReplicas should only be validated when autoscaling is not enabled.

Fix: golangci-lint failure (unused ctx).
1 parent c6bafa3 commit 98ef48b

File tree

7 files changed: +43 -47 lines changed

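Across all seven files, the recurring change is dropping the now-unused ctx parameter from the replica-calculation helpers: the logging that needed it lived in the validation logic, which moves into ValidateRayClusterSpec. A before/after sketch of the two affected signatures, condensed from the diffs below:

```go
// Before: a context.Context was threaded through purely arithmetic helpers,
// solely so GetWorkerGroupDesiredReplicas could log misconfigured bounds.
//
//	func GetWorkerGroupDesiredReplicas(ctx context.Context, workerGroupSpec rayv1.WorkerGroupSpec) int32
//	func CalculateDesiredReplicas(ctx context.Context, cluster *rayv1.RayCluster) int32

// After: invalid bounds are rejected up front by ValidateRayClusterSpec,
// and the helpers become pure functions of the spec.
//
//	func GetWorkerGroupDesiredReplicas(workerGroupSpec rayv1.WorkerGroupSpec) int32
//	func CalculateDesiredReplicas(cluster *rayv1.RayCluster) int32
```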

ray-operator/controllers/ray/batchscheduler/scheduler-plugins/scheduler_plugins.go

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ func createPodGroup(ctx context.Context, app *rayv1.RayCluster) *v1alpha1.PodGroup
 			},
 		},
 		Spec: v1alpha1.PodGroupSpec{
-			MinMember:    utils.CalculateDesiredReplicas(ctx, app) + 1, // +1 for the head pod
+			MinMember:    utils.CalculateDesiredReplicas(app) + 1, // +1 for the head pod
 			MinResources: utils.CalculateDesiredResources(app),
 		},
 	}

ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler.go

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ func (v *VolcanoBatchScheduler) DoBatchSchedulingOnSubmission(ctx context.Context
 	var minMember int32
 	var totalResource corev1.ResourceList
 	if !utils.IsAutoscalingEnabled(&app.Spec) {
-		minMember = utils.CalculateDesiredReplicas(ctx, app) + 1
+		minMember = utils.CalculateDesiredReplicas(app) + 1
 		totalResource = utils.CalculateDesiredResources(app)
 	} else {
 		minMember = utils.CalculateMinReplicas(app) + 1
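Both batch-scheduler integrations compute a gang size of worker pods plus one head pod; the Volcano path additionally branches on whether autoscaling is enabled. A minimal standalone sketch of that rule (minMember here is a hypothetical helper, not a KubeRay API):

```go
package main

import "fmt"

// minMember mirrors the branch above: gang-schedule every desired worker
// when autoscaling is off, but only the guaranteed minimum when the
// autoscaler may resize the cluster later. The +1 accounts for the head pod.
func minMember(desiredWorkers, minWorkers int32, autoscalingEnabled bool) int32 {
	if !autoscalingEnabled {
		return desiredWorkers + 1
	}
	return minWorkers + 1
}

func main() {
	fmt.Println(minMember(4, 1, false)) // 5: all 4 workers + head
	fmt.Println(minMember(4, 1, true))  // 2: 1 guaranteed worker + head
}
```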

ray-operator/controllers/ray/batchscheduler/volcano/volcano_scheduler_test.go

Lines changed: 2 additions & 3 deletions
@@ -1,7 +1,6 @@
 package volcano
 
 import (
-	"context"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -83,7 +82,7 @@ func TestCreatePodGroup(t *testing.T) {
 
 	cluster := createTestRayCluster(1)
 
-	minMember := utils.CalculateDesiredReplicas(context.Background(), &cluster) + 1
+	minMember := utils.CalculateDesiredReplicas(&cluster) + 1
 	totalResource := utils.CalculateDesiredResources(&cluster)
 	pg := createPodGroup(&cluster, getAppPodGroupName(&cluster), minMember, totalResource)
 
@@ -107,7 +106,7 @@ func TestCreatePodGroup_NumOfHosts2(t *testing.T) {
 
 	cluster := createTestRayCluster(2)
 
-	minMember := utils.CalculateDesiredReplicas(context.Background(), &cluster) + 1
+	minMember := utils.CalculateDesiredReplicas(&cluster) + 1
 	totalResource := utils.CalculateDesiredResources(&cluster)
 	pg := createPodGroup(&cluster, getAppPodGroupName(&cluster), minMember, totalResource)
ray-operator/controllers/ray/raycluster_controller.go

Lines changed: 2 additions & 2 deletions
@@ -642,7 +642,7 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv1.RayCluster
 			continue
 		}
 		// workerReplicas will store the target number of pods for this worker group.
-		numExpectedWorkerPods := int(utils.GetWorkerGroupDesiredReplicas(ctx, worker))
+		numExpectedWorkerPods := int(utils.GetWorkerGroupDesiredReplicas(worker))
 		logger.Info("reconcilePods", "desired workerReplicas (always adhering to minReplicas/maxReplica)", numExpectedWorkerPods, "worker group", worker.GroupName, "maxReplicas", worker.MaxReplicas, "minReplicas", worker.MinReplicas, "replicas", worker.Replicas)
 
 		workerPods := corev1.PodList{}
@@ -1169,7 +1169,7 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *rayv1.RayCluster
 
 	newInstance.Status.ReadyWorkerReplicas = utils.CalculateReadyReplicas(runtimePods)
 	newInstance.Status.AvailableWorkerReplicas = utils.CalculateAvailableReplicas(runtimePods)
-	newInstance.Status.DesiredWorkerReplicas = utils.CalculateDesiredReplicas(ctx, newInstance)
+	newInstance.Status.DesiredWorkerReplicas = utils.CalculateDesiredReplicas(newInstance)
 	newInstance.Status.MinWorkerReplicas = utils.CalculateMinReplicas(newInstance)
 	newInstance.Status.MaxWorkerReplicas = utils.CalculateMaxReplicas(newInstance)

ray-operator/controllers/ray/utils/util.go

Lines changed: 6 additions & 13 deletions
@@ -335,34 +335,27 @@ func GenerateIdentifier(clusterName string, nodeType rayv1.RayNodeType) string {
 	return fmt.Sprintf("%s-%s", clusterName, nodeType)
 }
 
-func GetWorkerGroupDesiredReplicas(ctx context.Context, workerGroupSpec rayv1.WorkerGroupSpec) int32 {
-	log := ctrl.LoggerFrom(ctx)
+func GetWorkerGroupDesiredReplicas(workerGroupSpec rayv1.WorkerGroupSpec) int32 {
 	// Always adhere to min/max replicas constraints.
 	var workerReplicas int32
 	if workerGroupSpec.Suspend != nil && *workerGroupSpec.Suspend {
 		return 0
 	}
-	if *workerGroupSpec.MinReplicas > *workerGroupSpec.MaxReplicas {
-		log.Info("minReplicas is greater than maxReplicas, using maxReplicas as desired replicas. "+
-			"Please fix this to avoid any unexpected behaviors.", "minReplicas", *workerGroupSpec.MinReplicas, "maxReplicas", *workerGroupSpec.MaxReplicas)
-		workerReplicas = *workerGroupSpec.MaxReplicas
-	} else if workerGroupSpec.Replicas == nil || *workerGroupSpec.Replicas < *workerGroupSpec.MinReplicas {
-		// Replicas is impossible to be nil as it has a default value assigned in the CRD.
-		// Add this check to make testing easier.
+	// Validation for replicas/min/max should be enforced in validation.go before reconcile proceeds.
+	// Here we only compute the desired replicas within the already-validated bounds.
+	if workerGroupSpec.Replicas == nil {
 		workerReplicas = *workerGroupSpec.MinReplicas
-	} else if *workerGroupSpec.Replicas > *workerGroupSpec.MaxReplicas {
-		workerReplicas = *workerGroupSpec.MaxReplicas
 	} else {
 		workerReplicas = *workerGroupSpec.Replicas
 	}
 	return workerReplicas * workerGroupSpec.NumOfHosts
 }
 
 // CalculateDesiredReplicas calculate desired worker replicas at the cluster level
-func CalculateDesiredReplicas(ctx context.Context, cluster *rayv1.RayCluster) int32 {
+func CalculateDesiredReplicas(cluster *rayv1.RayCluster) int32 {
 	count := int32(0)
 	for _, nodeGroup := range cluster.Spec.WorkerGroupSpecs {
-		count += GetWorkerGroupDesiredReplicas(ctx, nodeGroup)
+		count += GetWorkerGroupDesiredReplicas(nodeGroup)
 	}
 
 	return count
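After this change, GetWorkerGroupDesiredReplicas reduces to three cases: 0 when the group is suspended, MinReplicas when Replicas is nil, otherwise Replicas, with the result multiplied by NumOfHosts. A self-contained restatement of that rule for illustration (groupSpec is a stand-in for the relevant rayv1.WorkerGroupSpec fields, not the real type):

```go
package main

import "fmt"

// groupSpec stands in for the fields of rayv1.WorkerGroupSpec that the
// simplified helper reads.
type groupSpec struct {
	Suspend     *bool
	Replicas    *int32
	MinReplicas *int32
	NumOfHosts  int32
}

// desiredReplicas mirrors the post-refactor GetWorkerGroupDesiredReplicas:
// bounds checking is assumed to have already happened in ValidateRayClusterSpec.
func desiredReplicas(g groupSpec) int32 {
	if g.Suspend != nil && *g.Suspend {
		return 0
	}
	replicas := g.Replicas
	if replicas == nil {
		// Replicas has a CRD default, so nil mostly matters for tests;
		// validation guarantees MinReplicas is set when it is needed.
		replicas = g.MinReplicas
	}
	return *replicas * g.NumOfHosts
}

func main() {
	three, one := int32(3), int32(1)
	g := groupSpec{Replicas: &three, MinReplicas: &one, NumOfHosts: 2}
	fmt.Println(desiredReplicas(g)) // 6: 3 replicas x 2 hosts each
}
```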

ray-operator/controllers/ray/utils/util_test.go

Lines changed: 7 additions & 24 deletions
@@ -550,7 +550,6 @@ func TestGenerateHeadServiceName(t *testing.T) {
 }
 
 func TestGetWorkerGroupDesiredReplicas(t *testing.T) {
-	ctx := context.Background()
 	// Test 1: `WorkerGroupSpec.Replicas` is nil.
 	// `Replicas` is impossible to be nil in a real RayCluster CR as it has a default value assigned in the CRD.
 	numOfHosts := int32(1)
@@ -562,37 +561,21 @@ func TestGetWorkerGroupDesiredReplicas(t *testing.T) {
 		MinReplicas: &minReplicas,
 		MaxReplicas: &maxReplicas,
 	}
-	assert.Equal(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec), minReplicas)
+	assert.Equal(t, GetWorkerGroupDesiredReplicas(workerGroupSpec), minReplicas)
 
 	// Test 2: `WorkerGroupSpec.Replicas` is not nil and is within the range.
 	replicas := int32(3)
 	workerGroupSpec.Replicas = &replicas
-	assert.Equal(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec), replicas)
+	assert.Equal(t, GetWorkerGroupDesiredReplicas(workerGroupSpec), replicas)
 
-	// Test 3: `WorkerGroupSpec.Replicas` is not nil but is more than maxReplicas.
-	replicas = int32(6)
-	workerGroupSpec.Replicas = &replicas
-	assert.Equal(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec), maxReplicas)
-
-	// Test 4: `WorkerGroupSpec.Replicas` is not nil but is less than minReplicas.
-	replicas = int32(0)
-	workerGroupSpec.Replicas = &replicas
-	assert.Equal(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec), minReplicas)
-
-	// Test 5: `WorkerGroupSpec.Replicas` is nil and minReplicas is less than maxReplicas.
-	workerGroupSpec.Replicas = nil
-	workerGroupSpec.MinReplicas = &maxReplicas
-	workerGroupSpec.MaxReplicas = &minReplicas
-	assert.Equal(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec), *workerGroupSpec.MaxReplicas)
-
-	// Test 6: `WorkerGroupSpec.Suspend` is true.
+	// Test 3: `WorkerGroupSpec.Suspend` is true.
 	suspend := true
 	workerGroupSpec.MinReplicas = &maxReplicas
 	workerGroupSpec.MaxReplicas = &minReplicas
 	workerGroupSpec.Suspend = &suspend
-	assert.Zero(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec))
+	assert.Zero(t, GetWorkerGroupDesiredReplicas(workerGroupSpec))
 
-	// Test 7: `WorkerGroupSpec.NumOfHosts` is 4.
+	// Test 4: `WorkerGroupSpec.NumOfHosts` is 4.
 	numOfHosts = int32(4)
 	replicas = int32(5)
 	suspend = false
@@ -601,7 +584,7 @@ func TestGetWorkerGroupDesiredReplicas(t *testing.T) {
 	workerGroupSpec.Suspend = &suspend
 	workerGroupSpec.MinReplicas = &minReplicas
 	workerGroupSpec.MaxReplicas = &maxReplicas
-	assert.Equal(t, GetWorkerGroupDesiredReplicas(ctx, workerGroupSpec), replicas*numOfHosts)
+	assert.Equal(t, GetWorkerGroupDesiredReplicas(workerGroupSpec), replicas*numOfHosts)
 }
 
 func TestCalculateMinAndMaxReplicas(t *testing.T) {
@@ -798,7 +781,7 @@ func TestCalculateDesiredReplicas(t *testing.T) {
 				},
 			},
 		}
-		assert.Equal(t, CalculateDesiredReplicas(context.Background(), &cluster), tc.answer)
+		assert.Equal(t, CalculateDesiredReplicas(&cluster), tc.answer)
 		})
 	}
 }

ray-operator/controllers/ray/utils/validation.go

Lines changed: 24 additions & 3 deletions
@@ -39,10 +39,34 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]string) error {
 		return fmt.Errorf("headGroupSpec should have at least one container")
 	}
 
+	// Check if autoscaling is enabled once to avoid repeated calls
+	isAutoscalingEnabled := IsAutoscalingEnabled(spec)
+
 	for _, workerGroup := range spec.WorkerGroupSpecs {
 		if len(workerGroup.Template.Spec.Containers) == 0 {
 			return fmt.Errorf("workerGroupSpec should have at least one container")
 		}
+		// When autoscaling is enabled, MinReplicas and MaxReplicas are optional
+		// as users can manually update them and the autoscaler will handle the adjustment.
+		if !isAutoscalingEnabled && (workerGroup.MinReplicas == nil || workerGroup.MaxReplicas == nil) {
+			return fmt.Errorf("worker group %s must set both minReplicas and maxReplicas when autoscaling is disabled", workerGroup.GroupName)
+		}
+		if workerGroup.MinReplicas != nil && *workerGroup.MinReplicas < 0 {
+			return fmt.Errorf("worker group %s has negative minReplicas %d", workerGroup.GroupName, *workerGroup.MinReplicas)
+		}
+		if workerGroup.MaxReplicas != nil && *workerGroup.MaxReplicas < 0 {
+			return fmt.Errorf("worker group %s has negative maxReplicas %d", workerGroup.GroupName, *workerGroup.MaxReplicas)
+		}
+		// When autoscaling is enabled, the Ray Autoscaler will manage replicas and
+		// eventually adjust them to fall within minReplicas/maxReplicas bounds.
+		if workerGroup.Replicas != nil && !isAutoscalingEnabled && workerGroup.MinReplicas != nil && workerGroup.MaxReplicas != nil {
+			if *workerGroup.Replicas < *workerGroup.MinReplicas {
+				return fmt.Errorf("worker group %s has replicas %d smaller than minReplicas %d", workerGroup.GroupName, *workerGroup.Replicas, *workerGroup.MinReplicas)
+			}
+			if *workerGroup.Replicas > *workerGroup.MaxReplicas {
+				return fmt.Errorf("worker group %s has replicas %d greater than maxReplicas %d", workerGroup.GroupName, *workerGroup.Replicas, *workerGroup.MaxReplicas)
+			}
+		}
 	}
 
 	if annotations[RayFTEnabledAnnotationKey] != "" && spec.GcsFaultToleranceOptions != nil {
@@ -93,9 +117,6 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]string) error {
 		}
 	}
 
-	// Check if autoscaling is enabled once to avoid repeated calls
-	isAutoscalingEnabled := IsAutoscalingEnabled(spec)
-
 	// Validate that RAY_enable_autoscaler_v2 environment variable is not set to "1" or "true" when autoscaler is disabled
 	if !isAutoscalingEnabled {
 		if envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env); exists {
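Taken together, the new worker-group rules are: when autoscaling is disabled, minReplicas and maxReplicas must both be set and replicas must fall between them; negative bounds are rejected in any case. A sketch of exercising the validator directly, assuming the standard kuberay module paths and that no validation rule outside this diff fires first:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
)

func main() {
	one, five, six := int32(1), int32(5), int32(6)
	spec := &rayv1.RayClusterSpec{
		HeadGroupSpec: rayv1.HeadGroupSpec{
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "ray-head"}}},
			},
		},
		WorkerGroupSpecs: []rayv1.WorkerGroupSpec{{
			GroupName:   "small-group",
			Replicas:    &six, // outside [1, 5], and autoscaling is off
			MinReplicas: &one,
			MaxReplicas: &five,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "ray-worker"}}},
			},
		}},
	}
	// Expected: "worker group small-group has replicas 6 greater than maxReplicas 5"
	fmt.Println(utils.ValidateRayClusterSpec(spec, map[string]string{}))
}
```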
