Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions docs/reference/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,35 @@ Package v1 contains API Schema definitions for the ray v1 API group



#### AuthMode

_Underlying type:_ _string_

AuthMode describes the authentication mode for the Ray cluster.



_Appears in:_
- [AuthOptions](#authoptions)



#### AuthOptions



AuthOptions defines the authentication options for a RayCluster.



_Appears in:_
- [RayClusterSpec](#rayclusterspec)

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `mode` _[AuthMode](#authmode)_ | Mode specifies the authentication mode.<br />Supported values are "disabled" and "token".<br />Defaults to "token" when authOptions is specified but mode is omitted. | | Enum: [disabled token] <br /> |
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it’s better to set the default to disabled, as token authentication requires Ray >= 2.51.0 and some users may still be on older pinned versions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's worth mentioning the mode only defaults to token when authOptions != nil



#### AutoscalerOptions


Expand Down Expand Up @@ -268,6 +297,7 @@ _Appears in:_

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `authOptions` _[AuthOptions](#authoptions)_ | AuthOptions specifies the authentication options for the RayCluster. | | |
| `suspend` _boolean_ | Suspend indicates whether a RayCluster should be suspended.<br />A suspended RayCluster will have head pods and worker pods deleted. | | |
| `managedBy` _string_ | ManagedBy is an optional configuration for the controller or entity that manages a RayCluster.<br />The value must be either 'ray.io/kuberay-operator' or 'kueue.x-k8s.io/multikueue'.<br />The kuberay-operator reconciles a RayCluster which doesn't have this field at all or<br />the field value is the reserved string 'ray.io/kuberay-operator',<br />but delegates reconciling the RayCluster with 'kueue.x-k8s.io/multikueue' to the Kueue.<br />The field is immutable. | | |
| `autoscalerOptions` _[AutoscalerOptions](#autoscaleroptions)_ | AutoscalerOptions specifies optional configuration for the Ray autoscaler. | | |
Expand Down
8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayclusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayjobs.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions helm-chart/kuberay-operator/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ rules:
resources:
- events
- pods/status
- secrets
- services
verbs:
- create
Expand Down
23 changes: 23 additions & 0 deletions ray-operator/apis/ray/v1/raycluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ import (

// RayClusterSpec defines the desired state of RayCluster
type RayClusterSpec struct {
// AuthOptions specifies the authentication options for the RayCluster.
// +optional
AuthOptions *AuthOptions `json:"authOptions,omitempty"`
// Suspend indicates whether a RayCluster should be suspended.
// A suspended RayCluster will have head pods and worker pods deleted.
// +optional
Expand Down Expand Up @@ -46,6 +49,26 @@ type RayClusterSpec struct {
WorkerGroupSpecs []WorkerGroupSpec `json:"workerGroupSpecs,omitempty"`
}

// AuthMode describes the authentication mode for the Ray cluster.
// It is a string-typed enum; the accepted values are AuthModeDisabled and AuthModeToken.
type AuthMode string

const (
// AuthModeDisabled disables authentication. It serializes as "disabled".
AuthModeDisabled AuthMode = "disabled"
// AuthModeToken enables token-based authentication. It serializes as "token".
AuthModeToken AuthMode = "token"
)

// AuthOptions defines the authentication options for a RayCluster.
type AuthOptions struct {
// Mode specifies the authentication mode.
// Supported values are "disabled" and "token".
// Defaults to "token" when authOptions is specified but mode is omitted.
// +kubebuilder:validation:Enum=disabled;token
// +optional
Mode AuthMode `json:"mode,omitempty"`
}

// GcsFaultToleranceOptions contains configs for GCS FT
type GcsFaultToleranceOptions struct {
// +optional
Expand Down
20 changes: 20 additions & 0 deletions ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayclusters.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayjobs.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions ray-operator/config/crd/bases/ray.io_rayservices.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions ray-operator/config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ rules:
resources:
- events
- pods/status
- secrets
- services
verbs:
- create
Expand Down
49 changes: 49 additions & 0 deletions ray-operator/controllers/ray/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ func DefaultHeadPodTemplate(ctx context.Context, instance rayv1.RayCluster, head
podTemplate.Spec.Containers[utils.RayContainerIndex].Ports = append(podTemplate.Spec.Containers[utils.RayContainerIndex].Ports, metricsPort)
}

if utils.IsAuthEnabled(&instance.Spec) {
setTokenAuthEnvVars(instance.Name, &podTemplate)
}

return podTemplate
}

Expand All @@ -236,6 +240,47 @@ func setAutoscalerV2EnvVars(podTemplate *corev1.PodTemplateSpec) {
})
}

// setTokenAuthEnvVars appends the environment variables required for Ray token
// authentication to podTemplate, modifying it in place.
//
// On the Ray container (index utils.RayContainerIndex) it appends
// utils.RAY_AUTH_MODE_ENV_VAR with the literal value "token" and
// utils.RAY_AUTH_TOKEN_ENV_VAR, whose value is read from key
// utils.RAY_AUTH_TOKEN_SECRET_KEY of the secret named utils.CheckName(clusterName).
// The same two variables are appended to every init container named "wait-gcs-ready".
// Entries are appended unconditionally; existing variables with the same names
// are not checked or replaced.
func setTokenAuthEnvVars(clusterName string, podTemplate *corev1.PodTemplateSpec) {
podTemplate.Spec.Containers[utils.RayContainerIndex].Env = append(podTemplate.Spec.Containers[utils.RayContainerIndex].Env, corev1.EnvVar{
Name: utils.RAY_AUTH_MODE_ENV_VAR,
Value: "token",
})
Comment on lines +247 to +248
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Value: "token",
})
Value: rayv1.AuthModeToken,
})


// The secret name is derived from the cluster name with utils.CheckName; the
// token is referenced from the secret rather than copied into the pod spec.
secretName := utils.CheckName(clusterName)
podTemplate.Spec.Containers[utils.RayContainerIndex].Env = append(podTemplate.Spec.Containers[utils.RayContainerIndex].Env, corev1.EnvVar{
Name: utils.RAY_AUTH_TOKEN_ENV_VAR,
ValueFrom: &corev1.EnvVarSource{
SecretKeyRef: &corev1.SecretKeySelector{
LocalObjectReference: corev1.LocalObjectReference{Name: secretName},
Key: utils.RAY_AUTH_TOKEN_SECRET_KEY,
},
},
})

// Mirror both variables onto the wait-gcs-ready init container, if the template has one.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sampan-s-nayak is it expected for "ray healh-check` to require auth token?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is ray autoscaler also require auth token?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, I'll add this logic shortly

for i, initContainer := range podTemplate.Spec.InitContainers {
if initContainer.Name != "wait-gcs-ready" {
continue
}

podTemplate.Spec.InitContainers[i].Env = append(podTemplate.Spec.InitContainers[i].Env, corev1.EnvVar{
Name: utils.RAY_AUTH_MODE_ENV_VAR,
Value: "token",
})

podTemplate.Spec.InitContainers[i].Env = append(podTemplate.Spec.InitContainers[i].Env, corev1.EnvVar{
Name: utils.RAY_AUTH_TOKEN_ENV_VAR,
ValueFrom: &corev1.EnvVarSource{
SecretKeyRef: &corev1.SecretKeySelector{
LocalObjectReference: corev1.LocalObjectReference{Name: secretName},
Key: utils.RAY_AUTH_TOKEN_SECRET_KEY,
},
},
})
}
}

func getEnableInitContainerInjection() bool {
if s := os.Getenv(EnableInitContainerInjectionEnvKey); strings.ToLower(s) == "false" {
return false
Expand Down Expand Up @@ -358,6 +403,10 @@ func DefaultWorkerPodTemplate(ctx context.Context, instance rayv1.RayCluster, wo
podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever
}

if utils.IsAuthEnabled(&instance.Spec) {
setTokenAuthEnvVars(instance.Name, &podTemplate)
}

return podTemplate
}

Expand Down
60 changes: 60 additions & 0 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package ray

import (
"context"
"crypto/rand"
"encoding/base64"
errstd "errors"
"fmt"
"os"
Expand Down Expand Up @@ -98,6 +100,7 @@ type RayClusterReconcilerOptions struct {
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch;create;update;patch;delete;deletecollection
// +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=pods/resize,verbs=patch
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=services/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=coordination.k8s.io,resources=leases,verbs=get;list;create;update
Expand Down Expand Up @@ -298,6 +301,7 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
r.reconcileAutoscalerRole,
r.reconcileAutoscalerRoleBinding,
r.reconcileIngress,
r.reconcileAuthSecret,
r.reconcileHeadService,
r.reconcileHeadlessService,
r.reconcileServeService,
Expand Down Expand Up @@ -354,6 +358,62 @@ func (r *RayClusterReconciler) rayClusterReconcile(ctx context.Context, instance
return ctrl.Result{RequeueAfter: time.Duration(requeueAfterSeconds) * time.Second}, nil
}

// reconcileAuthSecret ensures the auth token Secret exists for a RayCluster
// that has authentication enabled.
//
// The enabled check is delegated to utils.IsAuthEnabled, the same predicate the
// head and worker pod templates use before injecting a reference to this
// Secret, so the Secret is provisioned exactly when pods will look for it.
// When auth is not enabled the function is a no-op. An existing Secret is left
// untouched (the token is never rotated here).
//
// Returns any error from reading or creating the Secret, except AlreadyExists
// on create, which means the desired state already holds.
func (r *RayClusterReconciler) reconcileAuthSecret(ctx context.Context, instance *rayv1.RayCluster) error {
	logger := ctrl.LoggerFrom(ctx)
	logger.Info("Reconciling Auth")

	if !utils.IsAuthEnabled(&instance.Spec) {
		return nil
	}

	secretName := utils.CheckName(instance.Name)
	key := types.NamespacedName{Name: secretName, Namespace: instance.Namespace}

	err := r.Get(ctx, key, &corev1.Secret{})
	if err == nil {
		// Secret is already present; nothing to do.
		return nil
	}
	if !errors.IsNotFound(err) {
		return err
	}

	// A read served from a stale cache can report NotFound for a Secret that a
	// previous reconcile already created; treat AlreadyExists as success
	// instead of failing the whole reconcile.
	if err := r.createAuthSecret(ctx, instance, secretName); err != nil && !errors.IsAlreadyExists(err) {
		return err
	}
	return nil
}

// createAuthSecret creates the Secret named secretName in rayCluster's
// namespace, holding a freshly generated random auth token.
//
// The token is stored under utils.RAY_AUTH_TOKEN_SECRET_KEY, the same key the
// pod templates reference in their SecretKeyRef, so the writer and the readers
// cannot drift apart. The Secret is labeled with the cluster name and carries a
// controller owner reference to rayCluster.
//
// Returns an error if token generation fails or the API create call fails; the
// create error is returned unwrapped so callers can classify it.
func (r *RayClusterReconciler) createAuthSecret(ctx context.Context, rayCluster *rayv1.RayCluster, secretName string) error {
	token, err := generateRandomToken(32)
	if err != nil {
		return fmt.Errorf("generating auth token for RayCluster %s/%s: %w", rayCluster.Namespace, rayCluster.Name, err)
	}

	secret := &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Name:      secretName,
			Namespace: rayCluster.Namespace,
			Labels: map[string]string{
				utils.RayClusterLabelKey: rayCluster.Name,
			},
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(rayCluster, rayv1.SchemeGroupVersion.WithKind("RayCluster")),
			},
		},
		StringData: map[string]string{
			utils.RAY_AUTH_TOKEN_SECRET_KEY: token,
		},
	}

	return r.Create(ctx, secret)
}

// generateRandomToken returns the standard (padded) base64 encoding of length
// bytes read from crypto/rand.
//
// length is the number of random bytes, not the length of the returned string.
// A non-positive length is rejected with an error: a negative value would
// otherwise panic in make, and zero would silently produce an empty token.
// On error the returned string is empty.
func generateRandomToken(length int) (string, error) {
	if length <= 0 {
		return "", fmt.Errorf("token length must be positive, got %d", length)
	}

	buf := make([]byte, length)
	if _, err := rand.Read(buf); err != nil {
		return "", fmt.Errorf("reading random bytes for auth token: %w", err)
	}
	return base64.StdEncoding.EncodeToString(buf), nil
}

func (r *RayClusterReconciler) reconcileIngress(ctx context.Context, instance *rayv1.RayCluster) error {
logger := ctrl.LoggerFrom(ctx)
logger.Info("Reconciling Ingress")
Expand Down
Loading
Loading