Skip to content

Commit 4f82f5b

Browse files
committed
Merge remote-tracking branch 'upstream/master' into move-unmarshall-envyaml
2 parents 4e4bc57 + 3d5c9e6 commit 4f82f5b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1120
-400
lines changed

apiserver/pkg/server/ray_job_submission_service_server.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
api "github.com/ray-project/kuberay/proto/go_client"
2121
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
2222
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
23+
utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types"
2324
)
2425

2526
type RayJobSubmissionServiceServerOptions struct {
@@ -54,7 +55,7 @@ func (s *RayJobSubmissionServiceServer) SubmitRayJob(ctx context.Context, req *a
5455
if err != nil {
5556
return nil, err
5657
}
57-
request := &utils.RayJobRequest{Entrypoint: req.Jobsubmission.Entrypoint}
58+
request := &utiltypes.RayJobRequest{Entrypoint: req.Jobsubmission.Entrypoint}
5859
if req.Jobsubmission.SubmissionId != "" {
5960
request.SubmissionId = req.Jobsubmission.SubmissionId
6061
}
@@ -89,7 +90,7 @@ func (s *RayJobSubmissionServiceServer) SubmitRayJob(ctx context.Context, req *a
8990
}
9091
}
9192

92-
sid, err := rayDashboardClient.SubmitJobReq(ctx, request, nil)
93+
sid, err := rayDashboardClient.SubmitJobReq(ctx, request)
9394
if err != nil {
9495
return nil, err
9596
}
@@ -216,7 +217,7 @@ func (s *RayJobSubmissionServiceServer) getRayClusterURL(ctx context.Context, re
216217
}
217218

218219
// Internal method to convert RayJobInfo to JobSubmissionInfo
219-
func convertNodeInfo(info *utils.RayJobInfo) *api.JobSubmissionInfo {
220+
func convertNodeInfo(info *utiltypes.RayJobInfo) *api.JobSubmissionInfo {
220221
jsi := api.JobSubmissionInfo{
221222
Entrypoint: info.Entrypoint,
222223
JobId: info.JobId,

apiserver/pkg/server/ray_job_submission_service_server_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"github.com/ray-project/kuberay/apiserver/pkg/util"
1818
api "github.com/ray-project/kuberay/proto/go_client"
1919
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
20-
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
2120
utiltypes "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils/types"
2221
fakeclientset "github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/fake"
2322
)
@@ -189,7 +188,7 @@ func TestConvertNodeInfo(t *testing.T) {
189188
"pip": "[numpy pandas]",
190189
}
191190

192-
rayJobInfo := utils.RayJobInfo{
191+
rayJobInfo := utiltypes.RayJobInfo{
193192
Entrypoint: entrypoint,
194193
JobId: jobID,
195194
SubmissionId: submissionID,

docs/reference/api.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ _Appears in:_
246246
| `entrypoint` _string_ | Entrypoint represents the command to start execution. | | |
247247
| `runtimeEnvYAML` _string_ | RuntimeEnvYAML represents the runtime environment configuration<br />provided as a multi-line YAML string. | | |
248248
| `jobId` _string_ | If jobId is not set, a new jobId will be auto-generated. | | |
249-
| `submissionMode` _[JobSubmissionMode](#jobsubmissionmode)_ | SubmissionMode specifies how RayJob submits the Ray job to the RayCluster.<br />In "K8sJobMode", the KubeRay operator creates a submitter Kubernetes Job to submit the Ray job.<br />In "HTTPMode", the KubeRay operator sends a request to the RayCluster to create a Ray job.<br />In "InteractiveMode", the KubeRay operator waits for a user to submit a job to the Ray cluster. | K8sJobMode | |
249+
| `submissionMode` _[JobSubmissionMode](#jobsubmissionmode)_ | SubmissionMode specifies how RayJob submits the Ray job to the RayCluster.<br />In "K8sJobMode", the KubeRay operator creates a submitter Kubernetes Job to submit the Ray job.<br />In "HTTPMode", the KubeRay operator sends a request to the RayCluster to create a Ray job.<br />In "InteractiveMode", the KubeRay operator waits for a user to submit a job to the Ray cluster.<br />In "SidecarMode", the KubeRay operator injects a container into the Ray head Pod that acts as the job submitter to submit the Ray job. | K8sJobMode | |
250250
| `entrypointResources` _string_ | EntrypointResources specifies the custom resources and quantities to reserve for the<br />entrypoint command. | | |
251251
| `entrypointNumCpus` _float_ | EntrypointNumCpus specifies the number of cpus to reserve for the entrypoint command. | | |
252252
| `entrypointNumGpus` _float_ | EntrypointNumGpus specifies the number of gpus to reserve for the entrypoint command. | | |

helm-chart/kuberay-operator/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,9 @@ spec:
174174
| operatorCommand | string | `"/manager"` | Path to the operator binary |
175175
| leaderElectionEnabled | bool | `true` | If leaderElectionEnabled is set to true, the KubeRay operator will use leader election for high availability. |
176176
| reconcileConcurrency | int | `1` | The maximum number of reconcile operations that can be performed simultaneously. This setting controls the concurrency of the controller reconciliation loops. Higher values can improve throughput in clusters with many resources, but may increase resource consumption. |
177+
| kubeClient | object | `{"burst":200,"qps":100}` | Kube Client configuration for QPS and burst settings. This setting controls the QPS and burst rate of the kube client when sending requests to the Kubernetes API server. If the QPS and burst values are too low, we may easily hit rate limits on the API server and slow down the controller reconciliation loops. |
178+
| kubeClient.qps | float | `100` | The QPS value for the client communicating with the Kubernetes API server. Must be a float number. |
179+
| kubeClient.burst | int | `200` | The maximum burst for throttling requests from this client to the Kubernetes API server. Must be a non-negative integer. |
177180
| rbacEnable | bool | `true` | If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on. |
178181
| crNamespacedRbacEnable | bool | `true` | When crNamespacedRbacEnable is set to true, the KubeRay operator will create a Role for RayCluster preparation (e.g., Pods, Services) and a corresponding RoleBinding for each namespace listed in the "watchNamespace" parameter. Please note that even if crNamespacedRbacEnable is set to false, the Role and RoleBinding for leader election will still be created. Note: (1) This variable is only effective when rbacEnable and singleNamespaceInstall are both set to true. (2) In most cases, it should be set to true, unless you are using a Kubernetes cluster managed by GitOps tools such as ArgoCD. |
179182
| singleNamespaceInstall | bool | `false` | When singleNamespaceInstall is true: - Install namespaced RBAC resources such as Role and RoleBinding instead of cluster-scoped ones like ClusterRole and ClusterRoleBinding so that the chart can be installed by users with permissions restricted to a single namespace. (Please note that this excludes the CRDs, which can only be installed at the cluster scope.) - If "watchNamespace" is not set, the KubeRay operator will, by default, only listen to resource events within its own namespace. |

helm-chart/kuberay-operator/templates/_helpers.tpl

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,12 @@ rules:
163163
- get
164164
- patch
165165
- update
166+
- apiGroups:
167+
- ""
168+
resources:
169+
- pods/resize
170+
verbs:
171+
- patch
166172
- apiGroups:
167173
- ""
168174
resources:
@@ -301,12 +307,6 @@ rules:
301307
- list
302308
- update
303309
- watch
304-
- apiGroups:
305-
- apiextensions.k8s.io
306-
resources:
307-
- customresourcedefinitions
308-
verbs:
309-
- get
310310
{{- end -}}
311311
{{- if or .batchSchedulerEnabled (eq .batchSchedulerName "scheduler-plugins") }}
312312
- apiGroups:

helm-chart/kuberay-operator/templates/deployment.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,22 @@ spec:
115115
{{- end }}
116116
{{- $argList = append $argList (printf "--reconcile-concurrency=%v" .Values.reconcileConcurrency) -}}
117117
{{- end -}}
118+
{{- if hasKey .Values "kubeClient" -}}
119+
{{- if hasKey .Values.kubeClient "qps" -}}
120+
{{- $qps := toString .Values.kubeClient.qps }}
121+
{{- if not (regexMatch "^[+-]?[0-9]+(\\.[0-9]+)?$" $qps) }}
122+
{{- fail (printf "values.kubeClient.qps must be a valid float number, got %q" $qps) }}
123+
{{- end }}
124+
{{- $argList = append $argList (printf "--qps=%v" .Values.kubeClient.qps) -}}
125+
{{- end -}}
126+
{{- if hasKey .Values.kubeClient "burst" -}}
127+
{{- $burst := toString .Values.kubeClient.burst }}
128+
{{- if not (regexMatch "^[0-9]+$" $burst) }}
129+
{{- fail (printf "values.kubeClient.burst must be a non-negative integer, got %q" $burst) }}
130+
{{- end }}
131+
{{- $argList = append $argList (printf "--burst=%v" .Values.kubeClient.burst) -}}
132+
{{- end -}}
133+
{{- end -}}
118134
{{- (printf "\n") -}}
119135
{{- $argList | toYaml | indent 12 }}
120136
ports:

helm-chart/kuberay-operator/tests/deployment_test.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,16 @@ tests:
275275
- contains:
276276
path: spec.template.spec.containers[?(@.name=="kuberay-operator")].args
277277
content: "--reconcile-concurrency=5"
278+
279+
- it: Should use custom kube client qps and burst when set
280+
set:
281+
kubeClient:
282+
qps: 75.5
283+
burst: 150
284+
asserts:
285+
- contains:
286+
path: spec.template.spec.containers[?(@.name=="kuberay-operator")].args
287+
content: "--qps=75.5"
288+
- contains:
289+
path: spec.template.spec.containers[?(@.name=="kuberay-operator")].args
290+
content: "--burst=150"

helm-chart/kuberay-operator/values.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,17 @@ leaderElectionEnabled: true
121121
# Higher values can improve throughput in clusters with many resources, but may increase resource consumption.
122122
reconcileConcurrency: 1
123123

124+
# -- Kube Client configuration for QPS and burst settings.
125+
# This setting controls the QPS and burst rate of the kube client when sending requests to the Kubernetes API server.
126+
# If the QPS and burst values are too low, we may easily hit rate limits on the API server and slow down the controller reconciliation loops.
127+
kubeClient:
128+
# -- The QPS value for the client communicating with the Kubernetes API server.
129+
# Must be a float number.
130+
qps: 100.0
131+
# -- The maximum burst for throttling requests from this client to the Kubernetes API server.
132+
# Must be a non-negative integer.
133+
burst: 200
134+
124135
# -- If rbacEnable is set to false, no RBAC resources will be created, including the Role for leader election, the Role for Pods and Services, and so on.
125136
rbacEnable: true
126137

ray-operator/apis/config/v1alpha1/configuration_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,6 @@ func (config Configuration) GetDashboardClient(mgr manager.Manager) func(rayClus
8888
return utils.GetRayDashboardClientFunc(mgr, config.UseKubernetesProxy)
8989
}
9090

91-
func (config Configuration) GetHttpProxyClient(mgr manager.Manager) func() utils.RayHttpProxyClientInterface {
91+
func (config Configuration) GetHttpProxyClient(mgr manager.Manager) func(hostIp, podNamespace, podName string, port int) utils.RayHttpProxyClientInterface {
9292
return utils.GetRayHttpProxyClientFunc(mgr, config.UseKubernetesProxy)
9393
}

ray-operator/apis/ray/v1/rayjob_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ const (
8282
K8sJobMode JobSubmissionMode = "K8sJobMode" // Submit job via Kubernetes Job
8383
HTTPMode JobSubmissionMode = "HTTPMode" // Submit job via HTTP request
8484
InteractiveMode JobSubmissionMode = "InteractiveMode" // Don't submit job in KubeRay. Instead, wait for user to submit job and provide the job submission ID.
85+
SidecarMode JobSubmissionMode = "SidecarMode" // Submit job via a sidecar container in the Ray head Pod
8586
)
8687

8788
type DeletionPolicyType string
@@ -174,6 +175,7 @@ type RayJobSpec struct {
174175
// In "K8sJobMode", the KubeRay operator creates a submitter Kubernetes Job to submit the Ray job.
175176
// In "HTTPMode", the KubeRay operator sends a request to the RayCluster to create a Ray job.
176177
// In "InteractiveMode", the KubeRay operator waits for a user to submit a job to the Ray cluster.
178+
// In "SidecarMode", the KubeRay operator injects a container into the Ray head Pod that acts as the job submitter to submit the Ray job.
177179
// +kubebuilder:default:=K8sJobMode
178180
// +optional
179181
SubmissionMode JobSubmissionMode `json:"submissionMode,omitempty"`

0 commit comments

Comments
 (0)