Skip to content

Commit d591688

Browse files
committed
[Feature] RayJob Volcano integration
Signed-off-by: win5923 <[email protected]>
1 parent 8ba2421 commit d591688

File tree

10 files changed

+546
-61
lines changed

10 files changed

+546
-61
lines changed
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
apiVersion: scheduling.volcano.sh/v1beta1
2+
kind: Queue
3+
metadata:
4+
name: kuberay-test-queue
5+
spec:
6+
weight: 1
7+
capability:
8+
cpu: 4
9+
memory: 6Gi
10+
---
11+
apiVersion: ray.io/v1
12+
kind: RayJob
13+
metadata:
14+
name: rayjob-sample-2
15+
labels:
16+
ray.io/scheduler-name: volcano
17+
volcano.sh/queue-name: kuberay-test-queue
18+
spec:
19+
entrypoint: python /home/ray/samples/sample_code.py
20+
runtimeEnvYAML: |
21+
pip:
22+
- requests==2.26.0
23+
- pendulum==2.1.2
24+
env_vars:
25+
counter_name: "test_counter"
26+
# rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller.
27+
rayClusterSpec:
28+
rayVersion: '2.46.0' # should match the Ray version in the image of the containers
29+
# Ray head pod template
30+
headGroupSpec:
31+
# The `rayStartParams` are used to configure the `ray start` command.
32+
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
33+
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
34+
rayStartParams: {}
35+
#pod template
36+
template:
37+
spec:
38+
containers:
39+
- name: ray-head
40+
image: rayproject/ray:2.46.0
41+
ports:
42+
- containerPort: 6379
43+
name: gcs-server
44+
- containerPort: 8265 # Ray dashboard
45+
name: dashboard
46+
- containerPort: 10001
47+
name: client
48+
resources:
49+
limits:
50+
cpu: "1"
51+
memory: "2Gi"
52+
requests:
53+
cpu: "1"
54+
memory: "2Gi"
55+
volumeMounts:
56+
- mountPath: /home/ray/samples
57+
name: code-sample
58+
volumes:
59+
# You set volumes at the Pod level, then mount them into containers inside that Pod
60+
- name: code-sample
61+
configMap:
62+
# Provide the name of the ConfigMap you want to mount.
63+
name: ray-job-code-sample
64+
# An array of keys from the ConfigMap to create as files
65+
items:
66+
- key: sample_code.py
67+
path: sample_code.py
68+
workerGroupSpecs:
69+
# the pod replicas in this group typed worker
70+
- replicas: 2
71+
minReplicas: 2
72+
maxReplicas: 2
73+
# logical group name, for this called small-group, also can be functional
74+
groupName: small-group
75+
# The `rayStartParams` are used to configure the `ray start` command.
76+
# See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
77+
# See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
78+
rayStartParams: {}
79+
#pod template
80+
template:
81+
spec:
82+
containers:
83+
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc'
84+
image: rayproject/ray:2.46.0
85+
resources:
86+
limits:
87+
cpu: "1"
88+
memory: "1Gi"
89+
requests:
90+
cpu: "1"
91+
memory: "1Gi"
92+
93+
# SubmitterPodTemplate is the template for the pod that will run the `ray job submit` command against the RayCluster.
94+
# If SubmitterPodTemplate is specified, the first container is assumed to be the submitter container.
95+
# submitterPodTemplate:
96+
# spec:
97+
# restartPolicy: Never
98+
# containers:
99+
# - name: my-custom-rayjob-submitter-pod
100+
# image: rayproject/ray:2.46.0
101+
# # If Command is not specified, the correct command will be supplied at runtime using the RayJob spec `entrypoint` field.
102+
# # Specifying Command is not recommended.
103+
# # command: ["sh", "-c", "ray job submit --address=http://$RAY_DASHBOARD_ADDRESS --submission-id=$RAY_JOB_SUBMISSION_ID -- echo hello world"]
104+
105+
106+
######################Ray code sample#################################
107+
# this sample is from https://docs.ray.io/en/latest/cluster/job-submission.html#quick-start-example
108+
# it is mounted into the container and executed to show the Ray job at work
109+
---
110+
apiVersion: v1
111+
kind: ConfigMap
112+
metadata:
113+
name: ray-job-code-sample
114+
data:
115+
sample_code.py: |
116+
import ray
117+
import os
118+
import requests
119+
120+
ray.init()
121+
122+
@ray.remote
123+
class Counter:
124+
def __init__(self):
125+
# Used to verify runtimeEnv
126+
self.name = os.getenv("counter_name")
127+
assert self.name == "test_counter"
128+
self.counter = 0
129+
130+
def inc(self):
131+
self.counter += 1
132+
133+
def get_counter(self):
134+
return "{} got {}".format(self.name, self.counter)
135+
136+
counter = Counter.remote()
137+
138+
for _ in range(5):
139+
ray.get(counter.inc.remote())
140+
print(ray.get(counter.get_counter.remote()))
141+
142+
# Verify that the correct runtime env was used for the job.
143+
assert requests.__version__ == "2.26.0"

ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ func GetPluginName() string { return "kai-scheduler" }
3333

3434
func (k *KaiScheduler) Name() string { return GetPluginName() }
3535

36-
func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1.RayCluster) error {
36+
func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ client.Object) error {
3737
return nil
3838
}
3939

@@ -53,6 +53,9 @@ func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayClust
5353
pod.Labels[QueueLabelName] = queue
5454
}
5555

56+
func (k *KaiScheduler) AddMetadataToChildResource(_ context.Context, _ client.Object, _ string, _ client.Object) {
57+
}
58+
5659
func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config, _ client.Client) (schedulerinterface.BatchScheduler, error) {
5760
return &KaiScheduler{}, nil
5861
}

ray-operator/controllers/ray/batchscheduler/schedulermanager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ func getSchedulerFactory(rayConfigs configapi.Configuration) (schedulerinterface
8383
return factory, nil
8484
}
8585

86-
func (batch *SchedulerManager) GetSchedulerForCluster() (schedulerinterface.BatchScheduler, error) {
86+
func (batch *SchedulerManager) GetScheduler() (schedulerinterface.BatchScheduler, error) {
8787
return batch.scheduler, nil
8888
}
8989

0 commit comments

Comments
 (0)