Skip to content

Commit abca6f9

Browse files
committed
fix: update e2e tests to address PR review feedback.
1 parent: 9475047 · commit: abca6f9

File tree

6 files changed

+18
-105
lines changed

6 files changed

+18
-105
lines changed

tests/e2e/heterogeneous_clusters_kind_test.py

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -31,27 +31,6 @@ def test_heterogeneous_clusters(self):
3131
def run_heterogeneous_clusters(
3232
self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
3333
):
34-
# Use GPU-enabled Ray image when GPUs are requested
35-
from codeflare_sdk.common.utils import constants
36-
37-
ray_image = (
38-
f"rayproject/ray:{constants.RAY_VERSION}-gpu"
39-
if number_of_gpus > 0
40-
else f"rayproject/ray:{constants.RAY_VERSION}"
41-
)
42-
43-
# GPU images need more memory due to CUDA libraries
44-
if number_of_gpus > 0:
45-
head_memory_requests = 4
46-
head_memory_limits = 6
47-
worker_memory_requests = 4
48-
worker_memory_limits = 8
49-
else:
50-
head_memory_requests = 2
51-
head_memory_limits = 2
52-
worker_memory_requests = 1
53-
worker_memory_limits = 4
54-
5534
for flavor in self.resource_flavors:
5635
node_labels = (
5736
get_flavor_spec(self, flavor).get("spec", {}).get("nodeLabels", {})
@@ -70,16 +49,15 @@ def run_heterogeneous_clusters(
7049
num_workers=1,
7150
head_cpu_requests="500m",
7251
head_cpu_limits="500m",
73-
head_memory_requests=head_memory_requests,
74-
head_memory_limits=head_memory_limits,
52+
head_memory_requests=2,
53+
head_memory_limits=2,
7554
worker_cpu_requests="500m",
7655
worker_cpu_limits=1,
77-
worker_memory_requests=worker_memory_requests,
78-
worker_memory_limits=worker_memory_limits,
56+
worker_memory_requests=1,
57+
worker_memory_limits=4,
7958
worker_extended_resource_requests={
8059
gpu_resource_name: number_of_gpus
8160
},
82-
image=ray_image,
8361
write_to_file=True,
8462
verify_tls=False,
8563
local_queue=queue_name,

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,42 +49,18 @@ def run_local_interactives(
4949

5050
ray.shutdown()
5151

52-
# Use GPU-enabled Ray image when GPUs are requested
53-
from codeflare_sdk.common.utils import constants
54-
55-
ray_image = (
56-
f"rayproject/ray:{constants.RAY_VERSION}-gpu"
57-
if number_of_gpus > 0
58-
else f"rayproject/ray:{constants.RAY_VERSION}"
59-
)
60-
61-
# GPU images need more memory due to CUDA libraries
62-
if number_of_gpus > 0:
63-
head_memory_requests = 4
64-
head_memory_limits = 6
65-
worker_memory_requests = 4
66-
worker_memory_limits = 8
67-
else:
68-
head_memory_requests = None
69-
head_memory_limits = None
70-
worker_memory_requests = 1
71-
worker_memory_limits = 4
72-
7352
cluster = Cluster(
7453
ClusterConfiguration(
7554
name=cluster_name,
7655
namespace=self.namespace,
7756
num_workers=1,
7857
head_cpu_requests="500m",
7958
head_cpu_limits="500m",
80-
head_memory_requests=head_memory_requests,
81-
head_memory_limits=head_memory_limits,
8259
worker_cpu_requests="500m",
8360
worker_cpu_limits=1,
84-
worker_memory_requests=worker_memory_requests,
85-
worker_memory_limits=worker_memory_limits,
61+
worker_memory_requests=1,
62+
worker_memory_limits=4,
8663
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
87-
image=ray_image,
8864
verify_tls=False,
8965
)
9066
)

tests/e2e/mnist.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@
4242
print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
4343
ACCELERATOR = os.getenv("ACCELERATOR")
4444

45+
# If GPU is requested but CUDA is not available, fall back to CPU
46+
if ACCELERATOR == "gpu" and not torch.cuda.is_available():
47+
print("Warning: GPU requested but CUDA is not available. Falling back to CPU.")
48+
ACCELERATOR = "cpu"
49+
4550
STORAGE_BUCKET_EXISTS = "AWS_DEFAULT_ENDPOINT" in os.environ
4651
print("STORAGE_BUCKET_EXISTS: ", STORAGE_BUCKET_EXISTS)
4752

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
--extra-index-url https://download.pytorch.org/whl/cu118
2+
torch==2.5.1
3+
torchvision==0.20.1
14
pytorch_lightning==1.9.5
25
torchmetrics==0.9.1
3-
torchvision==0.20.1
46
minio

tests/e2e/mnist_raycluster_sdk_aw_kind_test.py

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,42 +37,18 @@ def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
3737
def run_mnist_raycluster_sdk_kind(
3838
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
3939
):
40-
# Use GPU-enabled Ray image when GPUs are requested
41-
from codeflare_sdk.common.utils import constants
42-
43-
ray_image = (
44-
f"rayproject/ray:{constants.RAY_VERSION}-gpu"
45-
if number_of_gpus > 0
46-
else f"rayproject/ray:{constants.RAY_VERSION}"
47-
)
48-
49-
# GPU images need more memory due to CUDA libraries
50-
if number_of_gpus > 0:
51-
head_memory_requests = 4
52-
head_memory_limits = 6
53-
worker_memory_requests = 4
54-
worker_memory_limits = 8
55-
else:
56-
head_memory_requests = None
57-
head_memory_limits = None
58-
worker_memory_requests = 1
59-
worker_memory_limits = 4
60-
6140
cluster = Cluster(
6241
ClusterConfiguration(
6342
name="mnist",
6443
namespace=self.namespace,
6544
num_workers=1,
6645
head_cpu_requests="500m",
6746
head_cpu_limits="500m",
68-
head_memory_requests=head_memory_requests,
69-
head_memory_limits=head_memory_limits,
7047
worker_cpu_requests="500m",
7148
worker_cpu_limits=1,
72-
worker_memory_requests=worker_memory_requests,
73-
worker_memory_limits=worker_memory_limits,
49+
worker_memory_requests=1,
50+
worker_memory_limits=4,
7451
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
75-
image=ray_image,
7652
write_to_file=True,
7753
verify_tls=False,
7854
appwrapper=True,

tests/e2e/mnist_raycluster_sdk_kind_test.py

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -37,42 +37,18 @@ def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self):
3737
def run_mnist_raycluster_sdk_kind(
3838
self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0
3939
):
40-
# Use GPU-enabled Ray image when GPUs are requested
41-
from codeflare_sdk.common.utils import constants
42-
43-
ray_image = (
44-
f"rayproject/ray:{constants.RAY_VERSION}-gpu"
45-
if number_of_gpus > 0
46-
else f"rayproject/ray:{constants.RAY_VERSION}"
47-
)
48-
49-
# GPU images need more memory due to CUDA libraries
50-
if number_of_gpus > 0:
51-
head_memory_requests = 4
52-
head_memory_limits = 6
53-
worker_memory_requests = 4
54-
worker_memory_limits = 8
55-
else:
56-
head_memory_requests = None
57-
head_memory_limits = None
58-
worker_memory_requests = 1
59-
worker_memory_limits = 4
60-
6140
cluster = Cluster(
6241
ClusterConfiguration(
6342
name="mnist",
6443
namespace=self.namespace,
6544
num_workers=1,
6645
head_cpu_requests="500m",
6746
head_cpu_limits="500m",
68-
head_memory_requests=head_memory_requests,
69-
head_memory_limits=head_memory_limits,
7047
worker_cpu_requests="500m",
7148
worker_cpu_limits=1,
72-
worker_memory_requests=worker_memory_requests,
73-
worker_memory_limits=worker_memory_limits,
49+
worker_memory_requests=1,
50+
worker_memory_limits=4,
7451
worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
75-
image=ray_image,
7652
write_to_file=True,
7753
verify_tls=False,
7854
)

0 commit comments

Comments (0)