
Commit 2badb50

divide out tensor engine tests (#649)
Summary: Pull Request resolved: #649

Main changes:
- Pulls out tests that require GPUs into the `tensor_engine` folder.
- Changes the `test-cpu.yml` step of OSS CI to run all non-tensor-engine tests.
- Changes the `test-cuda.yml` step of OSS CI to run all tensor engine tests.
- Changes `wheels.yml` to run `test_python_actors` as a minimal set of tests to validate wheel correctness.

Differential Revision: D78989844
1 parent 8d09dd9 commit 2badb50

14 files changed (+20 / -13 lines)
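The split below works by path and by pytest marker: the CPU job ignores `python/tests/tensor_engine`, the CUDA job runs only that folder, and both exclude tests marked `oss_skip`. As a rough illustration of where a GPU-only test now lives, here is a minimal sketch; the module contents, skip condition, and test name are assumptions for illustration, not code from this commit.

# Hypothetical example of a test that would live under python/tests/tensor_engine/.
# The oss_skip marker and the CUDA availability check mirror patterns visible in
# this diff; everything else here is illustrative.
import pytest
import torch

# Skip the whole module when no GPU is present, since tensor_engine tests assume CUDA.
pytestmark = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="tensor engine tests require a GPU"
)


@pytest.mark.oss_skip  # excluded in OSS CI by -m "not oss_skip"
def test_gpu_only_behavior() -> None:
    # Placeholder body; a real test would exercise tensor engine functionality.
    assert torch.cuda.device_count() >= 1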

.github/workflows/test-cpu.yml

Lines changed: 4 additions & 4 deletions
@@ -25,6 +25,9 @@ jobs:
 # Source common setup functions
 source scripts/common-setup.sh

+# Setup test environment
+setup_test_environment
+
 # Setup test environment
 setup_conda_environment

@@ -34,7 +37,4 @@ jobs:
 # Install the built wheel from artifact
 install_wheel_from_artifact

-# Currently a no-op.
-# Tests requiring tensor engine / GPU need to be identified and flagged to skip.
-# We will just ensure monarch can be imported successfully.
-python -c "import monarch; print('Monarch imported successfully')"
+LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" --ignore=python/tests/tensor_engine -n 4

.github/workflows/test-cuda.yml

Lines changed: 2 additions & 2 deletions
@@ -55,5 +55,5 @@ jobs:
 pyright python/tests/test_python_actors.py

 # Run CUDA tests
-LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
-python python/tests/test_mock_cuda.py
+LC_ALL=C pytest python/tests/tensor_engine -s -v -m "not oss_skip" -n 4
+python python/tests/tensor_engine/test_mock_cuda.py

.github/workflows/wheels.yml

Lines changed: 5 additions & 1 deletion
@@ -36,6 +36,9 @@ jobs:
 source scripts/common-setup.sh
 setup_build_environment ${{ matrix.python-version }}

+# Setup test environment
+setup_test_environment
+
 # Setup Tensor Engine dependencies
 setup_tensor_engine

@@ -54,7 +57,8 @@ jobs:
 # Run tests
 install_python_test_dependencies
 pip install dist/*.whl
-python -c "import monarch"
+LC_ALL=C pytest python/tests/test_python_actors.py -s -v -m "not oss_skip"
+
 publish:
   name: Publish to PyPI
   needs: build

python/monarch/_testing.py

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 import tempfile
 import time
 from contextlib import contextmanager, ExitStack
+from enum import Enum
 from typing import Any, Callable, Dict, Generator, Literal, Optional

 import monarch_supervisor
@@ -225,7 +226,7 @@ def exit(
     return dm


-class BackendType:
+class BackendType(Enum):
     PY = "py"
     RS = "rs"
     MESH = "mesh"
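Making `BackendType` an `Enum` is why the call sites in the renamed tests switch from `str(backend_type)` to `backend_type.value`: previously `BackendType.PY` was just a class attribute holding the string "py", so `str()` returned it unchanged, whereas `str()` on an Enum member includes the class name. A small standalone sketch of the difference (not repo code):

from enum import Enum


class BackendType(Enum):
    PY = "py"
    RS = "rs"
    MESH = "mesh"


# str() on an Enum member is "BackendType.PY", not the raw backend string...
assert str(BackendType.PY) == "BackendType.PY"
# ...so callers that need the plain string now read .value instead.
assert BackendType.PY.value == "py"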

python/tests/test_coalescing.py renamed to python/tests/tensor_engine/test_coalescing.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def local_device_mesh(
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )

     @property

python/tests/test_controller.py renamed to python/tests/tensor_engine/test_controller.py

Lines changed: 6 additions & 4 deletions
@@ -96,7 +96,9 @@ def local_rust_device_mesh(
     torch.cuda.device_count() < 2,
     reason="Not enough GPUs, this test requires at least 2 GPUs",
 )
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
+@pytest.mark.parametrize(
+    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+)
 # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
@@ -114,7 +116,7 @@ def local_device_mesh(
             N,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )

     def test_errors(self, backend_type):
@@ -176,7 +178,7 @@ def test_sub_mesh_use_only_one(self, backend_type):
         local_x = local_x.result(timeout=20)
         assert torch.equal(local_x, torch.ones(3, 4))

-    def test_sub_mesh_process_grop(self, backend_type):
+    def test_sub_mesh_process_group(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
             h0 = device_mesh.slice(host=0)
             pg0 = h0.process_group(("gpu",))
@@ -603,7 +605,7 @@ def test_to_mesh_pytree(self, backend_type):
         assert torch.equal(moved_tensor_b, torch.tensor([2.0]))

     def test_hanging_error(self, backend_type):
-        if backend_type != "mesh":
+        if backend_type != BackendType.MESH:
             pytest.skip("only relevant for mesh backend")
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
             remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
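With the parametrization using enum members, comparisons like `backend_type != BackendType.MESH` avoid string/enum mismatches. If shorter parametrized test IDs are wanted, pytest can derive them from the enum values; the `ids=` argument below is an illustration only and is not part of this commit:

import pytest

from monarch._testing import BackendType  # defined in python/monarch/_testing.py above


# Illustrative sketch: derive test IDs ("py", "rs", "mesh") from the enum values
# so parametrized test names stay readable.
@pytest.mark.parametrize(
    "backend_type",
    [BackendType.PY, BackendType.RS, BackendType.MESH],
    ids=lambda b: b.value,
)
def test_backend_type_parametrization(backend_type: BackendType) -> None:
    # Each case receives an enum member, not a raw string.
    assert isinstance(backend_type, BackendType)
    assert backend_type.value in {"py", "rs", "mesh"}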
