Commit 9d747c2

divide out tensor engine tests (#649)
Summary: Pull Request resolved: #649

Main changes:
- Pulls out tests that require GPUs into a `tensor_engine` folder
- Changes the `test_cpu.py` step of OSS to run all non-tensor-engine tests
- Changes the `test_gpu.py` step of OSS to run all tensor engine tests
- Changes wheels.yml to run `test_python_actors` as a minimal set of tests to validate wheel correctness

Differential Revision: D78989844
1 parent 43569d1 commit 9d747c2

17 files changed: +27 -19 lines changed
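
For reference, the split described above can be reproduced locally with the same pytest invocations the updated workflows use. The sketch below is illustrative only: it assumes pytest and pytest-xdist (which provides the -n 4 option) are installed, and the wrapper function names are made up for the example.

import sys

import pytest


def run_cpu_suite() -> int:
    # Mirrors the new test-cpu.yml step: everything except the tensor_engine folder.
    return pytest.main(
        [
            "python/tests/",
            "-s",
            "-v",
            "-m",
            "not oss_skip",
            "--ignore=python/tests/tensor_engine",
            "-n",
            "4",
        ]
    )


def run_tensor_engine_suite() -> int:
    # Mirrors the new test-cuda.yml step: only the GPU-backed tensor_engine tests.
    return pytest.main(
        ["python/tests/tensor_engine", "-s", "-v", "-m", "not oss_skip", "-n", "4"]
    )


if __name__ == "__main__":
    sys.exit(run_cpu_suite())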

.github/workflows/build-cpu.yml

Lines changed: 3 additions & 0 deletions
@@ -29,5 +29,8 @@ jobs:
 # Setup build environment (conda + system deps + rust + build deps)
 setup_build_environment

+# Build the process allocator binary
+build_process_allocator
+
 # Build monarch (No tensor engine, CPU version)
 USE_TENSOR_ENGINE=0 python setup.py bdist_wheel

.github/workflows/test-cpu.yml

Lines changed: 7 additions & 5 deletions
@@ -26,15 +26,17 @@ jobs:
 source scripts/common-setup.sh

 # Setup test environment
-setup_conda_environment
+setup_test_environment
+
+# Install cargo binaries
+mkdir cargo_bin && mv ${RUNNER_ARTIFACT_DIR}/cargo_bin/* cargo_bin
+chmod +x cargo_bin/process_allocator
+export PATH=$(pwd)/cargo_bin:$PATH

 # Disable tensor engine
 export USE_TENSOR_ENGINE=0

 # Install the built wheel from artifact
 install_wheel_from_artifact

-# Currently a no-op.
-# Tests requiring tensor engine / GPU need to be identified and flagged to skip.
-# We will just ensure monarch can be imported successfully.
-python -c "import monarch; print('Monarch imported successfully')"
+LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" --ignore=python/tests/tensor_engine -n 4

.github/workflows/test-cuda.yml

Lines changed: 2 additions & 2 deletions
@@ -55,5 +55,5 @@ jobs:
 pyright python/tests/test_python_actors.py

 # Run CUDA tests
-LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
-python python/tests/test_mock_cuda.py
+LC_ALL=C pytest python/tests/tensor_engine -s -v -m "not oss_skip" -n 4
+python python/tests/tensor_engine/test_mock_cuda.py

.github/workflows/wheels.yml

Lines changed: 3 additions & 3 deletions
@@ -35,12 +35,11 @@ jobs:
 script: |
   source scripts/common-setup.sh
   setup_build_environment ${{ matrix.python-version }}
+  cargo install --path monarch_hyperactor

   # Setup Tensor Engine dependencies
   setup_tensor_engine

-  cargo install --path monarch_hyperactor
-
   # Build wheel
   export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
   export MONARCH_VERSION=$(date +'%Y.%m.%d')
@@ -54,7 +53,8 @@ jobs:
   # Run tests
   install_python_test_dependencies
   pip install dist/*.whl
-  python -c "import monarch"
+  LC_ALL=C pytest python/tests/test_python_actors.py -s -v -m "not oss_skip"
+
 publish:
   name: Publish to PyPI
   needs: build

python/monarch/_testing.py

Lines changed: 2 additions & 1 deletion
@@ -10,6 +10,7 @@
 import tempfile
 import time
 from contextlib import contextmanager, ExitStack
+from enum import Enum
 from typing import Any, Callable, Dict, Generator, Literal, Optional

 import monarch_supervisor
@@ -225,7 +226,7 @@ def exit(
     return dm


-class BackendType:
+class BackendType(Enum):
     PY = "py"
     RS = "rs"
     MESH = "mesh"
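
The BackendType change above is what drives the backend=str(backend_type) -> backend=backend_type.value updates (and the BackendType.MESH comparisons) in the test files below: once the class is an Enum, str() returns the member name rather than the underlying string, and members no longer compare equal to raw strings. A small standalone sketch, illustrative only and not the project's actual code:

from enum import Enum


# Before: a plain class whose attributes are the raw strings.
class OldBackendType:
    PY = "py"


# After: an Enum, as in the diff above.
class NewBackendType(Enum):
    PY = "py"


assert str(OldBackendType.PY) == "py"                 # attribute is just the string
assert str(NewBackendType.PY) == "NewBackendType.PY"  # Enum str() is Class.MEMBER
assert NewBackendType.PY.value == "py"                # .value recovers the raw string
assert NewBackendType.PY != "py"                      # Enum members don't equal plain strings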

python/tests/builtins/test_log.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True)
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )

     @patch("monarch.builtins.log.logger")

python/tests/builtins/test_random.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True)
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )

     def test_set_manual_seed_remote(self, backend_type):

python/tests/test_coalescing.py renamed to python/tests/tensor_engine/test_coalescing.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def local_device_mesh(
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )

     @property

python/tests/test_controller.py renamed to python/tests/tensor_engine/test_controller.py

Lines changed: 6 additions & 4 deletions
@@ -96,7 +96,9 @@ def local_rust_device_mesh(
     torch.cuda.device_count() < 2,
     reason="Not enough GPUs, this test requires at least 2 GPUs",
 )
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
+@pytest.mark.parametrize(
+    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+)
 # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
@@ -114,7 +116,7 @@ def local_device_mesh(
             N,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )

     def test_errors(self, backend_type):
@@ -176,7 +178,7 @@ def test_sub_mesh_use_only_one(self, backend_type):
         local_x = local_x.result(timeout=20)
         assert torch.equal(local_x, torch.ones(3, 4))

-    def test_sub_mesh_process_grop(self, backend_type):
+    def test_sub_mesh_process_group(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
             h0 = device_mesh.slice(host=0)
             pg0 = h0.process_group(("gpu",))
@@ -603,7 +605,7 @@ def test_to_mesh_pytree(self, backend_type):
         assert torch.equal(moved_tensor_b, torch.tensor([2.0]))

     def test_hanging_error(self, backend_type):
-        if backend_type != "mesh":
+        if backend_type != BackendType.MESH:
             pytest.skip("only relevant for mesh backend")
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
             remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
