diff --git a/.github/workflows/build-cpu.yml b/.github/workflows/build-cpu.yml index 86130b87..6a46b92a 100644 --- a/.github/workflows/build-cpu.yml +++ b/.github/workflows/build-cpu.yml @@ -29,5 +29,8 @@ jobs: # Setup build environment (conda + system deps + rust + build deps) setup_build_environment + # Build the process allocator binary + build_process_allocator + # Build monarch (No tensor engine, CPU version) USE_TENSOR_ENGINE=0 python setup.py bdist_wheel diff --git a/.github/workflows/test-cpu.yml b/.github/workflows/test-cpu.yml index 914f67dd..49c5e9e6 100644 --- a/.github/workflows/test-cpu.yml +++ b/.github/workflows/test-cpu.yml @@ -26,7 +26,12 @@ jobs: source scripts/common-setup.sh # Setup test environment - setup_conda_environment + setup_test_environment + + # Install cargo binaries + mkdir cargo_bin && mv ${RUNNER_ARTIFACT_DIR}/cargo_bin/* cargo_bin + chmod +x cargo_bin/process_allocator + export PATH=$(pwd)/cargo_bin:$PATH # Disable tensor engine export USE_TENSOR_ENGINE=0 @@ -34,7 +39,4 @@ jobs: # Install the built wheel from artifact install_wheel_from_artifact - # Currently a no-op. - # Tests requiring tensor engine / GPU need to be identified and flagged to skip. - # We will just ensure monarch can be imported successfully. - python -c "import monarch; print('Monarch imported successfully')" + LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" --ignore=python/tests/tensor_engine -n 4 diff --git a/.github/workflows/test-cuda.yml b/.github/workflows/test-cuda.yml index ab9f74d8..6c250cf5 100644 --- a/.github/workflows/test-cuda.yml +++ b/.github/workflows/test-cuda.yml @@ -55,5 +55,5 @@ jobs: pyright python/tests/test_python_actors.py # Run CUDA tests - LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" - python python/tests/test_mock_cuda.py + LC_ALL=C pytest python/tests/tensor_engine -s -v -m "not oss_skip" -n 4 + python python/tests/tensor_engine/test_mock_cuda.py diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 9bfefbe8..aef903dc 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -35,12 +35,11 @@ jobs: script: | source scripts/common-setup.sh setup_build_environment ${{ matrix.python-version }} + cargo install --path monarch_hyperactor # Setup Tensor Engine dependencies setup_tensor_engine - cargo install --path monarch_hyperactor - # Build wheel export MONARCH_PACKAGE_NAME="torchmonarch-nightly" export MONARCH_VERSION=$(date +'%Y.%m.%d') @@ -54,7 +53,8 @@ jobs: # Run tests install_python_test_dependencies pip install dist/*.whl - python -c "import monarch" + LC_ALL=C pytest python/tests/test_python_actors.py -s -v -m "not oss_skip" + publish: name: Publish to PyPI needs: build diff --git a/python/monarch/_testing.py b/python/monarch/_testing.py index 052992ff..2b19c49a 100644 --- a/python/monarch/_testing.py +++ b/python/monarch/_testing.py @@ -10,6 +10,7 @@ import tempfile import time from contextlib import contextmanager, ExitStack +from enum import Enum from typing import Any, Callable, Dict, Generator, Literal, Optional import monarch_supervisor @@ -225,7 +226,7 @@ def exit( return dm -class BackendType: +class BackendType(Enum): PY = "py" RS = "rs" MESH = "mesh" diff --git a/python/tests/builtins/test_log.py b/python/tests/builtins/test_log.py index da4c1d4d..6727608c 100644 --- a/python/tests/builtins/test_log.py +++ b/python/tests/builtins/test_log.py @@ -30,7 +30,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True) num_hosts, gpu_per_host, activate, - backend=str(backend_type), + backend=backend_type.value, ) @patch("monarch.builtins.log.logger") diff --git a/python/tests/builtins/test_random.py b/python/tests/builtins/test_random.py index 768e0f93..f2004ac7 100644 --- a/python/tests/builtins/test_random.py +++ b/python/tests/builtins/test_random.py @@ -43,7 +43,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True) num_hosts, gpu_per_host, activate, - backend=str(backend_type), + backend=backend_type.value, ) def test_set_manual_seed_remote(self, backend_type): diff --git a/python/tests/test_coalescing.py b/python/tests/tensor_engine/test_coalescing.py similarity index 99% rename from python/tests/test_coalescing.py rename to python/tests/tensor_engine/test_coalescing.py index 86568fc4..41bf4dba 100644 --- a/python/tests/test_coalescing.py +++ b/python/tests/tensor_engine/test_coalescing.py @@ -78,7 +78,7 @@ def local_device_mesh( num_hosts, gpu_per_host, activate, - backend=str(backend_type), + backend=backend_type.value, ) @property diff --git a/python/tests/test_controller.py b/python/tests/tensor_engine/test_controller.py similarity index 99% rename from python/tests/test_controller.py rename to python/tests/tensor_engine/test_controller.py index 88f29021..f6f82c6a 100644 --- a/python/tests/test_controller.py +++ b/python/tests/tensor_engine/test_controller.py @@ -96,7 +96,9 @@ def local_rust_device_mesh( torch.cuda.device_count() < 2, reason="Not enough GPUs, this test requires at least 2 GPUs", ) -@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"]) +@pytest.mark.parametrize( + "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH] +) # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times # out is not counted as a failure, so we set a more restrictive timeout to # ensure we see a hard failure in CI. @@ -114,7 +116,7 @@ def local_device_mesh( N, gpu_per_host, activate, - backend=str(backend_type), + backend=backend_type.value, ) def test_errors(self, backend_type): @@ -176,7 +178,7 @@ def test_sub_mesh_use_only_one(self, backend_type): local_x = local_x.result(timeout=20) assert torch.equal(local_x, torch.ones(3, 4)) - def test_sub_mesh_process_grop(self, backend_type): + def test_sub_mesh_process_group(self, backend_type): with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh: h0 = device_mesh.slice(host=0) pg0 = h0.process_group(("gpu",)) @@ -603,7 +605,7 @@ def test_to_mesh_pytree(self, backend_type): assert torch.equal(moved_tensor_b, torch.tensor([2.0])) def test_hanging_error(self, backend_type): - if backend_type != "mesh": + if backend_type != BackendType.MESH: pytest.skip("only relevant for mesh backend") with self.local_device_mesh(2, 2, backend_type) as device_mesh: remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)() diff --git a/python/tests/test_debugger.py b/python/tests/tensor_engine/test_debugger.py similarity index 100% rename from python/tests/test_debugger.py rename to python/tests/tensor_engine/test_debugger.py diff --git a/python/tests/test_fault_tolerance.py b/python/tests/tensor_engine/test_fault_tolerance.py similarity index 100% rename from python/tests/test_fault_tolerance.py rename to python/tests/tensor_engine/test_fault_tolerance.py diff --git a/python/tests/test_mock_cuda.py b/python/tests/tensor_engine/test_mock_cuda.py similarity index 100% rename from python/tests/test_mock_cuda.py rename to python/tests/tensor_engine/test_mock_cuda.py diff --git a/python/tests/test_pdb_actor.py b/python/tests/tensor_engine/test_pdb_actor.py similarity index 100% rename from python/tests/test_pdb_actor.py rename to python/tests/tensor_engine/test_pdb_actor.py diff --git a/python/tests/test_rdma.py b/python/tests/tensor_engine/test_rdma.py similarity index 100% rename from python/tests/test_rdma.py rename to python/tests/tensor_engine/test_rdma.py diff --git a/python/tests/test_remote_functions.py b/python/tests/tensor_engine/test_remote_functions.py similarity index 99% rename from python/tests/test_remote_functions.py rename to python/tests/tensor_engine/test_remote_functions.py index 42adfd82..2dd5fd69 100644 --- a/python/tests/test_remote_functions.py +++ b/python/tests/tensor_engine/test_remote_functions.py @@ -172,7 +172,7 @@ def local_device_mesh( num_hosts, gpu_per_host, activate, - backend=str(backend_type), + backend=backend_type.value, ) @@ -1289,7 +1289,7 @@ def return_them(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.T ) class TestMeshSpecific(RemoteFunctionsTestBase): def test_value_mesh(self): - with self.local_device_mesh(2, 2, "mesh") as device_mesh: + with self.local_device_mesh(2, 2, BackendType.MESH) as device_mesh: x = device_mesh.rank("host") y = device_mesh.rank("gpu") r = return_them.call(x, y).get() diff --git a/python/tests/test_rust_backend.py b/python/tests/tensor_engine/test_rust_backend.py similarity index 100% rename from python/tests/test_rust_backend.py rename to python/tests/tensor_engine/test_rust_backend.py diff --git a/python/tests/test_tensor_engine.py b/python/tests/tensor_engine/test_tensor_engine.py similarity index 100% rename from python/tests/test_tensor_engine.py rename to python/tests/tensor_engine/test_tensor_engine.py