
divide out tensor engine tests #649


Open · wants to merge 1 commit into base: main
3 changes: 3 additions & 0 deletions .github/workflows/build-cpu.yml
@@ -29,5 +29,8 @@ jobs:
# Setup build environment (conda + system deps + rust + build deps)
setup_build_environment

# Build the process allocator binary
build_process_allocator

# Build monarch (No tensor engine, CPU version)
USE_TENSOR_ENGINE=0 python setup.py bdist_wheel
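
The CPU wheel is built with USE_TENSOR_ENGINE=0 so no tensor-engine extension is compiled. Below is a minimal sketch of how a setup.py can gate an extension on that variable; the variable name comes from this workflow, but the extension name, sources, and default are illustrative assumptions, not monarch's actual build logic.

import os
from setuptools import Extension, setup

# Assumption: any value other than "0" means "build the tensor engine".
use_tensor_engine = os.environ.get("USE_TENSOR_ENGINE", "1") != "0"

ext_modules = []
if use_tensor_engine:
    # Illustrative extension name and sources, not monarch's real layout.
    ext_modules.append(
        Extension("monarch._tensor_engine", sources=["src/tensor_engine.c"])
    )

setup(name="monarch", ext_modules=ext_modules)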
12 changes: 7 additions & 5 deletions .github/workflows/test-cpu.yml
@@ -26,15 +26,17 @@ jobs:
source scripts/common-setup.sh

# Setup test environment
setup_conda_environment
setup_test_environment

# Install cargo binaries
mkdir cargo_bin && mv ${RUNNER_ARTIFACT_DIR}/cargo_bin/* cargo_bin
chmod +x cargo_bin/process_allocator
export PATH=$(pwd)/cargo_bin:$PATH

# Disable tensor engine
export USE_TENSOR_ENGINE=0

# Install the built wheel from artifact
install_wheel_from_artifact

# Currently a no-op.
# Tests requiring tensor engine / GPU need to be identified and flagged to skip.
# We will just ensure monarch can be imported successfully.
python -c "import monarch; print('Monarch imported successfully')"
LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" --ignore=python/tests/tensor_engine -n 4
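
The CPU job now runs the whole suite minus python/tests/tensor_engine: -m "not oss_skip" deselects tests carrying the oss_skip marker, --ignore keeps the GPU-backed tensor-engine directory out of the run, and -n 4 fans the remaining tests across four pytest-xdist workers. A small sketch of how a test opts out via that marker (the marker name is taken from the command above; the tests themselves are illustrative):

import pytest

@pytest.mark.oss_skip  # deselected in OSS CI by -m "not oss_skip"
def test_internal_only_behavior():
    ...

def test_runs_in_oss_ci():
    assert 1 + 1 == 2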
4 changes: 2 additions & 2 deletions .github/workflows/test-cuda.yml
@@ -55,5 +55,5 @@ jobs:
pyright python/tests/test_python_actors.py

# Run CUDA tests
LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
python python/tests/test_mock_cuda.py
LC_ALL=C pytest python/tests/tensor_engine -s -v -m "not oss_skip" -n 4
python python/tests/tensor_engine/test_mock_cuda.py
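
The CUDA job becomes the mirror image of the CPU job: it runs only python/tests/tensor_engine, again across four workers. A common belt-and-suspenders pattern for such a directory, shown here as an assumption rather than monarch's actual code, is a module-level skip when no CUDA device is present:

import pytest
import torch

# Skip every test in this module unless a CUDA device is available.
pytestmark = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="tensor engine tests require a CUDA device",
)

def test_gpu_arithmetic():
    x = torch.ones(4, device="cuda")
    assert x.sum().item() == 4.0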
6 changes: 3 additions & 3 deletions .github/workflows/wheels.yml
@@ -35,12 +35,11 @@ jobs:
script: |
source scripts/common-setup.sh
setup_build_environment ${{ matrix.python-version }}
cargo install --path monarch_hyperactor

# Setup Tensor Engine dependencies
setup_tensor_engine

cargo install --path monarch_hyperactor

# Build wheel
export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
export MONARCH_VERSION=$(date +'%Y.%m.%d')
@@ -54,7 +53,8 @@
# Run tests
install_python_test_dependencies
pip install dist/*.whl
python -c "import monarch"
LC_ALL=C pytest python/tests/test_python_actors.py -s -v -m "not oss_skip"

publish:
name: Publish to PyPI
needs: build
3 changes: 2 additions & 1 deletion python/monarch/_testing.py
@@ -10,6 +10,7 @@
import tempfile
import time
from contextlib import contextmanager, ExitStack
from enum import Enum
from typing import Any, Callable, Dict, Generator, Literal, Optional

import monarch_supervisor
@@ -225,7 +226,7 @@ def exit(
return dm


class BackendType:
class BackendType(Enum):
PY = "py"
RS = "rs"
MESH = "mesh"
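
Making BackendType a real Enum is what drives the str(backend_type) → backend_type.value changes below: a plain-class attribute was already the string "py", but an Enum member stringifies to its qualified name, so .value is needed to recover the raw string. A quick demonstration:

from enum import Enum

class BackendType(Enum):
    PY = "py"
    RS = "rs"
    MESH = "mesh"

# As a plain class attribute this was just "py"; as an Enum member it
# stringifies to its qualified name instead:
assert str(BackendType.PY) == "BackendType.PY"
assert BackendType.PY.value == "py"  # the raw string the backends expect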
2 changes: 1 addition & 1 deletion python/tests/builtins/test_log.py
@@ -30,7 +30,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True)
num_hosts,
gpu_per_host,
activate,
backend=str(backend_type),
backend=backend_type.value,
)

@patch("monarch.builtins.log.logger")
2 changes: 1 addition & 1 deletion python/tests/builtins/test_random.py
@@ -43,7 +43,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True)
num_hosts,
gpu_per_host,
activate,
backend=str(backend_type),
backend=backend_type.value,
)

def test_set_manual_seed_remote(self, backend_type):
@@ -78,7 +78,7 @@ def local_device_mesh(
num_hosts,
gpu_per_host,
activate,
backend=str(backend_type),
backend=backend_type.value,
)

@property
@@ -96,7 +96,9 @@ def local_rust_device_mesh(
torch.cuda.device_count() < 2,
reason="Not enough GPUs, this test requires at least 2 GPUs",
)
@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
@pytest.mark.parametrize(
"backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
)
# Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
# out is not counted as a failure, so we set a more restrictive timeout to
# ensure we see a hard failure in CI.
@@ -114,7 +116,7 @@ def local_device_mesh(
N,
gpu_per_host,
activate,
backend=str(backend_type),
backend=backend_type.value,
)

def test_errors(self, backend_type):
@@ -176,7 +178,7 @@ def test_sub_mesh_use_only_one(self, backend_type):
local_x = local_x.result(timeout=20)
assert torch.equal(local_x, torch.ones(3, 4))

def test_sub_mesh_process_grop(self, backend_type):
def test_sub_mesh_process_group(self, backend_type):
with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
h0 = device_mesh.slice(host=0)
pg0 = h0.process_group(("gpu",))
@@ -603,7 +605,7 @@ def test_to_mesh_pytree(self, backend_type):
assert torch.equal(moved_tensor_b, torch.tensor([2.0]))

def test_hanging_error(self, backend_type):
if backend_type != "mesh":
if backend_type != BackendType.MESH:
pytest.skip("only relevant for mesh backend")
with self.local_device_mesh(2, 2, backend_type) as device_mesh:
remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
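
Parametrizing over BackendType.MESH instead of the raw string "mesh" matters because an Enum member never compares equal to its underlying value, so a check like backend_type != BackendType.MESH would silently mis-handle a "mesh" string. An illustrative test (the names mirror the diff; the body is a toy):

import pytest
from enum import Enum

class BackendType(Enum):
    PY = "py"
    RS = "rs"
    MESH = "mesh"

# An Enum member is not equal to its raw value:
assert BackendType.MESH != "mesh"

@pytest.mark.parametrize(
    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
)
def test_mesh_only_guard(backend_type):
    if backend_type != BackendType.MESH:
        pytest.skip("only relevant for mesh backend")
    assert backend_type.value == "mesh"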
File renamed without changes.
@@ -172,7 +172,7 @@ def local_device_mesh(
num_hosts,
gpu_per_host,
activate,
backend=str(backend_type),
backend=backend_type.value,
)


@@ -1289,7 +1289,7 @@ def return_them(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
)
class TestMeshSpecific(RemoteFunctionsTestBase):
def test_value_mesh(self):
with self.local_device_mesh(2, 2, "mesh") as device_mesh:
with self.local_device_mesh(2, 2, BackendType.MESH) as device_mesh:
x = device_mesh.rank("host")
y = device_mesh.rank("gpu")
r = return_them.call(x, y).get()