diff --git a/.github/workflows/build-cpu.yml b/.github/workflows/build-cpu.yml
index 86130b87..6a46b92a 100644
--- a/.github/workflows/build-cpu.yml
+++ b/.github/workflows/build-cpu.yml
@@ -29,5 +29,8 @@ jobs:
         # Setup build environment (conda + system deps + rust + build deps)
         setup_build_environment
 
+        # Build the process allocator binary
+        build_process_allocator
+
         # Build monarch (No tensor engine, CPU version)
         USE_TENSOR_ENGINE=0 python setup.py bdist_wheel
diff --git a/.github/workflows/test-cpu.yml b/.github/workflows/test-cpu.yml
index 914f67dd..49c5e9e6 100644
--- a/.github/workflows/test-cpu.yml
+++ b/.github/workflows/test-cpu.yml
@@ -26,7 +26,12 @@ jobs:
         source scripts/common-setup.sh
 
         # Setup test environment
-        setup_conda_environment
+        setup_test_environment
+
+        # Stage the cargo binaries from the runner artifact dir and put them on PATH
+        mkdir -p cargo_bin && mv "${RUNNER_ARTIFACT_DIR}"/cargo_bin/* cargo_bin
+        chmod +x cargo_bin/process_allocator
+        export PATH="$(pwd)/cargo_bin:$PATH"
 
         # Disable tensor engine
         export USE_TENSOR_ENGINE=0
@@ -34,7 +39,4 @@ jobs:
         # Install the built wheel from artifact
         install_wheel_from_artifact
 
-        # Currently a no-op.
-        # Tests requiring tensor engine / GPU need to be identified and flagged to skip.
-        # We will just ensure monarch can be imported successfully.
-        python -c "import monarch; print('Monarch imported successfully')"
+        LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip" --ignore=python/tests/tensor_engine -n 4
diff --git a/.github/workflows/test-cuda.yml b/.github/workflows/test-cuda.yml
index ab9f74d8..6c250cf5 100644
--- a/.github/workflows/test-cuda.yml
+++ b/.github/workflows/test-cuda.yml
@@ -55,5 +55,5 @@ jobs:
         pyright python/tests/test_python_actors.py
 
         # Run CUDA tests
-        LC_ALL=C pytest python/tests/ -s -v -m "not oss_skip"
-        python python/tests/test_mock_cuda.py
+        LC_ALL=C pytest python/tests/tensor_engine -s -v -m "not oss_skip" -n 4
+        python python/tests/tensor_engine/test_mock_cuda.py
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 9bfefbe8..aef903dc 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -35,12 +35,11 @@ jobs:
       script: |
         source scripts/common-setup.sh
         setup_build_environment ${{ matrix.python-version }}
+        cargo install --path monarch_hyperactor
 
         # Setup Tensor Engine dependencies
         setup_tensor_engine
 
-        cargo install --path monarch_hyperactor
-
         # Build wheel
         export MONARCH_PACKAGE_NAME="torchmonarch-nightly"
         export MONARCH_VERSION=$(date +'%Y.%m.%d')
@@ -54,7 +53,8 @@ jobs:
         # Run tests
         install_python_test_dependencies
         pip install dist/*.whl
-        python -c "import monarch"
+        LC_ALL=C pytest python/tests/test_python_actors.py -s -v -m "not oss_skip"
+
   publish:
     name: Publish to PyPI
     needs: build
diff --git a/python/monarch/_testing.py b/python/monarch/_testing.py
index 052992ff..2b19c49a 100644
--- a/python/monarch/_testing.py
+++ b/python/monarch/_testing.py
@@ -10,6 +10,7 @@
 import tempfile
 import time
 from contextlib import contextmanager, ExitStack
+from enum import Enum
 from typing import Any, Callable, Dict, Generator, Literal, Optional
 
 import monarch_supervisor
@@ -225,7 +226,7 @@ def exit(
     return dm
 
 
-class BackendType:
+class BackendType(Enum):
     PY = "py"
     RS = "rs"
     MESH = "mesh"
diff --git a/python/tests/builtins/test_log.py b/python/tests/builtins/test_log.py
index da4c1d4d..6727608c 100644
--- a/python/tests/builtins/test_log.py
+++ b/python/tests/builtins/test_log.py
@@ -30,7 +30,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True)
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )
 
     @patch("monarch.builtins.log.logger")
diff --git a/python/tests/builtins/test_random.py b/python/tests/builtins/test_random.py
index 768e0f93..f2004ac7 100644
--- a/python/tests/builtins/test_random.py
+++ b/python/tests/builtins/test_random.py
@@ -43,7 +43,7 @@ def local_device_mesh(cls, num_hosts, gpu_per_host, backend_type, activate=True)
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )
 
     def test_set_manual_seed_remote(self, backend_type):
diff --git a/python/tests/test_coalescing.py b/python/tests/tensor_engine/test_coalescing.py
similarity index 99%
rename from python/tests/test_coalescing.py
rename to python/tests/tensor_engine/test_coalescing.py
index 86568fc4..41bf4dba 100644
--- a/python/tests/test_coalescing.py
+++ b/python/tests/tensor_engine/test_coalescing.py
@@ -78,7 +78,7 @@ def local_device_mesh(
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )
 
     @property
diff --git a/python/tests/test_controller.py b/python/tests/tensor_engine/test_controller.py
similarity index 99%
rename from python/tests/test_controller.py
rename to python/tests/tensor_engine/test_controller.py
index 88f29021..f6f82c6a 100644
--- a/python/tests/test_controller.py
+++ b/python/tests/tensor_engine/test_controller.py
@@ -96,7 +96,9 @@ def local_rust_device_mesh(
     torch.cuda.device_count() < 2,
     reason="Not enough GPUs, this test requires at least 2 GPUs",
 )
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS, "mesh"])
+@pytest.mark.parametrize(
+    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+)
 # Set global timeout--sandcastle's timeout is 600s. A test that sandcastle times
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
@@ -114,7 +116,7 @@ def local_device_mesh(
             N,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )
 
     def test_errors(self, backend_type):
@@ -176,7 +178,7 @@ def test_sub_mesh_use_only_one(self, backend_type):
             local_x = local_x.result(timeout=20)
             assert torch.equal(local_x, torch.ones(3, 4))
 
-    def test_sub_mesh_process_grop(self, backend_type):
+    def test_sub_mesh_process_group(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type, activate=False) as device_mesh:
             h0 = device_mesh.slice(host=0)
             pg0 = h0.process_group(("gpu",))
@@ -603,7 +605,7 @@ def test_to_mesh_pytree(self, backend_type):
         assert torch.equal(moved_tensor_b, torch.tensor([2.0]))
 
     def test_hanging_error(self, backend_type):
-        if backend_type != "mesh":
+        if backend_type != BackendType.MESH:
             pytest.skip("only relevant for mesh backend")
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
             remote(lambda: torch.rand(3) + torch.rand(4), propagate=lambda: None)()
diff --git a/python/tests/test_debugger.py b/python/tests/tensor_engine/test_debugger.py
similarity index 100%
rename from python/tests/test_debugger.py
rename to python/tests/tensor_engine/test_debugger.py
diff --git a/python/tests/test_fault_tolerance.py b/python/tests/tensor_engine/test_fault_tolerance.py
similarity index 100%
rename from python/tests/test_fault_tolerance.py
rename to python/tests/tensor_engine/test_fault_tolerance.py
diff --git a/python/tests/test_mock_cuda.py b/python/tests/tensor_engine/test_mock_cuda.py
similarity index 100%
rename from python/tests/test_mock_cuda.py
rename to python/tests/tensor_engine/test_mock_cuda.py
diff --git a/python/tests/test_pdb_actor.py b/python/tests/tensor_engine/test_pdb_actor.py
similarity index 100%
rename from python/tests/test_pdb_actor.py
rename to python/tests/tensor_engine/test_pdb_actor.py
diff --git a/python/tests/test_rdma.py b/python/tests/tensor_engine/test_rdma.py
similarity index 100%
rename from python/tests/test_rdma.py
rename to python/tests/tensor_engine/test_rdma.py
diff --git a/python/tests/test_remote_functions.py b/python/tests/tensor_engine/test_remote_functions.py
similarity index 99%
rename from python/tests/test_remote_functions.py
rename to python/tests/tensor_engine/test_remote_functions.py
index 42adfd82..2dd5fd69 100644
--- a/python/tests/test_remote_functions.py
+++ b/python/tests/tensor_engine/test_remote_functions.py
@@ -172,7 +172,7 @@ def local_device_mesh(
             num_hosts,
             gpu_per_host,
             activate,
-            backend=str(backend_type),
+            backend=backend_type.value,
         )
 
 
@@ -1289,7 +1289,7 @@ def return_them(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.T
 )
 class TestMeshSpecific(RemoteFunctionsTestBase):
     def test_value_mesh(self):
-        with self.local_device_mesh(2, 2, "mesh") as device_mesh:
+        with self.local_device_mesh(2, 2, BackendType.MESH) as device_mesh:
             x = device_mesh.rank("host")
             y = device_mesh.rank("gpu")
             r = return_them.call(x, y).get()
diff --git a/python/tests/test_rust_backend.py b/python/tests/tensor_engine/test_rust_backend.py
similarity index 100%
rename from python/tests/test_rust_backend.py
rename to python/tests/tensor_engine/test_rust_backend.py
diff --git a/python/tests/test_tensor_engine.py b/python/tests/tensor_engine/test_tensor_engine.py
similarity index 100%
rename from python/tests/test_tensor_engine.py
rename to python/tests/tensor_engine/test_tensor_engine.py