Commit d397aa6

Sample PR to fix CI (#182)
stack-info: PR: #182, branch: xmfan/stack/11
1 parent 01a6538 commit d397aa6

4 files changed: +28 −24 lines changed

.github/workflows/test_cuda.yml

Lines changed: 4 additions & 1 deletion
@@ -30,8 +30,11 @@ jobs:
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
       submodules: recursive
-      python-version: "3.12"
       script: |
+        conda create --yes --quiet --name py312 python=3.12
+        source $(conda info --base)/etc/profile.d/conda.sh
+        conda activate py312
+
         pip install --quiet -r requirements-test.txt
         # For some reason the spec above isnt working
         pip uninstall -y torch

.github/workflows/test_torchtitan.yml

Lines changed: 4 additions & 1 deletion
@@ -30,8 +30,11 @@ jobs:
       gpu-arch-type: ${{ matrix.gpu-arch-type }}
       gpu-arch-version: ${{ matrix.gpu-arch-version }}
       submodules: recursive
-      python-version: "3.12"
       script: |
+        conda create --yes --quiet --name py312 python=3.12
+        source $(conda info --base)/etc/profile.d/conda.sh
+        conda activate py312
+
         pip install --quiet -r requirements-test.txt
         # For some reason the spec above isnt working
         pip uninstall -y torch

autoparallel/optimize_sharding.py

Lines changed: 0 additions & 3 deletions
@@ -177,9 +177,6 @@ def build_sharding_metadata(self):
         assert (
             local_map_kwargs.get("in_grad_placements", None) is None
         ), "Not yet implemented"
-        assert (
-            local_map_kwargs.get("device_mesh", None) is None
-        ), "Must be provided by Autoparallel"
         assert not user_kwargs
         # TODO: get rid of this when HOP can install as a subgraph
         assert "call_local_map" in str(

examples/example_local_map.py

Lines changed: 20 additions & 19 deletions
@@ -15,6 +15,23 @@
 
 from autoparallel.api import AutoParallel
 
+world_size = 256
+
+fake_store = FakeStore()
+torch.distributed.init_process_group(
+    "fake", store=fake_store, rank=0, world_size=world_size
+)
+mesh = torch.distributed.device_mesh.init_device_mesh(
+    "cuda",
+    (world_size // 32, 8, 4),
+    mesh_dim_names=(
+        "dp",
+        "tp",
+        "cp",
+    ),
+)
+assert mesh.ndim == 3, "Please also update local_map"
+
 
 def policy_fn(ctx, op, *args, **kwargs):
     if (
@@ -37,7 +54,7 @@ def policy_fn(ctx, op, *args, **kwargs):
     ),
     redistribute_inputs=True,
     in_grad_placements=None,
-    device_mesh=None,
+    device_mesh=mesh,
 )
 def replicate_linear(w, x):
     return torch.matmul(x, w.t())
@@ -54,7 +71,7 @@ def replicate_linear(w, x):
     ),
     redistribute_inputs=True,
     in_grad_placements=None,
-    device_mesh=None,
+    device_mesh=mesh,
 )
 def sharded_pointwise(x, scalar):
     return x + scalar, scalar
@@ -69,7 +86,7 @@ def sharded_pointwise(x, scalar):
     ),
     redistribute_inputs=True,
     in_grad_placements=None,
-    device_mesh=None,
+    device_mesh=mesh,
 )
 def context_parallel_attention(query, key, value):
     out = nn.functional.scaled_dot_product_attention(
@@ -128,22 +145,6 @@ def forward(self, x):
         return o
 
 
-world_size = 256
-
-fake_store = FakeStore()
-torch.distributed.init_process_group(
-    "fake", store=fake_store, rank=0, world_size=world_size
-)
-mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda",
-    (world_size // 32, 8, 4),
-    mesh_dim_names=(
-        "dp",
-        "tp",
-        "cp",
-    ),
-)
-
 bs = 8 * mesh.shape[0]
 seq_len = 256
 nheads = 48
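
The net effect of this example change is that the fake process group and the 3-D device mesh are built at module scope, before the local_map-decorated functions are defined, so each decorator can bind device_mesh=mesh directly instead of passing device_mesh=None (the assert in optimize_sharding.py that required device_mesh to be None is removed above). A condensed sketch of the resulting module-level setup follows; the FakeStore import path is an assumption, since the diff does not show the example's import block.

import torch
from torch.testing._internal.distributed.fake_pg import FakeStore  # assumed import path

world_size = 256

# Fake backend: lets the example build a 256-rank sharding plan in a
# single process, with no real GPUs or collectives involved.
fake_store = FakeStore()
torch.distributed.init_process_group(
    "fake", store=fake_store, rank=0, world_size=world_size
)

# 3-D mesh: (256 // 32) x 8 x 4 = 8 x 8 x 4 = 256 ranks, named dp / tp / cp.
mesh = torch.distributed.device_mesh.init_device_mesh(
    "cuda",
    (world_size // 32, 8, 4),
    mesh_dim_names=("dp", "tp", "cp"),
)
assert mesh.ndim == 3, "Please also update local_map"

# Because `mesh` now exists at import time, the local_map decorators defined
# after this point can reference it, as can downstream sizing such as:
bs = 8 * mesh.shape[0]  # batch dimension scales with the dp mesh size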
