
Commit 6819be1

feat(models): multibackend all_to_all wrapper (#95)
Small addition providing a fallback to support all_to_all when using the Gloo backend for torch.distributed. This PR is needed to be able to run the transformer model on CPU; for the 99.9% of users running on GPUs with the NCCL backend, this change should have no effect.

Gloo does not offer an all_to_all primitive, as shown [here](https://pytorch.org/docs/stable/distributed.html#backends). This commit implements an all_to_all fallback for Gloo, using the 'Linear Shift' algorithm from [Hofmann and Rünger, 2013](https://www.tu-chemnitz.de/informatik/PI/forschung/publikationen/download/HR_eurompi13.pdf). Because the `torch.distributed` send/recv syntax changed in torch 2.6, older versions of torch are not supported.

---------

Co-authored-by: Harrison Cook <[email protected]>
1 parent 9fc5923 commit 6819be1
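
For intuition, here is a minimal, dependency-free sketch (not part of the commit; names are illustrative) of the pairing order implied by the linear shift algorithm: at step i, rank r exchanges with peer j = (i - r) mod comm_size, and that peer's own schedule points back at r, so the two sides of every exchange line up.

def linear_shift_schedule(rank: int, comm_size: int) -> list[int]:
    """Peer contacted by `rank` at each step; the step where the peer equals `rank` is a local copy."""
    return [(i - rank + comm_size) % comm_size for i in range(comm_size)]

# Example schedule for 4 ranks: at every step i, if rank r targets j, then rank j targets r.
for r in range(4):
    print(r, linear_shift_schedule(r, 4))
# 0 [0, 1, 2, 3]
# 1 [3, 0, 1, 2]
# 2 [2, 3, 0, 1]
# 3 [1, 2, 3, 0]

In the commit itself the per-peer sends and receives are posted asynchronously and waited on together, so this schedule only fixes the order in which requests are posted, not a strict step-by-step synchronisation.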


models/src/anemoi/models/distributed/transformer.py

Lines changed: 38 additions & 2 deletions
@@ -18,6 +18,42 @@
 from anemoi.models.distributed.utils import get_memory_format


+def _alltoallwrapper(output_list: list, input_list: list, group: ProcessGroup):
+    """Wrapper function for all_to_all across the NCCL, MPI and Gloo backends.
+    There is no all_to_all primitive for the Gloo backend; in that case each
+    process exchanges its tensors via asynchronous point-to-point sends and receives.
+
+    Returns nothing but modifies output_list in-place.
+
+    """
+    comm_size = dist.get_world_size(group=group)
+
+    if dist.get_backend(group) == "gloo":
+
+        # Need to check the torch version here because the dist.send/recv syntax changed in torch v2.6
+        torch_version = torch.__version__.split(".")
+        torch_major_version = int(torch_version[0])
+        torch_minor_version = int(torch_version[1])
+        if (torch_major_version, torch_minor_version) < (2, 6):
+            raise NotImplementedError("Gloo all_to_all not implemented for torch < v2.6")
+
+        reqs = []
+        rank = dist.get_rank(group=group)
+        # Here we implement the linear shift algorithm from Hofmann and Ruenger, 2013
+        for i in range(0, comm_size):
+            j = (i - rank + comm_size) % comm_size
+            if j != rank:
+                # exchange data with rank j
+                reqs.append(dist.isend(input_list[j], group_dst=j, group=group))
+                reqs.append(dist.irecv(output_list[j], group_src=j, group=group))
+            else:
+                output_list[rank] = input_list[rank]
+        for req in reqs:
+            req.wait()
+    else:
+        dist.all_to_all(output_list, input_list, group=group)
+
+
 def _headsalltoall(input_: Tensor, shapes: list, group: Optional[ProcessGroup] = None) -> Tensor:
     """Apply all_to_all along the head dimension.

@@ -52,7 +88,7 @@ def _headsalltoall(input_: Tensor, shapes: list, group: Optional[ProcessGroup] =
         for rank in range(comm_size)
     ]

-    dist.all_to_all(output_list, input_list, group=group)
+    _alltoallwrapper(output_list, input_list, group=group)

     # Note: torch.cat already creates a contiguous tensor.
     return torch.cat(output_list, dim=-2).contiguous(memory_format=input_format)
@@ -79,7 +115,7 @@ def _seqalltoall(input_: Tensor, shapes: list, group: Optional[ProcessGroup] = N

     output_list = [torch.empty_like(input_list[comm_rank]) for _ in range(comm_size)]

-    dist.all_to_all(output_list, input_list, group=group)
+    _alltoallwrapper(output_list, input_list, group=group)

     # Note: torch.cat already creates a contiguous tensor.
     return torch.cat(output_list, dim=-3).contiguous(memory_format=input_format)
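
For reference, a minimal usage sketch (not part of the commit) of how the Gloo fallback could be exercised on CPU with two processes. It assumes torch >= 2.6 and an installed anemoi.models package; the _demo_worker name, address, and port are hypothetical values chosen for the sketch.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from anemoi.models.distributed.transformer import _alltoallwrapper


def _demo_worker(rank: int, world_size: int) -> None:
    # Hypothetical single-node rendezvous settings for this sketch.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    # One chunk per peer; the wrapper fills output_list in-place.
    input_list = [torch.full((2,), float(rank * world_size + j)) for j in range(world_size)]
    output_list = [torch.empty(2) for _ in range(world_size)]
    _alltoallwrapper(output_list, input_list, group=dist.group.WORLD)

    print(f"rank {rank}: {[t.tolist() for t in output_list]}")
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(_demo_worker, args=(2,), nprocs=2)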
