
Commit f70b66d

add nvshmem sum_reduce for mnnvl allreduce (#1152)
## 📌 Description

Add support for MNNVL all-reduce through NVSHMEM sum reduce.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent 71509fa commit f70b66d
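
For a quick picture of what this commit adds, here is a minimal usage sketch of the new NVSHMEM allreduce path (modeled on `tests/test_nvshmem_allreduce.py` below; it assumes `torch.distributed` is already initialized on every rank and that NVSHMEM is available on the system):

```python
import torch
import torch.distributed as dist

from flashinfer.comm.nvshmem_allreduce import NVSHMEMAllReduce

rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(rank)
device = torch.device("cuda", rank)

# Size the NVSHMEM symmetric-heap buffers for the largest tensor to be reduced.
allreduce = NVSHMEMAllReduce(
    rank,
    world_size,
    max_buffer_elements=4096 * 8192,
    dtype=torch.bfloat16,
    device=device,
    group=dist.group.WORLD,
)

inp = torch.full([8192], rank, dtype=torch.bfloat16, device=device)
out = torch.empty_like(inp)
allreduce.all_reduce(inp, out)  # out now holds the elementwise sum across all ranks

allreduce.shutdown()
```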

File tree

3 files changed: +320 -0 lines changed

- csrc/nvshmem_binding.cu
- flashinfer/comm/nvshmem_allreduce.py
- tests/test_nvshmem_allreduce.py

csrc/nvshmem_binding.cu

Lines changed: 73 additions & 0 deletions
```diff
@@ -83,6 +83,71 @@ void alltoall(at::Tensor dest, at::Tensor source) {
 
 void fake_alltoall(at::Tensor dest, at::Tensor source) {}
 
+void sum_reduce(at::Tensor dest, at::Tensor source, int64_t nelems) {
+  TORCH_CHECK(dest.is_contiguous(), "dest must be contiguous");
+  TORCH_CHECK(source.is_contiguous(), "source must be contiguous");
+  TORCH_CHECK(dest.scalar_type() == source.scalar_type(),
+              "dest and source must have the same dtype");
+
+  // Add validation and conversion
+  TORCH_CHECK(nelems >= 0, "nelems must be non-negative, got ", nelems);
+  TORCH_CHECK(nelems <= SIZE_MAX, "nelems too large: ", nelems, " > ", SIZE_MAX);
+  size_t nelems_size_t = static_cast<size_t>(nelems);
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  switch (dest.scalar_type()) {
+    case at::kHalf:  // float16
+      NVSHMEMCHECK(nvshmemx_half_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD, (__half*)dest.data_ptr(),
+                                                      (__half*)source.data_ptr(), nelems_size_t,
+                                                      stream));
+      break;
+    case at::kFloat:  // float32
+      NVSHMEMCHECK(nvshmemx_float_sum_reduce_on_stream(NVSHMEM_TEAM_WORLD, (float*)dest.data_ptr(),
+                                                       (float*)source.data_ptr(), nelems_size_t,
+                                                       stream));
+      break;
+    case at::kBFloat16:  // bfloat16
+      NVSHMEMCHECK(nvshmemx_bfloat16_sum_reduce_on_stream(
+          NVSHMEM_TEAM_WORLD, (__nv_bfloat16*)dest.data_ptr(), (__nv_bfloat16*)source.data_ptr(),
+          nelems_size_t, stream));
+      break;
+
+    default:
+      TORCH_CHECK(false, "Unsupported dtype for nvshmem_sum_reduce: ", dest.scalar_type());
+  }
+}
+
+void fake_sum_reduce(at::Tensor dest, at::Tensor source, int64_t nelems) {}
+
+void allreduce_on_stream_with_copy(at::Tensor dest_symm, at::Tensor source_symm,
+                                   at::Tensor dest_local, at::Tensor source_local, int64_t nelems) {
+  TORCH_CHECK(dest_symm.is_contiguous(), "dest_symm must be contiguous");
+  TORCH_CHECK(source_symm.is_contiguous(), "source_symm must be contiguous");
+  TORCH_CHECK(dest_local.is_contiguous(), "dest_local must be contiguous");
+  TORCH_CHECK(source_local.is_contiguous(), "source_local must be contiguous");
+  TORCH_CHECK(dest_symm.scalar_type() == source_symm.scalar_type(),
+              "dest_symm and source_symm must have the same dtype");
+  TORCH_CHECK(dest_symm.scalar_type() == source_local.scalar_type(),
+              "dest_symm and source_local must have the same dtype");
+  TORCH_CHECK(dest_local.scalar_type() == source_local.scalar_type(),
+              "dest_local and source_local must have the same dtype");
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  cudaMemcpyAsync(source_symm.data_ptr(), source_local.data_ptr(),
+                  nelems * source_local.element_size(), cudaMemcpyDefault, stream);
+  nvshmemx_barrier_on_stream(NVSHMEM_TEAM_WORLD, stream);
+  sum_reduce(dest_symm, source_symm, nelems);
+  cudaMemcpyAsync(dest_local.data_ptr(), dest_symm.data_ptr(), nelems * dest_local.element_size(),
+                  cudaMemcpyDefault, stream);
+  cudaStreamSynchronize(stream);
+}
+
+void fake_allreduce_on_stream_with_copy(at::Tensor dest_symm, at::Tensor source_symm,
+                                        at::Tensor dest_local, at::Tensor source_local,
+                                        int64_t nelems) {}
+
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("nvshmem_get_unique_id", &get_unique_id);
   m.def("nvshmem_unique_id_size", &unique_id_size);
@@ -96,6 +161,14 @@ TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("nvshmem_alltoall(Tensor! dest, Tensor src) -> ()");
   m.impl("nvshmem_alltoall", c10::kCUDA, &alltoall);
   m.impl("nvshmem_alltoall", c10::kMeta, &fake_alltoall);
+  m.def("nvshmem_sum_reduce(Tensor! dest, Tensor src, int nelems) -> ()");
+  m.impl("nvshmem_sum_reduce", c10::kCUDA, &sum_reduce);
+  m.impl("nvshmem_sum_reduce", c10::kMeta, &fake_sum_reduce);
+  m.def(
+      "nvshmem_allreduce_on_stream_with_copy(Tensor! dest_symm, Tensor source_symm, Tensor "
+      "dest_local, Tensor source_local, int nelems) -> ()");
+  m.impl("nvshmem_allreduce_on_stream_with_copy", c10::kCUDA, &allreduce_on_stream_with_copy);
+  m.impl("nvshmem_allreduce_on_stream_with_copy", c10::kMeta, &fake_allreduce_on_stream_with_copy);
 };
 
 } // namespace
```
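
The ops registered above become reachable from Python through the JIT-built NVSHMEM module already used by `flashinfer.comm`. Below is a hedged sketch of calling `nvshmem_sum_reduce` directly; it assumes NVSHMEM has been initialized (for example via the `NVSHMEMAllReduce` wrapper below) and that both tensors were allocated on the symmetric heap with `nvshmem_malloc`:

```python
import torch

from flashinfer.comm.nvshmem import get_nvshmem_module

nvshmem = get_nvshmem_module()

nelems = 1024
device = torch.device("cuda")
# Both operands must live on the NVSHMEM symmetric heap.
src = nvshmem.nvshmem_malloc([nelems], torch.float16, device)
dst = nvshmem.nvshmem_malloc([nelems], torch.float16, device)

src.fill_(1.0)
# Sum-reduce over NVSHMEM_TEAM_WORLD on the current CUDA stream;
# the binding above supports float16, float32 and bfloat16.
nvshmem.nvshmem_sum_reduce(dst, src, nelems)
torch.cuda.synchronize()
# After synchronization, every PE's dst holds the value n_pes.
```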

flashinfer/comm/nvshmem_allreduce.py

Lines changed: 134 additions & 0 deletions
```python
"""
Copyright (c) 2023 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from typing import Optional

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup

from .nvshmem import get_nvshmem_module


class NVSHMEMAllReduce:
    """
    An AllReduce implementation for Single-Node and Multi-Node NVLink communication.
    This class handles NVLINK-specific allreduce operations, optimized for NVLink-enabled clusters.
    Note: Requires an active torch.distributed process group to be initialized
    prior to creating an instance of this class.

    Args:
        local_rank (int): The local rank of the current process.
        world_size (int): The total number of processes in the distributed group.
        max_buffer_elements (int): The maximum number of elements that can be stored in
            the buffer. This is used to allocate memory in nvshmem symm heap. set to the
            largest tensor size you will be reducing.
        dtype (torch.dtype): The data type of the tensors to be reduced.
        device (torch.device): The device on which the tensors are located.
        group (torch.distributed.ProcessGroup, optional): The torch.distributed process group to use.
        should_init (bool, optional): Whether to initialize nvshmem. Defaults to True.
    Raises:
        RuntimeError: If nvshmem fails to initialize.
    """

    def __init__(
        self,
        local_rank: int,
        world_size: int,
        max_buffer_elements: int,
        dtype: torch.dtype,
        device: torch.device,
        group: Optional[ProcessGroup] = None,
        should_init: bool = True,
    ):
        self.local_rank = local_rank
        self.world_size = world_size
        self.dtype = dtype
        self.device = device
        self.max_buffer_elements = max_buffer_elements
        self.group = group
        self.nvshmem_module = get_nvshmem_module()

        self.should_init = should_init
        if self.should_init:
            self.init_nvshmem()

        # assert PE and world size match
        my_pe = self.nvshmem_module.nvshmem_my_pe()
        n_pes = self.nvshmem_module.nvshmem_n_pes()
        if my_pe != local_rank:
            print(
                f"WARNING: Rank {local_rank}: PE mismatch! Expected PE {local_rank}, got PE {my_pe}",
                flush=True,
            )
        if n_pes != world_size:
            print(
                f"WARNING: Rank {local_rank}: World size mismatch! Expected {world_size}, got {n_pes}",
                flush=True,
            )

        # allocate memory in nvshmem symm heap
        self.symm_buffer_input = self.nvshmem_module.nvshmem_malloc(
            [max_buffer_elements],
            self.dtype,
            self.device,
        )
        self.symm_buffer_output = self.nvshmem_module.nvshmem_malloc(
            [max_buffer_elements],
            self.dtype,
            self.device,
        )
        torch.distributed.barrier(self.group)

    def init_nvshmem(self):
        torch.zeros(
            self.nvshmem_module.nvshmem_unique_id_size(),
            dtype=torch.uint8,
            device="cpu",
        )
        if self.local_rank == 0:
            uid = self.nvshmem_module.nvshmem_get_unique_id()
        else:
            uid = torch.zeros(
                self.nvshmem_module.nvshmem_unique_id_size(),
                dtype=torch.uint8,
                device="cpu",
            )
        torch.distributed.broadcast(uid, src=0)
        torch.distributed.barrier(self.group)
        init_status = self.nvshmem_module.nvshmem_init(
            uid, self.local_rank, self.world_size
        )
        torch.cuda.synchronize()
        if init_status != 0:
            raise RuntimeError("Failed to initialize nvshmem")

    def all_reduce(self, inp: torch.Tensor, out: torch.Tensor) -> None:
        self.nvshmem_module.nvshmem_allreduce_on_stream_with_copy(
            self.symm_buffer_output,
            self.symm_buffer_input,
            out,
            inp,
            inp.numel(),
        )

    def shutdown(self):
        del self.symm_buffer_input
        del self.symm_buffer_output
        torch.distributed.barrier(self.group)
        torch.cuda.synchronize()
        if self.should_init:
            self.nvshmem_module.nvshmem_finalize()
```
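
For reference, `all_reduce` above delegates entirely to the C++ `nvshmem_allreduce_on_stream_with_copy` op from the binding; the sketch below restates its staging sequence in illustrative Python (not the actual implementation; the inter-PE barrier and stream handling live inside the op):

```python
import torch


def _all_reduce_staged(module, symm_in, symm_out, inp, out):
    """Illustrative restatement of nvshmem_allreduce_on_stream_with_copy."""
    n = inp.numel()
    symm_in.view(-1)[:n].copy_(inp.view(-1))         # 1. local input -> symmetric heap
    # 2. nvshmemx_barrier_on_stream(NVSHMEM_TEAM_WORLD, ...) happens inside the C++ op
    module.nvshmem_sum_reduce(symm_out, symm_in, n)  # 3. sum across all PEs
    out.view(-1).copy_(symm_out.view(-1)[:n])        # 4. symmetric heap -> local output
    torch.cuda.synchronize()                         # 5. the op synchronizes the stream
```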

tests/test_nvshmem_allreduce.py

Lines changed: 113 additions & 0 deletions
```python
import logging
import multiprocessing as mp
import os
import socket
from typing import Any

import pytest
import torch
import torch.distributed as dist

from flashinfer.comm.nvshmem_allreduce import NVSHMEMAllReduce

logger = logging.getLogger(__name__)


def _run_correctness_worker(world_size, rank, distributed_init_port):
    assert rank >= 0
    torch.cuda.set_device(rank)
    device = torch.device("cuda", rank)
    distributed_init_method = f"tcp://localhost:{distributed_init_port}"
    dist.init_process_group(
        backend="cpu:gloo,cuda:nccl",
        rank=rank,
        world_size=world_size,
        device_id=device,
        init_method=distributed_init_method,
    )
    group = dist.group.WORLD
    num_ranks = torch.distributed.get_world_size()
    rank_id = torch.distributed.get_rank()

    batch_sizes = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
    max_batch_size = 4096
    hidden_dim = 8192
    test_loop = 10
    tensor_dtype = torch.bfloat16
    nvshmem_allreduce = NVSHMEMAllReduce(
        rank_id,
        num_ranks,
        max_batch_size * hidden_dim,
        tensor_dtype,
        device,
        group,
    )

    try:
        for batch_size in batch_sizes:
            for _ in range(test_loop):
                tensor_size = batch_size * hidden_dim
                inp1 = torch.full(
                    [tensor_size], rank_id, dtype=tensor_dtype, device=device
                )
                inp1_ref = inp1.clone()
                out1 = torch.empty_like(inp1)
                nvshmem_allreduce.all_reduce(inp1, out1)
                torch.distributed.all_reduce(inp1_ref, group=group)
                torch.cuda.synchronize()
                torch.testing.assert_close(out1, inp1_ref)
                torch.distributed.barrier(group)
    except Exception as e:
        print(f"Rank {rank_id}: Exception during test: {e}")
        raise
    finally:
        torch.distributed.barrier(group)
        nvshmem_allreduce.shutdown()
        torch.distributed.destroy_process_group(group)


def get_open_port() -> int:
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(("127.0.0.1", 0))
            return s.getsockname()[1]
    except OSError:
        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
            s.bind(("::1", 0))
            return s.getsockname()[1]


def multi_process_parallel(
    world_size: int, test_target: Any, target_args: tuple = ()
) -> None:
    mp.set_start_method("spawn", force=True)

    procs = []
    distributed_init_port = get_open_port()
    for i in range(world_size):
        proc_args = (world_size, i, distributed_init_port) + target_args
        proc = mp.Process(target=test_target, args=proc_args, name=f"Worker-{i}")
        proc.start()
        procs.append(proc)

    for i in range(world_size):
        procs[i].join()
        assert (
            procs[i].exitcode == 0
        ), f"Process {i} failed with exit code {procs[i].exitcode}"


@pytest.mark.parametrize("world_size", [8])
def test_nvshmem_allreduce(world_size):
    available_gpus = torch.cuda.device_count()
    if world_size > available_gpus:
        raise ValueError(
            f"world_size {world_size} is greater than available_gpus {available_gpus}"
        )
    print(f"Running test for world_size={world_size}")
    multi_process_parallel(
        world_size,
        _run_correctness_worker,
        target_args=(),
    )
    print(f"NVSHMEM allreduce tp = {world_size}: OK")
```
