
Commit aad0d31

Add hopper support
Signed-off-by: ilmarkov <[email protected]>
1 parent 1541ee7 commit aad0d31

File tree

7 files changed: +70 -32 lines changed


docs/design/v1/multiprocessing.md

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ The `multiproc_xpu_executor` forces the use of `spawn`.
 
 There are other miscellaneous places hard-coding the use of `spawn`:
 
-- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L135>
+- <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/distributed/device_communicators/all_reduce_utils.py#L135>
 - <https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/entrypoints/openai/api_server.py#L184>
 
 Related PRs:

tools/check_pickle_imports.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@
     'vllm/distributed/utils.py',
     'vllm/distributed/parallel_state.py',
     'vllm/engine/multiprocessing/client.py',
-    'vllm/distributed/device_communicators/custom_all_reduce_utils.py',
+    'vllm/distributed/device_communicators/all_reduce_utils.py',
     'vllm/distributed/device_communicators/shm_broadcast.py',
     'vllm/engine/multiprocessing/engine.py',
     'benchmarks/kernels/graph_machete_bench.py',

vllm/distributed/device_communicators/custom_all_reduce_utils.py renamed to vllm/distributed/device_communicators/all_reduce_utils.py

Lines changed: 33 additions & 0 deletions
@@ -23,6 +23,39 @@
 
 logger = init_logger(__name__)
 
+MiB = 1024 * 1024
+# Max size for each world size in case symmetric memory is available
+# For different SM architectures
+CUSTOM_ALL_REDUCE_MAX_SIZES = {
+    "9.0": {
+        2: 64 * MiB,  # 64 MB
+        4: 32 * MiB,  # 32 MB
+        6: MiB // 2,  # 512 KB
+        8: MiB // 4,  # 256 KB
+    },
+    "10.0": {
+        2: 2 * MiB,  # 2 MB
+        4: 2 * MiB,  # 2 MB
+        6: 2 * MiB,  # 2 MB
+        8: 2 * MiB,  # 2 MB
+    }
+}
+
+SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
+    "9.0": {
+        2: 64 * MiB,  # 64 MB
+        4: 32 * MiB,  # 32 MB
+        6: 64 * MiB,  # 64 MB
+        8: 64 * MiB,  # 64 MB
+    },
+    "10.0": {
+        2: 8 * MiB,  # 8 MB
+        4: 32 * MiB,  # 32 MB
+        6: 128 * MiB,  # 128 MB
+        8: 128 * MiB,  # 128 MB
+    }
+}
+
 
 def producer(batch_src: Sequence[int],
              producer_queue,
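
For reference, both tables are keyed first by the SM version string returned by current_platform.get_device_capability().as_version_str() ("9.0" is Hopper, "10.0" is Blackwell) and then by world size; callers fall back to other paths when either key is missing. A minimal sketch of that lookup, using a hypothetical lookup_max_size helper that is not part of the commit:

# Illustrative sketch of consulting the capability/world-size keyed tables
# above; `lookup_max_size` is a hypothetical helper, not part of the commit.
from typing import Optional

MiB = 1024 * 1024
SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
    "9.0": {2: 64 * MiB, 4: 32 * MiB, 6: 64 * MiB, 8: 64 * MiB},
    "10.0": {2: 8 * MiB, 4: 32 * MiB, 6: 128 * MiB, 8: 128 * MiB},
}

def lookup_max_size(device_capability: str, world_size: int) -> Optional[int]:
    """Return the symmetric-memory all-reduce limit in bytes, or None when
    the architecture or world size is not covered by the table."""
    per_arch = SYMM_MEM_ALL_REDUCE_MAX_SIZES.get(device_capability)
    if per_arch is None:
        return None
    return per_arch.get(world_size)

assert lookup_max_size("9.0", 8) == 64 * MiB   # Hopper, 8 ranks
assert lookup_max_size("10.0", 2) == 8 * MiB   # Blackwell, 2 ranks
assert lookup_max_size("8.0", 8) is None       # Ampere: not in the table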

vllm/distributed/device_communicators/cuda_communicator.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def all_reduce(self, input_):
             assert out is not None
             return out
         symm_mem_comm = self.symm_mem_comm
-        if symm_mem_comm is not None and not symm_mem_comm.disabled and \
+        if symm_mem_comm is not None and \
                 symm_mem_comm.should_use_symm_mem(input_):
             out = symm_mem_comm.all_reduce(input_)
             assert out is not None
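
For context, all_reduce here tries each backend in turn and the first one whose predicate accepts the tensor handles the reduction; with this change the symmetric-memory branch is gated by should_use_symm_mem alone. A rough sketch of that dispatch with simplified stand-in objects (none of this is the actual vLLM implementation):

# Rough sketch of the backend dispatch in CudaCommunicator.all_reduce.
# The communicator below is a simplified stand-in, not the real class.
import torch

class FakeSymmMemComm:
    def __init__(self, max_size: int):
        self.max_size = max_size

    def should_use_symm_mem(self, inp: torch.Tensor) -> bool:
        # Mirrors the predicate shape: 4-byte aligned and under the size cap.
        inp_size = inp.numel() * inp.element_size()
        return inp_size % 4 == 0 and inp_size < self.max_size

    def all_reduce(self, inp: torch.Tensor) -> torch.Tensor:
        return inp.clone()  # placeholder for the symmetric-memory kernel

def dispatch_all_reduce(symm_mem_comm, input_: torch.Tensor) -> torch.Tensor:
    # The explicit `disabled` check is gone; the predicate alone decides.
    if symm_mem_comm is not None and \
            symm_mem_comm.should_use_symm_mem(input_):
        out = symm_mem_comm.all_reduce(input_)
        assert out is not None
        return out
    return input_  # placeholder for the default torch.distributed path

small = torch.ones(1024, dtype=torch.bfloat16)
print(dispatch_all_reduce(FakeSymmMemComm(8 * 1024 * 1024), small).shape)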

vllm/distributed/device_communicators/custom_all_reduce.py

Lines changed: 9 additions & 13 deletions
@@ -10,8 +10,8 @@
 
 import vllm.envs as envs
 from vllm import _custom_ops as ops
-from vllm.distributed.device_communicators.custom_all_reduce_utils import (
-    gpu_p2p_access_check)
+from vllm.distributed.device_communicators.all_reduce_utils import (
+    CUSTOM_ALL_REDUCE_MAX_SIZES, gpu_p2p_access_check)
 from vllm.distributed.parallel_state import in_the_same_node_as
 from vllm.logger import init_logger
 from vllm.platforms import current_platform

@@ -49,14 +49,6 @@ def is_weak_contiguous(inp: torch.Tensor):
 class CustomAllreduce:
 
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
-    MiB = 1024 * 1024
-    # Max sizes for each world size in case symmetric memory is available
-    _MAX_SIZES = {
-        2: 2 * MiB,  # 1 MB
-        4: 2 * MiB,  # 1 MB
-        6: MiB,  # 512 KB
-        8: MiB // 2,  # 512 KB
-    }
 
     # max_size: max supported allreduce size
     def __init__(self,

@@ -117,9 +109,13 @@ def __init__(self,
         # now `device` is a `torch.device` object
         assert isinstance(device, torch.device)
         self.device = device
-        if current_platform.is_cuda() and envs.VLLM_ALLREDUCE_USE_SYMM_MEM:
-            max_size = CustomAllreduce._MAX_SIZES[world_size]
-
+        device_capability = current_platform.get_device_capability(
+        ).as_version_str()
+        if (current_platform.is_cuda() and envs.VLLM_ALLREDUCE_USE_SYMM_MEM
+                and device_capability in CUSTOM_ALL_REDUCE_MAX_SIZES):
+            max_size = min(
+                CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability][world_size],
+                max_size)
         cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
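
The effect of the new constructor logic is that the requested max_size is clamped by the per-architecture table, and only when CUDA symmetric memory is enabled and the capability is listed. A minimal standalone sketch of that clamp (the function name and the use_symm_mem flag are illustrative, not part of the commit):

# Minimal sketch of the max_size clamp added to CustomAllreduce.__init__.
# `clamp_max_size` and `use_symm_mem` are illustrative names; the table
# mirrors CUSTOM_ALL_REDUCE_MAX_SIZES from all_reduce_utils.py.
MiB = 1024 * 1024
CUSTOM_ALL_REDUCE_MAX_SIZES = {
    "9.0": {2: 64 * MiB, 4: 32 * MiB, 6: MiB // 2, 8: MiB // 4},
    "10.0": {2: 2 * MiB, 4: 2 * MiB, 6: 2 * MiB, 8: 2 * MiB},
}

def clamp_max_size(max_size: int, device_capability: str, world_size: int,
                   use_symm_mem: bool) -> int:
    """Shrink max_size to the per-architecture limit when it applies."""
    if use_symm_mem and device_capability in CUSTOM_ALL_REDUCE_MAX_SIZES:
        max_size = min(
            CUSTOM_ALL_REDUCE_MAX_SIZES[device_capability][world_size],
            max_size)
    return max_size

# Hopper (9.0), 8 ranks: a requested 8 MiB cap drops to 256 KiB.
print(clamp_max_size(8 * MiB, "9.0", 8, use_symm_mem=True))    # 262144
# Unknown capability: the requested cap is left untouched.
print(clamp_max_size(8 * MiB, "8.0", 8, use_symm_mem=True))    # 8388608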

vllm/distributed/device_communicators/symm_mem.py

Lines changed: 24 additions & 15 deletions
@@ -6,6 +6,8 @@
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
+from vllm.distributed.device_communicators.all_reduce_utils import (
+    SYMM_MEM_ALL_REDUCE_MAX_SIZES)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 

@@ -20,13 +22,9 @@
 
 
 class SymmMemCommunicator:
-    MiB = 1024 * 1024
-    # Max sizes for each world size
-    _MAX_SIZES = {
-        2: 8 * MiB,
-        4: 32 * MiB,
-        6: 128 * MiB,
-        8: 128 * MiB,
+    _WORLD_SIZES_MULTIMEM = {
+        "9.0": [4, 6, 8],
+        "10.0": [6, 8],
     }
 
     def __init__(self, group: ProcessGroup, device: Union[int, str,

@@ -49,15 +47,27 @@ def __init__(self, group: ProcessGroup, device: Union[int, str,
         self.device = device
         self.group = group
         self.world_size = dist.get_world_size(self.group)
-        if self.world_size not in self._MAX_SIZES:
+        self.device_capability = current_platform.get_device_capability(
+        ).as_version_str()
+        if self.device_capability not in SYMM_MEM_ALL_REDUCE_MAX_SIZES:
+            logger.warning(
+                "SymmMemCommunicator: Device capability %s not supported, "
+                "communicator is not available.",
+                self.device_capability,
+            )
+            return
+        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[
+                self.device_capability]:
             logger.warning(
                 "SymmMemCommunicator: World size %d not supported, "
                 "communicator is not available.",
                 self.world_size,
             )
             return
+        self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
+            self.world_size]
         self.buffer = torch_symm_mem.empty(
-            self._MAX_SIZES[self.world_size] // self.dtype.itemsize,
+            self.max_size // self.dtype.itemsize,
             device=self.device,
             dtype=self.dtype,
         )

@@ -76,7 +86,7 @@ def should_use_symm_mem(self, inp: torch.Tensor):
         inp_size = inp.numel() * inp.element_size()
         if inp_size % 4 != 0:
             return False
-        return inp_size <= self._MAX_SIZES[self.world_size]
+        return inp_size < self.max_size
 
     def all_reduce(
         self,

@@ -88,14 +98,13 @@ def all_reduce(
         if out is None:
             out = torch.empty_like(inp)
         self.buffer[:inp.numel()].copy_(inp.view(-1))
-        if self.world_size in [2, 4]:
-            # Use two-shot all-reduce for 2 and 4 GPUs
-            torch.ops.symm_mem.two_shot_all_reduce_(self.buffer[:inp.numel()],
+        if self.world_size in self._WORLD_SIZES_MULTIMEM[
+                self.device_capability]:
+            torch.ops.symm_mem.multimem_all_reduce_(self.buffer[:inp.numel()],
                                                     "sum",
                                                     self.group.group_name)
         else:
-            # Use multi-mem all-reduce for 6 and 8 GPUs
-            torch.ops.symm_mem.multimem_all_reduce_(self.buffer[:inp.numel()],
+            torch.ops.symm_mem.two_shot_all_reduce_(self.buffer[:inp.numel()],
                                                     "sum",
                                                     self.group.group_name)
         out.copy_(self.buffer[:inp.numel()].view(out.shape))
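
With _WORLD_SIZES_MULTIMEM, the choice between the multimem and two-shot symmetric-memory kernels is now made per architecture rather than hard-coded on world size alone. A small sketch of that selection, returning the op name instead of dispatching the real torch.ops.symm_mem kernel:

# Sketch of the kernel selection above; returns the op name only instead of
# calling the real torch.ops.symm_mem kernels.
_WORLD_SIZES_MULTIMEM = {
    "9.0": [4, 6, 8],   # Hopper
    "10.0": [6, 8],     # Blackwell
}

def pick_all_reduce_kernel(device_capability: str, world_size: int) -> str:
    if world_size in _WORLD_SIZES_MULTIMEM[device_capability]:
        return "multimem_all_reduce_"
    return "two_shot_all_reduce_"

print(pick_all_reduce_kernel("9.0", 2))    # two_shot_all_reduce_
print(pick_all_reduce_kernel("9.0", 4))    # multimem_all_reduce_
print(pick_all_reduce_kernel("10.0", 4))   # two_shot_all_reduce_
print(pick_all_reduce_kernel("10.0", 8))   # multimem_all_reduce_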

vllm/envs.py

Lines changed: 1 addition & 1 deletion
@@ -625,7 +625,7 @@ def get_vllm_port() -> Optional[int]:
                 ("1", "true")),
 
     # By default, vLLM will check the peer-to-peer capability itself,
-    # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa
+    # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/all_reduce_utils.py#L101-L108 for details. # noqa
     # If this env var is set to 1, vLLM will skip the peer-to-peer check,
     # and trust the driver's peer-to-peer capability report.
     "VLLM_SKIP_P2P_CHECK":
