Skip to content

Commit 3ef1044

Browse files
fix and enhance all_gather (#1228)
1 parent 1f8d6d9 commit 3ef1044

File tree

6 files changed

+60
-13
lines changed

6 files changed

+60
-13
lines changed

ppsci/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from ppsci.utils.checker import run_check # isort:skip
3131
from ppsci.utils.checker import run_check_mesh # isort:skip
3232
from ppsci.utils import lambdify # isort:skip
33+
from ppsci.utils import misc # isort:skip
3334

3435

3536
try:
@@ -58,6 +59,7 @@
5859
"run_check",
5960
"run_check_mesh",
6061
"lambdify",
62+
"misc",
6163
]
6264

6365

ppsci/arch/cuboid_transformer_encoder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -313,9 +313,9 @@ def compute_cuboid_self_attention_mask(
313313
"""Compute the shift window attention mask
314314
315315
Args:
316-
data_shape (Tuple[int,....]): Should be (T, H, W).
317-
cuboid_size (Tuple[int,....]): Size of the cuboid.
318-
shift_size (Tuple[int,....]): The shift size.
316+
data_shape (Tuple[int, ...]): Should be (T, H, W).
317+
cuboid_size (Tuple[int, ...]): Size of the cuboid.
318+
shift_size (Tuple[int, ...]): The shift size.
319319
strategy (str): The decomposition strategy.
320320
padding_type (str): Type of the padding.
321321
device (str): The device.

ppsci/arch/extformer_moe_cuboid_encoder.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,9 @@ def compute_cuboid_self_attention_mask(
337337
"""Compute the shift window attention mask
338338
339339
Args:
340-
data_shape (Tuple[int,....]): Should be (T, H, W).
341-
cuboid_size (Tuple[int,....]): Size of the cuboid.
342-
shift_size (Tuple[int,....]): The shift size.
340+
data_shape (Tuple[int, ...]): Should be (T, H, W).
341+
cuboid_size (Tuple[int, ...]): Size of the cuboid.
342+
shift_size (Tuple[int, ...]): The shift size.
343343
strategy (str): The decomposition strategy.
344344
padding_type (str): Type of the padding.
345345
device (str): The device.

ppsci/metric/func.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,10 @@ def __init__(
6363
self.metric_expr = metric_expr
6464

6565
def forward(self, output_dict, label_dict=None) -> Dict[str, "paddle.Tensor"]:
66-
return self.metric_expr(output_dict, label_dict)
66+
metric: "paddle.Tensor" = self.metric_expr(output_dict, label_dict)
67+
if self.keep_batch:
68+
assert metric.ndim >= 1, (
69+
f"metric.shape should be like [batch_size, ...], but got {metric.shape} when keep_batch is True, "
70+
"please check the return value of your metric_expr function."
71+
)
72+
return metric

ppsci/solver/solver.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,11 +280,27 @@ def __init__(
280280
*[_v.metric.values() for _v in self.validator.values()]
281281
):
282282
if metric.keep_batch ^ self.compute_metric_by_batch:
283+
"""
284+
Evaluation has two modes:
285+
1. compute_metric_by_batch=True:
286+
- The metric is computed for each batch separately, and the results
287+
are averaged across all batches.
288+
- Suitable for metrics that support additive aggregation (e.g. accuracy).
289+
- Saves memory since batch outputs are not stored.
290+
- In this mode, metric.keep_batch should be True.
291+
292+
2. compute_metric_by_batch=False:
293+
- The outputs and labels of all batches are cached.
294+
- Metric is computed once on the concatenated results at the end.
295+
- Needed for metrics that cannot be computed additively (e.g. L2 relative error).
296+
- In this mode, metric.keep_batch should be False.
297+
"""
283298
raise ValueError(
284299
f"{misc.typename(metric)}.keep_batch should be "
285-
f"{self.compute_metric_by_batch} when compute_metric_by_batch="
300+
f"{self.compute_metric_by_batch} when compute_metric_by_batch is "
286301
f"{self.compute_metric_by_batch}."
287302
)
303+
288304
# check metric name uniqueness over all validators
289305
_count = {}
290306
for _validator in validator.values():

ppsci/utils/misc.py

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import os
2020
import random
2121
import time
22+
import warnings
2223
from contextlib import ContextDecorator
2324
from typing import Callable
2425
from typing import Dict
@@ -32,6 +33,7 @@
3233
import paddle
3334
from matplotlib import pyplot as plt
3435
from paddle import distributed as dist
36+
from paddle.incubate.distributed.models.moe.moe_layer import AllGather
3537

3638
from ppsci.utils import logger
3739

@@ -326,14 +328,18 @@ def convert_to_dict(array: np.ndarray, keys: Tuple[str, ...]) -> Dict[str, np.nd
326328

327329

328330
def all_gather(
329-
tensor: paddle.Tensor, concat: bool = True, axis: int = 0
331+
tensor: paddle.Tensor,
332+
concat: bool = True,
333+
axis: int = 0,
334+
requires_grad: bool = False,
330335
) -> Union[paddle.Tensor, List[paddle.Tensor]]:
331336
"""Gather tensor from all devices, concatenate them along given axis if specified.
332337
333338
Args:
334339
tensor (paddle.Tensor): Tensor to be gathered from all GPUs.
335340
concat (bool, optional): Whether to concatenate gathered Tensors. Defaults to True.
336341
axis (int, optional): Axis which concatenated along. Defaults to 0.
342+
requires_grad (bool, optional): Whether to require gradient. Defaults to False.
337343
338344
Returns:
339345
Union[paddle.Tensor, List[paddle.Tensor]]: Gathered Tensors.
@@ -354,7 +360,7 @@ def all_gather(
354360
[ 7 8 9]
355361
[10 11 12]]
356362
"""
357-
result: List[paddle.Tensor] = []
363+
result: Union[paddle.Tensor, List[paddle.Tensor]] = []
358364

359365
# NOTE: Put tensor to CUDAPlace from CUDAPinnedPlace to use communication.
360366
if tensor.place.is_cuda_pinned_place():
@@ -363,10 +369,27 @@ def all_gather(
363369
# TODO(HydrogenSulfate): As non-contiguous(strided) tensor is not supported in
364370
# dist.all_gather, manually convert given Tensor to contiguous below. Strided tensor
365371
# will be supported in future.
366-
dist.all_gather(result, tensor.contiguous())
372+
if not requires_grad:
373+
dist.all_gather(result, tensor.contiguous())
374+
if concat:
375+
if tensor.ndim == 0:
376+
warnings.warn(
377+
"given tensor is a 0-dim tensor, so we use `paddle.stack` to replace `paddle.concat`",
378+
category=UserWarning,
379+
stacklevel=2,
380+
)
381+
result = paddle.stack(result, axis)
382+
else:
383+
result = paddle.concat(result, axis)
384+
else:
385+
assert (
386+
tensor.ndim > 0
387+
), "`all_gather` is not supported for 0-dim tensor when requires_grad=True"
388+
assert concat is True, "`requires_grad=True` only support `concat=True`"
389+
result = AllGather.apply(
390+
tensor.contiguous(), dist.get_rank(), dist.get_world_size(), None
391+
)
367392

368-
if concat:
369-
return paddle.concat(result, axis)
370393
return result
371394

372395

0 commit comments

Comments (0)