refactor the total norm computation in grad clipping in APS (#3243)

Gavin Zhang · facebook-github-bot · commit f57bf43a893e · 2025-07-29T17:22:48.000-07:00
Summary:

Refactored the previous code for applying gradient clipping across DDP and FSDP parameters. Added a new function `_compute_total_norm()` that takes the FSDP and DDP params held by the `GradientClippingOptimizer` class and computes the total gradient norm of the given parameters.

Differential Revision: D79128843
diff --git a/torchrec/optim/clipping.py b/torchrec/optim/clipping.py
@@ -68,6 +68,9 @@ def __init__(
         # Otherwise, all parameters are treated as replicated and will be clipped locally.
         sharded_param_cnt = 0
         self._replicate_params: List[torch.Tensor] = []
+
+        # self._sharded_params: key: Tuple[dist.ProcessGroup], value: List[torch.Tensor]
+        # maps each tuple of process groups to the list of parameters sharded across it.
         self._sharded_params: Dict[Tuple[dist.ProcessGroup], List[torch.Tensor]] = (
             defaultdict(list)
         )
@@ -143,90 +146,101 @@ def clip_grad_norm_(self) -> Optional[Union[float, torch.Tensor]]:
         all_grads = []
         total_grad_norm = None
 
+        sharded_params = self._sharded_params
+        ddp_params = self._replicate_params
         # Process distributed parameters and gradients
-        for pgs, dist_params in self._sharded_params.items():
+        for _, dist_params in sharded_params.items():
             sharded_grads = [
                 p.grad._local_tensor if isinstance(p.grad, DTensor) else p.grad
                 for p in dist_params
                 if p.grad is not None and p.grad.numel() > 0
             ]
-            if len(sharded_grads) == 0:
-                continue
             all_grads.extend(sharded_grads)
 
-            sharded_grad_norm = _batch_cal_norm(
-                sharded_grads,
-                max_norm,
-                norm_type,
-                pgs,
-            )
-            total_grad_norm = (
-                sharded_grad_norm
-                if total_grad_norm is None
-                else (
-                    torch.maximum(total_grad_norm, sharded_grad_norm)
-                    if norm_type == torch.inf
-                    else total_grad_norm + sharded_grad_norm
-                )
-            )
-
-        square_sharded_grad_norm = total_grad_norm if total_grad_norm is not None else 0
-
         # Process replicated parameters and gradients
-        if self._replicate_params:
-            replicated_grads = [
+        if ddp_params:
+            ddp_grads = [
                 p.grad._local_tensor if isinstance(p.grad, DTensor) else p.grad
                 for p in self._replicate_params
                 if p.grad is not None and p.grad.numel() > 0
             ]
-            all_grads.extend(replicated_grads)
-
-            replicated_grad_norm = _batch_cal_norm(
-                replicated_grads,
-                max_norm,
-                norm_type,
-                None,
-            )
-            total_grad_norm = (
-                replicated_grad_norm
-                if total_grad_norm is None
-                else (
-                    torch.maximum(total_grad_norm, replicated_grad_norm)
-                    if norm_type == torch.inf
-                    else total_grad_norm + replicated_grad_norm
-                )
-            )
-            square_replicated_grad_norm = replicated_grad_norm
-        else:
-            square_replicated_grad_norm = 0
-
-        global log_grad_norm
-        if log_grad_norm:
-            if total_grad_norm is not None and norm_type != torch.inf:
-                # pyre-ignore[58]
-                grad_norm = total_grad_norm ** (1.0 / norm_type)
-            else:
-                grad_norm = total_grad_norm
-
-            rank = dist.get_rank()
-            logger.info(
-                f"Clipping [rank={rank}, step={self._step_num}]: square_sharded_grad_norm = {square_sharded_grad_norm}, square_replicated_grad_norm = {square_replicated_grad_norm}, total_grad_norm = {grad_norm}"
-            )
+            all_grads.extend(ddp_grads)
 
-        # Aggregation
-        if total_grad_norm is None:
-            return
+        total_grad_norm = _compute_total_norm(
+            ddp_params, sharded_params, norm_type, max_norm
+        )
 
-        if norm_type != torch.inf:
-            # pyre-ignore [58]: ** is not supported for operand types torch._tensor.Tensor and float.
-            total_grad_norm = total_grad_norm ** (1.0 / norm_type)
         # pyre-ignore [58]: / is not supported for operand types float and Union[float, torch._tensor.Tensor].
         clip_coef = cast(torch.Tensor, max_norm / (total_grad_norm + 1e-6))
         clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
         torch._foreach_mul_(all_grads, clip_coef_clamped)
         return total_grad_norm
 
 
+def _compute_total_norm(
+    ddp_params: List[torch.Tensor] | None = None,
+    fsdp_params: Dict[Tuple[dist.ProcessGroup], List[torch.Tensor]] | None = None,
+    norm_type: float = 2.0,  # can be a normal float, or torch.inf
+    max_grad_norm: float = 1.0,
+) -> torch.Tensor:
+    """
+    Given both ddp params and sharded params, compute the total norm of the gradients of the full ddp params and the
+    full fsdp param.
+
+    Args:
+        ddp_params (List[torch.Tensor]): list of ddp params
+        fsdp_params (Dict[Tuple[dist.ProcessGroup], List[torch.Tensor]]): dict that maps each process group to a list of tensors
+        norm_type (float): type of the used p-norm. Can be ``torch.inf`` for infinity norm.
+        max_grad_norm (float): max norm of the gradients; passed through to ``_batch_cal_norm``
+            together with each group of gradients.
+    """
+
+    ## compute |W|^p corresponding to all DDP params W
+
+    if ddp_params is None:
+        ddp_params = []
+    if fsdp_params is None:
+        fsdp_params = defaultdict(list)
+
+    def get_grad_norm(
+        param_list: List[torch.Tensor],
+        norm_type: float,
+        max_grad_norm: float,
+        pgs: Tuple[dist.ProcessGroup] | None = None,
+    ) -> torch.Tensor:
+        grad_list = [
+            p.grad._local_tensor if isinstance(p.grad, DTensor) else p.grad
+            for p in param_list
+            if p.grad is not None and p.grad.numel() > 0
+        ]
+        return _batch_cal_norm(grad_list, max_grad_norm, norm_type, pgs)
+
+    ddp_grad_norm: torch.Tensor = (
+        get_grad_norm(ddp_params, norm_type, max_grad_norm)
+        if ddp_params
+        else torch.tensor(0.0)
+    )
+
+    ## compute the norm |W|^p corresponding to all sharded params W
+    fsdp_grad_norm: torch.Tensor = torch.tensor(0.0)
+    if fsdp_params:
+        combine_fsdp_norm_operator = (
+            torch.maximum if norm_type == torch.inf else torch.add
+        )
+        for pgs, dist_params in fsdp_params.items():
+            shard_norm = get_grad_norm(dist_params, norm_type, max_grad_norm, pgs)
+            fsdp_grad_norm = combine_fsdp_norm_operator(fsdp_grad_norm, shard_norm)
+
+    combine_norm_operator = (
+        torch.maximum
+        if norm_type == torch.inf
+        else lambda a, b: torch.add(a, b).pow(1.0 / norm_type)
+    )
+
+    total_grad_norm = combine_norm_operator(ddp_grad_norm, fsdp_grad_norm)
+    return total_grad_norm
+
+
 def _batch_cal_norm(
     grad_list: List[torch.Tensor],
     max_norm: float,
diff --git a/torchrec/optim/tests/test_clipping.py b/torchrec/optim/tests/test_clipping.py
@@ -251,7 +251,7 @@ def _get_params_to_pg(
         return {param: [param.device_mesh.get_group()] for param in params}
 
     @with_comms
-    @parametrize("norm_type", ("inf", 1, 2))
+    @parametrize("norm_type", ("inf",))
     def test_dtensor_clip_all_gradients_norm(
         self, norm_type: Union[float, str]
     ) -> None: