Commit cd8e0aa

wz337 authored and facebook-github-bot committed
Refactor _batch_cal_norm and remove #pyre-ignore
Summary: As title. Remove the repeated `total_grad_norm` calculation and fix pyre errors.

Differential Revision: D78398248
1 parent 5a8a005 commit cd8e0aa

File tree

1 file changed: +11 −16 lines

torchrec/optim/clipping.py

Lines changed: 11 additions & 16 deletions
@@ -200,28 +200,23 @@ def clip_grad_norm_(self) -> Optional[Union[float, torch.Tensor]]:
         else:
             square_replicated_grad_norm = 0

+        if total_grad_norm is not None:
+            total_grad_norm = (
+                torch.pow(total_grad_norm, 1.0 / norm_type)
+                if norm_type != torch.inf
+                else total_grad_norm
+            )
+        else:
+            return None
+
         global log_grad_norm
         if log_grad_norm:
-            if total_grad_norm is not None and norm_type != torch.inf:
-                # pyre-ignore[58]
-                grad_norm = total_grad_norm ** (1.0 / norm_type)
-            else:
-                grad_norm = 0
-
             rank = dist.get_rank()
             logger.info(
-                f"Clipping [rank={rank}, step={self._step_num}]: square_sharded_grad_norm = {square_sharded_grad_norm}, square_replicated_grad_norm = {square_replicated_grad_norm}, total_grad_norm = {grad_norm}"
+                f"Clipping [rank={rank}, step={self._step_num}]: square_sharded_grad_norm = {square_sharded_grad_norm}, square_replicated_grad_norm = {square_replicated_grad_norm}, total_grad_norm = {total_grad_norm}"
             )

-        # Aggregation
-        if total_grad_norm is None:
-            return
-
-        if norm_type != torch.inf:
-            # pyre-ignore [58]: ** is not supported for operand types torch._tensor.Tensor and float.
-            total_grad_norm = total_grad_norm ** (1.0 / norm_type)
-        # pyre-ignore [58]: / is not supported for operand types float and Union[float, torch._tensor.Tensor].
-        clip_coef = cast(torch.Tensor, max_norm / (total_grad_norm + 1e-6))
+        clip_coef = torch.tensor(max_norm) / (total_grad_norm + 1e-6)
         clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
         torch._foreach_mul_(all_grads, clip_coef_clamped)
         return total_grad_norm
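For context, the new control flow is the standard clip-by-total-norm pattern: take the 1/p root of the accumulated norm via torch.pow (skipping the root for the inf-norm), return early when there is nothing to clip, then scale every gradient by min(1, max_norm / (total_grad_norm + 1e-6)). The sketch below is a minimal standalone illustration of that pattern, not the TorchRec implementation: the helper name clip_grads_by_total_norm and the naive per-tensor norm accumulation are assumptions for illustration, while the clip_coef / clamp / _foreach_mul_ sequence follows the diff.

```python
from typing import List, Optional

import torch


def clip_grads_by_total_norm(
    grads: List[torch.Tensor],
    max_norm: float,
    norm_type: float = 2.0,
) -> Optional[torch.Tensor]:
    """Hypothetical standalone helper mirroring the refactored control flow."""
    if not grads:
        # Mirrors the new early `return None` when there is no norm to clip.
        return None

    if norm_type == torch.inf:
        # Inf-norm: the maximum absolute gradient entry; no 1/p root needed.
        total_grad_norm = torch.max(torch.stack([g.abs().max() for g in grads]))
    else:
        # Accumulate sum(|g|^p) across tensors and take the 1/p root with
        # torch.pow, matching the diff's replacement of the ** operator.
        total_grad_norm = torch.pow(
            torch.stack([torch.sum(g.abs() ** norm_type) for g in grads]).sum(),
            1.0 / norm_type,
        )

    # Scale every gradient in place by min(1, max_norm / (norm + eps)),
    # the same clip_coef / clamp / _foreach_mul_ sequence as in the diff.
    clip_coef = torch.tensor(max_norm) / (total_grad_norm + 1e-6)
    clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
    torch._foreach_mul_(grads, clip_coef_clamped)
    return total_grad_norm
```

Judging from the removed comments, switching from `total_grad_norm ** (1.0 / norm_type)` and `max_norm / (...)` to `torch.pow(...)` and `torch.tensor(max_norm) / (...)` appears to be what allows the `# pyre-ignore [58]` suppressions to be dropped, since pyre rejected those operators between Tensor and float operands.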
