
Commit e4d99b5

d4l3k authored and meta-codesync[bot] committed
manager: fix allreduce reduction scaling (meta-pytorch#286)
Summary:
We should only rescale tensors manually for `ReduceOp.AVG`.

Pull Request resolved: meta-pytorch#286

Test Plan:
Updated the test to cover all common reductions.

```
pytest torchft/manager_test.py
```

Reviewed By: tushar00jain

Differential Revision: D84879364

Pulled By: d4l3k

fbshipit-source-id: 6c32348466fe920de71183c5fa8014427a8de121
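For quick orientation, here is a minimal, self-contained sketch of the scaling rule this commit settles on: `ReduceOp.AVG` is emulated as a `SUM` across participants followed by a manual divide, and no other reduction is rescaled. The `simulated_allreduce` helper and the hand-built list of per-rank tensors are illustrative stand-ins, not torchft code:

```python
# Illustrative only: the same scaling rule the patched Manager.allreduce uses,
# applied to a plain list of per-rank tensors instead of a process group.
import torch
from torch.distributed import ReduceOp


def simulated_allreduce(tensors: list[torch.Tensor], reduce_op: ReduceOp) -> torch.Tensor:
    stacked = torch.stack(tensors)
    if reduce_op == ReduceOp.AVG:
        # AVG = SUM over all participants, then a manual divide (floats only).
        if not torch.is_floating_point(stacked):
            raise ValueError("average reduce op is only supported for floating point tensors")
        return stacked.sum(dim=0) / len(tensors)
    if reduce_op == ReduceOp.SUM:
        return stacked.sum(dim=0)  # no rescale; the pre-fix code divided this case too
    if reduce_op == ReduceOp.MAX:
        return stacked.amax(dim=0)
    if reduce_op == ReduceOp.MIN:
        return stacked.amin(dim=0)
    if reduce_op == ReduceOp.PRODUCT:
        return stacked.prod(dim=0)
    raise NotImplementedError(f"unhandled reduce_op: {reduce_op}")


ranks = [torch.tensor([10.0]) for _ in range(5)]
print(simulated_allreduce(ranks, ReduceOp.AVG))  # tensor([10.])
print(simulated_allreduce(ranks, ReduceOp.SUM))  # tensor([50.])
```

Before this change the divide also ran on the SUM path, so summed tensors came back scaled down by the participant count.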
1 parent b3be7ad commit e4d99b5

File tree (2 files changed: +36, -9 lines)

torchft/manager.py
torchft/manager_test.py


torchft/manager.py

Lines changed: 13 additions & 4 deletions
```diff
@@ -387,7 +387,7 @@ def allreduce(
         self,
         tensor: torch.Tensor,
         should_quantize: bool = False,
-        reduce_op: ReduceOp = ReduceOp.SUM,
+        reduce_op: ReduceOp = ReduceOp.AVG,
     ) -> Work:
         """
         Fault tolerant allreduce the tensor and return a Future that will be completed when
@@ -416,21 +416,30 @@ def allreduce(
         if not self.is_participating():
             tensor.zero_()
 
+        # special logic for average
+        pg_reduce_op = reduce_op
+        if reduce_op == ReduceOp.AVG:
+            if not torch.is_floating_point(tensor):
+                raise ValueError(
+                    "average reduce op is only supported for floating point tensors"
+                )
+            pg_reduce_op = ReduceOp.SUM
+
         # TODO: increase timeout when waiting when healing
         try:
             # Run the allreduce async and save the work object so we can wait on
             # it later.
             if should_quantize and IS_TRITON_AVAILABLE:
                 work = allreduce_quantized(
                     [tensor],
-                    reduce_op,
+                    pg_reduce_op,
                     self._pg,
                     # pyre-fixme[6]: Expected `Optional[streams.Stream]` but got `_C.Stream`
                     torch.accelerator.current_stream(),
                 )
             else:
                 opts = AllreduceOptions()
-                opts.reduceOp = reduce_op
+                opts.reduceOp = pg_reduce_op
                 work = self._pg.allreduce([tensor], opts)
 
             # schedule grad normalization as a continuation
@@ -440,7 +449,7 @@ def callback(
                 fut: torch.futures.Future[list[torch.Tensor]],
             ) -> torch.Tensor:
                 nonlocal tensor
-                if reduce_op == ReduceOp.SUM:
+                if reduce_op == ReduceOp.AVG:
                     tensor /= num_participants
                 return tensor
```

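The continuation that performs the divide can be exercised in isolation with `torch.futures.Future`; below is a hedged sketch of the pattern the hunks above wire up (the `attach_rescale` name and the hand-built future are illustrative, not the Manager API):

```python
# Sketch of the continuation-based rescale: the process group completes a future
# once the SUM allreduce finishes, and a chained callback divides the buffer by
# the participant count only when the caller asked for ReduceOp.AVG.
import torch
from torch.distributed import ReduceOp


def attach_rescale(
    fut: torch.futures.Future,
    tensor: torch.Tensor,
    reduce_op: ReduceOp,
    num_participants: int,
) -> torch.futures.Future:
    def callback(_: torch.futures.Future) -> torch.Tensor:
        if reduce_op == ReduceOp.AVG:
            tensor /= num_participants  # in-place divide, mirroring the diff
        return tensor

    return fut.then(callback)


# Pretend a SUM allreduce across 5 participants already produced 50.0 in `buf`.
buf = torch.tensor([50.0])
fut = torch.futures.Future()
out = attach_rescale(fut, buf, ReduceOp.AVG, num_participants=5)
fut.set_result([buf])
print(out.wait())  # tensor([10.])
```

The quantized path follows the same mapping, since `pg_reduce_op` rather than the caller's `reduce_op` is what gets forwarded to `allreduce_quantized`.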
torchft/manager_test.py

Lines changed: 23 additions & 5 deletions
```diff
@@ -13,7 +13,7 @@
 from unittest.mock import create_autospec, MagicMock, patch
 
 import torch
-from torch.distributed import TCPStore
+from torch.distributed import ReduceOp, TCPStore
 
 from torchft._torchft import QuorumResult
 from torchft.checkpointing._rwlock import RWLock
@@ -590,10 +590,28 @@ def test_manager_numerics(self, client_mock: MagicMock) -> None:
         manager._pg.allreduce.return_value = _DummyWork(None)
 
         self.assertTrue(manager.is_participating())
-        tensor = torch.tensor([1.0])
-        work = manager.allreduce(tensor)
-        work.wait()
-        torch.testing.assert_close(tensor, torch.tensor([1.0 / 5]))
+
+        for dtype in (torch.float16, torch.bfloat16, torch.float32, torch.long):
+            orig = torch.tensor([10], dtype=dtype)
+
+            if torch.is_floating_point(orig):
+                tensor = orig.clone()
+                manager.allreduce(tensor).wait()
+                torch.testing.assert_close(tensor, orig / 5)
+
+                tensor = orig.clone()
+                manager.allreduce(tensor, reduce_op=ReduceOp.AVG).wait()
+                torch.testing.assert_close(tensor, orig / 5)
+
+            for reduce_op in [
+                ReduceOp.SUM,
+                ReduceOp.MAX,
+                ReduceOp.MIN,
+                ReduceOp.PRODUCT,
+            ]:
+                tensor = orig.clone()
+                manager.allreduce(tensor, reduce_op=reduce_op).wait()
+                torch.testing.assert_close(tensor, orig)
 
         # check healing numerics
         manager._healing = True
```

0 commit comments
