Commit 0c5d591

Zheng Yan authored and facebook-github-bot committed

add ModuleCopyMixin to allow inference allocation overrides (#314)

Summary:
Pull Request resolved: #314

Some modules need to override .copy so that they can be placed on CPU during inference time.

Reviewed By: yinghai

Differential Revision: D36235459

fbshipit-source-id: 19786560ae51960deb574895604c168f81373715
1 parent d31f430 commit 0c5d591
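
To illustrate what the change enables, here is a minimal, hypothetical sketch (not part of the commit) of an inference-side module that mixes in ModuleCopyMixin and overrides copy so that a large CPU-resident buffer stays put when DistributedModelParallel.copy relocates the rest of the model to a GPU. The class CpuPinnedCache and its buffer are invented for illustration.

import torch
from torch import nn

from torchrec.distributed.types import ModuleCopyMixin


class CpuPinnedCache(nn.Module, ModuleCopyMixin):
    """Hypothetical module whose large lookup buffer should stay on CPU at inference."""

    def __init__(self, num_rows: int = 100_000, dim: int = 16) -> None:
        super().__init__()
        # Large buffer intentionally kept in host memory.
        self.cache: torch.Tensor = torch.zeros(num_rows, dim, device="cpu")

    def copy(self, device: torch.device) -> nn.Module:
        # Ignore the target device so DistributedModelParallel.copy leaves
        # this module on CPU (the mixin default would call self.to(device)).
        return self

DistributedModelParallel.copy calls copy(device) on any ModuleCopyMixin it finds, so this module is left untouched; the NoCopyModule test added below exercises exactly this behavior.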

File tree

3 files changed: +84 -7 lines changed

torchrec/distributed/model_parallel.py

Lines changed: 6 additions & 2 deletions

@@ -29,6 +29,7 @@
     ShardedModule,
     ShardingEnv,
     ShardingPlan,
+    ModuleCopyMixin,
 )
 from torchrec.distributed.utils import (
     add_prefix_to_state_dict,
@@ -284,14 +285,17 @@ def _copy_if_device_match(tensor: torch.Tensor) -> torch.Tensor:
             return tensor

         # if this is a sharded module, customize the copy
-        if isinstance(module, ShardedModule):
+        if isinstance(module, ModuleCopyMixin):
             return module.copy(device)
         # this could be dense or a compound module
         for name, child in module.named_children():
             # potential DFS cache or bottom-up can save runtime
             # search immediate submodules
             if not any(
-                [isinstance(submodule, ShardedModule) for submodule in child.modules()]
+                [
+                    isinstance(submodule, ModuleCopyMixin)
+                    for submodule in child.modules()
+                ]
             ):
                 # if not containing ShardedModule down this submodule (this is a dense module)
                 # copy it.
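
For readers who only see the hunk above, here is a simplified, hypothetical reconstruction of the copy traversal it touches. Everything outside the hunk's visible context, including the function name _copy_to_device_sketch, the recursion, and the setattr calls, is an assumption rather than the actual torchrec implementation; the point is the dispatch this commit changes: a module that mixes in ModuleCopyMixin gets its own copy(device), while purely dense subtrees are moved with a plain .to(device).

import torch
from torch import nn

from torchrec.distributed.types import ModuleCopyMixin


def _copy_to_device_sketch(module: nn.Module, device: torch.device) -> nn.Module:
    # A module that customizes copying decides for itself how (or whether) to move.
    if isinstance(module, ModuleCopyMixin):
        return module.copy(device)
    # Otherwise inspect immediate children.
    for name, child in module.named_children():
        if not any(
            isinstance(submodule, ModuleCopyMixin) for submodule in child.modules()
        ):
            # No ModuleCopyMixin anywhere below: a dense subtree, move it wholesale.
            setattr(module, name, child.to(device))
        else:
            # Mixed subtree: recurse so nested ModuleCopyMixin modules keep control.
            setattr(module, name, _copy_to_device_sketch(child, device))
    return module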

torchrec/distributed/tests/test_quant_model_parallel.py

Lines changed: 67 additions & 1 deletion

@@ -17,7 +17,12 @@
     _get_default_rtol_and_atol,
     TestSparseNN,
 )
-from torchrec.distributed.types import ShardedModule, ShardingEnv, ShardingType
+from torchrec.distributed.types import (
+    ShardedModule,
+    ShardingType,
+    ShardingEnv,
+    ModuleCopyMixin,
+)
 from torchrec.modules.embedding_configs import EmbeddingBagConfig
 from torchrec.modules.embedding_modules import EmbeddingBagCollection
 from torchrec.quant.embedding_modules import (
@@ -61,6 +66,25 @@ def _quantize(module: nn.Module, inplace: bool) -> nn.Module:
     )


+class CopyModule(nn.Module, ModuleCopyMixin):
+    def __init__(self) -> None:
+        super().__init__()
+        self.tensor: torch.Tensor = torch.empty((10), device="cpu")
+
+    def copy(self, device: torch.device) -> nn.Module:
+        self.tensor = self.tensor.to(device)
+        return self
+
+
+class NoCopyModule(nn.Module, ModuleCopyMixin):
+    def __init__(self) -> None:
+        super().__init__()
+        self.tensor: torch.Tensor = torch.empty((10), device="cpu")
+
+    def copy(self, device: torch.device) -> nn.Module:
+        return self
+
+
 class QuantModelParallelModelCopyTest(unittest.TestCase):
     def setUp(self) -> None:
         num_features = 4
@@ -173,3 +197,45 @@ def test_quant_pred(self) -> None:
         )
         dmp_1 = dmp.copy(device_1)
         self._recursive_device_check(dmp.module, dmp_1.module, device, device_1)
+
+    # pyre-fixme[56]
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 1,
+        "Not enough GPUs available",
+    )
+    def test_copy_mixin(self) -> None:
+        device = torch.device("cuda:0")
+        device_1 = torch.device("cuda:1")
+        model = TestSparseNN(
+            tables=self.tables,
+            weighted_tables=self.weighted_tables,
+            num_float_features=10,
+            dense_device=device,
+            sparse_device=torch.device("meta"),
+        )
+        # pyre-ignore [16]
+        model.copy = CopyModule()
+        # pyre-ignore [16]
+        model.no_copy = NoCopyModule()
+        quant_model = _quantize(model, inplace=True)
+        dmp = DistributedModelParallel(
+            quant_model,
+            sharders=[
+                cast(
+                    ModuleSharder[torch.nn.Module],
+                    TestQuantEBCSharder(
+                        sharding_type=ShardingType.TABLE_WISE.value,
+                        kernel_type=EmbeddingComputeKernel.BATCHED_QUANT.value,
+                    ),
+                )
+            ],
+            device=None,
+            env=ShardingEnv.from_local(world_size=2, rank=0),
+            init_data_parallel=False,
+        )
+
+        dmp_1 = dmp.copy(device_1)
+        # pyre-ignore [16]
+        self.assertEqual(dmp_1.module.copy.tensor.device, device_1)
+        # pyre-ignore [16]
+        self.assertEqual(dmp_1.module.no_copy.tensor.device, torch.device("cpu"))

torchrec/distributed/types.py

Lines changed: 11 additions & 4 deletions

@@ -356,7 +356,17 @@ def from_local(cls, world_size: int, rank: int) -> "ShardingEnv":
         return cls(world_size, rank, None)


-class ShardedModule(abc.ABC, nn.Module, Generic[CompIn, DistOut, Out]):
+class ModuleCopyMixin:
+    """
+    A mixin to allow modules to override copy behaviors in DMP.
+    """
+
+    def copy(self, device: torch.device) -> nn.Module:
+        # pyre-ignore [16]
+        return self.to(device)
+
+
+class ShardedModule(abc.ABC, nn.Module, Generic[CompIn, DistOut, Out], ModuleCopyMixin):
     """
     All model-parallel modules implement this interface.
     Inputs and outputs are data-parallel.
@@ -423,9 +433,6 @@ def sharded_parameter_names(self, prefix: str = "") -> Iterator[str]:
         for key, _ in self.named_parameters(prefix):
             yield key

-    def copy(self, device: torch.device) -> nn.Module:
-        return self.to(device)
-

 class ModuleSharder(abc.ABC, Generic[M]):
     """
