Commit 71e160f

isururanawaka authored and facebook-github-bot committed
Reshard API Performance Benchmarking
Summary:
- Identify baseline performance with and without the reshard API
- Identify different baselines for different sharding strategies under different data sets

Differential Revision: D78672730
1 parent 8b6c525 commit 71e160f

File tree

3 files changed (+72, -46 lines)


torchrec/distributed/benchmark/benchmark_train.py

Lines changed: 21 additions & 2 deletions
@@ -28,8 +28,9 @@
     write_report,
 )
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel, ShardingType
+from torchrec.distributed.sharding.dynamic_sharding import output_sharding_plan_delta
 from torchrec.distributed.test_utils.test_model import TestEBCSharder
-from torchrec.distributed.types import DataType
+from torchrec.distributed.types import DataType, EmbeddingModuleShardingPlan
 from torchrec.modules.embedding_modules import EmbeddingBagCollection
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
 

@@ -53,9 +54,27 @@ def training_func_to_benchmark(
     model: torch.nn.Module,
     bench_inputs: List[KeyedJaggedTensor],
     optimizer: Optional[torch.optim.Optimizer],
+    resharding_plan_diffs: Optional[List[EmbeddingModuleShardingPlan]] = None,
 ) -> None:
 
-    for bench_input in bench_inputs:
+    reshard_idx = 0
+
+    for i, bench_input in enumerate(bench_inputs):
+        if resharding_plan_diffs is not None:
+            if (
+                i > 0
+                and len(resharding_plan_diffs) > 0
+                and i % (len(bench_inputs) / len(resharding_plan_diffs)) == 0
+            ):
+
+                plan_difference = output_sharding_plan_delta(
+                    # Pyre-ignore
+                    model.plan.plan["_module"],
+                    resharding_plan_diffs[reshard_idx],
+                )
+                # Pyre-ignore
+                model.reshard("_module", plan_difference)
+                reshard_idx += 1
         pooled_embeddings = model(bench_input)
         vals = []
         for _name, param in pooled_embeddings.to_dict().items():
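For context, a minimal sketch (not part of this commit) of the reshard cadence the loop above implements: with N benchmark inputs and K plan diffs, a reshard is triggered roughly every N / K iterations, so the last plan diff in the list may never be applied. The helper name below is hypothetical.

# Minimal sketch, assuming N inputs and K > 0 plan diffs; mirrors the condition
# `i > 0 and i % (len(bench_inputs) / len(resharding_plan_diffs)) == 0` above.
from typing import List


def reshard_iterations(num_bench_inputs: int, num_plan_diffs: int) -> List[int]:
    interval = num_bench_inputs / num_plan_diffs  # float division, as in the benchmark loop
    return [i for i in range(1, num_bench_inputs) if i % interval == 0]


# Example: 100 inputs and 4 plan diffs -> reshards fire at iterations 25, 50 and 75,
# so only the first three plan diffs are consumed.
assert reshard_iterations(100, 4) == [25, 50, 75]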

torchrec/distributed/benchmark/benchmark_utils.py

Lines changed: 50 additions & 43 deletions
@@ -55,14 +55,21 @@
     EmbeddingStorageEstimator,
 )
 from torchrec.distributed.shard import _shard_modules
+
+from torchrec.distributed.sharding.dynamic_sharding import output_sharding_plan_delta
 from torchrec.distributed.sharding_plan import (
     construct_module_sharding_plan,
     get_sharding_constructor_from_type,
 )
 from torchrec.distributed.test_utils.multi_process import MultiProcessContext
 from torchrec.distributed.test_utils.test_model import ModelInput
 
-from torchrec.distributed.types import DataType, ModuleSharder, ShardingEnv
+from torchrec.distributed.types import (
+    DataType,
+    EmbeddingModuleShardingPlan,
+    ModuleSharder,
+    ShardingEnv,
+)
 from torchrec.fx import symbolic_trace
 from torchrec.modules.embedding_configs import EmbeddingBagConfig, EmbeddingConfig
 from torchrec.quant.embedding_modules import (
@@ -317,7 +324,7 @@ def _generate_rank_placements(
     world_size: int,
     num_tables: int,
     ranks_per_tables: List[int],
-    random_seed: int = None,
+    random_seed: Optional[int] = None,
 ) -> List[List[int]]:
     # Cannot include old/new rank generation with hypothesis library due to depedency on world_size
     if random_seed is None:
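The body of `_generate_rank_placements` is not shown in this diff; as a rough illustration only, a generator along these lines would pick, for each table, a random set of ranks of the requested size.

# Illustrative sketch only (assumption); the real helper may differ.
import random
from typing import List, Optional


def generate_rank_placements(
    world_size: int,
    num_tables: int,
    ranks_per_tables: List[int],
    random_seed: Optional[int] = None,
) -> List[List[int]]:
    rng = random.Random(random_seed)
    placements = []
    for num_ranks in ranks_per_tables[:num_tables]:
        # Sample `num_ranks` distinct ranks out of [0, world_size) for this table.
        placements.append(sorted(rng.sample(range(world_size), num_ranks)))
    return placements


# Example: two tables, table-wise (one rank each), on a world of 4 ranks.
print(generate_rank_placements(world_size=4, num_tables=2, ranks_per_tables=[1, 1]))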
@@ -1077,58 +1084,61 @@ def init_module_and_run_benchmark(
         if rank != -1
         else contextlib.nullcontext()
     ) as ctx:
-        module = transform_module(
-            module=module,
-            device=device,
-            inputs=warmup_inputs_cuda,
-            sharder=sharder,
-            sharding_type=sharding_type,
-            compile_mode=compile_mode,
-            world_size=world_size,
-            batch_size=batch_size,
-            # pyre-ignore[6]
-            ctx=ctx,
-            benchmark_unsharded_module=benchmark_unsharded_module,
-        )
-
-        if benchmark_unsharded_module:
-            name = "unsharded" + compile_mode.name
-        else:
-            name = benchmark_type_name(compile_mode, sharding_type)
 
         resharding_plans = []
 
-        import fbvscode
-
-        fbvscode.set_trace()
-
         if new_ranks_per_plan is not None and len(new_ranks_per_plan) > 0:
             sharding_type_constructor = get_sharding_constructor_from_type(
                 sharding_type
             )
-            for i, new_ranks in enumerate(new_ranks_per_plan):
+            for new_ranks_per_table in new_ranks_per_plan:
                 new_per_param_sharding = {}
-                for table in tables:
+                for table_id, table in enumerate(tables):
                     if sharding_type == ShardingType.TABLE_WISE:
                         new_per_param_sharding[table.name] = sharding_type_constructor(
-                            rank=new_ranks, compute_kernel=sharder._kernel_type
+                            rank=new_ranks_per_table[table_id][0],
+                            compute_kernel=sharder._kernel_type,
                         )
                     elif sharding_type == ShardingType.COLUMN_WISE:
                         new_per_param_sharding[table.name] = sharding_type_constructor(
-                            ranks=new_ranks
+                            ranks=new_ranks_per_table[table_id]
                         )
 
                 new_module_sharding_plan = construct_module_sharding_plan(
-                    module=module.module,
+                    module=module._module,  # Pyre-ignore
                     # Pyre-ignore
                     sharder=sharder,
                     per_param_sharding=new_per_param_sharding,
                     local_size=world_size,
                     world_size=world_size,
-                    device_type="cuda" if torch.cuda.is_available() else "cpu",
+                    device_type=device.type,
                 )
                 resharding_plans.append(new_module_sharding_plan)
-        benchmark_func_kwargs["resharding_plans"] = resharding_plans
+
+        module = transform_module(
+            module=module,
+            device=device,
+            inputs=warmup_inputs_cuda,
+            sharder=sharder,
+            sharding_type=sharding_type,
+            compile_mode=compile_mode,
+            world_size=world_size,
+            batch_size=batch_size,
+            # pyre-ignore[6]
+            ctx=ctx,
+            benchmark_unsharded_module=benchmark_unsharded_module,
+        )
+
+        if benchmark_unsharded_module:
+            name = "unsharded" + compile_mode.name
+        else:
+            name = benchmark_type_name(compile_mode, sharding_type)
+
+        # plan_difference = [
+        #     output_sharding_plan_delta(module.plan.plan["_module"], reshard_plan)
+        #     for reshard_plan in resharding_plans
+        # ]
+        benchmark_func_kwargs["resharding_plan_diffs"] = resharding_plans
 
         res = benchmark(
             name,
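To make the loop above easier to follow, here is a hypothetical example (values are illustrative, not from the commit) of the nested structure `new_ranks_per_plan` is expected to have: one entry per resharding plan, and within each plan one rank list per table. Table-wise sharding reads the single rank, column-wise sharding passes the whole list.

# Hypothetical values for illustration only.
new_ranks_per_plan = [
    [[0], [2]],  # plan 0: table 0 -> rank 0, table 1 -> rank 2 (table-wise)
    [[1], [3]],  # plan 1: table 0 -> rank 1, table 1 -> rank 3
]

# A column-wise plan would carry several ranks per table, e.g. two shards each:
column_wise_plan = [[0, 1], [2, 3]]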
@@ -1317,22 +1327,18 @@ def benchmark_module(
         )
 
         if train:
-            total_plans_per_benchmark = bench_iters // resharding_interval
-            total_plans_per_benchmark = max(1, total_plans_per_benchmark)
+
             new_ranks_per_plan = []
+
             if enable_resharding:
+                total_plans_per_benchmark = bench_iters // resharding_interval
+                total_plans_per_benchmark = max(1, total_plans_per_benchmark)
+
                 num_tables = len(tables)
-                new_ranks_count_per_plan = [
-                    [] for _ in range(total_plans_per_benchmark)
-                ]
+                ranks_per_tables = []
+
                 if sharding_type == ShardingType.TABLE_WISE:
                     ranks_per_tables = [1 for _ in range(num_tables)]
-                    new_ranks_per_plan = [
-                        _generate_rank_placements(
-                            world_size, num_tables, ranks_per_tables
-                        )
-                        for _ in range(total_plans_per_benchmark)
-                    ]
 
                 elif sharding_type == ShardingType.COLUMN_WISE:
                     valid_candidates = [
@@ -1343,11 +1349,12 @@ def benchmark_module(
                     ranks_per_tables = [
                         random.choice(valid_candidates) for _ in range(num_tables)
                     ]
+
                 new_ranks_per_plan = [
                     _generate_rank_placements(
                         world_size, num_tables, ranks_per_tables
                     )
-                    for ranks_per_tables in (new_ranks_count_per_plan)
+                    for _ in range(total_plans_per_benchmark)
                 ]
 
         res = multi_process_benchmark(
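As a quick check on the plan-count logic above (numbers are illustrative only; `bench_iters` and `resharding_interval` are benchmark arguments):

# Illustrative numbers only.
bench_iters = 100
resharding_interval = 30
total_plans_per_benchmark = max(1, bench_iters // resharding_interval)
print(total_plans_per_benchmark)  # 3 -> three rank placements generated for this benchmark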

torchrec/distributed/model_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -742,7 +742,7 @@ def reshard(
 
         # Need to use .module to maintain FQN consistency
         self._optim: CombinedOptimizer = self._init_optim(
-            self._dmp_wrapped_module.module  # pyre-ignore
+            self._dmp_wrapped_module.module if hasattr(self._dmp_wrapped_module, "module") else self._dmp_wrapped_module._module  # pyre-ignore
         )
         self._plan.plan[sharded_module_fqn] = sharded_module.module_sharding_plan
         return sharded_module
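The change above guards against wrappers that store the inner module under `_module` instead of exposing a public `module` attribute. In isolation the pattern looks like this (a simplified sketch; `unwrap` is a hypothetical helper, not part of the commit):

import torch


def unwrap(wrapped: torch.nn.Module) -> torch.nn.Module:
    # Prefer the public `.module` attribute (e.g. a DDP-style wrapper); otherwise
    # fall back to a private `_module` slot, matching the fallback added above.
    return wrapped.module if hasattr(wrapped, "module") else wrapped._module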
