
Commit 3faf5e5

TroyGarden authored and meta-codesync[bot] committed
rename function and labels (#3434)
Summary:
Pull Request resolved: #3434

# context
* rename the over-generic function name `benchmark` to `benchmark_model_with_warmup`
* make the argument names consistent between base.py and benchmark_train_pipeline.py, i.e., `profile_name` ==> `name`, `profile` ==> `profile_dir`
* modify the record_function labels in comm_ops.py to reduce duplication and confusion

Reviewed By: spmex
Differential Revision: D83923279
fbshipit-source-id: 74a5b9de66e02338859c2716cb49b30fc6875ee7
1 parent d92a0c3 commit 3faf5e5

File tree

8 files changed: +67, -44 lines

Lines changed: 3 additions & 3 deletions
@@ -1,14 +1,14 @@
 # TorchRec Benchmark
-## usage
+## benchmark_train_pipeline usage
 - internal:
 ```
 buck2 run @fbcode//mode/opt fbcode//torchrec/distributed/benchmark:benchmark_train_pipeline -- \
   --yaml_config=fbcode/torchrec/distributed/benchmark/yaml/sparse_data_dist_base.yml \
-  --profile_name=sparse_data_dist_base_$(hg whereami | cut -c 1-10 || echo $USER) # overrides the yaml config
+  --name=sparse_data_dist_base_$(hg whereami | cut -c 1-10 || echo $USER) # overrides the yaml config
 ```
 - oss:
 ```
 python -m torchrec.distributed.benchmark.benchmark_train_pipeline \
   --yaml_config=fbcode/torchrec/distributed/benchmark/yaml/sparse_data_dist_base.yml \
-  --profile_name=sparse_data_dist_base_$(git rev-parse --short HEAD || echo $USER) # overrides the yaml config
+  --name=sparse_data_dist_base_$(git rev-parse --short HEAD || echo $USER) # overrides the yaml config
 ```

torchrec/distributed/benchmark/base.py

Lines changed: 21 additions & 1 deletion
@@ -681,7 +681,7 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
     )
 
 
-def benchmark(
+def benchmark_model_with_warmup(
     name: str,
     model: torch.nn.Module,
     warmup_inputs: Union[List[KeyedJaggedTensor], List[Dict[str, Any]]],
@@ -750,6 +750,26 @@ def benchmark_func(
     pre_gpu_load: int = 0,
     export_stacks: bool = False,
 ) -> BenchmarkResult:
+    """
+    Args:
+        name: Human-readable benchmark name.
+
+        bench_inputs: List[Dict[str, Any]] will be fed to the function at once
+        prof_inputs: List[Dict[str, Any]] will be fed to the function at once
+        benchmark_func_kwargs: kwargs to be passed to func_to_benchmark
+        func_to_benchmark: Callable that executes one measured iteration.
+            func_to_benchmark(batch_inputs, **kwargs)
+
+        world_size, rank: Distributed context to correctly reset / collect GPU
+            stats. ``rank == -1`` means single-process mode.
+        num_benchmarks: Number of measured iterations.
+        device_type: "cuda" or "cpu".
+        profile_dir: Where to write chrome traces / stack files.
+
+        pre_gpu_load: Number of dummy matmul operations to run before the first
+            measured iteration (helps simulating a loaded allocator).
+        export_stacks: Whether to export flamegraph-compatible stack files.
+    """
     if benchmark_func_kwargs is None:
         benchmark_func_kwargs = {}
 
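To make the new docstring concrete, here is a minimal, self-contained sketch of a `benchmark_func` call against a toy model. The argument names mirror the docstring above and the call site in benchmark_train_pipeline.py below; the model and inputs are illustrative assumptions, not code from this commit.

```python
# Minimal sketch: exercising benchmark_func with a toy model (illustrative values).
from typing import Any, Dict, List

import torch
from torchrec.distributed.benchmark.base import benchmark_func

model = torch.nn.Linear(8, 8)
bench_inputs: List[Dict[str, Any]] = [{"x": torch.randn(4, 8)} for _ in range(10)]

def _func_to_benchmark(batch_inputs: List[Dict[str, Any]], model: torch.nn.Module) -> None:
    # One measured iteration: the whole list of batches is fed at once.
    for batch in batch_inputs:
        model(batch["x"])

result = benchmark_func(
    name="toy_linear",
    bench_inputs=bench_inputs,
    prof_inputs=bench_inputs,
    num_benchmarks=5,   # measured iterations
    num_profiles=2,
    profile_dir=".",    # chrome traces / stack files land here
    world_size=1,
    device_type="cpu",
    func_to_benchmark=_func_to_benchmark,
    benchmark_func_kwargs={"model": model},
)
print(result)
```
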
torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 5 additions & 7 deletions
@@ -78,7 +78,7 @@ class RunOptions:
             Default is "kjt" (KeyedJaggedTensor).
         profile (str): Directory to save profiling results. If empty, profiling is disabled.
             Default is "" (disabled).
-        profile_name (str): Name of the profiling file. Default is pipeline classname.
+        name (str): Name of the profiling file. Default is pipeline classname.
         planner_type (str): Type of sharding planner to use. Options are:
             - "embedding": EmbeddingShardingPlanner (default)
             - "hetero": HeteroEmbeddingShardingPlanner
@@ -100,8 +100,8 @@ class RunOptions:
     sharding_type: ShardingType = ShardingType.TABLE_WISE
     compute_kernel: EmbeddingComputeKernel = EmbeddingComputeKernel.FUSED
     input_type: str = "kjt"
-    profile: str = ""
-    profile_name: str = ""
+    name: str = ""
+    profile_dir: str = ""
     planner_type: str = "embedding"
     pooling_factors: Optional[List[float]] = None
     num_poolings: Optional[List[float]] = None
@@ -261,15 +261,13 @@ def _func_to_benchmark(
 
     result = benchmark_func(
         name=(
-            type(pipeline).__name__
-            if run_option.profile_name == ""
-            else run_option.profile_name
+            type(pipeline).__name__ if run_option.name == "" else run_option.name
         ),
         bench_inputs=bench_inputs,  # pyre-ignore
         prof_inputs=bench_inputs,  # pyre-ignore
         num_benchmarks=5,
         num_profiles=2,
-        profile_dir=run_option.profile,
+        profile_dir=run_option.profile_dir,
         world_size=run_option.world_size,
         func_to_benchmark=_func_to_benchmark,
         benchmark_func_kwargs={"model": sharded_model, "pipeline": pipeline},
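
A minimal sketch of the renamed fields in use, assuming the remaining `RunOptions` fields keep the defaults shown above:

```python
# Minimal sketch: constructing RunOptions with the renamed fields.
from torchrec.distributed.benchmark.benchmark_train_pipeline import RunOptions

run_option = RunOptions(
    name="sparse_data_dist_base",  # was `profile_name`; "" falls back to the pipeline class name
    profile_dir=".",               # was `profile`; "" disables profiling
)
```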

torchrec/distributed/benchmark/embedding_collection_wrappers.py

Lines changed: 7 additions & 2 deletions
@@ -57,7 +57,12 @@
 from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor, KeyedTensor
 
 # Import the shared types and utilities from benchmark_utils
-from .base import benchmark, BenchmarkResult, CompileMode, multi_process_benchmark
+from .base import (
+    benchmark_model_with_warmup,
+    BenchmarkResult,
+    CompileMode,
+    multi_process_benchmark,
+)
 
 logger: logging.Logger = logging.getLogger()
 
@@ -456,7 +461,7 @@ def _init_module_and_run_benchmark(
     else:
         name = _benchmark_type_name(compile_mode, sharding_type)
 
-    res = benchmark(
+    res = benchmark_model_with_warmup(
         name,
         module,
         warmup_inputs_cuda,
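
Because the old symbol is removed rather than aliased, any downstream `from .base import benchmark` will now fail; a quick smoke test of the new name (assuming torchrec at this revision is installed):

```python
# The old `benchmark` symbol no longer resolves in .base after this commit.
from torchrec.distributed.benchmark.base import benchmark_model_with_warmup

assert callable(benchmark_model_with_warmup)
```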

torchrec/distributed/benchmark/yaml/sparse_data_dist_base.yml

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ RunOptions:
   world_size: 2
   num_batches: 10
   sharding_type: table_wise
-  profile: "."
-  profile_name: "sparse_data_dist_base"
+  profile_dir: "."
+  name: "sparse_data_dist_base"
   # export_stacks: True # enable this to export stack traces
 PipelineConfig:
   pipeline: "sparse"
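
A minimal sketch, assuming a generic PyYAML loader (not necessarily the loader the benchmark itself uses), of how the renamed keys read back:

```python
# Minimal sketch: the renamed YAML keys parsed with a generic loader.
import yaml  # assumption: PyYAML is available

text = """
RunOptions:
  profile_dir: "."
  name: "sparse_data_dist_base"
"""
cfg = yaml.safe_load(text)["RunOptions"]
print(cfg["name"], cfg["profile_dir"])  # -> sparse_data_dist_base .
```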

torchrec/distributed/comm_ops.py

Lines changed: 26 additions & 27 deletions
@@ -440,7 +440,7 @@ def all2all_pooled_sync(
     input_split_sizes = [D_local_sum * B_rank for B_rank in batch_size_per_rank]
     qcomm_ctx = None
 
-    with record_function("## alltoall_fwd_single ##"):
+    with record_function("## all2all_pooled ##"):
         sharded_output_embeddings = AllToAllSingle.apply(
             sharded_input_embeddings,
             output_split_sizes,
@@ -558,7 +558,7 @@ def variable_batch_all2all_pooled_sync(
         for split in input_split_sizes
     ]
 
-    with record_function("## alltoall_fwd_single ##"):
+    with record_function("## variable_batch_all2all_pooled ##"):
         if pg._get_backend_name() == "custom":
             sharded_output_embeddings = torch.empty(
                 sum(output_split_sizes),
@@ -674,7 +674,7 @@ def all2all_sequence_sync(
 
     local_T = lengths_after_sparse_data_all2all.shape[0]
     if local_T > 0:
-        with record_function("## alltoall_seq_embedding_fwd_permute ##"):
+        with record_function("## all2all_sequence_permute ##"):
             if not variable_batch_size:
                 (
                     permuted_lengths_after_sparse_data_all2all,
@@ -719,7 +719,7 @@ def all2all_sequence_sync(
     else:
         qcomm_ctx = None
 
-    with record_function("## alltoall_seq_embedding_fwd_single ##"):
+    with record_function("## all2all_sequence ##"):
         sharded_output_embeddings = AllToAllSingle.apply(
             sharded_input_embeddings,
             output_splits,
@@ -989,7 +989,7 @@ def reduce_scatter_v_sync(
         input = rsi.codecs.forward.encode(input)
 
     if rsi.equal_splits:
-        with record_function("## reduce_scatter_base ##"):
+        with record_function("## reduce_scatter_v ##"):
            output = torch.ops.torchrec.reduce_scatter_tensor(
                input,
                reduceOp="sum",
@@ -998,7 +998,7 @@ def reduce_scatter_v_sync(
             gradient_division=get_gradient_division(),
         )
     else:
-        with record_function("## reduce_scatter_v_via_all_to_all_single ##"):
+        with record_function("## reduce_scatter_v (AllToAllSingle) ##"):
            input_splits = rsi.input_splits
            output_splits = [rsi.input_splits[rank]] * world_size
            # TODO(ivankobzarev): Replace with _functional_collectives.reduce_scatter_v when it is added
@@ -1197,7 +1197,7 @@ def forward(
             device=sharded_input_embeddings.device,
         )
 
-        with record_function("## alltoall_fwd_single ##"):
+        with record_function("## All2All_Pooled_fwd ##"):
            req = dist.all_to_all_single(
                output=sharded_output_embeddings,
                input=sharded_input_embeddings,
@@ -1218,7 +1218,6 @@ def forward(
 
     @staticmethod
     # pyre-fixme[2]: Parameter must be annotated.
-    # pyre-fixme[2]: Parameter must be annotated.
     def backward(ctx, *unused) -> Tuple[None, None, None, Tensor]:
         pg = ctx.pg
         my_rank = dist.get_rank(pg)
@@ -1360,7 +1359,7 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
             device=sharded_grad_output.device,
             dtype=sharded_grad_output.dtype,
         )
-        with record_function("## alltoall_bwd_single ##"):
+        with record_function("## All2All_Pooled_bwd ##"):
            req = dist.all_to_all_single(
                output=sharded_grad_input,
                input=sharded_grad_output,
@@ -1445,7 +1444,7 @@ def forward(
             device=sharded_input_embeddings.device,
         )
 
-        with record_function("## alltoall_fwd_single ##"):
+        with record_function("## Variable_Batch_All2All_Pooled_fwd ##"):
            req = dist.all_to_all_single(
                output=sharded_output_embeddings,
                input=sharded_input_embeddings,
@@ -1564,7 +1563,7 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
             device=sharded_grad_output.device,
             dtype=sharded_grad_output.dtype,
         )
-        with record_function("## alltoall_bwd_single ##"):
+        with record_function("## Variable_Batch_All2All_Pooled_bwd ##"):
            req = dist.all_to_all_single(
                output=sharded_grad_input,
                input=sharded_grad_output,
@@ -1605,7 +1604,7 @@ def forward(
 
         local_T = lengths_after_sparse_data_all2all.shape[0]
         if local_T > 0:
-            with record_function("## alltoall_seq_embedding_fwd_permute ##"):
+            with record_function("## All2All_Seq_fwd_permute ##"):
                if not variable_batch_size:
                    (
                        permuted_lengths_after_sparse_data_all2all,
@@ -1659,7 +1658,7 @@ def forward(
             device=sharded_input_embeddings.device,
         )
 
-        with record_function("## alltoall_seq_embedding_fwd_single ##"):
+        with record_function("## All2All_Seq_fwd ##"):
            req = dist.all_to_all_single(
                output=sharded_output_embeddings,
                input=sharded_input_embeddings,
@@ -1707,7 +1706,7 @@ def backward(ctx, *unused) -> Tuple[None, None, None, Tensor]:
         myreq.dummy_tensor = None
 
         if permuted_lengths_after_sparse_data_all2all is not None:
-            with record_function("## alltoall_seq_embedding_bwd_permute ##"):
+            with record_function("## All2All_Seq_bwd_permute ##"):
                if not variable_batch_size:
                    _, sharded_grad_input, _ = torch.ops.fbgemm.permute_2D_sparse_data(
                        backward_recat_tensor,
@@ -1788,7 +1787,7 @@ def backward(ctx, sharded_grad_output: Tensor) -> Tuple[None, None, Tensor]:
             device=sharded_grad_output.device,
             dtype=sharded_grad_output.dtype,
         )
-        with record_function("## alltoall_seq_embedding_bwd_single ##"):
+        with record_function("## All2All_Seq_bwd ##"):
            req = dist.all_to_all_single(
                output=sharded_grad_input,
                input=sharded_grad_output.view(-1),
@@ -1822,7 +1821,7 @@ def forward(
             input = a2ai.codecs.forward.encode(input)
 
         output = input.new_empty(sum(output_split_sizes))
-        with record_function("## alltoallv_bwd_single ##"):
+        with record_function("## All2Allv_fwd ##"):
            req = dist.all_to_all_single(
                output,
                input,
@@ -1908,7 +1907,7 @@ def backward(ctx, *grad_outputs) -> Tuple[None, None, Tensor]:
         grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
         grad_output = torch.cat(grad_outputs)
         grad_input = grad_output.new_empty([a2ai.B_global * sum(a2ai.D_local_list)])
-        with record_function("## alltoall_bwd_single ##"):
+        with record_function("## All2Allv_bwd ##"):
            req = dist.all_to_all_single(
                grad_input,
                grad_output,
@@ -1944,7 +1943,7 @@ def forward(
             dtype=inputs[my_rank].dtype,
             device=inputs[my_rank].device,
         )
-        with record_function("## reduce_scatter ##"):
+        with record_function("## ReduceScatter_fwd ##"):
            req = dist.reduce_scatter(
                output,
                list(inputs),
@@ -2023,7 +2022,7 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
             for in_size in rsi.input_sizes
         ]
 
-        with record_function("## reduce_scatter_bw (all_gather) ##"):
+        with record_function("## ReduceScatter_bwd (all_gather) ##"):
            req = dist.all_gather(
                grad_inputs,
                grad_output.contiguous(),
@@ -2051,7 +2050,7 @@ def forward(
         if rsi.codecs is not None:
             inputs = rsi.codecs.forward.encode(inputs)
         output = inputs.new_empty((inputs.size(0) // my_size, inputs.size(1)))
-        with record_function("## reduce_scatter_tensor ##"):
+        with record_function("## ReduceScatterBase_fwd (tensor) ##"):
            req = dist.reduce_scatter_tensor(
                output,
                inputs,
@@ -2119,7 +2118,7 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
         if rsi.codecs is not None:
             grad_output = rsi.codecs.backward.encode(grad_output)
         grad_inputs = grad_output.new_empty(rsi.input_sizes)
-        with record_function("## reduce_scatter_base_bw (all_gather) ##"):
+        with record_function("## ReduceScatterBase_bwd (all_gather) ##"):
            req = dist.all_gather_into_tensor(
                grad_inputs,
                grad_output.contiguous(),
@@ -2148,7 +2147,7 @@ def forward(
             input = agi.codecs.forward.encode(input)
 
         outputs = input.new_empty((input.size(0) * my_size, input.size(1)))
-        with record_function("## all_gather_into_tensor ##"):
+        with record_function("## AllGatherBase_fwd (into_tensor) ##"):
            req = dist.all_gather_into_tensor(
                outputs,
                input,
@@ -2216,7 +2215,7 @@ def backward(ctx, grad_outputs: Tensor) -> Tuple[None, None, Tensor]:
         if agi.codecs is not None:
             grad_outputs = agi.codecs.backward.encode(grad_outputs)
         grad_input = grad_outputs.new_empty(agi.input_size)
-        with record_function("## all_gather_base_bw (reduce_scatter) ##"):
+        with record_function("## AllGatherBase_bw (reduce_scatter_tensor) ##"):
            req = dist.reduce_scatter_tensor(
                grad_input,
                grad_outputs.contiguous(),
@@ -2250,15 +2249,15 @@ def forward(
         # Use dist.reduce_scatter_tensor when a vector reduce-scatter is not needed
         # else use dist.reduce_scatter which internally supports vector reduce-scatter
         if rsi.equal_splits:
-            with record_function("## reduce_scatter_tensor ##"):
+            with record_function("## ReduceScatterV_fwd (tensor) ##"):
                req = dist.reduce_scatter_tensor(
                    output,
                    input,
                    group=pg,
                    async_op=True,
                )
         else:
-            with record_function("## reduce_scatter_v ##"):
+            with record_function("## ReduceScatterV_fwd ##"):
                req = dist.reduce_scatter(
                    output,
                    list(torch.split(input, rsi.input_splits)),
@@ -2331,15 +2330,15 @@ def backward(ctx, grad_output: Tensor) -> Tuple[None, None, Tensor]:
         grad_input = grad_output.new_empty(rsi.total_input_size)
 
         if rsi.equal_splits:
-            with record_function("## reduce_scatter_base_bw (all_gather) ##"):
+            with record_function("## ReduceScatterV_bwd (all_gather) ##"):
                req = dist.all_gather_into_tensor(
                    grad_input,
                    grad_output.contiguous(),
                    group=ctx.pg,
                    async_op=True,
                )
         else:
-            with record_function("## reduce_scatter_v_bw (all_gather_v) ##"):
+            with record_function("## ReduceScatterV_bwd (all_gather_v) ##"):
                req = dist.all_gather(
                    list(torch.split(grad_input, rsi.input_splits)),
                    grad_output.contiguous(),
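
For context on why the label cleanup matters: every `record_function` string becomes a named range in profiler traces, so reusing one string (e.g. `## alltoall_fwd_single ##`) across unrelated collectives merged them into a single row. A self-contained sketch with a toy op (CPU-only; not code from this commit):

```python
# Minimal sketch: a distinct record_function label shows up as its own
# named row in a torch.profiler trace instead of being merged with every
# other op that shared the old generic label.
import torch
from torch.profiler import ProfilerActivity, profile, record_function

a, b = torch.randn(128, 128), torch.randn(128, 128)
with profile(activities=[ProfilerActivity.CPU]) as prof:
    with record_function("## all2all_pooled ##"):
        a @ b

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))
```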

torchrec/distributed/test_utils/multi_process.py

Lines changed: 1 addition & 0 deletions
@@ -222,6 +222,7 @@ def run_multi_process_func(
     os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 
     if world_size == 1:
+        # skip multiprocess env for single-rank job
         kwargs["world_size"] = 1
         kwargs["rank"] = 0
         result = func(**kwargs)
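
The new comment documents a fast path: with `world_size == 1` the target function runs in-process rather than under the multiprocess launcher. A hypothetical stand-alone illustration (the `trainer` function is invented for the example):

```python
# Hypothetical illustration of the world_size == 1 branch above:
# the target function is invoked directly with rank 0 injected.
def run_single_rank(func, **kwargs):
    kwargs["world_size"] = 1
    kwargs["rank"] = 0
    return func(**kwargs)

def trainer(rank: int, world_size: int) -> str:
    return f"running rank {rank} of {world_size} in-process"

print(run_single_rank(trainer))
```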

torchrec/sparse/tests/jagged_tensor_benchmark.py

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@
 
 import torch
 from torchrec.distributed.benchmark.base import (
-    benchmark,
+    benchmark_model_with_warmup,
     BenchmarkResult,
     CPUMemoryStats,
     GPUMemoryStats,
@@ -77,7 +77,7 @@ def wrapped_func(
     setattr(model, "forward", lambda kwargs: fn(**kwargs))
     prof_num = 10
     if device_type == "cuda":
-        result = benchmark(
+        result = benchmark_model_with_warmup(
            name=name,
            model=model,
            warmup_inputs=[],
