Add tensorboard to display training and evaluation metrics and revise implementation to support DLRMv2 (#3163)

lizhouyu · facebook-github-bot · commit 3ba83dc03673 · 2025-07-14T13:03:33.000-07:00
Summary: Pull Request resolved: #3163 ### Major changes - Add tensorboard to the benchmark testbed, specifically in `benchmark_zch.py`. - Count the number of unique values received by each rank in each epoch by revising `benchmark_zch_utils.py`. - Revise `data/nonzch_remapper.py` so that it no longer depends on the `batch.get_dict()` method; instead, it fetches the attributes of the dataclass `batch` with the built-in `vars()` function. - Revise the DLRMv2 model EBC config initialization to make each table name identical to its feature name. - Revise the DLRMv2 configuration yaml file to set the table size for each feature. - Revise the default value of the "num_embeddings" parameter in `arguments.py` to None. Differential Revision: D77841795
diff --git a/torchrec/distributed/benchmark/benchmark_zch/arguments.py b/torchrec/distributed/benchmark/benchmark_zch/arguments.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 from typing import List
 
@@ -25,7 +33,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--num_embeddings",  # ratio of feature ids to embedding table size # 3 axis: x-bath_idx; y-collisions; zembedding table sizes
         type=int,
-        default=100_000,
+        default=None,
         help="max_ind_size. The number of embeddings in each embedding table. Defaults"
         " to 100_000 if num_embeddings_per_feature is not supplied.",
     )
diff --git a/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch.py b/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch.py
diff --git a/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch_utils.py b/torchrec/distributed/benchmark/benchmark_zch/benchmark_zch_utils.py
@@ -1,23 +1,21 @@
-import argparse
-import copy
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import json
 import logging
 import os
-from typing import Any, Dict
+from typing import Any, Dict, Set
 
 import numpy as np
 
 import torch
 import torch.nn as nn
-import yaml
 from torchrec.modules.mc_embedding_modules import ManagedCollisionEmbeddingCollection
-from torchrec.modules.mc_modules import (
-    DistanceLFU_EvictionPolicy,
-    ManagedCollisionCollection,
-    MCHManagedCollisionModule,
-)
-
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
 
 
 def get_module_from_instance(
@@ -104,6 +102,7 @@ def __init__(
         self._mch_stats: Dict[str, Any] = (
             {}
         )  # dictionary of {table_name [str]: {metric_name [str]: metric_value [int]}}
+        self.feature_name_unique_queried_values_set_dict: Dict[str, Set[int]] = {}
 
     # record mcec state to file
     def record_mcec_state(self, stage: str) -> None:
@@ -260,6 +259,7 @@ def update(self) -> None:
                     "collision_cnt": 0,
                     "rank_total_cnt": 0,
                     "num_empty_slots": 0,
+                    "num_unique_queries": 0,
                 }
             # get the input faeture values
             input_feature_values = np.array(rank_feature_value_before_fwd[feature_name])
@@ -313,4 +313,16 @@ def update(self) -> None:
                 this_rank_total_count - this_rank_hits_count - this_rank_insert_count
             )
             batch_stats[feature_name]["collision_cnt"] += int(this_rank_collision_count)
+            # get the unique values in the input feature values
+            if feature_name not in self.feature_name_unique_queried_values_set_dict:
+                self.feature_name_unique_queried_values_set_dict[feature_name] = set(
+                    input_feature_values.tolist()
+                )
+            else:
+                self.feature_name_unique_queried_values_set_dict[feature_name].update(
+                    set(input_feature_values.tolist())
+                )
+            batch_stats[feature_name]["num_unique_queries"] = len(
+                self.feature_name_unique_queried_values_set_dict[feature_name]
+            )
         self._mch_stats = batch_stats
diff --git a/torchrec/distributed/benchmark/benchmark_zch/count_dataset_distributions.py b/torchrec/distributed/benchmark/benchmark_zch/count_dataset_distributions.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 import json
 import multiprocessing
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/configs/criteo_kaggle.yaml b/torchrec/distributed/benchmark/benchmark_zch/data/configs/criteo_kaggle.yaml
@@ -1,3 +1,7 @@
-dataset_path: "/home/lizhouyu/oss_github/dlrm/torchrec_dlrm/criteo_1tb/criteo_kaggle_processed"
+dataset_path: "/home/lizhouyu/datasets/criteo_kaggle_processed"
 batch_size: 4096
 seed: 0
+multitask_configs:
+  - task_name: is_click
+    task_weight: 1
+    task_type: classification
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/configs/kuairand_1k.yaml b/torchrec/distributed/benchmark/benchmark_zch/data/configs/kuairand_1k.yaml
@@ -1,4 +1,4 @@
-dataset_path: "/home/lizhouyu/oss_github/generative-recommenders/generative_recommenders/dlrm_v3/data/KuaiRand-1K/data"
+dataset_path: "/home/lizhouyu/datasets/kuairand-1k/data"
 batch_size: 16
 train_split_percentage: 0.75
 num_workers: 4
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/get_dataloader.py b/torchrec/distributed/benchmark/benchmark_zch/data/get_dataloader.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 import os
 
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/get_metric_modules.py b/torchrec/distributed/benchmark/benchmark_zch/data/get_metric_modules.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 import os
 import sys
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/nonzch_remapper.py b/torchrec/distributed/benchmark/benchmark_zch/data/nonzch_remapper.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -99,6 +107,24 @@ def __init__(
             )
         self._input_hash_size = input_hash_size
 
+    def get_batch_kjt_dict(self, batch: Batch) -> Dict[str, KeyedJaggedTensor]:
+        """
+        Collect the KeyedJaggedTensor-valued attributes of a batch.
+        Parameters:
+            batch: the batch whose KeyedJaggedTensor attributes are to be fetched
+        Returns:
+            batch_kjt_dict: a dictionary of {batch_attribute_name: KeyedJaggedTensor};
+            attributes whose values are not KeyedJaggedTensor are left out.
+        """
+        batch_kjt_dict = {}  # result dictionary, filled below and returned
+        batch_attr_dict = vars(batch)  # the batch's attribute names and values
+        for batch_attr_name, batch_attr_value in batch_attr_dict.items():
+            if isinstance(
+                batch_attr_value, KeyedJaggedTensor
+            ):  # keep only attributes whose values are KeyedJaggedTensor
+                batch_kjt_dict[batch_attr_name] = batch_attr_value
+        return batch_kjt_dict
+
     def remap(self, batch: Batch) -> Batch:
         # for all the attributes under batch, like batch.uih_features, batch.candidates_features,
         # get the kjt as a dict, and remap the kjt
@@ -118,7 +144,7 @@ def remap(self, batch: Batch) -> Batch:
         #     candidates_features: KeyedJaggedTensor
 
         # for every attribute in batch, remap the kjt
-        for attr_name, feature_kjt_dict in batch.get_dict().items():
+        for attr_name, feature_kjt_dict in self.get_batch_kjt_dict(batch).items():
             # separate feature kjt with {feature_name_1: feature_kjt_1, feature_name_2: feature_kjt_2, ...}
             # to multiple dict with {feature_name_1: jt_1}, {feature_name_2: jt_2}, ...
             attr_feature_jt_dict = {}
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/preprocess/kuairand_1k.py b/torchrec/distributed/benchmark/benchmark_zch/data/preprocess/kuairand_1k.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 
 import json
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/preprocess/kuairand_27k.py b/torchrec/distributed/benchmark/benchmark_zch/data/preprocess/kuairand_27k.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 
 import json
diff --git a/torchrec/distributed/benchmark/benchmark_zch/data/preprocess/movielens_1m.py b/torchrec/distributed/benchmark/benchmark_zch/data/preprocess/movielens_1m.py
@@ -1,3 +1,11 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
 import argparse
 import os
 
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/apply_optimizers.py b/torchrec/distributed/benchmark/benchmark_zch/models/apply_optimizers.py
@@ -1,3 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
 import argparse
 import os
 
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/configs/dlrmv2.yaml b/torchrec/distributed/benchmark/benchmark_zch/models/configs/dlrmv2.yaml
@@ -9,31 +9,31 @@ over_arch_layer_sizes:
   - 1
 embedding_dim: 64
 num_embeddings_per_feature:
-  cat_0: 100000
-  cat_1: 100000
-  cat_2: 100000
-  cat_3: 100000
-  cat_4: 100000
-  cat_5: 100000
-  cat_6: 100000
-  cat_7: 100000
-  cat_8: 100000
-  cat_9: 100000
-  cat_10: 100000
-  cat_11: 100000
-  cat_12: 100000
-  cat_13: 100000
-  cat_14: 100000
-  cat_15: 100000
-  cat_16: 100000
-  cat_17: 100000
-  cat_18: 100000
-  cat_19: 100000
-  cat_20: 100000
-  cat_21: 100000
-  cat_22: 100000
-  cat_23: 100000
-  cat_24: 100000
-  cat_25: 100000
+  cat_0: 40000000
+  cat_1: 39060
+  cat_2: 17295
+  cat_3: 7424
+  cat_4: 20265
+  cat_5: 3
+  cat_6: 7122
+  cat_7: 1543
+  cat_8: 63
+  cat_9: 40000000
+  cat_10: 3067956
+  cat_11: 405282
+  cat_12: 10
+  cat_13: 2209
+  cat_14: 11938
+  cat_15: 155
+  cat_16: 4
+  cat_17: 976
+  cat_18: 14
+  cat_19: 40000000
+  cat_20: 40000000
+  cat_21: 40000000
+  cat_22: 590152
+  cat_23: 12973
+  cat_24: 108
+  cat_25: 36
 embedding_module_attribute_path: "dlrm.sparse_arch.embedding_bag_collection" # the attribute path after model
 managed_collision_module_attribute_path: "module.dlrm.sparse_arch.embedding_bag_collection.mc_embedding_bag_collection._managed_collision_collection._managed_collision_modules" # the attribute path of managed collision module after model
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/make_model.py b/torchrec/distributed/benchmark/benchmark_zch/models/make_model.py
@@ -1,3 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
 import argparse
 import os
 from typing import Any, Dict, Tuple
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/models/dlrmv2.py b/torchrec/distributed/benchmark/benchmark_zch/models/models/dlrmv2.py
@@ -1,3 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
 import argparse
 
 from typing import Any, Dict, List, Optional, Tuple
@@ -35,6 +44,9 @@ def __init__(
             dense_device=dense_device,
         )
         self.train_model = DLRMTrain(self.dlrm)
+        self.table_configs: List[EmbeddingBagConfig] = list(
+            embedding_bag_collection.embedding_bag_configs()
+        )
 
     def forward(
         self, batch: Batch
@@ -55,10 +67,10 @@ def make_model_dlrmv2(
 ) -> nn.Module:
     ebc_configs = [
         EmbeddingBagConfig(
-            name=f"t_{feature_name}",
+            name=f"{feature_name}",
             embedding_dim=configs["embedding_dim"],
             num_embeddings=(
-                none_throws(configs["num_embeddings_per_feature"])[feature_idx]
+                none_throws(configs["num_embeddings_per_feature"])[feature_name]
                 if args.num_embeddings is None
                 else args.num_embeddings
             ),
@@ -76,8 +88,9 @@ def make_model_dlrmv2(
                 input_hash_size=args.input_hash_size,
                 device=torch.device("meta"),
                 world_size=get_local_size(),
-                use_mpzch=True,
+                zch_method="mpzch",
                 mpzch_num_buckets=args.num_buckets,
+                mpzch_max_probe=args.max_probe,
             )
         )
     else:
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/models/dlrmv3.py b/torchrec/distributed/benchmark/benchmark_zch/models/models/dlrmv3.py
@@ -1,3 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
 import argparse
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
diff --git a/torchrec/distributed/benchmark/benchmark_zch/models/shard_model.py b/torchrec/distributed/benchmark/benchmark_zch/models/shard_model.py
@@ -1,3 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
 import argparse
 from typing import cast
 
diff --git a/torchrec/modules/mc_adapter.py b/torchrec/modules/mc_adapter.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-dataset_path: "/home/lizhouyu/oss_github/generative-recommenders/generative_recommenders/dlrm_v3/data/KuaiRand-1K/data"`
	`1`	`+dataset_path: "/home/lizhouyu/datasets/kuairand-1k/data"`
`2`	`2`	`batch_size: 16`
`3`	`3`	`train_split_percentage: 0.75`
`4`	`4`	`num_workers: 4`