fix fetch eviction metadata bug and adjust UT to surface it (#4701)

duduyi2013 · facebook-github-bot · commit f42b56a41395 · 2025-08-14T08:17:53.000-07:00
Summary: Pull Request resolved: #4701 X-link: facebookresearch/FBGEMM#1726 Pad the id with the table offset to get the linearized id, and pass it into the eviction metadata fetching logic to get the corresponding metaheader info. Adjust the UT to make it catch the bug locally. Reviewed By: EddyLXJ Differential Revision: D80234997 fbshipit-source-id: d6a614bdef462221c9356bee407f4dbc1eadc16d
diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -2979,7 +2979,7 @@ def split_embedding_weights(
                     )
                 )
                 metadata_tensor = self._ssd_db.get_kv_zch_eviction_metadata_by_snapshot(
-                    bucket_ascending_id_tensor,
+                    bucket_ascending_id_tensor + table_offset,
                     torch.as_tensor(bucket_ascending_id_tensor.size(0)),
                     snapshot_handle,
                 )
diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
@@ -471,13 +471,12 @@ def generate_kvzch_tbes(
             Ds = [D] * T
             Es = [E] * T
         else:
-            Ds = [
-                round_up(np.random.randint(low=int(0.25 * D), high=int(1.0 * D)), 4)
-                for _ in range(T)
-            ]
-            Es = [
-                np.random.randint(low=int(0.5 * E), high=int(2.0 * E)) for _ in range(T)
-            ]
+            # Ds = [
+            #     round_up(np.random.randint(low=int(0.25 * D), high=int(1.0 * D)), 4)
+            #     for _ in range(T)
+            # ]
+            Ds = [D] * T
+            Es = [np.random.randint(low=int(0.5 * E), high=int(E)) for _ in range(T)]
 
         if pooling_mode == PoolingMode.SUM:
             mode = "sum"
@@ -571,9 +570,9 @@ def generate_kvzch_tbes(
             pad_opt = torch.zeros(emb_ref_.size(0), pad_opt_width, dtype=emb_ref_.dtype)
             emb_opt_ref = torch.cat((emb_ref_, pad_opt), dim=1)
             emb.ssd_db.set_cuda(
-                torch.arange(t * virtual_E, t * virtual_E + E).to(torch.int64),
+                torch.arange(t * virtual_E, t * virtual_E + Es[t]).to(torch.int64),
                 emb_opt_ref,
-                torch.as_tensor([E]),
+                torch.as_tensor([Es[t]]),
                 t,
             )
             emb_ref_cpu.append(emb_ref_)
@@ -2099,6 +2098,7 @@ def test_kv_emb_state_dict(
             num_buckets=num_buckets,
             enable_optimizer_offloading=enable_optimizer_offloading,
             backend_type=backend_type,
+            mixed=True,
         )
 
         # Generate inputs

Original file line number	Diff line number	Diff line change
`@@ -2979,7 +2979,7 @@ def split_embedding_weights(`
`2979`	`2979`	`)`
`2980`	`2980`	`)`
`2981`	`2981`	`metadata_tensor = self._ssd_db.get_kv_zch_eviction_metadata_by_snapshot(`
`2982`		`- bucket_ascending_id_tensor,`
	`2982`	`+ bucket_ascending_id_tensor + table_offset,`
`2983`	`2983`	`torch.as_tensor(bucket_ascending_id_tensor.size(0)),`
`2984`	`2984`	`snapshot_handle,`
`2985`	`2985`	`)`