Skip to content

Commit acbc170

Browse files
Kathy Xu authored and facebook-github-bot committed
emitTB metrics + bug fix
Summary:
X-link: facebookresearch/FBGEMM#1714

- Use the cumulative DRAM cache size (read directly from the map at report time) instead of the per-interval averaged value
- Only emit metrics from rank 0 to avoid the QPS regression and flooded logging in D77638456

Reviewed By: emlin

Differential Revision: D78707680

fbshipit-source-id: 97396aaafbf53677a121896c39c0770ac47e294c
1 parent 44f27b1 commit acbc170

File tree

2 files changed

+5
-13
lines changed

2 files changed

+5
-13
lines changed

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1975,7 +1975,9 @@ def _prefetch( # noqa C901
19751975
self.ssd_cache_stats = torch.add(
19761976
self.ssd_cache_stats, self.local_ssd_cache_stats
19771977
)
1978-
self._report_kv_backend_stats()
1978+
# only report metrics from rank0 to avoid flooded logging
1979+
if dist.get_rank() == 0:
1980+
self._report_kv_backend_stats()
19791981

19801982
# Fetch data from SSD
19811983
if linear_cache_indices.numel() > 0:

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h

Lines changed: 2 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1184,9 +1184,6 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
11841184
const int64_t step,
11851185
const int64_t interval) {
11861186
std::vector<double> ret(22, 0); // num metrics
1187-
1188-
allocated_memory_ = get_map_used_memsize_in_bytes();
1189-
actual_used_chunk_memory_ = get_map_actual_used_chunk_in_bytes();
11901187
if (step > 0 && step % interval == 0) {
11911188
int reset_val = 0;
11921189

@@ -1233,10 +1230,6 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
12331230
auto dram_bwd_l1_cnflct_miss_write_missing_load_ =
12341231
bwd_l1_cnflct_miss_write_missing_load_avg_.exchange(reset_val);
12351232

1236-
auto dram_allocated_memory = allocated_memory_.exchange(reset_val);
1237-
auto dram_actual_used_chunk_memory =
1238-
actual_used_chunk_memory_.exchange(reset_val);
1239-
12401233
ret[0] = dram_read_total_duration / interval;
12411234
ret[1] = dram_read_sharding_total_duration / interval;
12421235
ret[2] = dram_read_cache_hit_copy_duration / interval;
@@ -1260,8 +1253,8 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
12601253
ret[18] = dram_bwd_l1_cnflct_miss_write_acquire_lock_duration_ / interval;
12611254
ret[19] = dram_bwd_l1_cnflct_miss_write_missing_load_ / interval;
12621255

1263-
ret[20] = dram_allocated_memory / interval;
1264-
ret[21] = dram_actual_used_chunk_memory / interval;
1256+
ret[20] = get_map_used_memsize_in_bytes();
1257+
ret[21] = get_map_actual_used_chunk_in_bytes();
12651258
}
12661259
return ret;
12671260
}
@@ -1495,9 +1488,6 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
14951488
std::atomic<int64_t> fwd_l1_eviction_write_acquire_lock_avg_duration_{0};
14961489
std::atomic<int64_t> fwd_l1_eviction_write_missing_load_avg_{0};
14971490

1498-
std::atomic<int64_t> allocated_memory_{0};
1499-
std::atomic<int64_t> actual_used_chunk_memory_{0};
1500-
15011491
std::atomic<int64_t> inplace_update_hit_cnt_{0};
15021492
std::atomic<int64_t> inplace_update_miss_cnt_{0};
15031493
}; // class DramKVEmbeddingCache

0 commit comments

Comments
 (0)