Skip to content

Commit 8ecece6

Browse files
EddyLXJ authored and facebook-github-bot committed
Report dry run eviction threshold metric (#4709)
Summary: As titled — report the dry-run eviction threshold metric.
Pull Request resolved: #4709
X-link: facebookresearch/FBGEMM#1732
Reviewed By: emlin
Differential Revision: D80198960
fbshipit-source-id: 58db1cb657d92d27eaa1d0e6d6edc93054d1d5d6
1 parent 15bb0b0 commit 8ecece6

File tree

4 files changed

+53
-3
lines changed

4 files changed

+53
-3
lines changed

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,9 @@ def __init__(
10181018
self.stats_reporter.register_stats(
10191019
f"eviction.feature_table.{t}.processed_counts"
10201020
)
1021+
self.stats_reporter.register_stats(
1022+
f"eviction.feature_table.{t}.eviction_threshold_with_dry_run"
1023+
)
10211024
self.stats_reporter.register_stats(
10221025
f"eviction.feature_table.{t}.evict_rate"
10231026
)
@@ -3820,12 +3823,14 @@ def _report_eviction_stats(self) -> None:
38203823
T = len(set(self.feature_table_map))
38213824
evicted_counts = torch.zeros(T, dtype=torch.int64)
38223825
processed_counts = torch.zeros(T, dtype=torch.int64)
3826+
eviction_threshold_with_dry_run = torch.zeros(T, dtype=torch.float)
38233827
full_duration_ms = torch.tensor(0, dtype=torch.int64)
38243828
exec_duration_ms = torch.tensor(0, dtype=torch.int64)
38253829
dry_run_exec_duration_ms = torch.tensor(0, dtype=torch.int64)
38263830
self.ssd_db.get_feature_evict_metric(
38273831
evicted_counts,
38283832
processed_counts,
3833+
eviction_threshold_with_dry_run,
38293834
full_duration_ms,
38303835
exec_duration_ms,
38313836
dry_run_exec_duration_ms,
@@ -3860,6 +3865,11 @@ def _report_eviction_stats(self) -> None:
38603865
event_name=f"eviction.feature_table.{t}.processed_counts",
38613866
data_bytes=int(processed_counts[t].item()),
38623867
)
3868+
stats_reporter.report_data_amount(
3869+
iteration_step=self.step,
3870+
event_name=f"eviction.feature_table.{t}.eviction_threshold_with_dry_run",
3871+
data_bytes=float(eviction_threshold_with_dry_run[t].item()),
3872+
)
38633873
if processed_counts[t].item() != 0:
38643874
stats_reporter.report_data_amount(
38653875
iteration_step=self.step,

fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache_wrapper.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder {
146146
void get_feature_evict_metric(
147147
at::Tensor evicted_counts,
148148
at::Tensor processed_counts,
149+
at::Tensor eviction_threshold_with_dry_run,
149150
at::Tensor full_duration_ms,
150151
at::Tensor exec_duration_ms,
151152
at::Tensor dry_run_exec_duration_ms) {
@@ -155,6 +156,10 @@ class DramKVEmbeddingCacheWrapper : public torch::jit::CustomClassHolder {
155156
metrics.value().evicted_counts); // evicted_counts (Long)
156157
processed_counts.copy_(
157158
metrics.value().processed_counts); // processed_counts (Long)
159+
eviction_threshold_with_dry_run.copy_(
160+
metrics.value()
161+
.eviction_threshold_with_dry_run); // eviction threshold with dry
162+
// run (float)
158163
full_duration_ms.copy_(
159164
metrics.value().full_duration_ms); // full duration (Long)
160165
exec_duration_ms.copy_(

fbgemm_gpu/src/dram_kv_embedding_cache/feature_evict.h

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ struct FeatureEvictMetrics {
265265
explicit FeatureEvictMetrics(int table_num) {
266266
evicted_counts.resize(table_num, 0);
267267
processed_counts.resize(table_num, 0);
268+
eviction_threshold_with_dry_run.resize(table_num, 0.0);
268269
exec_duration_ms = 0;
269270
full_duration_ms = 0;
270271
dry_run_exec_duration_ms = 0;
@@ -273,6 +274,10 @@ struct FeatureEvictMetrics {
273274
void reset() {
274275
std::fill(evicted_counts.begin(), evicted_counts.end(), 0);
275276
std::fill(processed_counts.begin(), processed_counts.end(), 0);
277+
std::fill(
278+
eviction_threshold_with_dry_run.begin(),
279+
eviction_threshold_with_dry_run.end(),
280+
0.0);
276281
exec_duration_ms = 0;
277282
full_duration_ms = 0;
278283
dry_run_exec_duration_ms = 0;
@@ -296,6 +301,7 @@ struct FeatureEvictMetrics {
296301

297302
std::vector<int64_t> evicted_counts;
298303
std::vector<int64_t> processed_counts;
304+
std::vector<float> eviction_threshold_with_dry_run;
299305
int64_t exec_duration_ms;
300306
int64_t full_duration_ms;
301307
int64_t dry_run_exec_duration_ms;
@@ -307,6 +313,7 @@ struct FeatureEvictMetricTensors {
307313
explicit FeatureEvictMetricTensors(int64_t table_num)
308314
: evicted_counts(at::zeros({table_num}, at::kLong)),
309315
processed_counts(at::zeros({table_num}, at::kLong)),
316+
eviction_threshold_with_dry_run(at::zeros({table_num}, at::kFloat)),
310317
exec_duration_ms(at::scalar_tensor(0, at::kLong)),
311318
dry_run_exec_duration_ms(at::scalar_tensor(0, at::kLong)),
312319
full_duration_ms(at::scalar_tensor(0, at::kLong)) {}
@@ -315,11 +322,14 @@ struct FeatureEvictMetricTensors {
315322
FeatureEvictMetricTensors(
316323
at::Tensor evicted,
317324
at::Tensor processed,
325+
at::Tensor eviction_threshold_with_dry_run,
318326
at::Tensor exec_duration,
319327
at::Tensor dry_run_exec_duration_ms,
320328
at::Tensor full_duration)
321329
: evicted_counts(std::move(evicted)),
322330
processed_counts(std::move(processed)),
331+
eviction_threshold_with_dry_run(
332+
std::move(eviction_threshold_with_dry_run)),
323333
exec_duration_ms(std::move(exec_duration)),
324334
dry_run_exec_duration_ms(std::move(dry_run_exec_duration_ms)),
325335
full_duration_ms(std::move(full_duration)) {}
@@ -328,6 +338,7 @@ struct FeatureEvictMetricTensors {
328338
return FeatureEvictMetricTensors{
329339
evicted_counts.clone(),
330340
processed_counts.clone(),
341+
eviction_threshold_with_dry_run.clone(),
331342
exec_duration_ms.clone(),
332343
dry_run_exec_duration_ms.clone(),
333344
full_duration_ms.clone()};
@@ -337,6 +348,8 @@ struct FeatureEvictMetricTensors {
337348
at::Tensor evicted_counts;
338349
// feature count before evict
339350
at::Tensor processed_counts;
351+
// feature evict threshold with dry run
352+
at::Tensor eviction_threshold_with_dry_run;
340353
// feature evict exec duration
341354
at::Tensor exec_duration_ms;
342355
// feature evict dry run exec duration
@@ -893,6 +906,14 @@ class FeatureEvict {
893906
at::kLong)
894907
.clone();
895908

909+
metric_tensors_.eviction_threshold_with_dry_run =
910+
at::from_blob(
911+
const_cast<float*>(metrics_.eviction_threshold_with_dry_run.data()),
912+
{static_cast<int64_t>(
913+
metrics_.eviction_threshold_with_dry_run.size())},
914+
at::kFloat)
915+
.clone();
916+
896917
metric_tensors_.full_duration_ms =
897918
at::scalar_tensor(metrics_.full_duration_ms, at::kLong);
898919
metric_tensors_.exec_duration_ms =
@@ -913,14 +934,16 @@ class FeatureEvict {
913934
" - dryrun Time taken: {}ms\n"
914935
" - Total blocks processed: [{}]\n"
915936
" - Blocks evicted: [{}]\n"
916-
" - Eviction rate: [{}]%\n",
937+
" - Eviction rate: [{}]%\n"
938+
" - Eviction threshold dry run: [{}]\n",
917939
metrics_.full_duration_ms,
918940
metrics_.exec_duration_ms,
919941
metrics_.exec_duration_ms * 100.0f / metrics_.full_duration_ms,
920942
metrics_.dry_run_exec_duration_ms,
921943
fmt::join(metrics_.processed_counts, ", "),
922944
fmt::join(metrics_.evicted_counts, ", "),
923-
fmt::join(evict_rates, ", "));
945+
fmt::join(evict_rates, ", "),
946+
fmt::join(metrics_.eviction_threshold_with_dry_run, ", "));
924947
}
925948

926949
// Thread pool.
@@ -1320,6 +1343,11 @@ class FeatureScoreBasedEvict : public FeatureEvict<weight_type> {
13201343
std::chrono::high_resolution_clock::now().time_since_epoch())
13211344
.count();
13221345
}
1346+
1347+
for (int table_id = 0; table_id < num_tables_; ++table_id) {
1348+
this->metrics_.eviction_threshold_with_dry_run[table_id] =
1349+
thresholds_[table_id];
1350+
}
13231351
}
13241352

13251353
private:

fbgemm_gpu/test/tbe/ssd/kv_backend_test.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -821,6 +821,7 @@ def test_dram_kv_eviction(self) -> None:
821821

822822
evicted_counts = torch.zeros(T, dtype=torch.int64)
823823
processed_counts = torch.zeros(T, dtype=torch.int64)
824+
eviction_threshold_with_dry_run = torch.zeros(T, dtype=torch.float)
824825
full_duration_ms = torch.ones(1, dtype=torch.int64) * -1
825826
exec_duration_ms = torch.empty(1, dtype=torch.int64)
826827
dry_run_exec_duration_ms = torch.empty(1, dtype=torch.int64)
@@ -831,6 +832,7 @@ def test_dram_kv_eviction(self) -> None:
831832
dram_kv_backend.get_feature_evict_metric( # pyre-ignore
832833
evicted_counts,
833834
processed_counts,
835+
eviction_threshold_with_dry_run,
834836
full_duration_ms,
835837
exec_duration_ms,
836838
dry_run_exec_duration_ms,
@@ -840,10 +842,11 @@ def test_dram_kv_eviction(self) -> None:
840842
dram_kv_backend.set(indices, weights, count)
841843
time.sleep(0.01) # 20ms, stimulate training forward time
842844
dram_kv_backend.set_cuda(indices, weights, count, 1, True) # pyre-ignore
843-
time.sleep(0.01) # 20ms, stimulate training backward time
845+
time.sleep(1) # 20ms, stimulate training backward time
844846
dram_kv_backend.get_feature_evict_metric( # pyre-ignore
845847
evicted_counts,
846848
processed_counts,
849+
eviction_threshold_with_dry_run,
847850
full_duration_ms,
848851
exec_duration_ms,
849852
dry_run_exec_duration_ms,
@@ -876,6 +879,7 @@ def test_dram_kv_eviction(self) -> None:
876879
dram_kv_backend.get_feature_evict_metric(
877880
evicted_counts,
878881
processed_counts,
882+
eviction_threshold_with_dry_run,
879883
full_duration_ms,
880884
exec_duration_ms,
881885
dry_run_exec_duration_ms,
@@ -946,6 +950,7 @@ def test_dram_kv_feature_score_eviction(self) -> None:
946950
full_duration_ms = torch.ones(1, dtype=torch.int64) * -1
947951
exec_duration_ms = torch.empty(1, dtype=torch.int64)
948952
dry_run_exec_duration_ms = torch.empty(1, dtype=torch.int64)
953+
eviction_threshold_with_dry_run = torch.zeros(T, dtype=torch.float)
949954

950955
shard_load = E / 4
951956
# init
@@ -969,6 +974,7 @@ def test_dram_kv_feature_score_eviction(self) -> None:
969974
dram_kv_backend.get_feature_evict_metric( # pyre-ignore
970975
evicted_counts,
971976
processed_counts,
977+
eviction_threshold_with_dry_run,
972978
full_duration_ms,
973979
exec_duration_ms,
974980
dry_run_exec_duration_ms,
@@ -978,6 +984,7 @@ def test_dram_kv_feature_score_eviction(self) -> None:
978984
self.assertTrue(full_duration_ms.item() > 0)
979985
self.assertTrue(exec_duration_ms.item() >= 0)
980986
self.assertTrue(dry_run_exec_duration_ms.item() > 0)
987+
self.assertTrue(all(eviction_threshold_with_dry_run > 0))
981988

982989
@given(
983990
T=st.integers(min_value=2, max_value=10),

0 commit comments

Comments
 (0)