From f102bbb74726566e2eaf846cc87582cabed67d13 Mon Sep 17 00:00:00 2001 From: Pernekhan Utemuratov Date: Tue, 3 Sep 2024 13:54:16 -0700 Subject: [PATCH] Add missing kv_cache related metrics --- .../src/custom_metrics_reporter/custom_metrics_reporter.cc | 7 +++++-- inflight_batcher_llm/src/model_instance_state.cc | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc index 1e76c8dc..a4865534 100644 --- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc +++ b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc @@ -43,8 +43,11 @@ const std::vector<std::string> CustomMetricsReporter::runtime_memory_keys_{ const std::vector<std::string> CustomMetricsReporter::runtime_memory_labels_{"cpu", "gpu", "pinned"}; const std::vector<std::string> CustomMetricsReporter::kv_cache_keys_{ - "Max KV cache blocks", "Free KV cache blocks", "Used KV cache blocks", "Tokens per KV cache block"}; -const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{"max", "free", "used", "tokens_per"}; + "Max KV cache blocks", "Free KV cache blocks", "Used KV cache blocks", "Tokens per KV cache block", + "Alloc Total KV cache blocks", "Alloc New KV cache blocks", "Reused KV cache blocks"}; +const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{ + "max", "free", "used", "tokens_per", + "alloc_total", "alloc_new", "reused"}; const std::vector<std::string> CustomMetricsReporter::v1_specific_keys_{ "Total Context Tokens", "Total Generation Tokens", "Empty Generation Slots"}; diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc index eb55c40d..21aa3d79 100644 --- a/inflight_batcher_llm/src/model_instance_state.cc +++ b/inflight_batcher_llm/src/model_instance_state.cc @@ -1068,6 +1068,9 @@ void ModelInstanceState::WaitForStats() statJson.append("\"Max KV cache blocks\":" + 
std::to_string(kvStats.maxNumBlocks) + ","); statJson.append("\"Tokens per KV cache block\":" + std::to_string(kvStats.tokensPerBlock) + ","); statJson.append("\"Used KV cache blocks\":" + std::to_string(kvStats.usedNumBlocks) + ","); + statJson.append("\"Alloc Total KV cache blocks\":" + std::to_string(kvStats.allocTotalBlocks) + ","); + statJson.append("\"Alloc New KV cache blocks\":" + std::to_string(kvStats.allocNewBlocks) + ","); + statJson.append("\"Reused KV cache blocks\":" + std::to_string(kvStats.reusedBlocks) + ","); } statJson.back() = '}';