From f102bbb74726566e2eaf846cc87582cabed67d13 Mon Sep 17 00:00:00 2001
From: Pernekhan Utemuratov <pernekhan@deepinfra.com>
Date: Tue, 3 Sep 2024 13:54:16 -0700
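Subject: [PATCH] Add missing kv_cache related metrics

Append the allocTotalBlocks, allocNewBlocks and reusedBlocks KV cache
statistics to the stats JSON built in ModelInstanceState::WaitForStats,
and add the matching keys ("Alloc Total KV cache blocks", "Alloc New KV
cache blocks", "Reused KV cache blocks") and labels (alloc_total,
alloc_new, reused) to CustomMetricsReporter.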
---
 .../src/custom_metrics_reporter/custom_metrics_reporter.cc | 7 +++++--
 inflight_batcher_llm/src/model_instance_state.cc           | 3 +++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc
index 1e76c8dc..a4865534 100644
--- a/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc
+++ b/inflight_batcher_llm/src/custom_metrics_reporter/custom_metrics_reporter.cc
@@ -43,8 +43,11 @@ const std::vector<std::string> CustomMetricsReporter::runtime_memory_keys_{
 const std::vector<std::string> CustomMetricsReporter::runtime_memory_labels_{"cpu", "gpu", "pinned"};
 
 const std::vector<std::string> CustomMetricsReporter::kv_cache_keys_{
-    "Max KV cache blocks", "Free KV cache blocks", "Used KV cache blocks", "Tokens per KV cache block"};
-const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{"max", "free", "used", "tokens_per"};
+    "Max KV cache blocks", "Free KV cache blocks", "Used KV cache blocks", "Tokens per KV cache block",
+    "Alloc Total KV cache blocks", "Alloc New KV cache blocks", "Reused KV cache blocks"};
+const std::vector<std::string> CustomMetricsReporter::kv_cache_labels_{
+    "max", "free", "used", "tokens_per",
+    "alloc_total", "alloc_new", "reused"};
 
 const std::vector<std::string> CustomMetricsReporter::v1_specific_keys_{
     "Total Context Tokens", "Total Generation Tokens", "Empty Generation Slots"};
diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc
index eb55c40d..21aa3d79 100644
--- a/inflight_batcher_llm/src/model_instance_state.cc
+++ b/inflight_batcher_llm/src/model_instance_state.cc
@@ -1068,6 +1068,9 @@ void ModelInstanceState::WaitForStats()
                 statJson.append("\"Max KV cache blocks\":" + std::to_string(kvStats.maxNumBlocks) + ",");
                 statJson.append("\"Tokens per KV cache block\":" + std::to_string(kvStats.tokensPerBlock) + ",");
                 statJson.append("\"Used KV cache blocks\":" + std::to_string(kvStats.usedNumBlocks) + ",");
+                statJson.append("\"Alloc Total KV cache blocks\":" + std::to_string(kvStats.allocTotalBlocks) + ",");
+                statJson.append("\"Alloc New KV cache blocks\":" + std::to_string(kvStats.allocNewBlocks) + ",");
+                statJson.append("\"Reused KV cache blocks\":" + std::to_string(kvStats.reusedBlocks) + ",");
             }
 
             statJson.back() = '}';