feat: add page-aligned tensor creator for host KV cache.

Kang-Meng · Kang-Meng · commit 90457da851ac · 2025-11-08T16:00:50.000+08:00
diff --git a/xllm/core/distributed_runtime/comm_channel.cpp b/xllm/core/distributed_runtime/comm_channel.cpp
@@ -351,11 +351,7 @@ class ClientStreamReceiver : public brpc::StreamInputHandler {
 
   ~ClientStreamReceiver() {
     if (!promise_set_.exchange(true)) {
-      try {
-        close_promise_.set_value();
-      } catch (const std::exception& e) {
-        LOG(WARNING) << "Exception in destructor: " << e.what();
-      }
+      close_promise_.set_value();
     }
   }
 
diff --git a/xllm/core/distributed_runtime/worker_service.cpp b/xllm/core/distributed_runtime/worker_service.cpp
@@ -442,11 +442,7 @@ class ServerStreamHandler : public brpc::StreamInputHandler {
  public:
   ~ServerStreamHandler() {
     if (!promise_set_.exchange(true)) {
-      try {
-        close_promise_.set_value();
-      } catch (const std::exception& e) {
-        LOG(WARNING) << "Exception in destructor: " << e.what();
-      }
+      close_promise_.set_value();
     }
   }
 
diff --git a/xllm/core/framework/kv_cache/kv_cache_store.cpp b/xllm/core/framework/kv_cache/kv_cache_store.cpp
@@ -55,30 +55,18 @@ bool KVCacheStore::init(const StoreConfig& config,
   LOG(INFO) << "v_cache_size_per_block: " << v_cache_size_per_block_;
 
   if (config_.protocol == "rdma") {
-    for (int block = 0; block < host_kv_caches_->size(); block++) {
-      void* key_cache = static_cast<char*>(
-          host_kv_caches_->at(block).get_k_cache().data_ptr());
-
-      auto register_k_result = client_ptr_->RegisterLocalMemory(
-          key_cache, k_cache_size_per_block_, "cpu:0", false, false);
-
-      if (!register_k_result.has_value()) {
-        LOG(ERROR) << "Failed to register local memory for key cache: "
-                   << toString(register_k_result.error());
-        return false;
-      }
-
-      void* value_cache = static_cast<char*>(
-          host_kv_caches_->at(block).get_v_cache().data_ptr());
-
-      auto register_v_result = client_ptr_->RegisterLocalMemory(
-          value_cache, v_cache_size_per_block_, "cpu:0", false, false);
-
-      if (!register_v_result.has_value()) {
-        LOG(ERROR) << "Failed to register local memory for value cache: "
-                   << toString(register_v_result.error());
+    if (config_.total_size > 0 && config_.tensor_data != nullptr) {
+      auto result = client_ptr_->RegisterLocalMemory(
+          config_.tensor_data, config_.total_size, "cpu:0", false, false);
+      if (!result.has_value()) {
+        LOG(ERROR) << "Failed to register local memory: "
+                   << toString(result.error());
         return false;
       }
+    } else {
+      LOG(FATAL) << "rdma must RegisterLocalMemory, but got register size: "
+                 << config_.total_size
+                 << ", and data ptr: " << uint64_t(config_.tensor_data);
     }
   }
   is_initialized_ = true;
diff --git a/xllm/core/framework/kv_cache/kv_cache_store.h b/xllm/core/framework/kv_cache/kv_cache_store.h
@@ -19,6 +19,8 @@ struct StoreConfig {
   std::string master_server_address = "";
   int replica_num = 1;
   uint32_t tp_rank = 0;
+  size_t total_size = 0;
+  void* tensor_data = nullptr;
 };
 
 class KVCacheStore {
diff --git a/xllm/core/runtime/worker_impl.cpp b/xllm/core/runtime/worker_impl.cpp
@@ -152,18 +152,9 @@ bool WorkerImpl::allocate_host_kv_cache(
   host_kv_cache_shape[1][0] = num_layers;
 
   // create a KVCache shape: block_size * [layers, token, head, dim]
-  host_kv_caches_.reserve(host_bolck_size);
+  aligned_tensor_creater_ = std::make_unique<AlignedTensorCreater>(
+      host_kv_cache_shape, dtype_, host_bolck_size, &host_kv_caches_);
 
-  for (int64_t i = 0; i < host_bolck_size; ++i) {
-    torch::Tensor key_cache, value_cache;
-    key_cache = torch::empty(host_kv_cache_shape[0],
-                             torch::dtype(dtype_).device(torch::kCPU))
-                    .pin_memory();
-    value_cache = torch::empty(host_kv_cache_shape[1],
-                               torch::dtype(dtype_).device(torch::kCPU))
-                      .pin_memory();
-    host_kv_caches_.emplace_back(key_cache, value_cache);
-  }
   LOG(INFO) << "Initializing host kv block size: " << host_bolck_size;
 
   int32_t device_id = device_.index();
@@ -188,6 +179,8 @@ bool WorkerImpl::allocate_host_kv_cache(
     config.tp_rank = options_.dp_size() > 1
                          ? options_.node_rank() % options_.dp_size()
                          : options_.node_rank();
+    config.total_size = aligned_tensor_creater_->get_total_size();
+    config.tensor_data = aligned_tensor_creater_->get_base_ptr();
 
     if (!KVCacheStore::get_instance().init(config, &host_kv_caches_)) {
       LOG(ERROR) << "Init KVCacheStore fail!";
@@ -1026,4 +1019,106 @@ uint32_t WorkerImpl::prefetch_from_storage(
      .get();
 }
 
+// Builds `num_tensors` K/V tensor pairs backed by one contiguous, page-aligned,
+// memory-locked anonymous mapping and appends them to `tensors`.
+//
+// Layout of the mapping: [K0][V0][K1][V1]... with no padding between tensors;
+// only the total length is rounded up to a multiple of the page size.
+//
+// tensor_shapes must hold exactly two shapes: [0] for keys, [1] for values.
+// A failed mmap()/mlock() is reported with LOG(FATAL).
+AlignedTensorCreater::AlignedTensorCreater(
+    const std::vector<std::vector<int64_t>>& tensor_shapes,
+    const torch::ScalarType dtype,
+    const uint32_t num_tensors,
+    std::vector<xllm::KVCache>* tensors) {
+  CHECK(tensor_shapes.size() == 2)
+      << "tensor_shapes.size() must equal to 2, but got "
+      << tensor_shapes.size();
+  CHECK(tensors != nullptr) << "tensors must not be nullptr";
+
+  // Start from a known state so that every exit path leaves members the
+  // destructor can handle.
+  base_ptr_ = nullptr;
+  total_size_ = 0;
+
+  int64_t elements_per_k_tensor = 1;
+  int64_t elements_per_v_tensor = 1;
+
+  for (const int64_t dim : tensor_shapes[0]) {
+    CHECK(dim >= 0) << "negative dimension in key tensor shape: " << dim;
+    elements_per_k_tensor *= dim;
+  }
+  for (const int64_t dim : tensor_shapes[1]) {
+    CHECK(dim >= 0) << "negative dimension in value tensor shape: " << dim;
+    elements_per_v_tensor *= dim;
+  }
+
+  const size_t element_size = torch::elementSize(dtype);
+  const size_t bytes_per_k_tensor = elements_per_k_tensor * element_size;
+  const size_t bytes_per_v_tensor = elements_per_v_tensor * element_size;
+  const size_t payload_size =
+      num_tensors * (bytes_per_k_tensor + bytes_per_v_tensor);
+
+  auto options = torch::TensorOptions().dtype(dtype).device(torch::kCPU);
+  tensors->reserve(num_tensors);
+
+  if (payload_size == 0) {
+    // mmap() rejects a zero-length mapping with EINVAL, so there is nothing
+    // to map or lock. Hand out ordinary empty tensors instead of aborting.
+    for (size_t i = 0; i < num_tensors; ++i) {
+      tensors->emplace_back(torch::empty(tensor_shapes[0], options),
+                            torch::empty(tensor_shapes[1], options));
+    }
+    LOG(WARNING) << "Host KV cache pool is empty, no memory was mapped.";
+    return;
+  }
+
+  // sysconf() returns -1 on error; converting that to size_t unchecked would
+  // turn the rounding below into nonsense.
+  const long page_size_raw = sysconf(_SC_PAGESIZE);
+  CHECK(page_size_raw > 0) << "sysconf(_SC_PAGESIZE) failed";
+  const size_t page_size = static_cast<size_t>(page_size_raw);
+  // Round the mapping length up to a whole number of pages.
+  total_size_ = ((payload_size + page_size - 1) / page_size) * page_size;
+
+  base_ptr_ = mmap(nullptr,
+                   total_size_,
+                   PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS,
+                   -1,
+                   0);
+
+  if (base_ptr_ == MAP_FAILED) {
+    LOG(FATAL) << "Failed to allocate aligned memory pool!";
+  }
+
+  if (mlock(base_ptr_, total_size_) != 0) {
+    munmap(base_ptr_, total_size_);
+    LOG(FATAL) << "Failed to lock memory pool!";
+  }
+
+  // Carve the region into consecutive K/V views. from_blob() does not take
+  // ownership, so the tensors stay valid only while the mapping exists.
+  size_t current_offset = 0;
+  for (size_t i = 0; i < num_tensors; ++i) {
+    void* k_tensor_ptr = static_cast<char*>(base_ptr_) + current_offset;
+    torch::Tensor k_tensor =
+        torch::from_blob(k_tensor_ptr, tensor_shapes[0], options);
+    current_offset += bytes_per_k_tensor;
+
+    void* v_tensor_ptr = static_cast<char*>(base_ptr_) + current_offset;
+    torch::Tensor v_tensor =
+        torch::from_blob(v_tensor_ptr, tensor_shapes[1], options);
+    current_offset += bytes_per_v_tensor;
+
+    tensors->emplace_back(k_tensor, v_tensor);
+  }
+
+  LOG(INFO) << "Page aligned: "
+            << (reinterpret_cast<uintptr_t>(base_ptr_) % page_size == 0
+                    ? "YES"
+                    : "NO");
+}
+
 }  // namespace xllm
diff --git a/xllm/core/runtime/worker_impl.h b/xllm/core/runtime/worker_impl.h
@@ -16,6 +16,7 @@ limitations under the License.
 #pragma once
 
 #include <folly/futures/Future.h>
+#include <sys/mman.h>
 #include <torch/torch.h>
 
 #include <memory>
@@ -45,6 +46,8 @@ limitations under the License.
 
 namespace xllm {
 
+class AlignedTensorCreater;
+
 class WorkerImpl {
  public:
   enum Status : int8_t {
@@ -237,6 +240,7 @@ class WorkerImpl {
   // kv caches
   std::vector<xllm::KVCache> kv_caches_;
   std::vector<xllm::KVCache> host_kv_caches_;
+  std::unique_ptr<AlignedTensorCreater> aligned_tensor_creater_;
 
   // causal LM model
   std::unique_ptr<CausalLM> model_;
@@ -277,4 +281,46 @@ class WorkerImpl {
      layer_wise_load_synchronizer_;
 };
 
+// Owns a single anonymous, page-aligned, mlock()-ed host memory region and
+// carves it into K/V tensor views (see the constructor).
+//
+// The tensors appended to `tensors` are created with torch::from_blob and do
+// not own their storage: they are only valid while this object is alive.
+class AlignedTensorCreater {
+ private:
+  // Start of the mapping; nullptr when nothing is mapped.
+  void* base_ptr_ = nullptr;
+  // Length of the mapping in bytes (rounded up to a whole number of pages).
+  size_t total_size_ = 0;
+
+ public:
+  // tensor_shapes: exactly two shapes, [0] for key tensors, [1] for value
+  //                tensors.
+  // dtype:         element type of every tensor.
+  // num_tensors:   number of K/V pairs to carve out of the region.
+  // tensors:       output vector; one KVCache is appended per pair.
+  AlignedTensorCreater(const std::vector<std::vector<int64_t>>& tensor_shapes,
+                       const torch::ScalarType dtype,
+                       const uint32_t num_tensors,
+                       std::vector<xllm::KVCache>* tensors);
+
+  // The object owns a raw mapping, so copying or moving it would lead to a
+  // double munmap(); ownership is transferred via std::unique_ptr instead.
+  AlignedTensorCreater(const AlignedTensorCreater&) = delete;
+  AlignedTensorCreater& operator=(const AlignedTensorCreater&) = delete;
+  AlignedTensorCreater(AlignedTensorCreater&&) = delete;
+  AlignedTensorCreater& operator=(AlignedTensorCreater&&) = delete;
+
+  ~AlignedTensorCreater() {
+    // mmap() reports failure with MAP_FAILED, not nullptr, so guard both.
+    if (base_ptr_ != nullptr && base_ptr_ != MAP_FAILED) {
+      munlock(base_ptr_, total_size_);
+      munmap(base_ptr_, total_size_);
+    }
+  }
+
+  void* get_base_ptr() const { return base_ptr_; }
+  size_t get_total_size() const { return total_size_; }
+};
+
 }  // namespace xllm

Original file line number	Diff line number	Diff line change
`@@ -351,11 +351,7 @@ class ClientStreamReceiver : public brpc::StreamInputHandler {`
`351`	`351`
`352`	`352`	`~ClientStreamReceiver() {`
`353`	`353`	`if (!promise_set_.exchange(true)) {`
`354`		`- try {`
`355`		`- close_promise_.set_value();`
`356`		`- } catch (const std::exception& e) {`
`357`		`- LOG(WARNING) << "Exception in destructor: " << e.what();`
`358`		`- }`
	`354`	`+ close_promise_.set_value();`
`359`	`355`	`}`
`360`	`356`	`}`
`361`	`357`
Original file line number	Diff line number	Diff line change
`@@ -442,11 +442,7 @@ class ServerStreamHandler : public brpc::StreamInputHandler {`
`442`	`442`	`public:`
`443`	`443`	`~ServerStreamHandler() {`
`444`	`444`	`if (!promise_set_.exchange(true)) {`
`445`		`- try {`
`446`		`- close_promise_.set_value();`
`447`		`- } catch (const std::exception& e) {`
`448`		`- LOG(WARNING) << "Exception in destructor: " << e.what();`
`449`		`- }`
	`445`	`+ close_promise_.set_value();`
`450`	`446`	`}`
`451`	`447`	`}`
`452`	`448`