@@ -75,21 +75,13 @@ WorkerImpl::WorkerImpl(const ParallelArgs& parallel_args,
   threadpool_.schedule([this]() mutable { device_.set_device(); });
   for (int i = 0; i < h2d_threadpool_.size(); i++) {
     h2d_threadpool_.schedule_with_tid(
-        [this]() mutable {
-          device_.set_device();
-          h2d_stream_[std::this_thread::get_id()] =
-              device_.get_stream_from_pool(TIMEOUT_MS);
-        },
-        i);
+        [this]() mutable { device_.set_device(); }, i);
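+    // One copy stream per worker thread is parked in the shared copy_stream_
+    // pool; batch copies borrow a stream with wait_dequeue() and return it
+    // with enqueue() when the copy finishes.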
+    copy_stream_.enqueue(device_.get_stream_from_pool(TIMEOUT_MS));
   }
   for (int i = 0; i < d2h_threadpool_.size(); i++) {
     d2h_threadpool_.schedule_with_tid(
-        [this]() mutable {
-          device_.set_device();
-          d2h_stream_[std::this_thread::get_id()] =
-              device_.get_stream_from_pool(TIMEOUT_MS);
-        },
-        i);
+        [this]() mutable { device_.set_device(); }, i);
+    copy_stream_.enqueue(device_.get_stream_from_pool(TIMEOUT_MS));
   }
 
   prepare_stream_ = device_.get_stream_from_pool();
@@ -152,18 +144,9 @@ bool WorkerImpl::allocate_host_kv_cache(
   host_kv_cache_shape[1][0] = num_layers;
 
   // create a KVCache shape: block_size * [layers, token, head, dim]
-  host_kv_caches_.reserve(host_bolck_size);
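+  // Carve all host KV tensors out of one contiguous, page-aligned, locked
+  // allocation instead of per-block pin_memory() tensors (see
+  // AlignedTensorCreater below).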
+  aligned_tensor_creater_ = std::make_unique<AlignedTensorCreater>(
+      host_kv_cache_shape, dtype_, host_bolck_size, &host_kv_caches_);
 
-  for (int64_t i = 0; i < host_bolck_size; ++i) {
-    torch::Tensor key_cache, value_cache;
-    key_cache = torch::empty(host_kv_cache_shape[0],
-                             torch::dtype(dtype_).device(torch::kCPU))
-                    .pin_memory();
-    value_cache = torch::empty(host_kv_cache_shape[1],
-                               torch::dtype(dtype_).device(torch::kCPU))
-                    .pin_memory();
-    host_kv_caches_.emplace_back(key_cache, value_cache);
-  }
   LOG(INFO) << "Initializing host kv block size: " << host_bolck_size;
 
   int32_t device_id = device_.index();
@@ -188,6 +171,8 @@ bool WorkerImpl::allocate_host_kv_cache(
   config.tp_rank = options_.dp_size() > 1
                        ? options_.node_rank() % options_.dp_size()
                        : options_.node_rank();
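+  // Describe the contiguous pool backing host_kv_caches_ so the store sees
+  // the whole region.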
+  config.total_size = aligned_tensor_creater_->get_total_size();
+  config.tensor_data = aligned_tensor_creater_->get_base_ptr();
 
   if (!KVCacheStore::get_instance().init(config, &host_kv_caches_)) {
     LOG(ERROR) << "Init KVCacheStore fail!";
@@ -805,9 +790,6 @@ uint32_t WorkerImpl::offload_kv_blocks(
 
 bool WorkerImpl::d2h_batch_copy(Slice<BlockTransferInfo>& block_transfer_info) {
 #if defined(USE_NPU)
-  CHECK(d2h_stream_.count(std::this_thread::get_id()) != 0)
-      << "WorkerImpl::d2h_batch_copy can only be called in d2h_threadpool_.";
-
   const int64_t num_layers = context_.get_model_args().n_layers();
   uint32_t num_batches = block_transfer_info.size() * num_layers * 2;
   void** srcs = new void*[num_batches];
@@ -840,8 +822,9 @@ bool WorkerImpl::d2h_batch_copy(Slice<BlockTransferInfo>& block_transfer_info) {
     }
   }
 
-  c10::StreamGuard streamGuard =
-      d2h_stream_[std::this_thread::get_id()]->set_stream_guard();
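+  // Borrow a stream from the shared pool; wait_dequeue() blocks until one
+  // becomes available.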
+  std::unique_ptr<Stream> stream;
+  copy_stream_.wait_dequeue(stream);
+  c10::StreamGuard streamGuard = stream->set_stream_guard();
 
   // TODO(kangmeng): change to async API
   aclError ret = aclrtMemcpyBatch(dsts,
@@ -856,14 +839,18 @@ bool WorkerImpl::d2h_batch_copy(Slice<BlockTransferInfo>& block_transfer_info) {
   if (ret != 0 || fail_index != SIZE_MAX) {
     LOG(ERROR) << "aclrtMemcpyBatch error: " << ret
               << ", fail_index:" << fail_index;
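+    // Return the borrowed stream on every exit path so the pool is not
+    // drained by failures.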
+    copy_stream_.enqueue(std::move(stream));
     return false;
   }
 
-  if (d2h_stream_[std::this_thread::get_id()]->synchronize() != 0) {
+  if (stream->synchronize() != 0) {
     LOG(ERROR) << "d2h_batch_copy timeout!";
+    copy_stream_.enqueue(std::move(stream));
     return false;
   }
 
+  copy_stream_.enqueue(std::move(stream));
+
   delete[] dsts;
   delete[] srcs;
   delete[] copy_size;
@@ -875,8 +862,6 @@ bool WorkerImpl::d2h_batch_copy(Slice<BlockTransferInfo>& block_transfer_info) {
 bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
                                 Slice<BlockTransferInfo>& block_transfer_info) {
 #if defined(USE_NPU)
-  CHECK(h2d_stream_.count(std::this_thread::get_id()) != 0)
-      << "WorkerImpl::h2d_batch_copy can only be called in h2d_threadpool_.";
   CHECK(block_transfer_info.size() < BATCH_COPY_MAX_SIZE / 2)
       << "h2d_batch_copy support copy blocks less than "
       << BATCH_COPY_MAX_SIZE / 2 << ", but got " << block_transfer_info.size();
@@ -903,9 +888,10 @@ bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
   aclrtMemcpyBatchAttr attrs[1] = {h2d_attrs_};
   size_t attrs_indexes[1] = {0};
 
-  c10::StreamGuard streamGuard =
-      h2d_stream_[std::this_thread::get_id()]->set_stream_guard();
-  auto stream = h2d_stream_[std::this_thread::get_id()]->get_stream()->stream();
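+  // Borrow a pooled stream, mirroring d2h_batch_copy.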
+  std::unique_ptr<Stream> stream;
+  copy_stream_.wait_dequeue(stream);
+  c10::StreamGuard streamGuard = stream->set_stream_guard();
+
   aclError ret = 0;
 
   for (int layer_id = 0; layer_id < num_layers; layer_id++) {
@@ -946,7 +932,7 @@ bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
       LOG(ERROR) << "aclrtMemcpyBatch error: " << ret
                  << ", fail_index:" << fail_index;
     } else {
-      ret = aclrtRecordEvent(*event, stream);
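+      // The pooled Stream is a wrapper, so unwrap it to get the raw ACL
+      // stream the event is recorded on.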
+      ret = aclrtRecordEvent(*event, stream->get_stream()->stream());
       if (ret != 0) {
         LOG(ERROR) << "aclrtRecordEvent error: " << ret;
       }
@@ -955,10 +941,12 @@ bool WorkerImpl::h2d_batch_copy(const uint64_t batch_id,
     if (ret != 0) break;
   }
 
-  if (h2d_stream_[std::this_thread::get_id()]->synchronize() != 0) {
+  if (stream->synchronize() != 0) {
     LOG(ERROR) << "h2d_batch_copy timeout!";
+    copy_stream_.enqueue(std::move(stream));
     return false;
   }
+  copy_stream_.enqueue(std::move(stream));
 
   delete[] dsts;
   delete[] srcs;
@@ -1026,4 +1014,68 @@ uint32_t WorkerImpl::prefetch_from_storage(
       .get();
 }
 
+AlignedTensorCreater::AlignedTensorCreater(
+    const std::vector<std::vector<int64_t>>& tensor_shapes,
+    const torch::ScalarType dtype,
+    const uint32_t num_tensors,
+    std::vector<xllm::KVCache>* tensors) {
+  CHECK(tensor_shapes.size() == 2)
+      << "tensor_shapes.size() must equal to 2, but got "
+      << tensor_shapes.size();
+
+  int64_t elements_per_k_tensor = 1;
+  int64_t elements_per_v_tensor = 1;
+
+  for (auto dim : tensor_shapes[0]) {
+    elements_per_k_tensor *= dim;
+  }
+  for (auto dim : tensor_shapes[1]) {
+    elements_per_v_tensor *= dim;
+  }
+
+  size_t element_size = torch::elementSize(dtype);
+  size_t bytes_per_k_tensor = elements_per_k_tensor * element_size;
+  size_t bytes_per_v_tensor = elements_per_v_tensor * element_size;
+  size_t page_size = sysconf(_SC_PAGESIZE);
+  total_size_ = num_tensors * (bytes_per_k_tensor + bytes_per_v_tensor);
+  total_size_ = ((total_size_ + page_size - 1) / page_size) * page_size;
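+  // Rounding up to a page multiple keeps the mapping page-aligned end to end.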
+
+  base_ptr_ = mmap(nullptr,
+                   total_size_,
+                   PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANONYMOUS,
+                   -1,
+                   0);
+
+  if (base_ptr_ == MAP_FAILED) {
+    LOG(FATAL) << "Failed to allocate aligned memory pool!";
+  }
+
+  if (mlock(base_ptr_, total_size_) != 0) {
+    munmap(base_ptr_, total_size_);
+    LOG(FATAL) << "Failed to lock memory pool!";
+  }
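+  // Locking the pool in RAM replaces the per-block pin_memory() calls that
+  // the old allocation path relied on.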
+
+  size_t current_offset = 0;
+  auto options = torch::TensorOptions().dtype(dtype).device(torch::kCPU);
+  tensors->reserve(num_tensors);
+
+  for (size_t i = 0; i < num_tensors; ++i) {
+    void* k_tensor_ptr = static_cast<char*>(base_ptr_) + current_offset;
+    torch::Tensor k_tensor =
+        torch::from_blob(k_tensor_ptr, tensor_shapes[0], options);
+    current_offset += bytes_per_k_tensor;
+
+    void* v_tensor_ptr = static_cast<char*>(base_ptr_) + current_offset;
+    torch::Tensor v_tensor =
+        torch::from_blob(v_tensor_ptr, tensor_shapes[1], options);
+    current_offset += bytes_per_v_tensor;
+
+    tensors->emplace_back(k_tensor, v_tensor);
+  }
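+  // from_blob() does not take ownership of base_ptr_, so this object must
+  // outlive the KV caches built on top of it.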
+
+  LOG(INFO) << "Page aligned: "
+            << ((uintptr_t)base_ptr_ % page_size == 0 ? "YES" : "NO");
+}
+
 }  // namespace xllm