diff --git a/Makefile b/Makefile index 2aee18cc749f0..614ad7532cb74 100644 --- a/Makefile +++ b/Makefile @@ -178,6 +178,7 @@ pb: vendor-build generate-pb fmt VERSION_INFO :=-X '$(GO_MODULE)/pkg/version.GoVersion=$(GO_VERSION)' -X '$(GO_MODULE)/pkg/version.BranchName=$(BRANCH_NAME)' -X '$(GO_MODULE)/pkg/version.CommitID=$(LAST_COMMIT_ID)' -X '$(GO_MODULE)/pkg/version.BuildTime=$(BUILD_TIME)' -X '$(GO_MODULE)/pkg/version.Version=$(MO_VERSION)' THIRDPARTIES_INSTALL_DIR=$(ROOT_DIR)/thirdparties/install +CGO_DIR=$(ROOT_DIR)/cgo RACE_OPT := DEBUG_OPT := CGO_DEBUG_OPT := @@ -188,7 +189,7 @@ ifeq ($(MO_CL_CUDA),1) $(error CONDA_PREFIX env variable not found.) endif CUVS_CFLAGS := -I$(CONDA_PREFIX)/include - CUVS_LDFLAGS := -L$(CONDA_PREFIX)/envs/go/lib -lcuvs -lcuvs_c + CUVS_LDFLAGS := -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c CUDA_CFLAGS := -I/usr/local/cuda/include $(CUVS_CFLAGS) CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart $(CUVS_LDFLAGS) -lstdc++ TAGS += -tags "gpu" @@ -198,11 +199,11 @@ ifeq ($(TYPECHECK),1) TAGS += -tags "typecheck" endif -CGO_OPTS :=CGO_CFLAGS="-I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)" -GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)" +CGO_OPTS :=CGO_CFLAGS="-I$(CGO_DIR) -I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)" +GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)" ifeq ("$(UNAME_S)","darwin") -GOLDFLAGS:=-ldflags="-extldflags '-L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)" +GOLDFLAGS:=-ldflags="-extldflags '-L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)" endif ifeq ($(GOBUILD_OPT),) diff --git a/cgo/Makefile b/cgo/Makefile index 5678f16cf5814..d25f0400aab96 100644 --- a/cgo/Makefile +++ b/cgo/Makefile @@ -1,48 
+1,77 @@ DEBUG_OPT := UNAME_M := $(shell uname -m) +UNAME_S := $(shell uname -s) +CC ?= gcc # Yeah, fast math. We want it to be fast, for all xcall, # IEEE compliance should not be an issue. OPT_LV := -O3 -ffast-math -ftree-vectorize -funroll-loops -CFLAGS=-std=c99 -g ${OPT_LV} -Wall -Werror -I../thirdparties/install/include -OBJS=mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o -CUDA_OBJS= +COMMON_CFLAGS := -g $(OPT_LV) -Wall -Werror -fPIC -I../thirdparties/install/include +CFLAGS := -std=c99 $(COMMON_CFLAGS) +OBJS := mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o +CUDA_OBJS := +LDFLAGS := -L../thirdparties/install/lib -lusearch_c +TARGET_LIB := libmo.so + +ifeq ($(UNAME_S),Darwin) + TARGET_LIB := libmo.dylib + LDFLAGS += -dynamiclib -undefined dynamic_lookup -install_name @rpath/$(TARGET_LIB) +else + LDFLAGS += -shared +endif ifeq ($(UNAME_M), x86_64) - CFLAGS+= -march=haswell + CFLAGS += -march=haswell endif ifeq ($(MO_CL_CUDA),1) + ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) 
+ endif CC = /usr/local/cuda/bin/nvcc - CFLAGS = -ccbin g++ -m64 --shared -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 + CFLAGS = -ccbin g++ -m64 -Xcompiler -fPIC -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 CFLAGS += -I../thirdparties/install/include -DMO_CL_CUDA CUDA_OBJS += cuda/cuda.o - CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++ + # Explicitly include all needed libraries for shared library linking + CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lstdc++ + LDFLAGS += $(CUDA_LDFLAGS) endif -all: libmo.a +.PHONY: all clean test debug + +all: $(TARGET_LIB) libmo.a -libmo.a: $(OBJS) +$(TARGET_LIB): $(OBJS) ifeq ($(MO_CL_CUDA),1) - make -C cuda + $(MAKE) -C cuda + $(MAKE) -C cuvs + $(CC) $(LDFLAGS) -o $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o +else + $(CC) $(LDFLAGS) -o $@ $(OBJS) endif - ar -rcs libmo.a $(OBJS) $(CUDA_OBJS) -# -# $(CC) -o libmo.a $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS) +libmo.a: $(OBJS) +ifeq ($(MO_CL_CUDA),1) + $(MAKE) -C cuda + $(MAKE) -C cuvs + ar -rcs $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o +else + ar -rcs $@ $(OBJS) +endif +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ -test: libmo.a - make -C test +test: $(TARGET_LIB) + $(MAKE) -C test -.PHONY: debug debug: override OPT_LV := -O0 debug: override DEBUG_OPT := debug debug: all -.PHONY: clean clean: - rm -f *.o *.a *.so + rm -f *.o *.a *.so *.dylib ifeq ($(MO_CL_CUDA),1) - make -C cuda clean + $(MAKE) -C cuda clean + $(MAKE) -C cuvs clean endif diff --git a/cgo/README.md b/cgo/README.md index 5699ca4d292a2..ffb190c652bc3 100644 --- 
a/cgo/README.md +++ b/cgo/README.md @@ -1,25 +1,28 @@ MatrixOne CGO Kernel =============================== -This directory contains cgo source code for MO. Running -make should produce two files to be used by go code. -On go side, go will `include "mo.h"` and `-lmo`. +This directory contains CGO source code for MatrixOne. Running `make` produces the core library files used by Go code. + +On the Go side, the integration typically uses `mo.h` and links against the generated libraries: ``` mo.h -libmo.a +libmo.a / libmo.so ``` -`mo.h` should be pristine, meaning it only contains C function -prototype used by go. The only datatypes that can be passed -between go and c code are int and float/double and pointer. -Always explicitly specify int size such as `int32_t`, `uint64_t`. -Do not use `int`, `long`, etc. +`mo.h` should remain pristine, containing only C function prototypes for Go to consume. Data passed between Go and C should be limited to standard types (int, float, double, pointers). Always specify explicit integer sizes (e.g., `int32_t`, `uint64_t`) and avoid platform-dependent types like `int` or `long`. + +GPU Support (CUDA & cuVS) +------------------------- +The kernel supports GPU acceleration for certain operations (e.g., vector search) via NVIDIA CUDA and the cuVS library. + +- **Build Flag:** GPU support is enabled by setting `MO_CL_CUDA=1` during the build. +- **Environment:** Requires a working CUDA installation and a Conda environment with `cuvs` and `rmm` installed. +- **Source Code:** GPU-specific code resides in the `cuda/` and `cuvs/` subdirectories. Implementation Notes --------------------------------- +-------------------- -1. Pure C. -2. Use memory passed from go. Try not allocate memory in C code. -3. Only depends on libc and libm. -4. If 3rd party lib is absolutely necessary, import source code - and build from source. If 3rd party lib is C++, wrap it completely in C. +1. **Language:** Core kernel is Pure C. 
GPU extensions use C++ and CUDA, wrapped in a C-compatible interface. +2. **Memory Management:** Prefer using memory allocated and passed from Go. Minimize internal allocations in C/C++ code. +3. **Dependencies:** The base kernel depends only on `libc`, `libm`, and `libusearch`. GPU builds introduce dependencies on CUDA, `cuvs`, and `rmm`. +4. **Third-party Libraries:** If a third-party library is necessary, it should be built from source (see `thirdparties/` directory). C++ libraries must be fully wrapped in C before being exposed to Go. diff --git a/cgo/cuda/Makefile b/cgo/cuda/Makefile index a95913b014d58..eca30f9be2b98 100644 --- a/cgo/cuda/Makefile +++ b/cgo/cuda/Makefile @@ -395,7 +395,7 @@ $(FATBIN_FILE): mocl.cu $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $< cuda.o: cuda.cpp - $(EXEC) $(NVCC) $(INCLUDES) -O3 --shared $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) -O3 --shared -Xcompiler -fPIC $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< mytest.o: cuda.cpp $(FATBIN_FILE) $(EXEC) $(NVCC) $(INCLUDES) -DTEST_RUN -g -O0 $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< diff --git a/cgo/cuvs/Makefile b/cgo/cuvs/Makefile new file mode 100644 index 0000000000000..86ff4fd319723 --- /dev/null +++ b/cgo/cuvs/Makefile @@ -0,0 +1,75 @@ +# Makefile for MatrixOne cuVS C Wrapper + +UNAME_M := $(shell uname -m) +CUDA_PATH ?= /usr/local/cuda +NVCC := $(CUDA_PATH)/bin/nvcc + +ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) +endif + +# Compilation flags +# Added --extended-lambda because raft/core/copy.cuh requires it for some internal headers +NVCC_FLAGS := -std=c++17 -x cu -Xcompiler "-Wall -Wextra -fPIC -O2" --extended-lambda --expt-relaxed-constexpr +NVCC_FLAGS += -I. 
-I$(CUDA_PATH)/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs +NVCC_FLAGS += -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 + +# Linking flags +LDFLAGS := -shared +LDFLAGS += -L$(CUDA_PATH)/lib64/stubs -lcuda -L$(CUDA_PATH)/lib64 -lcudart +LDFLAGS += -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lrapids_logger +LDFLAGS += -Xlinker -lpthread -Xlinker -lm + +# Target library +TARGET := libmocuvs.so + +# Source files +SRCS := brute_force_c.cpp ivf_flat_c.cpp ivf_pq_c.cpp cagra_c.cpp kmeans_c.cpp helper.cpp adhoc_c.cpp distance_c.cpp +OBJS := $(SRCS:.cpp=.o) + +# Test configuration +TESTDIR := test +OBJDIR := obj +TEST_EXE := test_cuvs_worker +TEST_SRCS := $(TESTDIR)/main_test.cu \ + $(TESTDIR)/brute_force_test.cu \ + $(TESTDIR)/ivf_flat_test.cu \ + $(TESTDIR)/ivf_pq_test.cu \ + $(TESTDIR)/cagra_test.cu \ + $(TESTDIR)/kmeans_test.cu \ + $(TESTDIR)/quantize_test.cu \ + $(TESTDIR)/distance_test.cu \ + $(TESTDIR)/batching_test.cu + +TEST_OBJS := $(patsubst $(TESTDIR)/%.cu, $(OBJDIR)/test/%.o, $(TEST_SRCS)) + +.PHONY: all clean test + +all: $(OBJS) + +$(TARGET): $(OBJS) + @echo "Linking shared library $@" + $(NVCC) $(LDFLAGS) $^ -o $@ + +%.o: %.cpp + @echo "Compiling $< with NVCC" + $(NVCC) $(NVCC_FLAGS) -c $< -o $@ + +# Test targets +test: $(TEST_EXE) + @echo "Running tests..." + ./$(TEST_EXE) + +$(TEST_EXE): $(TEST_OBJS) helper.o + @echo "NVCCLD $@" + $(NVCC) $(subst -x cu,,$(NVCC_FLAGS)) $^ $(subst -shared,,$(LDFLAGS)) -o $@ + +$(OBJDIR)/test/%.o: $(TESTDIR)/%.cu + @mkdir -p $(@D) + @echo "NVCC $<" + $(NVCC) -std=c++17 -Xcompiler "-Wall -Wextra -fPIC -O2" --extended-lambda --expt-relaxed-constexpr -I. 
-I$(CUDA_PATH)/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 -c $< -o $@ + +clean: + @echo "Cleaning up..." + rm -f $(TARGET) *.o $(TEST_EXE) + rm -rf $(OBJDIR) diff --git a/cgo/cuvs/README.md b/cgo/cuvs/README.md new file mode 100644 index 0000000000000..7f0ac3b5c169a --- /dev/null +++ b/cgo/cuvs/README.md @@ -0,0 +1,119 @@ +✦ Architecture Design: cuVS-Accelerated Vector Indexing + + 1. Overview + The MatrixOne cuvs package provides a high-performance, GPU-accelerated vector search and clustering infrastructure. It acts as + a bridge between the Go-based database kernel and NVIDIA's cuVS and RAFT libraries. The architecture is designed to solve three + primary challenges: + 1. Impedance Mismatch: Reconciling Go’s concurrent goroutine scheduler with CUDA’s thread-specific resource requirements. + 2. Scalability: Supporting datasets that exceed single-GPU memory (Sharding) or high-concurrency search requirements + (Replicated). + 3. Efficiency: Minimizing CUDA kernel launch overhead via dynamic query batching. + + --- + + 2. Core Component: cuvs_worker_t + The cuvs_worker_t is the foundational engine of the architecture. + + Implementation Details: + * Persistent C++ Thread Pool: Instead of executing CUDA calls directly from CGO (which could be scheduled on any OS thread), + the worker maintains a dedicated pool of long-lived C++ threads. Each thread is pinned to a specific GPU device. + * Job Queuing: Requests from the Go layer are submitted as "Jobs" to an internal thread-safe queue. The worker returns a + std::future, allowing the Go layer to perform other tasks while the GPU processes the request. 
+ * Context Stability: By using dedicated threads, we ensure that CUDA context and RAFT resource handles remain stable and + cached, avoiding the expensive overhead of context creation or handle re-initialization. + + --- + + 3. Distribution Modes + The system supports three distinct modes to leverage multi-GPU hardware: + + A. Single GPU Mode + * Design: The index resides entirely on one device. + * Use Case: Small to medium datasets where latency is the priority. + + B. Replicated Mode (Scaling Throughput) + * Design: The full index is loaded onto multiple GPUs simultaneously. + * Mechanism: The cuvs_worker implements a load-balancing strategy (typically round-robin). Incoming queries are dispatched to + the next available GPU. + * Benefit: Linearly scales the Queries Per Second (QPS) by utilizing the compute power of all available GPUs. + + C. Sharded Mode (Scaling Capacity) + * Design: The dataset is partitioned into $N$ shards across $N$ GPUs. + * Mechanism: + 1. Broadcast: A search request is sent to all GPUs. + 2. Local Search: Each GPU searches its local shard independently using RAFT resources. + 3. Top-K Merge: The worker aggregates the results ($N \times K$ candidates) and performs a final merge-sort (often on the + CPU or via a fast GPU kernel) to return the global top-K. + * Benefit: Enables indexing of massive datasets (e.g., 100M+ vectors) that would not fit in the memory of a single GPU. + + --- + + 4. RAFT Resource Management + The package relies on RAFT (raft::resources) for all CUDA-accelerated operations. + + * Resource Caching: raft::resources objects (containing CUDA streams, cuBLAS handles, and workspace memory) are held within the + cuvs_worker threads. They are created once at Start() and reused for the lifetime of the index. + * Stream-Based Parallelism: Every index operation is executed asynchronously on a RAFT-managed CUDA stream. 
This allows the + system to overlap data transfers (Host-to-Device) with kernel execution, maximizing hardware utilization. + * Memory Layout: Leveraging raft::mdspan and raft::mdarray ensures that memory is handled in a layout-aware manner + (C-contiguous or Fortran-contiguous), matching the requirements of optimized BLAS and LAPACK kernels. + + --- + + 5. Dynamic Batching: The Throughput Key + In a database environment, queries often arrive one by one from different users. Processing these as individual CUDA kernels is + inefficient due to launch overhead and under-utilization of GPU warps. + + The Dynamic Batching Mechanism: + * Aggregation Window: When multiple search requests arrive at the worker within a small time window (microseconds), the worker + stalls briefly to aggregate them. + * Matrix Consolidation: Individual query vectors are packed into a single large query matrix. + * Consolidated Search: A single cuvs::neighbors::search call is made. GPUs are significantly more efficient at processing one + $64 \times D$ matrix than 64 individual $1 \times D$ vectors. + * Automatic Fulfillment: Once the batch search completes, the worker de-multiplexes the results and fulfills the specific + std::future for each individual Go request. + + --- + + 6. Automatic Type Quantization + To optimize memory footprint and search speed, the architecture features an automated quantization pipeline that converts + high-precision float32 vectors into compressed formats. + + * Transparent Conversion: The Go layer can consistently provide float32 data. The system automatically handles the conversion + to the index's internal type (half, int8, or uint8) directly on the GPU. + * FP16 (Half Precision): + * Mechanism: Uses raft::copy to perform bit-level conversion from 32-bit to 16-bit floating point. + * Benefit: 2x memory reduction with negligible impact on search recall. + * 8-Bit Integer (int8/uint8): + * Mechanism: Implements a learned Scalar Quantizer. 
The system samples the dataset to determine optimal min and max + clipping bounds. + * Training: Before building, the quantizer is "trained" on a subset of the data to ensure the 256 available integer levels + are mapped to the most significant range of the distribution. + * Benefit: 4x memory reduction, enabling massive datasets to reside in VRAM. + * GPU-Accelerated: All quantization kernels are executed on the device. This minimizes CPU usage and avoids the latency of + converting data before sending it over the PCIe bus. + + 7. Supported Index Types + The following indexes are fully integrated into the MatrixOne GPU architecture: + + + ┌──────────┬──────────────────────┬───────────────────────────────────────────────────────────────────────────────┐ + │ Index │ Algorithm │ Strengths │ + ├──────────┼──────────────────────┼───────────────────────────────────────────────────────────────────────────────┤ + │ CAGRA │ Hardware-accelerated │ Best-in-class search speed and high recall. Optimized for hardware graph │ + │ │ Graph │ traversal. │ + │ IVF-Flat │ Inverted File Index │ High accuracy and fast search. Excellent for general-purpose use. │ + │ IVF-PQ │ Product Quantization │ Extreme compression. Supports billions of vectors via lossy code compression. │ + │ Brute │ Exact Flat Search │ 100% recall. Ideal for small datasets or generating ground-truth for │ + │ Force │ │ benchmarks. │ + │ K-Means │ Clustering │ High-performance centroid calculation for data partitioning and unsupervised │ + │ │ │ learning. │ + └──────────┴──────────────────────┴───────────────────────────────────────────────────────────────────────────────┘ + + + 8. Operational Telemetry + All indexes implement a unified Info() method that returns a JSON-formatted string. This allows the database to programmatically + verify: + * Hardware Mapping: Which GPU devices are holding which shards. + * Data Layout: Element sizes, dimensions, and current vector counts. 
+ * Hyper-parameters: Internal tuning values like NLists, GraphDegree, or PQBits. diff --git a/cgo/cuvs/adhoc.hpp b/cgo/cuvs/adhoc.hpp new file mode 100644 index 0000000000000..310db80fbc336 --- /dev/null +++ b/cgo/cuvs/adhoc.hpp @@ -0,0 +1,127 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "helper.h" +#include +#include + +namespace matrixone { + +/** + * @brief Performs an ad-hoc brute-force search on GPU without using a worker thread. + * This is intended for scenarios where an index is not pre-built and the + * search needs to be executed immediately in the current thread context. + * + * @tparam T Data type of the vector elements (e.g., float, half). + * @param res RAFT resources handle. + * @param dataset Host pointer to the dataset vectors. + * @param n_rows Number of vectors in the dataset. + * @param dim Dimension of each vector. + * @param queries Host pointer to the query vectors. + * @param n_queries Number of query vectors. + * @param limit Number of nearest neighbors to find (k). + * @param metric Distance metric to use. + * @param neighbors Host pointer to store the resulting neighbor IDs (size: n_queries * limit). + * @param distances Host pointer to store the resulting distances (size: n_queries * limit). 
+ */ +template +void adhoc_brute_force_search(const raft::resources& res, + const T* dataset, + uint64_t n_rows, + uint32_t dim, + const T* queries, + uint64_t n_queries, + uint32_t limit, + cuvs::distance::DistanceType metric, + int64_t* neighbors, + float* distances) { + auto stream = raft::resource::get_cuda_stream(res); + + // Helper to align sizes to 256 bytes (CUDA default alignment) + auto align_size = [](size_t size) { + return (size + 255) & ~255; + }; + + // 1. Calculate total buffer sizes with alignment + size_t dataset_bytes = n_rows * dim * sizeof(T); + size_t queries_bytes = n_queries * dim * sizeof(T); + size_t neighbors_bytes = n_queries * limit * sizeof(int64_t); + size_t distances_bytes = n_queries * limit * sizeof(float); + + size_t dataset_alloc = align_size(dataset_bytes); + size_t queries_alloc = align_size(queries_bytes); + size_t neighbors_alloc = align_size(neighbors_bytes); + size_t total_bytes = dataset_alloc + queries_alloc + neighbors_alloc + distances_bytes; + + // Use a single allocation for all temporary buffers to reduce overhead + void* d_ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&d_ptr, total_bytes, stream)); + + char* d_dataset = static_cast(d_ptr); + char* d_queries = d_dataset + dataset_alloc; + char* d_neighbors = d_queries + queries_alloc; + char* d_distances = d_neighbors + neighbors_alloc; + + // 2. Async copies to Device + RAFT_CUDA_TRY(cudaMemcpyAsync(d_dataset, dataset, dataset_bytes, cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_queries, queries, queries_bytes, cudaMemcpyHostToDevice, stream)); + + // 3. 
Prepare Views (zero allocation) + auto dataset_view = raft::make_device_matrix_view(reinterpret_cast(d_dataset), n_rows, dim); + auto queries_view = raft::make_device_matrix_view(reinterpret_cast(d_queries), n_queries, dim); + auto neighbors_view = raft::make_device_matrix_view(reinterpret_cast(d_neighbors), n_queries, limit); + auto distances_view = raft::make_device_matrix_view(reinterpret_cast(d_distances), n_queries, limit); + + // 4. Build temporary index (view-based, very fast) + cuvs::neighbors::brute_force::index_params index_params; + index_params.metric = metric; + auto index = cuvs::neighbors::brute_force::build(res, index_params, raft::make_const_mdspan(dataset_view)); + + // 5. Execute Search + cuvs::neighbors::brute_force::search_params search_params; + cuvs::neighbors::brute_force::search(res, search_params, index, + raft::make_const_mdspan(queries_view), + neighbors_view, + distances_view); + + // 6. Async copy results back to host + RAFT_CUDA_TRY(cudaMemcpyAsync(neighbors, d_neighbors, neighbors_bytes, cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(distances, d_distances, distances_bytes, cudaMemcpyDeviceToHost, stream)); + + // 7. Synchronize + raft::resource::sync_stream(res); + + // 8. Async free + RAFT_CUDA_TRY(cudaFreeAsync(d_ptr, stream)); + + // Handle invalid neighbor indices (consistent with existing brute_force.hpp) + for (size_t i = 0; i < n_queries * limit; ++i) { + if (neighbors[i] == std::numeric_limits::max() || + neighbors[i] == 4294967295LL || neighbors[i] < 0) { + neighbors[i] = -1; + } + } +} + +} // namespace matrixone diff --git a/cgo/cuvs/adhoc_c.cpp b/cgo/cuvs/adhoc_c.cpp new file mode 100644 index 0000000000000..a28099297f2ed --- /dev/null +++ b/cgo/cuvs/adhoc_c.cpp @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "adhoc_c.h" +#include "adhoc.hpp" +#include "helper.h" +#include +#include + +extern "C" { + +void gpu_adhoc_brute_force_search(const void* dataset, + uint64_t n_rows, + uint32_t dim, + const void* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + quantization_t qtype, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cudaSetDevice(device_id); + const auto& res = matrixone::get_raft_resources(); + auto m = static_cast(metric); + + if (qtype == Quantization_F32) { + matrixone::adhoc_brute_force_search(res, + static_cast(dataset), + n_rows, dim, + static_cast(queries), + n_queries, limit, m, + neighbors, distances); + } else if (qtype == Quantization_F16) { + matrixone::adhoc_brute_force_search(res, + static_cast(dataset), + n_rows, dim, + static_cast(queries), + n_queries, limit, m, + neighbors, distances); + } else { + throw std::runtime_error("Unsupported quantization type for adhoc search"); + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_adhoc_brute_force_search", e.what()); + } +} + +void gpu_adhoc_brute_force_search_float(const float* dataset, + uint64_t n_rows, + uint32_t dim, + const float* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg) { + gpu_adhoc_brute_force_search(dataset, n_rows, dim, queries, n_queries, limit, metric, Quantization_F32, device_id, neighbors, distances, errmsg); 
+} + +} // extern "C" diff --git a/cgo/cuvs/adhoc_c.h b/cgo/cuvs/adhoc_c.h new file mode 100644 index 0000000000000..43146bf4deed7 --- /dev/null +++ b/cgo/cuvs/adhoc_c.h @@ -0,0 +1,72 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ADHOC_C_H +#define ADHOC_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Performs an ad-hoc brute-force search on GPU. + * + * @param dataset Host pointer to the dataset vectors. + * @param n_rows Number of vectors in the dataset. + * @param dim Dimension of each vector. + * @param queries Host pointer to the query vectors. + * @param n_queries Number of query vectors. + * @param limit Number of nearest neighbors to find (k). + * @param metric Distance metric to use. + * @param qtype Quantization type (F32, F16). + * @param device_id GPU device ID to use. + * @param neighbors Host pointer to store the resulting neighbor IDs (size: n_queries * limit). + * @param distances Host pointer to store the resulting distances (size: n_queries * limit). + * @param errmsg Pointer to store error message if any. 
+ */ +void gpu_adhoc_brute_force_search(const void* dataset, + uint64_t n_rows, + uint32_t dim, + const void* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + quantization_t qtype, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg); + +void gpu_adhoc_brute_force_search_float(const float* dataset, + uint64_t n_rows, + uint32_t dim, + const float* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // ADHOC_C_H diff --git a/cgo/cuvs/blog.md b/cgo/cuvs/blog.md new file mode 100644 index 0000000000000..b49773aee0de3 --- /dev/null +++ b/cgo/cuvs/blog.md @@ -0,0 +1,52 @@ +# Scaling 50 Million Vectors on Modest Hardware: How MatrixOne Leverages cuVS for Extreme IVF-Flat Performance + +As AI applications proliferate, the demand for efficient vector search at scale has moved from a "nice-to-have" to a core database requirement. At MatrixOrigin, we recently faced a significant engineering challenge: **How do we build and search an IVF-Flat index of 50 million 1024-dimensional vectors on a server with only 16 cores and 64GB of RAM?** + +Traditional CPU-based approaches were hitting a wall. Building the index took days, and search latency was inconsistent. By integrating NVIDIA’s **cuVS** and **RAFT** libraries into our architecture, we transformed our performance profile. Here is the step-by-step story of how we did it. + +## The Challenge: The "Giant Index" Problem +Our target was an IVF-Flat index with approximately 8,000 clusters holding 50 million vectors. On a 16-core machine, we encountered three primary bottlenecks: +1. **Clustering Latency**: Standard K-Means was slow and often produced unbalanced clusters, leading to "hotspots" that slowed down search. +2. **Assignment Overhead**: Mapping 50 million vectors to their nearest centroids is computationally expensive. 
On CPUs, this task competed for resources with data loading and decompression, dragging the process out to 24 hours. +3. **The GPU "Single Query" Trap**: Databases typically process one query at a time. GPUs, however, only show their true strength when processing large batches. + +## Step 1: Solving Clustering with Balanced K-Means +Standard K-Means often results in some clusters having thousands of vectors while others have almost none. In an IVF index, this leads to unpredictable IO and search times. + +We initially implemented our own balanced K-Means, which brought the clustering time down from 30 minutes to 5 minutes. However, by switching to the **cuVS Balanced K-Means algorithm**, we utilized GPU parallelism to its fullest. +* **Result**: Clustering time dropped from **5 minutes to just 5 seconds**. + +## Step 2: Offloading Assignment to Brute-Force GPU Kernels +Once the 8,000 centroids are defined, every one of the 50 million vectors must be assigned to its closest cluster. Doing this on a 16-core CPU is a nightmare of cache misses and thread contention. + +By using the **cuVS Brute-Force index** to "offload" this distance computation to the GPU, we eliminated the CPU bottleneck entirely. +* **Result**: The assignment phase dropped from **24 hours to 30 minutes**. + +## Step 3: The Architecture—`cuvs_worker_t` and Dynamic Batching +To solve the "Single Query" problem, we designed a sophisticated bridge between Go and CUDA: the `cuvs_worker_t`. + +### Dynamic Batching: The Secret Sauce +Instead of launching a new CUDA kernel for every incoming request, our worker implements **Dynamic Batching**. It holds incoming queries for a tiny microsecond window, consolidates them into a single matrix, and executes one large GPU search. +* This maximizes warp utilization and reduces kernel launch overhead. +* **Performance Gain**: Provides a **5x-10x throughput boost** in high-concurrency environments. 
+ +### RAFT Resource Management +We leverage the **RAFT** library to manage long-lived `raft::resources`. By caching CUDA streams and handles within persistent C++ threads, we ensure that our Go-based kernel can interact with the GPU with near-zero resource initialization overhead. + +## Step 4: Staying Within 64GB with Auto-Quantization +50 million 1024D vectors in `float32` require roughly 200GB of space—far exceeding our 64GB RAM limit. To solve this, we implemented **Automatic Type Quantization** directly on the GPU. +* **FP16 (Half Precision)**: Reduces memory by 2x with almost zero recall loss. +* **8-Bit Integer (int8/uint8)**: Uses a learned Scalar Quantizer to compress vectors by 4x. +* Because conversion happens on the GPU, we avoid taxing the CPU and minimize PCIe bus traffic. + +## Summary of Supported Indexes +Our architecture now supports a suite of high-performance indexes: +* **CAGRA**: A hardware-accelerated graph index for state-of-the-art search speed. +* **IVF-Flat**: The workhorse for high-accuracy general-purpose search. +* **IVF-PQ**: For extreme compression of billion-scale datasets. +* **K-Means**: For high-speed data partitioning. + +## Conclusion +By shifting the heavy lifting of clustering, assignment, and quantization to the GPU through cuVS, MatrixOne can now handle massive vector datasets on surprisingly modest hardware. What once took a full day now takes less than an hour, with search latencies that remain low even under heavy load. + +The integration of `cuvs_worker_t` and dynamic batching ensures that we don't just have a "fast index," but a **production-ready database engine** capable of scaling with the needs of modern AI. 
diff --git a/cgo/cuvs/brute_force.hpp b/cgo/cuvs/brute_force.hpp new file mode 100644 index 0000000000000..25b3178be6363 --- /dev/null +++ b/cgo/cuvs/brute_force.hpp @@ -0,0 +1,326 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include // For std::copy +#include // For simulation debug logs +#include +#include // For std::iota +#include // For std::runtime_error +#include +#include +#include +#include // For std::promise and std::future +#include // For std::numeric_limits +#include // For std::shared_mutex + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include // For raft::device_matrix +#include // Required for device_matrix_view +#include // For raft::host_matrix +#include // Core resource handle +#include +#include // For raft::copy with type conversion + + +// cuVS includes +#include // cuVS distance API +#include +#include "quantize.hpp" +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Brute-force nearest neighbor search on GPU. + * @tparam T Data type of the vector elements (e.g., float, half). 
+ */ +template +class gpu_brute_force_t : public gpu_index_base_t { +public: + std::unique_ptr> index; + + ~gpu_brute_force_t() override { + this->destroy(); + } + + /** + * @brief Constructor for brute-force search. + */ + gpu_brute_force_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, cuvs::distance::DistanceType m, + uint32_t nthread, int device_id = 0) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->devices_ = {device_id}; + this->current_offset_ = static_cast(count_vectors); + + this->worker = std::make_unique(nthread, this->devices_); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + /** + * @brief Constructor for an empty index (chunked addition support). + */ + gpu_brute_force_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + uint32_t nthread, int device_id = 0) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->devices_ = {device_id}; + this->current_offset_ = 0; + + this->worker = std::make_unique(nthread, this->devices_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + /** + * @brief Starts the worker and initializes resources. + */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the dataset to the GPU and builds the index. 
+ */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->count == 0) { + index = nullptr; + this->is_loaded_ = true; + return; + } + + if (this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + this->is_loaded_ = true; + // Clear host dataset after building to save memory + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + if (this->flattened_host_dataset.empty()) { + index = nullptr; + return; + } + + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(this->count), static_cast(this->dimension))); + + this->dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::brute_force::index_params index_params; + index_params.metric = this->metric; + + index = std::make_unique>( + cuvs::neighbors::brute_force::build(*res, index_params, raft::make_const_mdspan(dataset_device->view()))); + + raft::resource::sync_stream(*res); + } + + /** + * @brief Search result containing neighbor IDs and distances. 
+ */ + struct search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors + }; + + /** + * @brief Performs brute-force search for given queries. + */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || !index) return search_result_t{}; + + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, queries_data](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::brute_force::search_params search_params; + cuvs::neighbors::brute_force::search(*res, search_params, *index, + raft::make_const_mdspan(queries_device.view()), neighbors_device.view(), distances_device.view()); + + search_result_t s_res; + s_res.neighbors.resize(num_queries * limit); + s_res.distances.resize(num_queries * limit); + + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.neighbors.data(), neighbors_device.data_handle(), + s_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.distances.data(), distances_device.data_handle(), + s_res.distances.size() * sizeof(float), 
cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < s_res.neighbors.size(); ++i) { + if (s_res.neighbors[i] == std::numeric_limits::max() || + s_res.neighbors[i] == 4294967295LL || s_res.neighbors[i] < 0) { + s_res.neighbors[i] = -1; + } + } + return s_res; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Performs brute-force search for given float32 queries, with on-the-fly conversion if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || !index) return search_result_t{}; + + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, queries_data](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::brute_force::search_params search_params; + 
cuvs::neighbors::brute_force::search(*res, search_params, *index, + raft::make_const_mdspan(queries_device_target.view()), neighbors_device.view(), distances_device.view()); + + search_result_t s_res; + s_res.neighbors.resize(num_queries * limit); + s_res.distances.resize(num_queries * limit); + + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.neighbors.data(), neighbors_device.data_handle(), + s_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.distances.data(), distances_device.data_handle(), + s_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < s_res.neighbors.size(); ++i) { + if (s_res.neighbors[i] == std::numeric_limits::max() || + s_res.neighbors[i] == 4294967295LL || s_res.neighbors[i] < 0) { + s_res.neighbors[i] = -1; + } + } + return s_res; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"BruteForce\", \"brute_force\": {"; + if (index) { + json += "\"size\": " + std::to_string(index->size()); + } else { + json += "\"size\": 0, \"built\": false"; + } + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/brute_force_c.cpp b/cgo/cuvs/brute_force_c.cpp new file mode 100644 index 0000000000000..f880115b10b2e --- /dev/null +++ b/cgo/cuvs/brute_force_c.cpp @@ -0,0 +1,274 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "brute_force_c.h" +#include "brute_force.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_brute_force_any_t { + + quantization_t qtype; + void* ptr; + + gpu_brute_force_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_brute_force_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_brute_force_c gpu_brute_force_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* index_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + index_ptr = new matrixone::gpu_brute_force_t(static_cast(dataset_data), count_vectors, dimension, metric, nthread, device_id); + break; + case Quantization_F16: + index_ptr = new matrixone::gpu_brute_force_t(static_cast(dataset_data), count_vectors, dimension, metric, nthread, device_id); + break; + default: + throw std::runtime_error("Unsupported quantization type for brute force (only f32 and f16 supported)"); + } + return static_cast(new gpu_brute_force_any_t(qtype, index_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_new", e.what()); + return nullptr; + } +} + +gpu_brute_force_c 
gpu_brute_force_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* index_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + index_ptr = new matrixone::gpu_brute_force_t(total_count, dimension, metric, nthread, device_id); + break; + case Quantization_F16: + index_ptr = new matrixone::gpu_brute_force_t(total_count, dimension, metric, nthread, device_id); + break; + default: + throw std::runtime_error("Unsupported quantization type for brute force (only f32 and f16 supported)"); + } + return static_cast(new gpu_brute_force_any_t(qtype, index_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_new_empty", e.what()); + return nullptr; + } +} + +void gpu_brute_force_start(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_start", e.what()); + } +} + +void gpu_brute_force_build(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_build", e.what()); + } +} + +void gpu_brute_force_add_chunk(gpu_brute_force_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) 
*(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_add_chunk", e.what()); + } +} + +void gpu_brute_force_add_chunk_float(gpu_brute_force_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_add_chunk_float", e.what()); + } +} + +gpu_brute_force_search_result_c gpu_brute_force_search(gpu_brute_force_c index_c, const void* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + void* result_ptr = nullptr; + switch (any->qtype) { + case Quantization_F32: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + case Quantization_F16: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + default: break; + } + return static_cast(result_ptr); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_search", e.what()); + return 
nullptr; + } +} + +gpu_brute_force_search_result_c gpu_brute_force_search_float(gpu_brute_force_c index_c, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + void* result_ptr = nullptr; + switch (any->qtype) { + case Quantization_F32: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + case Quantization_F16: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + default: break; + } + return static_cast(result_ptr); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_search_float", e.what()); + return nullptr; + } +} + +void gpu_brute_force_get_results(gpu_brute_force_search_result_c result_c, uint64_t num_queries, uint32_t limit, int64_t* neighbors, float* distances) { + if (!result_c) return; + auto* search_result = static_cast::search_result_t*>(result_c); + + size_t total = num_queries * limit; + if (search_result->neighbors.size() >= total) { + std::copy(search_result->neighbors.begin(), search_result->neighbors.begin() + total, neighbors); + } else { + std::fill(neighbors, neighbors + total, -1); + } + + if (search_result->distances.size() >= total) { + std::copy(search_result->distances.begin(), search_result->distances.begin() + total, distances); + } else { + std::fill(distances, distances + total, std::numeric_limits::infinity()); + } +} + +void gpu_brute_force_free_search_result(gpu_brute_force_search_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +uint32_t gpu_brute_force_cap(gpu_brute_force_c index_c) { + if (!index_c) return 0; + auto* 
any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_brute_force_len(gpu_brute_force_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_brute_force_info(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_info", e.what()); + return nullptr; + } +} + +void gpu_brute_force_destroy(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_destroy", e.what()); + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_brute_force_t; +template class gpu_brute_force_t; +} diff --git a/cgo/cuvs/brute_force_c.h b/cgo/cuvs/brute_force_c.h new file mode 100644 index 0000000000000..3c28e47e2bdfd --- /dev/null +++ b/cgo/cuvs/brute_force_c.h @@ -0,0 +1,78 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef BRUTE_FORCE_C_H +#define BRUTE_FORCE_C_H + +#include "helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_brute_force_t object +typedef void* gpu_brute_force_c; + +// Opaque pointer to the C++ search result object +typedef void* gpu_brute_force_search_result_c; + +// Constructor for gpu_brute_force_t +gpu_brute_force_c gpu_brute_force_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_brute_force_c gpu_brute_force_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg); + +// Starts the worker and initializes resources +void gpu_brute_force_start(gpu_brute_force_c index_c, void* errmsg); + +// Builds the index (loads the dataset to the GPU) +void gpu_brute_force_build(gpu_brute_force_c index_c, void* errmsg); + +// Add chunk of data (same type as index quantization) +void gpu_brute_force_add_chunk(gpu_brute_force_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly conversion if needed) +void gpu_brute_force_add_chunk_float(gpu_brute_force_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// Performs a search operation +gpu_brute_force_search_result_c gpu_brute_force_search(gpu_brute_force_c index_c, const void* queries_data, uint64_t 
num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg); + +// Performs a search operation with float32 queries +gpu_brute_force_search_result_c gpu_brute_force_search_float(gpu_brute_force_c index_c, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg); + +// Retrieves the results from a search operation +void gpu_brute_force_get_results(gpu_brute_force_search_result_c result_c, uint64_t num_queries, uint32_t limit, int64_t* neighbors, float* distances); + +// Frees the memory for a gpu_brute_force_search_result_c object +void gpu_brute_force_free_search_result(gpu_brute_force_search_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_brute_force_cap(gpu_brute_force_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_brute_force_len(gpu_brute_force_c index_c); + +// Returns info about the index as a JSON string +char* gpu_brute_force_info(gpu_brute_force_c index_c, void* errmsg); + +// Destroys the gpu_brute_force_t object and frees associated resources +void gpu_brute_force_destroy(gpu_brute_force_c index_c, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // BRUTE_FORCE_C_H diff --git a/cgo/cuvs/cagra.hpp b/cgo/cuvs/cagra.hpp new file mode 100644 index 0000000000000..c5dcd3a0e8db2 --- /dev/null +++ b/cgo/cuvs/cagra.hpp @@ -0,0 +1,754 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include + +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Search result containing neighbor IDs and distances. + * Common for all CAGRA instantiations. + */ +struct cagra_search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors +}; + +/** + * @brief gpu_cagra_t implements a CAGRA index that can run on a single GPU or sharded across multiple GPUs. + * It automatically chooses between single-GPU and multi-GPU (SNMG) cuVS APIs based on the RAFT handle resources. 
+ */ +template +class gpu_cagra_t : public gpu_index_base_t { +public: + using cagra_index = cuvs::neighbors::cagra::index; + using mg_index = cuvs::neighbors::mg_index; + using search_result_t = cagra_search_result_t; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + ~gpu_cagra_t() override { + this->destroy(); + } + + // Unified Constructor for building from dataset + gpu_cagra_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const cagra_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = static_cast(count_vectors); + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + // Constructor for chunked input (pre-allocates) + gpu_cagra_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + const cagra_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); 
+ } + + // Unified Constructor for loading from file + gpu_cagra_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const cagra_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->filename_ = filename; + this->dimension = dimension; + this->metric = m; + this->count = 0; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + } + + // Private constructor for creating from an existing cuVS index (used by merge) + gpu_cagra_t(std::unique_ptr idx, + uint32_t dim, cuvs::distance::DistanceType m, uint32_t nthread, const std::vector& devices) + : index_(std::move(idx)) { + + this->metric = m; + this->dimension = dim; + this->devices_ = devices; + + // Merge result is currently a single-GPU index. + this->worker = std::make_unique(nthread, this->devices_, false); + + this->count = static_cast(index_->size()); + this->build_params.graph_degree = static_cast(index_->graph_degree()); + this->build_params.intermediate_graph_degree = this->build_params.graph_degree * 2; // Best guess + this->dist_mode = DistributionMode_SINGLE_GPU; + this->current_offset_ = this->count; + this->is_loaded_ = true; + } + + /** + * @brief Starts the worker and initializes resources. + */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the index from file or builds it from the dataset. 
+ */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->filename_.empty() && !index_ && this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + + this->is_loaded_ = true; + // Clear host dataset after building to save memory + if (this->filename_.empty()) { + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!this->filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::cagra::deserialize(*res, this->filename_)); + this->count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) this->count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + this->build_params.graph_degree = static_cast(mg_index_->ann_interfaces_[0].index_.value().graph_degree()); + } + } else { + index_ = std::make_unique(*res); + cuvs::neighbors::cagra::deserialize(*res, this->filename_, index_.get()); + this->count = static_cast(index_->size()); + this->build_params.graph_degree = static_cast(index_->graph_degree()); + } + raft::resource::sync_stream(*res); + } else if (!this->flattened_host_dataset.empty()) { + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + 
this->flattened_host_dataset.data(), (int64_t)this->count, (int64_t)this->dimension); + + cuvs::neighbors::cagra::index_params index_params; + index_params.metric = this->metric; + index_params.intermediate_graph_degree = this->build_params.intermediate_graph_degree; + index_params.graph_degree = this->build_params.graph_degree; + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (this->dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::cagra::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(this->count), static_cast(this->dimension))); + + this->dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::cagra::index_params index_params; + index_params.metric = this->metric; + index_params.intermediate_graph_degree = this->build_params.intermediate_graph_degree; + index_params.graph_degree = this->build_params.graph_degree; + index_params.attach_dataset_on_build = this->build_params.attach_dataset_on_build; + + index_ = std::make_unique( + cuvs::neighbors::cagra::build(*res, index_params, raft::make_const_mdspan(dataset_device->view()))); + } + raft::resource::sync_stream(*res); + } + } + + /** + * @brief Extends the existing index with additional vectors. + * @param additional_data Pointer to additional vectors on host. + * @param num_vectors Number of vectors to add. 
+ */ + void extend(const T* additional_data, uint64_t num_vectors) { + if (!this->is_loaded_ || !index_) { + uint64_t old_size = this->flattened_host_dataset.size(); + this->flattened_host_dataset.resize(old_size + num_vectors * this->dimension); + std::copy(additional_data, additional_data + num_vectors * this->dimension, this->flattened_host_dataset.begin() + old_size); + this->count += static_cast(num_vectors); + this->current_offset_ += static_cast(num_vectors); + return; + } + + if constexpr (std::is_same_v) { + throw std::runtime_error("CAGRA single-GPU extend is not supported for float16 (half) by cuVS."); + } else { + if (num_vectors == 0) return; + + std::unique_lock lock(this->mutex_); + + uint64_t job_id = this->worker->submit_main( + [&, additional_data, num_vectors](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + auto additional_dataset_device = raft::make_device_matrix( + *res, static_cast(num_vectors), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(additional_dataset_device.data_handle(), additional_data, + num_vectors * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::cagra::extend_params params; + cuvs::neighbors::cagra::extend(*res, params, raft::make_const_mdspan(additional_dataset_device.view()), *index_); + + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + + this->count = static_cast(index_->size()); + this->current_offset_ = this->count; + } + } + + /** + * @brief Merges multiple single-GPU CAGRA indices into a single index. + * @param indices Vector of pointers to indices to merge. + * @param nthread Number of worker threads for the merged index. + * @param devices GPU devices to use for the merged index. + * @return A new merged CAGRA index. 
+ */ + static std::unique_ptr> merge(const std::vector*>& indices, uint32_t nthread, const std::vector& devices) { + if (indices.empty()) throw std::invalid_argument("indices empty"); + uint32_t dim = indices[0]->dimension; + cuvs::distance::DistanceType m = indices[0]->metric; + + cuvs_worker_t transient_worker(1, devices, false); + transient_worker.start(); + + uint64_t job_id = transient_worker.submit_main( + [&indices](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + std::vector cagra_indices; + for (auto* idx : indices) { + if (!idx->is_loaded_ || !idx->index_) { + throw std::runtime_error("One of the indices to merge is not loaded or is a multi-GPU index (merge only supports single-GPU indices)."); + } + cagra_indices.push_back(idx->index_.get()); + } + + cuvs::neighbors::cagra::index_params index_params; + auto merged = cuvs::neighbors::cagra::merge(*res, index_params, cagra_indices); + raft::resource::sync_stream(*res); + return new cagra_index(std::move(merged)); + } + ); + + auto result = transient_worker.wait(job_id).get(); + if (result.error) { + transient_worker.stop(); + std::rethrow_exception(result.error); + } + + auto* merged_idx_ptr = std::any_cast(result.result); + std::unique_ptr merged_idx(merged_idx_ptr); + transient_worker.stop(); + + auto new_idx = std::make_unique>( + std::move(merged_idx), + dim, m, nthread, devices + ); + new_idx->is_loaded_ = true; + return new_idx; + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. 
+ */ + void save(const std::string& filename) { + if (!this->is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::cagra::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::cagra::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Performs CAGRA search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp CAGRA search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_internal(handle, queries_data, num_queries, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation + */ + search_result_t search_batch_internal(const T* queries_data, uint64_t num_queries, uint32_t limit, const cagra_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const T* data; + uint64_t n; + }; + + std::string batch_key = "cagra_s_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.itopk_size); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto 
results = this->search_internal(handle, aggregated_queries.data(), total_queries, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search implementation (no worker submission) + */ + search_result_t search_internal(raft_handle_wrapper_t& handle, const T* queries_data, uint64_t num_queries, uint32_t limit, const cagra_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::cagra::search_params search_params; + search_params.itopk_size = sp.itopk_size; + search_params.search_width = sp.search_width; + + const cagra_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, 
(int64_t)num_queries, (int64_t)this->dimension); + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::cagra::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else if (local_index) { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::cagra::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max()) { + search_res.neighbors[i] = static_cast(-1); + } + } + return 
search_res; + } + + /** + * @brief Performs CAGRA search for given float32 queries, with on-the-fly quantization if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit, sp); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_float_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_float_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation for float32 queries + */ + search_result_t search_float_batch_internal(const float* queries_data, uint64_t num_queries, uint32_t limit, const cagra_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const float* data; + uint64_t n; + }; + + std::string batch_key = "cagra_sf_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.itopk_size); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += 
std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto results = this->search_float_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search_float implementation (no worker submission) + */ + search_result_t search_float_internal(raft_handle_wrapper_t& handle, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. 
Quantize/Convert float queries to T on device + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, queries_device_float.view(), queries_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + } + + // 2. Perform search + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::cagra::search_params search_params; + search_params.itopk_size = sp.itopk_size; + search_params.search_width = sp.search_width; + + const cagra_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_target = raft::make_host_matrix(num_queries, this->dimension); + raft::copy(*res, queries_host_target.view(), queries_device_target.view()); + raft::resource::sync_stream(*res); + + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + 
cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::cagra::search(*res, *mg_index_, mg_search_params, + queries_host_target.view(), + neighbors_host_view, distances_host_view); + } else if (local_index) { + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::cagra::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device_target.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max()) { + search_res.neighbors[i] = static_cast(-1); + } + } + return search_res; + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"CAGRA\", \"cagra\": {"; + if (index_) { + json += "\"mode\": \"Single-GPU\", \"size\": " + std::to_string(index_->size()) + + ", \"graph_degree\": " + std::to_string(index_->graph_degree()); + } else if (mg_index_) { + json += "\"mode\": \"Multi-GPU\", \"shards\": ["; + for (size_t i = 0; i < mg_index_->ann_interfaces_.size(); ++i) { + const auto& iface = mg_index_->ann_interfaces_[i]; + json += "{\"device\": " + std::to_string(this->devices_[i]); + if 
(iface.index_.has_value()) { + json += ", \"size\": " + std::to_string(iface.index_.value().size()) + + ", \"graph_degree\": " + std::to_string(iface.index_.value().graph_degree()); + } else { + json += ", \"status\": \"Not loaded\""; + } + json += "}" + std::string(i == mg_index_->ann_interfaces_.size() - 1 ? "" : ", "); + } + json += "]"; + } else { + json += "\"built\": false"; + } + json += "}}"; + return json; + } + + void destroy() override { + if (this->worker) { + this->worker->stop(); + } + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/cagra_c.cpp b/cgo/cuvs/cagra_c.cpp new file mode 100644 index 0000000000000..ba282895c1fe7 --- /dev/null +++ b/cgo/cuvs/cagra_c.cpp @@ -0,0 +1,502 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cagra_c.h" +#include "cagra.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_cagra_any_t { + quantization_t qtype; + void* ptr; + + gpu_cagra_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_cagra_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_cagra_c gpu_cagra_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const 
std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_new", e.what()); + return nullptr; + } +} + +gpu_cagra_c gpu_cagra_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_new_empty", e.what()); + return nullptr; + } +} + +void gpu_cagra_add_chunk(gpu_cagra_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_INT8: 
static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_add_chunk", e.what()); + } +} + +void gpu_cagra_add_chunk_float(gpu_cagra_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_add_chunk_float", e.what()); + } +} + +void gpu_cagra_train_quantizer(gpu_cagra_c index_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_train_quantizer", e.what()); + } +} + +void gpu_cagra_set_per_thread_device(gpu_cagra_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + 
auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_set_per_thread_device", e.what()); + } +} + +void gpu_cagra_set_use_batching(gpu_cagra_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_set_use_batching", e.what()); + } +} + +void gpu_cagra_set_quantizer(gpu_cagra_c index_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_set_quantizer", e.what()); + } +} + +void gpu_cagra_get_quantizer(gpu_cagra_c index_c, float* min, float* max, void* errmsg) { + 
if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_get_quantizer", e.what()); + } +} + +gpu_cagra_c gpu_cagra_load_file(const char* filename, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_load_file", e.what()); + return nullptr; + } +} 
+ +void gpu_cagra_destroy(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_destroy", e.what()); + } +} + +void gpu_cagra_start(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_start", e.what()); + } +} + +void gpu_cagra_build(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + case Quantization_INT8: static_cast*>(any->ptr)->build(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_build", e.what()); + } +} + +void gpu_cagra_save(gpu_cagra_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error 
in gpu_cagra_save", e.what()); + } +} + +gpu_cagra_search_res_t gpu_cagra_search(gpu_cagra_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_cagra_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::cagra_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_search", e.what()); + } + return res; +} + +gpu_cagra_search_res_t gpu_cagra_search_float(gpu_cagra_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_cagra_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::cagra_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, 
search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_search_float", e.what()); + } + return res; +} + +void gpu_cagra_get_neighbors(gpu_cagra_result_c result_c, uint64_t total_elements, uint32_t* neighbors) { + if (!result_c) return; + auto* neighbors_vec = &static_cast(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_cagra_get_distances(gpu_cagra_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + auto* distances_vec = &static_cast(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_cagra_free_result(gpu_cagra_result_c result_c) { + if (!result_c) return; + delete static_cast(result_c); +} + +uint32_t gpu_cagra_cap(gpu_cagra_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + case Quantization_INT8: return static_cast*>(any->ptr)->cap(); + case Quantization_UINT8: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_cagra_len(gpu_cagra_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); 
+ case Quantization_INT8: return static_cast*>(any->ptr)->len(); + case Quantization_UINT8: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_cagra_info(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_info", e.what()); + return nullptr; + } +} + +void gpu_cagra_extend(gpu_cagra_c index_c, const void* additional_data, uint64_t num_vectors, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_F16: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_INT8: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_UINT8: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_extend", e.what()); + } +} + +gpu_cagra_c gpu_cagra_merge(gpu_cagra_c* indices_c, int count, uint32_t nthread, const int* devices, int device_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (count <= 0) return nullptr; + std::vector devs(devices, devices + device_count); + auto* first_any = static_cast(indices_c[0]); + quantization_t 
qtype = first_any->qtype; + + void* merged_ptr = nullptr; + switch (qtype) { + case Quantization_F32: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + case Quantization_F16: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + case Quantization_INT8: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + case Quantization_UINT8: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + default: break; + } + return static_cast(new gpu_cagra_any_t(qtype, merged_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_merge", e.what()); + return nullptr; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_cagra_t; +template class gpu_cagra_t; +template class gpu_cagra_t; +template class gpu_cagra_t; +} // namespace matrixone diff --git a/cgo/cuvs/cagra_c.h b/cgo/cuvs/cagra_c.h new file mode 100644 index 0000000000000..587547ba87d17 --- /dev/null +++ b/cgo/cuvs/cagra_c.h @@ -0,0 +1,118 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CAGRA_C_H +#define CAGRA_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_cagra_t object +typedef void* gpu_cagra_c; + +// Opaque pointer to the C++ CAGRA search result object +typedef void* gpu_cagra_result_c; + +// Constructor for building from dataset +gpu_cagra_c gpu_cagra_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_cagra_c gpu_cagra_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Destructor +void gpu_cagra_destroy(gpu_cagra_c index_c, void* errmsg); + +// Start function (initializes worker and resources) +void gpu_cagra_start(gpu_cagra_c index_c, void* errmsg); + +// Build function (actually triggers the build/load logic) +void gpu_cagra_build(gpu_cagra_c index_c, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_cagra_c gpu_cagra_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Add chunk 
of data (same type as index quantization) +void gpu_cagra_add_chunk(gpu_cagra_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly quantization if needed) +void gpu_cagra_add_chunk_float(gpu_cagra_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// Trains the scalar quantizer (if T is 1-byte) +void gpu_cagra_train_quantizer(gpu_cagra_c index_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_cagra_set_per_thread_device(gpu_cagra_c index_c, bool enable, void* errmsg); +void gpu_cagra_set_use_batching(gpu_cagra_c index_c, bool enable, void* errmsg); + +void gpu_cagra_set_quantizer(gpu_cagra_c index_c, float min, float max, void* errmsg); +void gpu_cagra_get_quantizer(gpu_cagra_c index_c, float* min, float* max, void* errmsg); + +// Destructor + + +void gpu_cagra_save(gpu_cagra_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_cagra_result_c result_ptr; +} gpu_cagra_search_res_t; + +gpu_cagra_search_res_t gpu_cagra_search(gpu_cagra_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg); + +gpu_cagra_search_res_t gpu_cagra_search_float(gpu_cagra_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg); +// Get results from result object +void gpu_cagra_get_neighbors(gpu_cagra_result_c result_c, uint64_t total_elements, uint32_t* neighbors); +void gpu_cagra_get_distances(gpu_cagra_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_cagra_free_result(gpu_cagra_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_cagra_cap(gpu_cagra_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_cagra_len(gpu_cagra_c 
index_c); + +// Returns info about the index as a JSON string +char* gpu_cagra_info(gpu_cagra_c index_c, void* errmsg); + +// Extend function +void gpu_cagra_extend(gpu_cagra_c index_c, const void* additional_data, uint64_t num_vectors, void* errmsg); + +// Merge function +gpu_cagra_c gpu_cagra_merge(gpu_cagra_c* indices_c, int num_indices, uint32_t nthread, const int* devices, int device_count, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // CAGRA_C_H diff --git a/cgo/cuvs/cuvs_types.h b/cgo/cuvs/cuvs_types.h new file mode 100644 index 0000000000000..c5b028fc45d47 --- /dev/null +++ b/cgo/cuvs/cuvs_types.h @@ -0,0 +1,181 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MO_CUVS_TYPES_H +#define MO_CUVS_TYPES_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Distance metrics supported by cuVS. 
+ */ +typedef enum { + DistanceType_L2Expanded = 0, // Squared L2 distance: sum((x-y)^2) + DistanceType_L2SqrtExpanded = 1, // L2 distance: sqrt(sum((x-y)^2)) + DistanceType_CosineExpanded = 2, // Cosine distance: 1 - (x.y)/(|x||y|) + DistanceType_L1 = 3, // L1 (Manhattan) distance: sum(|x-y|) + DistanceType_L2Unexpanded = 4, // L2 distance without expansion + DistanceType_L2SqrtUnexpanded = 5, // L2 distance with sqrt without expansion + DistanceType_InnerProduct = 6, // Inner product: x.y + DistanceType_Linf = 7, // Chebyshev distance: max(|x-y|) + DistanceType_Canberra = 8, // Canberra distance + DistanceType_LpUnexpanded = 9, // Lp distance + DistanceType_CorrelationExpanded = 10, // Correlation distance + DistanceType_JaccardExpanded = 11, // Jaccard distance + DistanceType_HellingerExpanded = 12, // Hellinger distance + DistanceType_Haversine = 13, // Haversine distance + DistanceType_BrayCurtis = 14, // Bray-Curtis distance + DistanceType_JensenShannon = 15, // Jensen-Shannon distance + DistanceType_HammingUnexpanded = 16, // Hamming distance + DistanceType_KLDivergence = 17, // Kullback-Leibler divergence + DistanceType_RusselRaoExpanded = 18, // Russel-Rao distance + DistanceType_DiceExpanded = 19, // Dice distance + DistanceType_BitwiseHamming = 20, // Bitwise Hamming distance + DistanceType_Precomputed = 100, // Precomputed distance + // Aliases + DistanceType_CosineSimilarity = 2, // Alias for Cosine distance + DistanceType_Jaccard = 11, // Alias for Jaccard distance + DistanceType_Hamming = 16, // Alias for Hamming distance + DistanceType_Unknown = 255 // Unknown distance type +} distance_type_t; + +/** + * @brief Data quantization types. + */ +typedef enum { + Quantization_F32, // 32-bit floating point + Quantization_F16, // 16-bit floating point (half) + Quantization_INT8, // 8-bit signed integer + Quantization_UINT8 // 8-bit unsigned integer +} quantization_t; + +/** + * @brief GPU distribution modes. 
+ */ +typedef enum { + DistributionMode_SINGLE_GPU, // Single GPU mode + DistributionMode_SHARDED, // Sharded across multiple GPUs + DistributionMode_REPLICATED // Replicated across multiple GPUs +} distribution_mode_t; + +/** + * @brief CAGRA index build parameters. + */ +typedef struct { + size_t intermediate_graph_degree; // Degree of the intermediate graph (default 128) + size_t graph_degree; // Degree of the final graph (default 64) + bool attach_dataset_on_build; // Whether to attach the dataset to the index (default true) +} cagra_build_params_t; + +/** + * @brief CAGRA search parameters. + */ +typedef struct { + size_t itopk_size; // Internal top-k size (default 64) + size_t search_width; // Number of search paths (default 1) +} cagra_search_params_t; + +/** + * @brief IVF-Flat index build parameters. + */ +typedef struct { + uint32_t n_lists; // Number of inverted lists (clusters) (default 1024) + bool add_data_on_build; // Whether to add data to the index during build (default true) + double kmeans_trainset_fraction; // Fraction of data to use for k-means training (default 0.5) +} ivf_flat_build_params_t; + +/** + * @brief IVF-Flat search parameters. + */ +typedef struct { + uint32_t n_probes; // Number of lists to probe during search (default 20) +} ivf_flat_search_params_t; + +/** + * @brief IVF-PQ index build parameters. + */ +typedef struct { + uint32_t n_lists; // Number of inverted lists (clusters) (default 1024) + uint32_t m; // Number of sub-vectors (default 16) + uint32_t bits_per_code; // Bits per code (default 8) + bool add_data_on_build; // Whether to add data to the index during build (default true) + double kmeans_trainset_fraction; // Fraction of data to use for k-means training (default 0.5) +} ivf_pq_build_params_t; + +/** + * @brief IVF-PQ search parameters. 
+ */ +typedef struct { + uint32_t n_probes; // Number of lists to probe during search (default 20) +} ivf_pq_search_params_t; + +/** + * @brief Brute-force index build parameters (dummy). + */ +typedef struct { +} brute_force_build_params_t; + +/** + * @brief K-Means build parameters (dummy for inheritance). + */ +typedef struct { +} kmeans_build_params_t; + +#ifdef __cplusplus +static inline cagra_build_params_t cagra_build_params_default() { + return {128, 64, true}; +} + +static inline cagra_search_params_t cagra_search_params_default() { + return {64, 1}; +} + +static inline ivf_flat_build_params_t ivf_flat_build_params_default() { + return {1024, true, 0.5}; +} + +static inline ivf_flat_search_params_t ivf_flat_search_params_default() { + return {20}; +} + +static inline ivf_pq_build_params_t ivf_pq_build_params_default() { + return {1024, 16, 8, true, 0.5}; +} + +static inline ivf_pq_search_params_t ivf_pq_search_params_default() { + return {20}; +} + +static inline brute_force_build_params_t brute_force_build_params_default() { + return {}; +} + +static inline kmeans_build_params_t kmeans_build_params_default() { + return {}; +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif // MO_CUVS_TYPES_H diff --git a/cgo/cuvs/cuvs_worker.hpp b/cgo/cuvs/cuvs_worker.hpp new file mode 100644 index 0000000000000..eeaca3551a32c --- /dev/null +++ b/cgo/cuvs/cuvs_worker.hpp @@ -0,0 +1,585 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Wrapper for RAFT resources to manage their lifecycle. + * Supports both single-GPU and single-node multi-GPU (SNMG) modes. + */ +class raft_handle_wrapper_t { +public: + // Default constructor for single-GPU mode (uses current device) + raft_handle_wrapper_t() : resources_(std::make_unique()) {} + + // Constructor for single-GPU mode with a specific device ID + explicit raft_handle_wrapper_t(int device_id) { + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + resources_ = std::make_unique(); + } + + // Constructor for multi-GPU mode (SNMG) + // force_mg: If true, use device_resources_snmg even if devices.size() == 1 (useful for testing) + explicit raft_handle_wrapper_t(const std::vector& devices, bool force_mg = false) { + if (devices.empty()) { + resources_ = std::make_unique(); + } else if (devices.size() == 1 && !force_mg) { + RAFT_CUDA_TRY(cudaSetDevice(devices[0])); + resources_ = std::make_unique(); + } else { + // Ensure the main device is set before creating SNMG resources + RAFT_CUDA_TRY(cudaSetDevice(devices[0])); + resources_ = std::make_unique(devices); + } + } + + ~raft_handle_wrapper_t() = default; + + raft::resources* get_raft_resources() const { return resources_.get(); } + +private: + std::unique_ptr resources_; +}; + +/** + * @brief Helper to check if a RAFT handle is configured for Multi-GPU (SNMG). 
+ */ +static inline bool is_snmg_handle(raft::resources* res) { + return dynamic_cast(res) != nullptr; +} + +/** + * @brief A thread-safe blocking queue for task distribution. + */ +template +class thread_safe_queue_t { +public: + void set_capacity(size_t capacity) { + std::lock_guard lock(mu_); + capacity_ = capacity; + } + + void push(T value) { + std::unique_lock lock(mu_); + cv_full_.wait(lock, [this] { return queue_.size() < capacity_ || stopped_; }); + if (stopped_) return; + queue_.push_back(std::move(value)); + cv_empty_.notify_one(); + } + + bool pop(T& value) { + std::unique_lock lock(mu_); + cv_empty_.wait(lock, [this] { return !queue_.empty() || stopped_; }); + if (stopped_) return false; + value = std::move(queue_.front()); + queue_.pop_front(); + cv_full_.notify_one(); + return true; + } + + bool try_pop(T& value) { + std::lock_guard lock(mu_); + if (queue_.empty() || stopped_) return false; + value = std::move(queue_.front()); + queue_.pop_front(); + cv_full_.notify_one(); + return true; + } + + void stop() { + { + std::lock_guard lock(mu_); + stopped_ = true; + } + cv_empty_.notify_all(); + cv_full_.notify_all(); + } + + bool is_stopped() const { + std::lock_guard lock(mu_); + return stopped_; + } + + bool empty() const { + std::lock_guard lock(mu_); + return queue_.empty(); + } + + size_t size() const { + std::lock_guard lock(mu_); + return queue_.size(); + } + +private: + std::deque queue_; + mutable std::mutex mu_; + std::condition_variable cv_empty_; + std::condition_variable cv_full_; + size_t capacity_ = std::numeric_limits::max(); + bool stopped_ = false; +}; + +struct cuvs_task_result_t { + uint64_t id; + std::any result; + std::exception_ptr error; +}; + +/** + * @brief Manages storage and retrieval of task results. 
+ */ +class cuvs_task_result_store_t { +public: + cuvs_task_result_store_t() : next_id_(1), stopped_(false) {} + + uint64_t get_next_job_id() { return next_id_.fetch_add(1); } + + void store(const cuvs_task_result_t& result) { + std::unique_lock lock(mu_); + if (auto it = pending_.find(result.id); it != pending_.end()) { + auto promise = std::move(it->second); + pending_.erase(it); + lock.unlock(); + promise->set_value(result); + } else { + results_[result.id] = result; + } + } + + std::future wait(uint64_t job_id) { + std::unique_lock lock(mu_); + if (stopped_) { + std::promise p; + p.set_exception(std::make_exception_ptr(std::runtime_error("cuvs_task_result_store_t stopped before result was available"))); + return p.get_future(); + } + + if (auto it = results_.find(job_id); it != results_.end()) { + std::promise p; + p.set_value(std::move(it->second)); + results_.erase(it); + return p.get_future(); + } + + auto promise = std::make_shared>(); + pending_[job_id] = promise; + return promise->get_future(); + } + + void stop() { + std::lock_guard lock(mu_); + stopped_ = true; + for (auto& pair : pending_) { + pair.second->set_exception(std::make_exception_ptr(std::runtime_error("cuvs_task_result_store_t stopped before result was available"))); + } + pending_.clear(); + results_.clear(); + } + +private: + std::atomic next_id_; + std::mutex mu_; + std::map>> pending_; + std::map results_; + bool stopped_; +}; + +/** + * @brief dedicated worker pool for executing cuVS (RAFT) tasks in GPU-enabled threads. 
+ */ +class cuvs_worker_t { +public: + using raft_handle = raft_handle_wrapper_t; + using user_task_fn = std::function; + using batch_exec_fn = std::function&, const std::vector>&)>; + + struct cuvs_task_t { + uint64_t id; + user_task_fn fn; + }; + + explicit cuvs_worker_t(size_t n_threads, int device_id = -1) + : n_threads_(n_threads), device_id_(device_id) { + if (n_threads == 0) throw std::invalid_argument("Thread count must be > 0"); + size_t cap = 2 * n_threads; + main_tasks_.set_capacity(cap); + worker_tasks_.set_capacity(cap); + } + + cuvs_worker_t(size_t n_threads, const std::vector& devices, bool force_mg = false) + : n_threads_(n_threads), devices_(devices), force_mg_(force_mg) { + if (n_threads == 0) throw std::invalid_argument("Thread count must be > 0"); + size_t cap = 2 * n_threads; + main_tasks_.set_capacity(cap); + worker_tasks_.set_capacity(cap); + } + + ~cuvs_worker_t() { stop(); } + + cuvs_worker_t(const cuvs_worker_t&) = delete; + cuvs_worker_t& operator=(const cuvs_worker_t&) = delete; + + void start(user_task_fn init_fn = nullptr, user_task_fn stop_fn = nullptr) { + if (started_.exchange(true)) return; + main_thread_ = std::thread(&cuvs_worker_t::run_main_loop, this, std::move(init_fn), std::move(stop_fn)); + } + + void set_per_thread_device(bool enable) { per_thread_device_ = enable; } + void set_use_batching(bool enable) { use_batching_ = enable; } + bool use_batching() const { return use_batching_; } + + void stop() { + if (!started_.load() || stopped_.exchange(true)) return; + + { + std::lock_guard lock(worker_mu_); + should_stop_ = true; + main_tasks_.stop(); + worker_tasks_.stop(); + } + worker_cv_.notify_all(); + + if (main_thread_.joinable()) main_thread_.join(); + for (auto& t : sub_workers_) if (t.joinable()) t.join(); + + sub_workers_.clear(); + result_store_.stop(); + } + + uint64_t submit(user_task_fn fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit task: worker stopped"); + uint64_t id = 
result_store_.get_next_job_id(); + worker_tasks_.push({id, std::move(fn)}); + worker_cv_.notify_all(); + return id; + } + + uint64_t submit_main(user_task_fn fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit main task: worker stopped"); + uint64_t id = result_store_.get_next_job_id(); + main_tasks_.push({id, std::move(fn)}); + worker_cv_.notify_all(); + return id; + } + + std::future wait(uint64_t id) { return result_store_.wait(id); } + + /** + * @brief Submits a task that can be merged with other tasks having the same batch_key. + * + * @tparam T The expected return type. + * @param batch_key Unique identifier for grouping compatible tasks. + * @param request The data for this individual request. + * @param exec_fn Callback to execute the combined batch. + * @return std::future Future for the individual result. + */ + template + std::future submit_batched(const std::string& batch_key, std::any request, batch_exec_fn exec_fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit batched task: worker stopped"); + + if (!use_batching_ || n_threads_ <= 1) { + // Direct submission without batching + auto promise = std::make_shared>(); + auto future = promise->get_future(); + submit([promise, request, exec_fn](raft_handle& handle) -> std::any { + try { + std::vector reqs = {request}; + std::vector> setters = {[promise](std::any val) { + try { + if (val.type() == typeid(std::exception_ptr)) promise->set_exception(std::any_cast(val)); + else promise->set_value(std::any_cast(val)); + } catch (...) { promise->set_exception(std::current_exception()); } + }}; + exec_fn(handle, reqs, setters); + } catch (...) 
{ + promise->set_exception(std::current_exception()); + } + return std::any(); + }); + return future; + } + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + // Setter to resolve the promise from a std::any result + auto setter = [promise](std::any val) { + try { + if (val.type() == typeid(std::exception_ptr)) { + promise->set_exception(std::any_cast(val)); + } else { + promise->set_value(std::any_cast(val)); + } + } catch (...) { + promise->set_exception(std::current_exception()); + } + }; + + std::shared_ptr batch; + { + std::lock_guard lock(batches_mu_); + auto it = batches_.find(batch_key); + if (it == batches_.end()) { + batch = std::make_shared(); + batches_[batch_key] = batch; + } else { + batch = it->second; + } + + // Simple periodic cleanup of old batches + static size_t cleanup_counter = 0; + if (++cleanup_counter % 1000 == 0) { + for (auto bit = batches_.begin(); bit != batches_.end(); ) { + std::lock_guard block(bit->second->mu); + if (!bit->second->scheduled && bit->second->requests.empty()) { + bit = batches_.erase(bit); + } else { + ++bit; + } + } + } + } + + bool trigger = false; + { + std::lock_guard lock(batch->mu); + batch->requests.push_back(std::move(request)); + batch->setters.push_back(std::move(setter)); + if (!batch->scheduled) { + batch->scheduled = true; + trigger = true; + } + } + + if (trigger) { + // Submit a trigger task that will wait a tiny bit then drain the batch + submit([this, batch, exec_fn](raft_handle& handle) -> std::any { + // Micro-batching wait: allows more goroutines to join the batch + std::this_thread::sleep_for(std::chrono::microseconds(100)); + + std::vector reqs; + std::vector> setters; + + { + std::lock_guard lock(batch->mu); + reqs = std::move(batch->requests); + setters = std::move(batch->setters); + batch->scheduled = false; + } + + if (!reqs.empty()) { + try { + exec_fn(handle, reqs, setters); + } catch (...) 
{ + auto err = std::current_exception(); + for (auto& s : setters) s(err); + } + } + return std::any(); + }); + } + + return future; + } + + std::exception_ptr get_first_error() { + std::lock_guard lock(event_mu_); + return fatal_error_; + } + +private: + void run_main_loop(user_task_fn init_fn, user_task_fn stop_fn) { + pin_thread(0); + auto resource = setup_resource_internal(0, true); + if (!resource) return; + + if (init_fn) { + try { init_fn(*resource); } + catch (...) { report_fatal_error(std::current_exception()); return; } + } + + auto defer_cleanup = [&]() { if (stop_fn) try { stop_fn(*resource); } catch (...) {} }; + std::shared_ptr cleanup_guard(nullptr, [&](...) { defer_cleanup(); }); + + if (n_threads_ > 1) { + for (size_t i = 1; i < n_threads_; ++i) { + sub_workers_.emplace_back(&cuvs_worker_t::worker_sub_loop, this, i); + } + } + + while (true) { + cuvs_task_t task; + bool found = false; + + { + std::unique_lock lock(worker_mu_); + worker_cv_.wait(lock, [&] { + return !main_tasks_.empty() || !worker_tasks_.empty() || should_stop_ || fatal_error_; + }); + + if (should_stop_ || fatal_error_) break; + + if (main_tasks_.try_pop(task)) { + found = true; + } else if (worker_tasks_.try_pop(task)) { + found = true; + } + } + + if (found) { + execute_task(task, *resource); + } + } + } + + void worker_sub_loop(size_t thread_idx) { + pin_thread(-1); + auto resource = setup_resource_internal(thread_idx, false); + if (!resource) return; + + cuvs_task_t task; + while (worker_tasks_.pop(task)) { + if (fatal_error_) break; + execute_task(task, *resource); + } + } + + void execute_task(const cuvs_task_t& task, raft_handle& resource) { + cuvs_task_result_t res; + res.id = task.id; + try { res.result = task.fn(resource); } + catch (...) { + res.error = std::current_exception(); + std::cerr << "ERROR: Task " << task.id << " failed." 
<< std::endl; + } + result_store_.store(res); + } + + std::unique_ptr setup_resource_internal(size_t thread_idx, bool is_main_thread) { + try { + if (!devices_.empty()) { + if (is_main_thread) { + return std::make_unique(devices_, force_mg_); + } + if (per_thread_device_ && n_threads_ > 1) { + int dev = devices_[thread_idx % devices_.size()]; + return std::make_unique(dev); + } + return std::make_unique(devices_, force_mg_); + } else if (device_id_ >= 0) { + return std::make_unique(device_id_); + } else { + return std::make_unique(); + } + } catch (...) { + report_fatal_error(std::current_exception()); + std::cerr << "ERROR: Failed to setup RAFT resource." << std::endl; + return nullptr; + } + } + + void report_fatal_error(std::exception_ptr err) { + std::lock_guard lock(event_mu_); + if (!fatal_error_) fatal_error_ = err; + { + std::lock_guard lock_w(worker_mu_); + should_stop_ = true; // NEW: Ensure we signal stop on fatal error + } + worker_cv_.notify_all(); + } + + void pin_thread(int cpu_id) { +#ifdef __linux__ + static std::atomic next_cpu_id{1}; + int id = (cpu_id >= 0) ? 
cpu_id : (next_cpu_id.fetch_add(1) % std::thread::hardware_concurrency()); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(id, &cpuset); + if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) { + std::cerr << "WARNING: Failed to set affinity for thread to core " << id << std::endl; + } +#endif + } + + size_t n_threads_; + int device_id_ = -1; + std::vector devices_; + bool force_mg_ = false; + bool per_thread_device_ = false; + bool use_batching_ = false; + std::atomic started_{false}; + std::atomic stopped_{false}; + + // Unified Task Management + std::mutex worker_mu_; + std::condition_variable worker_cv_; + thread_safe_queue_t main_tasks_; + thread_safe_queue_t worker_tasks_; + bool should_stop_ = false; + + cuvs_task_result_store_t result_store_; + std::thread main_thread_; + std::vector sub_workers_; + + std::mutex event_mu_; + std::exception_ptr fatal_error_; + + // Batching support + struct batch_t { + std::mutex mu; + std::vector requests; + std::vector> setters; + bool scheduled = false; + }; + std::mutex batches_mu_; + std::map> batches_; +}; + +} // namespace matrixone diff --git a/cgo/cuvs/distance.hpp b/cgo/cuvs/distance.hpp new file mode 100644 index 0000000000000..e98539b74b3ca --- /dev/null +++ b/cgo/cuvs/distance.hpp @@ -0,0 +1,98 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include "helper.h" +#include +#include + +namespace matrixone { + +/** + * @brief Performs a pairwise distance calculation on GPU. + * + * @tparam T Data type of the vector elements (e.g., float, half). + * @param res RAFT resources handle. + * @param x Host pointer to the first set of vectors (X). + * @param n_x Number of vectors in X. + * @param y Host pointer to the second set of vectors (Y). + * @param n_y Number of vectors in Y. + * @param dim Dimension of each vector. + * @param metric Distance metric to use. + * @param dist Host pointer to store the resulting distances (size: n_x * n_y). + */ +template +void pairwise_distance(const raft::resources& res, + const T* x, + uint64_t n_x, + const T* y, + uint64_t n_y, + uint32_t dim, + cuvs::distance::DistanceType metric, + float* dist) { + auto stream = raft::resource::get_cuda_stream(res); + + // Helper to align sizes to 256 bytes (CUDA default alignment) + auto align_size = [](size_t size) { + return (size + 255) & ~255; + }; + + // 1. Calculate total buffer sizes with alignment + size_t x_bytes = n_x * dim * sizeof(T); + size_t y_bytes = n_y * dim * sizeof(T); + size_t dist_bytes = n_x * n_y * sizeof(float); + + size_t x_alloc = align_size(x_bytes); + size_t y_alloc = align_size(y_bytes); + size_t total_bytes = x_alloc + y_alloc + dist_bytes; + + // Use a single allocation for all temporary buffers to reduce overhead + void* d_ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&d_ptr, total_bytes, stream)); + + char* d_x = static_cast(d_ptr); + char* d_y = d_x + x_alloc; + char* d_dist = d_y + y_alloc; + + // 2. Async copies to Device + RAFT_CUDA_TRY(cudaMemcpyAsync(d_x, x, x_bytes, cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_y, y, y_bytes, cudaMemcpyHostToDevice, stream)); + + // 3. 
Prepare Views (zero allocation) + auto x_view = raft::make_device_matrix_view(reinterpret_cast(d_x), (int64_t)n_x, (int64_t)dim); + auto y_view = raft::make_device_matrix_view(reinterpret_cast(d_y), (int64_t)n_y, (int64_t)dim); + auto dist_view = raft::make_device_matrix_view(reinterpret_cast(d_dist), (int64_t)n_x, (int64_t)n_y); + + // 4. Execute Pairwise Distance + cuvs::distance::pairwise_distance(res, x_view, y_view, dist_view, metric); + + // 5. Async copy results back to host + RAFT_CUDA_TRY(cudaMemcpyAsync(dist, d_dist, dist_bytes, cudaMemcpyDeviceToHost, stream)); + + // 6. Synchronize + raft::resource::sync_stream(res); + + // 7. Async free + RAFT_CUDA_TRY(cudaFreeAsync(d_ptr, stream)); +} + +} // namespace matrixone diff --git a/cgo/cuvs/distance_c.cpp b/cgo/cuvs/distance_c.cpp new file mode 100644 index 0000000000000..e3c3b02db7d99 --- /dev/null +++ b/cgo/cuvs/distance_c.cpp @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "distance_c.h" +#include "distance.hpp" +#include +#include + +extern "C" { + +void gpu_pairwise_distance(const void* x, + uint64_t n_x, + const void* y, + uint64_t n_y, + uint32_t dim, + distance_type_t metric, + quantization_t qtype, + int device_id, + float* dist, + void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (!x || !y || !dist || n_x == 0 || n_y == 0 || dim == 0) return; + + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + const raft::resources& res = matrixone::get_raft_resources(); + cuvs::distance::DistanceType metric_cuvs = matrixone::convert_distance_type(metric); + + if (qtype == Quantization_F32) { + matrixone::pairwise_distance(res, static_cast(x), n_x, static_cast(y), n_y, dim, metric_cuvs, dist); + } else if (qtype == Quantization_F16) { + matrixone::pairwise_distance(res, static_cast(x), n_x, static_cast(y), n_y, dim, metric_cuvs, dist); + } else { + throw std::runtime_error("Unsupported quantization type for pairwise_distance"); + } + + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_pairwise_distance", e.what()); + } +} + +} // extern "C" diff --git a/cgo/cuvs/distance_c.h b/cgo/cuvs/distance_c.h new file mode 100644 index 0000000000000..fe35660afb194 --- /dev/null +++ b/cgo/cuvs/distance_c.h @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DISTANCE_C_H +#define DISTANCE_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Performs a pairwise distance calculation on GPU. + * + * @param x Host pointer to the first set of vectors (X). + * @param n_x Number of vectors in X. + * @param y Host pointer to the second set of vectors (Y). + * @param n_y Number of vectors in Y. + * @param dim Dimension of each vector. + * @param metric Distance metric to use. + * @param qtype Quantization type (F32, F16). + * @param device_id GPU device ID to use. + * @param dist Host pointer to store the resulting distances (size: n_x * n_y). + * @param errmsg Pointer to store error message if any. + */ +void gpu_pairwise_distance(const void* x, + uint64_t n_x, + const void* y, + uint64_t n_y, + uint32_t dim, + distance_type_t metric, + quantization_t qtype, + int device_id, + float* dist, + void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // DISTANCE_C_H diff --git a/cgo/cuvs/helper.cpp b/cgo/cuvs/helper.cpp new file mode 100644 index 0000000000000..506f72b662b27 --- /dev/null +++ b/cgo/cuvs/helper.cpp @@ -0,0 +1,158 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "helper.h" +#include "cuvs_worker.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace matrixone { +cuvs::distance::DistanceType convert_distance_type(distance_type_t metric_c) { + switch (metric_c) { + case DistanceType_L2Expanded: return cuvs::distance::DistanceType::L2Expanded; + case DistanceType_L2SqrtExpanded: return cuvs::distance::DistanceType::L2SqrtExpanded; + case DistanceType_CosineExpanded: return cuvs::distance::DistanceType::CosineExpanded; + case DistanceType_L1: return cuvs::distance::DistanceType::L1; + case DistanceType_L2Unexpanded: return cuvs::distance::DistanceType::L2Unexpanded; + case DistanceType_L2SqrtUnexpanded: return cuvs::distance::DistanceType::L2SqrtUnexpanded; + case DistanceType_InnerProduct: return cuvs::distance::DistanceType::InnerProduct; + case DistanceType_Linf: return cuvs::distance::DistanceType::Linf; + case DistanceType_Canberra: return cuvs::distance::DistanceType::Canberra; + case DistanceType_LpUnexpanded: return cuvs::distance::DistanceType::LpUnexpanded; + case DistanceType_CorrelationExpanded: return cuvs::distance::DistanceType::CorrelationExpanded; + case DistanceType_JaccardExpanded: return cuvs::distance::DistanceType::JaccardExpanded; + case DistanceType_HellingerExpanded: return cuvs::distance::DistanceType::HellingerExpanded; + case DistanceType_Haversine: return cuvs::distance::DistanceType::Haversine; + case DistanceType_BrayCurtis: return cuvs::distance::DistanceType::BrayCurtis; + case DistanceType_JensenShannon: return cuvs::distance::DistanceType::JensenShannon; + case DistanceType_HammingUnexpanded: return cuvs::distance::DistanceType::HammingUnexpanded; + case DistanceType_KLDivergence: return cuvs::distance::DistanceType::KLDivergence; + case DistanceType_RusselRaoExpanded: return cuvs::distance::DistanceType::RusselRaoExpanded; + case DistanceType_DiceExpanded: return cuvs::distance::DistanceType::DiceExpanded; + case DistanceType_BitwiseHamming: 
return cuvs::distance::DistanceType::BitwiseHamming; + case DistanceType_Precomputed: return cuvs::distance::DistanceType::Precomputed; + default: + throw std::runtime_error("Unknown or unsupported distance type"); + } +} + +const raft::resources& get_raft_resources() { + thread_local raft::resources res; + return res; +} +} + +// Vectorized kernel processing 2 elements per thread +__global__ void f32_to_f16_vectorized_kernel(const float2* src, half2* dst, uint64_t n_pairs) { + uint64_t i = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x; + if (i < n_pairs) { + dst[i] = __float22half2_rn(src[i]); + } +} + +// Fallback kernel for the last element if total_elements is odd +__global__ void f32_to_f16_tail_kernel(const float* src, half* dst, uint64_t index) { + dst[index] = __float2half(src[index]); +} + +extern "C" { + +int gpu_get_device_count() { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return -1; + } + return count; +} + +int gpu_get_device_list(int* devices, int max_count) { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return -1; + } + int actual_count = (count > max_count) ? 
max_count : count; + for (int i = 0; i < actual_count; ++i) { + devices[i] = i; + } + return actual_count; +} + +void set_errmsg(void* errmsg, const char* prefix, const char* what) { + if (errmsg) { + std::string err_str = std::string(prefix) + ": " + std::string(what); + char* msg = (char*)malloc(err_str.length() + 1); + if (msg) { + std::strcpy(msg, err_str.c_str()); + *(static_cast(errmsg)) = msg; + } + } else { + std::cerr << prefix << ": " << what << std::endl; + } +} + +void gpu_convert_f32_to_f16(const float* src, void* dst, uint64_t total_elements, int device_id, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (!src || !dst || total_elements == 0) return; + + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + + float *d_src = nullptr; + half *d_dst = nullptr; + + // Allocate device memory + RAFT_CUDA_TRY(cudaMalloc(&d_src, total_elements * sizeof(float))); + RAFT_CUDA_TRY(cudaMalloc(&d_dst, total_elements * sizeof(half))); + + // Copy source to device + RAFT_CUDA_TRY(cudaMemcpy(d_src, src, total_elements * sizeof(float), cudaMemcpyHostToDevice)); + + // Launch vectorized kernel for pairs + uint64_t n_pairs = total_elements / 2; + if (n_pairs > 0) { + uint32_t threads_per_block = 256; + uint32_t blocks = (n_pairs + threads_per_block - 1) / threads_per_block; + f32_to_f16_vectorized_kernel<<>>((const float2*)d_src, (half2*)d_dst, n_pairs); + } + + // Handle the tail if odd + if (total_elements % 2 != 0) { + f32_to_f16_tail_kernel<<<1, 1>>>(d_src, d_dst, total_elements - 1); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + // Copy result back to host + RAFT_CUDA_TRY(cudaMemcpy(dst, d_dst, total_elements * sizeof(half), cudaMemcpyDeviceToHost)); + + // Free device memory + cudaFree(d_src); + cudaFree(d_dst); + + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_convert_f32_to_f16", e.what()); + } +} + +} // extern "C" diff --git a/cgo/cuvs/helper.h b/cgo/cuvs/helper.h new file 
mode 100644 index 0000000000000..095f2188fd692 --- /dev/null +++ b/cgo/cuvs/helper.h @@ -0,0 +1,68 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MO_CUVS_C_HELPER_H +#define MO_CUVS_C_HELPER_H + +#include "cuvs_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Returns the number of CUDA-capable devices available. + * @return Number of GPU devices. + */ +int gpu_get_device_count(); + +/** + * @brief Lists the IDs of available CUDA devices. + * @param devices Output array to store device IDs. + * @param max_count Maximum number of device IDs to store. + * @return Number of device IDs written to the array. + */ +int gpu_get_device_list(int* devices, int max_count); + +/** + * @brief Converts float32 data to float16 (half) on GPU. + * @param src Pointer to source float32 data on host or device. + * @param dst Pointer to destination float16 data on device. + * @param total_elements Total number of elements to convert. + * @param device_id ID of the GPU device to use. + * @param errmsg Pointer to store error message if any. + */ +void gpu_convert_f32_to_f16(const float* src, void* dst, uint64_t total_elements, int device_id, void* errmsg); + +/** + * @brief Standardized helper to set an error message. + * @param errmsg Pointer to the error message destination. + * @param prefix Prefix for the error message (e.g., function name). + * @param what The actual error description. 
+ */ +void set_errmsg(void* errmsg, const char* prefix, const char* what); + +#ifdef __cplusplus +} + +#include +namespace matrixone { + cuvs::distance::DistanceType convert_distance_type(distance_type_t metric_c); + const raft::resources& get_raft_resources(); +} +#endif + +#endif // MO_CUVS_C_HELPER_H diff --git a/cgo/cuvs/index_base.hpp b/cgo/cuvs/index_base.hpp new file mode 100644 index 0000000000000..614fe7bf73c55 --- /dev/null +++ b/cgo/cuvs/index_base.hpp @@ -0,0 +1,183 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#pragma GCC diagnostic pop + +// cuVS includes +#include + +namespace matrixone { + +/** + * @brief gpu_index_base_t provides common functionality for all GPU-based indexes. + * It manages host dataset, worker pool, quantization, and basic properties. 
+ */ +template +class gpu_index_base_t { +public: + std::vector flattened_host_dataset; + std::vector devices_; + std::string filename_; + + cuvs::distance::DistanceType metric; + uint32_t dimension; + uint32_t count; + BuildParams build_params; + distribution_mode_t dist_mode; + + std::unique_ptr worker; + mutable std::shared_mutex mutex_; + bool is_loaded_ = false; + std::shared_ptr dataset_device_ptr_; // Keep device memory alive + + gpu_index_base_t() = default; + virtual ~gpu_index_base_t() { + destroy(); + } + + // Common management methods + virtual void destroy() { + if (worker) worker->stop(); + } + + void set_use_batching(bool enable) { + if (worker) worker->set_use_batching(enable); + } + + void set_per_thread_device(bool enable) { + if (worker) worker->set_per_thread_device(enable); + } + + void set_quantizer(float min, float max) { + quantizer_ = scalar_quantizer_t(min, max); + } + + void get_quantizer(float* min, float* max) const { + if (!quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + *min = quantizer_.min(); + *max = quantizer_.max(); + } + + void train_quantizer(const float* train_data, uint64_t n_samples) { + if (!train_data || n_samples == 0) return; + uint64_t job_id = worker->submit_main( + [&, train_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + auto train_device = raft::make_device_matrix(*res, n_samples, dimension); + raft::copy(*res, train_device.view(), raft::make_host_matrix_view(train_data, n_samples, dimension)); + quantizer_.train(*res, train_device.view()); + return std::any(); + } + ); + auto result_wait = worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + } + + void add_chunk(const T* chunk_data, uint64_t chunk_count) { + if (current_offset_ + chunk_count > count) throw std::runtime_error("offset out of bounds"); + std::copy(chunk_data, chunk_data + (chunk_count * dimension), 
flattened_host_dataset.begin() + (current_offset_ * dimension)); + current_offset_ += chunk_count; + } + + void add_chunk_float(const float* chunk_data, uint64_t chunk_count) { + if (current_offset_ + chunk_count > count) throw std::runtime_error("offset out of bounds"); + + uint64_t row_offset = current_offset_; + uint64_t job_id = worker->submit_main( + [&, chunk_data, chunk_count, row_offset](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + // If quantization is needed (T is 1-byte) + if constexpr (sizeof(T) == 1) { + if (!quantizer_.is_trained()) { + int64_t n_train = std::min(static_cast(chunk_count), static_cast(500)); + auto train_device = raft::make_device_matrix(*res, n_train, dimension); + raft::copy(*res, train_device.view(), raft::make_host_matrix_view(chunk_data, n_train, dimension)); + quantizer_.train(*res, train_device.view()); + } + + auto chunk_device_float = raft::make_device_matrix(*res, chunk_count, dimension); + raft::copy(*res, chunk_device_float.view(), raft::make_host_matrix_view(chunk_data, chunk_count, dimension)); + quantizer_.template transform(*res, chunk_device_float.view(), flattened_host_dataset.data() + (row_offset * dimension), false); + raft::resource::sync_stream(*res); + } else if constexpr (std::is_same_v) { + std::copy(chunk_data, chunk_data + (chunk_count * dimension), flattened_host_dataset.begin() + (row_offset * dimension)); + } else { + auto chunk_device_float = raft::make_device_matrix(*res, chunk_count, dimension); + raft::copy(*res, chunk_device_float.view(), raft::make_host_matrix_view(chunk_data, chunk_count, dimension)); + auto out_view = raft::make_host_matrix_view(flattened_host_dataset.data() + (row_offset * dimension), chunk_count, dimension); + raft::copy(*res, out_view, chunk_device_float.view()); + raft::resource::sync_stream(*res); + } + return std::any(); + } + ); + + auto result_wait = worker->wait(job_id).get(); + if (result_wait.error) 
std::rethrow_exception(result_wait.error); + current_offset_ += chunk_count; + } + + uint32_t cap() const { + return count; + } + + uint32_t len() const { + return static_cast(current_offset_); + } + + virtual std::string info() const { + std::string json = "{"; + json += "\"element_size\": " + std::to_string(sizeof(T)) + ", "; + json += "\"dimension\": " + std::to_string(dimension) + ", "; + json += "\"metric\": " + std::to_string(static_cast(metric)) + ", "; + json += "\"status\": \"" + std::string(is_loaded_ ? "Loaded" : "Not Loaded") + "\", "; + json += "\"capacity\": " + std::to_string(count) + ", "; + json += "\"current_length\": " + std::to_string(current_offset_) + ", "; + json += "\"devices\": ["; + for (size_t i = 0; i < devices_.size(); ++i) { + json += std::to_string(devices_[i]) + (i == devices_.size() - 1 ? "" : ", "); + } + json += "]"; + return json; // Caller will close the object or add more fields + } + +protected: + scalar_quantizer_t quantizer_; + uint64_t current_offset_ = 0; +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat.hpp b/cgo/cuvs/ivf_flat.hpp new file mode 100644 index 0000000000000..7096d5f2e1640 --- /dev/null +++ b/cgo/cuvs/ivf_flat.hpp @@ -0,0 +1,695 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include + +#include +#include +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Search result containing neighbor IDs and distances. + * Common for all IVF-Flat instantiations. + */ +struct ivf_flat_search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors +}; + +/** + * @brief gpu_ivf_flat_t implements an IVF-Flat index that can run on a single GPU or sharded across multiple GPUs. + * It automatically chooses between single-GPU and multi-GPU (SNMG) cuVS APIs based on the RAFT handle resources. 
+ */ +template +class gpu_ivf_flat_t : public gpu_index_base_t { +public: + using ivf_flat_index = cuvs::neighbors::ivf_flat::index; + using mg_index = cuvs::neighbors::mg_index; + using search_result_t = ivf_flat_search_result_t; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + ~gpu_ivf_flat_t() override { + this->destroy(); + } + + // Unified Constructor for building from dataset + gpu_ivf_flat_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const ivf_flat_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = static_cast(count_vectors); + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + // Constructor for chunked input (pre-allocates) + gpu_ivf_flat_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_flat_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + 
this->flattened_host_dataset.resize(this->count * this->dimension); + } + + // Unified Constructor for loading from file + gpu_ivf_flat_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_flat_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->filename_ = filename; + this->dimension = dimension; + this->metric = m; + this->count = 0; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + } + + void destroy() override { + if (this->worker) { + this->worker->stop(); + } + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + } + + /** + * @brief Starts the worker and initializes resources. + */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the index from file or builds it from the dataset. 
+ */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->filename_.empty() && this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + this->is_loaded_ = true; + // Clear host dataset after building to save memory + if (this->filename_.empty()) { + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!this->filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_flat::deserialize(*res, this->filename_)); + // Update metadata + this->count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) this->count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + this->build_params.n_lists = static_cast(mg_index_->ann_interfaces_[0].index_.value().n_lists()); + } + } else { + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = this->metric; + index_ = std::make_unique(*res, index_params, this->dimension); + cuvs::neighbors::ivf_flat::deserialize(*res, this->filename_, index_.get()); + this->count = static_cast(index_->size()); + this->build_params.n_lists = static_cast(index_->n_lists()); + } + raft::resource::sync_stream(*res); + } else if 
(!this->flattened_host_dataset.empty()) { + if (this->count < this->build_params.n_lists) { + throw std::runtime_error("Dataset too small: count (" + std::to_string(this->count) + + ") must be >= n_list (" + std::to_string(this->build_params.n_lists) + + ") to build IVF index."); + } + + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + this->flattened_host_dataset.data(), (int64_t)this->count, (int64_t)this->dimension); + + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = this->metric; + index_params.n_lists = this->build_params.n_lists; + index_params.add_data_on_build = this->build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = this->build_params.kmeans_trainset_fraction; + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (this->dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_flat::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(this->count), static_cast(this->dimension))); + + this->dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = this->metric; + index_params.n_lists = this->build_params.n_lists; + index_params.add_data_on_build = this->build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = this->build_params.kmeans_trainset_fraction; + + index_ = std::make_unique( + cuvs::neighbors::ivf_flat::build(*res, index_params, 
raft::make_const_mdspan(dataset_device->view()))); + } + raft::resource::sync_stream(*res); + } + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. + */ + void save(const std::string& filename) { + if (!this->is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::ivf_flat::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::ivf_flat::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Performs IVF-Flat search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp IVF-Flat search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_internal(handle, queries_data, num_queries, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation + */ + search_result_t search_batch_internal(const T* queries_data, uint64_t num_queries, uint32_t limit, const ivf_flat_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const T* data; + uint64_t n; + }; + + std::string batch_key = "ivf_flat_s_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + 
auto results = this->search_internal(handle, aggregated_queries.data(), total_queries, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search implementation (no worker submission) + */ + search_result_t search_internal(raft_handle_wrapper_t& handle, const T* queries_data, uint64_t num_queries, uint32_t limit, const ivf_flat_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_flat::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_flat_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, (int64_t)num_queries, (int64_t)this->dimension); + 
auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_flat::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else if (local_index) { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_flat::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + 
} + return search_res; + } + + /** + * @brief Performs IVF-Flat search for given float32 queries, with on-the-fly quantization if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit, sp); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_float_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_float_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation for float32 queries + */ + search_result_t search_float_batch_internal(const float* queries_data, uint64_t num_queries, uint32_t limit, const ivf_flat_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const float* data; + uint64_t n; + }; + + std::string batch_key = "ivf_flat_sf_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) 
total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto results = this->search_float_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search_float implementation (no worker submission) + */ + search_result_t search_float_internal(raft_handle_wrapper_t& handle, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. 
Quantize/Convert float queries to T on device + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, queries_device_float.view(), queries_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + } + + // 2. Perform search + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_flat::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_flat_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_target = raft::make_host_matrix(num_queries, this->dimension); + raft::copy(*res, queries_host_target.view(), queries_device_target.view()); + raft::resource::sync_stream(*res); + + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + 
cuvs::neighbors::ivf_flat::search(*res, *mg_index_, mg_search_params, + queries_host_target.view(), + neighbors_host_view, distances_host_view); + } else if (local_index) { + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_flat::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device_target.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + + std::vector get_centers() { + if (!this->is_loaded_ || (!index_ && !mg_index_)) return {}; + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + const ivf_flat_index* local_index = nullptr; + if (index_) { + local_index = index_.get(); + } else if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) { + local_index = &iface.index_.value(); + break; + } + } + } + + if (!local_index) return 
std::vector{}; + + auto centers_view = local_index->centers(); + size_t n_centers = centers_view.extent(0); + size_t dim = centers_view.extent(1); + std::vector host_centers(n_centers * dim); + + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centers.data(), centers_view.data_handle(), + host_centers.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centers; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + uint32_t get_n_list() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->n_lists()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().n_lists()); + } + } + return this->build_params.n_lists; + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"IVF-Flat\", \"ivf_flat\": {"; + if (index_) { + json += "\"mode\": \"Single-GPU\", \"size\": " + std::to_string(index_->size()) + + ", \"n_lists\": " + std::to_string(index_->n_lists()); + } else if (mg_index_) { + json += "\"mode\": \"Multi-GPU\", \"shards\": ["; + for (size_t i = 0; i < mg_index_->ann_interfaces_.size(); ++i) { + const auto& iface = mg_index_->ann_interfaces_[i]; + json += "{\"device\": " + std::to_string(this->devices_[i]); + if (iface.index_.has_value()) { + json += ", \"size\": " + std::to_string(iface.index_.value().size()) + + ", \"n_lists\": " + std::to_string(iface.index_.value().n_lists()); + } else { + json += ", \"status\": \"Not loaded\""; + } + json += "}" + std::string(i == mg_index_->ann_interfaces_.size() - 1 ? 
"" : ", "); + } + json += "]"; + } else { + json += "\"built\": false"; + } + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat_c.cpp b/cgo/cuvs/ivf_flat_c.cpp new file mode 100644 index 0000000000000..215090156c2bc --- /dev/null +++ b/cgo/cuvs/ivf_flat_c.cpp @@ -0,0 +1,507 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ivf_flat_c.h" +#include "ivf_flat.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_ivf_flat_any_t { + quantization_t qtype; + void* ptr; + + gpu_ivf_flat_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_ivf_flat_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_ivf_flat_c gpu_ivf_flat_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices 
+ device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_new", e.what()); + return nullptr; + } +} + +gpu_ivf_flat_c gpu_ivf_flat_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, 
dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_new_empty", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_add_chunk(gpu_ivf_flat_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_add_chunk", e.what()); + } +} + +void gpu_ivf_flat_add_chunk_float(gpu_ivf_flat_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in 
gpu_ivf_flat_add_chunk_float", e.what()); + } +} + +void gpu_ivf_flat_train_quantizer(gpu_ivf_flat_c index_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_train_quantizer", e.what()); + } +} + +void gpu_ivf_flat_set_per_thread_device(gpu_ivf_flat_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_set_per_thread_device", e.what()); + } +} + +void gpu_ivf_flat_set_use_batching(gpu_ivf_flat_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + case 
Quantization_UINT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_set_use_batching", e.what()); + } +} + +void gpu_ivf_flat_set_quantizer(gpu_ivf_flat_c index_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_set_quantizer", e.what()); + } +} + +void gpu_ivf_flat_get_quantizer(gpu_ivf_flat_c index_c, float* min, float* max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_get_quantizer", e.what()); + } +} + +gpu_ivf_flat_c gpu_ivf_flat_load_file(const char* filename, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = 
matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_load_file", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_destroy(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_destroy", e.what()); + } +} + +void gpu_ivf_flat_start(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_start", e.what()); + } +} + +void gpu_ivf_flat_build(gpu_ivf_flat_c index_c, void* errmsg) { + if 
(errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + case Quantization_INT8: static_cast*>(any->ptr)->build(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_build", e.what()); + } +} + +void gpu_ivf_flat_save(gpu_ivf_flat_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_save", e.what()); + } +} + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search(gpu_ivf_flat_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_flat_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, 
search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_UINT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_search", e.what()); + } + return res; +} + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search_float(gpu_ivf_flat_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_flat_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_UINT8: { + auto* 
cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_search_float", e.what()); + } + return res; +} + +void gpu_ivf_flat_get_neighbors(gpu_ivf_flat_result_c result_c, uint64_t total_elements, int64_t* neighbors) { + if (!result_c) return; + auto* neighbors_vec = &static_cast::search_result_t*>(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_ivf_flat_get_distances(gpu_ivf_flat_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + auto* distances_vec = &static_cast::search_result_t*>(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_ivf_flat_free_result(gpu_ivf_flat_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +uint32_t gpu_ivf_flat_cap(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + case Quantization_INT8: return static_cast*>(any->ptr)->cap(); + case Quantization_UINT8: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_ivf_flat_len(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); + case Quantization_INT8: return 
static_cast*>(any->ptr)->len(); + case Quantization_UINT8: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_ivf_flat_info(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_info", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_get_centers(gpu_ivf_flat_c index_c, void* centers, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_F16: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_INT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_UINT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + default: throw std::runtime_error("Unsupported quantization type"); + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_get_centers", e.what()); + } +} + +uint32_t 
gpu_ivf_flat_get_n_list(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_F16: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_n_list(); + default: return 0; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat_c.h b/cgo/cuvs/ivf_flat_c.h new file mode 100644 index 0000000000000..79c1243060bf6 --- /dev/null +++ b/cgo/cuvs/ivf_flat_c.h @@ -0,0 +1,117 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef IVF_FLAT_C_H +#define IVF_FLAT_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_ivf_flat_t object +typedef void* gpu_ivf_flat_c; + +// Opaque pointer to the C++ IVF-Flat search result object +typedef void* gpu_ivf_flat_result_c; + +// Constructor for building from dataset +gpu_ivf_flat_c gpu_ivf_flat_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_ivf_flat_c gpu_ivf_flat_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Destructor +void gpu_ivf_flat_destroy(gpu_ivf_flat_c index_c, void* errmsg); + +// Start function (initializes worker and resources) +void gpu_ivf_flat_start(gpu_ivf_flat_c index_c, void* errmsg); + +// Build function (actually triggers the build/load logic) +void gpu_ivf_flat_build(gpu_ivf_flat_c index_c, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_ivf_flat_c gpu_ivf_flat_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Add chunk of data (same type as index quantization) +void gpu_ivf_flat_add_chunk(gpu_ivf_flat_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly quantization if needed) +void gpu_ivf_flat_add_chunk_float(gpu_ivf_flat_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// 
Trains the scalar quantizer (if T is 1-byte) +void gpu_ivf_flat_train_quantizer(gpu_ivf_flat_c index_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_ivf_flat_set_per_thread_device(gpu_ivf_flat_c index_c, bool enable, void* errmsg); +void gpu_ivf_flat_set_use_batching(gpu_ivf_flat_c index_c, bool enable, void* errmsg); + +void gpu_ivf_flat_set_quantizer(gpu_ivf_flat_c index_c, float min, float max, void* errmsg); +void gpu_ivf_flat_get_quantizer(gpu_ivf_flat_c index_c, float* min, float* max, void* errmsg); + +// Destructor + +void gpu_ivf_flat_save(gpu_ivf_flat_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_ivf_flat_result_c result_ptr; +} gpu_ivf_flat_search_res_t; + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search(gpu_ivf_flat_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg); + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search_float(gpu_ivf_flat_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg); +// Get results from result object +void gpu_ivf_flat_get_neighbors(gpu_ivf_flat_result_c result_c, uint64_t total_elements, int64_t* neighbors); +void gpu_ivf_flat_get_distances(gpu_ivf_flat_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_ivf_flat_free_result(gpu_ivf_flat_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_ivf_flat_cap(gpu_ivf_flat_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_ivf_flat_len(gpu_ivf_flat_c index_c); + +// Returns info about the index as a JSON string +char* gpu_ivf_flat_info(gpu_ivf_flat_c index_c, void* errmsg); + +// Gets the trained centroids +void gpu_ivf_flat_get_centers(gpu_ivf_flat_c index_c, void* centers, void* errmsg); + +// Gets 
the number of lists (centroids) +uint32_t gpu_ivf_flat_get_n_list(gpu_ivf_flat_c index_c); + +#ifdef __cplusplus +} +#endif + +#endif // IVF_FLAT_C_H diff --git a/cgo/cuvs/ivf_pq.hpp b/cgo/cuvs/ivf_pq.hpp new file mode 100644 index 0000000000000..8d06a844a99cb --- /dev/null +++ b/cgo/cuvs/ivf_pq.hpp @@ -0,0 +1,778 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include + +#include +#include +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Search result containing neighbor IDs and distances. + * Common for all IVF-PQ instantiations. + */ +struct ivf_pq_search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors +}; + +/** + * @brief gpu_ivf_pq_t implements an IVF-PQ index that can run on a single GPU or sharded/replicated across multiple GPUs. 
+ */ +template +class gpu_ivf_pq_t : public gpu_index_base_t { +public: + using ivf_pq_index = cuvs::neighbors::ivf_pq::index; + using mg_index = cuvs::neighbors::mg_index; + using search_result_t = ivf_pq_search_result_t; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + ~gpu_ivf_pq_t() override { + this->destroy(); + } + + // Unified Constructor for building from dataset + gpu_ivf_pq_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const ivf_pq_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = static_cast(count_vectors); + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + // Constructor for chunked input (pre-allocates) + gpu_ivf_pq_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_pq_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * 
this->dimension); + } + + // Constructor for building from MODF datafile + gpu_ivf_pq_t(const std::string& data_filename, cuvs::distance::DistanceType m, + const ivf_pq_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + uint64_t file_count = 0; + uint64_t file_dim = 0; + load_host_matrix(data_filename, this->flattened_host_dataset, file_count, file_dim); + + this->count = static_cast(file_count); + this->dimension = static_cast(file_dim); + this->current_offset_ = this->count; + } + + // Unified Constructor for loading from file + gpu_ivf_pq_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_pq_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->filename_ = filename; + this->dimension = dimension; + this->metric = m; + this->count = 0; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + } + + void destroy() override { + if (this->worker) { + this->worker->stop(); + } + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + } + + /** + * @brief Starts the worker and initializes resources. 
+ */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the index from file or builds it from the dataset. + */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->filename_.empty() && this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + this->is_loaded_ = true; + // Clear host dataset after building to save memory (IVF-PQ stores its own copy on device) + if (this->filename_.empty()) { + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!this->filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_pq::deserialize(*res, this->filename_)); + // Update metadata + this->count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) this->count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + this->build_params.n_lists = 
static_cast(mg_index_->ann_interfaces_[0].index_.value().n_lists()); + this->build_params.m = static_cast(mg_index_->ann_interfaces_[0].index_.value().pq_dim()); + this->build_params.bits_per_code = static_cast(mg_index_->ann_interfaces_[0].index_.value().pq_bits()); + } + } else { + index_ = std::make_unique(*res); + cuvs::neighbors::ivf_pq::deserialize(*res, this->filename_, index_.get()); + this->count = static_cast(index_->size()); + this->build_params.n_lists = static_cast(index_->n_lists()); + this->build_params.m = static_cast(index_->pq_dim()); + this->build_params.bits_per_code = static_cast(index_->pq_bits()); + } + raft::resource::sync_stream(*res); + } else if (!this->flattened_host_dataset.empty()) { + if (this->count < this->build_params.n_lists) { + throw std::runtime_error("Dataset too small: count (" + std::to_string(this->count) + + ") must be >= n_list (" + std::to_string(this->build_params.n_lists) + + ") to build IVF index."); + } + + cuvs::neighbors::ivf_pq::index_params index_params; + index_params.metric = this->metric; + index_params.n_lists = this->build_params.n_lists; + index_params.pq_dim = this->build_params.m; + index_params.pq_bits = this->build_params.bits_per_code; + index_params.add_data_on_build = this->build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = this->build_params.kmeans_trainset_fraction; + + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + this->flattened_host_dataset.data(), (int64_t)this->count, (int64_t)this->dimension); + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (this->dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_pq::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = raft::make_device_matrix( + *res, 
static_cast(this->count), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device.data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + index_ = std::make_unique( + cuvs::neighbors::ivf_pq::build(*res, index_params, raft::make_const_mdspan(dataset_device.view()))); + } + raft::resource::sync_stream(*res); + } + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. + */ + void save(const std::string& filename) { + if (!this->is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::ivf_pq::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::ivf_pq::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Performs IVF-PQ search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp IVF-PQ search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation + */ + search_result_t search_batch_internal(const T* queries_data, uint64_t num_queries, uint32_t limit, const ivf_pq_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const T* data; + uint64_t n; + }; + + std::string batch_key = "ivf_pq_s_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += 
req.n; + } + + auto results = this->search_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Performs IVF-PQ search for given float32 queries, with on-the-fly quantization if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit, sp); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_float_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return 
this->search_float_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation for float32 queries + */ + search_result_t search_float_batch_internal(const float* queries_data, uint64_t num_queries, uint32_t limit, const ivf_pq_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const float* data; + uint64_t n; + }; + + std::string batch_key = "ivf_pq_sf_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto results = this->search_float_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search implementation (no worker 
submission) + */ + search_result_t search_internal(raft_handle_wrapper_t& handle, const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_pq::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_pq_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, (int64_t)num_queries, (int64_t)this->dimension); + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_pq::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else if (local_index) { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, 
static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_pq::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + + /** + * @brief Internal search_float implementation (no worker submission) + */ + search_result_t search_float_internal(raft_handle_wrapper_t& handle, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. 
Quantize/Convert float queries to T on device + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, queries_device_float.view(), queries_device_target.data_handle(), true); + } else { + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + } + + // 2. Perform search + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_pq::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_pq_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_target = raft::make_host_matrix(num_queries, this->dimension); + raft::copy(*res, queries_host_target.view(), queries_device_target.view()); + raft::resource::sync_stream(*res); + + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_pq::search(*res, 
*mg_index_, mg_search_params, + queries_host_target.view(), + neighbors_host_view, distances_host_view); + } else if (local_index) { + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_pq::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device_target.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + + std::vector get_centers() { + if (!this->is_loaded_ || (!index_ && !mg_index_)) return {}; + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + const ivf_pq_index* local_index = nullptr; + if (index_) { + local_index = index_.get(); + } else if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) { + local_index = &iface.index_.value(); + break; + } + } + } + + if (!local_index) return std::vector{}; + + auto centers_view = 
local_index->centers(); + size_t n_centers = centers_view.extent(0); + size_t dim = centers_view.extent(1); + + // 1. Convert centers from float to T on device + auto centers_device_target = raft::make_device_matrix(*res, n_centers, dim); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, centers_view, centers_device_target.data_handle(), true); + } else { + raft::copy(*res, centers_device_target.view(), centers_view); + } + + // 2. Copy to host + std::vector host_centers(n_centers * dim); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centers.data(), centers_device_target.data_handle(), + host_centers.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centers; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + uint32_t get_n_list() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->n_lists()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().n_lists()); + } + } + return this->build_params.n_lists; + } + + uint32_t get_pq_dim() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->pq_dim()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().pq_dim()); + } + } + return this->build_params.m; + } + + uint32_t get_pq_bits() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->pq_bits()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().pq_bits()); + } + } + return 
this->build_params.bits_per_code; + } + + uint32_t get_dim() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->dim()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().dim()); + } + } + return this->dimension; + } + + uint32_t get_rot_dim() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->rot_dim()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().rot_dim()); + } + } + return this->dimension; + } + + uint32_t get_dim_ext() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->dim_ext()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().dim_ext()); + } + } + return this->dimension; + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"IVF-PQ\", \"ivf_pq\": {"; + if (index_) { + json += "\"mode\": \"Single-GPU\", \"size\": " + std::to_string(index_->size()) + + ", \"n_lists\": " + std::to_string(index_->n_lists()) + + ", \"pq_dim\": " + std::to_string(index_->pq_dim()) + + ", \"pq_bits\": " + std::to_string(index_->pq_bits()); + } else if (mg_index_) { + json += "\"mode\": \"Multi-GPU\", \"shards\": ["; + for (size_t i = 0; i < mg_index_->ann_interfaces_.size(); ++i) { + const auto& iface = mg_index_->ann_interfaces_[i]; + json += "{\"device\": " + std::to_string(this->devices_[i]); + if (iface.index_.has_value()) { + json += ", \"size\": " + std::to_string(iface.index_.value().size()) + + ", \"n_lists\": " + std::to_string(iface.index_.value().n_lists()) + + ", \"pq_dim\": " + std::to_string(iface.index_.value().pq_dim()) + + ", \"pq_bits\": " + std::to_string(iface.index_.value().pq_bits()); + } else { + json += ", 
\"status\": \"Not loaded\""; + } + json += "}" + std::string(i == mg_index_->ann_interfaces_.size() - 1 ? "" : ", "); + } + json += "]"; + } else { + json += "\"built\": false"; + } + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_pq_c.cpp b/cgo/cuvs/ivf_pq_c.cpp new file mode 100644 index 0000000000000..5835f0fd2cab6 --- /dev/null +++ b/cgo/cuvs/ivf_pq_c.cpp @@ -0,0 +1,583 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ivf_pq_c.h" +#include "ivf_pq.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_ivf_pq_any_t { + quantization_t qtype; + void* ptr; + + gpu_ivf_pq_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_ivf_pq_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_ivf_pq_c gpu_ivf_pq_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const 
std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_new", e.what()); + return nullptr; + } +} + +gpu_ivf_pq_c gpu_ivf_pq_new_from_data_file(const char* data_filename, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_new_from_data_file", e.what()); + return nullptr; + } +} + +gpu_ivf_pq_c gpu_ivf_pq_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + 
switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_new_empty", e.what()); + return nullptr; + } +} + +void gpu_ivf_pq_add_chunk(gpu_ivf_pq_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_add_chunk", e.what()); + } +} + +void gpu_ivf_pq_add_chunk_float(gpu_ivf_pq_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); 
break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_add_chunk_float", e.what()); + } +} + +void gpu_ivf_pq_train_quantizer(gpu_ivf_pq_c index_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_train_quantizer", e.what()); + } +} + +void gpu_ivf_pq_set_per_thread_device(gpu_ivf_pq_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_set_per_thread_device", e.what()); + } +} + +void gpu_ivf_pq_set_use_batching(gpu_ivf_pq_c index_c, bool enable, void* errmsg) 
{ + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_set_use_batching", e.what()); + } +} + +void gpu_ivf_pq_set_quantizer(gpu_ivf_pq_c index_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_set_quantizer", e.what()); + } +} + +void gpu_ivf_pq_get_quantizer(gpu_ivf_pq_c index_c, float* min, float* max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_get_quantizer", e.what()); + } +} + +gpu_ivf_pq_c gpu_ivf_pq_load_file(const char* 
filename, uint32_t dimension, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_load_file", e.what()); + return nullptr; + } +} + +void gpu_ivf_pq_destroy(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_destroy", e.what()); + } +} + +void gpu_ivf_pq_start(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: 
static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_start", e.what()); + } +} + +void gpu_ivf_pq_build(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + case Quantization_INT8: static_cast*>(any->ptr)->build(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_build", e.what()); + } +} + +void gpu_ivf_pq_save(gpu_ivf_pq_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_save", e.what()); + } +} + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search(gpu_ivf_pq_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_pq_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::ivf_pq_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, 
search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_search", e.what()); + } + return res; +} + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search_float(gpu_ivf_pq_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_pq_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::ivf_pq_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_search_float", e.what()); + } + return res; +} + +void gpu_ivf_pq_get_neighbors(gpu_ivf_pq_result_c result_c, 
uint64_t total_elements, int64_t* neighbors) { + if (!result_c) return; + auto* neighbors_vec = &static_cast(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_ivf_pq_get_distances(gpu_ivf_pq_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + auto* distances_vec = &static_cast(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_ivf_pq_free_result(gpu_ivf_pq_result_c result_c) { + if (!result_c) return; + delete static_cast(result_c); +} + +uint32_t gpu_ivf_pq_cap(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + case Quantization_INT8: return static_cast*>(any->ptr)->cap(); + case Quantization_UINT8: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_ivf_pq_len(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); + case Quantization_INT8: return static_cast*>(any->ptr)->len(); + case Quantization_UINT8: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_ivf_pq_info(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = 
static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_info", e.what()); + return nullptr; + } +} + +void gpu_ivf_pq_get_centers(gpu_ivf_pq_c index_c, void* centers, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_F16: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_INT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_UINT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + default: throw std::runtime_error("Unsupported quantization type"); + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_get_centers", e.what()); + } +} + +uint32_t gpu_ivf_pq_get_n_list(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_F16: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_n_list(); + default: return 0; + } +} + +uint32_t 
gpu_ivf_pq_get_dim(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_dim(); + case Quantization_F16: return static_cast*>(any->ptr)->get_dim(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_dim(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_dim(); + default: return 0; + } +} + +uint32_t gpu_ivf_pq_get_rot_dim(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_rot_dim(); + case Quantization_F16: return static_cast*>(any->ptr)->get_rot_dim(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_rot_dim(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_rot_dim(); + default: return 0; + } +} + +uint32_t gpu_ivf_pq_get_dim_ext(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_dim_ext(); + case Quantization_F16: return static_cast*>(any->ptr)->get_dim_ext(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_dim_ext(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_dim_ext(); + default: return 0; + } +} + +void gpu_ivf_pq_get_dataset(gpu_ivf_pq_c index_c, void* out_data) { + if (!index_c) return; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + case Quantization_F16: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + case Quantization_INT8: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + 
case Quantization_UINT8: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + default: break; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_ivf_pq_t; +template class gpu_ivf_pq_t; +template class gpu_ivf_pq_t; +template class gpu_ivf_pq_t; +} // namespace matrixone diff --git a/cgo/cuvs/ivf_pq_c.h b/cgo/cuvs/ivf_pq_c.h new file mode 100644 index 0000000000000..27a2dd08e3868 --- /dev/null +++ b/cgo/cuvs/ivf_pq_c.h @@ -0,0 +1,135 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef IVF_PQ_C_H +#define IVF_PQ_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_ivf_pq_t object +typedef void* gpu_ivf_pq_c; + +// Opaque pointer to the C++ IVF-PQ search result object +typedef void* gpu_ivf_pq_result_c; + +// Constructor for building from dataset +gpu_ivf_pq_c gpu_ivf_pq_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for building from MODF datafile +gpu_ivf_pq_c gpu_ivf_pq_new_from_data_file(const char* data_filename, distance_type_t metric, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_ivf_pq_c gpu_ivf_pq_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_ivf_pq_c gpu_ivf_pq_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Add chunk of data (same type as index quantization) +void gpu_ivf_pq_add_chunk(gpu_ivf_pq_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly quantization if needed) +void gpu_ivf_pq_add_chunk_float(gpu_ivf_pq_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// Trains the scalar quantizer (if T is 1-byte) +void 
gpu_ivf_pq_train_quantizer(gpu_ivf_pq_c index_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_ivf_pq_set_per_thread_device(gpu_ivf_pq_c index_c, bool enable, void* errmsg); +void gpu_ivf_pq_set_use_batching(gpu_ivf_pq_c index_c, bool enable, void* errmsg); + +void gpu_ivf_pq_set_quantizer(gpu_ivf_pq_c index_c, float min, float max, void* errmsg); +void gpu_ivf_pq_get_quantizer(gpu_ivf_pq_c index_c, float* min, float* max, void* errmsg); + +// Destructor +void gpu_ivf_pq_destroy(gpu_ivf_pq_c index_c, void* errmsg); + +// Start function (initializes worker and resources) +void gpu_ivf_pq_start(gpu_ivf_pq_c index_c, void* errmsg); + +// Build function (actually triggers the build/load logic) +void gpu_ivf_pq_build(gpu_ivf_pq_c index_c, void* errmsg); + +// Save function +void gpu_ivf_pq_save(gpu_ivf_pq_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_ivf_pq_result_c result_ptr; +} gpu_ivf_pq_search_res_t; + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search(gpu_ivf_pq_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg); + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search_float(gpu_ivf_pq_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg); + +// Get results from result object +void gpu_ivf_pq_get_neighbors(gpu_ivf_pq_result_c result_c, uint64_t total_elements, int64_t* neighbors); +void gpu_ivf_pq_get_distances(gpu_ivf_pq_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_ivf_pq_free_result(gpu_ivf_pq_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_ivf_pq_cap(gpu_ivf_pq_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_ivf_pq_len(gpu_ivf_pq_c index_c); + +// Returns info about the index as a 
JSON string +char* gpu_ivf_pq_info(gpu_ivf_pq_c index_c, void* errmsg); + +// Gets the trained centroids +void gpu_ivf_pq_get_centers(gpu_ivf_pq_c index_c, void* centers, void* errmsg); + +// Gets the number of lists (centroids) +uint32_t gpu_ivf_pq_get_n_list(gpu_ivf_pq_c index_c); + +// Gets the dimension of the index +uint32_t gpu_ivf_pq_get_dim(gpu_ivf_pq_c index_c); + +// Gets the rotated dimension of the index (dimension used for centers) +uint32_t gpu_ivf_pq_get_rot_dim(gpu_ivf_pq_c index_c); + +// Gets the extended dimension of the index (including norms and padding) +uint32_t gpu_ivf_pq_get_dim_ext(gpu_ivf_pq_c index_c); + +// Gets the flattened dataset (for debugging) +void gpu_ivf_pq_get_dataset(gpu_ivf_pq_c index_c, void* out_data); + +#ifdef __cplusplus +} +#endif + +#endif // IVF_PQ_C_H diff --git a/cgo/cuvs/kmeans.hpp b/cgo/cuvs/kmeans.hpp new file mode 100644 index 0000000000000..59894fd552e46 --- /dev/null +++ b/cgo/cuvs/kmeans.hpp @@ -0,0 +1,447 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include "cuvs_types.h" // For distance_type_t and quantization_t +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include +#include +#include +#include +#include + +// cuVS includes +#include +#include +#include "quantize.hpp" +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Search/Predict result for K-Means. + * Common for all KMeans instantiations. + */ +struct kmeans_result_t { + std::vector labels; + float inertia; + int64_t n_iter; +}; + +/** + * @brief gpu_kmeans_t implements K-Means clustering on GPU using cuVS. + */ +template +class gpu_kmeans_t : public gpu_index_base_t { +public: + using predict_result_t = kmeans_result_t; + using fit_predict_result_t = kmeans_result_t; + + uint32_t n_clusters; + + cuvs::cluster::kmeans::balanced_params params; + + // Type of centroids and inertia. cuVS uses float for these even if input is half, int8, or uint8. + using CentroidT = float; + + // Internal storage for centroids on device + std::unique_ptr> centroids_; + + gpu_kmeans_t(uint32_t n_clusters, uint32_t dimension, cuvs::distance::DistanceType metric, + int max_iter = 20, int device_id = 0, uint32_t nthread = 1) + : n_clusters(n_clusters) { + + this->dimension = dimension; + params.n_iters = static_cast(max_iter); + params.metric = metric; + this->devices_ = {device_id}; + + this->worker = std::make_unique(nthread, this->devices_); + } + + ~gpu_kmeans_t() override { + this->destroy(); + } + + /** + * @brief Starts the worker and initializes resources. 
+ */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + centroids_.reset(); + this->quantizer_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + struct fit_result_t { + float inertia; + int64_t n_iter; + }; + + /** + * @brief Computes the cluster centroids. + */ + fit_result_t fit(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + return this->fit_internal(handle, X_data, n_samples); + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Internal fit implementation (no worker submission) + */ + fit_result_t fit_internal(raft_handle_wrapper_t& handle, const T* X_data, uint64_t n_samples) { + std::unique_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(this->dimension))); + } + + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view()); + + raft::resource::sync_stream(*res); + return fit_result_t{0.0f, static_cast(params.n_iters)}; + } + + /** + * @brief Assigns labels to new data based on existing centroids. 
+ */ + predict_result_t predict(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + if (!centroids_) throw std::runtime_error("KMeans centroids not trained. Call fit() first."); + + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; iworker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Assigns labels to new float32 data, performing on-the-fly quantization if needed. + */ + predict_result_t predict_float(const float* X_data, uint64_t n_samples) { + if constexpr (std::is_same_v) { + return predict(X_data, n_samples); + } + + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + if (!centroids_) throw std::runtime_error("KMeans centroids not trained. 
Call fit() first."); + + auto res = handle.get_raft_resources(); + + // 1. Quantize/Convert float data to T on device + auto X_device_float = raft::make_device_matrix(*res, n_samples, this->dimension); + raft::copy(*res, X_device_float.view(), raft::make_host_matrix_view(X_data, n_samples, this->dimension)); + + auto X_device_target = raft::make_device_matrix(*res, n_samples, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, X_device_float.view(), X_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, X_device_target.view(), X_device_float.view()); + } + + // 2. Perform prediction + predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device_target.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; iworker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Performs both fitting and labeling in one step. 
+ */ + fit_predict_result_t fit_predict(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::unique_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(this->dimension))); + } + + fit_predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + if constexpr (std::is_same_v || std::is_same_v) { + cuvs::cluster::kmeans::fit_predict(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view(), + labels_device.view()); + } else { + // Fallback for half and uint8_t + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view()); + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + } + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; i(params.n_iters); + return res_out; + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Performs fitting and prediction for float32 data, with on-the-fly 
quantization if needed. + */ + fit_predict_result_t fit_predict_float(const float* X_data, uint64_t n_samples) { + if constexpr (std::is_same_v) { + return fit_predict(X_data, n_samples); + } + + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::unique_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. Quantize/Convert float data to T on device + auto X_device_float = raft::make_device_matrix(*res, n_samples, this->dimension); + raft::copy(*res, X_device_float.view(), raft::make_host_matrix_view(X_data, n_samples, this->dimension)); + + auto X_device_target = raft::make_device_matrix(*res, n_samples, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) { + int64_t n_train = std::min(static_cast(n_samples), static_cast(500)); + auto train_view = raft::make_device_matrix_view(X_device_float.data_handle(), n_train, this->dimension); + this->quantizer_.train(*res, train_view); + } + this->quantizer_.template transform(*res, X_device_float.view(), X_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, X_device_target.view(), X_device_float.view()); + } + + // 2. 
Perform fit_predict + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(this->dimension))); + } + + fit_predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + if constexpr (std::is_same_v) { + cuvs::cluster::kmeans::fit_predict(*res, params, + raft::make_const_mdspan(X_device_target.view()), + centroids_->view(), + labels_device.view()); + } else { + // Fallback for half and uint8_t + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device_target.view()), + centroids_->view()); + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device_target.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + } + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; i(params.n_iters); + return res_out; + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Returns the trained centroids. + */ + std::vector get_centroids() { + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + if (!centroids_) return std::vector{}; + + auto res = handle.get_raft_resources(); + + // 1. 
Convert centroids from float to T on device + auto centroids_device_target = raft::make_device_matrix(*res, n_clusters, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, centroids_->view(), centroids_device_target.data_handle(), true); + } else { + raft::copy(*res, centroids_device_target.view(), centroids_->view()); + } + + // 2. Copy to host + std::vector host_centroids(n_clusters * this->dimension); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centroids.data(), centroids_device_target.data_handle(), + host_centroids.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centroids; + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"KMeans\", \"kmeans\": {"; + json += "\"n_clusters\": " + std::to_string(n_clusters) + ", "; + json += "\"centroids_trained\": " + std::string(centroids_ ? "true" : "false"); + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/kmeans_c.cpp b/cgo/cuvs/kmeans_c.cpp new file mode 100644 index 0000000000000..ef0bebe54a9b9 --- /dev/null +++ b/cgo/cuvs/kmeans_c.cpp @@ -0,0 +1,371 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kmeans_c.h" +#include "kmeans.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_kmeans_any_t { + quantization_t qtype; + void* ptr; + + gpu_kmeans_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_kmeans_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_kmeans_c gpu_kmeans_new(uint32_t n_clusters, uint32_t dimension, distance_type_t metric_c, + int max_iter, int device_id, uint32_t nthread, + quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* kmeans_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_F16: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_INT8: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_UINT8: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + default: + throw std::runtime_error("Unsupported quantization type for KMeans"); + } + return static_cast(new gpu_kmeans_any_t(qtype, kmeans_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_new", e.what()); + return nullptr; + } +} + +void gpu_kmeans_destroy(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) 
*(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_destroy", e.what()); + } +} + +void gpu_kmeans_start(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_start", e.what()); + } +} + +void gpu_kmeans_train_quantizer(gpu_kmeans_c kmeans_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_train_quantizer", e.what()); + } +} + +void gpu_kmeans_set_quantizer(gpu_kmeans_c kmeans_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; 
+ case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_set_quantizer", e.what()); + } +} + +void gpu_kmeans_get_quantizer(gpu_kmeans_c kmeans_c, float* min, float* max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_get_quantizer", e.what()); + } +} + +gpu_kmeans_fit_res_t gpu_kmeans_fit(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_res_t res = {0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_F16: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_INT8: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_UINT8: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in 
gpu_kmeans_fit", e.what()); + } + return res; +} + +gpu_kmeans_predict_res_t gpu_kmeans_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_predict_res_t res = {nullptr, 0.0f}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_predict", e.what()); + } + return res; +} + +gpu_kmeans_predict_res_t gpu_kmeans_predict_float(gpu_kmeans_c kmeans_c, const float* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_predict_res_t res = {nullptr, 0.0f}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + } catch (const std::exception& e) { + 
set_errmsg(errmsg, "Error in gpu_kmeans_predict_float", e.what()); + } + return res; +} + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_predict_res_t res = {nullptr, 0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; res.n_iter = cpp_res->n_iter; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_fit_predict", e.what()); + } + return res; +} + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict_float(gpu_kmeans_c kmeans_c, const float* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_predict_res_t res = {nullptr, 0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + default: break; 
+ } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; res.n_iter = cpp_res->n_iter; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_fit_predict_float", e.what()); + } + return res; +} + +void gpu_kmeans_get_labels(gpu_kmeans_result_c result_c, uint64_t n_samples, int64_t* labels) { + if (!result_c) return; + auto* labels_vec = &static_cast(result_c)->labels; + if (labels_vec->size() >= n_samples) { + std::copy(labels_vec->begin(), labels_vec->begin() + n_samples, labels); + } +} + +void gpu_kmeans_free_result(gpu_kmeans_result_c result_c) { + if (!result_c) return; + delete static_cast(result_c); +} + +void gpu_kmeans_get_centroids(gpu_kmeans_c kmeans_c, void* centroids, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + case Quantization_F16: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + case Quantization_INT8: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + case Quantization_UINT8: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_get_centroids", e.what()); + } +} + +char* gpu_kmeans_info(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!kmeans_c) return nullptr; + try { + auto* any = static_cast(kmeans_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: 
info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_info", e.what()); + return nullptr; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_kmeans_t; +template class gpu_kmeans_t; +template class gpu_kmeans_t; +template class gpu_kmeans_t; +} // namespace matrixone diff --git a/cgo/cuvs/kmeans_c.h b/cgo/cuvs/kmeans_c.h new file mode 100644 index 0000000000000..0e726ad698cdb --- /dev/null +++ b/cgo/cuvs/kmeans_c.h @@ -0,0 +1,95 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef KMEANS_C_H +#define KMEANS_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_kmeans_t object +typedef void* gpu_kmeans_c; + +// Opaque pointer to the C++ KMeans result object +typedef void* gpu_kmeans_result_c; + +// Constructor +gpu_kmeans_c gpu_kmeans_new(uint32_t n_clusters, uint32_t dimension, distance_type_t metric, + int max_iter, int device_id, uint32_t nthread, + quantization_t qtype, void* errmsg); + +// Destructor +void gpu_kmeans_destroy(gpu_kmeans_c kmeans_c, void* errmsg); + +// Starts the worker and initializes resources +void gpu_kmeans_start(gpu_kmeans_c kmeans_c, void* errmsg); + +// Trains the scalar quantizer (if T is 1-byte) +void gpu_kmeans_train_quantizer(gpu_kmeans_c kmeans_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_kmeans_set_quantizer(gpu_kmeans_c kmeans_c, float min, float max, void* errmsg); +void gpu_kmeans_get_quantizer(gpu_kmeans_c kmeans_c, float* min, float* max, void* errmsg); + +// Fit function +typedef struct { + float inertia; + int64_t n_iter; +} gpu_kmeans_fit_res_t; + +gpu_kmeans_fit_res_t gpu_kmeans_fit(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +// Predict function +typedef struct { + gpu_kmeans_result_c result_ptr; + float inertia; +} gpu_kmeans_predict_res_t; + +gpu_kmeans_predict_res_t gpu_kmeans_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +gpu_kmeans_predict_res_t gpu_kmeans_predict_float(gpu_kmeans_c kmeans_c, const float* X_data, uint64_t n_samples, void* errmsg); + +// FitPredict function +typedef struct { + gpu_kmeans_result_c result_ptr; + float inertia; + int64_t n_iter; +} gpu_kmeans_fit_predict_res_t; + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict_float(gpu_kmeans_c kmeans_c, const 
float* X_data, uint64_t n_samples, void* errmsg); + +// Get results from result object +void gpu_kmeans_get_labels(gpu_kmeans_result_c result_c, uint64_t n_samples, int64_t* labels); + +// Free result object +void gpu_kmeans_free_result(gpu_kmeans_result_c result_c); + +// Get centroids +void gpu_kmeans_get_centroids(gpu_kmeans_c kmeans_c, void* centroids, void* errmsg); + +// Returns info about the kmeans as a JSON string +char* gpu_kmeans_info(gpu_kmeans_c kmeans_c, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // KMEANS_C_H diff --git a/cgo/cuvs/quantize.hpp b/cgo/cuvs/quantize.hpp new file mode 100644 index 0000000000000..a677f822e0bd5 --- /dev/null +++ b/cgo/cuvs/quantize.hpp @@ -0,0 +1,443 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace matrixone { + +#pragma pack(push, 1) +struct file_header_t { + char magic[4]; // "MODF" + uint64_t count; // 8 bytes + uint64_t dimension; // 8 bytes + uint32_t data_type_size; // 4 bytes +}; +#pragma pack(pop) + +/** + * @brief Helper to manage cuVS scalar quantizer lifecycle and operations. 
+ * + * @tparam S Source type (float, half, double) + */ +template +class scalar_quantizer_t { +public: + using quantizer_type = cuvs::preprocessing::quantize::scalar::quantizer; + + scalar_quantizer_t() = default; + + /** + * @brief Constructor that initializes the quantizer with specific min and max values. + */ + scalar_quantizer_t(S min, S max) + : quantizer_(std::make_unique(quantizer_type{min, max})) {} + + /** + * @brief Trains the quantizer on a device matrix. + */ + void train(const raft::resources& res, raft::device_matrix_view train_view) { + cuvs::preprocessing::quantize::scalar::params q_params; + quantizer_ = std::make_unique( + cuvs::preprocessing::quantize::scalar::train(res, q_params, train_view)); + raft::resource::sync_stream(res); + } + + /** + * @brief Sets the quantizer range manually. + */ + void set_quantizer(S min, S max) { + quantizer_ = std::make_unique(quantizer_type{min, max}); + } + + /** + * @brief Transforms a chunk of data into quantized 8-bit integers. + * + * @tparam T Target type (int8_t or uint8_t) + * @param res RAFT resources handle. + * @param src_view Source data view on device. + * @param out_ptr Destination pointer (host or device). + * @param is_device_ptr Whether out_ptr is in device memory. 
+ */ + template + void transform(const raft::resources& res, raft::device_matrix_view src_view, T* out_ptr, bool is_device_ptr) { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + static_assert(sizeof(T) == 1, "Quantization target must be 1-byte"); + + int64_t n_rows = src_view.extent(0); + int64_t n_cols = src_view.extent(1); + + auto chunk_device_int8 = raft::make_device_matrix(res, n_rows, n_cols); + cuvs::preprocessing::quantize::scalar::transform(res, *quantizer_, src_view, chunk_device_int8.view()); + + if (is_device_ptr) { + auto out_view = raft::make_device_matrix_view(out_ptr, n_rows, n_cols); + raft::copy(res, out_view, chunk_device_int8.view()); + } else { + auto out_view = raft::make_host_matrix_view(out_ptr, n_rows, n_cols); + raft::copy(res, out_view, chunk_device_int8.view()); + raft::resource::sync_stream(res); + } + } + + bool is_trained() const { return quantizer_ != nullptr; } + void reset() { quantizer_.reset(); } + + /** + * @brief Gets the minimum value of the quantizer range. + */ + S min() const { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + return quantizer_->min_; + } + + /** + * @brief Gets the maximum value of the quantizer range. + */ + S max() const { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + return quantizer_->max_; + } + + /** + * @brief Serializes the quantizer state to an output stream. + */ + void serialize(std::ostream& os) const { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + os.write(reinterpret_cast(&quantizer_->min_), sizeof(S)); + os.write(reinterpret_cast(&quantizer_->max_), sizeof(S)); + } + + /** + * @brief Deserializes the quantizer state from an input stream. 
+ */ + void deserialize(std::istream& is) { + S params[2]; + is.read(reinterpret_cast(params), 2 * sizeof(S)); + if (is.gcount() != static_cast(2 * sizeof(S))) { + throw std::runtime_error("Failed to read quantizer parameters from stream"); + } + quantizer_ = std::make_unique(quantizer_type{params[0], params[1]}); + } + + /** + * @brief Saves the quantizer state to a file. + */ + void save_to_file(const std::string& filename) const { + std::ofstream os(filename, std::ios::binary); + if (!os.is_open()) throw std::runtime_error("Failed to open file for writing: " + filename); + serialize(os); + } + + /** + * @brief Loads the quantizer state from a file. + */ + void load_from_file(const std::string& filename) { + std::ifstream is(filename, std::ios::binary); + if (!is.is_open()) throw std::runtime_error("Failed to open file for reading: " + filename); + deserialize(is); + } + +private: + std::unique_ptr quantizer_; +}; + +namespace detail { + +static constexpr int64_t DEFAULT_CHUNK_SIZE = 16384; + +/** + * @brief Internal helper to read a binary file into a raw pointer using chunking. 
+ */ +template +void load_matrix_raw_ptr(const raft::resources& res, const std::string& filename, const file_header_t& header, S* out_ptr, bool is_device_ptr) { + int64_t n_rows = static_cast(header.count); + int64_t n_cols = static_cast(header.dimension); + + if (n_rows == 0 || n_cols == 0) return; + + std::ifstream file(filename, std::ios::binary); + file.seekg(sizeof(file_header_t)); + + if (!is_device_ptr) { + file.read(reinterpret_cast(out_ptr), n_rows * n_cols * sizeof(S)); + if (file.gcount() != static_cast(n_rows * n_cols * sizeof(S))) { + throw std::runtime_error("Failed to read data content from: " + filename); + } + } else { + std::vector chunk_host; + for (int64_t row_offset = 0; row_offset < n_rows; row_offset += DEFAULT_CHUNK_SIZE) { + int64_t current_chunk_rows = std::min(DEFAULT_CHUNK_SIZE, n_rows - row_offset); + size_t total_chunk_elements = current_chunk_rows * n_cols; + chunk_host.resize(total_chunk_elements); + file.read(reinterpret_cast(chunk_host.data()), total_chunk_elements * sizeof(S)); + raft::copy(out_ptr + (row_offset * n_cols), chunk_host.data(), total_chunk_elements, raft::resource::get_cuda_stream(res)); + } + raft::resource::sync_stream(res); + } +} + +/** + * @brief Internal helper to perform chunked quantization or conversion from datafile to a raw pointer. 
+ */ +template +void load_matrix_chunked_ptr(const raft::resources& res, const std::string& filename, const file_header_t& header, T* out_ptr, bool is_device_ptr) { + int64_t n_rows = static_cast(header.count); + int64_t n_cols = static_cast(header.dimension); + if (n_rows == 0 || n_cols == 0) return; + + std::ifstream file(filename, std::ios::binary); + file.seekg(sizeof(file_header_t)); + + scalar_quantizer_t quantizer; + if constexpr (DoQuantize) { + int64_t n_train = std::min(n_rows, static_cast(500)); + std::vector train_host(n_train * n_cols); + file.read(reinterpret_cast(train_host.data()), train_host.size() * sizeof(S)); + auto train_device = raft::make_device_matrix(res, n_train, n_cols); + raft::copy(train_device.data_handle(), train_host.data(), train_host.size(), raft::resource::get_cuda_stream(res)); + quantizer.train(res, train_device.view()); + file.seekg(sizeof(file_header_t)); + } + + std::vector chunk_host; + auto chunk_device_src = raft::make_device_matrix(res, DEFAULT_CHUNK_SIZE, n_cols); + + for (int64_t row_offset = 0; row_offset < n_rows; row_offset += DEFAULT_CHUNK_SIZE) { + int64_t current_chunk_rows = std::min(DEFAULT_CHUNK_SIZE, n_rows - row_offset); + size_t total_chunk_elements = current_chunk_rows * n_cols; + chunk_host.resize(total_chunk_elements); + file.read(reinterpret_cast(chunk_host.data()), total_chunk_elements * sizeof(S)); + raft::copy(chunk_device_src.data_handle(), chunk_host.data(), total_chunk_elements, raft::resource::get_cuda_stream(res)); + + auto current_chunk_src_view = raft::make_device_matrix_view(chunk_device_src.data_handle(), current_chunk_rows, n_cols); + + if constexpr (DoQuantize) { + quantizer.template transform(res, current_chunk_src_view, out_ptr + (row_offset * n_cols), is_device_ptr); + } else { + if (is_device_ptr) { + auto out_chunk_view = raft::make_device_matrix_view(out_ptr + (row_offset * n_cols), current_chunk_rows, n_cols); + raft::copy(res, out_chunk_view, current_chunk_src_view); + } else { + 
auto out_chunk_view = raft::make_host_matrix_view(out_ptr + (row_offset * n_cols), current_chunk_rows, n_cols); + raft::copy(res, out_chunk_view, current_chunk_src_view); + } + } + } + raft::resource::sync_stream(res); +} + +} // namespace detail + +/** + * @brief Reads a binary file into a CUDA device matrix. + */ +template +auto load_device_matrix(const raft::resources& res, const std::string& filename) { + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file: " + filename); + + file_header_t header; + file.read(reinterpret_cast(&header), sizeof(file_header_t)); + if (std::string(header.magic, 4) != "MODF") throw std::runtime_error("Invalid magic: " + filename); + + auto matrix = raft::make_device_matrix(res, static_cast(header.count), static_cast(header.dimension)); + if (header.data_type_size == sizeof(T)) { + detail::load_matrix_raw_ptr(res, filename, header, matrix.data_handle(), true); + } else if (header.data_type_size == 4) { + if constexpr (sizeof(T) == 2) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else { + throw std::runtime_error("Unsupported conversion from float to requested size"); + } + } else if (header.data_type_size == 2) { + if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else if constexpr (sizeof(T) == 4) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else { + throw std::runtime_error("Unsupported conversion from half to requested size"); + } + } else { + throw std::runtime_error("Type size mismatch and conversion not supported for source size: " + std::to_string(header.data_type_size)); + } + return matrix; +} + +/** + * @brief Reads a binary file into a CUDA device matrix (overload). 
+ */ +template +void load_device_matrix(const raft::resources& res, const std::string& filename, raft::device_matrix& out_matrix, uint64_t& out_count, uint64_t& out_dimension) { + out_matrix = load_device_matrix(res, filename); + out_count = static_cast(out_matrix.extent(0)); + out_dimension = static_cast(out_matrix.extent(1)); +} + +/** + * @brief Reads a binary file into a CUDA host matrix. + */ +template +auto load_host_matrix(const std::string& filename) { + raft::resources res; + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file: " + filename); + + file_header_t header; + file.read(reinterpret_cast(&header), sizeof(file_header_t)); + if (std::string(header.magic, 4) != "MODF") throw std::runtime_error("Invalid magic: " + filename); + + auto matrix = raft::make_host_matrix(static_cast(header.count), static_cast(header.dimension)); + if (header.data_type_size == sizeof(T)) { + detail::load_matrix_raw_ptr(res, filename, header, matrix.data_handle(), false); + } else { + if (header.data_type_size == 4) { + if constexpr (sizeof(T) == 2) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else { + throw std::runtime_error("Unsupported conversion from float to requested size"); + } + } else if (header.data_type_size == 2) { + if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else if constexpr (sizeof(T) == 4) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else { + throw std::runtime_error("Unsupported conversion from half to requested size"); + } + } else { + throw std::runtime_error("Unsupported conversion for host matrix"); + } + } + return matrix; +} + +/** + * @brief Reads a binary file into a host vector. 
+ */ +template +void load_host_matrix(const std::string& filename, std::vector& out_data, uint64_t& out_count, uint64_t& out_dimension) { + raft::resources res; + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file: " + filename); + + file_header_t header; + file.read(reinterpret_cast(&header), sizeof(file_header_t)); + if (std::string(header.magic, 4) != "MODF") throw std::runtime_error("Invalid magic: " + filename); + + out_count = header.count; + out_dimension = header.dimension; + out_data.resize(out_count * out_dimension); + + if (header.data_type_size == sizeof(T)) { + detail::load_matrix_raw_ptr(res, filename, header, out_data.data(), false); + } else { + if (header.data_type_size == 4) { + if constexpr (sizeof(T) == 2) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else { + throw std::runtime_error("Unsupported conversion from float to requested size"); + } + } else if (header.data_type_size == 2) { + if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else if constexpr (sizeof(T) == 4) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else { + throw std::runtime_error("Unsupported conversion from half to requested size"); + } + } else { + throw std::runtime_error("Unsupported conversion for host matrix"); + } + } +} + +/** + * @brief Saves a CUDA device matrix to a binary file in the "MODF" format using chunking. 
+ */ +template +void save_device_matrix(const raft::resources& res, const std::string& filename, + raft::device_matrix_view matrix) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file for writing: " + filename); + + file_header_t header; + std::memcpy(header.magic, "MODF", 4); + header.count = static_cast(matrix.extent(0)); + header.dimension = static_cast(matrix.extent(1)); + header.data_type_size = sizeof(std::remove_const_t); + file.write(reinterpret_cast(&header), sizeof(file_header_t)); + + int64_t n_rows = static_cast(header.count); + int64_t n_cols = static_cast(header.dimension); + std::vector> chunk_host; + + for (int64_t row_offset = 0; row_offset < n_rows; row_offset += detail::DEFAULT_CHUNK_SIZE) { + int64_t current_chunk_rows = std::min(detail::DEFAULT_CHUNK_SIZE, n_rows - row_offset); + size_t total_chunk_elements = current_chunk_rows * n_cols; + chunk_host.resize(total_chunk_elements); + + auto src_chunk_view = raft::make_device_matrix_view(matrix.data_handle() + (row_offset * n_cols), current_chunk_rows, n_cols); + auto host_chunk_view = raft::make_host_matrix_view, int64_t>(chunk_host.data(), current_chunk_rows, n_cols); + + raft::copy(res, host_chunk_view, src_chunk_view); + raft::resource::sync_stream(res); + file.write(reinterpret_cast(chunk_host.data()), total_chunk_elements * sizeof(std::remove_const_t)); + } +} + +/** + * @brief Saves a host matrix to a binary file in the "MODF" format. 
+ */ +template +void save_host_matrix(const std::string& filename, + raft::host_matrix_view matrix) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file for writing: " + filename); + + file_header_t header; + std::memcpy(header.magic, "MODF", 4); + header.count = static_cast(matrix.extent(0)); + header.dimension = static_cast(matrix.extent(1)); + header.data_type_size = sizeof(std::remove_const_t); + file.write(reinterpret_cast(&header), sizeof(file_header_t)); + + if (matrix.size() > 0) { + file.write(reinterpret_cast(matrix.data_handle()), matrix.size() * sizeof(std::remove_const_t)); + } +} + +} // namespace matrixone diff --git a/cgo/cuvs/test/batching_test.cu b/cgo/cuvs/test/batching_test.cu new file mode 100644 index 0000000000000..c789e5ee12bcc --- /dev/null +++ b/cgo/cuvs/test/batching_test.cu @@ -0,0 +1,132 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "cagra.hpp" +#include "ivf_flat.hpp" +#include "ivf_pq.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +TEST(DynamicBatchingTest, CagraConcurrentSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / count; + + std::vector devices = {0}; + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 8, DistributionMode_SINGLE_GPU); + + index.set_use_batching(true); + index.start(); + index.build(); + + const int num_threads = 8; + std::vector> futures; + + for (int i = 0; i < num_threads; ++i) { + futures.push_back(std::async(std::launch::async, [&index, dimension, i]() { + std::vector query(dimension); + for (uint32_t j = 0; j < dimension; ++j) query[j] = (float)i / 10.0f; + cagra_search_params_t sp = cagra_search_params_default(); + return index.search(query.data(), 1, dimension, 5, sp); + })); + } + + for (auto& f : futures) { + auto res = f.get(); + ASSERT_EQ(res.neighbors.size(), (size_t)5); + } + + index.destroy(); +} + +TEST(DynamicBatchingTest, IvfFlatConcurrentSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / count; + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 10; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 8, DistributionMode_SINGLE_GPU); + + index.set_use_batching(true); + index.start(); + index.build(); + + const int num_threads = 8; + std::vector> futures; + + for (int i = 0; i < num_threads; ++i) { + 
futures.push_back(std::async(std::launch::async, [&index, dimension, i]() { + std::vector query(dimension); + for (uint32_t j = 0; j < dimension; ++j) query[j] = (float)i / 10.0f; + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + return index.search(query.data(), 1, dimension, 5, sp); + })); + } + + for (auto& f : futures) { + auto res = f.get(); + ASSERT_EQ(res.neighbors.size(), (size_t)5); + } + + index.destroy(); +} + +TEST(DynamicBatchingTest, IvfPqConcurrentSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / count; + + std::vector devices = {0}; + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 8, DistributionMode_SINGLE_GPU); + + index.set_use_batching(true); + index.start(); + index.build(); + + const int num_threads = 8; + std::vector> futures; + + for (int i = 0; i < num_threads; ++i) { + futures.push_back(std::async(std::launch::async, [&index, dimension, i]() { + std::vector query(dimension); + for (uint32_t j = 0; j < dimension; ++j) query[j] = (float)i / 10.0f; + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + return index.search(query.data(), 1, dimension, 5, sp); + })); + } + + for (auto& f : futures) { + auto res = f.get(); + ASSERT_EQ(res.neighbors.size(), (size_t)5); + } + + index.destroy(); +} diff --git a/cgo/cuvs/test/brute_force_test.cu b/cgo/cuvs/test/brute_force_test.cu new file mode 100644 index 0000000000000..1d641b6ac088b --- /dev/null +++ b/cgo/cuvs/test/brute_force_test.cu @@ -0,0 +1,219 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "brute_force.hpp" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +// --- Helper to convert float to half --- +static std::vector float_to_half(const std::vector& src) { + std::vector dst(src.size()); + for (size_t i = 0; i < src.size(); ++i) { + dst[i] = __float2half(src[i]); + } + return dst; +} + +// --- GpuBruteForceTest --- + +TEST(GpuBruteForceTest, BasicLoadAndSearch) { + const uint32_t dimension = 3; + const uint64_t count = 2; + std::vector dataset = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries = {1.0, 2.0, 3.0}; + auto result = index.search(queries.data(), 1, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.distances[0], 0.0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithMultipleQueries) { + const uint32_t dimension = 4; + const uint64_t count = 4; + std::vector dataset = { + 1.0, 0.0, 0.0, 0.0, // ID 0 + 0.0, 1.0, 0.0, 0.0, // ID 1 + 0.0, 0.0, 1.0, 0.0, // ID 2 + 0.0, 0.0, 0.0, 1.0 // ID 3 + }; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries = { + 1.0, 0.0, 0.0, 0.0, // Should match ID 0 + 0.0, 0.0, 1.0, 0.0 // Should match ID 2 + }; + auto result = index.search(queries.data(), 2, 
dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.neighbors[1], 2u); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithFloat16) { + const uint32_t dimension = 2; + const uint64_t count = 2; + std::vector f_dataset = {1.0, 1.0, 2.0, 2.0}; + std::vector h_dataset = float_to_half(f_dataset); + + gpu_brute_force_t index(h_dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector f_queries = {1.0, 1.0}; + std::vector h_queries = float_to_half(f_queries); + auto result = index.search(h_queries.data(), 1, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.distances[0], 0.0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithInnerProduct) { + const uint32_t dimension = 2; + const uint64_t count = 2; + std::vector dataset = { + 1.0, 0.0, + 0.0, 1.0 + }; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::InnerProduct, 1, 0); + index.start(); + index.build(); + + std::vector queries = {1.0, 0.0}; + auto result = index.search(queries.data(), 1, dimension, 2); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.neighbors[1], 1u); + + // dot product should be 1.0 for exact match + ASSERT_TRUE(std::abs(result.distances[0] - 1.0) < 1e-5); + ASSERT_TRUE(std::abs(result.distances[1] - 0.0) < 1e-5); + + index.destroy(); +} + +TEST(GpuBruteForceTest, EmptyDataset) { + const uint32_t dimension = 128; + const uint64_t count = 0; + + gpu_brute_force_t index(nullptr, count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.build(); + + std::vector queries(dimension, 0.0); + auto result = index.search(queries.data(), 1, dimension, 5); + + ASSERT_EQ(result.neighbors.size(), (size_t)0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, 
LargeLimit) { + const uint32_t dimension = 2; + const uint64_t count = 5; + std::vector dataset(count * dimension, 1.0); + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries(dimension, 1.0); + uint32_t limit = 10; + auto result = index.search(queries.data(), 1, dimension, limit); + + ASSERT_EQ(result.neighbors.size(), (size_t)limit); + for (int i = 0; i < 5; ++i) ASSERT_GE(result.neighbors[i], 0); + for (int i = 5; i < 10; ++i) ASSERT_EQ((int64_t)result.neighbors[i], (int64_t)-1); + + index.destroy(); +} + +// --- CuvsWorkerTest --- + +TEST(CuvsWorkerTest, BruteForceSearch) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads, 0); // Added device_id + worker.start(); + + const uint32_t dimension = 128; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries = std::vector(dataset.begin(), dataset.begin() + dimension); + auto result = index.search(queries.data(), 1, dimension, 5); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); + worker.stop(); +} + +TEST(CuvsWorkerTest, ConcurrentSearches) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + // Use very distinct values to ensure unique neighbors + for (size_t i = 0; i < count; ++i) { + for (size_t j = 0; j < dimension; ++j) { + dataset[i * dimension + j] = (float)i * 100.0f + (float)j; + } + } + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 4, 0); + index.start(); + index.build(); + + const int num_threads = 4; + std::vector> futures; + for (int i = 0; i < 
num_threads; ++i) { + futures.push_back(std::async(std::launch::async, [&index, dimension, &dataset, i]() { + std::vector query = std::vector(dataset.begin() + i * dimension, dataset.begin() + (i + 1) * dimension); + auto res = index.search(query.data(), 1, dimension, 1); + ASSERT_EQ(res.neighbors[0], (int64_t)i); + })); + } + + for (auto& f : futures) f.get(); + + index.destroy(); +} diff --git a/cgo/cuvs/test/cagra_test.cu b/cgo/cuvs/test/cagra_test.cu new file mode 100644 index 0000000000000..641ba5fbe1006 --- /dev/null +++ b/cgo/cuvs/test/cagra_test.cu @@ -0,0 +1,134 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "cagra.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuCagraTest, BasicLoadAndSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + std::vector devices = {0}; + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuCagraTest, SaveAndLoadFromFile) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + std::string filename = "test_cagra.bin"; + std::vector devices = {0}; + + // 1. Build and Save + { + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + index.save(filename); + index.destroy(); + } + + // 2. 
Load and Search + { + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuCagraTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuCagraTest, ReplicatedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + cagra_build_params_t bp = cagra_build_params_default(); + 
gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_REPLICATED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} diff --git a/cgo/cuvs/test/distance_test.cu b/cgo/cuvs/test/distance_test.cu new file mode 100644 index 0000000000000..c0558bf4997b7 --- /dev/null +++ b/cgo/cuvs/test/distance_test.cu @@ -0,0 +1,105 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "distance.hpp" +#include "test_framework.hpp" +#include +#include +#include +#include +#include + +using namespace matrixone; + +#define ASSERT_NEAR(val1, val2, abs_error) ASSERT_TRUE(std::abs((val1) - (val2)) <= (abs_error)) + +TEST(PairwiseDistanceTest, BasicF32) { + const uint32_t dimension = 3; + const uint64_t n_x = 2; + const uint64_t n_y = 2; + + std::vector x = { + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0 + }; + std::vector y = { + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0 + }; + + std::vector dist(n_x * n_y); + const raft::resources& res = get_raft_resources(); + + pairwise_distance(res, x.data(), n_x, y.data(), n_y, dimension, cuvs::distance::DistanceType::L2Expanded, dist.data()); + + // Expected results for L2Squared: + // dist[0,0] = (1-1)^2 + (0-0)^2 + (0-0)^2 = 0 + // dist[0,1] = (1-0)^2 + (0-1)^2 + (0-0)^2 = 2 + // dist[1,0] = (0-1)^2 + (1-0)^2 + (0-0)^2 = 2 + // dist[1,1] = (0-0)^2 + (1-1)^2 + (0-0)^2 = 0 + + ASSERT_NEAR(dist[0], 0.0f, 1e-5f); + ASSERT_NEAR(dist[1], 2.0f, 1e-5f); + ASSERT_NEAR(dist[2], 2.0f, 1e-5f); + ASSERT_NEAR(dist[3], 0.0f, 1e-5f); +} + +TEST(PairwiseDistanceTest, BasicF16) { + const uint32_t dimension = 2; + const uint64_t n_x = 1; + const uint64_t n_y = 1; + + std::vector x = {__float2half(1.0f), __float2half(2.0f)}; + std::vector y = {__float2half(1.0f), __float2half(2.0f)}; + + std::vector dist(n_x * n_y); + const raft::resources& res = get_raft_resources(); + + pairwise_distance(res, x.data(), n_x, y.data(), n_y, dimension, cuvs::distance::DistanceType::L2Expanded, dist.data()); + + ASSERT_NEAR(dist[0], 0.0f, 1e-3f); +} + +TEST(PairwiseDistanceTest, InnerProductF32) { + const uint32_t dimension = 2; + const uint64_t n_x = 2; + const uint64_t n_y = 2; + + std::vector x = { + 1.0, 0.0, + 0.0, 1.0 + }; + std::vector y = { + 1.0, 0.0, + 0.0, 1.0 + }; + + std::vector dist(n_x * n_y); + const raft::resources& res = get_raft_resources(); + + pairwise_distance(res, x.data(), n_x, y.data(), n_y, dimension, 
cuvs::distance::DistanceType::InnerProduct, dist.data()); + + // Inner product: + // dist[0,0] = 1*1 + 0*0 = 1 + // dist[0,1] = 1*0 + 0*1 = 0 + // dist[1,0] = 0*1 + 1*0 = 0 + // dist[1,1] = 0*0 + 1*1 = 1 + + ASSERT_NEAR(dist[0], 1.0f, 1e-5f); + ASSERT_NEAR(dist[1], 0.0f, 1e-5f); + ASSERT_NEAR(dist[2], 0.0f, 1e-5f); + ASSERT_NEAR(dist[3], 1.0f, 1e-5f); +} diff --git a/cgo/cuvs/test/ivf_flat_test.cu b/cgo/cuvs/test/ivf_flat_test.cu new file mode 100644 index 0000000000000..4088c209dc4b7 --- /dev/null +++ b/cgo/cuvs/test/ivf_flat_test.cu @@ -0,0 +1,174 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "ivf_flat.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuIvfFlatTest, BasicLoadSearchAndCenters) { + const uint32_t dimension = 2; + const uint64_t count = 4; + std::vector dataset = { + 1.0, 1.0, + 1.1, 1.1, + 100.0, 100.0, + 101.0, 101.0 + }; + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + // Verify centers + auto centers = index.get_centers(); + ASSERT_EQ(centers.size(), (size_t)(2 * dimension)); + TEST_LOG("IVF-Flat Centers: " << centers[0] << ", " << centers[1]); + + std::vector queries = {1.05, 1.05}; + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + // Should be either 0 or 1 + ASSERT_TRUE(result.neighbors[0] == 0 || result.neighbors[0] == 1); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, SaveAndLoadFromFile) { + const uint32_t dimension = 2; + const uint64_t count = 4; + std::vector dataset = {1.0, 1.0, 1.1, 1.1, 100.0, 100.0, 101.0, 101.0}; + std::string filename = "test_ivf_flat.bin"; + std::vector devices = {0}; + + // 1. Build and Save + { + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + index.save(filename); + index.destroy(); + } + + // 2. 
Load and Search + { + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries = {100.5, 100.5}; + + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_TRUE(result.neighbors[0] == 2 || result.neighbors[0] == 3); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuIvfFlatTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / dataset.size(); + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 5; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.start(); + index.build(); + + auto centers = index.get_centers(); + ASSERT_EQ(centers.size(), (size_t)(5 * dimension)); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, ReplicatedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), 
dev_count); + + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 10; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_REPLICATED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, SetGetQuantizer) { + const uint32_t dimension = 4; + const uint64_t count = 10; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + std::vector devices = {0}; + + gpu_ivf_flat_t index(count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + + float min = -1.5f; + float max = 2.5f; + index.set_quantizer(min, max); + + float gMin = 0, gMax = 0; + index.get_quantizer(&gMin, &gMax); + + ASSERT_EQ(min, gMin); + ASSERT_EQ(max, gMax); + + index.destroy(); +} + diff --git a/cgo/cuvs/test/ivf_pq_test.cu b/cgo/cuvs/test/ivf_pq_test.cu new file mode 100644 index 0000000000000..d5bf0abb337af --- /dev/null +++ b/cgo/cuvs/test/ivf_pq_test.cu @@ -0,0 +1,201 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "ivf_pq.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuIvfPqTest, BasicLoadSearchAndCenters) { + const uint32_t dimension = 16; + const uint64_t count = 4; + std::vector dataset(count * dimension); + for (size_t i = 0; i < count; ++i) { + for (size_t j = 0; j < dimension; ++j) { + dataset[i * dimension + j] = (float)i; + } + } + + std::vector devices = {0}; + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 2; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + // Verify centers + auto centers = index.get_centers(); + ASSERT_TRUE(centers.size() % index.get_n_list() == 0); + ASSERT_EQ(centers.size(), (size_t)(index.get_n_list() * index.get_dim_ext())); + + std::vector queries(dimension); + for (size_t j = 0; j < dimension; ++j) queries[j] = 0.9f; + + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + // Should be either 0 or 1 + ASSERT_TRUE(result.neighbors[0] == 0 || result.neighbors[0] == 1); + + index.destroy(); +} + +TEST(GpuIvfPqTest, SaveAndLoadFromFile) { + const uint32_t dimension = 4; + const uint64_t count = 4; + std::vector dataset = { + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 10.0, 10.0, 10.0, 10.0, + 11.0, 11.0, 11.0, 11.0 + }; + std::string filename = "test_ivf_pq.bin"; + std::vector devices = {0}; + + // 1. 
Build and Save + { + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 2; + bp.m = 2; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + index.save(filename); + index.destroy(); + } + + // 2. Load and Search + { + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 2; + bp.m = 2; + gpu_ivf_pq_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries = {10.5, 10.5, 10.5, 10.5}; + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_TRUE(result.neighbors[0] == 2 || result.neighbors[0] == 3); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuIvfPqTest, BuildFromDataFile) { + const uint32_t dimension = 8; + const uint64_t count = 100; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) { + dataset[i] = static_cast(i % 10); + } + + std::string data_filename = "test_dataset.modf"; + { + // Use our utility to save the dataset in MODF format + raft::resources res; + auto matrix = raft::make_host_matrix(count, dimension); + std::copy(dataset.begin(), dataset.end(), matrix.data_handle()); + save_host_matrix(data_filename, matrix.view()); + } + + std::vector devices = {0}; + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 4; + + gpu_ivf_pq_t index(data_filename, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + ASSERT_EQ(index.get_dim(), dimension); + ASSERT_EQ(index.count, static_cast(count)); + + std::vector queries(dimension, 0.0f); + ivf_pq_search_params_t sp = 
ivf_pq_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 1, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + + index.destroy(); + std::remove(data_filename.c_str()); +} + +TEST(GpuIvfPqTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuIvfPqTest, ReplicatedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_REPLICATED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + auto result 
= index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} diff --git a/cgo/cuvs/test/kmeans_test.cu b/cgo/cuvs/test/kmeans_test.cu new file mode 100644 index 0000000000000..4b4b34bfe9587 --- /dev/null +++ b/cgo/cuvs/test/kmeans_test.cu @@ -0,0 +1,89 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "kmeans.hpp" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +TEST(GpuKMeansTest, BasicFitAndPredict) { + const uint32_t n_clusters = 3; + const uint32_t dimension = 2; + const uint64_t n_samples = 9; + + // Create 3 clusters of points + std::vector dataset = { + 0.1f, 0.1f, 0.0f, 0.2f, 0.2f, 0.0f, // Cluster 0 + 10.1f, 10.1f, 10.0f, 10.2f, 10.2f, 10.0f, // Cluster 1 + 20.1f, 20.1f, 20.0f, 20.2f, 20.2f, 20.0f // Cluster 2 + }; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + kmeans.start(); + + auto fit_res = kmeans.fit(dataset.data(), n_samples); + ASSERT_GE(fit_res.n_iter, 1); + + auto predict_res = kmeans.predict(dataset.data(), n_samples); + ASSERT_EQ(predict_res.labels.size(), (size_t)n_samples); + + // Since we use balanced_params, it might prioritize balancing cluster sizes over spatial distance + // on very small datasets. 
We just check that all labels are within range [0, nClusters). + for (size_t i = 0; i < n_samples; ++i) { + ASSERT_TRUE(predict_res.labels[i] >= 0 && predict_res.labels[i] < (int64_t)n_clusters); + } + + kmeans.destroy(); +} + +TEST(GpuKMeansTest, FitPredict) { + const uint32_t n_clusters = 2; + const uint32_t dimension = 4; + const uint64_t n_samples = 10; + std::vector dataset(n_samples * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + kmeans.start(); + + auto res = kmeans.fit_predict(dataset.data(), n_samples); + ASSERT_EQ(res.labels.size(), (size_t)n_samples); + ASSERT_GE(res.n_iter, 1); + + kmeans.destroy(); +} + +TEST(GpuKMeansTest, GetCentroids) { + const uint32_t n_clusters = 5; + const uint32_t dimension = 8; + const uint64_t n_samples = 50; + std::vector dataset(n_samples * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + kmeans.start(); + + kmeans.fit(dataset.data(), n_samples); + auto centroids = kmeans.get_centroids(); + + ASSERT_EQ(centroids.size(), (size_t)(n_clusters * dimension)); + + kmeans.destroy(); +} diff --git a/cgo/cuvs/test/main_test.cu b/cgo/cuvs/test/main_test.cu new file mode 100644 index 0000000000000..3a9c373b90031 --- /dev/null +++ b/cgo/cuvs/test/main_test.cu @@ -0,0 +1,377 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +thread_local bool current_test_failed = false; + +// --- thread_safe_queue_t Tests --- + +TEST(ThreadSafeQueueTest, BasicPushPop) { + thread_safe_queue_t q; + q.push(1); + q.push(2); + + int val; + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 1); + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 2); +} + +TEST(ThreadSafeQueueTest, PopEmptyBlocking) { + thread_safe_queue_t q; + int val = 0; + + auto fut = std::async(std::launch::async, [&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + q.push(42); + }); + + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 42); +} + +TEST(ThreadSafeQueueTest, StopQueue) { + thread_safe_queue_t q; + int val; + + auto fut = std::async(std::launch::async, [&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + q.stop(); + }); + + ASSERT_FALSE(q.pop(val)); // Should return false after stop + ASSERT_TRUE(q.is_stopped()); +} + +TEST(ThreadSafeQueueTest, PushBlocking) { + thread_safe_queue_t q; + q.set_capacity(2); + + q.push(1); + q.push(2); + + std::atomic pushed_third{false}; + std::thread t([&]() { + q.push(3); // Should block + pushed_third.store(true); + }); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ASSERT_FALSE(pushed_third.load()); + + int val; + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 1); + + // Now the third push should unblock + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ASSERT_TRUE(pushed_third.load()); + + 
ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 2); + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 3); + + t.join(); +} + +TEST(ThreadSafeQueueTest, ProducerConsumerStress) { + thread_safe_queue_t q; + q.set_capacity(10); + const int num_producers = 4; + const int num_consumers = 4; + const int items_per_producer = 1000; + + std::atomic sum_pushed{0}; + std::atomic sum_popped{0}; + std::atomic count_popped{0}; + + auto producer = [&]() { + for (int i = 0; i < items_per_producer; ++i) { + q.push(1); + sum_pushed.fetch_add(1); + } + }; + + auto consumer = [&]() { + int val; + while (q.pop(val)) { + sum_popped.fetch_add(val); + count_popped.fetch_add(1); + if (count_popped.load() == num_producers * items_per_producer) { + q.stop(); + } + } + }; + + std::vector threads; + for (int i = 0; i < num_producers; ++i) threads.emplace_back(producer); + for (int i = 0; i < num_consumers; ++i) threads.emplace_back(consumer); + + for (auto& t : threads) t.join(); + + ASSERT_EQ(sum_pushed.load(), sum_popped.load()); + ASSERT_EQ(count_popped.load(), num_producers * items_per_producer); +} + +TEST(ThreadSafeQueueTest, StopUnblocksProducer) { + thread_safe_queue_t q; + q.set_capacity(1); + q.push(1); + + std::atomic push_exited{false}; + std::thread t([&]() { + q.push(2); // Blocks + push_exited.store(true); + }); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ASSERT_FALSE(push_exited.load()); + + q.stop(); + t.join(); + ASSERT_TRUE(push_exited.load()); +} + +// --- cuvs_task_result_store_t Tests --- + +TEST(CuvsTaskResultStoreTest, BasicStoreRetrieve) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + + cuvs_task_result_t res{id, 100, nullptr}; + store.store(res); + + auto fut = store.wait(id); + auto retrieved = fut.get(); + ASSERT_EQ(std::any_cast(retrieved.result), 100); +} + +TEST(CuvsTaskResultStoreTest, AsyncWait) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + + auto fut = store.wait(id); + + std::thread 
t([&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + store.store({id, std::string("async"), nullptr}); + }); + + auto retrieved = fut.get(); + ASSERT_EQ(std::any_cast(retrieved.result), std::string("async")); + t.join(); +} + +TEST(CuvsTaskResultStoreTest, StopStore) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + auto fut = store.wait(id); + + store.stop(); + + ASSERT_THROW(fut.get(), std::runtime_error); +} + +// --- raft_handle_wrapper_t and is_snmg_handle Tests --- + +TEST(RaftHandleWrapperTest, DetectSingleGpu) { + std::vector devices = {0}; + raft_handle_wrapper_t wrapper(devices, false); // force_mg = false + ASSERT_FALSE(is_snmg_handle(wrapper.get_raft_resources())); +} + +TEST(RaftHandleWrapperTest, DetectMultiGpuForced) { + std::vector devices = {0}; + raft_handle_wrapper_t wrapper(devices, true); // force_mg = true + ASSERT_TRUE(is_snmg_handle(wrapper.get_raft_resources())); +} + +// --- cuvs_worker_t Tests --- + +TEST(CuvsWorkerTest, BasicLifecycle) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + worker.stop(); +} + +TEST(CuvsWorkerTest, SubmitTask) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + + auto task = [](raft_handle_wrapper_t&) -> std::any { + return std::string("success"); + }; + + uint64_t job_id = worker.submit(task); + auto result = worker.wait(job_id).get(); + + ASSERT_EQ(std::any_cast(result.result), std::string("success")); + + worker.stop(); +} + +TEST(CuvsWorkerTest, MultipleThreads) { + uint32_t n_threads = 4; + cuvs_worker_t worker(n_threads); + worker.start(); + + std::vector ids; + for (int i = 0; i < 10; ++i) { + ids.push_back(worker.submit([i](raft_handle_wrapper_t&) -> std::any { + return i * 2; + })); + } + + for (int i = 0; i < 10; ++i) { + auto res = worker.wait(ids[i]).get(); + ASSERT_EQ(std::any_cast(res.result), i * 2); + } + + worker.stop(); +} + +TEST(CuvsWorkerTest, TaskErrorHandling) { + uint32_t 
n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + + auto fail_task = [](raft_handle_wrapper_t&) -> std::any { + throw std::runtime_error("task failed intentionally"); + }; + + uint64_t job_id = worker.submit(fail_task); + auto result = worker.wait(job_id).get(); + + ASSERT_TRUE(result.error != nullptr); + ASSERT_TRUE(has_exception(result.error)); + + worker.stop(); +} + +TEST(CuvsWorkerTest, SubmitMain) { + uint32_t n_threads = 2; + cuvs_worker_t worker(n_threads); + worker.start(); + + // Task that identifies the thread it's running on + auto task = [](raft_handle_wrapper_t&) -> std::any { + return std::this_thread::get_id(); + }; + + // Submit many tasks to main to ensure they are picked up + std::vector ids; + for(int i=0; i<10; ++i) { + ids.push_back(worker.submit_main(task)); + } + + for(auto id : ids) { + auto res = worker.wait(id).get(); + ASSERT_TRUE(res.error == nullptr); + } + + worker.stop(); +} + +TEST(CuvsWorkerTest, BoundedQueueStress) { + const uint32_t n_workers = 4; + const uint32_t n_producers = 4; + const uint32_t tasks_per_producer = 500; + + cuvs_worker_t worker(n_workers); + worker.start(); + + std::atomic tasks_completed{0}; + auto task = [&](raft_handle_wrapper_t&) -> std::any { + tasks_completed.fetch_add(1); + // Small sleep to ensure queue builds up + std::this_thread::sleep_for(std::chrono::microseconds(10)); + return std::any(); + }; + + std::vector producers; + for (uint32_t i = 0; i < n_producers; ++i) { + producers.emplace_back([&, i]() { + for (uint32_t j = 0; j < tasks_per_producer; ++j) { + // Mix of submit and submit_main + if ((i + j) % 2 == 0) { + worker.submit(task); + } else { + worker.submit_main(task); + } + } + }); + } + + for (auto& t : producers) t.join(); + + // Wait for all tasks to complete (since we didn't keep track of IDs here for simplicity, + // we just check the counter) + const uint32_t total_tasks = n_producers * tasks_per_producer; + auto start_time = std::chrono::steady_clock::now(); + 
while (tasks_completed.load() < total_tasks) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + if (std::chrono::steady_clock::now() - start_time > std::chrono::seconds(10)) { + REPORT_FAILURE("BoundedQueueStress timed out - possible hang"); + } + } + + ASSERT_EQ(tasks_completed.load(), total_tasks); + worker.stop(); +} + +TEST(CuvsWorkerTest, StopUnderLoad) { + const uint32_t n_workers = 4; + cuvs_worker_t worker(n_workers); + worker.start(); + + std::atomic producer_should_stop{false}; + std::thread producer([&]() { + auto task = [](raft_handle_wrapper_t&) -> std::any { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + return std::any(); + }; + while (!producer_should_stop.load()) { + try { + worker.submit(task); + } catch (...) { + // Expected when worker stops + break; + } + } + }); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Stop the worker while tasks are being submitted/processed + worker.stop(); + + producer_should_stop.store(true); + if (producer.joinable()) producer.join(); +} + +int main() { + return RUN_ALL_TESTS(); +} diff --git a/cgo/cuvs/test/quantize_test.cu b/cgo/cuvs/test/quantize_test.cu new file mode 100644 index 0000000000000..fcb7bbf3a194c --- /dev/null +++ b/cgo/cuvs/test/quantize_test.cu @@ -0,0 +1,330 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "quantize.hpp" +#include "test_framework.hpp" +#include +#include +#include +#include +#include + +using namespace matrixone; + +TEST(UtilsTest, SaveLoadHostMatrix) { + const std::string filename = "test_host_matrix.modf"; + const int64_t count = 10; + const int64_t dimension = 4; + + auto matrix = raft::make_host_matrix(count, dimension); + for (int64_t i = 0; i < count * dimension; ++i) { + matrix.data_handle()[i] = static_cast(i); + } + + // Save + ASSERT_NO_THROW(save_host_matrix(filename, matrix.view())); + + // Load + auto loaded_matrix = load_host_matrix(filename); + + // Verify + ASSERT_EQ(loaded_matrix.extent(0), count); + ASSERT_EQ(loaded_matrix.extent(1), dimension); + + for (int64_t i = 0; i < count * dimension; ++i) { + ASSERT_EQ(loaded_matrix.data_handle()[i], static_cast(i)); + } + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, SaveLoadDeviceMatrix) { + raft::resources res; + const std::string filename = "test_device_matrix.modf"; + const int64_t count = 5; + const int64_t dimension = 3; + + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = static_cast(i) * 1.1f; + } + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // Save + ASSERT_NO_THROW(save_device_matrix(res, filename, matrix.view())); + + // Load + auto loaded_matrix = load_device_matrix(res, filename); + + // Verify + ASSERT_EQ(loaded_matrix.extent(0), count); + ASSERT_EQ(loaded_matrix.extent(1), dimension); + + std::vector loaded_host_data(count * dimension); + raft::copy(loaded_host_data.data(), loaded_matrix.data_handle(), loaded_host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (size_t i = 0; i < host_data.size(); ++i) { + ASSERT_EQ(loaded_host_data[i], host_data[i]); + } + + 
std::remove(filename.c_str()); +} + +TEST(UtilsTest, SaveLoadDeviceMatrixOverload) { + raft::resources res; + const std::string filename = "test_device_matrix_overload.modf"; + const int64_t count = 3; + const int64_t dimension = 2; + + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // Save + save_device_matrix(res, filename, matrix.view()); + + // Load using overload + uint64_t loaded_count = 0; + uint64_t loaded_dimension = 0; + // We must initialize device_matrix with some dimensions if we want to declare it, + // but the overload will re-assign it. + // Actually, the simplest is to just use the returned value or if we must use the overload reference: + auto loaded_matrix = raft::make_device_matrix(res, 0, 0); + load_device_matrix(res, filename, loaded_matrix, loaded_count, loaded_dimension); + + // Verify + ASSERT_EQ(loaded_count, (uint64_t)count); + ASSERT_EQ(loaded_dimension, (uint64_t)dimension); + ASSERT_EQ(loaded_matrix.extent(0), count); + ASSERT_EQ(loaded_matrix.extent(1), dimension); + + std::vector loaded_host_data(count * dimension); + raft::copy(loaded_host_data.data(), loaded_matrix.data_handle(), loaded_host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (size_t i = 0; i < host_data.size(); ++i) { + ASSERT_EQ(loaded_host_data[i], host_data[i]); + } + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, LoadWithQuantization) { + raft::resources res; + const std::string filename = "test_quantization.modf"; + const int64_t count = 100; + const int64_t dimension = 8; + + // 1. 
Create and save float data + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) { + // Values between -1.0 and 1.0 to make quantization meaningful + host_data[i] = static_cast(i % 100) / 50.0f - 1.0f; + } + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + save_device_matrix(res, filename, matrix.view()); + + // 2. Load as int8_t (should trigger quantization) + auto quantized_matrix = load_device_matrix(res, filename); + + // 3. Verify metadata + ASSERT_EQ(quantized_matrix.extent(0), count); + ASSERT_EQ(quantized_matrix.extent(1), dimension); + + // 4. Basic check that data is loaded + std::vector result_host(count * dimension); + raft::copy(result_host.data(), quantized_matrix.data_handle(), result_host.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // We don't check exact values as quantization is lossy, but it should not be all zeros if input wasn't + bool non_zero = false; + for (auto v : result_host) if (v != 0) non_zero = true; + ASSERT_TRUE(non_zero); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, FloatToHalfConversion) { + raft::resources res; + const std::string filename = "test_f32_to_f16.modf"; + const int64_t count = 10; + const int64_t dimension = 4; + + // 1. Save float data + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) host_data[i] = static_cast(i); + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + save_device_matrix(res, filename, matrix.view()); + + // 2. Load as half (should trigger conversion) + auto half_matrix = load_device_matrix(res, filename); + + // 3. 
Verify + ASSERT_EQ(half_matrix.extent(0), count); + ASSERT_EQ(half_matrix.extent(1), dimension); + + std::vector result_host(count * dimension); + raft::copy(result_host.data(), half_matrix.data_handle(), result_host.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (size_t i = 0; i < host_data.size(); ++i) { + ASSERT_EQ(static_cast(result_host[i]), host_data[i]); + } + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, HalfToUint8Quantization) { + raft::resources res; + const std::string filename = "test_f16_to_u8.modf"; + const int64_t count = 100; + const int64_t dimension = 8; + + // 1. Save half data + auto matrix = raft::make_host_matrix(count, dimension); + for (size_t i = 0; i < count * dimension; ++i) { + matrix.data_handle()[i] = static_cast(static_cast(i % 100) / 100.0f); + } + save_host_matrix(filename, matrix.view()); + + // 2. Load as uint8_t (should trigger quantization from half) + auto u8_matrix = load_device_matrix(res, filename); + + // 3. 
Verify + ASSERT_EQ(u8_matrix.extent(0), count); + ASSERT_EQ(u8_matrix.extent(1), dimension); + + std::vector result_host(count * dimension); + raft::copy(result_host.data(), u8_matrix.data_handle(), result_host.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + bool non_zero = false; + for (auto v : result_host) if (v != 0) non_zero = true; + ASSERT_TRUE(non_zero); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, LoadInvalidMagic) { + const std::string filename = "invalid_magic.modf"; + std::ofstream file(filename, std::ios::binary); + file.write("NOTM", 4); + file.close(); + + ASSERT_THROW(load_host_matrix(filename), std::runtime_error); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, LoadTypeSizeMismatch) { + const std::string filename = "size_mismatch.modf"; + file_header_t header; + std::memcpy(header.magic, "MODF", 4); + header.count = 1; + header.dimension = 1; + header.data_type_size = 8; // Double size + + std::ofstream file(filename, std::ios::binary); + file.write(reinterpret_cast(&header), sizeof(file_header_t)); + file.close(); + + // Try to load as float (size 4) should throw + ASSERT_THROW(load_host_matrix(filename), std::runtime_error); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, ScalarQuantizerLifecycle) { + raft::resources res; + const int64_t count = 100; + const int64_t dimension = 8; + + // 1. Train + scalar_quantizer_t quantizer; + ASSERT_FALSE(quantizer.is_trained()); + + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = static_cast(i % 100) / 50.0f - 1.0f; // range [-1, 0.98] + } + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + quantizer.train(res, matrix.view()); + ASSERT_TRUE(quantizer.is_trained()); + + // 2. 
Getters + float q_min = quantizer.min(); + float q_max = quantizer.max(); + // Default quantile is 1.0, so it should be exactly -1.0 and 0.98 + ASSERT_TRUE(std::abs(q_min - (-1.0f)) < 1e-5f); + ASSERT_TRUE(std::abs(q_max - 0.98f) < 1e-5f); + + // 3. Constructor + scalar_quantizer_t quantizer2(q_min, q_max); + ASSERT_TRUE(quantizer2.is_trained()); + ASSERT_EQ(quantizer2.min(), q_min); + ASSERT_EQ(quantizer2.max(), q_max); + + // 4. Save/Load + const std::string filename = "test_quantizer.bin"; + quantizer.save_to_file(filename); + + scalar_quantizer_t quantizer3; + quantizer3.load_from_file(filename); + ASSERT_TRUE(quantizer3.is_trained()); + ASSERT_EQ(quantizer3.min(), q_min); + ASSERT_EQ(quantizer3.max(), q_max); + std::remove(filename.c_str()); + + // 5. Serialize/Deserialize + std::stringstream ss; + quantizer.serialize(ss); + + scalar_quantizer_t quantizer4; + quantizer4.deserialize(ss); + ASSERT_TRUE(quantizer4.is_trained()); + ASSERT_EQ(quantizer4.min(), q_min); + ASSERT_EQ(quantizer4.max(), q_max); + + // 6. SetQuantizer + scalar_quantizer_t quantizer5; + quantizer5.set_quantizer(0.1f, 0.9f); + ASSERT_TRUE(quantizer5.is_trained()); + ASSERT_EQ(quantizer5.min(), 0.1f); + ASSERT_EQ(quantizer5.max(), 0.9f); + + // 7. Getters again + ASSERT_EQ(quantizer5.min(), 0.1f); + ASSERT_EQ(quantizer5.max(), 0.9f); + + // 8. Transform + std::vector result_host(count * dimension); + quantizer.transform(res, matrix.view(), result_host.data(), false); + + bool non_zero = false; + for (auto v : result_host) if (v != 0) non_zero = true; + ASSERT_TRUE(non_zero); +} + diff --git a/cgo/cuvs/test/test_framework.hpp b/cgo/cuvs/test/test_framework.hpp new file mode 100644 index 0000000000000..f995f514686da --- /dev/null +++ b/cgo/cuvs/test/test_framework.hpp @@ -0,0 +1,150 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include // For std::iota +#include // For std::async +#include +#include +#include +#include // For building string messages +#include // For std::sort +#include // For std::any comparisons in assertions + +// --- Minimal Custom Test Framework (Stub for compilation) --- + +// Logging - minimal versions +#define TEST_LOG(msg) std::cout << "[INFO ] " << msg << std::endl +#define TEST_ERROR(msg) std::cerr << "[ERROR ] " << msg << std::endl + +// Global flag to indicate if the current test has failed (kept minimal) +extern thread_local bool current_test_failed; + +// Helper to build string messages for assertions (handles various types) +template +std::string to_string_for_assertion(const T& val) { + std::ostringstream oss; + oss << val; + return oss.str(); +} +inline std::string to_string_for_assertion(const std::any&) { return "std::any"; } // Simplified +inline std::string to_string_for_assertion(const char* val) { return std::string(val); } + +// Helper to check if an exception_ptr holds a specific exception type (kept minimal) +template +inline bool has_exception(const std::exception_ptr& ep) { + if (!ep) return false; + try { + std::rethrow_exception(ep); + } catch (const E& e) { + return true; + } catch (...) 
{ + return false; + } +} + +// Assertions - simplified to just return/log if condition is false +#define REPORT_FAILURE(msg_str) do { TEST_ERROR(msg_str); current_test_failed = true; return; } while (0) +#define ASSERT_TRUE(condition) do { if (!(condition)) { REPORT_FAILURE("ASSERT_TRUE failed: " #condition); } } while (0) +#define ASSERT_FALSE(condition) ASSERT_TRUE(!(condition)) +#define ASSERT_EQ(val1, val2) do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "ASSERT_EQ failed: " << #val1 << " (" << v1 << ") vs " << #val2 << " (" << v2 << ")"; \ + REPORT_FAILURE(oss.str()); \ + } \ +} while (0) +#define ASSERT_NE(val1, val2) do { if (!((val1) != (val2))) { REPORT_FAILURE("ASSERT_NE failed: " #val1 " vs " #val2); } } while (0) +#define ASSERT_GE(val1, val2) do { if (!((val1) >= (val2))) { REPORT_FAILURE("ASSERT_GE failed: " #val1 " vs " #val2); } } while (0) +#define ASSERT_THROW(statement, expected_exception) do { bool caught = false; try { statement; } catch (const expected_exception&) { caught = true; } if (!caught) { REPORT_FAILURE("ASSERT_THROW failed"); } } while (0) +#define ASSERT_NO_THROW(statement) do { try { statement; } catch (...) 
{ REPORT_FAILURE("ASSERT_NO_THROW failed"); } } while (0) + +// Test registration +struct TestCase { + std::string name; + std::function func; + bool failed = false; +}; + +inline std::vector& get_test_cases() { + static std::vector test_cases; + return test_cases; +} + +// Simplified TEST macro for compilation +#define TEST(suite, name) \ + static void test_func_##suite##_##name(); \ + struct RegisterTest_##suite##_##name { \ + RegisterTest_##suite##_##name() { \ + get_test_cases().push_back({#suite "::" #name, test_func_##suite##_##name}); \ + } \ + }; \ + static RegisterTest_##suite##_##name register_test_##suite##_##name; \ + static void test_func_##suite##_##name() + +inline int RUN_ALL_TESTS() { + int passed_count = 0; + int failed_count = 0; + TEST_LOG("Running " << get_test_cases().size() << " tests (minimal framework)..."); + + for (auto& test_case : get_test_cases()) { + current_test_failed = false; // Reset for each test + TEST_LOG("[ RUN ] " << test_case.name); + try { + test_case.func(); + } catch (const std::exception& e) { + TEST_ERROR("Test threw unhandled exception: " << e.what()); + current_test_failed = true; + } catch (...) 
{ + TEST_ERROR("Test threw unhandled unknown exception."); + current_test_failed = true; + } + + if (current_test_failed) { + test_case.failed = true; + failed_count++; + TEST_LOG("[ FAILED ] " << test_case.name); + } else { + passed_count++; + TEST_LOG("[ OK ] " << test_case.name); + } + } + + TEST_LOG("--------------------------------------------------"); + TEST_LOG("[==========] " << passed_count + failed_count << " tests ran."); + TEST_LOG("[ PASSED ] " << passed_count << " tests."); + if (failed_count > 0) { + TEST_ERROR("[ FAILED ] " << failed_count << " tests, listed below:"); + for (const auto& test_case : get_test_cases()) { + if (test_case.failed) { + TEST_ERROR(" " << test_case.name); + } + } + } + TEST_LOG("--------------------------------------------------"); + + return failed_count; +} + +// --- End of Minimal Custom Test Framework (Stub for compilation) --- diff --git a/cgo/test/Makefile b/cgo/test/Makefile index 506722a91f6e6..f0de3ac25285f 100644 --- a/cgo/test/Makefile +++ b/cgo/test/Makefile @@ -1,18 +1,47 @@ -CFLAGS=-I.. -g -Wall -Werror -lm -I../../thirdparties/install/include +UNAME_S := $(shell uname -s) -all: test_add.exe test_bloom.exe test_varlena.exe bloom_whole_test.exe +ifeq ($(MO_CL_CUDA),1) + ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) + endif + CC = /usr/local/cuda/bin/nvcc + COMPILER_FLAGS := -Xcompiler "-Wall -Werror" + # When using nvcc to link, we need to pass the libraries and rpath + LINKER_FLAGS := -Xlinker "-rpath=$(shell realpath ..)" + # We must also include the cuVS and other deps that libmo.so needs if linked statically, + # but since libmo.so is shared, we just need to link against it. + LIBS += -L.. 
-lmo -L../../thirdparties/install/lib -lusearch_c -L$(CUDA_PATH)/lib64/stubs -lcuda -L$(CUDA_PATH)/lib64 -lcudart + LIBS += -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lpthread -lgomp + LIBS += -Xlinker -lpthread -Xlinker -lm +else + COMPILER_FLAGS := -Wall -Werror + ifeq ($(UNAME_S),Darwin) + LINKER_FLAGS := -Wl,-rpath,$(shell realpath ..) + else + LINKER_FLAGS := -Wl,-rpath=$(shell realpath ..) + endif + LIBS := -L.. -lmo -L../../thirdparties/install/lib -lusearch_c -lm -lstdc++ + ifneq ($(UNAME_S),Darwin) + LIBS += -fopenmp + endif +endif -test_add.exe: test_add.c ../libmo.a - $(CC) $(CFLAGS) -o test_add.exe test_add.c -L.. -lmo +CFLAGS := -I.. -g -I../../thirdparties/install/include $(COMPILER_FLAGS) +LDFLAGS := $(LIBS) $(LINKER_FLAGS) -test_bloom.exe: test_bloom.c ../libmo.a - $(CC) $(CFLAGS) -o test_bloom.exe test_bloom.c -L.. -lmo +all: test_add.exe test_bloom.exe test_varlena.exe -test_varlena.exe: varlena_test.c ../libmo.a - $(CC) $(CFLAGS) -o test_varlena.exe varlena_test.c -L.. -lmo +test_add.exe: test_add.c + $(CC) $(CFLAGS) -o $@ test_add.c $(LDFLAGS) -bloom_whole_test.exe: bloom_whole_test.c ../libmo.a - $(CC) $(CFLAGS) -o bloom_whole_test.exe bloom_whole_test.c -L.. -lmo +test_bloom.exe: test_bloom.c + $(CC) $(CFLAGS) -o $@ test_bloom.c $(LDFLAGS) + +test_varlena.exe: varlena_test.c + $(CC) $(CFLAGS) -o $@ varlena_test.c $(LDFLAGS) + +bloom_whole_test.exe: bloom_whole_test.c + $(CC) $(CFLAGS) $(NVCC_FLAGS) -o bloom_whole_test.exe bloom_whole_test.c $(LDFLAGS) clean: rm -f *.o *.exe diff --git a/cgo/test/bloom_whole_test.c b/cgo/test/bloom_whole_test.c new file mode 100644 index 0000000000000..23bf08586f94d --- /dev/null +++ b/cgo/test/bloom_whole_test.c @@ -0,0 +1,122 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "../bloom.h" +#include "../varlena.h" + +// Helper to create a packed buffer of varlenas +int create_test_buffer(uint8_t *buffer, uint8_t *area) { + uint8_t *ptr = buffer; + int nitem = 0; + + // --- Element 1: small --- + const char *str1 = "apple"; + uint8_t len1 = strlen(str1); + ptr[0] = len1; + memcpy(ptr + 1, str1, len1); + ptr += VARLENA_SIZE; + nitem++; + + // --- Element 2: big --- + const char *str2 = "banana_long_string_to_test_big_varlena"; + uint32_t len2 = strlen(str2); + uint32_t offset2 = 50; + memcpy(area + offset2, str2, len2); + + varlena_set_big_offset_len(ptr, offset2, len2); + ptr += VARLENA_SIZE; + nitem++; + + // --- Element 3: small --- + const char *str3 = "cherry"; + uint8_t len3 = strlen(str3); + ptr[0] = len3; + memcpy(ptr + 1, str3, len3); + ptr += VARLENA_SIZE; + nitem++; + + return nitem; +} + +void test_add_and_test_varlena() { + printf("--- Running test_add_and_test_varlena ---\n"); + + bloomfilter_t *bf = bloomfilter_init(1000, 3); + assert(bf != NULL); + + uint8_t buffer[200]; + uint8_t area[200]; + int nitem = create_test_buffer(buffer, area); + + // Add all items from the buffer + bloomfilter_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0); + + // Test if all added items exist + bool results[nitem]; + bloomfilter_test_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results); + + for (int i = 0; i < nitem; i++) { + assert(results[i]); + } + + // Test for a non-existent item + 
const char *str_not_exist = "grape"; + assert(!bloomfilter_test(bf, str_not_exist, strlen(str_not_exist))); + + bloomfilter_free(bf); + printf("test_add_and_test_whole passed.\n\n"); +} + +void test_test_and_add_varlena() { + printf("--- Running test_test_and_add_varlena ---\n"); + + bloomfilter_t *bf = bloomfilter_init(1000, 3); + assert(bf != NULL); + + uint8_t buffer[200]; + uint8_t area[200]; + int nitem = create_test_buffer(buffer, area); + + bool results1[nitem]; + bool results2[nitem]; + + // First call: should report all items as non-existent and add them + bloomfilter_test_and_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results2); + for (int i = 0; i < nitem; i++) { + assert(!results1[i]); + } + + // Second call: should report all items as existent + bloomfilter_test_and_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results2); + for (int i = 0; i < nitem; i++) { + assert(results2[i]); + } + + bloomfilter_free(bf); + printf("test_test_and_add_whole passed.\n\n"); +} + +int main() { + test_add_and_test_varlena(); + test_test_and_add_varlena(); + printf("All bloom_varlena_test passed!\n"); + return 0; +} diff --git a/go.mod b/go.mod index ae09fa2ae94df..d1dcf1ba27f2d 100644 --- a/go.mod +++ b/go.mod @@ -76,7 +76,6 @@ require ( github.com/prashantv/gostub v1.1.0 github.com/prometheus/client_golang v1.17.0 github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 - github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d github.com/robfig/cron/v3 v3.0.1 github.com/samber/lo v1.38.1 github.com/segmentio/encoding v0.4.0 @@ -92,7 +91,7 @@ require ( github.com/tidwall/btree v1.7.0 github.com/tidwall/pretty v1.2.1 github.com/tmc/langchaingo v0.1.13 - github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5 + github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9 go.starlark.net v0.0.0-20250701195324-d457b4515e0e 
go.uber.org/automaxprocs v1.5.3 go.uber.org/ratelimit v0.2.0 @@ -259,9 +258,6 @@ replace ( github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef => github.com/matrixorigin/dragonboat/v4 v4.0.0-20251214113216-2ddf81ef2a85 github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4 => github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376 => github.com/matrixorigin/vfs v0.2.1-0.20220616104132-8852fd867376 - - github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d => github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 - github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5 => github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9 ) replace github.com/shoenig/go-m1cpu => github.com/shoenig/go-m1cpu v0.1.7 diff --git a/go.sum b/go.sum index a22d3b1eeecea..8821ade189a9a 100644 --- a/go.sum +++ b/go.sum @@ -207,12 +207,8 @@ github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8Nz github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 h1:hn6US40835XeZRilkHLIUpWTF2RYBRXCpBLn1PPOSjg= -github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6/go.mod h1:Ju9l9IcIHZOPLO1tjN9dEYSgEPFowDPF9pM70W9nNGs= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e h1:tQSCiEjYPRU+AuuVR+zd+xYVOsEqX1clPhmIAM6FCHU= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e/go.mod h1:zt7uTOYu0EEeKatGaTi9JiP0I9ePHpDvjAwpfPXh/N0= -github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9 h1:jnClZ1ddCpjYQLMem6YSlVm7Ois6sXbRr2CP6n/rc/s= -github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9/go.mod 
h1:3SN8SakyyBWzb14DNZn4t5yX8dOa7ae45KpqDioi4RA= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= @@ -877,6 +873,8 @@ github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGr github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= +github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9 h1:KtfoWJQXPrvEfFCuk1FGgiPfBoIhSIqiTLaZLHjoKM4= +github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9/go.mod h1:NxBpQibuBBeA/V8RGbrNzVAv4OyWWL5yNao7mVz656k= github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.6.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w= diff --git a/optools/images/Dockerfile b/optools/images/Dockerfile index 837b501811348..7383c0941b937 100644 --- a/optools/images/Dockerfile +++ b/optools/images/Dockerfile @@ -32,6 +32,7 @@ FROM matrixorigin/ubuntu:22.04 COPY --from=builder /go/src/github.com/matrixorigin/matrixone/mo-service /mo-service COPY --from=builder /go/src/github.com/matrixorigin/matrixone/etc /etc COPY --from=builder /go/src/github.com/matrixorigin/matrixone/thirdparties/install/lib/*.so /usr/local/lib +COPY --from=builder /go/src/github.com/matrixorigin/matrixone/cgo/*.so /usr/local/lib # ldconfig and run mo-service to check if the shared library is found RUN ldconfig && /mo-service -h diff --git a/optools/images/gpu/Dockerfile b/optools/images/gpu/Dockerfile index 
8e3640083e614..3549a0d249d70 100644 --- a/optools/images/gpu/Dockerfile +++ b/optools/images/gpu/Dockerfile @@ -8,7 +8,7 @@ RUN export LANG=en_US.utf8 ARG DEBIAN_FRONTEND=noninteractive ENV MOHOME=/matrixone ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${MOHOME}/thirdparties/install/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${MOHOME}/thirdparties/install/lib:${MOHOME}/cgo:${LD_LIBRARY_PATH}" WORKDIR /matrixone COPY . . @@ -52,6 +52,7 @@ FROM nvidia/cuda:13.0.2-cudnn-runtime-ubuntu24.04 COPY --from=builder /matrixone/mo-service /mo-service COPY --from=builder /matrixone/etc /etc COPY --from=builder /matrixone/thirdparties/install/lib/*.so /usr/local/lib +COPY --from=builder /matrixone/cgo/*.so /usr/local/lib COPY --from=builder /root/miniconda/envs/go/lib /root/miniconda/envs/go/lib ENV PATH="/usr/local/cuda/bin:${PATH}" diff --git a/optools/run_ut.sh b/optools/run_ut.sh index a8a8205891efe..aa7307fd3c424 100755 --- a/optools/run_ut.sh +++ b/optools/run_ut.sh @@ -47,6 +47,27 @@ UT_COUNT="$G_WKSP/$G_TS-UT-Count.out" CODE_COVERAGE="$G_WKSP/$G_TS-UT-Coverage.html" RAW_COVERAGE="coverage.out" IS_BUILD_FAIL="" +TAGS="matrixone_test" + +THIRDPARTIES_INSTALL_DIR=${BUILD_WKSP}/thirdparties/install +CGO_CFLAGS="-I${BUILD_WKSP}/cgo -I${THIRDPARTIES_INSTALL_DIR}/include" +CGO_LDFLAGS="-Wl,-rpath,${THIRDPARTIES_INSTALL_DIR}/lib:${BUILD_WKSP}/cgo -L${THIRDPARTIES_INSTALL_DIR}/lib -L${BUILD_WKSP}/cgo -lmo -lusearch_c -lm" +LD_LIBRARY_PATH="${THIRDPARTIES_INSTALL_DIR}/lib:${BUILD_WKSP}/cgo" + +if [[ -n "${MO_CL_CUDA:-}" ]] ; then + if [[ ${MO_CL_CUDA} == "1" ]] ; then + if [[ -z "${CONDA_PREFIX:-}" ]] ; then + echo "CONDA_PREFIX environment variable not found" + exit 1 + fi + + CUDA_HOME=/usr/local/cuda + CGO_CFLAGS="${CGO_CFLAGS} -I${CUDA_HOME}/include -I${CONDA_PREFIX}/include" + CGO_LDFLAGS="${CGO_LDFLAGS} -L${CUDA_HOME}/lib64/stubs -lcuda -L${CUDA_HOME}/lib64 
-lcudart -L${CONDA_PREFIX}/lib -lcuvs -lcuvs_c -lstdc++" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${CONDA_PREFIX}/lib" + TAGS="${TAGS},gpu" + fi +fi if [[ -f $SCA_REPORT ]]; then rm $SCA_REPORT; fi if [[ -f $UT_REPORT ]]; then rm $UT_REPORT; fi @@ -70,7 +91,7 @@ function run_vet(){ if [[ -f $SCA_REPORT ]]; then rm $SCA_REPORT; fi logger "INF" "Test is in progress... " - go vet -tags matrixone_test -unsafeptr=false ./pkg/... 2>&1 | tee $SCA_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go vet -tags "${TAGS}" -unsafeptr=false ./pkg/... 2>&1 | tee $SCA_REPORT logger "INF" "Refer to $SCA_REPORT for details" } @@ -95,18 +116,14 @@ function run_tests(){ local cover_profile='profile.raw' make cgo make thirdparties - THIRDPARTIES_INSTALL_DIR=${BUILD_WKSP}/thirdparties/install - - local CGO_CFLAGS="-I${BUILD_WKSP}/cgo -I${THIRDPARTIES_INSTALL_DIR}/include" - local CGO_LDFLAGS="-Wl,-rpath,${THIRDPARTIES_INSTALL_DIR}/lib -L${THIRDPARTIES_INSTALL_DIR}/lib -L${BUILD_WKSP}/cgo -lmo -lm" if [[ $SKIP_TESTS == 'race' ]]; then logger "INF" "Run UT without race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags "${TAGS}" -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT else logger "INF" "Run UT with race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags "${TAGS}" -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT fi } diff --git 
a/pkg/common/concurrent/asyncworkerpool.go b/pkg/common/concurrent/asyncworkerpool.go new file mode 100644 index 0000000000000..844e3cd31a7a3 --- /dev/null +++ b/pkg/common/concurrent/asyncworkerpool.go @@ -0,0 +1,351 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "os" + "os/signal" + "runtime" + "sync" + "sync/atomic" + "syscall" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/logutil" + "go.uber.org/zap" +) + +// AsyncTask represents a task to be executed by the AsyncWorkerPool. +type AsyncTask struct { + ID uint64 + Fn func(res any) (any, error) +} + +// AsyncTaskResult holds the result of a AsyncTask execution. +type AsyncTaskResult struct { + ID uint64 + Result any + Error error +} + +// AsyncTaskResultStore manages the storage and retrieval of AsyncTaskResults. +type AsyncTaskResultStore struct { + states map[uint64]*taskState + mu sync.Mutex + nextJobID uint64 + stopCh chan struct{} + stopped atomic.Bool +} + +type taskState struct { + done chan struct{} + result *AsyncTaskResult +} + +// NewAsyncTaskResultStore creates a new AsyncTaskResultStore. 
+func NewAsyncTaskResultStore() *AsyncTaskResultStore { + return &AsyncTaskResultStore{ + states: make(map[uint64]*taskState), + nextJobID: 0, + stopCh: make(chan struct{}), + stopped: atomic.Bool{}, + } +} + +// Store saves a AsyncTaskResult in the store and signals any waiting goroutines. +func (s *AsyncTaskResultStore) Store(result *AsyncTaskResult) { + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.states[result.ID] + if !ok { + state = &taskState{done: make(chan struct{})} + s.states[result.ID] = state + } + state.result = result + close(state.done) +} + +// Wait blocks until the result for the given jobID is available and returns it. +// The result is removed from the internal map after being retrieved. +func (s *AsyncTaskResultStore) Wait(jobID uint64) (*AsyncTaskResult, error) { + s.mu.Lock() + state, ok := s.states[jobID] + if !ok { + // If task was not submitted yet, create state and wait. + state = &taskState{done: make(chan struct{})} + s.states[jobID] = state + s.mu.Unlock() // Release lock before blocking + } else if state.result != nil { + // If result is already available, return it immediately without blocking. + delete(s.states, jobID) // Remove after retrieval + s.mu.Unlock() + return state.result, nil + } else { + // Task was submitted, but result not yet available. Release lock and wait. + s.mu.Unlock() // Release lock before blocking + } + + select { + case <-state.done: + s.mu.Lock() + delete(s.states, jobID) + s.mu.Unlock() + return state.result, nil + case <-s.stopCh: + return nil, moerr.NewInternalErrorNoCtx("AsyncTaskResultStore stopped before result was available") + } +} + +// GetNextJobID atomically increments and returns a new unique job ID. +func (s *AsyncTaskResultStore) GetNextJobID() uint64 { + return atomic.AddUint64(&s.nextJobID, 1) +} + +// Stop signals the AsyncTaskResultStore to stop processing new waits. 
+func (s *AsyncTaskResultStore) Stop() { + if s.stopped.CompareAndSwap(false, true) { + close(s.stopCh) + } +} + +// AsyncWorkerPool runs tasks in a dedicated OS thread with a CUDA context. +type AsyncWorkerPool struct { + tasks chan *AsyncTask + stopCh chan struct{} + wg sync.WaitGroup + stopped atomic.Bool // Indicates if the worker has been stopped + firstError atomic.Value + *AsyncTaskResultStore // Embed the result store + nthread uint + sigc chan os.Signal // Add this field + errch chan error + createResource func() (any, error) + cleanupResource func(any) +} + +// NewAsyncWorkerPool creates a new AsyncWorkerPool. +func NewAsyncWorkerPool(nthread uint, createResource func() (any, error), cleanupResource func(any)) *AsyncWorkerPool { + return &AsyncWorkerPool{ + tasks: make(chan *AsyncTask, nthread), + stopCh: make(chan struct{}), + stopped: atomic.Bool{}, // Initialize to false + AsyncTaskResultStore: NewAsyncTaskResultStore(), + nthread: nthread, + sigc: make(chan os.Signal, 1), // Initialize sigc + errch: make(chan error, nthread), // Initialize errch + createResource: createResource, + cleanupResource: cleanupResource, + } +} + +// handleAndStoreTask processes a single AsyncTask and stores its result. +func (w *AsyncWorkerPool) handleAndStoreTask(task *AsyncTask, resource any) { + result, err := task.Fn(resource) + asyncResult := &AsyncTaskResult{ + ID: task.ID, + Result: result, + Error: err, + } + w.AsyncTaskResultStore.Store(asyncResult) +} + +// drainAndProcessTasks drains the w.tasks channel and processes each task. +// It stops when the channel is empty or closed. +func (w *AsyncWorkerPool) drainAndProcessTasks(resource any) { + for { + select { + case task, ok := <-w.tasks: + if !ok { + return // Channel closed, no more tasks. Exit. + } + w.handleAndStoreTask(task, resource) + default: + return // All tasks drained, or channel is empty. + } + } +} + +// Start begins the worker's execution loop. 
+func (w *AsyncWorkerPool) Start(initFn func(res any) error, stopFn func(resource any) error) { + w.wg.Add(1) // for w.run + go w.run(initFn, stopFn) + + signal.Notify(w.sigc, syscall.SIGTERM, syscall.SIGINT) // Notify signals to sigc + + w.wg.Add(1) // for the signal handler goroutine + go func() { + defer w.wg.Done() // Ensure wg.Done() is called when this goroutine exits + select { + case <-w.sigc: // Wait for a signal + logutil.Info("AsyncWorkerPool received shutdown signal, stopping...") + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + case err := <-w.errch: // Listen for errors from worker goroutines + logutil.Error("AsyncWorkerPool received internal error, stopping...", zap.Error(err)) + if w.firstError.Load() == nil { + w.firstError.Store(err) + } + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + case <-w.stopCh: // Listen for internal stop signal from w.Stop() + logutil.Info("AsyncWorkerPool signal handler received internal stop signal, exiting...") + // Do nothing, just exit. w.Stop() will handle the rest. + } + }() +} + +// Stop signals the worker to terminate. +func (w *AsyncWorkerPool) Stop() { + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + w.wg.Wait() + w.AsyncTaskResultStore.Stop() // Signal the result store to stop +} + +// Submit sends a task to the worker. 
+func (w *AsyncWorkerPool) Submit(fn func(res any) (any, error)) (uint64, error) { + if w.stopped.Load() { + return 0, moerr.NewInternalErrorNoCtx("cannot submit task: worker is stopped") + } + jobID := w.GetNextJobID() + task := &AsyncTask{ + ID: jobID, + Fn: fn, + } + w.tasks <- task + return jobID, nil +} + +func (w *AsyncWorkerPool) workerLoop(wg *sync.WaitGroup) { + defer wg.Done() + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var resource any + var err error + if w.createResource != nil { + resource, err = w.createResource() + if err != nil { + w.errch <- err + return + } + } + if w.cleanupResource != nil { + defer w.cleanupResource(resource) + } + + for { + select { + case task, ok := <-w.tasks: + if !ok { // tasks channel closed + return // No more tasks, and channel is closed. Exit. + } + w.handleAndStoreTask(task, resource) // Pass resource directly + case <-w.stopCh: + // stopCh signaled. Drain remaining tasks from w.tasks then exit. + w.drainAndProcessTasks(resource) // Pass resource directly + return + } + } +} + +func (w *AsyncWorkerPool) run(initFn func(res any) error, stopFn func(resource any) error) { + defer w.wg.Done() + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var parentResource any + var err error + if w.createResource != nil { + parentResource, err = w.createResource() + if err != nil { + w.errch <- err + return + } + } + if w.cleanupResource != nil { + defer w.cleanupResource(parentResource) + } + + // Execute initFn once. 
+ if initFn != nil { + if err := initFn(parentResource); err != nil { + logutil.Error("failed to initialize async resource with provided function", zap.Error(err)) + w.errch <- err + + return + } + } + + if stopFn != nil { + defer func() { + if err := stopFn(parentResource); err != nil { + logutil.Error("error during async resource stop function", zap.Error(err)) + w.errch <- err + } + }() + } + + if w.nthread == 1 { + // Special case: nthread is 1, process tasks directly in this goroutine + for { + select { + case task, ok := <-w.tasks: + if !ok { // tasks channel closed + return // Channel closed, no more tasks. Exit. + } + w.handleAndStoreTask(task, parentResource) + case <-w.stopCh: + // Drain the tasks channel before exiting + w.drainAndProcessTasks(parentResource) + return + } + } + } else { + // General case: nthread > 1, create worker goroutines + var workerWg sync.WaitGroup + workerWg.Add(int(w.nthread)) + for i := 0; i < int(w.nthread); i++ { + go w.workerLoop(&workerWg) + } + + // Wait for stop signal + <-w.stopCh + + // Signal workers to stop and wait for them to finish. + workerWg.Wait() + } +} + +// Wait blocks until the result for the given jobID is available and returns it. +// The result is removed from the internal map after being retrieved. +func (w *AsyncWorkerPool) Wait(jobID uint64) (*AsyncTaskResult, error) { + return w.AsyncTaskResultStore.Wait(jobID) +} + +// GetFirstError returns the first internal error encountered by the worker. 
+func (w *AsyncWorkerPool) GetFirstError() error { + err := w.firstError.Load() + if err == nil { + return nil + } + return err.(error) +} diff --git a/pkg/common/concurrent/asyncworkerpool_test.go b/pkg/common/concurrent/asyncworkerpool_test.go new file mode 100644 index 0000000000000..76c78314d17c3 --- /dev/null +++ b/pkg/common/concurrent/asyncworkerpool_test.go @@ -0,0 +1,509 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "fmt" + "sync" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewAsyncTaskResultStore(t *testing.T) { + store := NewAsyncTaskResultStore() + assert.NotNil(t, store) + assert.NotNil(t, store.states) + assert.Equal(t, uint64(0), store.nextJobID) +} + +func TestAsyncTaskResultStore_GetNextJobID(t *testing.T) { + store := NewAsyncTaskResultStore() + id1 := store.GetNextJobID() + id2 := store.GetNextJobID() + id3 := store.GetNextJobID() + + assert.Equal(t, uint64(1), id1) + assert.Equal(t, uint64(2), id2) + assert.Equal(t, uint64(3), id3) +} + +func TestAsyncTaskResultStore_StoreAndWait(t *testing.T) { + store := NewAsyncTaskResultStore() + jobID := store.GetNextJobID() + expectedResult := "task completed" + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + time.Sleep(10 * time.Millisecond) // Simulate some work before storing + store.Store(&AsyncTaskResult{ + ID: 
jobID, + Result: expectedResult, + Error: nil, + }) + }() + + result, err := store.Wait(jobID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, jobID, result.ID) + assert.Equal(t, expectedResult, result.Result) + assert.Nil(t, result.Error) + + wg.Wait() + + // Verify that the result is removed after retrieval + store.mu.Lock() + _, ok := store.states[jobID] + store.mu.Unlock() + assert.False(t, ok, "Result should be removed from store after Wait") +} + +func TestAsyncTaskResultStore_ConcurrentStoreAndWait(t *testing.T) { + store := NewAsyncTaskResultStore() + numTasks := 100 + + var submitWg sync.WaitGroup + var waitWg sync.WaitGroup + submitWg.Add(numTasks) + waitWg.Add(numTasks) + + results := make(chan *AsyncTaskResult, numTasks) + + // Launch goroutines to wait for results + for i := 0; i < numTasks; i++ { + jobID := store.GetNextJobID() // Pre-generate job IDs + go func(id uint64) { + defer waitWg.Done() + result, err := store.Wait(id) + assert.NoError(t, err) + results <- result + }(jobID) + } + + // Launch goroutines to store results + for i := 1; i <= numTasks; i++ { + go func(id uint64) { + defer submitWg.Done() + // Simulate random delay + time.Sleep(time.Duration(id%10) * time.Millisecond) + store.Store(&AsyncTaskResult{ + ID: id, + Result: fmt.Sprintf("result-%d", id), + Error: nil, + }) + }(uint64(i)) + } + + submitWg.Wait() + waitWg.Wait() // Ensure all waiters have completed + close(results) + + receivedResults := make(map[uint64]string) + for r := range results { + receivedResults[r.ID] = r.Result.(string) + } + + assert.Len(t, receivedResults, numTasks) + for i := 1; i <= numTasks; i++ { + assert.Equal(t, fmt.Sprintf("result-%d", i), receivedResults[uint64(i)]) + } +} + +type dummyResource struct { + closed bool +} + +func (m *dummyResource) Close() { + m.closed = true +} + +func testCreateResource() (any, error) { + return &dummyResource{}, nil +} + +func testCleanupResource(res any) { + if res == nil { + return + } + 
resource := res.(*dummyResource) + resource.Close() +} + +func TestAsyncWorkerPool_LifecycleAndTaskExecution(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + require.NotNil(t, worker) + + // Start the worker + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit a task + expectedTaskResult := "processed by CUDA (mocked)" + taskID, err := worker.Submit(func(res any) (any, error) { + // In a real scenario, this would use the real resource + // For testing, we just return a value. + // Assert that res is not nil, even if it's a dummy one. + assert.NotNil(t, res) + return expectedTaskResult, nil + }) + require.NoError(t, err) + + // Wait for the result + result, err := worker.Wait(taskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, taskID, result.ID) + assert.Equal(t, expectedTaskResult, result.Result) + assert.Nil(t, result.Error) + + // Submit another task + expectedTaskResult2 := 123 + taskID2, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return expectedTaskResult2, nil + }) + require.NoError(t, err) + + result2, err := worker.Wait(taskID2) + assert.NoError(t, err) + assert.NotNil(t, result2) + assert.Equal(t, taskID2, result2.ID) + assert.Equal(t, expectedTaskResult2, result2.Result) + assert.Nil(t, result2.Error) + + // Test a task that returns an error + expectedError := fmt.Errorf("cuda operation failed") + taskID3, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return nil, expectedError + }) + require.NoError(t, err) + + result3, err := worker.Wait(taskID3) + assert.NoError(t, err) // Error is returned in AsyncTaskResult, not as return value of Wait + assert.NotNil(t, result3) + assert.Equal(t, taskID3, result3.ID) + assert.Nil(t, result3.Result) + assert.Equal(t, expectedError, result3.Error) + + // Stop the worker + worker.Stop() + + t.Log("AsyncWorkerPool stopped. 
Further submissions would block or panic.") +} + +func TestAsyncWorkerPool_StopDuringTaskProcessing(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit a long-running task + longTaskSignal := make(chan struct{}) + longTaskID, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + <-longTaskSignal // Block until signaled + return "long task done", nil + }) + require.NoError(t, err) + + // Give the worker a moment to pick up the task + time.Sleep(50 * time.Millisecond) + + // Stop the worker while the task is running + doneStopping := make(chan struct{}) + go func() { + worker.Stop() + close(doneStopping) + }() + + // Wait for a short period to see if Stop is blocked by the task + select { + case <-doneStopping: + t.Fatal("Worker stopped too quickly, long task might not have started blocking") + case <-time.After(100 * time.Millisecond): + // This means Stop is likely waiting for the `run` goroutine, which is blocked by the task. 
+ t.Log("Worker.Stop is blocked by the long-running task as expected.") + } + + // Now unblock the long-running task + close(longTaskSignal) + + // The worker should now be able to stop + select { + case <-doneStopping: + t.Log("Worker successfully stopped after long task completed.") + case <-time.After(500 * time.Millisecond): + t.Fatal("Worker did not stop even after long task completed.") + } + + // Verify that the long task result was stored + result, err := worker.Wait(longTaskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, longTaskID, result.ID) + assert.Equal(t, "long task done", result.Result) +} + +func TestAsyncWorkerPool_MultipleSubmitsBeforeStart(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + + // Start the worker - now takes initFn + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit multiple tasks before starting the worker + numTasks := 5 + taskIDs := make([]uint64, numTasks) // Still need to collect IDs + for i := 0; i < numTasks; i++ { + var err error + taskIDs[i], err = worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return fmt.Sprintf("result-%d", i), nil + }) + require.NoError(t, err) + } + + // Start the worker + // worker.Start() // Already started above, remove duplicate + + // Wait for all results + for i, id := range taskIDs { + result, err := worker.Wait(id) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, id, result.ID) + assert.Equal(t, fmt.Sprintf("result-%d", i), result.Result) + } + + worker.Stop() +} + +func TestAsyncWorkerPool_GracefulShutdown(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + var wg sync.WaitGroup + numTasks := 10 + results := make(chan *AsyncTaskResult, numTasks) // Changed type + + // Submit tasks + for i := 0; i < numTasks; i++ { + wg.Add(1) + // 
Capture loop index for the anonymous function + loopIndex := i + + var submitErr error + taskID, submitErr := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + time.Sleep(10 * time.Millisecond) // Simulate work + return fmt.Sprintf("final-result-%d", loopIndex), nil // Use captured loop index + }) + require.NoError(t, submitErr) + + go func(id uint64) { + defer wg.Done() + r, waitErr := worker.Wait(id) + assert.NoError(t, waitErr) + results <- r + }(taskID) + } + + // Give some time for tasks to be submitted and processed + time.Sleep(50 * time.Millisecond) + + // Stop the worker + worker.Stop() + + // All tasks submitted before Stop should complete and their results should be retrievable + wg.Wait() + close(results) + + assert.Len(t, results, numTasks) + for r := range results { + assert.Contains(t, r.Result.(string), "final-result-") + } + + // Ensure new tasks cannot be submitted after stop + _, err := worker.Submit(func(res any) (any, error) { // Use := for first declaration of err in this scope + return "should not be processed", nil + }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") +} + +func TestAsyncWorkerPool_SignalTermination(t *testing.T) { + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) // Use 1 thread for easier control and observation + require.NotNil(t, worker) + + worker.Start(nil, func(_ any) error { return nil }) + + // Submit a task that will complete after the signal, to ensure graceful processing + taskDone := make(chan struct{}) + taskID1, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + <-taskDone // Wait for signal to complete + return "task1 processed", nil + }) + require.NoError(t, err) + + // Submit a second quick task that should complete before or around the signal + taskID2, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return "task2 processed", nil + }) + require.NoError(t, err) + + // Give the 
worker a moment to pick up the tasks + time.Sleep(50 * time.Millisecond) + + // Simulate SIGTERM by sending to the signal channel + t.Log("Simulating SIGTERM to AsyncWorkerPool") + worker.sigc <- syscall.SIGTERM + + // Allow some time for the signal handler to process and call worker.Stop() + time.Sleep(100 * time.Millisecond) + + // Unblock the long-running task to allow it to finish and the worker to fully stop + close(taskDone) + + // Wait for all worker goroutines to finish + // The worker.Stop() method, which is called by the signal handler, + // internally waits for worker.wg.Wait(). + // So, we can verify by checking if new submissions fail and if old tasks results are available. + + // Check if previously submitted tasks completed + result1, err := worker.Wait(taskID1) + assert.NoError(t, err) + assert.NotNil(t, result1) + assert.Equal(t, taskID1, result1.ID) + assert.Equal(t, "task1 processed", result1.Result) + + result2, err := worker.Wait(taskID2) + assert.NoError(t, err) + assert.NotNil(t, result2) + assert.Equal(t, taskID2, result2.ID) + assert.Equal(t, "task2 processed", result2.Result) + + // Attempt to submit a new task after termination. It should fail. 
+ _, err = worker.Submit(func(res any) (any, error) { + return "should not be processed", nil + }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") +} + +func TestAsyncWorkerPool_GetFirstError(t *testing.T) { + + var err error // Explicitly declare err here + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) + assert.Nil(t, worker.GetFirstError(), "GetFirstError should be nil initially") + + // Trigger an error in initFn, which will be pushed to w.errch + expectedErr1 := fmt.Errorf("simulated init error 1") + initFn1 := func(resource any) error { + return expectedErr1 + } + stopFn := func(_ any) error { return nil } + + worker.Start(initFn1, stopFn) + + // Give the `run` goroutine and the signal handler a moment to process initFn and store the first error. + time.Sleep(50 * time.Millisecond) + + // GetFirstError should now return the expected error + assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should return the first recorded error") + + // Submit a task that causes an error (this error won't be saved as firstError via w.errch) + // This ensures that only errors propagated through w.errch are considered. + _, err = worker.Submit(func(res any) (any, error) { // Use = for assignment + assert.NotNil(t, res) + return nil, fmt.Errorf("task error, should not affect GetFirstError()") + }) + require.Error(t, err) // Expect an error because the worker should be stopped + assert.Contains(t, err.Error(), "worker is stopped") + + // Give some time for the task to be processed, if it affects anything + time.Sleep(50 * time.Millisecond) + + // Ensure GetFirstError remains the same even if other errors (from tasks) occur. + assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should not change after the first error is set") + + worker.Stop() + + // After stop, GetFirstError should still be the same. 
+ assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should retain the first error after stopping") +} + +func TestAsyncWorkerPool_MultipleStopCalls(t *testing.T) { + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) // Use 1 thread + require.NotNil(t, worker) + + worker.Start(nil, func(_ any) error { return nil }) + + // Call Stop multiple times from the main goroutine + worker.Stop() + worker.Stop() + worker.Stop() + + // Call Stop from another goroutine + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + worker.Stop() + }() + wg.Wait() + + // Ensure no panics occurred during multiple Stop calls + // (Go's testing framework will catch panics) + + // Optionally, try submitting a task again to ensure it's truly stopped + _, err := worker.Submit(func(res any) (any, error) { return nil, nil }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") + + t.Log("Successfully called Stop multiple times without panic.") +} + +func TestAsyncWorkerPool_NilCallbacks(t *testing.T) { + worker := NewAsyncWorkerPool(2, nil, nil) + require.NotNil(t, worker) + + worker.Start(nil, nil) + + expectedResult := "no resource needed" + taskID, err := worker.Submit(func(res any) (any, error) { + assert.Nil(t, res) + return expectedResult, nil + }) + require.NoError(t, err) + + result, err := worker.Wait(taskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, expectedResult, result.Result) + + worker.Stop() +} diff --git a/pkg/common/concurrent/executor.go b/pkg/common/concurrent/executor.go index 1cc21cf82cdaf..0eac95c6f5a4c 100644 --- a/pkg/common/concurrent/executor.go +++ b/pkg/common/concurrent/executor.go @@ -37,6 +37,14 @@ func (e ThreadPoolExecutor) Execute( nitems int, fn func(ctx context.Context, thread_id int, start, end int) error) (err error) { + if nitems <= 0 { + return nil + } + + if e.nthreads <= 1 { + return fn(ctx, 0, 0, nitems) + } + g, ctx := 
errgroup.WithContext(ctx) q := nitems / e.nthreads diff --git a/pkg/common/concurrent/executor_test.go b/pkg/common/concurrent/executor_test.go index 61f4856f15e88..50ef97b2df16e 100644 --- a/pkg/common/concurrent/executor_test.go +++ b/pkg/common/concurrent/executor_test.go @@ -87,3 +87,40 @@ func TestExecutorDistribution(t *testing.T) { require.Equal(t, 9, count) } + +func TestExecutorSingleThread(t *testing.T) { + ctx := context.Background() + nitems := 10 + nthreads := 1 + + e := NewThreadPoolExecutor(nthreads) + + called := false + err := e.Execute(ctx, nitems, func(ctx context.Context, thread_id int, start, end int) error { + called = true + require.Equal(t, 0, thread_id) + require.Equal(t, 0, start) + require.Equal(t, nitems, end) + return nil + }) + + require.NoError(t, err) + require.True(t, called) +} + +func TestExecutorZeroItems(t *testing.T) { + ctx := context.Background() + nitems := 0 + nthreads := 4 + + e := NewThreadPoolExecutor(nthreads) + + called := false + err := e.Execute(ctx, nitems, func(ctx context.Context, thread_id int, start, end int) error { + called = true + return nil + }) + + require.NoError(t, err) + require.False(t, called) +} diff --git a/pkg/cuvs/adhoc.go b/pkg/cuvs/adhoc.go new file mode 100644 index 0000000000000..6ca8e4c2a11fa --- /dev/null +++ b/pkg/cuvs/adhoc.go @@ -0,0 +1,74 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package cuvs + +/* +#include "../../cgo/cuvs/adhoc_c.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "unsafe" +) + +// AdhocBruteForceSearch performs an ad-hoc brute-force search on GPU without using a worker thread. +func AdhocBruteForceSearch[T VectorType]( + dataset []T, + nRows uint64, + dim uint32, + queries []T, + nQueries uint64, + limit uint32, + metric DistanceType, + deviceID int, +) ([]int64, []float32, error) { + if len(dataset) == 0 || len(queries) == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("empty dataset or queries") + } + + qtype := GetQuantization[T]() + + neighbors := make([]int64, nQueries*uint64(limit)) + distances := make([]float32, nQueries*uint64(limit)) + + var errmsg *C.char + C.gpu_adhoc_brute_force_search( + unsafe.Pointer(&dataset[0]), + C.uint64_t(nRows), + C.uint32_t(dim), + unsafe.Pointer(&queries[0]), + C.uint64_t(nQueries), + C.uint32_t(limit), + C.distance_type_t(metric), + C.quantization_t(qtype), + C.int(deviceID), + (*C.int64_t)(unsafe.Pointer(&neighbors[0])), + (*C.float)(unsafe.Pointer(&distances[0])), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + + return neighbors, distances, nil +} diff --git a/pkg/cuvs/adhoc_test.go b/pkg/cuvs/adhoc_test.go new file mode 100644 index 0000000000000..dec4b48fa8f94 --- /dev/null +++ b/pkg/cuvs/adhoc_test.go @@ -0,0 +1,60 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cuvs + +import ( + "testing" +) + +func TestAdhocBruteForceSearch(t *testing.T) { + dim := uint32(3) + nRows := uint64(2) + nQueries := uint64(1) + limit := uint32(1) + + dataset := []float32{ + 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, + } + queries := []float32{ + 1.1, 2.1, 3.1, + } + + neighbors, distances, err := AdhocBruteForceSearch[float32]( + dataset, nRows, dim, + queries, nQueries, limit, + L2Expanded, 0, + ) + + if err != nil { + t.Fatalf("AdhocBruteForceSearch failed: %v", err) + } + + if len(neighbors) != int(nQueries*uint64(limit)) { + t.Errorf("Expected %d neighbors, got %d", nQueries*uint64(limit), len(neighbors)) + } + + if neighbors[0] != 0 { + t.Errorf("Expected neighbor 0, got %d", neighbors[0]) + } + + if distances[0] > 0.1 { + t.Errorf("Expected small distance, got %f", distances[0]) + } +} diff --git a/pkg/cuvs/brute_force.go b/pkg/cuvs/brute_force.go new file mode 100644 index 0000000000000..ea3914fd8d855 --- /dev/null +++ b/pkg/cuvs/brute_force.go @@ -0,0 +1,317 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/brute_force_c.h" +#include +*/ +import "C" +import ( + "runtime" + "unsafe" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuBruteForce represents the C++ gpu_brute_force_t object +type GpuBruteForce[T VectorType] struct { + cIndex C.gpu_brute_force_c +} + +// NewGpuBruteForce creates a new GpuBruteForce instance +func NewGpuBruteForce[T VectorType](dataset []T, count_vectors uint64, dimension uint32, metric DistanceType, nthread uint32, device_id int) (*GpuBruteForce[T], error) { + if len(dataset) == 0 || count_vectors == 0 || dimension == 0 { + return nil, moerr.NewInternalErrorNoCtx("dataset, count_vectors, and dimension cannot be zero") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cIndex := C.gpu_brute_force_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count_vectors), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.uint32_t(nthread), + C.int(device_id), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIndex == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuBruteForce") + } + return &GpuBruteForce[T]{cIndex: cIndex}, nil +} + +// NewGpuBruteForceEmpty creates a new GpuBruteForce instance with pre-allocated buffer but no data yet. 
+func NewGpuBruteForceEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + nthread uint32, deviceID int) (*GpuBruteForce[T], error) { + + qtype := GetQuantization[T]() + var errmsg *C.char + + cBruteForce := C.gpu_brute_force_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.uint32_t(nthread), + C.int(deviceID), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cBruteForce == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuBruteForce") + } + + return &GpuBruteForce[T]{cIndex: cBruteForce}, nil +} + +// Start initializes the worker and resources +func (gb *GpuBruteForce[T]) Start() error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + C.gpu_brute_force_start(gb.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the dataset loading to GPU +func (gb *GpuBruteForce[T]) Build() error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + C.gpu_brute_force_build(gb.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gb *GpuBruteForce[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_brute_force_add_chunk( + gb.cIndex, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly conversion if needed. +func (gb *GpuBruteForce[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_brute_force_add_chunk_float( + gb.cIndex, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a search operation +func (gb *GpuBruteForce[T]) Search(queries []T, num_queries uint64, query_dimension uint32, limit uint32) ([]int64, []float32, error) { + if gb.cIndex == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(queries) == 0 || num_queries == 0 || query_dimension == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("queries, num_queries, and query_dimension cannot be zero") + } + + var errmsg *C.char + cResult := C.gpu_brute_force_search( + gb.cIndex, + unsafe.Pointer(&queries[0]), + C.uint64_t(num_queries), + C.uint32_t(query_dimension), + C.uint32_t(limit), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := 
C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + if cResult == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + // Allocate slices for results + neighbors := make([]int64, num_queries*uint64(limit)) + distances := make([]float32, num_queries*uint64(limit)) + + C.gpu_brute_force_get_results(cResult, C.uint64_t(num_queries), C.uint32_t(limit), (*C.int64_t)(unsafe.Pointer(&neighbors[0])), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_brute_force_free_search_result(cResult) + + return neighbors, distances, nil +} + +// SearchFloat performs a search operation with float32 queries +func (gb *GpuBruteForce[T]) SearchFloat(queries []float32, num_queries uint64, query_dimension uint32, limit uint32) ([]int64, []float32, error) { + if gb.cIndex == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(queries) == 0 || num_queries == 0 || query_dimension == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("queries, num_queries, and query_dimension cannot be zero") + } + + var errmsg *C.char + cResult := C.gpu_brute_force_search_float( + gb.cIndex, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(num_queries), + C.uint32_t(query_dimension), + C.uint32_t(limit), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + if cResult == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + // Allocate slices for results + neighbors := make([]int64, num_queries*uint64(limit)) + distances := make([]float32, num_queries*uint64(limit)) + + C.gpu_brute_force_get_results(cResult, C.uint64_t(num_queries), C.uint32_t(limit), (*C.int64_t)(unsafe.Pointer(&neighbors[0])), 
(*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_brute_force_free_search_result(cResult) + + return neighbors, distances, nil +} + +// Cap returns the capacity of the index buffer +func (gb *GpuBruteForce[T]) Cap() uint32 { + if gb.cIndex == nil { + return 0 + } + return uint32(C.gpu_brute_force_cap(gb.cIndex)) +} + +// Len returns current number of vectors in index +func (gb *GpuBruteForce[T]) Len() uint32 { + if gb.cIndex == nil { + return 0 + } + return uint32(C.gpu_brute_force_len(gb.cIndex)) +} + +// Info returns detailed information about the index as a JSON string. +func (gb *GpuBruteForce[T]) Info() (string, error) { + if gb.cIndex == nil { + return "", moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_brute_force_info(gb.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// Destroy frees the C++ GpuBruteForce instance +func (gb *GpuBruteForce[T]) Destroy() error { + if gb.cIndex == nil { + return nil + } + var errmsg *C.char + C.gpu_brute_force_destroy(gb.cIndex, unsafe.Pointer(&errmsg)) + gb.cIndex = nil // Mark as destroyed + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} diff --git a/pkg/cuvs/brute_force_test.go b/pkg/cuvs/brute_force_test.go new file mode 100644 index 0000000000000..2ebbe0261024c --- /dev/null +++ b/pkg/cuvs/brute_force_test.go @@ -0,0 +1,220 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "math/rand" + "testing" +) + +func TestGpuBruteForce(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + index, err := NewGpuBruteForce[float32](dataset, n_vectors, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create GpuBruteForce: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Failed to load GpuBruteForce: %v", err) + } + + queries := []float32{1.0, 1.0, 100.0, 100.0} + neighbors, distances, err := index.Search(queries, 2, dimension, 1) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", neighbors, distances) + if neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", neighbors[0]) + } + if neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", neighbors[1]) + } +} + +func TestGpuBruteForceChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + + // Create empty index (target type half) + index, err := NewGpuBruteForceEmpty[Float16](totalCount, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create GpuBruteForceEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + if index.Cap() != 
uint32(totalCount) { + t.Errorf("Expected capacity %d, got %d", totalCount, index.Cap()) + } + if index.Len() != 0 { + t.Errorf("Expected length 0, got %d", index.Len()) + } + + // Add data in chunks (from float32, triggers on-the-fly conversion to half) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + + expectedLen := uint32(i + chunkSize) + if index.Len() != expectedLen { + t.Errorf("Expected length %d, got %d", expectedLen, index.Len()) + } + } + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search + query := make([]Float16, dimension) + for i := range query { + query[i] = Float16(1) // matches first chunk + } + neighbors, _, err := index.Search(query, 1, dimension, 1) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + if neighbors[0] < 0 || neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", neighbors[0]) + } +} + +func TestGpuBruteForceFloat16(t *testing.T) { + dimension := uint32(2) + count := uint64(2) + dataset := []float32{1.0, 1.0, 2.0, 2.0} + + // Convert to Float16 on GPU + hDataset := make([]Float16, len(dataset)) + err := GpuConvertF32ToF16(dataset, hDataset, 0) + if err != nil { + t.Fatalf("Failed to convert dataset to F16: %v", err) + } + + index, err := NewGpuBruteForce(hDataset, count, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create F16 GpuBruteForce: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Failed to load: %v", err) + } + + queries := []float32{1.0, 1.0} + hQueries := make([]Float16, len(queries)) + GpuConvertF32ToF16(queries, hQueries, 0) + + neighbors, 
distances, err := index.Search(hQueries, 1, dimension, 1) + if err != nil { + t.Fatalf("Failed to search F16: %v", err) + } + + if neighbors[0] != 0 { + t.Errorf("Expected first neighbor 0, got %d", neighbors[0]) + } + if distances[0] != 0.0 { + t.Errorf("Expected distance 0.0, got %f", distances[0]) + } +} + +func BenchmarkGpuAddChunkAndSearchBruteForceF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + // Use Float16 as internal type + index, err := NewGpuBruteForceEmpty[Float16](uint64(totalCount), dimension, L2Expanded, 8, 0) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, _, err := index.SearchFloat(queries, 1, dimension, 10) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + neighbors, _, err := index.SearchFloat(queries, numQueries, dimension, limit) + if err != nil { + return nil, err + } + return neighbors, nil + }) +} diff --git a/pkg/cuvs/cagra.go b/pkg/cuvs/cagra.go new file mode 100644 index 
0000000000000..7de30613dc299 --- /dev/null +++ b/pkg/cuvs/cagra.go @@ -0,0 +1,639 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/cagra_c.h" +#include +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// GpuCagra represents the C++ gpu_cagra_t object. +type GpuCagra[T VectorType] struct { + cCagra C.gpu_cagra_c + dimension uint32 + nthread uint32 + distMode DistributionMode + useBatching bool +} + +// SetUseBatching enables or disables dynamic batching for search operations. +func (gi *GpuCagra[T]) SetUseBatching(enable bool) error { + gi.useBatching = enable + if gi.cCagra != nil { + var errmsg *C.char + C.gpu_cagra_set_use_batching(gi.cCagra, C.bool(enable), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + return nil +} + +// NewGpuCagra creates a new GpuCagra instance from a dataset. 
+func NewGpuCagra[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuCagra") + } + + return &GpuCagra[T]{ + cCagra: cCagra, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuCagraFromFile creates a new GpuCagra instance by loading from a file. 
+func NewGpuCagraFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuCagra from file") + } + + return &GpuCagra[T]{ + cCagra: cCagra, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// Destroy frees the C++ gpu_cagra_t instance +func (gi *GpuCagra[T]) Destroy() error { + if gi.cCagra == nil { + return nil + } + var errmsg *C.char + C.gpu_cagra_destroy(gi.cCagra, unsafe.Pointer(&errmsg)) + gi.cCagra = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gi *GpuCagra[T]) Start() error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + + if gi.distMode == 
Replicated && gi.nthread > 1 { + var errmsg *C.char + C.gpu_cagra_set_per_thread_device(gi.cCagra, C.bool(true), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + + if gi.useBatching { + if err := gi.SetUseBatching(true); err != nil { + return err + } + } + + var errmsg *C.char + C.gpu_cagra_start(gi.cCagra, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the build or file loading process +func (gi *GpuCagra[T]) Build() error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + C.gpu_cagra_build(gi.cCagra, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// NewGpuCagraEmpty creates a new GpuCagra instance with pre-allocated buffer but no data yet. 
+func NewGpuCagraEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create empty GpuCagra") + } + + return &GpuCagra[T]{ + cCagra: cCagra, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gi *GpuCagra[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_add_chunk( + gi.cCagra, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly quantization if needed. +func (gi *GpuCagra[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_add_chunk_float( + gi.cCagra, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the scalar quantizer (if T is 1-byte) +func (gi *GpuCagra[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_train_quantizer( + gi.cCagra, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuCagra[T]) SetQuantizer(min, max float32) error { + if 
gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + + var errmsg *C.char + C.gpu_cagra_set_quantizer( + gi.cCagra, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuCagra[T]) GetQuantizer() (float32, float32, error) { + if gi.cCagra == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_cagra_get_quantizer( + gi.cCagra, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// Save serializes the index to a file +func (gc *GpuCagra[T]) Save(filename string) error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_cagra_save(gc.cCagra, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gc *GpuCagra[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp CagraSearchParams) (SearchResult, error) { + if gc.cCagra == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResult{}, nil + } + + var errmsg *C.char + cSP := C.cagra_search_params_t{ + itopk_size: C.size_t(sp.ItopkSize), + search_width: C.size_t(sp.SearchWidth), + } + + res := C.gpu_cagra_search( 
+ gc.cCagra, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResult{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]uint32, totalElements) + distances := make([]float32, totalElements) + + C.gpu_cagra_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.uint32_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_cagra_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_cagra_free_result(res.result_ptr) + + return SearchResult{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// SearchFloat performs a K-Nearest Neighbor search with float32 queries +func (gc *GpuCagra[T]) SearchFloat(queries []float32, numQueries uint64, dimension uint32, limit uint32, sp CagraSearchParams) (SearchResult, error) { + if gc.cCagra == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResult{}, nil + } + + var errmsg *C.char + cSP := C.cagra_search_params_t{ + itopk_size: C.size_t(sp.ItopkSize), + search_width: C.size_t(sp.SearchWidth), + } + + res := C.gpu_cagra_search_float( + gc.cCagra, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResult{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + 
return SearchResult{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]uint32, totalElements) + distances := make([]float32, totalElements) + + C.gpu_cagra_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.uint32_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_cagra_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_cagra_free_result(res.result_ptr) + + return SearchResult{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Cap returns the capacity of the index buffer +func (gc *GpuCagra[T]) Cap() uint32 { + if gc.cCagra == nil { + return 0 + } + return uint32(C.gpu_cagra_cap(gc.cCagra)) +} + +// Len returns current number of vectors in index +func (gc *GpuCagra[T]) Len() uint32 { + if gc.cCagra == nil { + return 0 + } + return uint32(C.gpu_cagra_len(gc.cCagra)) +} + +// Info returns detailed information about the index as a JSON string. 
+func (gc *GpuCagra[T]) Info() (string, error) { + if gc.cCagra == nil { + return "", moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_cagra_info(gc.cCagra, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// Extend adds more vectors to the index (single-GPU only) +func (gc *GpuCagra[T]) Extend(additionalData []T, numVectors uint64) error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(additionalData) == 0 || numVectors == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_extend( + gc.cCagra, + unsafe.Pointer(&additionalData[0]), + C.uint64_t(numVectors), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(additionalData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Merge combines multiple single-GPU GpuCagra indices into a new one. 
+func MergeGpuCagra[T VectorType](indices []*GpuCagra[T], nthread uint32, devices []int) (*GpuCagra[T], error) { + if len(indices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("no indices to merge") + } + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + cIndices := make([]C.gpu_cagra_c, len(indices)) + for i, idx := range indices { + cIndices[i] = idx.cCagra + } + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + var errmsg *C.char + cCagra := C.gpu_cagra_merge( + &cIndices[0], + C.int(len(indices)), + C.uint32_t(nthread), + &cDevices[0], + C.int(len(devices)), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cIndices) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to merge GpuCagra indices") + } + + return &GpuCagra[T]{cCagra: cCagra, dimension: indices[0].dimension}, nil +} + +// SearchResult contains the neighbors and distances from a search. +type SearchResult struct { + Neighbors []uint32 + Distances []float32 +} diff --git a/pkg/cuvs/cagra_test.go b/pkg/cuvs/cagra_test.go new file mode 100644 index 0000000000000..fb9a88c470e5d --- /dev/null +++ b/pkg/cuvs/cagra_test.go @@ -0,0 +1,714 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "fmt" + "math/rand" + "os" + "testing" +) + +func TestGpuCagra(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Failed to load/build GpuCagra: %v", err) + } + + queries := []float32{1.0, 1.0, 100.0, 100.0} + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } + if result.Neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", result.Neighbors[1]) + } +} + +func TestGpuCagraSaveLoad(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + 
if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + index.Build() + + filename := "test_cagra.idx" + err = index.Save(filename) + if err != nil { + t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuCagraFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra from file: %v", err) + } + defer index2.Destroy() + + if err := index2.Start(); err != nil { + t.Fatalf("index2 Start failed: %v", err) + } + err = index2.Build() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + queries := []float32{0.0, 0.0} + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index2.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuShardedCagra(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for sharded CAGRA test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultCagraSearchParams() + sp.ItopkSize 
= 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func TestGpuCagraChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + + // Create empty index (target type int8) + index, err := NewGpuCagraEmpty[int8](totalCount, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagraEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Add data in chunks (from float32, triggers on-the-fly quantization) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) // 1.0 for first chunk, 101.0 for second + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + } + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search for first chunk + query1 := make([]int8, dimension) + for i := range query1 { + query1[i] = -128 // matches first chunk (1.0) + } + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result1, err := index.Search(query1, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 1 failed: %v", err) + } + if result1.Neighbors[0] < 0 || result1.Neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", result1.Neighbors[0]) + } + + // Search for second chunk + query2 := make([]int8, dimension) + for i := range query2 { + query2[i] = 127 // matches second chunk (101.0) + } + result2, 
err := index.Search(query2, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 2 failed: %v", err) + } + if result2.Neighbors[0] < 50 || result2.Neighbors[0] >= 100 { + t.Errorf("Expected neighbor from second chunk (50-99), got %d", result2.Neighbors[0]) + } +} + +func TestGpuCagraExtend(t *testing.T) { + dimension := uint32(16) + count := uint64(100) + dataset := make([]float32, count*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + defer index.Destroy() + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + index.Build() + + extra := make([]float32, 10*dimension) + for i := range extra { + extra[i] = 1000.0 + } + err = index.Extend(extra, 10) + if err != nil { + t.Fatalf("Extend failed: %v", err) + } + + queries := make([]float32, dimension) + for i := range queries { + queries[i] = 1000.0 + } + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] < 100 { + t.Errorf("Expected neighbor from extended data, got %d", result.Neighbors[0]) + } +} + +func TestGpuCagraMerge(t *testing.T) { + dimension := uint32(16) + count := uint64(200) + + // Cluster 1: values around 0 + ds1 := make([]float32, count*uint64(dimension)) + for i := range ds1 { + ds1[i] = float32(i % 10) + } + // Cluster 2: values around 1000 + ds2 := make([]float32, count*uint64(dimension)) + for i := range ds2 { + ds2[i] = float32(1000 + (i % 10)) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + + idx1, err := 
NewGpuCagra[float32](ds1, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create idx1: %v", err) + } + idx2, err := NewGpuCagra[float32](ds2, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create idx2: %v", err) + } + if err := idx1.Start(); err != nil { + t.Fatalf("idx1 Start failed: %v", err) + } + idx1.Build() + if err := idx2.Start(); err != nil { + t.Fatalf("idx2 Start failed: %v", err) + } + idx2.Build() + defer idx1.Destroy() + defer idx2.Destroy() + + merged, err := MergeGpuCagra([]*GpuCagra[float32]{idx1, idx2}, 1, devices) + if err != nil { + t.Fatalf("Merge failed: %v", err) + } + defer merged.Destroy() + + if err := merged.Start(); err != nil { + t.Fatalf("merged Start failed: %v", err) + } + + // Query near Cluster 2 + queries := make([]float32, dimension) + for i := range queries { + queries[i] = 1000.0 + } + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := merged.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + // Result should be from second index (index >= 200) + if result.Neighbors[0] < 200 { + t.Errorf("Expected neighbor from second index (>=200), got %d", result.Neighbors[0]) + } +} + +func TestGpuReplicatedCagra(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for replicated CAGRA test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Replicated) + if err != nil { + t.Fatalf("Failed to create 
replicated CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load replicated failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search replicated failed: %v", err) + } + t.Logf("Replicated Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func BenchmarkGpuShardedCagra(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for sharded CAGRA benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Sharded) + if err != nil { + b.Fatalf("Failed to create sharded CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.Search(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: 
%v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + }) + } +} + +func BenchmarkGpuSingleCagra(b *testing.B) { + devices := []int{0} + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create single CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.Search(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + }) + } +} + +func 
BenchmarkGpuReplicatedCagra(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for replicated CAGRA benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Replicated) + if err != nil { + b.Fatalf("Failed to create replicated CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.Search(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + }) + } +} + +func BenchmarkGpuAddChunkAndSearchCagraF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } 
+ + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + // Use Float16 as internal type + index, err := NewGpuCagraEmpty[Float16](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} + +func BenchmarkGpuAddChunkAndSearchCagraInt8(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + // Use int8 as internal type + index, err := 
NewGpuCagraEmpty[int8](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} diff --git a/pkg/cuvs/distance.go b/pkg/cuvs/distance.go new file mode 100644 index 0000000000000..2f29921b9212e --- /dev/null +++ b/pkg/cuvs/distance.go @@ -0,0 +1,73 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cuvs + +/* +#include "../../cgo/cuvs/distance_c.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// PairwiseDistance performs a pairwise distance calculation on GPU. +func PairwiseDistance[T VectorType]( + x []T, + nX uint64, + y []T, + nY uint64, + dim uint32, + metric DistanceType, + deviceID int, +) ([]float32, error) { + if len(x) == 0 || len(y) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty x or y") + } + + qtype := GetQuantization[T]() + dist := make([]float32, nX*nY) + + var errmsg *C.char + C.gpu_pairwise_distance( + unsafe.Pointer(&x[0]), + C.uint64_t(nX), + unsafe.Pointer(&y[0]), + C.uint64_t(nY), + C.uint32_t(dim), + C.distance_type_t(metric), + C.quantization_t(qtype), + C.int(deviceID), + (*C.float)(unsafe.Pointer(&dist[0])), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(x) + runtime.KeepAlive(y) + runtime.KeepAlive(dist) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + return dist, nil +} diff --git a/pkg/cuvs/distance_test.go b/pkg/cuvs/distance_test.go new file mode 100644 index 0000000000000..de63ac79f6f79 --- /dev/null +++ b/pkg/cuvs/distance_test.go @@ -0,0 +1,66 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cuvs + +import ( + "testing" +) + +func TestPairwiseDistance(t *testing.T) { + dim := uint32(3) + nX := uint64(2) + nY := uint64(2) + + x := []float32{ + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + } + y := []float32{ + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + } + + dist, err := PairwiseDistance[float32]( + x, nX, + y, nY, + dim, + L2Expanded, 0, + ) + + if err != nil { + t.Fatalf("PairwiseDistance failed: %v", err) + } + + if len(dist) != int(nX*nY) { + t.Errorf("Expected %d distances, got %d", nX*nY, len(dist)) + } + + // Expected results for L2Squared: + // dist[0,0] = (1-1)^2 + (0-0)^2 + (0-0)^2 = 0 + // dist[0,1] = (1-0)^2 + (0-1)^2 + (0-0)^2 = 2 + // dist[1,0] = (0-1)^2 + (1-0)^2 + (0-0)^2 = 2 + // dist[1,1] = (0-0)^2 + (1-1)^2 + (0-0)^2 = 0 + + expected := []float32{0.0, 2.0, 2.0, 0.0} + for i := 0; i < len(expected); i++ { + if dist[i] != expected[i] { + t.Errorf("Expected dist[%d] = %f, got %f", i, expected[i], dist[i]) + } + } +} diff --git a/pkg/cuvs/get_centers_test.go b/pkg/cuvs/get_centers_test.go new file mode 100644 index 0000000000000..eedadfeeac28f --- /dev/null +++ b/pkg/cuvs/get_centers_test.go @@ -0,0 +1,137 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "testing" +) + +func testIvfFlatGetCenters[T VectorType](t *testing.T, name string) { + t.Run(name, func(t *testing.T) { + dimension := uint32(16) + n_vectors := uint64(1000) + dataset := make([]T, n_vectors*uint64(dimension)) + // Fill some data + for i := range dataset { + dataset[i] = T(i % 127) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 16 + index, err := NewGpuIvfFlat[T](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + t.Fatalf("Build failed: %v", err) + } + + nLists := index.GetNList() + centers, err := index.GetCenters(nLists) + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + + expectedLen := int(nLists * dimension) + if len(centers) != expectedLen { + t.Errorf("Expected centers length %d, got %d", expectedLen, len(centers)) + } + + // Check that centers are not all zeros (simple sanity check) + allZeros := true + for _, v := range centers { + if v != 0 { + allZeros = false + break + } + } + if allZeros { + t.Errorf("Centers are all zeros") + } + }) +} + +func TestIvfFlatGetCentersAllTypes(t *testing.T) { + testIvfFlatGetCenters[float32](t, "float32") + testIvfFlatGetCenters[Float16](t, "Float16") + // testIvfFlatGetCenters[int8](t, "int8") + // testIvfFlatGetCenters[uint8](t, "uint8") +} + +func testIvfPqGetCenters[T 
VectorType](t *testing.T, name string) { + t.Run(name, func(t *testing.T) { + dimension := uint32(16) + n_vectors := uint64(1000) + dataset := make([]T, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = T(i % 127) + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 16 + bp.M = 8 + index, err := NewGpuIvfPq[T](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + t.Fatalf("Build failed: %v", err) + } + + centers, err := index.GetCenters() + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + + nLists := index.GetNList() + rotDim := index.GetRotDim() + expectedLen := int(nLists * rotDim) + if len(centers) != expectedLen { + t.Errorf("Expected centers length %d, got %d", expectedLen, len(centers)) + } + + allZeros := true + for _, v := range centers { + if v != 0 { + allZeros = false + break + } + } + if allZeros { + t.Errorf("Centers are all zeros") + } + }) +} + +func TestIvfPqGetCentersAllTypes(t *testing.T) { + testIvfPqGetCenters[float32](t, "float32") + testIvfPqGetCenters[Float16](t, "Float16") + // testIvfPqGetCenters[int8](t, "int8") + // testIvfPqGetCenters[uint8](t, "uint8") +} diff --git a/pkg/cuvs/helper.go b/pkg/cuvs/helper.go new file mode 100644 index 0000000000000..1b00267be4d67 --- /dev/null +++ b/pkg/cuvs/helper.go @@ -0,0 +1,256 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/helper.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// DistanceType maps to C.distance_type_t +type DistanceType C.distance_type_t + +const ( + L2Expanded DistanceType = C.DistanceType_L2Expanded + L2SqrtExpanded DistanceType = C.DistanceType_L2SqrtExpanded + CosineExpanded DistanceType = C.DistanceType_CosineExpanded + L1 DistanceType = C.DistanceType_L1 + L2Unexpanded DistanceType = C.DistanceType_L2Unexpanded + L2SqrtUnexpanded DistanceType = C.DistanceType_L2SqrtUnexpanded + InnerProduct DistanceType = C.DistanceType_InnerProduct + Linf DistanceType = C.DistanceType_Linf + Canberra DistanceType = C.DistanceType_Canberra + LpUnexpanded DistanceType = C.DistanceType_LpUnexpanded + CorrelationExpanded DistanceType = C.DistanceType_CorrelationExpanded + JaccardExpanded DistanceType = C.DistanceType_JaccardExpanded + HellingerExpanded DistanceType = C.DistanceType_HellingerExpanded + Haversine DistanceType = C.DistanceType_Haversine + BrayCurtis DistanceType = C.DistanceType_BrayCurtis + JensenShannon DistanceType = C.DistanceType_JensenShannon + HammingUnexpanded DistanceType = C.DistanceType_HammingUnexpanded + KLDivergence DistanceType = C.DistanceType_KLDivergence + RusselRaoExpanded DistanceType = C.DistanceType_RusselRaoExpanded + DiceExpanded DistanceType = C.DistanceType_DiceExpanded + BitwiseHamming DistanceType = C.DistanceType_BitwiseHamming + Precomputed DistanceType = C.DistanceType_Precomputed + // Aliases + 
CosineSimilarity DistanceType = C.DistanceType_CosineSimilarity + Jaccard DistanceType = C.DistanceType_Jaccard + Hamming DistanceType = C.DistanceType_Hamming + Unknown DistanceType = C.DistanceType_Unknown +) + +// Quantization maps to C.quantization_t +type Quantization C.quantization_t + +const ( + F32 Quantization = C.Quantization_F32 + F16 Quantization = C.Quantization_F16 + INT8 Quantization = C.Quantization_INT8 + UINT8 Quantization = C.Quantization_UINT8 +) + +// DistributionMode maps to C.distribution_mode_t +type DistributionMode C.distribution_mode_t + +const ( + SingleGpu DistributionMode = C.DistributionMode_SINGLE_GPU + Sharded DistributionMode = C.DistributionMode_SHARDED + Replicated DistributionMode = C.DistributionMode_REPLICATED +) + +// CagraBuildParams maps to C.cagra_build_params_t +type CagraBuildParams struct { + IntermediateGraphDegree uint64 + GraphDegree uint64 + AttachDatasetOnBuild bool +} + +func DefaultCagraBuildParams() CagraBuildParams { + return CagraBuildParams{ + IntermediateGraphDegree: 128, + GraphDegree: 64, + AttachDatasetOnBuild: true, + } +} + +// CagraSearchParams maps to C.cagra_search_params_t +type CagraSearchParams struct { + ItopkSize uint64 + SearchWidth uint64 +} + +func DefaultCagraSearchParams() CagraSearchParams { + return CagraSearchParams{ + ItopkSize: 64, + SearchWidth: 1, + } +} + +// IvfFlatBuildParams maps to C.ivf_flat_build_params_t +type IvfFlatBuildParams struct { + NLists uint32 + AddDataOnBuild bool + KmeansTrainsetFraction float64 +} + +func DefaultIvfFlatBuildParams() IvfFlatBuildParams { + return IvfFlatBuildParams{ + NLists: 1024, + AddDataOnBuild: true, + KmeansTrainsetFraction: 0.5, + } +} + +// IvfFlatSearchParams maps to C.ivf_flat_search_params_t +type IvfFlatSearchParams struct { + NProbes uint32 +} + +func DefaultIvfFlatSearchParams() IvfFlatSearchParams { + return IvfFlatSearchParams{ + NProbes: 20, + } +} + +// IvfPqBuildParams maps to C.ivf_pq_build_params_t +type IvfPqBuildParams 
struct { + NLists uint32 + M uint32 + BitsPerCode uint32 + AddDataOnBuild bool + KmeansTrainsetFraction float64 +} + +func DefaultIvfPqBuildParams() IvfPqBuildParams { + return IvfPqBuildParams{ + NLists: 1024, + M: 16, + BitsPerCode: 8, + AddDataOnBuild: true, + KmeansTrainsetFraction: 0.5, + } +} + +// IvfPqSearchParams maps to C.ivf_pq_search_params_t +type IvfPqSearchParams struct { + NProbes uint32 +} + +func DefaultIvfPqSearchParams() IvfPqSearchParams { + return IvfPqSearchParams{ + NProbes: 20, + } +} + +// Float16 is a 16-bit floating point type (IEEE 754-2008). +// Go does not have a native float16 type, so we use uint16 to represent its memory layout. +type Float16 uint16 + +// VectorType is a constraint for types that can be used as vector data. +type VectorType interface { + float32 | Float16 | int8 | uint8 +} + +// GpuIndex is an interface for all GPU-accelerated indexes. +type GpuIndex interface { + Start() error + Build() error + Destroy() error + Info() (string, error) +} + +// GetQuantization returns the Quantization enum for a given VectorType. +func GetQuantization[T VectorType]() Quantization { + var zero T + switch any(zero).(type) { + case float32: + return F32 + case Float16: + return F16 + case int8: + return INT8 + case uint8: + return UINT8 + default: + panic("unsupported vector type") + } +} + +// GpuConvertF32ToF16 converts a float32 slice to a Float16 slice using the GPU. 
+func GpuConvertF32ToF16(src []float32, dst []Float16, deviceID int) error { + if len(src) == 0 { + return nil + } + if len(src) != len(dst) { + return moerr.NewInternalErrorNoCtx("source and destination slices must have the same length") + } + + var errmsg *C.char + C.gpu_convert_f32_to_f16( + (*C.float)(unsafe.Pointer(&src[0])), + unsafe.Pointer(&dst[0]), + C.uint64_t(len(src)), + C.int(deviceID), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(src) + runtime.KeepAlive(dst) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetGpuDeviceCount returns the number of available CUDA devices. +func GetGpuDeviceCount() (int, error) { + count := int(C.gpu_get_device_count()) + if count < 0 { + return 0, moerr.NewInternalErrorNoCtx("failed to get GPU device count") + } + return count, nil +} + +// GetGpuDeviceList returns a slice of available CUDA device IDs. +func GetGpuDeviceList() ([]int, error) { + count, err := GetGpuDeviceCount() + if err != nil { + return nil, err + } + if count == 0 { + return []int{}, nil + } + + cDevices := make([]C.int, count) + actualCount := int(C.gpu_get_device_list(&cDevices[0], C.int(count))) + + devices := make([]int, actualCount) + for i := 0; i < actualCount; i++ { + devices[i] = int(cDevices[i]) + } + runtime.KeepAlive(cDevices) + return devices, nil +} diff --git a/pkg/cuvs/helper_test.go b/pkg/cuvs/helper_test.go new file mode 100644 index 0000000000000..1b4def55e94a5 --- /dev/null +++ b/pkg/cuvs/helper_test.go @@ -0,0 +1,48 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "testing" +) + +func TestGpuHelpers(t *testing.T) { + count, err := GetGpuDeviceCount() + if err != nil { + t.Fatalf("GetGpuDeviceCount failed: %v", err) + } + t.Logf("GPU Device Count: %d", count) + + devices, err := GetGpuDeviceList() + if err != nil { + t.Fatalf("GetGpuDeviceList failed: %v", err) + } + t.Logf("GPU Device List: %v", devices) +} + +func TestGpuConvertF32ToF16(t *testing.T) { + src := []float32{1.0, 2.0, 3.0, 4.0} + deviceID := 0 + + // Test conversion to F16 + dstF16 := make([]Float16, len(src)) + if err := GpuConvertF32ToF16(src, dstF16, deviceID); err != nil { + t.Fatalf("GpuConvertF32ToF16 failed: %v", err) + } + // We can't easily verify the value without a float16 decoder, + // but we can check it didn't error. +} diff --git a/pkg/cuvs/info_test.go b/pkg/cuvs/info_test.go new file mode 100644 index 0000000000000..b52b647aec8ba --- /dev/null +++ b/pkg/cuvs/info_test.go @@ -0,0 +1,212 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "encoding/json" + "fmt" + "math/rand" + "testing" +) + +type commonInfo struct { + ElementSize int `json:"element_size"` + Dimension int `json:"dimension"` + Metric int `json:"metric"` + Status string `json:"status"` + Capacity int `json:"capacity"` + CurrentLength int `json:"current_length"` + Devices []int `json:"devices"` + Type string `json:"type"` +} + +func verifyCommonInfo(t *testing.T, infoStr string, expectedType string, expectedDim int, expectedElemSize int) { + var info commonInfo + err := json.Unmarshal([]byte(infoStr), &info) + if err != nil { + t.Fatalf("Failed to parse info JSON: %v\nJSON: %s", err, infoStr) + } + + if info.Type != expectedType { + t.Errorf("Expected type %s, got %s", expectedType, info.Type) + } + if info.Dimension != expectedDim { + t.Errorf("Expected dimension %d, got %d", expectedDim, info.Dimension) + } + if info.ElementSize != expectedElemSize { + t.Errorf("Expected element size %d, got %d", expectedElemSize, info.ElementSize) + } + if info.Status != "Loaded" { + t.Errorf("Expected status Loaded, got %s", info.Status) + } +} + +func TestIndexInfoComprehensive(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil { + t.Fatalf("Failed to get GPU devices: %v", err) + } + if len(devices) == 0 { + t.Skip("No GPU devices available") + } + + dimension := uint32(128) + n_vectors := uint64(10000) + + // Test combinations of Index Type, Distribution Mode, and Data Type + + testCases := []struct { + indexType string + distMode DistributionMode + modeName string + }{ + {"CAGRA", SingleGpu, "SingleGPU"}, + {"CAGRA", Sharded, "Sharded"}, + {"CAGRA", Replicated, "Replicated"}, + {"IVF-Flat", SingleGpu, "SingleGPU"}, + {"IVF-Flat", Sharded, "Sharded"}, + {"IVF-Flat", Replicated, "Replicated"}, + {"IVF-PQ", SingleGpu, "SingleGPU"}, + {"IVF-PQ", Sharded, "Sharded"}, + {"IVF-PQ", Replicated, 
"Replicated"}, + } + + runTest := func(t *testing.T, indexType string, distMode DistributionMode, modeName string, dataType string) { + name := fmt.Sprintf("%s/%s/%s", indexType, modeName, dataType) + t.Run(name, func(t *testing.T) { + var index GpuIndex + var err error + var elemSize int + + // We use a large dataset + switch dataType { + case "float32": + dataset := GenerateRandomDataset(n_vectors, dimension) + elemSize = 4 + switch indexType { + case "CAGRA": + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + case "Float16": + dataset := make([]Float16, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = Float16(rand.Uint32()) + } + elemSize = 2 + switch indexType { + case "CAGRA": + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[Float16](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[Float16](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[Float16](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + case "int8": + dataset := make([]int8, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = int8(rand.Intn(256) - 128) + } + elemSize = 1 + switch indexType { + case "CAGRA": + 
bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[int8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[int8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[int8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + case "uint8": + dataset := make([]uint8, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = uint8(rand.Intn(256)) + } + elemSize = 1 + switch indexType { + case "CAGRA": + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[uint8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[uint8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[uint8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + } + + if err != nil { + t.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Failed to start index: %v", err) + } + if err := index.Build(); err != nil { + t.Fatalf("Failed to build index: %v", err) + } + + infoStr, err := index.Info() + if err != nil { + t.Fatalf("Failed to get info: %v", err) + } + + verifyCommonInfo(t, infoStr, indexType, int(dimension), elemSize) + }) + } + + dataTypes := []string{"float32", "Float16", "int8", "uint8"} + + for _, tc := range testCases { + for _, dt := range dataTypes { + runTest(t, tc.indexType, tc.distMode, tc.modeName, dt) + } + } +} diff --git 
a/pkg/cuvs/ivf_flat.go b/pkg/cuvs/ivf_flat.go new file mode 100644 index 0000000000000..0741b4b52eb2c --- /dev/null +++ b/pkg/cuvs/ivf_flat.go @@ -0,0 +1,593 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/ivf_flat_c.h" +#include +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// GpuIvfFlat represents the C++ gpu_ivf_flat_t object. +type GpuIvfFlat[T VectorType] struct { + cIvfFlat C.gpu_ivf_flat_c + dimension uint32 + nthread uint32 + distMode DistributionMode + useBatching bool +} + +// SetUseBatching enables or disables dynamic batching for search operations. +func (gi *GpuIvfFlat[T]) SetUseBatching(enable bool) error { + gi.useBatching = enable + if gi.cIvfFlat != nil { + var errmsg *C.char + C.gpu_ivf_flat_set_use_batching(gi.cIvfFlat, C.bool(enable), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + return nil +} + +// NewGpuIvfFlat creates a new GpuIvfFlat instance from a dataset. 
+func NewGpuIvfFlat[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfFlat") + } + + return &GpuIvfFlat[T]{ + cIvfFlat: cIvfFlat, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuIvfFlatFromFile creates a new GpuIvfFlat instance by loading from a file. 
+func NewGpuIvfFlatFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuIvfFlat from file") + } + + return &GpuIvfFlat[T]{ + cIvfFlat: cIvfFlat, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// Destroy frees the C++ gpu_ivf_flat_t instance +func (gi *GpuIvfFlat[T]) Destroy() error { + if gi.cIvfFlat == nil { + return nil + } + var errmsg *C.char + C.gpu_ivf_flat_destroy(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + gi.cIvfFlat = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gi *GpuIvfFlat[T]) Start() error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + + 
if gi.distMode == Replicated && gi.nthread > 1 { + var errmsg *C.char + C.gpu_ivf_flat_set_per_thread_device(gi.cIvfFlat, C.bool(true), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + + if gi.useBatching { + if err := gi.SetUseBatching(true); err != nil { + return err + } + } + + var errmsg *C.char + C.gpu_ivf_flat_start(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the build or file loading process +func (gi *GpuIvfFlat[T]) Build() error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + C.gpu_ivf_flat_build(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// NewGpuIvfFlatEmpty creates a new GpuIvfFlat instance with pre-allocated buffer but no data yet. 
+func NewGpuIvfFlatEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create empty GpuIvfFlat") + } + + return &GpuIvfFlat[T]{ + cIvfFlat: cIvfFlat, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gi *GpuIvfFlat[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_flat_add_chunk( + gi.cIvfFlat, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly quantization if needed. +func (gi *GpuIvfFlat[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_flat_add_chunk_float( + gi.cIvfFlat, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the scalar quantizer (if T is 1-byte) +func (gi *GpuIvfFlat[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_flat_train_quantizer( + gi.cIvfFlat, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfFlat[T]) 
SetQuantizer(min, max float32) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + + var errmsg *C.char + C.gpu_ivf_flat_set_quantizer( + gi.cIvfFlat, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfFlat[T]) GetQuantizer() (float32, float32, error) { + if gi.cIvfFlat == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_ivf_flat_get_quantizer( + gi.cIvfFlat, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// Save serializes the index to a file +func (gi *GpuIvfFlat[T]) Save(filename string) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_ivf_flat_save(gi.cIvfFlat, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gi *GpuIvfFlat[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp IvfFlatSearchParams) (SearchResultIvfFlat, error) { + if gi.cIvfFlat == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfFlat{}, nil + } + + var errmsg *C.char + cSP := C.ivf_flat_search_params_t{ + 
n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_flat_search( + gi.cIvfFlat, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_flat_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_flat_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_flat_free_result(res.result_ptr) + + return SearchResultIvfFlat{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// SearchFloat performs a K-Nearest Neighbor search with float32 queries +func (gi *GpuIvfFlat[T]) SearchFloat(queries []float32, numQueries uint64, dimension uint32, limit uint32, sp IvfFlatSearchParams) (SearchResultIvfFlat, error) { + if gi.cIvfFlat == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfFlat{}, nil + } + + var errmsg *C.char + cSP := C.ivf_flat_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_flat_search_float( + gi.cIvfFlat, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + 
return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_flat_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_flat_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_flat_free_result(res.result_ptr) + + return SearchResultIvfFlat{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Cap returns the capacity of the index buffer +func (gi *GpuIvfFlat[T]) Cap() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_cap(gi.cIvfFlat)) +} + +// Len returns current number of vectors in index +func (gi *GpuIvfFlat[T]) Len() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_len(gi.cIvfFlat)) +} + +// Info returns detailed information about the index as a JSON string. +func (gi *GpuIvfFlat[T]) Info() (string, error) { + if gi.cIvfFlat == nil { + return "", moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_ivf_flat_info(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// GetCenters retrieves the trained centroids. 
+func (gi *GpuIvfFlat[T]) GetCenters(nLists uint32) ([]T, error) { + if gi.cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + centers := make([]T, nLists*gi.dimension) + var errmsg *C.char + C.gpu_ivf_flat_get_centers(gi.cIvfFlat, unsafe.Pointer(&centers[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centers) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centers, nil +} + +// GetNList retrieves the number of lists (centroids) in the index. +func (gi *GpuIvfFlat[T]) GetNList() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_get_n_list(gi.cIvfFlat)) +} + +// SearchResultIvfFlat contains the neighbors and distances from an IVF-Flat search. +type SearchResultIvfFlat struct { + Neighbors []int64 + Distances []float32 +} diff --git a/pkg/cuvs/ivf_flat_test.go b/pkg/cuvs/ivf_flat_test.go new file mode 100644 index 0000000000000..f1e9b5e3a1d1e --- /dev/null +++ b/pkg/cuvs/ivf_flat_test.go @@ -0,0 +1,580 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cuvs + +import ( + "fmt" + "math/rand" + "os" + "testing" +) + +func TestGpuIvfFlat(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Failed to load/build GpuIvfFlat: %v", err) + } + + centers, err := index.GetCenters(10) + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + t.Logf("Centers: %v", centers[:4]) + + queries := []float32{1.0, 1.0, 100.0, 100.0} + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } + if result.Neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", result.Neighbors[1]) + } +} + +func TestGpuIvfFlatSaveLoad(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 2 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + index.Start() + index.Build() + + filename := "test_ivf_flat.idx" + err = index.Save(filename) + if err != nil { + 
t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuIvfFlatFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat from file: %v", err) + } + defer index2.Destroy() + + index2.Start() + err = index2.Build() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + queries := []float32{0.0, 0.0} + sp := DefaultIvfFlatSearchParams() + result, err := index2.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuShardedIvfFlat(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for sharded IVF-Flat test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded IVF-Flat: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultIvfFlatSearchParams() + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func TestGpuReplicatedIvfFlat(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for replicated 
IVF-Flat test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Replicated) + if err != nil { + t.Fatalf("Failed to create replicated IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load replicated failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultIvfFlatSearchParams() + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search replicated failed: %v", err) + } + t.Logf("Replicated Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func BenchmarkGpuShardedIvfFlat(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for sharded IVF-Flat benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Sharded) + if err != nil { + b.Fatalf("Failed to create sharded IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + for _, useBatching := range 
[]bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuSingleIvfFlat(b *testing.B) { + devices := []int{0} + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create single IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + 
} + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuReplicatedIvfFlat(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for replicated IVF-Flat benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Replicated) + if err != nil { + b.Fatalf("Failed to create replicated IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + 
return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuAddChunkAndSearchIvfFlatF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + // Use Float16 as internal type + index, err := NewGpuIvfFlatEmpty[Float16](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + +} + +func BenchmarkGpuAddChunkAndSearchIvfFlatInt8(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + 
for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + // Use int8 as internal type + index, err := NewGpuIvfFlatEmpty[int8](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + +} + +func TestGpuIvfFlatChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + + // Create empty index (target type int8) + index, err := NewGpuIvfFlatEmpty[int8](totalCount, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlatEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + 
if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Add data in chunks (from float32, triggers on-the-fly quantization) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) // 1.0 for first chunk, 101.0 for second + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + } + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search for first chunk + query1 := make([]int8, dimension) + for i := range query1 { + query1[i] = -128 // matches first chunk (1.0) + } + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + result1, err := index.Search(query1, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 1 failed: %v", err) + } + if result1.Neighbors[0] < 0 || result1.Neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", result1.Neighbors[0]) + } + + // Search for second chunk + query2 := make([]int8, dimension) + for i := range query2 { + query2[i] = 127 // matches second chunk (101.0) + } + result2, err := index.Search(query2, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 2 failed: %v", err) + } + if result2.Neighbors[0] < 50 || result2.Neighbors[0] >= 100 { + t.Errorf("Expected neighbor from second chunk (50-99), got %d", result2.Neighbors[0]) + } +} diff --git a/pkg/cuvs/ivf_pq.go b/pkg/cuvs/ivf_pq.go new file mode 100644 index 0000000000000..ae5165c390165 --- /dev/null +++ b/pkg/cuvs/ivf_pq.go @@ -0,0 +1,694 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/ivf_pq_c.h" +#include <stdlib.h> +#include <stdbool.h> +*/ +import "C" +import ( + "runtime" + "unsafe" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuIvfPq represents the C++ gpu_ivf_pq_t object. +type GpuIvfPq[T VectorType] struct { + cIvfPq C.gpu_ivf_pq_c + dimension uint32 + nthread uint32 + distMode DistributionMode + useBatching bool +} + +// SetUseBatching enables or disables dynamic batching for search operations. +func (gi *GpuIvfPq[T]) SetUseBatching(enable bool) error { + gi.useBatching = enable + if gi.cIvfPq != nil { + var errmsg *C.char + C.gpu_ivf_pq_set_use_batching(gi.cIvfPq, C.bool(enable), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + return nil +} + +// NewGpuIvfPq creates a new GpuIvfPq instance from a dataset. 
+func NewGpuIvfPq[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfPq") + } + + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuIvfPqFromDataFile creates a new GpuIvfPq instance from a MODF datafile. 
+func NewGpuIvfPqFromDataFile[T VectorType](datafilename string, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(datafilename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_new_from_data_file( + cFilename, + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfPq from data file") + } + + // dimension will be updated when GetDim() is called, but we can set it to 0 for now + // or ideally GetDim() should be used. + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: 0, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuIvfPqEmpty creates a new GpuIvfPq instance with pre-allocated buffer but no data yet. 
+func NewGpuIvfPqEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create empty GpuIvfPq") + } + + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gi *GpuIvfPq[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_pq_add_chunk( + gi.cIvfPq, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly quantization if needed. +func (gi *GpuIvfPq[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_pq_add_chunk_float( + gi.cIvfPq, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the scalar quantizer (if T is 1-byte) +func (gi *GpuIvfPq[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_pq_train_quantizer( + gi.cIvfPq, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfPq[T]) SetQuantizer(min, max float32) error { + if 
gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + + var errmsg *C.char + C.gpu_ivf_pq_set_quantizer( + gi.cIvfPq, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfPq[T]) GetQuantizer() (float32, float32, error) { + if gi.cIvfPq == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_ivf_pq_get_quantizer( + gi.cIvfPq, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// NewGpuIvfPqFromFile creates a new GpuIvfPq instance by loading from a file. 
+func NewGpuIvfPqFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuIvfPq from file") + } + + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// Destroy frees the C++ gpu_ivf_pq_t instance +func (gi *GpuIvfPq[T]) Destroy() error { + if gi.cIvfPq == nil { + return nil + } + var errmsg *C.char + C.gpu_ivf_pq_destroy(gi.cIvfPq, unsafe.Pointer(&errmsg)) + gi.cIvfPq = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gi *GpuIvfPq[T]) Start() error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is 
not initialized") + } + + if gi.distMode == Replicated && gi.nthread > 1 { + var errmsg *C.char + C.gpu_ivf_pq_set_per_thread_device(gi.cIvfPq, C.bool(true), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + + if gi.useBatching { + if err := gi.SetUseBatching(true); err != nil { + return err + } + } + + var errmsg *C.char + C.gpu_ivf_pq_start(gi.cIvfPq, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the build or file loading process +func (gi *GpuIvfPq[T]) Build() error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + var errmsg *C.char + C.gpu_ivf_pq_build(gi.cIvfPq, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Save serializes the index to a file +func (gi *GpuIvfPq[T]) Save(filename string) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_ivf_pq_save(gi.cIvfPq, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gi *GpuIvfPq[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp IvfPqSearchParams) (SearchResultIvfPq, error) { + if gi.cIvfPq == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfPq{}, nil + } + + var errmsg *C.char + cSP 
:= C.ivf_pq_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_pq_search( + gi.cIvfPq, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_pq_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_pq_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_pq_free_result(res.result_ptr) + + return SearchResultIvfPq{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// SearchFloat performs an IVF-PQ search operation with float32 queries +func (gi *GpuIvfPq[T]) SearchFloat(queries []float32, numQueries uint64, dimension uint32, limit uint32, sp IvfPqSearchParams) (SearchResultIvfPq, error) { + if gi.cIvfPq == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfPq{}, nil + } + + var errmsg *C.char + cSP := C.ivf_pq_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_pq_search_float( + gi.cIvfPq, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 
SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_pq_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_pq_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_pq_free_result(res.result_ptr) + + return SearchResultIvfPq{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Cap returns the capacity of the index buffer +func (gi *GpuIvfPq[T]) Cap() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_cap(gi.cIvfPq)) +} + +// Len returns current number of vectors in index +func (gi *GpuIvfPq[T]) Len() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_len(gi.cIvfPq)) +} + +// Info returns detailed information about the index as a JSON string. +func (gi *GpuIvfPq[T]) Info() (string, error) { + if gi.cIvfPq == nil { + return "", moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_ivf_pq_info(gi.cIvfPq, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// GetCenters retrieves the trained centroids. 
+func (gi *GpuIvfPq[T]) GetCenters() ([]T, error) { + if gi.cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + nList := gi.GetNList() + dim := gi.GetRotDim() + centers := make([]T, nList*dim) + var errmsg *C.char + C.gpu_ivf_pq_get_centers(gi.cIvfPq, unsafe.Pointer(¢ers[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centers) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centers, nil +} + +// GetNList retrieves the number of lists (centroids) in the index. +func (gi *GpuIvfPq[T]) GetNList() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_n_list(gi.cIvfPq)) +} + +// GetDim retrieves the dimension of the index. +func (gi *GpuIvfPq[T]) GetDim() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_dim(gi.cIvfPq)) +} + +// GetRotDim retrieves the rotated dimension of the index. +func (gi *GpuIvfPq[T]) GetRotDim() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_rot_dim(gi.cIvfPq)) +} + +// GetDimExt retrieves the extended dimension of the index (including norms and padding). +func (gi *GpuIvfPq[T]) GetDimExt() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_dim_ext(gi.cIvfPq)) +} + +// GetDataset retrieves the flattened host dataset (for debugging). +func (gi *GpuIvfPq[T]) GetDataset(totalElements uint64) []T { + if gi.cIvfPq == nil { + return nil + } + data := make([]T, totalElements) + C.gpu_ivf_pq_get_dataset(gi.cIvfPq, unsafe.Pointer(&data[0])) + return data +} + +// SearchResultIvfPq contains the neighbors and distances from an IVF-PQ search. 
+type SearchResultIvfPq struct { + Neighbors []int64 + Distances []float32 +} diff --git a/pkg/cuvs/ivf_pq_test.go b/pkg/cuvs/ivf_pq_test.go new file mode 100644 index 0000000000000..7998559210e64 --- /dev/null +++ b/pkg/cuvs/ivf_pq_test.go @@ -0,0 +1,605 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "fmt" + "math/rand" + "os" + "testing" +) + +func TestGpuIvfPq(t *testing.T) { + dimension := uint32(16) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + for j := uint32(0); j < dimension; j++ { + dataset[i*uint64(dimension)+uint64(j)] = float32(i) + } + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 8 // dimension 16 is divisible by 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + err = index.Build() + if err != nil { + t.Fatalf("Failed to load/build GpuIvfPq: %v", err) + } + + centers, err := index.GetCenters() + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + t.Logf("Centers count: %d, dim_ext: %d", len(centers)/int(index.GetDimExt()), index.GetDimExt()) + + query := make([]float32, dimension) + 
for i := uint32(0); i < dimension; i++ { + query[i] = 1.0 + } + sp := DefaultIvfPqSearchParams() + sp.NProbes = 5 + result, err := index.Search(query, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } +} + +func TestGpuIvfPqSaveLoad(t *testing.T) { + dimension := uint32(4) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i / int(dimension)) + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 2 + bp.M = 2 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq: %v", err) + } + index.Start() + index.Build() + + filename := "test_ivf_pq.idx" + err = index.Save(filename) + if err != nil { + t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuIvfPqFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq from file: %v", err) + } + defer index2.Destroy() + + err = index2.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + err = index2.Build() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + query := make([]float32, dimension) // all zeros + sp := DefaultIvfPqSearchParams() + result, err := index2.Search(query, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuIvfPqChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 4 + + // Create empty index 
(target type int8) + index, err := NewGpuIvfPqEmpty[int8](totalCount, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPqEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Add data in chunks (from float32, triggers on-the-fly quantization) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) // 1.0 for first chunk, 101.0 for second + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + } + + // Debug: check dataset + ds := index.GetDataset(totalCount * uint64(dimension)) + t.Logf("Dataset[0]: %v, Dataset[50*dim]: %v", ds[0], ds[50*uint64(dimension)]) + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search for first chunk + query1 := make([]int8, dimension) + for i := range query1 { + query1[i] = -128 // matches first chunk (1.0) + } + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + result1, err := index.Search(query1, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 1 failed: %v", err) + } + if result1.Neighbors[0] < 0 || result1.Neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", result1.Neighbors[0]) + } + + // Search for second chunk + query2 := make([]int8, dimension) + for i := range query2 { + query2[i] = 127 // matches second chunk (101.0) + } + result2, err := index.Search(query2, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 2 failed: %v", err) + } + if result2.Neighbors[0] < 50 || result2.Neighbors[0] >= 100 { + t.Errorf("Expected neighbor from second chunk (50-99), got %d", result2.Neighbors[0]) + } +} + +func TestGpuShardedIvfPq(t *testing.T) { + devices, err := 
GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for sharded IVF-PQ test") + } + + dimension := uint32(4) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + for j := uint32(0); j < dimension; j++ { + dataset[i*uint64(dimension)+uint64(j)] = float32(i) + } + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 2 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.1, 0.1, 10.1, 10.1, 10.1, 10.1} + sp := DefaultIvfPqSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func TestGpuReplicatedIvfPq(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for replicated IVF-PQ test") + } + + dimension := uint32(4) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + for j := uint32(0); j < dimension; j++ { + dataset[i*uint64(dimension)+uint64(j)] = float32(i) + } + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 2 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Replicated) + if err != nil { + t.Fatalf("Failed to create replicated IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + 
t.Fatalf("Load replicated failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.1, 0.1, 10.1, 10.1, 10.1, 10.1} + sp := DefaultIvfPqSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search replicated failed: %v", err) + } + t.Logf("Replicated Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func BenchmarkGpuShardedIvfPq(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for sharded IVF-PQ benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 128 // 1024 / 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Sharded) + if err != nil { + b.Fatalf("Failed to create sharded IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, 
dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuSingleIvfPq(b *testing.B) { + devices := []int{0} + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 128 // 1024 / 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create single IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuReplicatedIvfPq(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for replicated IVF-PQ benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, 
n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 128 // 1024 / 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Replicated) + if err != nil { + b.Fatalf("Failed to create replicated IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuAddChunkAndSearchIvfPqF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + // Use Float16 as internal type + index, err := NewGpuIvfPqEmpty[Float16](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer 
index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} + +func BenchmarkGpuAddChunkAndSearchIvfPqInt8(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + // Use int8 as internal type + index, err := NewGpuIvfPqEmpty[int8](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := 
index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} diff --git a/pkg/cuvs/kmeans.go b/pkg/cuvs/kmeans.go new file mode 100644 index 0000000000000..1c07ea350f2d0 --- /dev/null +++ b/pkg/cuvs/kmeans.go @@ -0,0 +1,381 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cuvs + +/* +#include "../../cgo/cuvs/kmeans_c.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// GpuKMeans represents the C++ gpu_kmeans_t object +type GpuKMeans[T VectorType] struct { + cKMeans C.gpu_kmeans_c + nClusters uint32 + dimension uint32 +} + +// NewGpuKMeans creates a new GpuKMeans instance +func NewGpuKMeans[T VectorType](nClusters uint32, dimension uint32, metric DistanceType, maxIter int, deviceID int, nthread uint32) (*GpuKMeans[T], error) { + qtype := GetQuantization[T]() + var errmsg *C.char + cKMeans := C.gpu_kmeans_new( + C.uint32_t(nClusters), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.int(maxIter), + C.int(deviceID), + C.uint32_t(nthread), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cKMeans == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuKMeans") + } + + return &GpuKMeans[T]{cKMeans: cKMeans, nClusters: nClusters, dimension: dimension}, nil +} + +// Destroy frees the C++ gpu_kmeans_t instance +func (gk *GpuKMeans[T]) Destroy() error { + if gk.cKMeans == nil { + return nil + } + var errmsg *C.char + C.gpu_kmeans_destroy(gk.cKMeans, unsafe.Pointer(&errmsg)) + gk.cKMeans = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gk *GpuKMeans[T]) Start() error { + if gk.cKMeans == nil { + return moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + var errmsg *C.char + C.gpu_kmeans_start(gk.cKMeans, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the 
scalar quantizer (if T is 1-byte) +func (gk *GpuKMeans[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gk.cKMeans == nil { + return moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_kmeans_train_quantizer( + gk.cKMeans, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gk *GpuKMeans[T]) SetQuantizer(min, max float32) error { + if gk.cKMeans == nil { + return moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + + var errmsg *C.char + C.gpu_kmeans_set_quantizer( + gk.cKMeans, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gk *GpuKMeans[T]) GetQuantizer() (float32, float32, error) { + if gk.cKMeans == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_kmeans_get_quantizer( + gk.cKMeans, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// Fit computes the cluster centroids +func (gk *GpuKMeans[T]) Fit(dataset []T, nSamples uint64) (float32, int64, error) { + if gk.cKMeans == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return 0, 0, nil + 
} + + var errmsg *C.char + res := C.gpu_kmeans_fit( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + return float32(res.inertia), int64(res.n_iter), nil +} + +// Predict assigns labels to new data based on existing centroids. +func (gk *GpuKMeans[T]) Predict(dataset []T, nSamples uint64) ([]int64, float32, error) { + if gk.cKMeans == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_predict( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), nil +} + +// PredictFloat assigns labels to new float32 data based on existing centroids. 
+func (gk *GpuKMeans[T]) PredictFloat(dataset []float32, nSamples uint64) ([]int64, float32, error) { + if gk.cKMeans == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_predict_float( + gk.cKMeans, + (*C.float)(unsafe.Pointer(&dataset[0])), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), nil +} + +// FitPredict performs both fitting and labeling in one step. 
+func (gk *GpuKMeans[T]) FitPredict(dataset []T, nSamples uint64) ([]int64, float32, int64, error) { + if gk.cKMeans == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_fit_predict( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("fit_predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), int64(res.n_iter), nil +} + +// FitPredictFloat performs both fitting and labeling in one step for float32 data. 
+func (gk *GpuKMeans[T]) FitPredictFloat(dataset []float32, nSamples uint64) ([]int64, float32, int64, error) { + if gk.cKMeans == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_fit_predict_float( + gk.cKMeans, + (*C.float)(unsafe.Pointer(&dataset[0])), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("fit_predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), int64(res.n_iter), nil +} + +// GetCentroids retrieves the trained centroids. +func (gk *GpuKMeans[T]) GetCentroids() ([]T, error) { + if gk.cKMeans == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + centroids := make([]T, gk.nClusters*gk.dimension) + var errmsg *C.char + C.gpu_kmeans_get_centroids(gk.cKMeans, unsafe.Pointer(¢roids[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centroids) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centroids, nil +} + +// Info returns detailed information about the index as a JSON string. 
+func (gk *GpuKMeans[T]) Info() (string, error) { + if gk.cKMeans == nil { + return "", moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_kmeans_info(gk.cKMeans, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} diff --git a/pkg/cuvs/kmeans_test.go b/pkg/cuvs/kmeans_test.go new file mode 100644 index 0000000000000..a14044ac6a6ca --- /dev/null +++ b/pkg/cuvs/kmeans_test.go @@ -0,0 +1,172 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cuvs + +import ( + "fmt" + "testing" +) + +func TestGpuKMeans_Float32(t *testing.T) { + nClusters := uint32(3) + dimension := uint32(2) + nSamples := uint64(9) + + // Create 3 clusters + dataset := []float32{ + 0.1, 0.1, 0.0, 0.2, 0.2, 0.0, // Cluster 0 + 10.1, 10.1, 10.0, 10.2, 10.2, 10.0, // Cluster 1 + 20.1, 20.1, 20.0, 20.2, 20.2, 20.0, // Cluster 2 + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[float32](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + inertia, nIter, err := kmeans.Fit(dataset, nSamples) + if err != nil { + t.Fatalf("Fit failed: %v", err) + } + fmt.Printf("Fit: inertia=%f, nIter=%d\n", inertia, nIter) + + labels, pInertia, err := kmeans.Predict(dataset, nSamples) + if err != nil { + t.Fatalf("Predict failed: %v", err) + } + fmt.Printf("Predict labels: %v, inertia=%f\n", labels, pInertia) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } + + // Since we use balanced_params, it might prioritize balancing cluster sizes over spatial distance + // on very small datasets. We just check that all labels are within range [0, nClusters). 
+ for i, l := range labels { + if l < 0 || l >= int64(nClusters) { + t.Errorf("Label at index %d is out of range: %d", i, l) + } + } + + centroids, err := kmeans.GetCentroids() + if err != nil { + t.Fatalf("GetCentroids failed: %v", err) + } + if len(centroids) != int(nClusters*dimension) { + t.Errorf("Expected %d centroid elements, got %d", nClusters*dimension, len(centroids)) + } +} + +func TestGpuKMeans_FitPredict_Float16(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(4) + nSamples := uint64(10) + + dataset := make([]float32, nSamples*uint64(dimension)) + for i := range dataset { + dataset[i] = 0.5 + } + + // Convert to F16 + datasetF16 := make([]Float16, len(dataset)) + err := GpuConvertF32ToF16(dataset, datasetF16, 0) + if err != nil { + t.Fatalf("F32 to F16 conversion failed: %v", err) + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[Float16](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + labels, inertia, nIter, err := kmeans.FitPredict(datasetF16, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("FitPredict: inertia=%f, nIter=%d\n", inertia, nIter) + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } +} + +func TestGpuKMeans_Int8(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(2) + nSamples := uint64(4) + + dataset := []int8{ + 0, 0, + 1, 1, + 10, 10, + 11, 11, + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[int8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + labels, _, _, err := kmeans.FitPredict(dataset, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("Int8 Predict labels: %v\n", labels) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d 
labels, got %d", nSamples, len(labels)) + } +} + +func TestGpuKMeans_Uint8(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(2) + nSamples := uint64(4) + + dataset := []uint8{ + 0, 0, + 1, 1, + 10, 10, + 11, 11, + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[uint8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + labels, _, _, err := kmeans.FitPredict(dataset, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("Uint8 Predict labels: %v\n", labels) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } +} diff --git a/pkg/cuvs/recall_test.go b/pkg/cuvs/recall_test.go new file mode 100644 index 0000000000000..e5c6676531d5a --- /dev/null +++ b/pkg/cuvs/recall_test.go @@ -0,0 +1,76 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + //"fmt" + "math/rand" + "testing" +) + +type NeighborType interface { + uint32 | int64 +} + +// GenerateRandomDataset generates a random float32 dataset. +func GenerateRandomDataset(n_vectors uint64, dimension uint32) []float32 { + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + return dataset +} + +// ReportRecall reports the self-recall for an index. 
+// It verifies that querying with a point already in the index returns that point's ID. +func ReportRecall[T NeighborType](b *testing.B, dataset []float32, n_vectors uint64, dimension uint32, limit uint32, searchFunc func(queries []float32, numQueries uint64, limit uint32) ([]T, error)) { + numQueries := uint64(100) + if n_vectors < numQueries { + numQueries = n_vectors + } + + // Use the first numQueries vectors from the dataset as queries. + // Since these are the first vectors, we expect their IDs to be 0, 1, 2, ..., numQueries-1. + recallQueries := dataset[:numQueries*uint64(dimension)] + + // Search approximate index + approxNeighbors, err := searchFunc(recallQueries, numQueries, limit) + if err != nil { + b.Logf("Warning: Approximate search failed: %v", err) + return + } + + hitCount := 0 + for i := uint64(0); i < numQueries; i++ { + // For query i (which is dataset[i]), we expect ID 'i' to be in the results + expectedID := int64(i) + found := false + for j := uint32(0); j < limit; j++ { + if int64(approxNeighbors[i*uint64(limit)+uint64(j)]) == expectedID { + found = true + break + } + } + if found { + hitCount++ + } + } + + recall := float64(hitCount) / float64(numQueries) + //fmt.Printf("Benchmark %s: self_recall_at_%d = %.4f\n", b.Name(), int(limit), recall) + b.ReportMetric(recall*float64(b.N), "recall") +} diff --git a/pkg/cuvs/search_float_test.go b/pkg/cuvs/search_float_test.go new file mode 100644 index 0000000000000..2abdef34c37cc --- /dev/null +++ b/pkg/cuvs/search_float_test.go @@ -0,0 +1,169 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "testing" +) + +func TestGpuSearchFloatAll(t *testing.T) { + dimension := uint32(8) + n_vectors := uint64(100) + deviceID := 0 + + // 1. Test IVF-PQ SearchFloat (with int8 quantization) + t.Run("IVF-PQ", func(t *testing.T) { + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i % 10) + } + bp := IvfPqBuildParams{NLists: 10, M: 4, BitsPerCode: 8, AddDataOnBuild: true} + // Create empty index + index, err := NewGpuIvfPqEmpty[int8](n_vectors, dimension, L2Expanded, bp, []int{deviceID}, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create IVF-PQ: %v", err) + } + defer index.Destroy() + index.Start() + + // Explicitly train quantizer before adding data + err = index.TrainQuantizer(dataset[:dimension*10], 10) + if err != nil { + t.Fatalf("TrainQuantizer failed: %v", err) + } + + err = index.AddChunkFloat(dataset, n_vectors) + if err != nil { + t.Fatalf("AddChunkFloat failed: %v", err) + } + index.Build() + + queries := make([]float32, 2*uint64(dimension)) + for i := range queries { + queries[i] = float32(i % 10) + } + res, err := index.SearchFloat(queries, 2, dimension, 1, IvfPqSearchParams{NProbes: 1}) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(res.Neighbors) != 2 { + t.Errorf("Expected 2 neighbors, got %d", len(res.Neighbors)) + } + }) + + // 2. 
Test IVF-Flat SearchFloat (with half quantization) + t.Run("IVF-Flat", func(t *testing.T) { + dataset := make([]Float16, n_vectors*uint64(dimension)) + bp := IvfFlatBuildParams{NLists: 10, AddDataOnBuild: true} + index, err := NewGpuIvfFlat[Float16](dataset, n_vectors, dimension, L2Expanded, bp, []int{deviceID}, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create IVF-Flat: %v", err) + } + defer index.Destroy() + index.Start() + index.Build() + + queries := make([]float32, uint64(dimension)) + res, err := index.SearchFloat(queries, 1, dimension, 1, IvfFlatSearchParams{NProbes: 1}) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(res.Neighbors) != 1 { + t.Errorf("Expected 1 neighbor, got %d", len(res.Neighbors)) + } + }) + + // 3. Test CAGRA SearchFloat (with float32) + t.Run("CAGRA", func(t *testing.T) { + dataset := make([]float32, n_vectors*uint64(dimension)) + bp := CagraBuildParams{IntermediateGraphDegree: 64, GraphDegree: 32, AttachDatasetOnBuild: true} + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, []int{deviceID}, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create CAGRA: %v", err) + } + defer index.Destroy() + index.Start() + index.Build() + + queries := make([]float32, uint64(dimension)) + res, err := index.SearchFloat(queries, 1, dimension, 1, CagraSearchParams{ItopkSize: 64, SearchWidth: 1}) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(res.Neighbors) != 1 { + t.Errorf("Expected 1 neighbor, got %d", len(res.Neighbors)) + } + }) + + // 4. 
Test Brute-Force SearchFloat (with half) + t.Run("Brute-Force", func(t *testing.T) { + dataset := make([]Float16, n_vectors*uint64(dimension)) + index, err := NewGpuBruteForce[Float16](dataset, n_vectors, dimension, L2Expanded, 1, deviceID) + if err != nil { + t.Fatalf("Failed to create Brute-Force: %v", err) + } + defer index.Destroy() + index.Start() + index.Build() + + queries := make([]float32, uint64(dimension)) + neighbors, _, err := index.SearchFloat(queries, 1, dimension, 1) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(neighbors) != 1 { + t.Errorf("Expected 1 neighbor, got %d", len(neighbors)) + } + }) + + // 5. Test KMeans PredictFloat (with uint8) + t.Run("KMeans", func(t *testing.T) { + nClusters := uint32(5) + km, err := NewGpuKMeans[uint8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create KMeans: %v", err) + } + defer km.Destroy() + km.Start() + + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i % 10) + } + + // Explicitly train quantizer + err = km.TrainQuantizer(dataset[:dimension*10], 10) + if err != nil { + t.Fatalf("TrainQuantizer failed: %v", err) + } + + // FitPredictFloat + labels, _, _, err := km.FitPredictFloat(dataset, n_vectors) + if err != nil { + t.Fatalf("FitPredictFloat failed: %v", err) + } + + queries := make([]float32, 2*uint64(dimension)) + labels, _, err = km.PredictFloat(queries, 2) + if err != nil { + t.Fatalf("PredictFloat failed: %v", err) + } + if len(labels) != 2 { + t.Errorf("Expected 2 labels, got %d", len(labels)) + } + }) +} diff --git a/pkg/vectorindex/brute_force/benchmark_test.go b/pkg/vectorindex/brute_force/benchmark_test.go new file mode 100644 index 0000000000000..bfa2782154525 --- /dev/null +++ b/pkg/vectorindex/brute_force/benchmark_test.go @@ -0,0 +1,105 @@ +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use 
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/mpool" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" +) + +func benchmarkBruteForceGeneric(b *testing.B, dsize, qsize int, dimension uint, ncpu uint, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + b.Helper() + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(b, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + limit := uint(10) + elemsz := uint(4) // float32 + + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := make([][]float32, qsize) + for i := range query { + query[i] = make([]float32, dimension) + for j := range query[i] { + query[i][j] = rand.Float32() + } + } + + idx, err := createFn(dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) + if err != nil { + b.Fatal(err) + } + defer idx.Destroy() + + err = idx.Load(sqlproc) + if err != nil { + b.Fatal(err) + } + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _, err := idx.Search(sqlproc, query, rt) + if err != nil 
{ + b.Fatal(err) + } + } +} + +func benchmarkBruteForce(b *testing.B, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + benchmarkBruteForceGeneric(b, 10000, 100, 1024, 8, createFn) +} + +func benchmarkCentroidSearch(b *testing.B, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + benchmarkBruteForceGeneric(b, 18000, 1, 1024, 1, createFn) +} + +func BenchmarkGoBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGoBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkUsearchBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewUsearchBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchGoBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGoBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchUsearchBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewUsearchBruteForceIndex[float32](dataset, dim, m, es) + }) +} diff --git a/pkg/vectorindex/brute_force/brute_force.go b/pkg/vectorindex/brute_force/brute_force.go index bdf217dd75433..84b529b04bbdf 100644 --- a/pkg/vectorindex/brute_force/brute_force.go +++ b/pkg/vectorindex/brute_force/brute_force.go @@ -136,44 +136,73 @@ func NewUsearchBruteForceIndex[T types.RealNumbers](dataset [][]T, return idx, nil } +func NewUsearchBruteForceIndexFlattened[T types.RealNumbers](dataset []T, + count uint, + dimension uint, + m metric.MetricType, + elemsz 
uint) (cache.VectorIndexSearchIf, error) { + var err error + + idx := &UsearchBruteForceIndex[T]{} + idx.Metric = metric.MetricTypeToUsearchMetric[m] + idx.Quantization, err = GetUsearchQuantizationFromType(T(0)) + if err != nil { + return nil, err + } + idx.Dimension = dimension + idx.Count = count + idx.ElementSize = elemsz + idx.Dataset = &dataset + + return idx, nil +} + func (idx *UsearchBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { return nil } func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (keys any, distances []float64, err error) { - queries, ok := _queries.([][]T) - if !ok { - return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") - } - var flatten []T var queryDeallocator malloc.Deallocator - - reqSize := len(queries) * int(idx.Dimension) - allocator := malloc.NewCAllocator() - var _t T - switch any(_t).(type) { - case float32: - slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) - if err2 != nil { - return nil, nil, err2 + var nQueries int + + switch queries := _queries.(type) { + case []T: + flatten = queries + nQueries = len(queries) / int(idx.Dimension) + case [][]T: + if len(queries) == 0 { + return nil, nil, nil } - queryDeallocator = dealloc - f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) - flatten = any(f32Slice).([]T) - case float64: - slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*8, malloc.NoClear) - if err2 != nil { - return nil, nil, err2 + nQueries = len(queries) + reqSize := nQueries * int(idx.Dimension) + allocator := malloc.NewCAllocator() + var _t T + switch any(_t).(type) { + case float32: + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + flatten = any(f32Slice).([]T) + case float64: + slice, dealloc, err2 := 
allocator.Allocate(uint64(reqSize)*8, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f64Slice := util.UnsafeSliceCastToLength[float64](slice, reqSize) + flatten = any(f64Slice).([]T) } - queryDeallocator = dealloc - f64Slice := util.UnsafeSliceCastToLength[float64](slice, reqSize) - flatten = any(f64Slice).([]T) - } - for i := 0; i < len(queries); i++ { - offset := i * int(idx.Dimension) - copy(flatten[offset:], queries[i]) + for i := 0; i < nQueries; i++ { + offset := i * int(idx.Dimension) + copy(flatten[offset:], queries[i]) + } + default: + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } if queryDeallocator != nil { @@ -191,7 +220,7 @@ func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries util.UnsafePointer(&((*idx.Dataset)[0])), util.UnsafePointer(&(flatten[0])), uint(idx.Count), - uint(len(queries)), + uint(nQueries), idx.Dimension*idx.ElementSize, idx.Dimension*idx.ElementSize, idx.Dimension, diff --git a/pkg/vectorindex/brute_force/cpu.go b/pkg/vectorindex/brute_force/cpu.go index b5c65f96cf614..c403cbb9c5181 100644 --- a/pkg/vectorindex/brute_force/cpu.go +++ b/pkg/vectorindex/brute_force/cpu.go @@ -30,3 +30,20 @@ func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) } + +func NewAdhocBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + return NewUsearchBruteForceIndex[T](dataset, dimension, m, elemsz) +} + +func NewAdhocBruteForceIndexFlattened[T types.RealNumbers](dataset []T, + count uint, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + return NewUsearchBruteForceIndexFlattened[T](dataset, count, dimension, m, elemsz) +} diff --git a/pkg/vectorindex/brute_force/gpu.go b/pkg/vectorindex/brute_force/gpu.go index 416c2a75d9a75..4c44be80ca0dc 100644 --- 
a/pkg/vectorindex/brute_force/gpu.go +++ b/pkg/vectorindex/brute_force/gpu.go @@ -17,171 +17,339 @@ package brute_force import ( - // "fmt" + "github.com/matrixorigin/matrixone/pkg/common/malloc" + "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/brute_force" ) -type GpuBruteForceIndex[T cuvs.TensorNumberType] struct { - Resource *cuvs.Resource // shared resource for read-only index - Dataset *cuvs.Tensor[T] - Index *brute_force.BruteForceIndex - Metric cuvs.Distance - Dimension uint - Count uint - ElementSize uint +type GpuAdhocBruteForceIndex[T cuvs.VectorType] struct { + dataset []T + dimension uint + count uint + metric metric.MetricType } -var _ cache.VectorIndexSearchIf = &GpuBruteForceIndex[float32]{} +var _ cache.VectorIndexSearchIf = &GpuAdhocBruteForceIndex[float32]{} -// cuvs library has bug. comment out the GPU version until cuvs fix the bug -func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, +func NewAdhocBruteForceIndex[T types.RealNumbers](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint, - nthread uint) (cache.VectorIndexSearchIf, error) { + elemsz uint) (cache.VectorIndexSearchIf, error) { + + // Threshold for switching between CPU and GPU for adhoc search. + // For small datasets, CPU (usearch) is much faster due to lower overhead. 
+ const cpuThreshold = 5000 + if len(dataset) < cpuThreshold { + return NewUsearchBruteForceIndex[T](dataset, dimension, m, elemsz) + } switch dset := any(dataset).(type) { - case [][]float64: - return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) case [][]float32: - return NewCpuBruteForceIndex[float32](dset, dimension, m, elemsz) - //return NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz, nthread) + return NewGpuAdhocBruteForceIndex[float32](dset, dimension, m, elemsz) + case [][]uint16: + // Convert [][]uint16 to [][]cuvs.Float16 to pass to NewGpuAdhocBruteForceIndex + f16dset := make([][]cuvs.Float16, len(dset)) + for i, v := range dset { + f16dset[i] = util.UnsafeSliceCast[cuvs.Float16](v) + } + return NewGpuAdhocBruteForceIndex[cuvs.Float16](f16dset, dimension, m, elemsz) default: - return nil, moerr.NewInternalErrorNoCtx("type not supported for BruteForceIndex") + return NewUsearchBruteForceIndex[T](dataset, dimension, m, elemsz) } +} + +func NewAdhocBruteForceIndexFlattened[T types.RealNumbers](dataset []T, + count uint, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + const cpuThreshold = 5000 + if count < cpuThreshold { + return NewUsearchBruteForceIndexFlattened[T](dataset, count, dimension, m, elemsz) + } + + switch dset := any(dataset).(type) { + case []float32: + return &GpuAdhocBruteForceIndex[float32]{ + dataset: dset, + dimension: dimension, + count: count, + metric: m, + }, nil + case []cuvs.Float16: + return &GpuAdhocBruteForceIndex[cuvs.Float16]{ + dataset: dset, + dimension: dimension, + count: count, + metric: m, + }, nil + default: + return NewUsearchBruteForceIndexFlattened[T](dataset, count, dimension, m, elemsz) + } } -func NewGpuBruteForceIndex[T cuvs.TensorNumberType](dataset [][]T, +func NewGpuAdhocBruteForceIndex[T cuvs.VectorType](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint, - nthread uint) (cache.VectorIndexSearchIf, error) { + elemsz uint) 
(cache.VectorIndexSearchIf, error) { - idx := &GpuBruteForceIndex[T]{} - resource, _ := cuvs.NewResource(nil) - idx.Resource = &resource - tensor, err := cuvs.NewTensor(dataset) - if err != nil { - return nil, err + if len(dataset) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - idx.Dataset = &tensor - idx.Metric = metric.MetricTypeToCuvsMetric[m] - idx.Dimension = dimension - idx.Count = uint(len(dataset)) - idx.ElementSize = elemsz - return idx, nil + dim := int(dimension) + reqSize := len(dataset) * dim + flattened := make([]T, reqSize) + + for i, v := range dataset { + copy(flattened[i*dim:(i+1)*dim], v) + } + return &GpuAdhocBruteForceIndex[T]{ + dataset: flattened, + dimension: dimension, + count: uint(len(dataset)), + metric: m, + }, nil } -func (idx *GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { - if _, err = idx.Dataset.ToDevice(idx.Resource); err != nil { - return err +func (idx *GpuAdhocBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { + return nil +} + +func (idx *GpuAdhocBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { + var flattenedQueries []T + var nQueries uint64 + + switch queries := _queries.(type) { + case []T: + flattenedQueries = queries + nQueries = uint64(len(queries) / int(idx.dimension)) + case [][]T: + if len(queries) == 0 { + return nil, nil, nil + } + dim := int(idx.dimension) + reqSize := len(queries) * dim + flattenedQueries = make([]T, reqSize) + for i, v := range queries { + copy(flattenedQueries[i*dim:(i+1)*dim], v) + } + nQueries = uint64(len(queries)) + default: + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } - idx.Index, err = brute_force.CreateIndex() - if err != nil { - return + if nQueries == 0 { + return nil, nil, nil } - err = brute_force.BuildIndex[T](*idx.Resource, idx.Dataset, idx.Metric, 0, idx.Index) + deviceID := 0 + neighbors, distances, 
err := cuvs.AdhocBruteForceSearch[T]( + idx.dataset, uint64(idx.count), uint32(idx.dimension), + flattenedQueries, nQueries, uint32(rt.Limit), + resolveCuvsDistance(idx.metric), deviceID, + ) if err != nil { - return + return nil, nil, err } - if err = idx.Resource.Sync(); err != nil { - return + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } + retkeys = neighbors return } -func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { - queriesvec, ok := _queries.([][]T) - if !ok { - return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") +func (idx *GpuAdhocBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) error { + return nil +} + +func (idx *GpuAdhocBruteForceIndex[T]) Destroy() { + idx.dataset = nil +} + +type GpuBruteForceIndex[T cuvs.VectorType] struct { + index *cuvs.GpuBruteForce[T] + dimension uint + count uint +} + +var _ cache.VectorIndexSearchIf = &GpuBruteForceIndex[float32]{} + +func resolveCuvsDistance(m metric.MetricType) cuvs.DistanceType { + switch m { + case metric.Metric_L2sqDistance: + return cuvs.L2Expanded + case metric.Metric_L2Distance: + return cuvs.L2Expanded + case metric.Metric_InnerProduct: + return cuvs.InnerProduct + case metric.Metric_CosineDistance: + return cuvs.CosineSimilarity + case metric.Metric_L1Distance: + return cuvs.L1 + default: + return cuvs.L2Expanded } +} - // local resource for concurrent search - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, nil, err +func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { + + switch dset := any(dataset).(type) { + case [][]float64: + return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) + case [][]float32: + return 
NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz, nthread) + case [][]uint16: + // Convert [][]uint16 to [][]cuvs.Float16 to pass to NewGpuBruteForceIndex + f16dset := make([][]cuvs.Float16, len(dset)) + for i, v := range dset { + f16dset[i] = util.UnsafeSliceCast[cuvs.Float16](v) + } + return NewGpuBruteForceIndex[cuvs.Float16](f16dset, dimension, m, elemsz, nthread) + default: + return nil, moerr.NewInternalErrorNoCtx("type not supported for BruteForceIndex") } - defer resource.Close() +} - queries, err := cuvs.NewTensor(queriesvec) - if err != nil { - return nil, nil, err +func NewGpuBruteForceIndex[T cuvs.VectorType](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { + + if len(dataset) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - defer queries.Close() - neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) - if err != nil { - return nil, nil, err + dim := int(dimension) + reqSize := len(dataset) * dim + var flattened []T + + var _t T + switch any(_t).(type) { + case float32: + allocator := malloc.NewCAllocator() + slice, deallocator, err := allocator.Allocate(uint64(reqSize*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer deallocator.Deallocate() + flattened = any(util.UnsafeSliceCast[float32](slice)).([]T) + case cuvs.Float16: + allocator := malloc.NewCAllocator() + slice, deallocator, err := allocator.Allocate(uint64(reqSize*2), malloc.NoClear) + if err != nil { + return nil, err + } + defer deallocator.Deallocate() + flattened = any(util.UnsafeSliceCast[cuvs.Float16](slice)).([]T) + default: + ds := make([]T, reqSize) + flattened = ds } - defer neighbors.Close() - distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) + for i, v := range dataset { + copy(flattened[i*dim:(i+1)*dim], v) + } + + deviceID := 0 // Default to 
device 0 + km, err := cuvs.NewGpuBruteForce[T](flattened, uint64(len(dataset)), uint32(dimension), resolveCuvsDistance(m), uint32(nthread), deviceID) if err != nil { - return nil, nil, err + return nil, err } - defer distances.Close() - if _, err = queries.ToDevice(&resource); err != nil { - return nil, nil, err + km.Start() + return &GpuBruteForceIndex[T]{ + index: km, + dimension: dimension, + count: uint(len(dataset)), + }, nil +} + +func (idx *GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { + if idx.index == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce not initialized") } + return idx.index.Build() +} - err = brute_force.SearchIndex(resource, *idx.Index, &queries, &neighbors, &distances) - if err != nil { - return nil, nil, err +func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { + queriesvec, ok := _queries.([][]T) + if !ok { + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } - if _, err = neighbors.ToHost(&resource); err != nil { - return nil, nil, err + if len(queriesvec) == 0 { + return nil, nil, nil } - if _, err = distances.ToHost(&resource); err != nil { - return nil, nil, err + dim := int(idx.dimension) + reqSize := len(queriesvec) * dim + + var flattenedQueries []T + var queryDeallocator malloc.Deallocator + + var _t T + switch any(_t).(type) { + case float32: + allocator := malloc.NewCAllocator() + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + flattenedQueries = any(f32Slice).([]T) + case cuvs.Float16: + allocator := malloc.NewCAllocator() + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*2, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f16Slice := 
util.UnsafeSliceCastToLength[cuvs.Float16](slice, reqSize) + flattenedQueries = any(f16Slice).([]T) + default: + // Not pooling other types, although T is likely only float32 for CUVS + ds := make([]T, reqSize) + flattenedQueries = ds } - if err = resource.Sync(); err != nil { - return nil, nil, err + for i, v := range queriesvec { + copy(flattenedQueries[i*dim:(i+1)*dim], v) } - neighborsSlice, err := neighbors.Slice() - if err != nil { - return nil, nil, err + if queryDeallocator != nil { + defer queryDeallocator.Deallocate() } - distancesSlice, err := distances.Slice() + neighbors, distances, err := idx.index.Search(flattenedQueries, uint64(len(queriesvec)), uint32(idx.dimension), uint32(rt.Limit)) if err != nil { return nil, nil, err } - //fmt.Printf("flattened %v\n", flatten) - retdistances = make([]float64, len(distancesSlice)*int(rt.Limit)) - for i := range distancesSlice { - for j, dist := range distancesSlice[i] { - retdistances[i*int(rt.Limit)+j] = float64(dist) - } + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } - keys := make([]int64, len(neighborsSlice)*int(rt.Limit)) - for i := range neighborsSlice { - for j, key := range neighborsSlice[i] { - keys[i*int(rt.Limit)+j] = int64(key) - } - } - retkeys = keys + retkeys = neighbors return } @@ -190,13 +358,7 @@ func (idx *GpuBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) er } func (idx *GpuBruteForceIndex[T]) Destroy() { - if idx.Dataset != nil { - idx.Dataset.Close() - } - if idx.Resource != nil { - idx.Resource.Close() - } - if idx.Index != nil { - idx.Index.Close() + if idx.index != nil { + idx.index.Destroy() } } diff --git a/pkg/vectorindex/brute_force/gpu_benchmark_test.go b/pkg/vectorindex/brute_force/gpu_benchmark_test.go new file mode 100644 index 0000000000000..9c6166b95dbed --- /dev/null +++ b/pkg/vectorindex/brute_force/gpu_benchmark_test.go @@ -0,0 +1,82 @@ +//go:build gpu + +// Copyright 2022 Matrix Origin +// 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" +) + +func BenchmarkGpuBruteForce(b *testing.B) { + benchmarkBruteForce(b, NewGpuBruteForceIndex[float32]) +} + +func BenchmarkCentroidSearchGpuBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, NewGpuBruteForceIndex[float32]) +} + +func BenchmarkGpuAdhocBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGpuAdhocBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchGpuAdhocBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGpuAdhocBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkGpuAdhocBruteForceSingle(b *testing.B) { + dsize := 10000 + dimension := uint(1024) + limit := uint(10) + elemsz := uint(4) // float32 + + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := make([][]float32, 1) + query[0] = make([]float32, dimension) + for j := 
range query[0] { + query[0][j] = rand.Float32() + } + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + idx, err := NewGpuAdhocBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + if err != nil { + b.Fatal(err) + } + _, _, err = idx.Search(nil, query, rt) + if err != nil { + b.Fatal(err) + } + idx.Destroy() + } +} diff --git a/pkg/vectorindex/brute_force/gpu_test.go b/pkg/vectorindex/brute_force/gpu_test.go index d9b024f5444cd..d1b341d797c21 100644 --- a/pkg/vectorindex/brute_force/gpu_test.go +++ b/pkg/vectorindex/brute_force/gpu_test.go @@ -17,7 +17,6 @@ package brute_force import ( - //"fmt" "math/rand/v2" "sync" "testing" @@ -35,22 +34,22 @@ func TestGpuBruteForce(t *testing.T) { dataset := [][]float32{{1, 2, 3}, {3, 4, 5}} query := [][]float32{{1, 2, 3}, {3, 4, 5}} dimension := uint(3) - ncpu := uint(1) + ncpu := uint(8) limit := uint(1) elemsz := uint(4) // float32 - idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() err = idx.Load(nil) require.NoError(t, err) - rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} var wg sync.WaitGroup - for n := 0; n < 4; n++ { + for n := 0; n < 8; n++ { wg.Add(1) go func() { @@ -66,7 +65,6 @@ func TestGpuBruteForce(t *testing.T) { require.Equal(t, key, int64(j)) require.Equal(t, distances[j], float64(0)) } - // fmt.Printf("keys %v, dist %v\n", keys, distances) } }() } @@ -81,7 +79,7 @@ func TestGpuBruteForceConcurrent(t *testing.T) { proc := testutil.NewProcessWithMPool(t, "", m) sqlproc := sqlexec.NewSqlProcess(proc) dimension := uint(128) - ncpu := uint(4) + ncpu := uint(8) limit := uint(3) elemsz := uint(4) // float32 @@ -96,7 +94,7 @@ func 
TestGpuBruteForceConcurrent(t *testing.T) { query := dataset - idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() @@ -105,13 +103,12 @@ func TestGpuBruteForceConcurrent(t *testing.T) { // limit 3 { - rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} anykeys, distances, err := idx.Search(sqlproc, query, rt) require.NoError(t, err) keys := anykeys.([]int64) - // fmt.Printf("keys %v, dist %v\n", keys, distances) require.Equal(t, int(rt.Limit)*len(query), len(keys)) for i := range query { offset := i * int(rt.Limit) @@ -122,13 +119,12 @@ func TestGpuBruteForceConcurrent(t *testing.T) { // limit 1 { - rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: 1} anykeys, distances, err := idx.Search(sqlproc, query, rt) require.NoError(t, err) keys := anykeys.([]int64) - // fmt.Printf("keys %v, dist %v\n", keys, distances) require.Equal(t, int(rt.Limit)*len(query), len(keys)) for i := range query { offset := i * int(rt.Limit) diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go index ed7eecfd58cf9..357a9bd89f24b 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go @@ -17,84 +17,48 @@ package device import ( - //"os" - "context" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/elkans" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" - cuvs "github.com/rapidsai/cuvs/go" - 
"github.com/rapidsai/cuvs/go/ivf_flat" ) -type GpuClusterer[T cuvs.TensorNumberType] struct { - indexParams *ivf_flat.IndexParams - nlist int - dim int - vectors [][]T +type GpuClusterer[T cuvs.VectorType] struct { + kmeans *cuvs.GpuKMeans[T] + nlist int + dim int + vectors []T } func (c *GpuClusterer[T]) InitCentroids(ctx context.Context) error { - return nil } func (c *GpuClusterer[T]) Cluster(ctx context.Context) (any, error) { - - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, err + if c.kmeans == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuKMeans not initialized") } - defer resource.Close() - dataset, err := cuvs.NewTensor(c.vectors) + nSamples := uint64(len(c.vectors) / c.dim) + _, _, err := c.kmeans.Fit(c.vectors, nSamples) if err != nil { return nil, err } - defer dataset.Close() - index, err := ivf_flat.CreateIndex(c.indexParams, &dataset) + centroids, err := c.kmeans.GetCentroids() if err != nil { return nil, err } - defer index.Close() - if _, err := dataset.ToDevice(&resource); err != nil { - return nil, err - } - - centers, err := cuvs.NewTensorOnDevice[T](&resource, []int64{int64(c.nlist), int64(c.dim)}) - if err != nil { - return nil, err - } - defer centers.Close() - - if err := ivf_flat.BuildIndex(resource, c.indexParams, &dataset, index); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - if err := ivf_flat.GetCenters(index, ¢ers); err != nil { - return nil, err - } - - if _, err := centers.ToHost(&resource); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - result, err := centers.Slice() - if err != nil { - return nil, err + // Reshape centroids back to [][]T + result := make([][]T, c.nlist) + for i := 0; i < c.nlist; i++ { + result[i] = make([]T, c.dim) + copy(result[i], centroids[i*c.dim:(i+1)*c.dim]) } return result, nil @@ -105,26 +69,26 @@ func (c *GpuClusterer[T]) SSE() (float64, error) { } func (c 
*GpuClusterer[T]) Close() error { - if c.indexParams != nil { - c.indexParams.Close() + if c.kmeans != nil { + return c.kmeans.Destroy() } return nil } -func resolveCuvsDistanceForDense(distance metric.MetricType) cuvs.Distance { +func resolveCuvsDistanceForDense(distance metric.MetricType) cuvs.DistanceType { switch distance { case metric.Metric_L2sqDistance: - return cuvs.DistanceL2 + return cuvs.L2Expanded case metric.Metric_L2Distance: - return cuvs.DistanceL2 + return cuvs.L2Expanded case metric.Metric_InnerProduct: - return cuvs.DistanceL2 + return cuvs.InnerProduct case metric.Metric_CosineDistance: - return cuvs.DistanceL2 + return cuvs.CosineSimilarity case metric.Metric_L1Distance: - return cuvs.DistanceL2 + return cuvs.L1 default: - return cuvs.DistanceL2 + return cuvs.L2Expanded } } @@ -136,27 +100,36 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, switch vecs := any(vectors).(type) { case [][]float32: - - c := &GpuClusterer[float32]{} - c.nlist = clusterCnt - if len(vectors) == 0 { + if len(vecs) == 0 { return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - c.vectors = vecs - c.dim = len(vecs[0]) - indexParams, err := ivf_flat.CreateIndexParams() + dim := len(vecs[0]) + // Flatten vectors for pkg/cuvs + flattened := make([]float32, len(vecs)*dim) + for i, v := range vecs { + copy(flattened[i*dim:(i+1)*dim], v) + } + + // cuVS K-Means is currently single-GPU focused in our wrapper + deviceID := 0 + nthread := uint32(1) + + km, err := cuvs.NewGpuKMeans[float32](uint32(clusterCnt), uint32(dim), resolveCuvsDistanceForDense(distanceType), maxIterations, deviceID, nthread) if err != nil { return nil, err } - indexParams.SetNLists(uint32(clusterCnt)) - indexParams.SetMetric(resolveCuvsDistanceForDense(distanceType)) - indexParams.SetKMeansNIters(uint32(maxIterations)) - indexParams.SetKMeansTrainsetFraction(1) // train all sample - c.indexParams = indexParams + km.Start() + + c := &GpuClusterer[float32]{ + kmeans: km, + nlist: 
clusterCnt, + dim: dim, + vectors: flattened, + } return c, nil + default: return elkans.NewKMeans(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType, spherical, nworker) - } } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go index 1132ef924c17b..72fe4108ca9c7 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go @@ -17,8 +17,8 @@ package device import ( - //"fmt" "context" + //"fmt" "math/rand/v2" "sync" "testing" @@ -33,7 +33,7 @@ import ( ) func TestGpu(t *testing.T) { - + ctx := context.Background() dim := 128 dsize := 1024 nlist := 128 @@ -48,7 +48,11 @@ func TestGpu(t *testing.T) { c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) require.NoError(t, err) - centers, err := c.Cluster(context.Background()) + defer c.Close() + + c.InitCentroids(ctx) + + centers, err := c.Cluster(ctx) require.NoError(t, err) _, ok := centers.([][]float32) @@ -63,6 +67,7 @@ func TestGpu(t *testing.T) { func TestIVFAndBruteForce(t *testing.T) { + ctx := context.Background() m := mpool.MustNewZero() proc := testutil.NewProcessWithMPool(t, "", m) sqlproc := sqlexec.NewSqlProcess(proc) @@ -83,8 +88,10 @@ func TestIVFAndBruteForce(t *testing.T) { c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) require.NoError(t, err) + defer c.Close() - centers, err := c.Cluster(context.Background()) + c.InitCentroids(ctx) + centers, err := c.Cluster(ctx) require.NoError(t, err) centroids, ok := centers.([][]float32) @@ -97,7 +104,7 @@ func TestIVFAndBruteForce(t *testing.T) { */ queries := vecs[:8192] - idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() @@ -116,21 +123,9 
@@ func TestIVFAndBruteForce(t *testing.T) { for i := 0; i < 1000; i++ { _, _, err := idx.Search(sqlproc, queries, rt) require.NoError(t, err) - /* - - keys_i64, ok := keys.([]int64) - require.Equal(t, ok, true) - - for j, key := range keys_i64 { - require.Equal(t, key, int64(j)) - require.Equal(t, distances[j], float64(0)) - } - */ - // fmt.Printf("keys %v, dist %v\n", keys, distances) } }() } wg.Wait() - } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go index 17d89be59a97a..8202874c783f0 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go @@ -17,248 +17,172 @@ package device import ( - //"fmt" + "fmt" "math/rand/v2" + "runtime" "sync" "testing" - //"os" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/stretchr/testify/require" - - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/brute_force" - "github.com/rapidsai/cuvs/go/ivf_flat" ) -func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.Distance, maxIterations int) ([][]float32, error) { - - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, err +func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.DistanceType, maxIterations int) ([][]float32, error) { + if len(vecs) == 0 { + return nil, fmt.Errorf("empty dataset") } - defer resource.Close() - indexParams, err := ivf_flat.CreateIndexParams() - if err != nil { - return nil, err + // Flatten vectors + flattened := make([]float32, len(vecs)*dim) + for i, v := range vecs { + copy(flattened[i*dim:(i+1)*dim], v) } - defer indexParams.Close() - - indexParams.SetNLists(uint32(clusterCnt)) - indexParams.SetMetric(distanceType) - indexParams.SetKMeansNIters(uint32(maxIterations)) - indexParams.SetKMeansTrainsetFraction(1) // train all sample - dataset, err := cuvs.NewTensor(vecs) + deviceID := 0 + nthread := uint32(1) + km, err := 
cuvs.NewGpuKMeans[float32](uint32(clusterCnt), uint32(dim), distanceType, maxIterations, deviceID, nthread) if err != nil { return nil, err } - defer dataset.Close() - - index, _ := ivf_flat.CreateIndex(indexParams, &dataset) - defer index.Close() - - if _, err := dataset.ToDevice(&resource); err != nil { - return nil, err - } + defer km.Destroy() + km.Start() - centers, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(clusterCnt), int64(dim)}) + _, _, err = km.Fit(flattened, uint64(len(vecs))) if err != nil { return nil, err } - if err := ivf_flat.BuildIndex(resource, indexParams, &dataset, index); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - if err := ivf_flat.GetCenters(index, ¢ers); err != nil { - return nil, err - } - - if _, err := centers.ToHost(&resource); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { + centroids, err := km.GetCentroids() + if err != nil { return nil, err } - result, err := centers.Slice() - if err != nil { - return nil, err + // Reshape centroids + result := make([][]float32, clusterCnt) + for i := 0; i < clusterCnt; i++ { + result[i] = make([]float32, dim) + copy(result[i], centroids[i*dim:(i+1)*dim]) } return result, nil - } -func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.Distance) (retkeys any, retdistances []float64, err error) { - //os.Stderr.WriteString(fmt.Sprintf("probe set %d\n", len(queriesvec))) - //os.Stderr.WriteString("brute force index search start\n") - - resource, err := cuvs.NewResource(nil) - if err != nil { - return +func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.DistanceType) (retkeys any, retdistances []float64, err error) { + if len(datasetvec) == 0 || len(queriesvec) == 0 { + return nil, nil, nil } - defer resource.Close() - dataset, err := cuvs.NewTensor(datasetvec) - if err != nil { - return + dim := len(datasetvec[0]) + 
flattenedDataset := make([]float32, len(datasetvec)*dim) + for i, v := range datasetvec { + copy(flattenedDataset[i*dim:(i+1)*dim], v) } - defer dataset.Close() - index, err := brute_force.CreateIndex() - if err != nil { - return + flattenedQueries := make([]float32, len(queriesvec)*dim) + for i, v := range queriesvec { + copy(flattenedQueries[i*dim:(i+1)*dim], v) } - defer index.Close() - queries, err := cuvs.NewTensor(queriesvec) + deviceID := 0 + nthread := uint32(1) + bf, err := cuvs.NewGpuBruteForce[float32](flattenedDataset, uint64(len(datasetvec)), uint32(dim), distanceType, nthread, deviceID) if err != nil { - return + return nil, nil, err } - defer queries.Close() + defer bf.Destroy() + bf.Start() - neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + err = bf.Build() if err != nil { - return + return nil, nil, err } - defer neighbors.Close() - distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + neighbors, distances, err := bf.Search(flattenedQueries, uint64(len(queriesvec)), uint32(dim), uint32(limit)) if err != nil { - return + return nil, nil, err } - defer distances.Close() - if _, err = dataset.ToDevice(&resource); err != nil { - return - } - - if err = resource.Sync(); err != nil { - return - } - - err = brute_force.BuildIndex(resource, &dataset, distanceType, 2.0, index) - if err != nil { - //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed %v\n", err)) - //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed centers %v\n", datasetvec)) - return + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } - if err = resource.Sync(); err != nil { - return - } - //os.Stderr.WriteString("built brute force index\n") + retkeys = neighbors + return +} - if _, err = queries.ToDevice(&resource); err != nil { - return - } +func TestIssueGpu(t *testing.T) { 
+ var wg sync.WaitGroup + wg.Add(1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + defer wg.Done() + + dimension := uint(128) + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } - //os.Stderr.WriteString("brute force index search Runing....\n") - err = brute_force.SearchIndex(resource, *index, &queries, &neighbors, &distances) - if err != nil { - return - } - //os.Stderr.WriteString("brute force index search finished Runing....\n") + _, err := getCenters(vecs, int(dimension), nlist, cuvs.L2Expanded, 10) + require.NoError(t, err) + }() + wg.Wait() +} - if _, err = neighbors.ToHost(&resource); err != nil { - return - } - //os.Stderr.WriteString("brute force index search neighbour to host done....\n") +func TestIssueIvfAndBruteForceForIssue(t *testing.T) { + var wg1 sync.WaitGroup + wg1.Add(1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + defer wg1.Done() + + dimension := uint(128) + limit := uint(1) + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } + queries := vecs[:8192] - if _, err = distances.ToHost(&resource); err != nil { - return - } - //os.Stderr.WriteString("brute force index search distances to host done....\n") + centers, err := getCenters(vecs, int(dimension), nlist, cuvs.L2Expanded, 10) + require.NoError(t, err) - if err = resource.Sync(); err != nil { - return - } + fmt.Println("centers DONE") - //os.Stderr.WriteString("brute force index search return result....\n") - neighborsSlice, err := neighbors.Slice() - if err != nil { - return - } + var wg sync.WaitGroup - distancesSlice, err := distances.Slice() - if err != nil { - return - } + for n := 0; n < 8; n++ { + wg.Add(1) + go func() { + defer wg.Done() - 
//fmt.Printf("flattened %v\n", flatten) - retdistances = make([]float64, len(distancesSlice)*int(limit)) - for i := range distancesSlice { - for j, dist := range distancesSlice[i] { - retdistances[i*int(limit)+j] = float64(dist) - } - } + runtime.LockOSThread() + defer runtime.UnlockOSThread() - keys := make([]int64, len(neighborsSlice)*int(limit)) - for i := range neighborsSlice { - for j, key := range neighborsSlice[i] { - keys[i*int(limit)+j] = int64(key) + for i := 0; i < 100; i++ { // Reduced iteration count for faster test run + _, _, err := Search(centers, queries, limit, cuvs.L2Expanded) + require.NoError(t, err) + } + }() } - } - retkeys = keys - //os.Stderr.WriteString("brute force index search RETURN NOW....\n") - return -} -func TestIvfAndBruteForceForIssue(t *testing.T) { - - dimension := uint(128) - limit := uint(1) - /* - ncpu := uint(1) - elemsz := uint(4) // float32 - */ - - dsize := 100000 - nlist := 128 - vecs := make([][]float32, dsize) - for i := range vecs { - vecs[i] = make([]float32, dimension) - for j := range vecs[i] { - vecs[i][j] = rand.Float32() - } - } - queries := vecs[:8192] - - centers, err := getCenters(vecs, int(dimension), nlist, cuvs.DistanceL2, 10) - require.NoError(t, err) - - var wg sync.WaitGroup - - for n := 0; n < 4; n++ { - - wg.Add(1) - go func() { - defer wg.Done() - for i := 0; i < 1000; i++ { - _, _, err := Search(centers, queries, limit, cuvs.DistanceL2) - require.NoError(t, err) - - /* - keys_i64, ok := keys.([]int64) - require.Equal(t, ok, true) - - for j, key := range keys_i64 { - require.Equal(t, key, int64(j)) - require.Equal(t, distances[j], float64(0)) - } - */ - // fmt.Printf("keys %v, dist %v\n", keys, distances) - } - }() - } - - wg.Wait() + wg.Wait() + }() + wg1.Wait() } diff --git a/pkg/vectorindex/metric/cpu.go b/pkg/vectorindex/metric/cpu.go new file mode 100644 index 0000000000000..716092f44c349 --- /dev/null +++ b/pkg/vectorindex/metric/cpu.go @@ -0,0 +1,30 @@ +//go:build !gpu + +// Copyright 2022 
Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "github.com/matrixorigin/matrixone/pkg/container/types" +) + +func PairWiseDistance[T types.RealNumbers]( + x [][]T, + y [][]T, + metric MetricType, + _ int, +) ([]float32, error) { + return GoPairWiseDistance(x, y, metric) +} diff --git a/pkg/vectorindex/metric/distance_func.go b/pkg/vectorindex/metric/distance_func.go index cf8ffae96fb22..370c5cc80b61d 100644 --- a/pkg/vectorindex/metric/distance_func.go +++ b/pkg/vectorindex/metric/distance_func.go @@ -522,3 +522,35 @@ func ResolveDistanceFn[T types.RealNumbers](metric MetricType) (DistanceFunction } return distanceFunction, nil } + +func GoPairWiseDistance[T types.RealNumbers]( + x [][]T, + y [][]T, + metric MetricType, +) ([]float32, error) { + distFn, err := ResolveDistanceFn[T](metric) + if err != nil { + return nil, err + } + + nX := len(x) + nY := len(y) + res := make([]float32, nX*nY) + for i := 0; i < nX; i++ { + for j := 0; j < nY; j++ { + d, err := distFn(x[i], y[j]) + if err != nil { + return nil, err + } + res[i*nY+j] = float32(d) + } + } + + if metric == Metric_L2Distance { + for i := range res { + res[i] = float32(math.Sqrt(float64(res[i]))) + } + } + + return res, nil +} diff --git a/pkg/vectorindex/metric/gpu.go b/pkg/vectorindex/metric/gpu.go index d0ad025c1f3f0..9d8365d92049f 100644 --- a/pkg/vectorindex/metric/gpu.go +++ b/pkg/vectorindex/metric/gpu.go @@ -17,15 +17,85 @@ package metric import 
( - cuvs "github.com/rapidsai/cuvs/go" + "math" + + "github.com/matrixorigin/matrixone/pkg/common/malloc" + "github.com/matrixorigin/matrixone/pkg/common/util" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" ) var ( - MetricTypeToCuvsMetric = map[MetricType]cuvs.Distance{ - Metric_L2sqDistance: cuvs.DistanceSQEuclidean, - Metric_L2Distance: cuvs.DistanceSQEuclidean, - Metric_InnerProduct: cuvs.DistanceInnerProduct, - Metric_CosineDistance: cuvs.DistanceCosine, - Metric_L1Distance: cuvs.DistanceL1, + MetricTypeToCuvsMetric = map[MetricType]cuvs.DistanceType{ + Metric_L2sqDistance: cuvs.L2Expanded, + Metric_L2Distance: cuvs.L2Expanded, + Metric_InnerProduct: cuvs.InnerProduct, + Metric_CosineDistance: cuvs.CosineExpanded, + Metric_L1Distance: cuvs.L1, } ) + +func PairWiseDistance[T types.RealNumbers]( + x [][]T, + y [][]T, + metric MetricType, + deviceID int, +) ([]float32, error) { + nX := len(x) + nY := len(y) + if nX == 0 || nY == 0 { + return nil, nil + } + dim := len(x[0]) + + cuvsMetric, ok := MetricTypeToCuvsMetric[metric] + if !ok || nX*nY*dim < 40000*1024 { + return GoPairWiseDistance(x, y, metric) + } + + // T must be float32 for cuvs.PairwiseDistance as per VectorType constraint + // RealNumbers only includes float32/float64. cuvs.VectorType includes float32, Float16, int8, uint8. + // For now we only support float32 on GPU via this interface if T is float32. 
+ var zero T + if any(zero).(interface{}) == any(float32(0)).(interface{}) { + allocator := malloc.NewCAllocator() + + xf32Slice, xDeallocator, err := allocator.Allocate(uint64(nX*dim*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer xDeallocator.Deallocate() + xf32 := util.UnsafeSliceCast[float32](xf32Slice) + for i, v := range x { + copy(xf32[i*dim:(i+1)*dim], any(v).([]float32)) + } + + yf32Slice, yDeallocator, err := allocator.Allocate(uint64(nY*dim*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer yDeallocator.Deallocate() + yf32 := util.UnsafeSliceCast[float32](yf32Slice) + for i, v := range y { + copy(yf32[i*dim:(i+1)*dim], any(v).([]float32)) + } + + res, err := cuvs.PairwiseDistance(xf32, uint64(nX), yf32, uint64(nY), uint32(dim), cuvsMetric, deviceID) + if err != nil { + return nil, err + } + + if metric == Metric_L2Distance { + for i := range res { + res[i] = float32(math.Sqrt(float64(res[i]))) + } + } else if metric == Metric_InnerProduct { + for i := range res { + res[i] = -res[i] + } + } + return res, nil + } + + return GoPairWiseDistance(x, y, metric) +} diff --git a/pkg/vectorindex/metric/pairwise_bench_test.go b/pkg/vectorindex/metric/pairwise_bench_test.go new file mode 100644 index 0000000000000..dd91c06810df5 --- /dev/null +++ b/pkg/vectorindex/metric/pairwise_bench_test.go @@ -0,0 +1,80 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metric + +import ( + "math/rand" + "testing" +) + +func BenchmarkPairWiseDistance(b *testing.B) { + nX, nY, dim := 100, 100, 128 + x := make([][]float32, nX) + y := make([][]float32, nY) + for i := range x { + x[i] = make([]float32, dim) + for j := range x[i] { + x[i][j] = rand.Float32() + } + } + for i := range y { + y[i] = make([]float32, dim) + for j := range y[i] { + y[i][j] = rand.Float32() + } + } + + b.Run("PairWiseDistance", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = PairWiseDistance(x, y, Metric_L2sqDistance, 0) + } + }) + + b.Run("GoPairWiseDistance", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = GoPairWiseDistance(x, y, Metric_L2sqDistance) + } + }) +} + +func BenchmarkPairWiseDistanceLarge(b *testing.B) { + nX, nY, dim := 10000, 5, 1024 + x := make([][]float32, nX) + y := make([][]float32, nY) + for i := range x { + x[i] = make([]float32, dim) + for j := range x[i] { + x[i][j] = rand.Float32() + } + } + for i := range y { + y[i] = make([]float32, dim) + for j := range y[i] { + y[i][j] = rand.Float32() + } + } + + b.Run("PairWiseDistance-Large", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = PairWiseDistance(x, y, Metric_L2sqDistance, 0) + } + }) + + b.Run("GoPairWiseDistance-Large", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = GoPairWiseDistance(x, y, Metric_L2sqDistance) + } + }) +} diff --git a/pkg/vectorindex/metric/pairwise_test.go b/pkg/vectorindex/metric/pairwise_test.go new file mode 100644 index 0000000000000..a9487beb46f84 --- /dev/null +++ b/pkg/vectorindex/metric/pairwise_test.go @@ -0,0 +1,87 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPairWiseDistance(t *testing.T) { + nX, nY := 3, 2 + x := [][]float32{ + {1, 0, 0, 0}, + {0, 1, 0, 0}, + {0, 0, 1, 0}, + } + y := [][]float32{ + {1, 0, 0, 0}, + {0, 1, 1, 0}, + } + + metrics := []MetricType{ + Metric_L2sqDistance, + Metric_L2Distance, + Metric_InnerProduct, + Metric_CosineDistance, + Metric_L1Distance, + } + + for _, m := range metrics { + t.Run(MetricTypeToDistFuncName[m], func(t *testing.T) { + dist, err := PairWiseDistance(x, y, m, 0) + require.NoError(t, err) + require.Equal(t, nX*nY, len(dist)) + + // Verify against direct calls + distFn, err := ResolveDistanceFn[float32](m) + require.NoError(t, err) + + for i := 0; i < nX; i++ { + for j := 0; j < nY; j++ { + expected, err := distFn(x[i], y[j]) + require.NoError(t, err) + + val := dist[i*nY+j] + if m == Metric_L2Distance { + require.InDelta(t, math.Sqrt(float64(expected)), float64(val), 1e-5) + } else { + require.InDelta(t, float64(expected), float64(val), 1e-5) + } + } + } + }) + } +} + +func TestGoPairWiseDistance(t *testing.T) { + x := [][]float64{{1, 0}, {0, 1}} + y := [][]float64{{1, 0}, {1, 1}} + + dist, err := GoPairWiseDistance(x, y, Metric_L2sqDistance) + require.NoError(t, err) + require.Equal(t, 4, len(dist)) + + // (1,0) to (1,0) -> 0 + require.InDelta(t, 0.0, float64(dist[0]), 1e-5) + // (1,0) to (1,1) -> 1 + require.InDelta(t, 1.0, float64(dist[1]), 1e-5) + // (0,1) to (1,0) -> 2 + require.InDelta(t, 2.0, float64(dist[2]), 1e-5) + // (0,1) to (1,1) -> 1 + 
require.InDelta(t, 1.0, float64(dist[3]), 1e-5) +} diff --git a/pkg/vm/engine/tae/blockio/read.go b/pkg/vm/engine/tae/blockio/read.go index a0152bc9db10b..2db8f3482697a 100644 --- a/pkg/vm/engine/tae/blockio/read.go +++ b/pkg/vm/engine/tae/blockio/read.go @@ -34,7 +34,6 @@ import ( "github.com/matrixorigin/matrixone/pkg/pb/plan" "github.com/matrixorigin/matrixone/pkg/pb/timestamp" v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2" - "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vm/engine" "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/containers" @@ -394,23 +393,34 @@ func HandleOrderByLimitOnIVFFlatIndex( return nullsBm.Contains(uint64(row)) }) - searchResults := make([]vectorindex.SearchResult, 0, len(selectRows)) - switch orderByLimit.Typ { case types.T_array_float32: - distFunc, err := metric.ResolveDistanceFn[float32](orderByLimit.MetricType) + rhs := types.BytesToArray[float32](orderByLimit.NumVec) + dim := len(rhs) + if dim == 0 { + return nil, nil, moerr.NewInternalError(ctx, "empty query vector") + } + nX := len(selectRows) + if nX == 0 { + return nil, nil, nil + } + + lhs := make([][]float32, nX) + for i, row := range selectRows { + lhs[i] = types.BytesToArray[float32](vecCol.GetBytesAt(int(row))) + } + + pairwiseDists, err := metric.PairWiseDistance(lhs, [][]float32{rhs}, orderByLimit.MetricType, 0) if err != nil { return nil, nil, err } - rhs := types.BytesToArray[float32](orderByLimit.NumVec) + resIdx := 0 + sels := make([]int64, nX) + dists := make([]float64, nX) - for _, row := range selectRows { - dist, err := distFunc(types.BytesToArray[float32](vecCol.GetBytesAt(int(row))), rhs) - if err != nil { - return nil, nil, err - } - dist64 := float64(dist) + for i, row := range selectRows { + dist64 := float64(pairwiseDists[i]) if orderByLimit.LowerBoundType == plan.BoundType_INCLUSIVE { if dist64 < orderByLimit.LowerBound { @@ -442,25 
+452,50 @@ func HandleOrderByLimitOnIVFFlatIndex( heap.Push(&orderByLimit.DistHeap, dist64) } - searchResults = append(searchResults, vectorindex.SearchResult{ - Id: row, - Distance: dist64, - }) + sels[resIdx] = row + dists[resIdx] = dist64 + resIdx++ } + sels = sels[:resIdx] + dists = dists[:resIdx] + + finalIdx := 0 + for i := 0; i < len(sels); i++ { + if dists[i] <= orderByLimit.DistHeap[0] { + sels[finalIdx] = sels[i] + dists[finalIdx] = dists[i] + finalIdx++ + } + } + return sels[:finalIdx], dists[:finalIdx], nil case types.T_array_float64: - distFunc, err := metric.ResolveDistanceFn[float64](orderByLimit.MetricType) + rhs := types.BytesToArray[float64](orderByLimit.NumVec) + dim := len(rhs) + if dim == 0 { + return nil, nil, moerr.NewInternalError(ctx, "empty query vector") + } + nX := len(selectRows) + if nX == 0 { + return nil, nil, nil + } + + lhs := make([][]float64, nX) + for i, row := range selectRows { + lhs[i] = types.BytesToArray[float64](vecCol.GetBytesAt(int(row))) + } + + pairwiseDists, err := metric.PairWiseDistance(lhs, [][]float64{rhs}, orderByLimit.MetricType, 0) if err != nil { return nil, nil, err } - rhs := types.BytesToArray[float64](orderByLimit.NumVec) + resIdx := 0 + sels := make([]int64, nX) + dists := make([]float64, nX) - for _, row := range selectRows { - dist64, err := distFunc(types.BytesToArray[float64](vecCol.GetBytesAt(int(row))), rhs) - if err != nil { - return nil, nil, err - } + for i, row := range selectRows { + dist64 := float64(pairwiseDists[i]) if orderByLimit.LowerBoundType == plan.BoundType_INCLUSIVE { if dist64 < orderByLimit.LowerBound { @@ -492,28 +527,26 @@ func HandleOrderByLimitOnIVFFlatIndex( heap.Push(&orderByLimit.DistHeap, dist64) } - searchResults = append(searchResults, vectorindex.SearchResult{ - Id: row, - Distance: dist64, - }) + sels[resIdx] = row + dists[resIdx] = dist64 + resIdx++ } + sels = sels[:resIdx] + dists = dists[:resIdx] + + finalIdx := 0 + for i := 0; i < len(sels); i++ { + if dists[i] <= 
orderByLimit.DistHeap[0] { + sels[finalIdx] = sels[i] + dists[finalIdx] = dists[i] + finalIdx++ + } + } + return sels[:finalIdx], dists[:finalIdx], nil default: return nil, nil, moerr.NewInternalError(ctx, fmt.Sprintf("only support float32/float64 type for topn: %s", orderByLimit.Typ)) } - - searchResults = slices.DeleteFunc(searchResults, func(res vectorindex.SearchResult) bool { - return res.Distance > orderByLimit.DistHeap[0] - }) - - sels := make([]int64, len(searchResults)) - dists := make([]float64, len(searchResults)) - for i, res := range searchResults { - sels[i] = res.Id - dists[i] = res.Distance - } - - return sels, dists, nil } func fillOutputBatchBySelectedRows( diff --git a/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result b/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result index 256e4dcea08e2..3e4b3fe0183a5 100644 --- a/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result +++ b/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result @@ -58,7 +58,7 @@ set @q_sql = concat( prepare p_q from @q_sql; execute p_q; ➤ __mo_index_pri_col[12,-1,0] ¦ d[8,54,0] 𝄀 -r_1 ¦ 0.64000004529953 +r_1 ¦ 0.800000011920929 deallocate prepare p_q; DROP TABLE IF EXISTS t1; DROP DATABASE vec_null_panic_db;