diff --git a/Makefile b/Makefile index 2aee18cc749f0..614ad7532cb74 100644 --- a/Makefile +++ b/Makefile @@ -178,6 +178,7 @@ pb: vendor-build generate-pb fmt VERSION_INFO :=-X '$(GO_MODULE)/pkg/version.GoVersion=$(GO_VERSION)' -X '$(GO_MODULE)/pkg/version.BranchName=$(BRANCH_NAME)' -X '$(GO_MODULE)/pkg/version.CommitID=$(LAST_COMMIT_ID)' -X '$(GO_MODULE)/pkg/version.BuildTime=$(BUILD_TIME)' -X '$(GO_MODULE)/pkg/version.Version=$(MO_VERSION)' THIRDPARTIES_INSTALL_DIR=$(ROOT_DIR)/thirdparties/install +CGO_DIR=$(ROOT_DIR)/cgo RACE_OPT := DEBUG_OPT := CGO_DEBUG_OPT := @@ -188,7 +189,7 @@ ifeq ($(MO_CL_CUDA),1) $(error CONDA_PREFIX env variable not found.) endif CUVS_CFLAGS := -I$(CONDA_PREFIX)/include - CUVS_LDFLAGS := -L$(CONDA_PREFIX)/envs/go/lib -lcuvs -lcuvs_c + CUVS_LDFLAGS := -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c CUDA_CFLAGS := -I/usr/local/cuda/include $(CUVS_CFLAGS) CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart $(CUVS_LDFLAGS) -lstdc++ TAGS += -tags "gpu" @@ -198,11 +199,11 @@ ifeq ($(TYPECHECK),1) TAGS += -tags "typecheck" endif -CGO_OPTS :=CGO_CFLAGS="-I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)" -GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)" +CGO_OPTS :=CGO_CFLAGS="-I$(CGO_DIR) -I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)" +GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)" ifeq ("$(UNAME_S)","darwin") -GOLDFLAGS:=-ldflags="-extldflags '-L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)" +GOLDFLAGS:=-ldflags="-extldflags '-L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)" endif ifeq ($(GOBUILD_OPT),) diff --git a/cgo/Makefile b/cgo/Makefile index 5678f16cf5814..d25f0400aab96 100644 --- a/cgo/Makefile +++ b/cgo/Makefile @@ -1,48 
+1,77 @@ DEBUG_OPT := UNAME_M := $(shell uname -m) +UNAME_S := $(shell uname -s) +CC ?= gcc # Yeah, fast math. We want it to be fast, for all xcall, # IEEE compliance should not be an issue. OPT_LV := -O3 -ffast-math -ftree-vectorize -funroll-loops -CFLAGS=-std=c99 -g ${OPT_LV} -Wall -Werror -I../thirdparties/install/include -OBJS=mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o -CUDA_OBJS= +COMMON_CFLAGS := -g $(OPT_LV) -Wall -Werror -fPIC -I../thirdparties/install/include +CFLAGS := -std=c99 $(COMMON_CFLAGS) +OBJS := mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o +CUDA_OBJS := +LDFLAGS := -L../thirdparties/install/lib -lusearch_c +TARGET_LIB := libmo.so + +ifeq ($(UNAME_S),Darwin) + TARGET_LIB := libmo.dylib + LDFLAGS += -dynamiclib -undefined dynamic_lookup -install_name @rpath/$(TARGET_LIB) +else + LDFLAGS += -shared +endif ifeq ($(UNAME_M), x86_64) - CFLAGS+= -march=haswell + CFLAGS += -march=haswell endif ifeq ($(MO_CL_CUDA),1) + ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) 
+ endif CC = /usr/local/cuda/bin/nvcc - CFLAGS = -ccbin g++ -m64 --shared -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 + CFLAGS = -ccbin g++ -m64 -Xcompiler -fPIC -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 CFLAGS += -I../thirdparties/install/include -DMO_CL_CUDA CUDA_OBJS += cuda/cuda.o - CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++ + # Explicitly include all needed libraries for shared library linking + CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lstdc++ + LDFLAGS += $(CUDA_LDFLAGS) endif -all: libmo.a +.PHONY: all clean test debug + +all: $(TARGET_LIB) libmo.a -libmo.a: $(OBJS) +$(TARGET_LIB): $(OBJS) ifeq ($(MO_CL_CUDA),1) - make -C cuda + $(MAKE) -C cuda + $(MAKE) -C cuvs + $(CC) $(LDFLAGS) -o $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o +else + $(CC) $(LDFLAGS) -o $@ $(OBJS) endif - ar -rcs libmo.a $(OBJS) $(CUDA_OBJS) -# -# $(CC) -o libmo.a $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS) +libmo.a: $(OBJS) +ifeq ($(MO_CL_CUDA),1) + $(MAKE) -C cuda + $(MAKE) -C cuvs + ar -rcs $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o +else + ar -rcs $@ $(OBJS) +endif +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ -test: libmo.a - make -C test +test: $(TARGET_LIB) + $(MAKE) -C test -.PHONY: debug debug: override OPT_LV := -O0 debug: override DEBUG_OPT := debug debug: all -.PHONY: clean clean: - rm -f *.o *.a *.so + rm -f *.o *.a *.so *.dylib ifeq ($(MO_CL_CUDA),1) - make -C cuda clean + $(MAKE) -C cuda clean + $(MAKE) -C cuvs clean endif diff --git a/cgo/README.md b/cgo/README.md index 5699ca4d292a2..ffb190c652bc3 100644 --- 
a/cgo/README.md +++ b/cgo/README.md @@ -1,25 +1,28 @@ MatrixOne CGO Kernel =============================== -This directory contains cgo source code for MO. Running -make should produce two files to be used by go code. -On go side, go will `include "mo.h"` and `-lmo`. +This directory contains CGO source code for MatrixOne. Running `make` produces the core library files used by Go code. + +On the Go side, the integration typically uses `mo.h` and links against the generated libraries: ``` mo.h -libmo.a +libmo.a / libmo.so ``` -`mo.h` should be pristine, meaning it only contains C function -prototype used by go. The only datatypes that can be passed -between go and c code are int and float/double and pointer. -Always explicitly specify int size such as `int32_t`, `uint64_t`. -Do not use `int`, `long`, etc. +`mo.h` should remain pristine, containing only C function prototypes for Go to consume. Data passed between Go and C should be limited to standard types (int, float, double, pointers). Always specify explicit integer sizes (e.g., `int32_t`, `uint64_t`) and avoid platform-dependent types like `int` or `long`. + +GPU Support (CUDA & cuVS) +------------------------- +The kernel supports GPU acceleration for certain operations (e.g., vector search) via NVIDIA CUDA and the cuVS library. + +- **Build Flag:** GPU support is enabled by setting `MO_CL_CUDA=1` during the build. +- **Environment:** Requires a working CUDA installation and a Conda environment with `cuvs` and `rmm` installed. +- **Source Code:** GPU-specific code resides in the `cuda/` and `cuvs/` subdirectories. Implementation Notes --------------------------------- +-------------------- -1. Pure C. -2. Use memory passed from go. Try not allocate memory in C code. -3. Only depends on libc and libm. -4. If 3rd party lib is absolutely necessary, import source code - and build from source. If 3rd party lib is C++, wrap it completely in C. +1. **Language:** Core kernel is Pure C. 
GPU extensions use C++ and CUDA, wrapped in a C-compatible interface. +2. **Memory Management:** Prefer using memory allocated and passed from Go. Minimize internal allocations in C/C++ code. +3. **Dependencies:** The base kernel depends only on `libc`, `libm`, and `libusearch`. GPU builds introduce dependencies on CUDA, `cuvs`, and `rmm`. +4. **Third-party Libraries:** If a third-party library is necessary, it should be built from source (see `thirdparties/` directory). C++ libraries must be fully wrapped in C before being exposed to Go. diff --git a/cgo/cuda/Makefile b/cgo/cuda/Makefile index a95913b014d58..eca30f9be2b98 100644 --- a/cgo/cuda/Makefile +++ b/cgo/cuda/Makefile @@ -395,7 +395,7 @@ $(FATBIN_FILE): mocl.cu $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $< cuda.o: cuda.cpp - $(EXEC) $(NVCC) $(INCLUDES) -O3 --shared $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) -O3 --shared -Xcompiler -fPIC $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< mytest.o: cuda.cpp $(FATBIN_FILE) $(EXEC) $(NVCC) $(INCLUDES) -DTEST_RUN -g -O0 $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< diff --git a/cgo/cuvs/Makefile b/cgo/cuvs/Makefile new file mode 100644 index 0000000000000..86ff4fd319723 --- /dev/null +++ b/cgo/cuvs/Makefile @@ -0,0 +1,75 @@ +# Makefile for MatrixOne cuVS C Wrapper + +UNAME_M := $(shell uname -m) +CUDA_PATH ?= /usr/local/cuda +NVCC := $(CUDA_PATH)/bin/nvcc + +ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) +endif + +# Compilation flags +# Added --extended-lambda because raft/core/copy.cuh requires it for some internal headers +NVCC_FLAGS := -std=c++17 -x cu -Xcompiler "-Wall -Wextra -fPIC -O2" --extended-lambda --expt-relaxed-constexpr +NVCC_FLAGS += -I. 
-I$(CUDA_PATH)/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs +NVCC_FLAGS += -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 + +# Linking flags +LDFLAGS := -shared +LDFLAGS += -L$(CUDA_PATH)/lib64/stubs -lcuda -L$(CUDA_PATH)/lib64 -lcudart +LDFLAGS += -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lrapids_logger +LDFLAGS += -Xlinker -lpthread -Xlinker -lm + +# Target library +TARGET := libmocuvs.so + +# Source files +SRCS := brute_force_c.cpp ivf_flat_c.cpp ivf_pq_c.cpp cagra_c.cpp kmeans_c.cpp helper.cpp adhoc_c.cpp distance_c.cpp +OBJS := $(SRCS:.cpp=.o) + +# Test configuration +TESTDIR := test +OBJDIR := obj +TEST_EXE := test_cuvs_worker +TEST_SRCS := $(TESTDIR)/main_test.cu \ + $(TESTDIR)/brute_force_test.cu \ + $(TESTDIR)/ivf_flat_test.cu \ + $(TESTDIR)/ivf_pq_test.cu \ + $(TESTDIR)/cagra_test.cu \ + $(TESTDIR)/kmeans_test.cu \ + $(TESTDIR)/quantize_test.cu \ + $(TESTDIR)/distance_test.cu \ + $(TESTDIR)/batching_test.cu + +TEST_OBJS := $(patsubst $(TESTDIR)/%.cu, $(OBJDIR)/test/%.o, $(TEST_SRCS)) + +.PHONY: all clean test + +all: $(OBJS) + +$(TARGET): $(OBJS) + @echo "Linking shared library $@" + $(NVCC) $(LDFLAGS) $^ -o $@ + +%.o: %.cpp + @echo "Compiling $< with NVCC" + $(NVCC) $(NVCC_FLAGS) -c $< -o $@ + +# Test targets +test: $(TEST_EXE) + @echo "Running tests..." + ./$(TEST_EXE) + +$(TEST_EXE): $(TEST_OBJS) helper.o + @echo "NVCCLD $@" + $(NVCC) $(subst -x cu,,$(NVCC_FLAGS)) $^ $(subst -shared,,$(LDFLAGS)) -o $@ + +$(OBJDIR)/test/%.o: $(TESTDIR)/%.cu + @mkdir -p $(@D) + @echo "NVCC $<" + $(NVCC) -std=c++17 -Xcompiler "-Wall -Wextra -fPIC -O2" --extended-lambda --expt-relaxed-constexpr -I. 
-I$(CUDA_PATH)/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 -c $< -o $@ + +clean: + @echo "Cleaning up..." + rm -f $(TARGET) *.o $(TEST_EXE) + rm -rf $(OBJDIR) diff --git a/cgo/cuvs/README.md b/cgo/cuvs/README.md new file mode 100644 index 0000000000000..7f0ac3b5c169a --- /dev/null +++ b/cgo/cuvs/README.md @@ -0,0 +1,119 @@ +✦ Architecture Design: cuVS-Accelerated Vector Indexing + + 1. Overview + The MatrixOne cuvs package provides a high-performance, GPU-accelerated vector search and clustering infrastructure. It acts as + a bridge between the Go-based database kernel and NVIDIA's cuVS and RAFT libraries. The architecture is designed to solve three + primary challenges: + 1. Impedance Mismatch: Reconciling Go’s concurrent goroutine scheduler with CUDA’s thread-specific resource requirements. + 2. Scalability: Supporting datasets that exceed single-GPU memory (Sharding) or high-concurrency search requirements + (Replicated). + 3. Efficiency: Minimizing CUDA kernel launch overhead via dynamic query batching. + + --- + + 2. Core Component: cuvs_worker_t + The cuvs_worker_t is the foundational engine of the architecture. + + Implementation Details: + * Persistent C++ Thread Pool: Instead of executing CUDA calls directly from CGO (which could be scheduled on any OS thread), + the worker maintains a dedicated pool of long-lived C++ threads. Each thread is pinned to a specific GPU device. + * Job Queuing: Requests from the Go layer are submitted as "Jobs" to an internal thread-safe queue. The worker returns a + std::future, allowing the Go layer to perform other tasks while the GPU processes the request. 
+ * Context Stability: By using dedicated threads, we ensure that CUDA context and RAFT resource handles remain stable and + cached, avoiding the expensive overhead of context creation or handle re-initialization. + + --- + + 3. Distribution Modes + The system supports three distinct modes to leverage multi-GPU hardware: + + A. Single GPU Mode + * Design: The index resides entirely on one device. + * Use Case: Small to medium datasets where latency is the priority. + + B. Replicated Mode (Scaling Throughput) + * Design: The full index is loaded onto multiple GPUs simultaneously. + * Mechanism: The cuvs_worker implements a load-balancing strategy (typically round-robin). Incoming queries are dispatched to + the next available GPU. + * Benefit: Linearly scales the Queries Per Second (QPS) by utilizing the compute power of all available GPUs. + + C. Sharded Mode (Scaling Capacity) + * Design: The dataset is partitioned into $N$ shards across $N$ GPUs. + * Mechanism: + 1. Broadcast: A search request is sent to all GPUs. + 2. Local Search: Each GPU searches its local shard independently using RAFT resources. + 3. Top-K Merge: The worker aggregates the results ($N \times K$ candidates) and performs a final merge-sort (often on the + CPU or via a fast GPU kernel) to return the global top-K. + * Benefit: Enables indexing of massive datasets (e.g., 100M+ vectors) that would not fit in the memory of a single GPU. + + --- + + 4. RAFT Resource Management + The package relies on RAFT (raft::resources) for all CUDA-accelerated operations. + + * Resource Caching: raft::resources objects (containing CUDA streams, cuBLAS handles, and workspace memory) are held within the + cuvs_worker threads. They are created once at Start() and reused for the lifetime of the index. + * Stream-Based Parallelism: Every index operation is executed asynchronously on a RAFT-managed CUDA stream. 
This allows the + system to overlap data transfers (Host-to-Device) with kernel execution, maximizing hardware utilization. + * Memory Layout: Leveraging raft::mdspan and raft::mdarray ensures that memory is handled in a layout-aware manner + (C-contiguous or Fortran-contiguous), matching the requirements of optimized BLAS and LAPACK kernels. + + --- + + 5. Dynamic Batching: The Throughput Key + In a database environment, queries often arrive one by one from different users. Processing these as individual CUDA kernels is + inefficient due to launch overhead and under-utilization of GPU warps. + + The Dynamic Batching Mechanism: + * Aggregation Window: When multiple search requests arrive at the worker within a small time window (microseconds), the worker + stalls briefly to aggregate them. + * Matrix Consolidation: Individual query vectors are packed into a single large query matrix. + * Consolidated Search: A single cuvs::neighbors::search call is made. GPUs are significantly more efficient at processing one + $64 \times D$ matrix than 64 individual $1 \times D$ vectors. + * Automatic Fulfillment: Once the batch search completes, the worker de-multiplexes the results and fulfills the specific + std::future for each individual Go request. + + --- + + 6. Automatic Type Quantization + To optimize memory footprint and search speed, the architecture features an automated quantization pipeline that converts + high-precision float32 vectors into compressed formats. + + * Transparent Conversion: The Go layer can consistently provide float32 data. The system automatically handles the conversion + to the index's internal type (half, int8, or uint8) directly on the GPU. + * FP16 (Half Precision): + * Mechanism: Uses raft::copy to perform bit-level conversion from 32-bit to 16-bit floating point. + * Benefit: 2x memory reduction with negligible impact on search recall. + * 8-Bit Integer (int8/uint8): + * Mechanism: Implements a learned Scalar Quantizer. 
The system samples the dataset to determine optimal min and max + clipping bounds. + * Training: Before building, the quantizer is "trained" on a subset of the data to ensure the 256 available integer levels + are mapped to the most significant range of the distribution. + * Benefit: 4x memory reduction, enabling massive datasets to reside in VRAM. + * GPU-Accelerated: All quantization kernels are executed on the device. This minimizes CPU usage and avoids the latency of + converting data before sending it over the PCIe bus. + + 7. Supported Index Types + The following indexes are fully integrated into the MatrixOne GPU architecture: + + + ┌──────────┬──────────────────────┬───────────────────────────────────────────────────────────────────────────────┐ + │ Index │ Algorithm │ Strengths │ + ├──────────┼──────────────────────┼───────────────────────────────────────────────────────────────────────────────┤ + │ CAGRA │ Hardware-accelerated │ Best-in-class search speed and high recall. Optimized for hardware graph │ + │ │ Graph │ traversal. │ + │ IVF-Flat │ Inverted File Index │ High accuracy and fast search. Excellent for general-purpose use. │ + │ IVF-PQ │ Product Quantization │ Extreme compression. Supports billions of vectors via lossy code compression. │ + │ Brute │ Exact Flat Search │ 100% recall. Ideal for small datasets or generating ground-truth for │ + │ Force │ │ benchmarks. │ + │ K-Means │ Clustering │ High-performance centroid calculation for data partitioning and unsupervised │ + │ │ │ learning. │ + └──────────┴──────────────────────┴───────────────────────────────────────────────────────────────────────────────┘ + + + 8. Operational Telemetry + All indexes implement a unified Info() method that returns a JSON-formatted string. This allows the database to programmatically + verify: + * Hardware Mapping: Which GPU devices are holding which shards. + * Data Layout: Element sizes, dimensions, and current vector counts. 
+ * Hyper-parameters: Internal tuning values like NLists, GraphDegree, or PQBits. diff --git a/cgo/cuvs/adhoc.hpp b/cgo/cuvs/adhoc.hpp new file mode 100644 index 0000000000000..310db80fbc336 --- /dev/null +++ b/cgo/cuvs/adhoc.hpp @@ -0,0 +1,127 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "helper.h" +#include +#include + +namespace matrixone { + +/** + * @brief Performs an ad-hoc brute-force search on GPU without using a worker thread. + * This is intended for scenarios where an index is not pre-built and the + * search needs to be executed immediately in the current thread context. + * + * @tparam T Data type of the vector elements (e.g., float, half). + * @param res RAFT resources handle. + * @param dataset Host pointer to the dataset vectors. + * @param n_rows Number of vectors in the dataset. + * @param dim Dimension of each vector. + * @param queries Host pointer to the query vectors. + * @param n_queries Number of query vectors. + * @param limit Number of nearest neighbors to find (k). + * @param metric Distance metric to use. + * @param neighbors Host pointer to store the resulting neighbor IDs (size: n_queries * limit). + * @param distances Host pointer to store the resulting distances (size: n_queries * limit). 
+ */ +template +void adhoc_brute_force_search(const raft::resources& res, + const T* dataset, + uint64_t n_rows, + uint32_t dim, + const T* queries, + uint64_t n_queries, + uint32_t limit, + cuvs::distance::DistanceType metric, + int64_t* neighbors, + float* distances) { + auto stream = raft::resource::get_cuda_stream(res); + + // Helper to align sizes to 256 bytes (CUDA default alignment) + auto align_size = [](size_t size) { + return (size + 255) & ~255; + }; + + // 1. Calculate total buffer sizes with alignment + size_t dataset_bytes = n_rows * dim * sizeof(T); + size_t queries_bytes = n_queries * dim * sizeof(T); + size_t neighbors_bytes = n_queries * limit * sizeof(int64_t); + size_t distances_bytes = n_queries * limit * sizeof(float); + + size_t dataset_alloc = align_size(dataset_bytes); + size_t queries_alloc = align_size(queries_bytes); + size_t neighbors_alloc = align_size(neighbors_bytes); + size_t total_bytes = dataset_alloc + queries_alloc + neighbors_alloc + distances_bytes; + + // Use a single allocation for all temporary buffers to reduce overhead + void* d_ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&d_ptr, total_bytes, stream)); + + char* d_dataset = static_cast(d_ptr); + char* d_queries = d_dataset + dataset_alloc; + char* d_neighbors = d_queries + queries_alloc; + char* d_distances = d_neighbors + neighbors_alloc; + + // 2. Async copies to Device + RAFT_CUDA_TRY(cudaMemcpyAsync(d_dataset, dataset, dataset_bytes, cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_queries, queries, queries_bytes, cudaMemcpyHostToDevice, stream)); + + // 3. 
Prepare Views (zero allocation) + auto dataset_view = raft::make_device_matrix_view(reinterpret_cast(d_dataset), n_rows, dim); + auto queries_view = raft::make_device_matrix_view(reinterpret_cast(d_queries), n_queries, dim); + auto neighbors_view = raft::make_device_matrix_view(reinterpret_cast(d_neighbors), n_queries, limit); + auto distances_view = raft::make_device_matrix_view(reinterpret_cast(d_distances), n_queries, limit); + + // 4. Build temporary index (view-based, very fast) + cuvs::neighbors::brute_force::index_params index_params; + index_params.metric = metric; + auto index = cuvs::neighbors::brute_force::build(res, index_params, raft::make_const_mdspan(dataset_view)); + + // 5. Execute Search + cuvs::neighbors::brute_force::search_params search_params; + cuvs::neighbors::brute_force::search(res, search_params, index, + raft::make_const_mdspan(queries_view), + neighbors_view, + distances_view); + + // 6. Async copy results back to host + RAFT_CUDA_TRY(cudaMemcpyAsync(neighbors, d_neighbors, neighbors_bytes, cudaMemcpyDeviceToHost, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(distances, d_distances, distances_bytes, cudaMemcpyDeviceToHost, stream)); + + // 7. Synchronize + raft::resource::sync_stream(res); + + // 8. Async free + RAFT_CUDA_TRY(cudaFreeAsync(d_ptr, stream)); + + // Handle invalid neighbor indices (consistent with existing brute_force.hpp) + for (size_t i = 0; i < n_queries * limit; ++i) { + if (neighbors[i] == std::numeric_limits::max() || + neighbors[i] == 4294967295LL || neighbors[i] < 0) { + neighbors[i] = -1; + } + } +} + +} // namespace matrixone diff --git a/cgo/cuvs/adhoc_c.cpp b/cgo/cuvs/adhoc_c.cpp new file mode 100644 index 0000000000000..a28099297f2ed --- /dev/null +++ b/cgo/cuvs/adhoc_c.cpp @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "adhoc_c.h" +#include "adhoc.hpp" +#include "helper.h" +#include +#include + +extern "C" { + +void gpu_adhoc_brute_force_search(const void* dataset, + uint64_t n_rows, + uint32_t dim, + const void* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + quantization_t qtype, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cudaSetDevice(device_id); + const auto& res = matrixone::get_raft_resources(); + auto m = static_cast(metric); + + if (qtype == Quantization_F32) { + matrixone::adhoc_brute_force_search(res, + static_cast(dataset), + n_rows, dim, + static_cast(queries), + n_queries, limit, m, + neighbors, distances); + } else if (qtype == Quantization_F16) { + matrixone::adhoc_brute_force_search(res, + static_cast(dataset), + n_rows, dim, + static_cast(queries), + n_queries, limit, m, + neighbors, distances); + } else { + throw std::runtime_error("Unsupported quantization type for adhoc search"); + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_adhoc_brute_force_search", e.what()); + } +} + +void gpu_adhoc_brute_force_search_float(const float* dataset, + uint64_t n_rows, + uint32_t dim, + const float* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg) { + gpu_adhoc_brute_force_search(dataset, n_rows, dim, queries, n_queries, limit, metric, Quantization_F32, device_id, neighbors, distances, errmsg); 
+} + +} // extern "C" diff --git a/cgo/cuvs/adhoc_c.h b/cgo/cuvs/adhoc_c.h new file mode 100644 index 0000000000000..43146bf4deed7 --- /dev/null +++ b/cgo/cuvs/adhoc_c.h @@ -0,0 +1,72 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ADHOC_C_H +#define ADHOC_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Performs an ad-hoc brute-force search on GPU. + * + * @param dataset Host pointer to the dataset vectors. + * @param n_rows Number of vectors in the dataset. + * @param dim Dimension of each vector. + * @param queries Host pointer to the query vectors. + * @param n_queries Number of query vectors. + * @param limit Number of nearest neighbors to find (k). + * @param metric Distance metric to use. + * @param qtype Quantization type (F32, F16). + * @param device_id GPU device ID to use. + * @param neighbors Host pointer to store the resulting neighbor IDs (size: n_queries * limit). + * @param distances Host pointer to store the resulting distances (size: n_queries * limit). + * @param errmsg Pointer to store error message if any. 
+ */ +void gpu_adhoc_brute_force_search(const void* dataset, + uint64_t n_rows, + uint32_t dim, + const void* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + quantization_t qtype, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg); + +void gpu_adhoc_brute_force_search_float(const float* dataset, + uint64_t n_rows, + uint32_t dim, + const float* queries, + uint64_t n_queries, + uint32_t limit, + distance_type_t metric, + int device_id, + int64_t* neighbors, + float* distances, + void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // ADHOC_C_H diff --git a/cgo/cuvs/blog.md b/cgo/cuvs/blog.md new file mode 100644 index 0000000000000..b49773aee0de3 --- /dev/null +++ b/cgo/cuvs/blog.md @@ -0,0 +1,52 @@ +# Scaling 50 Million Vectors on Modest Hardware: How MatrixOne Leverages cuVS for Extreme IVF-Flat Performance + +As AI applications proliferate, the demand for efficient vector search at scale has moved from a "nice-to-have" to a core database requirement. At MatrixOrigin, we recently faced a significant engineering challenge: **How do we build and search an IVF-Flat index of 50 million 1024-dimensional vectors on a server with only 16 cores and 64GB of RAM?** + +Traditional CPU-based approaches were hitting a wall. Building the index took days, and search latency was inconsistent. By integrating NVIDIA’s **cuVS** and **RAFT** libraries into our architecture, we transformed our performance profile. Here is the step-by-step story of how we did it. + +## The Challenge: The "Giant Index" Problem +Our target was an IVF-Flat index with approximately 8,000 clusters holding 50 million vectors. On a 16-core machine, we encountered three primary bottlenecks: +1. **Clustering Latency**: Standard K-Means was slow and often produced unbalanced clusters, leading to "hotspots" that slowed down search. +2. **Assignment Overhead**: Mapping 50 million vectors to their nearest centroids is computationally expensive. 
On CPUs, this task competed for resources with data loading and decompression, dragging the process out to 24 hours. +3. **The GPU "Single Query" Trap**: Databases typically process one query at a time. GPUs, however, only show their true strength when processing large batches. + +## Step 1: Solving Clustering with Balanced K-Means +Standard K-Means often results in some clusters having thousands of vectors while others have almost none. In an IVF index, this leads to unpredictable IO and search times. + +We initially implemented our own balanced K-Means, which brought the clustering time down from 30 minutes to 5 minutes. However, by switching to the **cuVS Balanced K-Means algorithm**, we utilized GPU parallelism to its fullest. +* **Result**: Clustering time dropped from **5 minutes to just 5 seconds**. + +## Step 2: Offloading Assignment to Brute-Force GPU Kernels +Once the 8,000 centroids are defined, every one of the 50 million vectors must be assigned to its closest cluster. Doing this on a 16-core CPU is a nightmare of cache misses and thread contention. + +By using the **cuVS Brute-Force index** to "offload" this distance computation to the GPU, we eliminated the CPU bottleneck entirely. +* **Result**: The assignment phase dropped from **24 hours to 30 minutes**. + +## Step 3: The Architecture—`cuvs_worker_t` and Dynamic Batching +To solve the "Single Query" problem, we designed a sophisticated bridge between Go and CUDA: the `cuvs_worker_t`. + +### Dynamic Batching: The Secret Sauce +Instead of launching a new CUDA kernel for every incoming request, our worker implements **Dynamic Batching**. It holds incoming queries for a tiny microsecond window, consolidates them into a single matrix, and executes one large GPU search. +* This maximizes warp utilization and reduces kernel launch overhead. +* **Performance Gain**: Provides a **5x-10x throughput boost** in high-concurrency environments. 
+ +### RAFT Resource Management +We leverage the **RAFT** library to manage long-lived `raft::resources`. By caching CUDA streams and handles within persistent C++ threads, we ensure that our Go-based kernel can interact with the GPU with near-zero resource initialization overhead. + +## Step 4: Staying Within 64GB with Auto-Quantization +50 million 1024D vectors in `float32` require roughly 200GB of space—far exceeding our 64GB RAM limit. To solve this, we implemented **Automatic Type Quantization** directly on the GPU. +* **FP16 (Half Precision)**: Reduces memory by 2x with almost zero recall loss. +* **8-Bit Integer (int8/uint8)**: Uses a learned Scalar Quantizer to compress vectors by 4x. +* Because conversion happens on the GPU, we avoid taxing the CPU and minimize PCIe bus traffic. + +## Summary of Supported Indexes +Our architecture now supports a suite of high-performance indexes: +* **CAGRA**: A hardware-accelerated graph index for state-of-the-art search speed. +* **IVF-Flat**: The workhorse for high-accuracy general-purpose search. +* **IVF-PQ**: For extreme compression of billion-scale datasets. +* **K-Means**: For high-speed data partitioning. + +## Conclusion +By shifting the heavy lifting of clustering, assignment, and quantization to the GPU through cuVS, MatrixOne can now handle massive vector datasets on surprisingly modest hardware. What once took a full day now takes less than an hour, with search latencies that remain low even under heavy load. + +The integration of `cuvs_worker_t` and dynamic batching ensures that we don't just have a "fast index," but a **production-ready database engine** capable of scaling with the needs of modern AI. 
diff --git a/cgo/cuvs/brute_force.hpp b/cgo/cuvs/brute_force.hpp new file mode 100644 index 0000000000000..25b3178be6363 --- /dev/null +++ b/cgo/cuvs/brute_force.hpp @@ -0,0 +1,326 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include // For std::copy +#include // For simulation debug logs +#include +#include // For std::iota +#include // For std::runtime_error +#include +#include +#include +#include // For std::promise and std::future +#include // For std::numeric_limits +#include // For std::shared_mutex + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include // For raft::device_matrix +#include // Required for device_matrix_view +#include // For raft::host_matrix +#include // Core resource handle +#include +#include // For raft::copy with type conversion + + +// cuVS includes +#include // cuVS distance API +#include +#include "quantize.hpp" +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Brute-force nearest neighbor search on GPU. + * @tparam T Data type of the vector elements (e.g., float, half). 
+ */ +template +class gpu_brute_force_t : public gpu_index_base_t { +public: + std::unique_ptr> index; + + ~gpu_brute_force_t() override { + this->destroy(); + } + + /** + * @brief Constructor for brute-force search. + */ + gpu_brute_force_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, cuvs::distance::DistanceType m, + uint32_t nthread, int device_id = 0) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->devices_ = {device_id}; + this->current_offset_ = static_cast(count_vectors); + + this->worker = std::make_unique(nthread, this->devices_); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + /** + * @brief Constructor for an empty index (chunked addition support). + */ + gpu_brute_force_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + uint32_t nthread, int device_id = 0) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->devices_ = {device_id}; + this->current_offset_ = 0; + + this->worker = std::make_unique(nthread, this->devices_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + /** + * @brief Starts the worker and initializes resources. + */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the dataset to the GPU and builds the index. 
+ */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->count == 0) { + index = nullptr; + this->is_loaded_ = true; + return; + } + + if (this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + this->is_loaded_ = true; + // Clear host dataset after building to save memory + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + if (this->flattened_host_dataset.empty()) { + index = nullptr; + return; + } + + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(this->count), static_cast(this->dimension))); + + this->dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::brute_force::index_params index_params; + index_params.metric = this->metric; + + index = std::make_unique>( + cuvs::neighbors::brute_force::build(*res, index_params, raft::make_const_mdspan(dataset_device->view()))); + + raft::resource::sync_stream(*res); + } + + /** + * @brief Search result containing neighbor IDs and distances. 
+ */ + struct search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors + }; + + /** + * @brief Performs brute-force search for given queries. + */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || !index) return search_result_t{}; + + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, queries_data](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::brute_force::search_params search_params; + cuvs::neighbors::brute_force::search(*res, search_params, *index, + raft::make_const_mdspan(queries_device.view()), neighbors_device.view(), distances_device.view()); + + search_result_t s_res; + s_res.neighbors.resize(num_queries * limit); + s_res.distances.resize(num_queries * limit); + + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.neighbors.data(), neighbors_device.data_handle(), + s_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.distances.data(), distances_device.data_handle(), + s_res.distances.size() * sizeof(float), 
cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < s_res.neighbors.size(); ++i) { + if (s_res.neighbors[i] == std::numeric_limits::max() || + s_res.neighbors[i] == 4294967295LL || s_res.neighbors[i] < 0) { + s_res.neighbors[i] = -1; + } + } + return s_res; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Performs brute-force search for given float32 queries, with on-the-fly conversion if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || !index) return search_result_t{}; + + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, queries_data](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::brute_force::search_params search_params; + 
cuvs::neighbors::brute_force::search(*res, search_params, *index, + raft::make_const_mdspan(queries_device_target.view()), neighbors_device.view(), distances_device.view()); + + search_result_t s_res; + s_res.neighbors.resize(num_queries * limit); + s_res.distances.resize(num_queries * limit); + + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.neighbors.data(), neighbors_device.data_handle(), + s_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(s_res.distances.data(), distances_device.data_handle(), + s_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < s_res.neighbors.size(); ++i) { + if (s_res.neighbors[i] == std::numeric_limits::max() || + s_res.neighbors[i] == 4294967295LL || s_res.neighbors[i] < 0) { + s_res.neighbors[i] = -1; + } + } + return s_res; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"BruteForce\", \"brute_force\": {"; + if (index) { + json += "\"size\": " + std::to_string(index->size()); + } else { + json += "\"size\": 0, \"built\": false"; + } + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/brute_force_c.cpp b/cgo/cuvs/brute_force_c.cpp new file mode 100644 index 0000000000000..f880115b10b2e --- /dev/null +++ b/cgo/cuvs/brute_force_c.cpp @@ -0,0 +1,274 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "brute_force_c.h" +#include "brute_force.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_brute_force_any_t { + + quantization_t qtype; + void* ptr; + + gpu_brute_force_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_brute_force_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_brute_force_c gpu_brute_force_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* index_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + index_ptr = new matrixone::gpu_brute_force_t(static_cast(dataset_data), count_vectors, dimension, metric, nthread, device_id); + break; + case Quantization_F16: + index_ptr = new matrixone::gpu_brute_force_t(static_cast(dataset_data), count_vectors, dimension, metric, nthread, device_id); + break; + default: + throw std::runtime_error("Unsupported quantization type for brute force (only f32 and f16 supported)"); + } + return static_cast(new gpu_brute_force_any_t(qtype, index_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_new", e.what()); + return nullptr; + } +} + +gpu_brute_force_c 
gpu_brute_force_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* index_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + index_ptr = new matrixone::gpu_brute_force_t(total_count, dimension, metric, nthread, device_id); + break; + case Quantization_F16: + index_ptr = new matrixone::gpu_brute_force_t(total_count, dimension, metric, nthread, device_id); + break; + default: + throw std::runtime_error("Unsupported quantization type for brute force (only f32 and f16 supported)"); + } + return static_cast(new gpu_brute_force_any_t(qtype, index_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_new_empty", e.what()); + return nullptr; + } +} + +void gpu_brute_force_start(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_start", e.what()); + } +} + +void gpu_brute_force_build(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_build", e.what()); + } +} + +void gpu_brute_force_add_chunk(gpu_brute_force_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) 
*(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_add_chunk", e.what()); + } +} + +void gpu_brute_force_add_chunk_float(gpu_brute_force_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_add_chunk_float", e.what()); + } +} + +gpu_brute_force_search_result_c gpu_brute_force_search(gpu_brute_force_c index_c, const void* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + void* result_ptr = nullptr; + switch (any->qtype) { + case Quantization_F32: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + case Quantization_F16: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + default: break; + } + return static_cast(result_ptr); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_search", e.what()); + return 
nullptr; + } +} + +gpu_brute_force_search_result_c gpu_brute_force_search_float(gpu_brute_force_c index_c, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + void* result_ptr = nullptr; + switch (any->qtype) { + case Quantization_F32: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + case Quantization_F16: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + default: break; + } + return static_cast(result_ptr); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_search_float", e.what()); + return nullptr; + } +} + +void gpu_brute_force_get_results(gpu_brute_force_search_result_c result_c, uint64_t num_queries, uint32_t limit, int64_t* neighbors, float* distances) { + if (!result_c) return; + auto* search_result = static_cast::search_result_t*>(result_c); + + size_t total = num_queries * limit; + if (search_result->neighbors.size() >= total) { + std::copy(search_result->neighbors.begin(), search_result->neighbors.begin() + total, neighbors); + } else { + std::fill(neighbors, neighbors + total, -1); + } + + if (search_result->distances.size() >= total) { + std::copy(search_result->distances.begin(), search_result->distances.begin() + total, distances); + } else { + std::fill(distances, distances + total, std::numeric_limits::infinity()); + } +} + +void gpu_brute_force_free_search_result(gpu_brute_force_search_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +uint32_t gpu_brute_force_cap(gpu_brute_force_c index_c) { + if (!index_c) return 0; + auto* 
any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_brute_force_len(gpu_brute_force_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_brute_force_info(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_info", e.what()); + return nullptr; + } +} + +void gpu_brute_force_destroy(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_destroy", e.what()); + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_brute_force_t; +template class gpu_brute_force_t; +} diff --git a/cgo/cuvs/brute_force_c.h b/cgo/cuvs/brute_force_c.h new file mode 100644 index 0000000000000..3c28e47e2bdfd --- /dev/null +++ b/cgo/cuvs/brute_force_c.h @@ -0,0 +1,78 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef BRUTE_FORCE_C_H +#define BRUTE_FORCE_C_H + +#include "helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_brute_force_t object +typedef void* gpu_brute_force_c; + +// Opaque pointer to the C++ search result object +typedef void* gpu_brute_force_search_result_c; + +// Constructor for gpu_brute_force_t +gpu_brute_force_c gpu_brute_force_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_brute_force_c gpu_brute_force_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg); + +// Starts the worker and initializes resources +void gpu_brute_force_start(gpu_brute_force_c index_c, void* errmsg); + +// Builds the index (loads the dataset to the GPU) +void gpu_brute_force_build(gpu_brute_force_c index_c, void* errmsg); + +// Add chunk of data (same type as index quantization) +void gpu_brute_force_add_chunk(gpu_brute_force_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly conversion if needed) +void gpu_brute_force_add_chunk_float(gpu_brute_force_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// Performs a search operation +gpu_brute_force_search_result_c gpu_brute_force_search(gpu_brute_force_c index_c, const void* queries_data, uint64_t 
num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg); + +// Performs a search operation with float32 queries +gpu_brute_force_search_result_c gpu_brute_force_search_float(gpu_brute_force_c index_c, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg); + +// Retrieves the results from a search operation +void gpu_brute_force_get_results(gpu_brute_force_search_result_c result_c, uint64_t num_queries, uint32_t limit, int64_t* neighbors, float* distances); + +// Frees the memory for a gpu_brute_force_search_result_c object +void gpu_brute_force_free_search_result(gpu_brute_force_search_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_brute_force_cap(gpu_brute_force_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_brute_force_len(gpu_brute_force_c index_c); + +// Returns info about the index as a JSON string +char* gpu_brute_force_info(gpu_brute_force_c index_c, void* errmsg); + +// Destroys the gpu_brute_force_t object and frees associated resources +void gpu_brute_force_destroy(gpu_brute_force_c index_c, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // BRUTE_FORCE_C_H diff --git a/cgo/cuvs/cagra.hpp b/cgo/cuvs/cagra.hpp new file mode 100644 index 0000000000000..c5dcd3a0e8db2 --- /dev/null +++ b/cgo/cuvs/cagra.hpp @@ -0,0 +1,754 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include + +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Search result containing neighbor IDs and distances. + * Common for all CAGRA instantiations. + */ +struct cagra_search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors +}; + +/** + * @brief gpu_cagra_t implements a CAGRA index that can run on a single GPU or sharded across multiple GPUs. + * It automatically chooses between single-GPU and multi-GPU (SNMG) cuVS APIs based on the RAFT handle resources. 
+ */ +template +class gpu_cagra_t : public gpu_index_base_t { +public: + using cagra_index = cuvs::neighbors::cagra::index; + using mg_index = cuvs::neighbors::mg_index; + using search_result_t = cagra_search_result_t; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + ~gpu_cagra_t() override { + this->destroy(); + } + + // Unified Constructor for building from dataset + gpu_cagra_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const cagra_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = static_cast(count_vectors); + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + // Constructor for chunked input (pre-allocates) + gpu_cagra_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + const cagra_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); 
+ } + + // Unified Constructor for loading from file + gpu_cagra_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const cagra_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->filename_ = filename; + this->dimension = dimension; + this->metric = m; + this->count = 0; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + } + + // Private constructor for creating from an existing cuVS index (used by merge) + gpu_cagra_t(std::unique_ptr idx, + uint32_t dim, cuvs::distance::DistanceType m, uint32_t nthread, const std::vector& devices) + : index_(std::move(idx)) { + + this->metric = m; + this->dimension = dim; + this->devices_ = devices; + + // Merge result is currently a single-GPU index. + this->worker = std::make_unique(nthread, this->devices_, false); + + this->count = static_cast(index_->size()); + this->build_params.graph_degree = static_cast(index_->graph_degree()); + this->build_params.intermediate_graph_degree = this->build_params.graph_degree * 2; // Best guess + this->dist_mode = DistributionMode_SINGLE_GPU; + this->current_offset_ = this->count; + this->is_loaded_ = true; + } + + /** + * @brief Starts the worker and initializes resources. + */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the index from file or builds it from the dataset. 
+ */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->filename_.empty() && !index_ && this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + + this->is_loaded_ = true; + // Clear host dataset after building to save memory + if (this->filename_.empty()) { + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!this->filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::cagra::deserialize(*res, this->filename_)); + this->count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) this->count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + this->build_params.graph_degree = static_cast(mg_index_->ann_interfaces_[0].index_.value().graph_degree()); + } + } else { + index_ = std::make_unique(*res); + cuvs::neighbors::cagra::deserialize(*res, this->filename_, index_.get()); + this->count = static_cast(index_->size()); + this->build_params.graph_degree = static_cast(index_->graph_degree()); + } + raft::resource::sync_stream(*res); + } else if (!this->flattened_host_dataset.empty()) { + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + 
this->flattened_host_dataset.data(), (int64_t)this->count, (int64_t)this->dimension); + + cuvs::neighbors::cagra::index_params index_params; + index_params.metric = this->metric; + index_params.intermediate_graph_degree = this->build_params.intermediate_graph_degree; + index_params.graph_degree = this->build_params.graph_degree; + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (this->dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::cagra::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(this->count), static_cast(this->dimension))); + + this->dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::cagra::index_params index_params; + index_params.metric = this->metric; + index_params.intermediate_graph_degree = this->build_params.intermediate_graph_degree; + index_params.graph_degree = this->build_params.graph_degree; + index_params.attach_dataset_on_build = this->build_params.attach_dataset_on_build; + + index_ = std::make_unique( + cuvs::neighbors::cagra::build(*res, index_params, raft::make_const_mdspan(dataset_device->view()))); + } + raft::resource::sync_stream(*res); + } + } + + /** + * @brief Extends the existing index with additional vectors. + * @param additional_data Pointer to additional vectors on host. + * @param num_vectors Number of vectors to add. 
+ */ + void extend(const T* additional_data, uint64_t num_vectors) { + if (!this->is_loaded_ || !index_) { + uint64_t old_size = this->flattened_host_dataset.size(); + this->flattened_host_dataset.resize(old_size + num_vectors * this->dimension); + std::copy(additional_data, additional_data + num_vectors * this->dimension, this->flattened_host_dataset.begin() + old_size); + this->count += static_cast(num_vectors); + this->current_offset_ += static_cast(num_vectors); + return; + } + + if constexpr (std::is_same_v) { + throw std::runtime_error("CAGRA single-GPU extend is not supported for float16 (half) by cuVS."); + } else { + if (num_vectors == 0) return; + + std::unique_lock lock(this->mutex_); + + uint64_t job_id = this->worker->submit_main( + [&, additional_data, num_vectors](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + auto additional_dataset_device = raft::make_device_matrix( + *res, static_cast(num_vectors), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(additional_dataset_device.data_handle(), additional_data, + num_vectors * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::cagra::extend_params params; + cuvs::neighbors::cagra::extend(*res, params, raft::make_const_mdspan(additional_dataset_device.view()), *index_); + + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + + this->count = static_cast(index_->size()); + this->current_offset_ = this->count; + } + } + + /** + * @brief Merges multiple single-GPU CAGRA indices into a single index. + * @param indices Vector of pointers to indices to merge. + * @param nthread Number of worker threads for the merged index. + * @param devices GPU devices to use for the merged index. + * @return A new merged CAGRA index. 
+ */ + static std::unique_ptr> merge(const std::vector*>& indices, uint32_t nthread, const std::vector& devices) { + if (indices.empty()) throw std::invalid_argument("indices empty"); + uint32_t dim = indices[0]->dimension; + cuvs::distance::DistanceType m = indices[0]->metric; + + cuvs_worker_t transient_worker(1, devices, false); + transient_worker.start(); + + uint64_t job_id = transient_worker.submit_main( + [&indices](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + std::vector cagra_indices; + for (auto* idx : indices) { + if (!idx->is_loaded_ || !idx->index_) { + throw std::runtime_error("One of the indices to merge is not loaded or is a multi-GPU index (merge only supports single-GPU indices)."); + } + cagra_indices.push_back(idx->index_.get()); + } + + cuvs::neighbors::cagra::index_params index_params; + auto merged = cuvs::neighbors::cagra::merge(*res, index_params, cagra_indices); + raft::resource::sync_stream(*res); + return new cagra_index(std::move(merged)); + } + ); + + auto result = transient_worker.wait(job_id).get(); + if (result.error) { + transient_worker.stop(); + std::rethrow_exception(result.error); + } + + auto* merged_idx_ptr = std::any_cast(result.result); + std::unique_ptr merged_idx(merged_idx_ptr); + transient_worker.stop(); + + auto new_idx = std::make_unique>( + std::move(merged_idx), + dim, m, nthread, devices + ); + new_idx->is_loaded_ = true; + return new_idx; + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. 
+ */ + void save(const std::string& filename) { + if (!this->is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::cagra::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::cagra::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Performs CAGRA search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp CAGRA search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_internal(handle, queries_data, num_queries, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation + */ + search_result_t search_batch_internal(const T* queries_data, uint64_t num_queries, uint32_t limit, const cagra_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const T* data; + uint64_t n; + }; + + std::string batch_key = "cagra_s_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.itopk_size); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto 
results = this->search_internal(handle, aggregated_queries.data(), total_queries, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search implementation (no worker submission) + */ + search_result_t search_internal(raft_handle_wrapper_t& handle, const T* queries_data, uint64_t num_queries, uint32_t limit, const cagra_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::cagra::search_params search_params; + search_params.itopk_size = sp.itopk_size; + search_params.search_width = sp.search_width; + + const cagra_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, 
(int64_t)num_queries, (int64_t)this->dimension); + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::cagra::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else if (local_index) { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::cagra::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max()) { + search_res.neighbors[i] = static_cast(-1); + } + } + return 
search_res; + } + + /** + * @brief Performs CAGRA search for given float32 queries, with on-the-fly quantization if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit, sp); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_float_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_float_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation for float32 queries + */ + search_result_t search_float_batch_internal(const float* queries_data, uint64_t num_queries, uint32_t limit, const cagra_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const float* data; + uint64_t n; + }; + + std::string batch_key = "cagra_sf_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.itopk_size); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += 
std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto results = this->search_float_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search_float implementation (no worker submission) + */ + search_result_t search_float_internal(raft_handle_wrapper_t& handle, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. 
Quantize/Convert float queries to T on device + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, queries_device_float.view(), queries_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + } + + // 2. Perform search + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::cagra::search_params search_params; + search_params.itopk_size = sp.itopk_size; + search_params.search_width = sp.search_width; + + const cagra_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_target = raft::make_host_matrix(num_queries, this->dimension); + raft::copy(*res, queries_host_target.view(), queries_device_target.view()); + raft::resource::sync_stream(*res); + + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + 
cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::cagra::search(*res, *mg_index_, mg_search_params, + queries_host_target.view(), + neighbors_host_view, distances_host_view); + } else if (local_index) { + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::cagra::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device_target.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max()) { + search_res.neighbors[i] = static_cast(-1); + } + } + return search_res; + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"CAGRA\", \"cagra\": {"; + if (index_) { + json += "\"mode\": \"Single-GPU\", \"size\": " + std::to_string(index_->size()) + + ", \"graph_degree\": " + std::to_string(index_->graph_degree()); + } else if (mg_index_) { + json += "\"mode\": \"Multi-GPU\", \"shards\": ["; + for (size_t i = 0; i < mg_index_->ann_interfaces_.size(); ++i) { + const auto& iface = mg_index_->ann_interfaces_[i]; + json += "{\"device\": " + std::to_string(this->devices_[i]); + if 
(iface.index_.has_value()) { + json += ", \"size\": " + std::to_string(iface.index_.value().size()) + + ", \"graph_degree\": " + std::to_string(iface.index_.value().graph_degree()); + } else { + json += ", \"status\": \"Not loaded\""; + } + json += "}" + std::string(i == mg_index_->ann_interfaces_.size() - 1 ? "" : ", "); + } + json += "]"; + } else { + json += "\"built\": false"; + } + json += "}}"; + return json; + } + + void destroy() override { + if (this->worker) { + this->worker->stop(); + } + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/cagra_c.cpp b/cgo/cuvs/cagra_c.cpp new file mode 100644 index 0000000000000..ba282895c1fe7 --- /dev/null +++ b/cgo/cuvs/cagra_c.cpp @@ -0,0 +1,502 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cagra_c.h" +#include "cagra.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_cagra_any_t { + quantization_t qtype; + void* ptr; + + gpu_cagra_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_cagra_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_cagra_c gpu_cagra_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const 
std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_new", e.what()); + return nullptr; + } +} + +gpu_cagra_c gpu_cagra_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_new_empty", e.what()); + return nullptr; + } +} + +void gpu_cagra_add_chunk(gpu_cagra_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_INT8: 
static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_add_chunk", e.what()); + } +} + +void gpu_cagra_add_chunk_float(gpu_cagra_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_add_chunk_float", e.what()); + } +} + +void gpu_cagra_train_quantizer(gpu_cagra_c index_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_train_quantizer", e.what()); + } +} + +void gpu_cagra_set_per_thread_device(gpu_cagra_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + 
auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_set_per_thread_device", e.what()); + } +} + +void gpu_cagra_set_use_batching(gpu_cagra_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_set_use_batching", e.what()); + } +} + +void gpu_cagra_set_quantizer(gpu_cagra_c index_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_set_quantizer", e.what()); + } +} + +void gpu_cagra_get_quantizer(gpu_cagra_c index_c, float* min, float* max, void* errmsg) { + 
if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_get_quantizer", e.what()); + } +} + +gpu_cagra_c gpu_cagra_load_file(const char* filename, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_load_file", e.what()); + return nullptr; + } +} 
+ +void gpu_cagra_destroy(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_destroy", e.what()); + } +} + +void gpu_cagra_start(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_start", e.what()); + } +} + +void gpu_cagra_build(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + case Quantization_INT8: static_cast*>(any->ptr)->build(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_build", e.what()); + } +} + +void gpu_cagra_save(gpu_cagra_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error 
in gpu_cagra_save", e.what()); + } +} + +gpu_cagra_search_res_t gpu_cagra_search(gpu_cagra_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_cagra_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::cagra_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_search", e.what()); + } + return res; +} + +gpu_cagra_search_res_t gpu_cagra_search_float(gpu_cagra_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_cagra_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::cagra_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, 
search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_search_float", e.what()); + } + return res; +} + +void gpu_cagra_get_neighbors(gpu_cagra_result_c result_c, uint64_t total_elements, uint32_t* neighbors) { + if (!result_c) return; + auto* neighbors_vec = &static_cast(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_cagra_get_distances(gpu_cagra_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + auto* distances_vec = &static_cast(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_cagra_free_result(gpu_cagra_result_c result_c) { + if (!result_c) return; + delete static_cast(result_c); +} + +uint32_t gpu_cagra_cap(gpu_cagra_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + case Quantization_INT8: return static_cast*>(any->ptr)->cap(); + case Quantization_UINT8: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_cagra_len(gpu_cagra_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); 
+ case Quantization_INT8: return static_cast*>(any->ptr)->len(); + case Quantization_UINT8: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_cagra_info(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_info", e.what()); + return nullptr; + } +} + +void gpu_cagra_extend(gpu_cagra_c index_c, const void* additional_data, uint64_t num_vectors, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_F16: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_INT8: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_UINT8: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_extend", e.what()); + } +} + +gpu_cagra_c gpu_cagra_merge(gpu_cagra_c* indices_c, int count, uint32_t nthread, const int* devices, int device_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (count <= 0) return nullptr; + std::vector devs(devices, devices + device_count); + auto* first_any = static_cast(indices_c[0]); + quantization_t 
qtype = first_any->qtype; + + void* merged_ptr = nullptr; + switch (qtype) { + case Quantization_F32: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + case Quantization_F16: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + case Quantization_INT8: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + case Quantization_UINT8: { + std::vector*> indices; + for (int i = 0; i < count; ++i) indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(indices, nthread, devs).release(); + break; + } + default: break; + } + return static_cast(new gpu_cagra_any_t(qtype, merged_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_merge", e.what()); + return nullptr; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_cagra_t; +template class gpu_cagra_t; +template class gpu_cagra_t; +template class gpu_cagra_t; +} // namespace matrixone diff --git a/cgo/cuvs/cagra_c.h b/cgo/cuvs/cagra_c.h new file mode 100644 index 0000000000000..587547ba87d17 --- /dev/null +++ b/cgo/cuvs/cagra_c.h @@ -0,0 +1,118 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CAGRA_C_H +#define CAGRA_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_cagra_t object +typedef void* gpu_cagra_c; + +// Opaque pointer to the C++ CAGRA search result object +typedef void* gpu_cagra_result_c; + +// Constructor for building from dataset +gpu_cagra_c gpu_cagra_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_cagra_c gpu_cagra_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Destructor +void gpu_cagra_destroy(gpu_cagra_c index_c, void* errmsg); + +// Start function (initializes worker and resources) +void gpu_cagra_start(gpu_cagra_c index_c, void* errmsg); + +// Build function (actually triggers the build/load logic) +void gpu_cagra_build(gpu_cagra_c index_c, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_cagra_c gpu_cagra_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Add chunk 
of data (same type as index quantization) +void gpu_cagra_add_chunk(gpu_cagra_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly quantization if needed) +void gpu_cagra_add_chunk_float(gpu_cagra_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// Trains the scalar quantizer (if T is 1-byte) +void gpu_cagra_train_quantizer(gpu_cagra_c index_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_cagra_set_per_thread_device(gpu_cagra_c index_c, bool enable, void* errmsg); +void gpu_cagra_set_use_batching(gpu_cagra_c index_c, bool enable, void* errmsg); + +void gpu_cagra_set_quantizer(gpu_cagra_c index_c, float min, float max, void* errmsg); +void gpu_cagra_get_quantizer(gpu_cagra_c index_c, float* min, float* max, void* errmsg); + +// Destructor + + +void gpu_cagra_save(gpu_cagra_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_cagra_result_c result_ptr; +} gpu_cagra_search_res_t; + +gpu_cagra_search_res_t gpu_cagra_search(gpu_cagra_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg); + +gpu_cagra_search_res_t gpu_cagra_search_float(gpu_cagra_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg); +// Get results from result object +void gpu_cagra_get_neighbors(gpu_cagra_result_c result_c, uint64_t total_elements, uint32_t* neighbors); +void gpu_cagra_get_distances(gpu_cagra_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_cagra_free_result(gpu_cagra_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_cagra_cap(gpu_cagra_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_cagra_len(gpu_cagra_c 
index_c); + +// Returns info about the index as a JSON string +char* gpu_cagra_info(gpu_cagra_c index_c, void* errmsg); + +// Extend function +void gpu_cagra_extend(gpu_cagra_c index_c, const void* additional_data, uint64_t num_vectors, void* errmsg); + +// Merge function +gpu_cagra_c gpu_cagra_merge(gpu_cagra_c* indices_c, int num_indices, uint32_t nthread, const int* devices, int device_count, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // CAGRA_C_H diff --git a/cgo/cuvs/cuvs_types.h b/cgo/cuvs/cuvs_types.h new file mode 100644 index 0000000000000..c5b028fc45d47 --- /dev/null +++ b/cgo/cuvs/cuvs_types.h @@ -0,0 +1,181 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MO_CUVS_TYPES_H +#define MO_CUVS_TYPES_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Distance metrics supported by cuVS. 
+ */ +typedef enum { + DistanceType_L2Expanded = 0, // Squared L2 distance: sum((x-y)^2) + DistanceType_L2SqrtExpanded = 1, // L2 distance: sqrt(sum((x-y)^2)) + DistanceType_CosineExpanded = 2, // Cosine distance: 1 - (x.y)/(|x||y|) + DistanceType_L1 = 3, // L1 (Manhattan) distance: sum(|x-y|) + DistanceType_L2Unexpanded = 4, // L2 distance without expansion + DistanceType_L2SqrtUnexpanded = 5, // L2 distance with sqrt without expansion + DistanceType_InnerProduct = 6, // Inner product: x.y + DistanceType_Linf = 7, // Chebyshev distance: max(|x-y|) + DistanceType_Canberra = 8, // Canberra distance + DistanceType_LpUnexpanded = 9, // Lp distance + DistanceType_CorrelationExpanded = 10, // Correlation distance + DistanceType_JaccardExpanded = 11, // Jaccard distance + DistanceType_HellingerExpanded = 12, // Hellinger distance + DistanceType_Haversine = 13, // Haversine distance + DistanceType_BrayCurtis = 14, // Bray-Curtis distance + DistanceType_JensenShannon = 15, // Jensen-Shannon distance + DistanceType_HammingUnexpanded = 16, // Hamming distance + DistanceType_KLDivergence = 17, // Kullback-Leibler divergence + DistanceType_RusselRaoExpanded = 18, // Russel-Rao distance + DistanceType_DiceExpanded = 19, // Dice distance + DistanceType_BitwiseHamming = 20, // Bitwise Hamming distance + DistanceType_Precomputed = 100, // Precomputed distance + // Aliases + DistanceType_CosineSimilarity = 2, // Alias for Cosine distance + DistanceType_Jaccard = 11, // Alias for Jaccard distance + DistanceType_Hamming = 16, // Alias for Hamming distance + DistanceType_Unknown = 255 // Unknown distance type +} distance_type_t; + +/** + * @brief Data quantization types. + */ +typedef enum { + Quantization_F32, // 32-bit floating point + Quantization_F16, // 16-bit floating point (half) + Quantization_INT8, // 8-bit signed integer + Quantization_UINT8 // 8-bit unsigned integer +} quantization_t; + +/** + * @brief GPU distribution modes. 
+ */ +typedef enum { + DistributionMode_SINGLE_GPU, // Single GPU mode + DistributionMode_SHARDED, // Sharded across multiple GPUs + DistributionMode_REPLICATED // Replicated across multiple GPUs +} distribution_mode_t; + +/** + * @brief CAGRA index build parameters. + */ +typedef struct { + size_t intermediate_graph_degree; // Degree of the intermediate graph (default 128) + size_t graph_degree; // Degree of the final graph (default 64) + bool attach_dataset_on_build; // Whether to attach the dataset to the index (default true) +} cagra_build_params_t; + +/** + * @brief CAGRA search parameters. + */ +typedef struct { + size_t itopk_size; // Internal top-k size (default 64) + size_t search_width; // Number of search paths (default 1) +} cagra_search_params_t; + +/** + * @brief IVF-Flat index build parameters. + */ +typedef struct { + uint32_t n_lists; // Number of inverted lists (clusters) (default 1024) + bool add_data_on_build; // Whether to add data to the index during build (default true) + double kmeans_trainset_fraction; // Fraction of data to use for k-means training (default 0.5) +} ivf_flat_build_params_t; + +/** + * @brief IVF-Flat search parameters. + */ +typedef struct { + uint32_t n_probes; // Number of lists to probe during search (default 20) +} ivf_flat_search_params_t; + +/** + * @brief IVF-PQ index build parameters. + */ +typedef struct { + uint32_t n_lists; // Number of inverted lists (clusters) (default 1024) + uint32_t m; // Number of sub-vectors (default 16) + uint32_t bits_per_code; // Bits per code (default 8) + bool add_data_on_build; // Whether to add data to the index during build (default true) + double kmeans_trainset_fraction; // Fraction of data to use for k-means training (default 0.5) +} ivf_pq_build_params_t; + +/** + * @brief IVF-PQ search parameters. 
+ */ +typedef struct { + uint32_t n_probes; // Number of lists to probe during search (default 20) +} ivf_pq_search_params_t; + +/** + * @brief Brute-force index build parameters (dummy). + */ +typedef struct { +} brute_force_build_params_t; + +/** + * @brief K-Means build parameters (dummy for inheritance). + */ +typedef struct { +} kmeans_build_params_t; + +#ifdef __cplusplus +static inline cagra_build_params_t cagra_build_params_default() { + return {128, 64, true}; +} + +static inline cagra_search_params_t cagra_search_params_default() { + return {64, 1}; +} + +static inline ivf_flat_build_params_t ivf_flat_build_params_default() { + return {1024, true, 0.5}; +} + +static inline ivf_flat_search_params_t ivf_flat_search_params_default() { + return {20}; +} + +static inline ivf_pq_build_params_t ivf_pq_build_params_default() { + return {1024, 16, 8, true, 0.5}; +} + +static inline ivf_pq_search_params_t ivf_pq_search_params_default() { + return {20}; +} + +static inline brute_force_build_params_t brute_force_build_params_default() { + return {}; +} + +static inline kmeans_build_params_t kmeans_build_params_default() { + return {}; +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif // MO_CUVS_TYPES_H diff --git a/cgo/cuvs/cuvs_worker.hpp b/cgo/cuvs/cuvs_worker.hpp new file mode 100644 index 0000000000000..eeaca3551a32c --- /dev/null +++ b/cgo/cuvs/cuvs_worker.hpp @@ -0,0 +1,585 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Wrapper for RAFT resources to manage their lifecycle. + * Supports both single-GPU and single-node multi-GPU (SNMG) modes. + */ +class raft_handle_wrapper_t { +public: + // Default constructor for single-GPU mode (uses current device) + raft_handle_wrapper_t() : resources_(std::make_unique()) {} + + // Constructor for single-GPU mode with a specific device ID + explicit raft_handle_wrapper_t(int device_id) { + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + resources_ = std::make_unique(); + } + + // Constructor for multi-GPU mode (SNMG) + // force_mg: If true, use device_resources_snmg even if devices.size() == 1 (useful for testing) + explicit raft_handle_wrapper_t(const std::vector& devices, bool force_mg = false) { + if (devices.empty()) { + resources_ = std::make_unique(); + } else if (devices.size() == 1 && !force_mg) { + RAFT_CUDA_TRY(cudaSetDevice(devices[0])); + resources_ = std::make_unique(); + } else { + // Ensure the main device is set before creating SNMG resources + RAFT_CUDA_TRY(cudaSetDevice(devices[0])); + resources_ = std::make_unique(devices); + } + } + + ~raft_handle_wrapper_t() = default; + + raft::resources* get_raft_resources() const { return resources_.get(); } + +private: + std::unique_ptr resources_; +}; + +/** + * @brief Helper to check if a RAFT handle is configured for Multi-GPU (SNMG). 
+ */ +static inline bool is_snmg_handle(raft::resources* res) { + return dynamic_cast(res) != nullptr; +} + +/** + * @brief A thread-safe blocking queue for task distribution. + */ +template +class thread_safe_queue_t { +public: + void set_capacity(size_t capacity) { + std::lock_guard lock(mu_); + capacity_ = capacity; + } + + void push(T value) { + std::unique_lock lock(mu_); + cv_full_.wait(lock, [this] { return queue_.size() < capacity_ || stopped_; }); + if (stopped_) return; + queue_.push_back(std::move(value)); + cv_empty_.notify_one(); + } + + bool pop(T& value) { + std::unique_lock lock(mu_); + cv_empty_.wait(lock, [this] { return !queue_.empty() || stopped_; }); + if (stopped_) return false; + value = std::move(queue_.front()); + queue_.pop_front(); + cv_full_.notify_one(); + return true; + } + + bool try_pop(T& value) { + std::lock_guard lock(mu_); + if (queue_.empty() || stopped_) return false; + value = std::move(queue_.front()); + queue_.pop_front(); + cv_full_.notify_one(); + return true; + } + + void stop() { + { + std::lock_guard lock(mu_); + stopped_ = true; + } + cv_empty_.notify_all(); + cv_full_.notify_all(); + } + + bool is_stopped() const { + std::lock_guard lock(mu_); + return stopped_; + } + + bool empty() const { + std::lock_guard lock(mu_); + return queue_.empty(); + } + + size_t size() const { + std::lock_guard lock(mu_); + return queue_.size(); + } + +private: + std::deque queue_; + mutable std::mutex mu_; + std::condition_variable cv_empty_; + std::condition_variable cv_full_; + size_t capacity_ = std::numeric_limits::max(); + bool stopped_ = false; +}; + +struct cuvs_task_result_t { + uint64_t id; + std::any result; + std::exception_ptr error; +}; + +/** + * @brief Manages storage and retrieval of task results. 
+ */ +class cuvs_task_result_store_t { +public: + cuvs_task_result_store_t() : next_id_(1), stopped_(false) {} + + uint64_t get_next_job_id() { return next_id_.fetch_add(1); } + + void store(const cuvs_task_result_t& result) { + std::unique_lock lock(mu_); + if (auto it = pending_.find(result.id); it != pending_.end()) { + auto promise = std::move(it->second); + pending_.erase(it); + lock.unlock(); + promise->set_value(result); + } else { + results_[result.id] = result; + } + } + + std::future wait(uint64_t job_id) { + std::unique_lock lock(mu_); + if (stopped_) { + std::promise p; + p.set_exception(std::make_exception_ptr(std::runtime_error("cuvs_task_result_store_t stopped before result was available"))); + return p.get_future(); + } + + if (auto it = results_.find(job_id); it != results_.end()) { + std::promise p; + p.set_value(std::move(it->second)); + results_.erase(it); + return p.get_future(); + } + + auto promise = std::make_shared>(); + pending_[job_id] = promise; + return promise->get_future(); + } + + void stop() { + std::lock_guard lock(mu_); + stopped_ = true; + for (auto& pair : pending_) { + pair.second->set_exception(std::make_exception_ptr(std::runtime_error("cuvs_task_result_store_t stopped before result was available"))); + } + pending_.clear(); + results_.clear(); + } + +private: + std::atomic next_id_; + std::mutex mu_; + std::map>> pending_; + std::map results_; + bool stopped_; +}; + +/** + * @brief dedicated worker pool for executing cuVS (RAFT) tasks in GPU-enabled threads. 
+ */ +class cuvs_worker_t { +public: + using raft_handle = raft_handle_wrapper_t; + using user_task_fn = std::function; + using batch_exec_fn = std::function&, const std::vector>&)>; + + struct cuvs_task_t { + uint64_t id; + user_task_fn fn; + }; + + explicit cuvs_worker_t(size_t n_threads, int device_id = -1) + : n_threads_(n_threads), device_id_(device_id) { + if (n_threads == 0) throw std::invalid_argument("Thread count must be > 0"); + size_t cap = 2 * n_threads; + main_tasks_.set_capacity(cap); + worker_tasks_.set_capacity(cap); + } + + cuvs_worker_t(size_t n_threads, const std::vector& devices, bool force_mg = false) + : n_threads_(n_threads), devices_(devices), force_mg_(force_mg) { + if (n_threads == 0) throw std::invalid_argument("Thread count must be > 0"); + size_t cap = 2 * n_threads; + main_tasks_.set_capacity(cap); + worker_tasks_.set_capacity(cap); + } + + ~cuvs_worker_t() { stop(); } + + cuvs_worker_t(const cuvs_worker_t&) = delete; + cuvs_worker_t& operator=(const cuvs_worker_t&) = delete; + + void start(user_task_fn init_fn = nullptr, user_task_fn stop_fn = nullptr) { + if (started_.exchange(true)) return; + main_thread_ = std::thread(&cuvs_worker_t::run_main_loop, this, std::move(init_fn), std::move(stop_fn)); + } + + void set_per_thread_device(bool enable) { per_thread_device_ = enable; } + void set_use_batching(bool enable) { use_batching_ = enable; } + bool use_batching() const { return use_batching_; } + + void stop() { + if (!started_.load() || stopped_.exchange(true)) return; + + { + std::lock_guard lock(worker_mu_); + should_stop_ = true; + main_tasks_.stop(); + worker_tasks_.stop(); + } + worker_cv_.notify_all(); + + if (main_thread_.joinable()) main_thread_.join(); + for (auto& t : sub_workers_) if (t.joinable()) t.join(); + + sub_workers_.clear(); + result_store_.stop(); + } + + uint64_t submit(user_task_fn fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit task: worker stopped"); + uint64_t id = 
result_store_.get_next_job_id(); + worker_tasks_.push({id, std::move(fn)}); + worker_cv_.notify_all(); + return id; + } + + uint64_t submit_main(user_task_fn fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit main task: worker stopped"); + uint64_t id = result_store_.get_next_job_id(); + main_tasks_.push({id, std::move(fn)}); + worker_cv_.notify_all(); + return id; + } + + std::future wait(uint64_t id) { return result_store_.wait(id); } + + /** + * @brief Submits a task that can be merged with other tasks having the same batch_key. + * + * @tparam T The expected return type. + * @param batch_key Unique identifier for grouping compatible tasks. + * @param request The data for this individual request. + * @param exec_fn Callback to execute the combined batch. + * @return std::future Future for the individual result. + */ + template + std::future submit_batched(const std::string& batch_key, std::any request, batch_exec_fn exec_fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit batched task: worker stopped"); + + if (!use_batching_ || n_threads_ <= 1) { + // Direct submission without batching + auto promise = std::make_shared>(); + auto future = promise->get_future(); + submit([promise, request, exec_fn](raft_handle& handle) -> std::any { + try { + std::vector reqs = {request}; + std::vector> setters = {[promise](std::any val) { + try { + if (val.type() == typeid(std::exception_ptr)) promise->set_exception(std::any_cast(val)); + else promise->set_value(std::any_cast(val)); + } catch (...) { promise->set_exception(std::current_exception()); } + }}; + exec_fn(handle, reqs, setters); + } catch (...) 
{ + promise->set_exception(std::current_exception()); + } + return std::any(); + }); + return future; + } + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + // Setter to resolve the promise from a std::any result + auto setter = [promise](std::any val) { + try { + if (val.type() == typeid(std::exception_ptr)) { + promise->set_exception(std::any_cast(val)); + } else { + promise->set_value(std::any_cast(val)); + } + } catch (...) { + promise->set_exception(std::current_exception()); + } + }; + + std::shared_ptr batch; + { + std::lock_guard lock(batches_mu_); + auto it = batches_.find(batch_key); + if (it == batches_.end()) { + batch = std::make_shared(); + batches_[batch_key] = batch; + } else { + batch = it->second; + } + + // Simple periodic cleanup of old batches + static size_t cleanup_counter = 0; + if (++cleanup_counter % 1000 == 0) { + for (auto bit = batches_.begin(); bit != batches_.end(); ) { + std::lock_guard block(bit->second->mu); + if (!bit->second->scheduled && bit->second->requests.empty()) { + bit = batches_.erase(bit); + } else { + ++bit; + } + } + } + } + + bool trigger = false; + { + std::lock_guard lock(batch->mu); + batch->requests.push_back(std::move(request)); + batch->setters.push_back(std::move(setter)); + if (!batch->scheduled) { + batch->scheduled = true; + trigger = true; + } + } + + if (trigger) { + // Submit a trigger task that will wait a tiny bit then drain the batch + submit([this, batch, exec_fn](raft_handle& handle) -> std::any { + // Micro-batching wait: allows more goroutines to join the batch + std::this_thread::sleep_for(std::chrono::microseconds(100)); + + std::vector reqs; + std::vector> setters; + + { + std::lock_guard lock(batch->mu); + reqs = std::move(batch->requests); + setters = std::move(batch->setters); + batch->scheduled = false; + } + + if (!reqs.empty()) { + try { + exec_fn(handle, reqs, setters); + } catch (...) 
{ + auto err = std::current_exception(); + for (auto& s : setters) s(err); + } + } + return std::any(); + }); + } + + return future; + } + + std::exception_ptr get_first_error() { + std::lock_guard lock(event_mu_); + return fatal_error_; + } + +private: + void run_main_loop(user_task_fn init_fn, user_task_fn stop_fn) { + pin_thread(0); + auto resource = setup_resource_internal(0, true); + if (!resource) return; + + if (init_fn) { + try { init_fn(*resource); } + catch (...) { report_fatal_error(std::current_exception()); return; } + } + + auto defer_cleanup = [&]() { if (stop_fn) try { stop_fn(*resource); } catch (...) {} }; + std::shared_ptr cleanup_guard(nullptr, [&](...) { defer_cleanup(); }); + + if (n_threads_ > 1) { + for (size_t i = 1; i < n_threads_; ++i) { + sub_workers_.emplace_back(&cuvs_worker_t::worker_sub_loop, this, i); + } + } + + while (true) { + cuvs_task_t task; + bool found = false; + + { + std::unique_lock lock(worker_mu_); + worker_cv_.wait(lock, [&] { + return !main_tasks_.empty() || !worker_tasks_.empty() || should_stop_ || fatal_error_; + }); + + if (should_stop_ || fatal_error_) break; + + if (main_tasks_.try_pop(task)) { + found = true; + } else if (worker_tasks_.try_pop(task)) { + found = true; + } + } + + if (found) { + execute_task(task, *resource); + } + } + } + + void worker_sub_loop(size_t thread_idx) { + pin_thread(-1); + auto resource = setup_resource_internal(thread_idx, false); + if (!resource) return; + + cuvs_task_t task; + while (worker_tasks_.pop(task)) { + if (fatal_error_) break; + execute_task(task, *resource); + } + } + + void execute_task(const cuvs_task_t& task, raft_handle& resource) { + cuvs_task_result_t res; + res.id = task.id; + try { res.result = task.fn(resource); } + catch (...) { + res.error = std::current_exception(); + std::cerr << "ERROR: Task " << task.id << " failed." 
<< std::endl; + } + result_store_.store(res); + } + + std::unique_ptr setup_resource_internal(size_t thread_idx, bool is_main_thread) { + try { + if (!devices_.empty()) { + if (is_main_thread) { + return std::make_unique(devices_, force_mg_); + } + if (per_thread_device_ && n_threads_ > 1) { + int dev = devices_[thread_idx % devices_.size()]; + return std::make_unique(dev); + } + return std::make_unique(devices_, force_mg_); + } else if (device_id_ >= 0) { + return std::make_unique(device_id_); + } else { + return std::make_unique(); + } + } catch (...) { + report_fatal_error(std::current_exception()); + std::cerr << "ERROR: Failed to setup RAFT resource." << std::endl; + return nullptr; + } + } + + void report_fatal_error(std::exception_ptr err) { + std::lock_guard lock(event_mu_); + if (!fatal_error_) fatal_error_ = err; + { + std::lock_guard lock_w(worker_mu_); + should_stop_ = true; // NEW: Ensure we signal stop on fatal error + } + worker_cv_.notify_all(); + } + + void pin_thread(int cpu_id) { +#ifdef __linux__ + static std::atomic next_cpu_id{1}; + int id = (cpu_id >= 0) ? 
cpu_id : (next_cpu_id.fetch_add(1) % std::thread::hardware_concurrency()); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(id, &cpuset); + if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) { + std::cerr << "WARNING: Failed to set affinity for thread to core " << id << std::endl; + } +#endif + } + + size_t n_threads_; + int device_id_ = -1; + std::vector devices_; + bool force_mg_ = false; + bool per_thread_device_ = false; + bool use_batching_ = false; + std::atomic started_{false}; + std::atomic stopped_{false}; + + // Unified Task Management + std::mutex worker_mu_; + std::condition_variable worker_cv_; + thread_safe_queue_t main_tasks_; + thread_safe_queue_t worker_tasks_; + bool should_stop_ = false; + + cuvs_task_result_store_t result_store_; + std::thread main_thread_; + std::vector sub_workers_; + + std::mutex event_mu_; + std::exception_ptr fatal_error_; + + // Batching support + struct batch_t { + std::mutex mu; + std::vector requests; + std::vector> setters; + bool scheduled = false; + }; + std::mutex batches_mu_; + std::map> batches_; +}; + +} // namespace matrixone diff --git a/cgo/cuvs/distance.hpp b/cgo/cuvs/distance.hpp new file mode 100644 index 0000000000000..e98539b74b3ca --- /dev/null +++ b/cgo/cuvs/distance.hpp @@ -0,0 +1,98 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include "helper.h" +#include +#include + +namespace matrixone { + +/** + * @brief Performs a pairwise distance calculation on GPU. + * + * @tparam T Data type of the vector elements (e.g., float, half). + * @param res RAFT resources handle. + * @param x Host pointer to the first set of vectors (X). + * @param n_x Number of vectors in X. + * @param y Host pointer to the second set of vectors (Y). + * @param n_y Number of vectors in Y. + * @param dim Dimension of each vector. + * @param metric Distance metric to use. + * @param dist Host pointer to store the resulting distances (size: n_x * n_y). + */ +template +void pairwise_distance(const raft::resources& res, + const T* x, + uint64_t n_x, + const T* y, + uint64_t n_y, + uint32_t dim, + cuvs::distance::DistanceType metric, + float* dist) { + auto stream = raft::resource::get_cuda_stream(res); + + // Helper to align sizes to 256 bytes (CUDA default alignment) + auto align_size = [](size_t size) { + return (size + 255) & ~255; + }; + + // 1. Calculate total buffer sizes with alignment + size_t x_bytes = n_x * dim * sizeof(T); + size_t y_bytes = n_y * dim * sizeof(T); + size_t dist_bytes = n_x * n_y * sizeof(float); + + size_t x_alloc = align_size(x_bytes); + size_t y_alloc = align_size(y_bytes); + size_t total_bytes = x_alloc + y_alloc + dist_bytes; + + // Use a single allocation for all temporary buffers to reduce overhead + void* d_ptr = nullptr; + RAFT_CUDA_TRY(cudaMallocAsync(&d_ptr, total_bytes, stream)); + + char* d_x = static_cast(d_ptr); + char* d_y = d_x + x_alloc; + char* d_dist = d_y + y_alloc; + + // 2. Async copies to Device + RAFT_CUDA_TRY(cudaMemcpyAsync(d_x, x, x_bytes, cudaMemcpyHostToDevice, stream)); + RAFT_CUDA_TRY(cudaMemcpyAsync(d_y, y, y_bytes, cudaMemcpyHostToDevice, stream)); + + // 3. 
Prepare Views (zero allocation) + auto x_view = raft::make_device_matrix_view(reinterpret_cast(d_x), (int64_t)n_x, (int64_t)dim); + auto y_view = raft::make_device_matrix_view(reinterpret_cast(d_y), (int64_t)n_y, (int64_t)dim); + auto dist_view = raft::make_device_matrix_view(reinterpret_cast(d_dist), (int64_t)n_x, (int64_t)n_y); + + // 4. Execute Pairwise Distance + cuvs::distance::pairwise_distance(res, x_view, y_view, dist_view, metric); + + // 5. Async copy results back to host + RAFT_CUDA_TRY(cudaMemcpyAsync(dist, d_dist, dist_bytes, cudaMemcpyDeviceToHost, stream)); + + // 6. Synchronize + raft::resource::sync_stream(res); + + // 7. Async free + RAFT_CUDA_TRY(cudaFreeAsync(d_ptr, stream)); +} + +} // namespace matrixone diff --git a/cgo/cuvs/distance_c.cpp b/cgo/cuvs/distance_c.cpp new file mode 100644 index 0000000000000..e3c3b02db7d99 --- /dev/null +++ b/cgo/cuvs/distance_c.cpp @@ -0,0 +1,55 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "distance_c.h" +#include "distance.hpp" +#include +#include + +extern "C" { + +void gpu_pairwise_distance(const void* x, + uint64_t n_x, + const void* y, + uint64_t n_y, + uint32_t dim, + distance_type_t metric, + quantization_t qtype, + int device_id, + float* dist, + void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (!x || !y || !dist || n_x == 0 || n_y == 0 || dim == 0) return; + + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + const raft::resources& res = matrixone::get_raft_resources(); + cuvs::distance::DistanceType metric_cuvs = matrixone::convert_distance_type(metric); + + if (qtype == Quantization_F32) { + matrixone::pairwise_distance(res, static_cast(x), n_x, static_cast(y), n_y, dim, metric_cuvs, dist); + } else if (qtype == Quantization_F16) { + matrixone::pairwise_distance(res, static_cast(x), n_x, static_cast(y), n_y, dim, metric_cuvs, dist); + } else { + throw std::runtime_error("Unsupported quantization type for pairwise_distance"); + } + + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_pairwise_distance", e.what()); + } +} + +} // extern "C" diff --git a/cgo/cuvs/distance_c.h b/cgo/cuvs/distance_c.h new file mode 100644 index 0000000000000..fe35660afb194 --- /dev/null +++ b/cgo/cuvs/distance_c.h @@ -0,0 +1,56 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DISTANCE_C_H +#define DISTANCE_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Performs a pairwise distance calculation on GPU. + * + * @param x Host pointer to the first set of vectors (X). + * @param n_x Number of vectors in X. + * @param y Host pointer to the second set of vectors (Y). + * @param n_y Number of vectors in Y. + * @param dim Dimension of each vector. + * @param metric Distance metric to use. + * @param qtype Quantization type (F32, F16). + * @param device_id GPU device ID to use. + * @param dist Host pointer to store the resulting distances (size: n_x * n_y). + * @param errmsg Pointer to store error message if any. + */ +void gpu_pairwise_distance(const void* x, + uint64_t n_x, + const void* y, + uint64_t n_y, + uint32_t dim, + distance_type_t metric, + quantization_t qtype, + int device_id, + float* dist, + void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // DISTANCE_C_H diff --git a/cgo/cuvs/helper.cpp b/cgo/cuvs/helper.cpp new file mode 100644 index 0000000000000..506f72b662b27 --- /dev/null +++ b/cgo/cuvs/helper.cpp @@ -0,0 +1,158 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "helper.h" +#include "cuvs_worker.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace matrixone { +cuvs::distance::DistanceType convert_distance_type(distance_type_t metric_c) { + switch (metric_c) { + case DistanceType_L2Expanded: return cuvs::distance::DistanceType::L2Expanded; + case DistanceType_L2SqrtExpanded: return cuvs::distance::DistanceType::L2SqrtExpanded; + case DistanceType_CosineExpanded: return cuvs::distance::DistanceType::CosineExpanded; + case DistanceType_L1: return cuvs::distance::DistanceType::L1; + case DistanceType_L2Unexpanded: return cuvs::distance::DistanceType::L2Unexpanded; + case DistanceType_L2SqrtUnexpanded: return cuvs::distance::DistanceType::L2SqrtUnexpanded; + case DistanceType_InnerProduct: return cuvs::distance::DistanceType::InnerProduct; + case DistanceType_Linf: return cuvs::distance::DistanceType::Linf; + case DistanceType_Canberra: return cuvs::distance::DistanceType::Canberra; + case DistanceType_LpUnexpanded: return cuvs::distance::DistanceType::LpUnexpanded; + case DistanceType_CorrelationExpanded: return cuvs::distance::DistanceType::CorrelationExpanded; + case DistanceType_JaccardExpanded: return cuvs::distance::DistanceType::JaccardExpanded; + case DistanceType_HellingerExpanded: return cuvs::distance::DistanceType::HellingerExpanded; + case DistanceType_Haversine: return cuvs::distance::DistanceType::Haversine; + case DistanceType_BrayCurtis: return cuvs::distance::DistanceType::BrayCurtis; + case DistanceType_JensenShannon: return cuvs::distance::DistanceType::JensenShannon; + case DistanceType_HammingUnexpanded: return cuvs::distance::DistanceType::HammingUnexpanded; + case DistanceType_KLDivergence: return cuvs::distance::DistanceType::KLDivergence; + case DistanceType_RusselRaoExpanded: return cuvs::distance::DistanceType::RusselRaoExpanded; + case DistanceType_DiceExpanded: return cuvs::distance::DistanceType::DiceExpanded; + case DistanceType_BitwiseHamming: 
return cuvs::distance::DistanceType::BitwiseHamming; + case DistanceType_Precomputed: return cuvs::distance::DistanceType::Precomputed; + default: + throw std::runtime_error("Unknown or unsupported distance type"); + } +} + +const raft::resources& get_raft_resources() { + thread_local raft::resources res; + return res; +} +} + +// Vectorized kernel processing 2 elements per thread +__global__ void f32_to_f16_vectorized_kernel(const float2* src, half2* dst, uint64_t n_pairs) { + uint64_t i = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x; + if (i < n_pairs) { + dst[i] = __float22half2_rn(src[i]); + } +} + +// Fallback kernel for the last element if total_elements is odd +__global__ void f32_to_f16_tail_kernel(const float* src, half* dst, uint64_t index) { + dst[index] = __float2half(src[index]); +} + +extern "C" { + +int gpu_get_device_count() { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return -1; + } + return count; +} + +int gpu_get_device_list(int* devices, int max_count) { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return -1; + } + int actual_count = (count > max_count) ? 
max_count : count; + for (int i = 0; i < actual_count; ++i) { + devices[i] = i; + } + return actual_count; +} + +void set_errmsg(void* errmsg, const char* prefix, const char* what) { + if (errmsg) { + std::string err_str = std::string(prefix) + ": " + std::string(what); + char* msg = (char*)malloc(err_str.length() + 1); + if (msg) { + std::strcpy(msg, err_str.c_str()); + *(static_cast(errmsg)) = msg; + } + } else { + std::cerr << prefix << ": " << what << std::endl; + } +} + +void gpu_convert_f32_to_f16(const float* src, void* dst, uint64_t total_elements, int device_id, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (!src || !dst || total_elements == 0) return; + + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + + float *d_src = nullptr; + half *d_dst = nullptr; + + // Allocate device memory + RAFT_CUDA_TRY(cudaMalloc(&d_src, total_elements * sizeof(float))); + RAFT_CUDA_TRY(cudaMalloc(&d_dst, total_elements * sizeof(half))); + + // Copy source to device + RAFT_CUDA_TRY(cudaMemcpy(d_src, src, total_elements * sizeof(float), cudaMemcpyHostToDevice)); + + // Launch vectorized kernel for pairs + uint64_t n_pairs = total_elements / 2; + if (n_pairs > 0) { + uint32_t threads_per_block = 256; + uint32_t blocks = (n_pairs + threads_per_block - 1) / threads_per_block; + f32_to_f16_vectorized_kernel<<>>((const float2*)d_src, (half2*)d_dst, n_pairs); + } + + // Handle the tail if odd + if (total_elements % 2 != 0) { + f32_to_f16_tail_kernel<<<1, 1>>>(d_src, d_dst, total_elements - 1); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + // Copy result back to host + RAFT_CUDA_TRY(cudaMemcpy(dst, d_dst, total_elements * sizeof(half), cudaMemcpyDeviceToHost)); + + // Free device memory + cudaFree(d_src); + cudaFree(d_dst); + + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_convert_f32_to_f16", e.what()); + } +} + +} // extern "C" diff --git a/cgo/cuvs/helper.h b/cgo/cuvs/helper.h new file 
mode 100644 index 0000000000000..095f2188fd692 --- /dev/null +++ b/cgo/cuvs/helper.h @@ -0,0 +1,68 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MO_CUVS_C_HELPER_H +#define MO_CUVS_C_HELPER_H + +#include "cuvs_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Returns the number of CUDA-capable devices available. + * @return Number of GPU devices. + */ +int gpu_get_device_count(); + +/** + * @brief Lists the IDs of available CUDA devices. + * @param devices Output array to store device IDs. + * @param max_count Maximum number of device IDs to store. + * @return Number of device IDs written to the array. + */ +int gpu_get_device_list(int* devices, int max_count); + +/** + * @brief Converts float32 data to float16 (half) on GPU. + * @param src Pointer to source float32 data on host or device. + * @param dst Pointer to destination float16 data on device. + * @param total_elements Total number of elements to convert. + * @param device_id ID of the GPU device to use. + * @param errmsg Pointer to store error message if any. + */ +void gpu_convert_f32_to_f16(const float* src, void* dst, uint64_t total_elements, int device_id, void* errmsg); + +/** + * @brief Standardized helper to set an error message. + * @param errmsg Pointer to the error message destination. + * @param prefix Prefix for the error message (e.g., function name). + * @param what The actual error description. 
+ */ +void set_errmsg(void* errmsg, const char* prefix, const char* what); + +#ifdef __cplusplus +} + +#include +namespace matrixone { + cuvs::distance::DistanceType convert_distance_type(distance_type_t metric_c); + const raft::resources& get_raft_resources(); +} +#endif + +#endif // MO_CUVS_C_HELPER_H diff --git a/cgo/cuvs/index_base.hpp b/cgo/cuvs/index_base.hpp new file mode 100644 index 0000000000000..614fe7bf73c55 --- /dev/null +++ b/cgo/cuvs/index_base.hpp @@ -0,0 +1,183 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#pragma GCC diagnostic pop + +// cuVS includes +#include + +namespace matrixone { + +/** + * @brief gpu_index_base_t provides common functionality for all GPU-based indexes. + * It manages host dataset, worker pool, quantization, and basic properties. 
+ */ +template +class gpu_index_base_t { +public: + std::vector flattened_host_dataset; + std::vector devices_; + std::string filename_; + + cuvs::distance::DistanceType metric; + uint32_t dimension; + uint32_t count; + BuildParams build_params; + distribution_mode_t dist_mode; + + std::unique_ptr worker; + mutable std::shared_mutex mutex_; + bool is_loaded_ = false; + std::shared_ptr dataset_device_ptr_; // Keep device memory alive + + gpu_index_base_t() = default; + virtual ~gpu_index_base_t() { + destroy(); + } + + // Common management methods + virtual void destroy() { + if (worker) worker->stop(); + } + + void set_use_batching(bool enable) { + if (worker) worker->set_use_batching(enable); + } + + void set_per_thread_device(bool enable) { + if (worker) worker->set_per_thread_device(enable); + } + + void set_quantizer(float min, float max) { + quantizer_ = scalar_quantizer_t(min, max); + } + + void get_quantizer(float* min, float* max) const { + if (!quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + *min = quantizer_.min(); + *max = quantizer_.max(); + } + + void train_quantizer(const float* train_data, uint64_t n_samples) { + if (!train_data || n_samples == 0) return; + uint64_t job_id = worker->submit_main( + [&, train_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + auto train_device = raft::make_device_matrix(*res, n_samples, dimension); + raft::copy(*res, train_device.view(), raft::make_host_matrix_view(train_data, n_samples, dimension)); + quantizer_.train(*res, train_device.view()); + return std::any(); + } + ); + auto result_wait = worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + } + + void add_chunk(const T* chunk_data, uint64_t chunk_count) { + if (current_offset_ + chunk_count > count) throw std::runtime_error("offset out of bounds"); + std::copy(chunk_data, chunk_data + (chunk_count * dimension), 
flattened_host_dataset.begin() + (current_offset_ * dimension)); + current_offset_ += chunk_count; + } + + void add_chunk_float(const float* chunk_data, uint64_t chunk_count) { + if (current_offset_ + chunk_count > count) throw std::runtime_error("offset out of bounds"); + + uint64_t row_offset = current_offset_; + uint64_t job_id = worker->submit_main( + [&, chunk_data, chunk_count, row_offset](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + // If quantization is needed (T is 1-byte) + if constexpr (sizeof(T) == 1) { + if (!quantizer_.is_trained()) { + int64_t n_train = std::min(static_cast(chunk_count), static_cast(500)); + auto train_device = raft::make_device_matrix(*res, n_train, dimension); + raft::copy(*res, train_device.view(), raft::make_host_matrix_view(chunk_data, n_train, dimension)); + quantizer_.train(*res, train_device.view()); + } + + auto chunk_device_float = raft::make_device_matrix(*res, chunk_count, dimension); + raft::copy(*res, chunk_device_float.view(), raft::make_host_matrix_view(chunk_data, chunk_count, dimension)); + quantizer_.template transform(*res, chunk_device_float.view(), flattened_host_dataset.data() + (row_offset * dimension), false); + raft::resource::sync_stream(*res); + } else if constexpr (std::is_same_v) { + std::copy(chunk_data, chunk_data + (chunk_count * dimension), flattened_host_dataset.begin() + (row_offset * dimension)); + } else { + auto chunk_device_float = raft::make_device_matrix(*res, chunk_count, dimension); + raft::copy(*res, chunk_device_float.view(), raft::make_host_matrix_view(chunk_data, chunk_count, dimension)); + auto out_view = raft::make_host_matrix_view(flattened_host_dataset.data() + (row_offset * dimension), chunk_count, dimension); + raft::copy(*res, out_view, chunk_device_float.view()); + raft::resource::sync_stream(*res); + } + return std::any(); + } + ); + + auto result_wait = worker->wait(job_id).get(); + if (result_wait.error) 
std::rethrow_exception(result_wait.error); + current_offset_ += chunk_count; + } + + uint32_t cap() const { + return count; + } + + uint32_t len() const { + return static_cast(current_offset_); + } + + virtual std::string info() const { + std::string json = "{"; + json += "\"element_size\": " + std::to_string(sizeof(T)) + ", "; + json += "\"dimension\": " + std::to_string(dimension) + ", "; + json += "\"metric\": " + std::to_string(static_cast(metric)) + ", "; + json += "\"status\": \"" + std::string(is_loaded_ ? "Loaded" : "Not Loaded") + "\", "; + json += "\"capacity\": " + std::to_string(count) + ", "; + json += "\"current_length\": " + std::to_string(current_offset_) + ", "; + json += "\"devices\": ["; + for (size_t i = 0; i < devices_.size(); ++i) { + json += std::to_string(devices_[i]) + (i == devices_.size() - 1 ? "" : ", "); + } + json += "]"; + return json; // Caller will close the object or add more fields + } + +protected: + scalar_quantizer_t quantizer_; + uint64_t current_offset_ = 0; +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat.hpp b/cgo/cuvs/ivf_flat.hpp new file mode 100644 index 0000000000000..7096d5f2e1640 --- /dev/null +++ b/cgo/cuvs/ivf_flat.hpp @@ -0,0 +1,695 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include + +#include +#include +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Search result containing neighbor IDs and distances. + * Common for all IVF-Flat instantiations. + */ +struct ivf_flat_search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors +}; + +/** + * @brief gpu_ivf_flat_t implements an IVF-Flat index that can run on a single GPU or sharded across multiple GPUs. + * It automatically chooses between single-GPU and multi-GPU (SNMG) cuVS APIs based on the RAFT handle resources. 
+ */ +template +class gpu_ivf_flat_t : public gpu_index_base_t { +public: + using ivf_flat_index = cuvs::neighbors::ivf_flat::index; + using mg_index = cuvs::neighbors::mg_index; + using search_result_t = ivf_flat_search_result_t; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + ~gpu_ivf_flat_t() override { + this->destroy(); + } + + // Unified Constructor for building from dataset + gpu_ivf_flat_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const ivf_flat_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = static_cast(count_vectors); + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + // Constructor for chunked input (pre-allocates) + gpu_ivf_flat_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_flat_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + 
this->flattened_host_dataset.resize(this->count * this->dimension); + } + + // Unified Constructor for loading from file + gpu_ivf_flat_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_flat_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->filename_ = filename; + this->dimension = dimension; + this->metric = m; + this->count = 0; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + } + + void destroy() override { + if (this->worker) { + this->worker->stop(); + } + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + } + + /** + * @brief Starts the worker and initializes resources. + */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the index from file or builds it from the dataset. 
+ */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->filename_.empty() && this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + this->is_loaded_ = true; + // Clear host dataset after building to save memory + if (this->filename_.empty()) { + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!this->filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_flat::deserialize(*res, this->filename_)); + // Update metadata + this->count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) this->count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + this->build_params.n_lists = static_cast(mg_index_->ann_interfaces_[0].index_.value().n_lists()); + } + } else { + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = this->metric; + index_ = std::make_unique(*res, index_params, this->dimension); + cuvs::neighbors::ivf_flat::deserialize(*res, this->filename_, index_.get()); + this->count = static_cast(index_->size()); + this->build_params.n_lists = static_cast(index_->n_lists()); + } + raft::resource::sync_stream(*res); + } else if 
(!this->flattened_host_dataset.empty()) { + if (this->count < this->build_params.n_lists) { + throw std::runtime_error("Dataset too small: count (" + std::to_string(this->count) + + ") must be >= n_list (" + std::to_string(this->build_params.n_lists) + + ") to build IVF index."); + } + + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + this->flattened_host_dataset.data(), (int64_t)this->count, (int64_t)this->dimension); + + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = this->metric; + index_params.n_lists = this->build_params.n_lists; + index_params.add_data_on_build = this->build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = this->build_params.kmeans_trainset_fraction; + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (this->dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_flat::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(this->count), static_cast(this->dimension))); + + this->dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = this->metric; + index_params.n_lists = this->build_params.n_lists; + index_params.add_data_on_build = this->build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = this->build_params.kmeans_trainset_fraction; + + index_ = std::make_unique( + cuvs::neighbors::ivf_flat::build(*res, index_params, 
raft::make_const_mdspan(dataset_device->view()))); + } + raft::resource::sync_stream(*res); + } + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. + */ + void save(const std::string& filename) { + if (!this->is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::ivf_flat::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::ivf_flat::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Performs IVF-Flat search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp IVF-Flat search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_internal(handle, queries_data, num_queries, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation + */ + search_result_t search_batch_internal(const T* queries_data, uint64_t num_queries, uint32_t limit, const ivf_flat_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const T* data; + uint64_t n; + }; + + std::string batch_key = "ivf_flat_s_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + 
auto results = this->search_internal(handle, aggregated_queries.data(), total_queries, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search implementation (no worker submission) + */ + search_result_t search_internal(raft_handle_wrapper_t& handle, const T* queries_data, uint64_t num_queries, uint32_t limit, const ivf_flat_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_flat::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_flat_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, (int64_t)num_queries, (int64_t)this->dimension); + 
auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_flat::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else if (local_index) { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_flat::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + 
} + return search_res; + } + + /** + * @brief Performs IVF-Flat search for given float32 queries, with on-the-fly quantization if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit, sp); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_float_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_float_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation for float32 queries + */ + search_result_t search_float_batch_internal(const float* queries_data, uint64_t num_queries, uint32_t limit, const ivf_flat_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const float* data; + uint64_t n; + }; + + std::string batch_key = "ivf_flat_sf_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) 
total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto results = this->search_float_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search_float implementation (no worker submission) + */ + search_result_t search_float_internal(raft_handle_wrapper_t& handle, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. 
Quantize/Convert float queries to T on device + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, queries_device_float.view(), queries_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + } + + // 2. Perform search + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_flat::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_flat_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_target = raft::make_host_matrix(num_queries, this->dimension); + raft::copy(*res, queries_host_target.view(), queries_device_target.view()); + raft::resource::sync_stream(*res); + + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + 
cuvs::neighbors::ivf_flat::search(*res, *mg_index_, mg_search_params, + queries_host_target.view(), + neighbors_host_view, distances_host_view); + } else if (local_index) { + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_flat::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device_target.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + + std::vector get_centers() { + if (!this->is_loaded_ || (!index_ && !mg_index_)) return {}; + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + const ivf_flat_index* local_index = nullptr; + if (index_) { + local_index = index_.get(); + } else if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) { + local_index = &iface.index_.value(); + break; + } + } + } + + if (!local_index) return 
std::vector{}; + + auto centers_view = local_index->centers(); + size_t n_centers = centers_view.extent(0); + size_t dim = centers_view.extent(1); + std::vector host_centers(n_centers * dim); + + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centers.data(), centers_view.data_handle(), + host_centers.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centers; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + uint32_t get_n_list() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->n_lists()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().n_lists()); + } + } + return this->build_params.n_lists; + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"IVF-Flat\", \"ivf_flat\": {"; + if (index_) { + json += "\"mode\": \"Single-GPU\", \"size\": " + std::to_string(index_->size()) + + ", \"n_lists\": " + std::to_string(index_->n_lists()); + } else if (mg_index_) { + json += "\"mode\": \"Multi-GPU\", \"shards\": ["; + for (size_t i = 0; i < mg_index_->ann_interfaces_.size(); ++i) { + const auto& iface = mg_index_->ann_interfaces_[i]; + json += "{\"device\": " + std::to_string(this->devices_[i]); + if (iface.index_.has_value()) { + json += ", \"size\": " + std::to_string(iface.index_.value().size()) + + ", \"n_lists\": " + std::to_string(iface.index_.value().n_lists()); + } else { + json += ", \"status\": \"Not loaded\""; + } + json += "}" + std::string(i == mg_index_->ann_interfaces_.size() - 1 ? 
"" : ", "); + } + json += "]"; + } else { + json += "\"built\": false"; + } + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat_c.cpp b/cgo/cuvs/ivf_flat_c.cpp new file mode 100644 index 0000000000000..215090156c2bc --- /dev/null +++ b/cgo/cuvs/ivf_flat_c.cpp @@ -0,0 +1,507 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ivf_flat_c.h" +#include "ivf_flat.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_ivf_flat_any_t { + quantization_t qtype; + void* ptr; + + gpu_ivf_flat_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_ivf_flat_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_ivf_flat_c gpu_ivf_flat_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices 
+ device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_new", e.what()); + return nullptr; + } +} + +gpu_ivf_flat_c gpu_ivf_flat_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, 
dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_new_empty", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_add_chunk(gpu_ivf_flat_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_add_chunk", e.what()); + } +} + +void gpu_ivf_flat_add_chunk_float(gpu_ivf_flat_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in 
gpu_ivf_flat_add_chunk_float", e.what()); + } +} + +void gpu_ivf_flat_train_quantizer(gpu_ivf_flat_c index_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_train_quantizer", e.what()); + } +} + +void gpu_ivf_flat_set_per_thread_device(gpu_ivf_flat_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_set_per_thread_device", e.what()); + } +} + +void gpu_ivf_flat_set_use_batching(gpu_ivf_flat_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + case 
Quantization_UINT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_set_use_batching", e.what()); + } +} + +void gpu_ivf_flat_set_quantizer(gpu_ivf_flat_c index_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_set_quantizer", e.what()); + } +} + +void gpu_ivf_flat_get_quantizer(gpu_ivf_flat_c index_c, float* min, float* max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_get_quantizer", e.what()); + } +} + +gpu_ivf_flat_c gpu_ivf_flat_load_file(const char* filename, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = 
matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_load_file", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_destroy(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_destroy", e.what()); + } +} + +void gpu_ivf_flat_start(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_start", e.what()); + } +} + +void gpu_ivf_flat_build(gpu_ivf_flat_c index_c, void* errmsg) { + if 
(errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + case Quantization_INT8: static_cast*>(any->ptr)->build(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_build", e.what()); + } +} + +void gpu_ivf_flat_save(gpu_ivf_flat_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_save", e.what()); + } +} + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search(gpu_ivf_flat_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_flat_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, 
search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_UINT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_search", e.what()); + } + return res; +} + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search_float(gpu_ivf_flat_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_flat_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_UINT8: { + auto* 
cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_search_float", e.what()); + } + return res; +} + +void gpu_ivf_flat_get_neighbors(gpu_ivf_flat_result_c result_c, uint64_t total_elements, int64_t* neighbors) { + if (!result_c) return; + auto* neighbors_vec = &static_cast::search_result_t*>(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_ivf_flat_get_distances(gpu_ivf_flat_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + auto* distances_vec = &static_cast::search_result_t*>(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_ivf_flat_free_result(gpu_ivf_flat_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +uint32_t gpu_ivf_flat_cap(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + case Quantization_INT8: return static_cast*>(any->ptr)->cap(); + case Quantization_UINT8: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_ivf_flat_len(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); + case Quantization_INT8: return 
static_cast*>(any->ptr)->len(); + case Quantization_UINT8: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_ivf_flat_info(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_info", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_get_centers(gpu_ivf_flat_c index_c, void* centers, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_F16: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_INT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_UINT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + default: throw std::runtime_error("Unsupported quantization type"); + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_get_centers", e.what()); + } +} + +uint32_t 
gpu_ivf_flat_get_n_list(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_F16: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_n_list(); + default: return 0; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat_c.h b/cgo/cuvs/ivf_flat_c.h new file mode 100644 index 0000000000000..79c1243060bf6 --- /dev/null +++ b/cgo/cuvs/ivf_flat_c.h @@ -0,0 +1,117 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef IVF_FLAT_C_H +#define IVF_FLAT_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_ivf_flat_t object +typedef void* gpu_ivf_flat_c; + +// Opaque pointer to the C++ IVF-Flat search result object +typedef void* gpu_ivf_flat_result_c; + +// Constructor for building from dataset +gpu_ivf_flat_c gpu_ivf_flat_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_ivf_flat_c gpu_ivf_flat_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Destructor +void gpu_ivf_flat_destroy(gpu_ivf_flat_c index_c, void* errmsg); + +// Start function (initializes worker and resources) +void gpu_ivf_flat_start(gpu_ivf_flat_c index_c, void* errmsg); + +// Build function (actually triggers the build/load logic) +void gpu_ivf_flat_build(gpu_ivf_flat_c index_c, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_ivf_flat_c gpu_ivf_flat_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Add chunk of data (same type as index quantization) +void gpu_ivf_flat_add_chunk(gpu_ivf_flat_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly quantization if needed) +void gpu_ivf_flat_add_chunk_float(gpu_ivf_flat_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// 
Trains the scalar quantizer (if T is 1-byte) +void gpu_ivf_flat_train_quantizer(gpu_ivf_flat_c index_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_ivf_flat_set_per_thread_device(gpu_ivf_flat_c index_c, bool enable, void* errmsg); +void gpu_ivf_flat_set_use_batching(gpu_ivf_flat_c index_c, bool enable, void* errmsg); + +void gpu_ivf_flat_set_quantizer(gpu_ivf_flat_c index_c, float min, float max, void* errmsg); +void gpu_ivf_flat_get_quantizer(gpu_ivf_flat_c index_c, float* min, float* max, void* errmsg); + +// Destructor + +void gpu_ivf_flat_save(gpu_ivf_flat_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_ivf_flat_result_c result_ptr; +} gpu_ivf_flat_search_res_t; + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search(gpu_ivf_flat_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg); + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search_float(gpu_ivf_flat_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg); +// Get results from result object +void gpu_ivf_flat_get_neighbors(gpu_ivf_flat_result_c result_c, uint64_t total_elements, int64_t* neighbors); +void gpu_ivf_flat_get_distances(gpu_ivf_flat_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_ivf_flat_free_result(gpu_ivf_flat_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_ivf_flat_cap(gpu_ivf_flat_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_ivf_flat_len(gpu_ivf_flat_c index_c); + +// Returns info about the index as a JSON string +char* gpu_ivf_flat_info(gpu_ivf_flat_c index_c, void* errmsg); + +// Gets the trained centroids +void gpu_ivf_flat_get_centers(gpu_ivf_flat_c index_c, void* centers, void* errmsg); + +// Gets 
the number of lists (centroids) +uint32_t gpu_ivf_flat_get_n_list(gpu_ivf_flat_c index_c); + +#ifdef __cplusplus +} +#endif + +#endif // IVF_FLAT_C_H diff --git a/cgo/cuvs/ivf_pq.hpp b/cgo/cuvs/ivf_pq.hpp new file mode 100644 index 0000000000000..8d06a844a99cb --- /dev/null +++ b/cgo/cuvs/ivf_pq.hpp @@ -0,0 +1,778 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" +#include "cuvs_types.h" +#include "quantize.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include + +#include +#include +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Search result containing neighbor IDs and distances. + * Common for all IVF-PQ instantiations. + */ +struct ivf_pq_search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors +}; + +/** + * @brief gpu_ivf_pq_t implements an IVF-PQ index that can run on a single GPU or sharded/replicated across multiple GPUs. 
+ */ +template +class gpu_ivf_pq_t : public gpu_index_base_t { +public: + using ivf_pq_index = cuvs::neighbors::ivf_pq::index; + using mg_index = cuvs::neighbors::mg_index; + using search_result_t = ivf_pq_search_result_t; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + ~gpu_ivf_pq_t() override { + this->destroy(); + } + + // Unified Constructor for building from dataset + gpu_ivf_pq_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const ivf_pq_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(count_vectors); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = static_cast(count_vectors); + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * this->dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (this->count * this->dimension), this->flattened_host_dataset.begin()); + } + } + + // Constructor for chunked input (pre-allocates) + gpu_ivf_pq_t(uint64_t total_count, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_pq_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->dimension = dimension; + this->count = static_cast(total_count); + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + this->flattened_host_dataset.resize(this->count * 
this->dimension); + } + + // Constructor for building from MODF datafile + gpu_ivf_pq_t(const std::string& data_filename, cuvs::distance::DistanceType m, + const ivf_pq_build_params_t& bp, const std::vector& devices, + uint32_t nthread, distribution_mode_t mode) { + + this->metric = m; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + + uint64_t file_count = 0; + uint64_t file_dim = 0; + load_host_matrix(data_filename, this->flattened_host_dataset, file_count, file_dim); + + this->count = static_cast(file_count); + this->dimension = static_cast(file_dim); + this->current_offset_ = this->count; + } + + // Unified Constructor for loading from file + gpu_ivf_pq_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_pq_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) { + + this->filename_ = filename; + this->dimension = dimension; + this->metric = m; + this->count = 0; + this->build_params = bp; + this->dist_mode = mode; + this->devices_ = devices; + this->current_offset_ = 0; + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + this->worker = std::make_unique(nthread, this->devices_, force_mg || (this->devices_.size() > 1)); + } + + void destroy() override { + if (this->worker) { + this->worker->stop(); + } + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + } + + /** + * @brief Starts the worker and initializes resources. 
+ */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + index_.reset(); + mg_index_.reset(); + this->quantizer_.reset(); + this->dataset_device_ptr_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + /** + * @brief Loads the index from file or builds it from the dataset. + */ + void build() { + std::unique_lock lock(this->mutex_); + if (this->is_loaded_) return; + + if (this->filename_.empty() && this->current_offset_ > 0 && this->current_offset_ < this->count) { + this->count = static_cast(this->current_offset_); + this->flattened_host_dataset.resize(this->count * this->dimension); + } + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + this->build_internal(handle); + return std::any(); + } + ); + + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + this->is_loaded_ = true; + // Clear host dataset after building to save memory (IVF-PQ stores its own copy on device) + if (this->filename_.empty()) { + this->flattened_host_dataset.clear(); + this->flattened_host_dataset.shrink_to_fit(); + } + } + + /** + * @brief Internal build implementation (no worker submission) + */ + void build_internal(raft_handle_wrapper_t& handle) { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!this->filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_pq::deserialize(*res, this->filename_)); + // Update metadata + this->count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) this->count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + this->build_params.n_lists = 
static_cast(mg_index_->ann_interfaces_[0].index_.value().n_lists()); + this->build_params.m = static_cast(mg_index_->ann_interfaces_[0].index_.value().pq_dim()); + this->build_params.bits_per_code = static_cast(mg_index_->ann_interfaces_[0].index_.value().pq_bits()); + } + } else { + index_ = std::make_unique(*res); + cuvs::neighbors::ivf_pq::deserialize(*res, this->filename_, index_.get()); + this->count = static_cast(index_->size()); + this->build_params.n_lists = static_cast(index_->n_lists()); + this->build_params.m = static_cast(index_->pq_dim()); + this->build_params.bits_per_code = static_cast(index_->pq_bits()); + } + raft::resource::sync_stream(*res); + } else if (!this->flattened_host_dataset.empty()) { + if (this->count < this->build_params.n_lists) { + throw std::runtime_error("Dataset too small: count (" + std::to_string(this->count) + + ") must be >= n_list (" + std::to_string(this->build_params.n_lists) + + ") to build IVF index."); + } + + cuvs::neighbors::ivf_pq::index_params index_params; + index_params.metric = this->metric; + index_params.n_lists = this->build_params.n_lists; + index_params.pq_dim = this->build_params.m; + index_params.pq_bits = this->build_params.bits_per_code; + index_params.add_data_on_build = this->build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = this->build_params.kmeans_trainset_fraction; + + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + this->flattened_host_dataset.data(), (int64_t)this->count, (int64_t)this->dimension); + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (this->dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_pq::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = raft::make_device_matrix( + *res, 
static_cast(this->count), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device.data_handle(), this->flattened_host_dataset.data(), + this->flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + index_ = std::make_unique( + cuvs::neighbors::ivf_pq::build(*res, index_params, raft::make_const_mdspan(dataset_device.view()))); + } + raft::resource::sync_stream(*res); + } + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. + */ + void save(const std::string& filename) { + if (!this->is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::ivf_pq::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::ivf_pq::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Performs IVF-PQ search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp IVF-PQ search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + // For large batches or if batching is explicitly disabled, use standard path + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return this->search_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation + */ + search_result_t search_batch_internal(const T* queries_data, uint64_t num_queries, uint32_t limit, const ivf_pq_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const T* data; + uint64_t n; + }; + + std::string batch_key = "ivf_pq_s_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += 
req.n; + } + + auto results = this->search_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Performs IVF-PQ search for given float32 queries, with on-the-fly quantization if needed. + */ + search_result_t search_float(const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + if constexpr (std::is_same_v) { + return search(queries_data, num_queries, query_dimension, limit, sp); + } + + if (!queries_data || num_queries == 0 || this->dimension == 0) return search_result_t{}; + if (query_dimension != this->dimension) throw std::runtime_error("dimension mismatch"); + if (!this->is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + if (num_queries > 16 || !this->worker->use_batching()) { + uint64_t job_id = this->worker->submit( + [&, num_queries, limit, sp, queries_data](raft_handle_wrapper_t& handle) -> std::any { + return this->search_float_internal(handle, queries_data, num_queries, query_dimension, limit, sp); + } + ); + auto result_wait = this->worker->wait(job_id).get(); + if (result_wait.error) std::rethrow_exception(result_wait.error); + return std::any_cast(result_wait.result); + } + + return 
this->search_float_batch_internal(queries_data, num_queries, limit, sp); + } + + /** + * @brief Internal batch search implementation for float32 queries + */ + search_result_t search_float_batch_internal(const float* queries_data, uint64_t num_queries, uint32_t limit, const ivf_pq_search_params_t& sp) { + // Dynamic batching for small query counts + struct search_req_t { + const float* data; + uint64_t n; + }; + + std::string batch_key = "ivf_pq_sf_" + std::to_string((uintptr_t)this) + "_" + std::to_string(limit) + "_" + std::to_string(sp.n_probes); + + auto exec_fn = [this, limit, sp](cuvs_worker_t::raft_handle& handle, const std::vector& reqs, const std::vector>& setters) { + uint64_t total_queries = 0; + for (const auto& r : reqs) total_queries += std::any_cast(r).n; + + std::vector aggregated_queries(total_queries * this->dimension); + uint64_t offset = 0; + for (const auto& r : reqs) { + auto req = std::any_cast(r); + std::copy(req.data, req.data + (req.n * this->dimension), aggregated_queries.begin() + (offset * this->dimension)); + offset += req.n; + } + + auto results = this->search_float_internal(handle, aggregated_queries.data(), total_queries, this->dimension, limit, sp); + + offset = 0; + for (size_t i = 0; i < reqs.size(); ++i) { + auto req = std::any_cast(reqs[i]); + search_result_t individual_res; + individual_res.neighbors.resize(req.n * limit); + individual_res.distances.resize(req.n * limit); + std::copy(results.neighbors.begin() + (offset * limit), results.neighbors.begin() + ((offset + req.n) * limit), individual_res.neighbors.begin()); + std::copy(results.distances.begin() + (offset * limit), results.distances.begin() + ((offset + req.n) * limit), individual_res.distances.begin()); + setters[i](individual_res); + offset += req.n; + } + }; + + auto future = this->worker->template submit_batched(batch_key, search_req_t{queries_data, num_queries}, exec_fn); + return future.get(); + } + + /** + * @brief Internal search implementation (no worker 
submission) + */ + search_result_t search_internal(raft_handle_wrapper_t& handle, const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_pq::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_pq_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, (int64_t)num_queries, (int64_t)this->dimension); + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_pq::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else if (local_index) { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(this->dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, 
static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_pq::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + + /** + * @brief Internal search_float implementation (no worker submission) + */ + search_result_t search_float_internal(raft_handle_wrapper_t& handle, const float* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_pq_search_params_t& sp) { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. 
Quantize/Convert float queries to T on device + auto queries_device_float = raft::make_device_matrix(*res, num_queries, this->dimension); + raft::copy(*res, queries_device_float.view(), raft::make_host_matrix_view(queries_data, num_queries, this->dimension)); + + auto queries_device_target = raft::make_device_matrix(*res, num_queries, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, queries_device_float.view(), queries_device_target.data_handle(), true); + } else { + raft::copy(*res, queries_device_target.view(), queries_device_float.view()); + } + + // 2. Perform search + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_pq::search_params search_params; + search_params.n_probes = sp.n_probes; + + const ivf_pq_index* local_index = index_.get(); + if (!local_index && mg_index_) { + int current_device; + RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); + for (size_t i = 0; i < this->devices_.size(); ++i) { + if (this->devices_[i] == current_device && i < mg_index_->ann_interfaces_.size()) { + if (mg_index_->ann_interfaces_[i].index_.has_value()) { + local_index = &mg_index_->ann_interfaces_[i].index_.value(); + break; + } + } + } + } + + if (is_snmg_handle(res) && mg_index_) { + auto queries_host_target = raft::make_host_matrix(num_queries, this->dimension); + raft::copy(*res, queries_host_target.view(), queries_device_target.view()); + raft::resource::sync_stream(*res); + + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_pq::search(*res, 
*mg_index_, mg_search_params, + queries_host_target.view(), + neighbors_host_view, distances_host_view); + } else if (local_index) { + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_pq::search(*res, search_params, *local_index, + raft::make_const_mdspan(queries_device_target.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } else { + throw std::runtime_error("Index not loaded or failed to find local index shard for current device."); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + + std::vector get_centers() { + if (!this->is_loaded_ || (!index_ && !mg_index_)) return {}; + + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + const ivf_pq_index* local_index = nullptr; + if (index_) { + local_index = index_.get(); + } else if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) { + local_index = &iface.index_.value(); + break; + } + } + } + + if (!local_index) return std::vector{}; + + auto centers_view = 
local_index->centers(); + size_t n_centers = centers_view.extent(0); + size_t dim = centers_view.extent(1); + + // 1. Convert centers from float to T on device + auto centers_device_target = raft::make_device_matrix(*res, n_centers, dim); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, centers_view, centers_device_target.data_handle(), true); + } else { + raft::copy(*res, centers_device_target.view(), centers_view); + } + + // 2. Copy to host + std::vector host_centers(n_centers * dim); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centers.data(), centers_device_target.data_handle(), + host_centers.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centers; + } + ); + + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + uint32_t get_n_list() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->n_lists()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().n_lists()); + } + } + return this->build_params.n_lists; + } + + uint32_t get_pq_dim() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->pq_dim()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().pq_dim()); + } + } + return this->build_params.m; + } + + uint32_t get_pq_bits() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->pq_bits()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().pq_bits()); + } + } + return 
this->build_params.bits_per_code; + } + + uint32_t get_dim() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->dim()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().dim()); + } + } + return this->dimension; + } + + uint32_t get_rot_dim() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->rot_dim()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().rot_dim()); + } + } + return this->dimension; + } + + uint32_t get_dim_ext() { + std::shared_lock lock(this->mutex_); + if (index_) return static_cast(index_->dim_ext()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().dim_ext()); + } + } + return this->dimension; + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"IVF-PQ\", \"ivf_pq\": {"; + if (index_) { + json += "\"mode\": \"Single-GPU\", \"size\": " + std::to_string(index_->size()) + + ", \"n_lists\": " + std::to_string(index_->n_lists()) + + ", \"pq_dim\": " + std::to_string(index_->pq_dim()) + + ", \"pq_bits\": " + std::to_string(index_->pq_bits()); + } else if (mg_index_) { + json += "\"mode\": \"Multi-GPU\", \"shards\": ["; + for (size_t i = 0; i < mg_index_->ann_interfaces_.size(); ++i) { + const auto& iface = mg_index_->ann_interfaces_[i]; + json += "{\"device\": " + std::to_string(this->devices_[i]); + if (iface.index_.has_value()) { + json += ", \"size\": " + std::to_string(iface.index_.value().size()) + + ", \"n_lists\": " + std::to_string(iface.index_.value().n_lists()) + + ", \"pq_dim\": " + std::to_string(iface.index_.value().pq_dim()) + + ", \"pq_bits\": " + std::to_string(iface.index_.value().pq_bits()); + } else { + json += ", 
\"status\": \"Not loaded\""; + } + json += "}" + std::string(i == mg_index_->ann_interfaces_.size() - 1 ? "" : ", "); + } + json += "]"; + } else { + json += "\"built\": false"; + } + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_pq_c.cpp b/cgo/cuvs/ivf_pq_c.cpp new file mode 100644 index 0000000000000..5835f0fd2cab6 --- /dev/null +++ b/cgo/cuvs/ivf_pq_c.cpp @@ -0,0 +1,583 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ivf_pq_c.h" +#include "ivf_pq.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_ivf_pq_any_t { + quantization_t qtype; + void* ptr; + + gpu_ivf_pq_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_ivf_pq_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_ivf_pq_c gpu_ivf_pq_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const 
std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_new", e.what()); + return nullptr; + } +} + +gpu_ivf_pq_c gpu_ivf_pq_new_from_data_file(const char* data_filename, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(data_filename), metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_new_from_data_file", e.what()); + return nullptr; + } +} + +gpu_ivf_pq_c gpu_ivf_pq_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + 
switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(total_count, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_new_empty", e.what()); + return nullptr; + } +} + +void gpu_ivf_pq_add_chunk(gpu_ivf_pq_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk(static_cast(chunk_data), chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_add_chunk", e.what()); + } +} + +void gpu_ivf_pq_add_chunk_float(gpu_ivf_pq_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); 
break; + case Quantization_F16: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_INT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + case Quantization_UINT8: static_cast*>(any->ptr)->add_chunk_float(chunk_data, chunk_count); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_add_chunk_float", e.what()); + } +} + +void gpu_ivf_pq_train_quantizer(gpu_ivf_pq_c index_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_train_quantizer", e.what()); + } +} + +void gpu_ivf_pq_set_per_thread_device(gpu_ivf_pq_c index_c, bool enable, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_per_thread_device(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_set_per_thread_device", e.what()); + } +} + +void gpu_ivf_pq_set_use_batching(gpu_ivf_pq_c index_c, bool enable, void* errmsg) 
{ + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_F16: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_use_batching(enable); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_set_use_batching", e.what()); + } +} + +void gpu_ivf_pq_set_quantizer(gpu_ivf_pq_c index_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_set_quantizer", e.what()); + } +} + +void gpu_ivf_pq_get_quantizer(gpu_ivf_pq_c index_c, float* min, float* max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_get_quantizer", e.what()); + } +} + +gpu_ivf_pq_c gpu_ivf_pq_load_file(const char* 
filename, uint32_t dimension, distance_type_t metric_c, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_pq_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-PQ"); + } + return static_cast(new gpu_ivf_pq_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_load_file", e.what()); + return nullptr; + } +} + +void gpu_ivf_pq_destroy(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_destroy", e.what()); + } +} + +void gpu_ivf_pq_start(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: 
static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_start", e.what()); + } +} + +void gpu_ivf_pq_build(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->build(); break; + case Quantization_F16: static_cast*>(any->ptr)->build(); break; + case Quantization_INT8: static_cast*>(any->ptr)->build(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->build(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_build", e.what()); + } +} + +void gpu_ivf_pq_save(gpu_ivf_pq_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_save", e.what()); + } +} + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search(gpu_ivf_pq_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_pq_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::ivf_pq_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, 
search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_search", e.what()); + } + return res; +} + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search_float(gpu_ivf_pq_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_pq_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + auto* cpp_res = new matrixone::ivf_pq_search_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->search_float(queries_data, num_queries, query_dimension, limit, search_params); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_search_float", e.what()); + } + return res; +} + +void gpu_ivf_pq_get_neighbors(gpu_ivf_pq_result_c result_c, 
uint64_t total_elements, int64_t* neighbors) { + if (!result_c) return; + auto* neighbors_vec = &static_cast(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_ivf_pq_get_distances(gpu_ivf_pq_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + auto* distances_vec = &static_cast(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_ivf_pq_free_result(gpu_ivf_pq_result_c result_c) { + if (!result_c) return; + delete static_cast(result_c); +} + +uint32_t gpu_ivf_pq_cap(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->cap(); + case Quantization_F16: return static_cast*>(any->ptr)->cap(); + case Quantization_INT8: return static_cast*>(any->ptr)->cap(); + case Quantization_UINT8: return static_cast*>(any->ptr)->cap(); + default: return 0; + } +} + +uint32_t gpu_ivf_pq_len(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->len(); + case Quantization_F16: return static_cast*>(any->ptr)->len(); + case Quantization_INT8: return static_cast*>(any->ptr)->len(); + case Quantization_UINT8: return static_cast*>(any->ptr)->len(); + default: return 0; + } +} + +char* gpu_ivf_pq_info(gpu_ivf_pq_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!index_c) return nullptr; + try { + auto* any = static_cast(index_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = 
static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_info", e.what()); + return nullptr; + } +} + +void gpu_ivf_pq_get_centers(gpu_ivf_pq_c index_c, void* centers, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_F16: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_INT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + case Quantization_UINT8: { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + if (!host_centers.empty()) std::copy(host_centers.begin(), host_centers.end(), static_cast(centers)); + break; + } + default: throw std::runtime_error("Unsupported quantization type"); + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_pq_get_centers", e.what()); + } +} + +uint32_t gpu_ivf_pq_get_n_list(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_F16: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_n_list(); + default: return 0; + } +} + +uint32_t 
gpu_ivf_pq_get_dim(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_dim(); + case Quantization_F16: return static_cast*>(any->ptr)->get_dim(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_dim(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_dim(); + default: return 0; + } +} + +uint32_t gpu_ivf_pq_get_rot_dim(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_rot_dim(); + case Quantization_F16: return static_cast*>(any->ptr)->get_rot_dim(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_rot_dim(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_rot_dim(); + default: return 0; + } +} + +uint32_t gpu_ivf_pq_get_dim_ext(gpu_ivf_pq_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_dim_ext(); + case Quantization_F16: return static_cast*>(any->ptr)->get_dim_ext(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_dim_ext(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_dim_ext(); + default: return 0; + } +} + +void gpu_ivf_pq_get_dataset(gpu_ivf_pq_c index_c, void* out_data) { + if (!index_c) return; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + case Quantization_F16: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + case Quantization_INT8: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + 
case Quantization_UINT8: { + auto& ds = static_cast*>(any->ptr)->flattened_host_dataset; + std::copy(ds.begin(), ds.end(), static_cast(out_data)); + break; + } + default: break; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_ivf_pq_t; +template class gpu_ivf_pq_t; +template class gpu_ivf_pq_t; +template class gpu_ivf_pq_t; +} // namespace matrixone diff --git a/cgo/cuvs/ivf_pq_c.h b/cgo/cuvs/ivf_pq_c.h new file mode 100644 index 0000000000000..27a2dd08e3868 --- /dev/null +++ b/cgo/cuvs/ivf_pq_c.h @@ -0,0 +1,135 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef IVF_PQ_C_H +#define IVF_PQ_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_ivf_pq_t object +typedef void* gpu_ivf_pq_c; + +// Opaque pointer to the C++ IVF-PQ search result object +typedef void* gpu_ivf_pq_result_c; + +// Constructor for building from dataset +gpu_ivf_pq_c gpu_ivf_pq_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for building from MODF datafile +gpu_ivf_pq_c gpu_ivf_pq_new_from_data_file(const char* data_filename, distance_type_t metric, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_ivf_pq_c gpu_ivf_pq_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for an empty index (pre-allocates) +gpu_ivf_pq_c gpu_ivf_pq_new_empty(uint64_t total_count, uint32_t dimension, distance_type_t metric, + ivf_pq_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Add chunk of data (same type as index quantization) +void gpu_ivf_pq_add_chunk(gpu_ivf_pq_c index_c, const void* chunk_data, uint64_t chunk_count, void* errmsg); + +// Add chunk of data (from float, with on-the-fly quantization if needed) +void gpu_ivf_pq_add_chunk_float(gpu_ivf_pq_c index_c, const float* chunk_data, uint64_t chunk_count, void* errmsg); + +// Trains the scalar quantizer (if T is 1-byte) +void 
gpu_ivf_pq_train_quantizer(gpu_ivf_pq_c index_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_ivf_pq_set_per_thread_device(gpu_ivf_pq_c index_c, bool enable, void* errmsg); +void gpu_ivf_pq_set_use_batching(gpu_ivf_pq_c index_c, bool enable, void* errmsg); + +void gpu_ivf_pq_set_quantizer(gpu_ivf_pq_c index_c, float min, float max, void* errmsg); +void gpu_ivf_pq_get_quantizer(gpu_ivf_pq_c index_c, float* min, float* max, void* errmsg); + +// Destructor +void gpu_ivf_pq_destroy(gpu_ivf_pq_c index_c, void* errmsg); + +// Start function (initializes worker and resources) +void gpu_ivf_pq_start(gpu_ivf_pq_c index_c, void* errmsg); + +// Build function (actually triggers the build/load logic) +void gpu_ivf_pq_build(gpu_ivf_pq_c index_c, void* errmsg); + +// Save function +void gpu_ivf_pq_save(gpu_ivf_pq_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_ivf_pq_result_c result_ptr; +} gpu_ivf_pq_search_res_t; + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search(gpu_ivf_pq_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg); + +gpu_ivf_pq_search_res_t gpu_ivf_pq_search_float(gpu_ivf_pq_c index_c, const float* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_pq_search_params_t search_params, void* errmsg); + +// Get results from result object +void gpu_ivf_pq_get_neighbors(gpu_ivf_pq_result_c result_c, uint64_t total_elements, int64_t* neighbors); +void gpu_ivf_pq_get_distances(gpu_ivf_pq_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_ivf_pq_free_result(gpu_ivf_pq_result_c result_c); + +// Returns the capacity of the index buffer +uint32_t gpu_ivf_pq_cap(gpu_ivf_pq_c index_c); + +// Returns the current number of vectors in the index +uint32_t gpu_ivf_pq_len(gpu_ivf_pq_c index_c); + +// Returns info about the index as a 
JSON string +char* gpu_ivf_pq_info(gpu_ivf_pq_c index_c, void* errmsg); + +// Gets the trained centroids +void gpu_ivf_pq_get_centers(gpu_ivf_pq_c index_c, void* centers, void* errmsg); + +// Gets the number of lists (centroids) +uint32_t gpu_ivf_pq_get_n_list(gpu_ivf_pq_c index_c); + +// Gets the dimension of the index +uint32_t gpu_ivf_pq_get_dim(gpu_ivf_pq_c index_c); + +// Gets the rotated dimension of the index (dimension used for centers) +uint32_t gpu_ivf_pq_get_rot_dim(gpu_ivf_pq_c index_c); + +// Gets the extended dimension of the index (including norms and padding) +uint32_t gpu_ivf_pq_get_dim_ext(gpu_ivf_pq_c index_c); + +// Gets the flattened dataset (for debugging) +void gpu_ivf_pq_get_dataset(gpu_ivf_pq_c index_c, void* out_data); + +#ifdef __cplusplus +} +#endif + +#endif // IVF_PQ_C_H diff --git a/cgo/cuvs/kmeans.hpp b/cgo/cuvs/kmeans.hpp new file mode 100644 index 0000000000000..59894fd552e46 --- /dev/null +++ b/cgo/cuvs/kmeans.hpp @@ -0,0 +1,447 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "index_base.hpp" +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include "cuvs_types.h" // For distance_type_t and quantization_t +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include +#include +#include +#include +#include + +// cuVS includes +#include +#include +#include "quantize.hpp" +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Search/Predict result for K-Means. + * Common for all KMeans instantiations. + */ +struct kmeans_result_t { + std::vector labels; + float inertia; + int64_t n_iter; +}; + +/** + * @brief gpu_kmeans_t implements K-Means clustering on GPU using cuVS. + */ +template +class gpu_kmeans_t : public gpu_index_base_t { +public: + using predict_result_t = kmeans_result_t; + using fit_predict_result_t = kmeans_result_t; + + uint32_t n_clusters; + + cuvs::cluster::kmeans::balanced_params params; + + // Type of centroids and inertia. cuVS uses float for these even if input is half, int8, or uint8. + using CentroidT = float; + + // Internal storage for centroids on device + std::unique_ptr> centroids_; + + gpu_kmeans_t(uint32_t n_clusters, uint32_t dimension, cuvs::distance::DistanceType metric, + int max_iter = 20, int device_id = 0, uint32_t nthread = 1) + : n_clusters(n_clusters) { + + this->dimension = dimension; + params.n_iters = static_cast(max_iter); + params.metric = metric; + this->devices_ = {device_id}; + + this->worker = std::make_unique(nthread, this->devices_); + } + + ~gpu_kmeans_t() override { + this->destroy(); + } + + /** + * @brief Starts the worker and initializes resources. 
+ */ + void start() { + auto init_fn = [](raft_handle_wrapper_t&) -> std::any { + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t&) -> std::any { + std::unique_lock lock(this->mutex_); + centroids_.reset(); + this->quantizer_.reset(); + return std::any(); + }; + + this->worker->start(init_fn, stop_fn); + } + + struct fit_result_t { + float inertia; + int64_t n_iter; + }; + + /** + * @brief Computes the cluster centroids. + */ + fit_result_t fit(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + return this->fit_internal(handle, X_data, n_samples); + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Internal fit implementation (no worker submission) + */ + fit_result_t fit_internal(raft_handle_wrapper_t& handle, const T* X_data, uint64_t n_samples) { + std::unique_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(this->dimension))); + } + + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view()); + + raft::resource::sync_stream(*res); + return fit_result_t{0.0f, static_cast(params.n_iters)}; + } + + /** + * @brief Assigns labels to new data based on existing centroids. 
+ */ + predict_result_t predict(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + if (!centroids_) throw std::runtime_error("KMeans centroids not trained. Call fit() first."); + + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; iworker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Assigns labels to new float32 data, performing on-the-fly quantization if needed. + */ + predict_result_t predict_float(const float* X_data, uint64_t n_samples) { + if constexpr (std::is_same_v) { + return predict(X_data, n_samples); + } + + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + if (!centroids_) throw std::runtime_error("KMeans centroids not trained. 
Call fit() first."); + + auto res = handle.get_raft_resources(); + + // 1. Quantize/Convert float data to T on device + auto X_device_float = raft::make_device_matrix(*res, n_samples, this->dimension); + raft::copy(*res, X_device_float.view(), raft::make_host_matrix_view(X_data, n_samples, this->dimension)); + + auto X_device_target = raft::make_device_matrix(*res, n_samples, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, X_device_float.view(), X_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, X_device_target.view(), X_device_float.view()); + } + + // 2. Perform prediction + predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device_target.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; iworker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Performs both fitting and labeling in one step. 
+ */ + fit_predict_result_t fit_predict(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::unique_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(this->dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * this->dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(this->dimension))); + } + + fit_predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + if constexpr (std::is_same_v || std::is_same_v) { + cuvs::cluster::kmeans::fit_predict(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view(), + labels_device.view()); + } else { + // Fallback for half and uint8_t + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view()); + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + } + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; i(params.n_iters); + return res_out; + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Performs fitting and prediction for float32 data, with on-the-fly 
quantization if needed. + */ + fit_predict_result_t fit_predict_float(const float* X_data, uint64_t n_samples) { + if constexpr (std::is_same_v) { + return fit_predict(X_data, n_samples); + } + + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = this->worker->submit_main( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::unique_lock lock(this->mutex_); + auto res = handle.get_raft_resources(); + + // 1. Quantize/Convert float data to T on device + auto X_device_float = raft::make_device_matrix(*res, n_samples, this->dimension); + raft::copy(*res, X_device_float.view(), raft::make_host_matrix_view(X_data, n_samples, this->dimension)); + + auto X_device_target = raft::make_device_matrix(*res, n_samples, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) { + int64_t n_train = std::min(static_cast(n_samples), static_cast(500)); + auto train_view = raft::make_device_matrix_view(X_device_float.data_handle(), n_train, this->dimension); + this->quantizer_.train(*res, train_view); + } + this->quantizer_.template transform(*res, X_device_float.view(), X_device_target.data_handle(), true); + raft::resource::sync_stream(*res); + } else { + raft::copy(*res, X_device_target.view(), X_device_float.view()); + } + + // 2. 
Perform fit_predict + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(this->dimension))); + } + + fit_predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + if constexpr (std::is_same_v) { + cuvs::cluster::kmeans::fit_predict(*res, params, + raft::make_const_mdspan(X_device_target.view()), + centroids_->view(), + labels_device.view()); + } else { + // Fallback for half and uint8_t + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device_target.view()), + centroids_->view()); + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device_target.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + } + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; i(params.n_iters); + return res_out; + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Returns the trained centroids. + */ + std::vector get_centroids() { + uint64_t job_id = this->worker->submit_main( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(this->mutex_); + if (!centroids_) return std::vector{}; + + auto res = handle.get_raft_resources(); + + // 1. 
Convert centroids from float to T on device + auto centroids_device_target = raft::make_device_matrix(*res, n_clusters, this->dimension); + if constexpr (sizeof(T) == 1) { + if (!this->quantizer_.is_trained()) throw std::runtime_error("Quantizer not trained"); + this->quantizer_.template transform(*res, centroids_->view(), centroids_device_target.data_handle(), true); + } else { + raft::copy(*res, centroids_device_target.view(), centroids_->view()); + } + + // 2. Copy to host + std::vector host_centroids(n_clusters * this->dimension); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centroids.data(), centroids_device_target.data_handle(), + host_centroids.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centroids; + } + ); + auto result = this->worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + std::string info() const override { + std::string json = gpu_index_base_t::info(); + json += ", \"type\": \"KMeans\", \"kmeans\": {"; + json += "\"n_clusters\": " + std::to_string(n_clusters) + ", "; + json += "\"centroids_trained\": " + std::string(centroids_ ? "true" : "false"); + json += "}}"; + return json; + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/kmeans_c.cpp b/cgo/cuvs/kmeans_c.cpp new file mode 100644 index 0000000000000..ef0bebe54a9b9 --- /dev/null +++ b/cgo/cuvs/kmeans_c.cpp @@ -0,0 +1,371 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kmeans_c.h" +#include "kmeans.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_kmeans_any_t { + quantization_t qtype; + void* ptr; + + gpu_kmeans_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_kmeans_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_kmeans_c gpu_kmeans_new(uint32_t n_clusters, uint32_t dimension, distance_type_t metric_c, + int max_iter, int device_id, uint32_t nthread, + quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* kmeans_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_F16: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_INT8: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_UINT8: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + default: + throw std::runtime_error("Unsupported quantization type for KMeans"); + } + return static_cast(new gpu_kmeans_any_t(qtype, kmeans_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_new", e.what()); + return nullptr; + } +} + +void gpu_kmeans_destroy(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) 
*(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_destroy", e.what()); + } +} + +void gpu_kmeans_start(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->start(); break; + case Quantization_F16: static_cast*>(any->ptr)->start(); break; + case Quantization_INT8: static_cast*>(any->ptr)->start(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->start(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_start", e.what()); + } +} + +void gpu_kmeans_train_quantizer(gpu_kmeans_c kmeans_c, const float* train_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_F16: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_INT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + case Quantization_UINT8: static_cast*>(any->ptr)->train_quantizer(train_data, n_samples); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_train_quantizer", e.what()); + } +} + +void gpu_kmeans_set_quantizer(gpu_kmeans_c kmeans_c, float min, float max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->set_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; 
+ case Quantization_UINT8: static_cast*>(any->ptr)->set_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_set_quantizer", e.what()); + } +} + +void gpu_kmeans_get_quantizer(gpu_kmeans_c kmeans_c, float* min, float* max, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_F16: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_INT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + case Quantization_UINT8: static_cast*>(any->ptr)->get_quantizer(min, max); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_get_quantizer", e.what()); + } +} + +gpu_kmeans_fit_res_t gpu_kmeans_fit(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_res_t res = {0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_F16: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_INT8: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_UINT8: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; res.n_iter = cpp_res.n_iter; + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in 
gpu_kmeans_fit", e.what()); + } + return res; +} + +gpu_kmeans_predict_res_t gpu_kmeans_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_predict_res_t res = {nullptr, 0.0f}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_predict", e.what()); + } + return res; +} + +gpu_kmeans_predict_res_t gpu_kmeans_predict_float(gpu_kmeans_c kmeans_c, const float* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_predict_res_t res = {nullptr, 0.0f}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->predict_float(X_data, n_samples); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + } catch (const std::exception& e) { + 
set_errmsg(errmsg, "Error in gpu_kmeans_predict_float", e.what()); + } + return res; +} + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_predict_res_t res = {nullptr, 0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + break; + default: break; + } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; res.n_iter = cpp_res->n_iter; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_fit_predict", e.what()); + } + return res; +} + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict_float(gpu_kmeans_c kmeans_c, const float* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_predict_res_t res = {nullptr, 0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + auto* cpp_res = new matrixone::kmeans_result_t(); + switch (any->qtype) { + case Quantization_F32: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + case Quantization_F16: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + case Quantization_INT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + case Quantization_UINT8: + *cpp_res = static_cast*>(any->ptr)->fit_predict_float(X_data, n_samples); + break; + default: break; 
+ } + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; res.n_iter = cpp_res->n_iter; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_fit_predict_float", e.what()); + } + return res; +} + +void gpu_kmeans_get_labels(gpu_kmeans_result_c result_c, uint64_t n_samples, int64_t* labels) { + if (!result_c) return; + auto* labels_vec = &static_cast(result_c)->labels; + if (labels_vec->size() >= n_samples) { + std::copy(labels_vec->begin(), labels_vec->begin() + n_samples, labels); + } +} + +void gpu_kmeans_free_result(gpu_kmeans_result_c result_c) { + if (!result_c) return; + delete static_cast(result_c); +} + +void gpu_kmeans_get_centroids(gpu_kmeans_c kmeans_c, void* centroids, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + case Quantization_F16: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + case Quantization_INT8: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + case Quantization_UINT8: { + auto host_centers = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centers.begin(), host_centers.end(), static_cast(centroids)); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_get_centroids", e.what()); + } +} + +char* gpu_kmeans_info(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + if (!kmeans_c) return nullptr; + try { + auto* any = static_cast(kmeans_c); + std::string info; + switch (any->qtype) { + case Quantization_F32: 
info = static_cast*>(any->ptr)->info(); break; + case Quantization_F16: info = static_cast*>(any->ptr)->info(); break; + case Quantization_INT8: info = static_cast*>(any->ptr)->info(); break; + case Quantization_UINT8: info = static_cast*>(any->ptr)->info(); break; + default: return nullptr; + } + return strdup(info.c_str()); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_info", e.what()); + return nullptr; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_kmeans_t; +template class gpu_kmeans_t; +template class gpu_kmeans_t; +template class gpu_kmeans_t; +} // namespace matrixone diff --git a/cgo/cuvs/kmeans_c.h b/cgo/cuvs/kmeans_c.h new file mode 100644 index 0000000000000..0e726ad698cdb --- /dev/null +++ b/cgo/cuvs/kmeans_c.h @@ -0,0 +1,95 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef KMEANS_C_H +#define KMEANS_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_kmeans_t object +typedef void* gpu_kmeans_c; + +// Opaque pointer to the C++ KMeans result object +typedef void* gpu_kmeans_result_c; + +// Constructor +gpu_kmeans_c gpu_kmeans_new(uint32_t n_clusters, uint32_t dimension, distance_type_t metric, + int max_iter, int device_id, uint32_t nthread, + quantization_t qtype, void* errmsg); + +// Destructor +void gpu_kmeans_destroy(gpu_kmeans_c kmeans_c, void* errmsg); + +// Starts the worker and initializes resources +void gpu_kmeans_start(gpu_kmeans_c kmeans_c, void* errmsg); + +// Trains the scalar quantizer (if T is 1-byte) +void gpu_kmeans_train_quantizer(gpu_kmeans_c kmeans_c, const float* train_data, uint64_t n_samples, void* errmsg); + +void gpu_kmeans_set_quantizer(gpu_kmeans_c kmeans_c, float min, float max, void* errmsg); +void gpu_kmeans_get_quantizer(gpu_kmeans_c kmeans_c, float* min, float* max, void* errmsg); + +// Fit function +typedef struct { + float inertia; + int64_t n_iter; +} gpu_kmeans_fit_res_t; + +gpu_kmeans_fit_res_t gpu_kmeans_fit(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +// Predict function +typedef struct { + gpu_kmeans_result_c result_ptr; + float inertia; +} gpu_kmeans_predict_res_t; + +gpu_kmeans_predict_res_t gpu_kmeans_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +gpu_kmeans_predict_res_t gpu_kmeans_predict_float(gpu_kmeans_c kmeans_c, const float* X_data, uint64_t n_samples, void* errmsg); + +// FitPredict function +typedef struct { + gpu_kmeans_result_c result_ptr; + float inertia; + int64_t n_iter; +} gpu_kmeans_fit_predict_res_t; + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict_float(gpu_kmeans_c kmeans_c, const 
float* X_data, uint64_t n_samples, void* errmsg); + +// Get results from result object +void gpu_kmeans_get_labels(gpu_kmeans_result_c result_c, uint64_t n_samples, int64_t* labels); + +// Free result object +void gpu_kmeans_free_result(gpu_kmeans_result_c result_c); + +// Get centroids +void gpu_kmeans_get_centroids(gpu_kmeans_c kmeans_c, void* centroids, void* errmsg); + +// Returns info about the kmeans as a JSON string +char* gpu_kmeans_info(gpu_kmeans_c kmeans_c, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // KMEANS_C_H diff --git a/cgo/cuvs/quantize.hpp b/cgo/cuvs/quantize.hpp new file mode 100644 index 0000000000000..a677f822e0bd5 --- /dev/null +++ b/cgo/cuvs/quantize.hpp @@ -0,0 +1,443 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace matrixone { + +#pragma pack(push, 1) +struct file_header_t { + char magic[4]; // "MODF" + uint64_t count; // 8 bytes + uint64_t dimension; // 8 bytes + uint32_t data_type_size; // 4 bytes +}; +#pragma pack(pop) + +/** + * @brief Helper to manage cuVS scalar quantizer lifecycle and operations. 
+ * + * @tparam S Source type (float, half, double) + */ +template +class scalar_quantizer_t { +public: + using quantizer_type = cuvs::preprocessing::quantize::scalar::quantizer; + + scalar_quantizer_t() = default; + + /** + * @brief Constructor that initializes the quantizer with specific min and max values. + */ + scalar_quantizer_t(S min, S max) + : quantizer_(std::make_unique(quantizer_type{min, max})) {} + + /** + * @brief Trains the quantizer on a device matrix. + */ + void train(const raft::resources& res, raft::device_matrix_view train_view) { + cuvs::preprocessing::quantize::scalar::params q_params; + quantizer_ = std::make_unique( + cuvs::preprocessing::quantize::scalar::train(res, q_params, train_view)); + raft::resource::sync_stream(res); + } + + /** + * @brief Sets the quantizer range manually. + */ + void set_quantizer(S min, S max) { + quantizer_ = std::make_unique(quantizer_type{min, max}); + } + + /** + * @brief Transforms a chunk of data into quantized 8-bit integers. + * + * @tparam T Target type (int8_t or uint8_t) + * @param res RAFT resources handle. + * @param src_view Source data view on device. + * @param out_ptr Destination pointer (host or device). + * @param is_device_ptr Whether out_ptr is in device memory. 
+ */ + template + void transform(const raft::resources& res, raft::device_matrix_view src_view, T* out_ptr, bool is_device_ptr) { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + static_assert(sizeof(T) == 1, "Quantization target must be 1-byte"); + + int64_t n_rows = src_view.extent(0); + int64_t n_cols = src_view.extent(1); + + auto chunk_device_int8 = raft::make_device_matrix(res, n_rows, n_cols); + cuvs::preprocessing::quantize::scalar::transform(res, *quantizer_, src_view, chunk_device_int8.view()); + + if (is_device_ptr) { + auto out_view = raft::make_device_matrix_view(out_ptr, n_rows, n_cols); + raft::copy(res, out_view, chunk_device_int8.view()); + } else { + auto out_view = raft::make_host_matrix_view(out_ptr, n_rows, n_cols); + raft::copy(res, out_view, chunk_device_int8.view()); + raft::resource::sync_stream(res); + } + } + + bool is_trained() const { return quantizer_ != nullptr; } + void reset() { quantizer_.reset(); } + + /** + * @brief Gets the minimum value of the quantizer range. + */ + S min() const { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + return quantizer_->min_; + } + + /** + * @brief Gets the maximum value of the quantizer range. + */ + S max() const { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + return quantizer_->max_; + } + + /** + * @brief Serializes the quantizer state to an output stream. + */ + void serialize(std::ostream& os) const { + if (!quantizer_) throw std::runtime_error("Quantizer not trained"); + os.write(reinterpret_cast(&quantizer_->min_), sizeof(S)); + os.write(reinterpret_cast(&quantizer_->max_), sizeof(S)); + } + + /** + * @brief Deserializes the quantizer state from an input stream. 
+ */ + void deserialize(std::istream& is) { + S params[2]; + is.read(reinterpret_cast(params), 2 * sizeof(S)); + if (is.gcount() != static_cast(2 * sizeof(S))) { + throw std::runtime_error("Failed to read quantizer parameters from stream"); + } + quantizer_ = std::make_unique(quantizer_type{params[0], params[1]}); + } + + /** + * @brief Saves the quantizer state to a file. + */ + void save_to_file(const std::string& filename) const { + std::ofstream os(filename, std::ios::binary); + if (!os.is_open()) throw std::runtime_error("Failed to open file for writing: " + filename); + serialize(os); + } + + /** + * @brief Loads the quantizer state from a file. + */ + void load_from_file(const std::string& filename) { + std::ifstream is(filename, std::ios::binary); + if (!is.is_open()) throw std::runtime_error("Failed to open file for reading: " + filename); + deserialize(is); + } + +private: + std::unique_ptr quantizer_; +}; + +namespace detail { + +static constexpr int64_t DEFAULT_CHUNK_SIZE = 16384; + +/** + * @brief Internal helper to read a binary file into a raw pointer using chunking. 
+ */ +template +void load_matrix_raw_ptr(const raft::resources& res, const std::string& filename, const file_header_t& header, S* out_ptr, bool is_device_ptr) { + int64_t n_rows = static_cast(header.count); + int64_t n_cols = static_cast(header.dimension); + + if (n_rows == 0 || n_cols == 0) return; + + std::ifstream file(filename, std::ios::binary); + file.seekg(sizeof(file_header_t)); + + if (!is_device_ptr) { + file.read(reinterpret_cast(out_ptr), n_rows * n_cols * sizeof(S)); + if (file.gcount() != static_cast(n_rows * n_cols * sizeof(S))) { + throw std::runtime_error("Failed to read data content from: " + filename); + } + } else { + std::vector chunk_host; + for (int64_t row_offset = 0; row_offset < n_rows; row_offset += DEFAULT_CHUNK_SIZE) { + int64_t current_chunk_rows = std::min(DEFAULT_CHUNK_SIZE, n_rows - row_offset); + size_t total_chunk_elements = current_chunk_rows * n_cols; + chunk_host.resize(total_chunk_elements); + file.read(reinterpret_cast(chunk_host.data()), total_chunk_elements * sizeof(S)); + raft::copy(out_ptr + (row_offset * n_cols), chunk_host.data(), total_chunk_elements, raft::resource::get_cuda_stream(res)); + } + raft::resource::sync_stream(res); + } +} + +/** + * @brief Internal helper to perform chunked quantization or conversion from datafile to a raw pointer. 
+ */ +template +void load_matrix_chunked_ptr(const raft::resources& res, const std::string& filename, const file_header_t& header, T* out_ptr, bool is_device_ptr) { + int64_t n_rows = static_cast(header.count); + int64_t n_cols = static_cast(header.dimension); + if (n_rows == 0 || n_cols == 0) return; + + std::ifstream file(filename, std::ios::binary); + file.seekg(sizeof(file_header_t)); + + scalar_quantizer_t quantizer; + if constexpr (DoQuantize) { + int64_t n_train = std::min(n_rows, static_cast(500)); + std::vector train_host(n_train * n_cols); + file.read(reinterpret_cast(train_host.data()), train_host.size() * sizeof(S)); + auto train_device = raft::make_device_matrix(res, n_train, n_cols); + raft::copy(train_device.data_handle(), train_host.data(), train_host.size(), raft::resource::get_cuda_stream(res)); + quantizer.train(res, train_device.view()); + file.seekg(sizeof(file_header_t)); + } + + std::vector chunk_host; + auto chunk_device_src = raft::make_device_matrix(res, DEFAULT_CHUNK_SIZE, n_cols); + + for (int64_t row_offset = 0; row_offset < n_rows; row_offset += DEFAULT_CHUNK_SIZE) { + int64_t current_chunk_rows = std::min(DEFAULT_CHUNK_SIZE, n_rows - row_offset); + size_t total_chunk_elements = current_chunk_rows * n_cols; + chunk_host.resize(total_chunk_elements); + file.read(reinterpret_cast(chunk_host.data()), total_chunk_elements * sizeof(S)); + raft::copy(chunk_device_src.data_handle(), chunk_host.data(), total_chunk_elements, raft::resource::get_cuda_stream(res)); + + auto current_chunk_src_view = raft::make_device_matrix_view(chunk_device_src.data_handle(), current_chunk_rows, n_cols); + + if constexpr (DoQuantize) { + quantizer.template transform(res, current_chunk_src_view, out_ptr + (row_offset * n_cols), is_device_ptr); + } else { + if (is_device_ptr) { + auto out_chunk_view = raft::make_device_matrix_view(out_ptr + (row_offset * n_cols), current_chunk_rows, n_cols); + raft::copy(res, out_chunk_view, current_chunk_src_view); + } else { + 
auto out_chunk_view = raft::make_host_matrix_view(out_ptr + (row_offset * n_cols), current_chunk_rows, n_cols); + raft::copy(res, out_chunk_view, current_chunk_src_view); + } + } + } + raft::resource::sync_stream(res); +} + +} // namespace detail + +/** + * @brief Reads a binary file into a CUDA device matrix. + */ +template +auto load_device_matrix(const raft::resources& res, const std::string& filename) { + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file: " + filename); + + file_header_t header; + file.read(reinterpret_cast(&header), sizeof(file_header_t)); + if (std::string(header.magic, 4) != "MODF") throw std::runtime_error("Invalid magic: " + filename); + + auto matrix = raft::make_device_matrix(res, static_cast(header.count), static_cast(header.dimension)); + if (header.data_type_size == sizeof(T)) { + detail::load_matrix_raw_ptr(res, filename, header, matrix.data_handle(), true); + } else if (header.data_type_size == 4) { + if constexpr (sizeof(T) == 2) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else { + throw std::runtime_error("Unsupported conversion from float to requested size"); + } + } else if (header.data_type_size == 2) { + if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else if constexpr (sizeof(T) == 4) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), true); + } else { + throw std::runtime_error("Unsupported conversion from half to requested size"); + } + } else { + throw std::runtime_error("Type size mismatch and conversion not supported for source size: " + std::to_string(header.data_type_size)); + } + return matrix; +} + +/** + * @brief Reads a binary file into a CUDA device matrix (overload). 
+ */ +template +void load_device_matrix(const raft::resources& res, const std::string& filename, raft::device_matrix& out_matrix, uint64_t& out_count, uint64_t& out_dimension) { + out_matrix = load_device_matrix(res, filename); + out_count = static_cast(out_matrix.extent(0)); + out_dimension = static_cast(out_matrix.extent(1)); +} + +/** + * @brief Reads a binary file into a CUDA host matrix. + */ +template +auto load_host_matrix(const std::string& filename) { + raft::resources res; + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file: " + filename); + + file_header_t header; + file.read(reinterpret_cast(&header), sizeof(file_header_t)); + if (std::string(header.magic, 4) != "MODF") throw std::runtime_error("Invalid magic: " + filename); + + auto matrix = raft::make_host_matrix(static_cast(header.count), static_cast(header.dimension)); + if (header.data_type_size == sizeof(T)) { + detail::load_matrix_raw_ptr(res, filename, header, matrix.data_handle(), false); + } else { + if (header.data_type_size == 4) { + if constexpr (sizeof(T) == 2) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else { + throw std::runtime_error("Unsupported conversion from float to requested size"); + } + } else if (header.data_type_size == 2) { + if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else if constexpr (sizeof(T) == 4) { + detail::load_matrix_chunked_ptr(res, filename, header, matrix.data_handle(), false); + } else { + throw std::runtime_error("Unsupported conversion from half to requested size"); + } + } else { + throw std::runtime_error("Unsupported conversion for host matrix"); + } + } + return matrix; +} + +/** + * @brief Reads a binary file into a host vector. 
+ */ +template +void load_host_matrix(const std::string& filename, std::vector& out_data, uint64_t& out_count, uint64_t& out_dimension) { + raft::resources res; + std::ifstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file: " + filename); + + file_header_t header; + file.read(reinterpret_cast(&header), sizeof(file_header_t)); + if (std::string(header.magic, 4) != "MODF") throw std::runtime_error("Invalid magic: " + filename); + + out_count = header.count; + out_dimension = header.dimension; + out_data.resize(out_count * out_dimension); + + if (header.data_type_size == sizeof(T)) { + detail::load_matrix_raw_ptr(res, filename, header, out_data.data(), false); + } else { + if (header.data_type_size == 4) { + if constexpr (sizeof(T) == 2) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else { + throw std::runtime_error("Unsupported conversion from float to requested size"); + } + } else if (header.data_type_size == 2) { + if constexpr (sizeof(T) == 1) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else if constexpr (sizeof(T) == 4) { + detail::load_matrix_chunked_ptr(res, filename, header, out_data.data(), false); + } else { + throw std::runtime_error("Unsupported conversion from half to requested size"); + } + } else { + throw std::runtime_error("Unsupported conversion for host matrix"); + } + } +} + +/** + * @brief Saves a CUDA device matrix to a binary file in the "MODF" format using chunking. 
+ */ +template +void save_device_matrix(const raft::resources& res, const std::string& filename, + raft::device_matrix_view matrix) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file for writing: " + filename); + + file_header_t header; + std::memcpy(header.magic, "MODF", 4); + header.count = static_cast(matrix.extent(0)); + header.dimension = static_cast(matrix.extent(1)); + header.data_type_size = sizeof(std::remove_const_t); + file.write(reinterpret_cast(&header), sizeof(file_header_t)); + + int64_t n_rows = static_cast(header.count); + int64_t n_cols = static_cast(header.dimension); + std::vector> chunk_host; + + for (int64_t row_offset = 0; row_offset < n_rows; row_offset += detail::DEFAULT_CHUNK_SIZE) { + int64_t current_chunk_rows = std::min(detail::DEFAULT_CHUNK_SIZE, n_rows - row_offset); + size_t total_chunk_elements = current_chunk_rows * n_cols; + chunk_host.resize(total_chunk_elements); + + auto src_chunk_view = raft::make_device_matrix_view(matrix.data_handle() + (row_offset * n_cols), current_chunk_rows, n_cols); + auto host_chunk_view = raft::make_host_matrix_view, int64_t>(chunk_host.data(), current_chunk_rows, n_cols); + + raft::copy(res, host_chunk_view, src_chunk_view); + raft::resource::sync_stream(res); + file.write(reinterpret_cast(chunk_host.data()), total_chunk_elements * sizeof(std::remove_const_t)); + } +} + +/** + * @brief Saves a host matrix to a binary file in the "MODF" format. 
+ */ +template +void save_host_matrix(const std::string& filename, + raft::host_matrix_view matrix) { + std::ofstream file(filename, std::ios::binary); + if (!file.is_open()) throw std::runtime_error("Failed to open file for writing: " + filename); + + file_header_t header; + std::memcpy(header.magic, "MODF", 4); + header.count = static_cast(matrix.extent(0)); + header.dimension = static_cast(matrix.extent(1)); + header.data_type_size = sizeof(std::remove_const_t); + file.write(reinterpret_cast(&header), sizeof(file_header_t)); + + if (matrix.size() > 0) { + file.write(reinterpret_cast(matrix.data_handle()), matrix.size() * sizeof(std::remove_const_t)); + } +} + +} // namespace matrixone diff --git a/cgo/cuvs/test/batching_test.cu b/cgo/cuvs/test/batching_test.cu new file mode 100644 index 0000000000000..c789e5ee12bcc --- /dev/null +++ b/cgo/cuvs/test/batching_test.cu @@ -0,0 +1,132 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "cagra.hpp" +#include "ivf_flat.hpp" +#include "ivf_pq.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +TEST(DynamicBatchingTest, CagraConcurrentSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / count; + + std::vector devices = {0}; + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 8, DistributionMode_SINGLE_GPU); + + index.set_use_batching(true); + index.start(); + index.build(); + + const int num_threads = 8; + std::vector> futures; + + for (int i = 0; i < num_threads; ++i) { + futures.push_back(std::async(std::launch::async, [&index, dimension, i]() { + std::vector query(dimension); + for (uint32_t j = 0; j < dimension; ++j) query[j] = (float)i / 10.0f; + cagra_search_params_t sp = cagra_search_params_default(); + return index.search(query.data(), 1, dimension, 5, sp); + })); + } + + for (auto& f : futures) { + auto res = f.get(); + ASSERT_EQ(res.neighbors.size(), (size_t)5); + } + + index.destroy(); +} + +TEST(DynamicBatchingTest, IvfFlatConcurrentSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / count; + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 10; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 8, DistributionMode_SINGLE_GPU); + + index.set_use_batching(true); + index.start(); + index.build(); + + const int num_threads = 8; + std::vector> futures; + + for (int i = 0; i < num_threads; ++i) { + 
futures.push_back(std::async(std::launch::async, [&index, dimension, i]() { + std::vector query(dimension); + for (uint32_t j = 0; j < dimension; ++j) query[j] = (float)i / 10.0f; + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + return index.search(query.data(), 1, dimension, 5, sp); + })); + } + + for (auto& f : futures) { + auto res = f.get(); + ASSERT_EQ(res.neighbors.size(), (size_t)5); + } + + index.destroy(); +} + +TEST(DynamicBatchingTest, IvfPqConcurrentSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / count; + + std::vector devices = {0}; + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 8, DistributionMode_SINGLE_GPU); + + index.set_use_batching(true); + index.start(); + index.build(); + + const int num_threads = 8; + std::vector> futures; + + for (int i = 0; i < num_threads; ++i) { + futures.push_back(std::async(std::launch::async, [&index, dimension, i]() { + std::vector query(dimension); + for (uint32_t j = 0; j < dimension; ++j) query[j] = (float)i / 10.0f; + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + return index.search(query.data(), 1, dimension, 5, sp); + })); + } + + for (auto& f : futures) { + auto res = f.get(); + ASSERT_EQ(res.neighbors.size(), (size_t)5); + } + + index.destroy(); +} diff --git a/cgo/cuvs/test/brute_force_test.cu b/cgo/cuvs/test/brute_force_test.cu new file mode 100644 index 0000000000000..1d641b6ac088b --- /dev/null +++ b/cgo/cuvs/test/brute_force_test.cu @@ -0,0 +1,219 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "brute_force.hpp" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +// --- Helper to convert float to half --- +static std::vector float_to_half(const std::vector& src) { + std::vector dst(src.size()); + for (size_t i = 0; i < src.size(); ++i) { + dst[i] = __float2half(src[i]); + } + return dst; +} + +// --- GpuBruteForceTest --- + +TEST(GpuBruteForceTest, BasicLoadAndSearch) { + const uint32_t dimension = 3; + const uint64_t count = 2; + std::vector dataset = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries = {1.0, 2.0, 3.0}; + auto result = index.search(queries.data(), 1, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.distances[0], 0.0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithMultipleQueries) { + const uint32_t dimension = 4; + const uint64_t count = 4; + std::vector dataset = { + 1.0, 0.0, 0.0, 0.0, // ID 0 + 0.0, 1.0, 0.0, 0.0, // ID 1 + 0.0, 0.0, 1.0, 0.0, // ID 2 + 0.0, 0.0, 0.0, 1.0 // ID 3 + }; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries = { + 1.0, 0.0, 0.0, 0.0, // Should match ID 0 + 0.0, 0.0, 1.0, 0.0 // Should match ID 2 + }; + auto result = index.search(queries.data(), 2, 
dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.neighbors[1], 2u); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithFloat16) { + const uint32_t dimension = 2; + const uint64_t count = 2; + std::vector f_dataset = {1.0, 1.0, 2.0, 2.0}; + std::vector h_dataset = float_to_half(f_dataset); + + gpu_brute_force_t index(h_dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector f_queries = {1.0, 1.0}; + std::vector h_queries = float_to_half(f_queries); + auto result = index.search(h_queries.data(), 1, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.distances[0], 0.0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithInnerProduct) { + const uint32_t dimension = 2; + const uint64_t count = 2; + std::vector dataset = { + 1.0, 0.0, + 0.0, 1.0 + }; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::InnerProduct, 1, 0); + index.start(); + index.build(); + + std::vector queries = {1.0, 0.0}; + auto result = index.search(queries.data(), 1, dimension, 2); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_EQ(result.neighbors[0], 0u); + ASSERT_EQ(result.neighbors[1], 1u); + + // dot product should be 1.0 for exact match + ASSERT_TRUE(std::abs(result.distances[0] - 1.0) < 1e-5); + ASSERT_TRUE(std::abs(result.distances[1] - 0.0) < 1e-5); + + index.destroy(); +} + +TEST(GpuBruteForceTest, EmptyDataset) { + const uint32_t dimension = 128; + const uint64_t count = 0; + + gpu_brute_force_t index(nullptr, count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.build(); + + std::vector queries(dimension, 0.0); + auto result = index.search(queries.data(), 1, dimension, 5); + + ASSERT_EQ(result.neighbors.size(), (size_t)0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, 
LargeLimit) { + const uint32_t dimension = 2; + const uint64_t count = 5; + std::vector dataset(count * dimension, 1.0); + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries(dimension, 1.0); + uint32_t limit = 10; + auto result = index.search(queries.data(), 1, dimension, limit); + + ASSERT_EQ(result.neighbors.size(), (size_t)limit); + for (int i = 0; i < 5; ++i) ASSERT_GE(result.neighbors[i], 0); + for (int i = 5; i < 10; ++i) ASSERT_EQ((int64_t)result.neighbors[i], (int64_t)-1); + + index.destroy(); +} + +// --- CuvsWorkerTest --- + +TEST(CuvsWorkerTest, BruteForceSearch) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads, 0); // Added device_id + worker.start(); + + const uint32_t dimension = 128; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.start(); + index.build(); + + std::vector queries = std::vector(dataset.begin(), dataset.begin() + dimension); + auto result = index.search(queries.data(), 1, dimension, 5); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); + worker.stop(); +} + +TEST(CuvsWorkerTest, ConcurrentSearches) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + // Use very distinct values to ensure unique neighbors + for (size_t i = 0; i < count; ++i) { + for (size_t j = 0; j < dimension; ++j) { + dataset[i * dimension + j] = (float)i * 100.0f + (float)j; + } + } + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 4, 0); + index.start(); + index.build(); + + const int num_threads = 4; + std::vector> futures; + for (int i = 0; i < 
num_threads; ++i) { + futures.push_back(std::async(std::launch::async, [&index, dimension, &dataset, i]() { + std::vector query = std::vector(dataset.begin() + i * dimension, dataset.begin() + (i + 1) * dimension); + auto res = index.search(query.data(), 1, dimension, 1); + ASSERT_EQ(res.neighbors[0], (int64_t)i); + })); + } + + for (auto& f : futures) f.get(); + + index.destroy(); +} diff --git a/cgo/cuvs/test/cagra_test.cu b/cgo/cuvs/test/cagra_test.cu new file mode 100644 index 0000000000000..641ba5fbe1006 --- /dev/null +++ b/cgo/cuvs/test/cagra_test.cu @@ -0,0 +1,134 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "cagra.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuCagraTest, BasicLoadAndSearch) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + std::vector devices = {0}; + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuCagraTest, SaveAndLoadFromFile) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + std::string filename = "test_cagra.bin"; + std::vector devices = {0}; + + // 1. Build and Save + { + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + index.save(filename); + index.destroy(); + } + + // 2. 
Load and Search + { + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuCagraTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuCagraTest, ReplicatedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + cagra_build_params_t bp = cagra_build_params_default(); + 
gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_REPLICATED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} diff --git a/cgo/cuvs/test/distance_test.cu b/cgo/cuvs/test/distance_test.cu new file mode 100644 index 0000000000000..c0558bf4997b7 --- /dev/null +++ b/cgo/cuvs/test/distance_test.cu @@ -0,0 +1,105 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "distance.hpp" +#include "test_framework.hpp" +#include +#include +#include +#include +#include + +using namespace matrixone; + +#define ASSERT_NEAR(val1, val2, abs_error) ASSERT_TRUE(std::abs((val1) - (val2)) <= (abs_error)) + +TEST(PairwiseDistanceTest, BasicF32) { + const uint32_t dimension = 3; + const uint64_t n_x = 2; + const uint64_t n_y = 2; + + std::vector x = { + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0 + }; + std::vector y = { + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0 + }; + + std::vector dist(n_x * n_y); + const raft::resources& res = get_raft_resources(); + + pairwise_distance(res, x.data(), n_x, y.data(), n_y, dimension, cuvs::distance::DistanceType::L2Expanded, dist.data()); + + // Expected results for L2Squared: + // dist[0,0] = (1-1)^2 + (0-0)^2 + (0-0)^2 = 0 + // dist[0,1] = (1-0)^2 + (0-1)^2 + (0-0)^2 = 2 + // dist[1,0] = (0-1)^2 + (1-0)^2 + (0-0)^2 = 2 + // dist[1,1] = (0-0)^2 + (1-1)^2 + (0-0)^2 = 0 + + ASSERT_NEAR(dist[0], 0.0f, 1e-5f); + ASSERT_NEAR(dist[1], 2.0f, 1e-5f); + ASSERT_NEAR(dist[2], 2.0f, 1e-5f); + ASSERT_NEAR(dist[3], 0.0f, 1e-5f); +} + +TEST(PairwiseDistanceTest, BasicF16) { + const uint32_t dimension = 2; + const uint64_t n_x = 1; + const uint64_t n_y = 1; + + std::vector x = {__float2half(1.0f), __float2half(2.0f)}; + std::vector y = {__float2half(1.0f), __float2half(2.0f)}; + + std::vector dist(n_x * n_y); + const raft::resources& res = get_raft_resources(); + + pairwise_distance(res, x.data(), n_x, y.data(), n_y, dimension, cuvs::distance::DistanceType::L2Expanded, dist.data()); + + ASSERT_NEAR(dist[0], 0.0f, 1e-3f); +} + +TEST(PairwiseDistanceTest, InnerProductF32) { + const uint32_t dimension = 2; + const uint64_t n_x = 2; + const uint64_t n_y = 2; + + std::vector x = { + 1.0, 0.0, + 0.0, 1.0 + }; + std::vector y = { + 1.0, 0.0, + 0.0, 1.0 + }; + + std::vector dist(n_x * n_y); + const raft::resources& res = get_raft_resources(); + + pairwise_distance(res, x.data(), n_x, y.data(), n_y, dimension, 
cuvs::distance::DistanceType::InnerProduct, dist.data()); + + // Inner product: + // dist[0,0] = 1*1 + 0*0 = 1 + // dist[0,1] = 1*0 + 0*1 = 0 + // dist[1,0] = 0*1 + 1*0 = 0 + // dist[1,1] = 0*0 + 1*1 = 1 + + ASSERT_NEAR(dist[0], 1.0f, 1e-5f); + ASSERT_NEAR(dist[1], 0.0f, 1e-5f); + ASSERT_NEAR(dist[2], 0.0f, 1e-5f); + ASSERT_NEAR(dist[3], 1.0f, 1e-5f); +} diff --git a/cgo/cuvs/test/ivf_flat_test.cu b/cgo/cuvs/test/ivf_flat_test.cu new file mode 100644 index 0000000000000..4088c209dc4b7 --- /dev/null +++ b/cgo/cuvs/test/ivf_flat_test.cu @@ -0,0 +1,174 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "ivf_flat.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuIvfFlatTest, BasicLoadSearchAndCenters) { + const uint32_t dimension = 2; + const uint64_t count = 4; + std::vector dataset = { + 1.0, 1.0, + 1.1, 1.1, + 100.0, 100.0, + 101.0, 101.0 + }; + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + // Verify centers + auto centers = index.get_centers(); + ASSERT_EQ(centers.size(), (size_t)(2 * dimension)); + TEST_LOG("IVF-Flat Centers: " << centers[0] << ", " << centers[1]); + + std::vector queries = {1.05, 1.05}; + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + // Should be either 0 or 1 + ASSERT_TRUE(result.neighbors[0] == 0 || result.neighbors[0] == 1); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, SaveAndLoadFromFile) { + const uint32_t dimension = 2; + const uint64_t count = 4; + std::vector dataset = {1.0, 1.0, 1.1, 1.1, 100.0, 100.0, 101.0, 101.0}; + std::string filename = "test_ivf_flat.bin"; + std::vector devices = {0}; + + // 1. Build and Save + { + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + index.save(filename); + index.destroy(); + } + + // 2. 
Load and Search + { + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries = {100.5, 100.5}; + + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_TRUE(result.neighbors[0] == 2 || result.neighbors[0] == 3); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuIvfFlatTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / dataset.size(); + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 5; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.start(); + index.build(); + + auto centers = index.get_centers(); + ASSERT_EQ(centers.size(), (size_t)(5 * dimension)); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, ReplicatedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), 
dev_count); + + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 10; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_REPLICATED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, SetGetQuantizer) { + const uint32_t dimension = 4; + const uint64_t count = 10; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + std::vector devices = {0}; + + gpu_ivf_flat_t index(count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + + float min = -1.5f; + float max = 2.5f; + index.set_quantizer(min, max); + + float gMin = 0, gMax = 0; + index.get_quantizer(&gMin, &gMax); + + ASSERT_EQ(min, gMin); + ASSERT_EQ(max, gMax); + + index.destroy(); +} + diff --git a/cgo/cuvs/test/ivf_pq_test.cu b/cgo/cuvs/test/ivf_pq_test.cu new file mode 100644 index 0000000000000..d5bf0abb337af --- /dev/null +++ b/cgo/cuvs/test/ivf_pq_test.cu @@ -0,0 +1,201 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "ivf_pq.hpp" +#include "helper.h" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuIvfPqTest, BasicLoadSearchAndCenters) { + const uint32_t dimension = 16; + const uint64_t count = 4; + std::vector dataset(count * dimension); + for (size_t i = 0; i < count; ++i) { + for (size_t j = 0; j < dimension; ++j) { + dataset[i * dimension + j] = (float)i; + } + } + + std::vector devices = {0}; + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 2; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + // Verify centers + auto centers = index.get_centers(); + ASSERT_TRUE(centers.size() % index.get_n_list() == 0); + ASSERT_EQ(centers.size(), (size_t)(index.get_n_list() * index.get_dim_ext())); + + std::vector queries(dimension); + for (size_t j = 0; j < dimension; ++j) queries[j] = 0.9f; + + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + // Should be either 0 or 1 + ASSERT_TRUE(result.neighbors[0] == 0 || result.neighbors[0] == 1); + + index.destroy(); +} + +TEST(GpuIvfPqTest, SaveAndLoadFromFile) { + const uint32_t dimension = 4; + const uint64_t count = 4; + std::vector dataset = { + 0.0, 0.0, 0.0, 0.0, + 1.0, 1.0, 1.0, 1.0, + 10.0, 10.0, 10.0, 10.0, + 11.0, 11.0, 11.0, 11.0 + }; + std::string filename = "test_ivf_pq.bin"; + std::vector devices = {0}; + + // 1. 
Build and Save + { + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 2; + bp.m = 2; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + index.save(filename); + index.destroy(); + } + + // 2. Load and Search + { + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 2; + bp.m = 2; + gpu_ivf_pq_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + std::vector queries = {10.5, 10.5, 10.5, 10.5}; + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_TRUE(result.neighbors[0] == 2 || result.neighbors[0] == 3); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuIvfPqTest, BuildFromDataFile) { + const uint32_t dimension = 8; + const uint64_t count = 100; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) { + dataset[i] = static_cast(i % 10); + } + + std::string data_filename = "test_dataset.modf"; + { + // Use our utility to save the dataset in MODF format + raft::resources res; + auto matrix = raft::make_host_matrix(count, dimension); + std::copy(dataset.begin(), dataset.end(), matrix.data_handle()); + save_host_matrix(data_filename, matrix.view()); + } + + std::vector devices = {0}; + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 4; + + gpu_ivf_pq_t index(data_filename, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.start(); + index.build(); + + ASSERT_EQ(index.get_dim(), dimension); + ASSERT_EQ(index.count, static_cast(count)); + + std::vector queries(dimension, 0.0f); + ivf_pq_search_params_t sp = 
ivf_pq_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 1, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + + index.destroy(); + std::remove(data_filename.c_str()); +} + +TEST(GpuIvfPqTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} + +TEST(GpuIvfPqTest, ReplicatedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + int dev_count = gpu_get_device_count(); + ASSERT_TRUE(dev_count > 0); + std::vector devices(dev_count); + gpu_get_device_list(devices.data(), dev_count); + + ivf_pq_build_params_t bp = ivf_pq_build_params_default(); + bp.n_lists = 10; + bp.m = 8; + gpu_ivf_pq_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_REPLICATED); + index.start(); + index.build(); + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_pq_search_params_t sp = ivf_pq_search_params_default(); + auto result 
= index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0u); + + index.destroy(); +} diff --git a/cgo/cuvs/test/kmeans_test.cu b/cgo/cuvs/test/kmeans_test.cu new file mode 100644 index 0000000000000..4b4b34bfe9587 --- /dev/null +++ b/cgo/cuvs/test/kmeans_test.cu @@ -0,0 +1,89 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "kmeans.hpp" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +TEST(GpuKMeansTest, BasicFitAndPredict) { + const uint32_t n_clusters = 3; + const uint32_t dimension = 2; + const uint64_t n_samples = 9; + + // Create 3 clusters of points + std::vector dataset = { + 0.1f, 0.1f, 0.0f, 0.2f, 0.2f, 0.0f, // Cluster 0 + 10.1f, 10.1f, 10.0f, 10.2f, 10.2f, 10.0f, // Cluster 1 + 20.1f, 20.1f, 20.0f, 20.2f, 20.2f, 20.0f // Cluster 2 + }; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + kmeans.start(); + + auto fit_res = kmeans.fit(dataset.data(), n_samples); + ASSERT_GE(fit_res.n_iter, 1); + + auto predict_res = kmeans.predict(dataset.data(), n_samples); + ASSERT_EQ(predict_res.labels.size(), (size_t)n_samples); + + // Since we use balanced_params, it might prioritize balancing cluster sizes over spatial distance + // on very small datasets. 
We just check that all labels are within range [0, nClusters). + for (size_t i = 0; i < n_samples; ++i) { + ASSERT_TRUE(predict_res.labels[i] >= 0 && predict_res.labels[i] < (int64_t)n_clusters); + } + + kmeans.destroy(); +} + +TEST(GpuKMeansTest, FitPredict) { + const uint32_t n_clusters = 2; + const uint32_t dimension = 4; + const uint64_t n_samples = 10; + std::vector dataset(n_samples * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + kmeans.start(); + + auto res = kmeans.fit_predict(dataset.data(), n_samples); + ASSERT_EQ(res.labels.size(), (size_t)n_samples); + ASSERT_GE(res.n_iter, 1); + + kmeans.destroy(); +} + +TEST(GpuKMeansTest, GetCentroids) { + const uint32_t n_clusters = 5; + const uint32_t dimension = 8; + const uint64_t n_samples = 50; + std::vector dataset(n_samples * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + kmeans.start(); + + kmeans.fit(dataset.data(), n_samples); + auto centroids = kmeans.get_centroids(); + + ASSERT_EQ(centroids.size(), (size_t)(n_clusters * dimension)); + + kmeans.destroy(); +} diff --git a/cgo/cuvs/test/main_test.cu b/cgo/cuvs/test/main_test.cu new file mode 100644 index 0000000000000..3a9c373b90031 --- /dev/null +++ b/cgo/cuvs/test/main_test.cu @@ -0,0 +1,377 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +thread_local bool current_test_failed = false; + +// --- thread_safe_queue_t Tests --- + +TEST(ThreadSafeQueueTest, BasicPushPop) { + thread_safe_queue_t q; + q.push(1); + q.push(2); + + int val; + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 1); + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 2); +} + +TEST(ThreadSafeQueueTest, PopEmptyBlocking) { + thread_safe_queue_t q; + int val = 0; + + auto fut = std::async(std::launch::async, [&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + q.push(42); + }); + + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 42); +} + +TEST(ThreadSafeQueueTest, StopQueue) { + thread_safe_queue_t q; + int val; + + auto fut = std::async(std::launch::async, [&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + q.stop(); + }); + + ASSERT_FALSE(q.pop(val)); // Should return false after stop + ASSERT_TRUE(q.is_stopped()); +} + +TEST(ThreadSafeQueueTest, PushBlocking) { + thread_safe_queue_t q; + q.set_capacity(2); + + q.push(1); + q.push(2); + + std::atomic pushed_third{false}; + std::thread t([&]() { + q.push(3); // Should block + pushed_third.store(true); + }); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ASSERT_FALSE(pushed_third.load()); + + int val; + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 1); + + // Now the third push should unblock + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ASSERT_TRUE(pushed_third.load()); + + 
ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 2); + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 3); + + t.join(); +} + +TEST(ThreadSafeQueueTest, ProducerConsumerStress) { + thread_safe_queue_t q; + q.set_capacity(10); + const int num_producers = 4; + const int num_consumers = 4; + const int items_per_producer = 1000; + + std::atomic sum_pushed{0}; + std::atomic sum_popped{0}; + std::atomic count_popped{0}; + + auto producer = [&]() { + for (int i = 0; i < items_per_producer; ++i) { + q.push(1); + sum_pushed.fetch_add(1); + } + }; + + auto consumer = [&]() { + int val; + while (q.pop(val)) { + sum_popped.fetch_add(val); + count_popped.fetch_add(1); + if (count_popped.load() == num_producers * items_per_producer) { + q.stop(); + } + } + }; + + std::vector threads; + for (int i = 0; i < num_producers; ++i) threads.emplace_back(producer); + for (int i = 0; i < num_consumers; ++i) threads.emplace_back(consumer); + + for (auto& t : threads) t.join(); + + ASSERT_EQ(sum_pushed.load(), sum_popped.load()); + ASSERT_EQ(count_popped.load(), num_producers * items_per_producer); +} + +TEST(ThreadSafeQueueTest, StopUnblocksProducer) { + thread_safe_queue_t q; + q.set_capacity(1); + q.push(1); + + std::atomic push_exited{false}; + std::thread t([&]() { + q.push(2); // Blocks + push_exited.store(true); + }); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ASSERT_FALSE(push_exited.load()); + + q.stop(); + t.join(); + ASSERT_TRUE(push_exited.load()); +} + +// --- cuvs_task_result_store_t Tests --- + +TEST(CuvsTaskResultStoreTest, BasicStoreRetrieve) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + + cuvs_task_result_t res{id, 100, nullptr}; + store.store(res); + + auto fut = store.wait(id); + auto retrieved = fut.get(); + ASSERT_EQ(std::any_cast(retrieved.result), 100); +} + +TEST(CuvsTaskResultStoreTest, AsyncWait) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + + auto fut = store.wait(id); + + std::thread 
t([&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + store.store({id, std::string("async"), nullptr}); + }); + + auto retrieved = fut.get(); + ASSERT_EQ(std::any_cast(retrieved.result), std::string("async")); + t.join(); +} + +TEST(CuvsTaskResultStoreTest, StopStore) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + auto fut = store.wait(id); + + store.stop(); + + ASSERT_THROW(fut.get(), std::runtime_error); +} + +// --- raft_handle_wrapper_t and is_snmg_handle Tests --- + +TEST(RaftHandleWrapperTest, DetectSingleGpu) { + std::vector devices = {0}; + raft_handle_wrapper_t wrapper(devices, false); // force_mg = false + ASSERT_FALSE(is_snmg_handle(wrapper.get_raft_resources())); +} + +TEST(RaftHandleWrapperTest, DetectMultiGpuForced) { + std::vector devices = {0}; + raft_handle_wrapper_t wrapper(devices, true); // force_mg = true + ASSERT_TRUE(is_snmg_handle(wrapper.get_raft_resources())); +} + +// --- cuvs_worker_t Tests --- + +TEST(CuvsWorkerTest, BasicLifecycle) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + worker.stop(); +} + +TEST(CuvsWorkerTest, SubmitTask) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + + auto task = [](raft_handle_wrapper_t&) -> std::any { + return std::string("success"); + }; + + uint64_t job_id = worker.submit(task); + auto result = worker.wait(job_id).get(); + + ASSERT_EQ(std::any_cast(result.result), std::string("success")); + + worker.stop(); +} + +TEST(CuvsWorkerTest, MultipleThreads) { + uint32_t n_threads = 4; + cuvs_worker_t worker(n_threads); + worker.start(); + + std::vector ids; + for (int i = 0; i < 10; ++i) { + ids.push_back(worker.submit([i](raft_handle_wrapper_t&) -> std::any { + return i * 2; + })); + } + + for (int i = 0; i < 10; ++i) { + auto res = worker.wait(ids[i]).get(); + ASSERT_EQ(std::any_cast(res.result), i * 2); + } + + worker.stop(); +} + +TEST(CuvsWorkerTest, TaskErrorHandling) { + uint32_t 
n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + + auto fail_task = [](raft_handle_wrapper_t&) -> std::any { + throw std::runtime_error("task failed intentionally"); + }; + + uint64_t job_id = worker.submit(fail_task); + auto result = worker.wait(job_id).get(); + + ASSERT_TRUE(result.error != nullptr); + ASSERT_TRUE(has_exception(result.error)); + + worker.stop(); +} + +TEST(CuvsWorkerTest, SubmitMain) { + uint32_t n_threads = 2; + cuvs_worker_t worker(n_threads); + worker.start(); + + // Task that identifies the thread it's running on + auto task = [](raft_handle_wrapper_t&) -> std::any { + return std::this_thread::get_id(); + }; + + // Submit many tasks to main to ensure they are picked up + std::vector ids; + for(int i=0; i<10; ++i) { + ids.push_back(worker.submit_main(task)); + } + + for(auto id : ids) { + auto res = worker.wait(id).get(); + ASSERT_TRUE(res.error == nullptr); + } + + worker.stop(); +} + +TEST(CuvsWorkerTest, BoundedQueueStress) { + const uint32_t n_workers = 4; + const uint32_t n_producers = 4; + const uint32_t tasks_per_producer = 500; + + cuvs_worker_t worker(n_workers); + worker.start(); + + std::atomic tasks_completed{0}; + auto task = [&](raft_handle_wrapper_t&) -> std::any { + tasks_completed.fetch_add(1); + // Small sleep to ensure queue builds up + std::this_thread::sleep_for(std::chrono::microseconds(10)); + return std::any(); + }; + + std::vector producers; + for (uint32_t i = 0; i < n_producers; ++i) { + producers.emplace_back([&, i]() { + for (uint32_t j = 0; j < tasks_per_producer; ++j) { + // Mix of submit and submit_main + if ((i + j) % 2 == 0) { + worker.submit(task); + } else { + worker.submit_main(task); + } + } + }); + } + + for (auto& t : producers) t.join(); + + // Wait for all tasks to complete (since we didn't keep track of IDs here for simplicity, + // we just check the counter) + const uint32_t total_tasks = n_producers * tasks_per_producer; + auto start_time = std::chrono::steady_clock::now(); + 
while (tasks_completed.load() < total_tasks) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + if (std::chrono::steady_clock::now() - start_time > std::chrono::seconds(10)) { + REPORT_FAILURE("BoundedQueueStress timed out - possible hang"); + } + } + + ASSERT_EQ(tasks_completed.load(), total_tasks); + worker.stop(); +} + +TEST(CuvsWorkerTest, StopUnderLoad) { + const uint32_t n_workers = 4; + cuvs_worker_t worker(n_workers); + worker.start(); + + std::atomic producer_should_stop{false}; + std::thread producer([&]() { + auto task = [](raft_handle_wrapper_t&) -> std::any { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + return std::any(); + }; + while (!producer_should_stop.load()) { + try { + worker.submit(task); + } catch (...) { + // Expected when worker stops + break; + } + } + }); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Stop the worker while tasks are being submitted/processed + worker.stop(); + + producer_should_stop.store(true); + if (producer.joinable()) producer.join(); +} + +int main() { + return RUN_ALL_TESTS(); +} diff --git a/cgo/cuvs/test/quantize_test.cu b/cgo/cuvs/test/quantize_test.cu new file mode 100644 index 0000000000000..fcb7bbf3a194c --- /dev/null +++ b/cgo/cuvs/test/quantize_test.cu @@ -0,0 +1,330 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "quantize.hpp" +#include "test_framework.hpp" +#include +#include +#include +#include +#include + +using namespace matrixone; + +TEST(UtilsTest, SaveLoadHostMatrix) { + const std::string filename = "test_host_matrix.modf"; + const int64_t count = 10; + const int64_t dimension = 4; + + auto matrix = raft::make_host_matrix(count, dimension); + for (int64_t i = 0; i < count * dimension; ++i) { + matrix.data_handle()[i] = static_cast(i); + } + + // Save + ASSERT_NO_THROW(save_host_matrix(filename, matrix.view())); + + // Load + auto loaded_matrix = load_host_matrix(filename); + + // Verify + ASSERT_EQ(loaded_matrix.extent(0), count); + ASSERT_EQ(loaded_matrix.extent(1), dimension); + + for (int64_t i = 0; i < count * dimension; ++i) { + ASSERT_EQ(loaded_matrix.data_handle()[i], static_cast(i)); + } + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, SaveLoadDeviceMatrix) { + raft::resources res; + const std::string filename = "test_device_matrix.modf"; + const int64_t count = 5; + const int64_t dimension = 3; + + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = static_cast(i) * 1.1f; + } + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // Save + ASSERT_NO_THROW(save_device_matrix(res, filename, matrix.view())); + + // Load + auto loaded_matrix = load_device_matrix(res, filename); + + // Verify + ASSERT_EQ(loaded_matrix.extent(0), count); + ASSERT_EQ(loaded_matrix.extent(1), dimension); + + std::vector loaded_host_data(count * dimension); + raft::copy(loaded_host_data.data(), loaded_matrix.data_handle(), loaded_host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (size_t i = 0; i < host_data.size(); ++i) { + ASSERT_EQ(loaded_host_data[i], host_data[i]); + } + + 
std::remove(filename.c_str()); +} + +TEST(UtilsTest, SaveLoadDeviceMatrixOverload) { + raft::resources res; + const std::string filename = "test_device_matrix_overload.modf"; + const int64_t count = 3; + const int64_t dimension = 2; + + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // Save + save_device_matrix(res, filename, matrix.view()); + + // Load using overload + uint64_t loaded_count = 0; + uint64_t loaded_dimension = 0; + // We must initialize device_matrix with some dimensions if we want to declare it, + // but the overload will re-assign it. + // Actually, the simplest is to just use the returned value or if we must use the overload reference: + auto loaded_matrix = raft::make_device_matrix(res, 0, 0); + load_device_matrix(res, filename, loaded_matrix, loaded_count, loaded_dimension); + + // Verify + ASSERT_EQ(loaded_count, (uint64_t)count); + ASSERT_EQ(loaded_dimension, (uint64_t)dimension); + ASSERT_EQ(loaded_matrix.extent(0), count); + ASSERT_EQ(loaded_matrix.extent(1), dimension); + + std::vector loaded_host_data(count * dimension); + raft::copy(loaded_host_data.data(), loaded_matrix.data_handle(), loaded_host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (size_t i = 0; i < host_data.size(); ++i) { + ASSERT_EQ(loaded_host_data[i], host_data[i]); + } + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, LoadWithQuantization) { + raft::resources res; + const std::string filename = "test_quantization.modf"; + const int64_t count = 100; + const int64_t dimension = 8; + + // 1. 
Create and save float data + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) { + // Values between -1.0 and 1.0 to make quantization meaningful + host_data[i] = static_cast(i % 100) / 50.0f - 1.0f; + } + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + save_device_matrix(res, filename, matrix.view()); + + // 2. Load as int8_t (should trigger quantization) + auto quantized_matrix = load_device_matrix(res, filename); + + // 3. Verify metadata + ASSERT_EQ(quantized_matrix.extent(0), count); + ASSERT_EQ(quantized_matrix.extent(1), dimension); + + // 4. Basic check that data is loaded + std::vector result_host(count * dimension); + raft::copy(result_host.data(), quantized_matrix.data_handle(), result_host.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + // We don't check exact values as quantization is lossy, but it should not be all zeros if input wasn't + bool non_zero = false; + for (auto v : result_host) if (v != 0) non_zero = true; + ASSERT_TRUE(non_zero); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, FloatToHalfConversion) { + raft::resources res; + const std::string filename = "test_f32_to_f16.modf"; + const int64_t count = 10; + const int64_t dimension = 4; + + // 1. Save float data + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) host_data[i] = static_cast(i); + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + save_device_matrix(res, filename, matrix.view()); + + // 2. Load as half (should trigger conversion) + auto half_matrix = load_device_matrix(res, filename); + + // 3. 
Verify + ASSERT_EQ(half_matrix.extent(0), count); + ASSERT_EQ(half_matrix.extent(1), dimension); + + std::vector result_host(count * dimension); + raft::copy(result_host.data(), half_matrix.data_handle(), result_host.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + for (size_t i = 0; i < host_data.size(); ++i) { + ASSERT_EQ(static_cast(result_host[i]), host_data[i]); + } + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, HalfToUint8Quantization) { + raft::resources res; + const std::string filename = "test_f16_to_u8.modf"; + const int64_t count = 100; + const int64_t dimension = 8; + + // 1. Save half data + auto matrix = raft::make_host_matrix(count, dimension); + for (size_t i = 0; i < count * dimension; ++i) { + matrix.data_handle()[i] = static_cast(static_cast(i % 100) / 100.0f); + } + save_host_matrix(filename, matrix.view()); + + // 2. Load as uint8_t (should trigger quantization from half) + auto u8_matrix = load_device_matrix(res, filename); + + // 3. 
Verify + ASSERT_EQ(u8_matrix.extent(0), count); + ASSERT_EQ(u8_matrix.extent(1), dimension); + + std::vector result_host(count * dimension); + raft::copy(result_host.data(), u8_matrix.data_handle(), result_host.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + bool non_zero = false; + for (auto v : result_host) if (v != 0) non_zero = true; + ASSERT_TRUE(non_zero); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, LoadInvalidMagic) { + const std::string filename = "invalid_magic.modf"; + std::ofstream file(filename, std::ios::binary); + file.write("NOTM", 4); + file.close(); + + ASSERT_THROW(load_host_matrix(filename), std::runtime_error); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, LoadTypeSizeMismatch) { + const std::string filename = "size_mismatch.modf"; + file_header_t header; + std::memcpy(header.magic, "MODF", 4); + header.count = 1; + header.dimension = 1; + header.data_type_size = 8; // Double size + + std::ofstream file(filename, std::ios::binary); + file.write(reinterpret_cast(&header), sizeof(file_header_t)); + file.close(); + + // Try to load as float (size 4) should throw + ASSERT_THROW(load_host_matrix(filename), std::runtime_error); + + std::remove(filename.c_str()); +} + +TEST(UtilsTest, ScalarQuantizerLifecycle) { + raft::resources res; + const int64_t count = 100; + const int64_t dimension = 8; + + // 1. Train + scalar_quantizer_t quantizer; + ASSERT_FALSE(quantizer.is_trained()); + + auto matrix = raft::make_device_matrix(res, count, dimension); + std::vector host_data(count * dimension); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = static_cast(i % 100) / 50.0f - 1.0f; // range [-1, 0.98] + } + raft::copy(matrix.data_handle(), host_data.data(), host_data.size(), raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + + quantizer.train(res, matrix.view()); + ASSERT_TRUE(quantizer.is_trained()); + + // 2. 
Getters + float q_min = quantizer.min(); + float q_max = quantizer.max(); + // Default quantile is 1.0, so it should be exactly -1.0 and 0.98 + ASSERT_TRUE(std::abs(q_min - (-1.0f)) < 1e-5f); + ASSERT_TRUE(std::abs(q_max - 0.98f) < 1e-5f); + + // 3. Constructor + scalar_quantizer_t quantizer2(q_min, q_max); + ASSERT_TRUE(quantizer2.is_trained()); + ASSERT_EQ(quantizer2.min(), q_min); + ASSERT_EQ(quantizer2.max(), q_max); + + // 4. Save/Load + const std::string filename = "test_quantizer.bin"; + quantizer.save_to_file(filename); + + scalar_quantizer_t quantizer3; + quantizer3.load_from_file(filename); + ASSERT_TRUE(quantizer3.is_trained()); + ASSERT_EQ(quantizer3.min(), q_min); + ASSERT_EQ(quantizer3.max(), q_max); + std::remove(filename.c_str()); + + // 5. Serialize/Deserialize + std::stringstream ss; + quantizer.serialize(ss); + + scalar_quantizer_t quantizer4; + quantizer4.deserialize(ss); + ASSERT_TRUE(quantizer4.is_trained()); + ASSERT_EQ(quantizer4.min(), q_min); + ASSERT_EQ(quantizer4.max(), q_max); + + // 6. SetQuantizer + scalar_quantizer_t quantizer5; + quantizer5.set_quantizer(0.1f, 0.9f); + ASSERT_TRUE(quantizer5.is_trained()); + ASSERT_EQ(quantizer5.min(), 0.1f); + ASSERT_EQ(quantizer5.max(), 0.9f); + + // 7. Getters again + ASSERT_EQ(quantizer5.min(), 0.1f); + ASSERT_EQ(quantizer5.max(), 0.9f); + + // 8. Transform + std::vector result_host(count * dimension); + quantizer.transform(res, matrix.view(), result_host.data(), false); + + bool non_zero = false; + for (auto v : result_host) if (v != 0) non_zero = true; + ASSERT_TRUE(non_zero); +} + diff --git a/cgo/cuvs/test/test_framework.hpp b/cgo/cuvs/test/test_framework.hpp new file mode 100644 index 0000000000000..f995f514686da --- /dev/null +++ b/cgo/cuvs/test/test_framework.hpp @@ -0,0 +1,150 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include // For std::iota +#include // For std::async +#include +#include +#include +#include // For building string messages +#include // For std::sort +#include // For std::any comparisons in assertions + +// --- Minimal Custom Test Framework (Stub for compilation) --- + +// Logging - minimal versions +#define TEST_LOG(msg) std::cout << "[INFO ] " << msg << std::endl +#define TEST_ERROR(msg) std::cerr << "[ERROR ] " << msg << std::endl + +// Global flag to indicate if the current test has failed (kept minimal) +extern thread_local bool current_test_failed; + +// Helper to build string messages for assertions (handles various types) +template +std::string to_string_for_assertion(const T& val) { + std::ostringstream oss; + oss << val; + return oss.str(); +} +inline std::string to_string_for_assertion(const std::any&) { return "std::any"; } // Simplified +inline std::string to_string_for_assertion(const char* val) { return std::string(val); } + +// Helper to check if an exception_ptr holds a specific exception type (kept minimal) +template +inline bool has_exception(const std::exception_ptr& ep) { + if (!ep) return false; + try { + std::rethrow_exception(ep); + } catch (const E& e) { + return true; + } catch (...) 
{ + return false; + } +} + +// Assertions - simplified to just return/log if condition is false +#define REPORT_FAILURE(msg_str) do { TEST_ERROR(msg_str); current_test_failed = true; return; } while (0) +#define ASSERT_TRUE(condition) do { if (!(condition)) { REPORT_FAILURE("ASSERT_TRUE failed: " #condition); } } while (0) +#define ASSERT_FALSE(condition) ASSERT_TRUE(!(condition)) +#define ASSERT_EQ(val1, val2) do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "ASSERT_EQ failed: " << #val1 << " (" << v1 << ") vs " << #val2 << " (" << v2 << ")"; \ + REPORT_FAILURE(oss.str()); \ + } \ +} while (0) +#define ASSERT_NE(val1, val2) do { if (!((val1) != (val2))) { REPORT_FAILURE("ASSERT_NE failed: " #val1 " vs " #val2); } } while (0) +#define ASSERT_GE(val1, val2) do { if (!((val1) >= (val2))) { REPORT_FAILURE("ASSERT_GE failed: " #val1 " vs " #val2); } } while (0) +#define ASSERT_THROW(statement, expected_exception) do { bool caught = false; try { statement; } catch (const expected_exception&) { caught = true; } if (!caught) { REPORT_FAILURE("ASSERT_THROW failed"); } } while (0) +#define ASSERT_NO_THROW(statement) do { try { statement; } catch (...) 
{ REPORT_FAILURE("ASSERT_NO_THROW failed"); } } while (0) + +// Test registration +struct TestCase { + std::string name; + std::function func; + bool failed = false; +}; + +inline std::vector& get_test_cases() { + static std::vector test_cases; + return test_cases; +} + +// Simplified TEST macro for compilation +#define TEST(suite, name) \ + static void test_func_##suite##_##name(); \ + struct RegisterTest_##suite##_##name { \ + RegisterTest_##suite##_##name() { \ + get_test_cases().push_back({#suite "::" #name, test_func_##suite##_##name}); \ + } \ + }; \ + static RegisterTest_##suite##_##name register_test_##suite##_##name; \ + static void test_func_##suite##_##name() + +inline int RUN_ALL_TESTS() { + int passed_count = 0; + int failed_count = 0; + TEST_LOG("Running " << get_test_cases().size() << " tests (minimal framework)..."); + + for (auto& test_case : get_test_cases()) { + current_test_failed = false; // Reset for each test + TEST_LOG("[ RUN ] " << test_case.name); + try { + test_case.func(); + } catch (const std::exception& e) { + TEST_ERROR("Test threw unhandled exception: " << e.what()); + current_test_failed = true; + } catch (...) 
{ + TEST_ERROR("Test threw unhandled unknown exception."); + current_test_failed = true; + } + + if (current_test_failed) { + test_case.failed = true; + failed_count++; + TEST_LOG("[ FAILED ] " << test_case.name); + } else { + passed_count++; + TEST_LOG("[ OK ] " << test_case.name); + } + } + + TEST_LOG("--------------------------------------------------"); + TEST_LOG("[==========] " << passed_count + failed_count << " tests ran."); + TEST_LOG("[ PASSED ] " << passed_count << " tests."); + if (failed_count > 0) { + TEST_ERROR("[ FAILED ] " << failed_count << " tests, listed below:"); + for (const auto& test_case : get_test_cases()) { + if (test_case.failed) { + TEST_ERROR(" " << test_case.name); + } + } + } + TEST_LOG("--------------------------------------------------"); + + return failed_count; +} + +// --- End of Minimal Custom Test Framework (Stub for compilation) --- diff --git a/cgo/test/Makefile b/cgo/test/Makefile index 506722a91f6e6..f0de3ac25285f 100644 --- a/cgo/test/Makefile +++ b/cgo/test/Makefile @@ -1,18 +1,47 @@ -CFLAGS=-I.. -g -Wall -Werror -lm -I../../thirdparties/install/include +UNAME_S := $(shell uname -s) -all: test_add.exe test_bloom.exe test_varlena.exe bloom_whole_test.exe +ifeq ($(MO_CL_CUDA),1) + ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) + endif + CC = /usr/local/cuda/bin/nvcc + COMPILER_FLAGS := -Xcompiler "-Wall -Werror" + # When using nvcc to link, we need to pass the libraries and rpath + LINKER_FLAGS := -Xlinker "-rpath=$(shell realpath ..)" + # We must also include the cuVS and other deps that libmo.so needs if linked statically, + # but since libmo.so is shared, we just need to link against it. + LIBS += -L.. 
-lmo -L../../thirdparties/install/lib -lusearch_c -L$(CUDA_PATH)/lib64/stubs -lcuda -L$(CUDA_PATH)/lib64 -lcudart + LIBS += -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lpthread -lgomp + LIBS += -Xlinker -lpthread -Xlinker -lm +else + COMPILER_FLAGS := -Wall -Werror + ifeq ($(UNAME_S),Darwin) + LINKER_FLAGS := -Wl,-rpath,$(shell realpath ..) + else + LINKER_FLAGS := -Wl,-rpath=$(shell realpath ..) + endif + LIBS := -L.. -lmo -L../../thirdparties/install/lib -lusearch_c -lm -lstdc++ + ifneq ($(UNAME_S),Darwin) + LIBS += -fopenmp + endif +endif -test_add.exe: test_add.c ../libmo.a - $(CC) $(CFLAGS) -o test_add.exe test_add.c -L.. -lmo +CFLAGS := -I.. -g -I../../thirdparties/install/include $(COMPILER_FLAGS) +LDFLAGS := $(LIBS) $(LINKER_FLAGS) -test_bloom.exe: test_bloom.c ../libmo.a - $(CC) $(CFLAGS) -o test_bloom.exe test_bloom.c -L.. -lmo +all: test_add.exe test_bloom.exe test_varlena.exe -test_varlena.exe: varlena_test.c ../libmo.a - $(CC) $(CFLAGS) -o test_varlena.exe varlena_test.c -L.. -lmo +test_add.exe: test_add.c + $(CC) $(CFLAGS) -o $@ test_add.c $(LDFLAGS) -bloom_whole_test.exe: bloom_whole_test.c ../libmo.a - $(CC) $(CFLAGS) -o bloom_whole_test.exe bloom_whole_test.c -L.. -lmo +test_bloom.exe: test_bloom.c + $(CC) $(CFLAGS) -o $@ test_bloom.c $(LDFLAGS) + +test_varlena.exe: varlena_test.c + $(CC) $(CFLAGS) -o $@ varlena_test.c $(LDFLAGS) + +bloom_whole_test.exe: bloom_whole_test.c + $(CC) $(CFLAGS) $(NVCC_FLAGS) -o bloom_whole_test.exe bloom_whole_test.c $(LDFLAGS) clean: rm -f *.o *.exe diff --git a/cgo/test/bloom_whole_test.c b/cgo/test/bloom_whole_test.c new file mode 100644 index 0000000000000..23bf08586f94d --- /dev/null +++ b/cgo/test/bloom_whole_test.c @@ -0,0 +1,122 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "../bloom.h" +#include "../varlena.h" + +// Helper to create a packed buffer of varlenas +int create_test_buffer(uint8_t *buffer, uint8_t *area) { + uint8_t *ptr = buffer; + int nitem = 0; + + // --- Element 1: small --- + const char *str1 = "apple"; + uint8_t len1 = strlen(str1); + ptr[0] = len1; + memcpy(ptr + 1, str1, len1); + ptr += VARLENA_SIZE; + nitem++; + + // --- Element 2: big --- + const char *str2 = "banana_long_string_to_test_big_varlena"; + uint32_t len2 = strlen(str2); + uint32_t offset2 = 50; + memcpy(area + offset2, str2, len2); + + varlena_set_big_offset_len(ptr, offset2, len2); + ptr += VARLENA_SIZE; + nitem++; + + // --- Element 3: small --- + const char *str3 = "cherry"; + uint8_t len3 = strlen(str3); + ptr[0] = len3; + memcpy(ptr + 1, str3, len3); + ptr += VARLENA_SIZE; + nitem++; + + return nitem; +} + +void test_add_and_test_varlena() { + printf("--- Running test_add_and_test_varlena ---\n"); + + bloomfilter_t *bf = bloomfilter_init(1000, 3); + assert(bf != NULL); + + uint8_t buffer[200]; + uint8_t area[200]; + int nitem = create_test_buffer(buffer, area); + + // Add all items from the buffer + bloomfilter_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0); + + // Test if all added items exist + bool results[nitem]; + bloomfilter_test_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results); + + for (int i = 0; i < nitem; i++) { + assert(results[i]); + } + + // Test for a non-existent item + 
const char *str_not_exist = "grape"; + assert(!bloomfilter_test(bf, str_not_exist, strlen(str_not_exist))); + + bloomfilter_free(bf); + printf("test_add_and_test_whole passed.\n\n"); +} + +void test_test_and_add_varlena() { + printf("--- Running test_test_and_add_varlena ---\n"); + + bloomfilter_t *bf = bloomfilter_init(1000, 3); + assert(bf != NULL); + + uint8_t buffer[200]; + uint8_t area[200]; + int nitem = create_test_buffer(buffer, area); + + bool results1[nitem]; + bool results2[nitem]; + + // First call: should report all items as non-existent and add them + bloomfilter_test_and_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results2); + for (int i = 0; i < nitem; i++) { + assert(!results1[i]); + } + + // Second call: should report all items as existent + bloomfilter_test_and_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results2); + for (int i = 0; i < nitem; i++) { + assert(results2[i]); + } + + bloomfilter_free(bf); + printf("test_test_and_add_whole passed.\n\n"); +} + +int main() { + test_add_and_test_varlena(); + test_test_and_add_varlena(); + printf("All bloom_varlena_test passed!\n"); + return 0; +} diff --git a/go.mod b/go.mod index ae09fa2ae94df..d1dcf1ba27f2d 100644 --- a/go.mod +++ b/go.mod @@ -76,7 +76,6 @@ require ( github.com/prashantv/gostub v1.1.0 github.com/prometheus/client_golang v1.17.0 github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 - github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d github.com/robfig/cron/v3 v3.0.1 github.com/samber/lo v1.38.1 github.com/segmentio/encoding v0.4.0 @@ -92,7 +91,7 @@ require ( github.com/tidwall/btree v1.7.0 github.com/tidwall/pretty v1.2.1 github.com/tmc/langchaingo v0.1.13 - github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5 + github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9 go.starlark.net v0.0.0-20250701195324-d457b4515e0e 
go.uber.org/automaxprocs v1.5.3 go.uber.org/ratelimit v0.2.0 @@ -259,9 +258,6 @@ replace ( github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef => github.com/matrixorigin/dragonboat/v4 v4.0.0-20251214113216-2ddf81ef2a85 github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4 => github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376 => github.com/matrixorigin/vfs v0.2.1-0.20220616104132-8852fd867376 - - github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d => github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 - github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5 => github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9 ) replace github.com/shoenig/go-m1cpu => github.com/shoenig/go-m1cpu v0.1.7 diff --git a/go.sum b/go.sum index a22d3b1eeecea..8821ade189a9a 100644 --- a/go.sum +++ b/go.sum @@ -207,12 +207,8 @@ github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8Nz github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 h1:hn6US40835XeZRilkHLIUpWTF2RYBRXCpBLn1PPOSjg= -github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6/go.mod h1:Ju9l9IcIHZOPLO1tjN9dEYSgEPFowDPF9pM70W9nNGs= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e h1:tQSCiEjYPRU+AuuVR+zd+xYVOsEqX1clPhmIAM6FCHU= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e/go.mod h1:zt7uTOYu0EEeKatGaTi9JiP0I9ePHpDvjAwpfPXh/N0= -github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9 h1:jnClZ1ddCpjYQLMem6YSlVm7Ois6sXbRr2CP6n/rc/s= -github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9/go.mod 
h1:3SN8SakyyBWzb14DNZn4t5yX8dOa7ae45KpqDioi4RA= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= @@ -877,6 +873,8 @@ github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGr github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= +github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9 h1:KtfoWJQXPrvEfFCuk1FGgiPfBoIhSIqiTLaZLHjoKM4= +github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9/go.mod h1:NxBpQibuBBeA/V8RGbrNzVAv4OyWWL5yNao7mVz656k= github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.6.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w= diff --git a/optools/images/Dockerfile b/optools/images/Dockerfile index 837b501811348..7383c0941b937 100644 --- a/optools/images/Dockerfile +++ b/optools/images/Dockerfile @@ -32,6 +32,7 @@ FROM matrixorigin/ubuntu:22.04 COPY --from=builder /go/src/github.com/matrixorigin/matrixone/mo-service /mo-service COPY --from=builder /go/src/github.com/matrixorigin/matrixone/etc /etc COPY --from=builder /go/src/github.com/matrixorigin/matrixone/thirdparties/install/lib/*.so /usr/local/lib +COPY --from=builder /go/src/github.com/matrixorigin/matrixone/cgo/*.so /usr/local/lib # ldconfig and run mo-service to check if the shared library is found RUN ldconfig && /mo-service -h diff --git a/optools/images/gpu/Dockerfile b/optools/images/gpu/Dockerfile index 
8e3640083e614..3549a0d249d70 100644 --- a/optools/images/gpu/Dockerfile +++ b/optools/images/gpu/Dockerfile @@ -8,7 +8,7 @@ RUN export LANG=en_US.utf8 ARG DEBIAN_FRONTEND=noninteractive ENV MOHOME=/matrixone ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${MOHOME}/thirdparties/install/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${MOHOME}/thirdparties/install/lib:${MOHOME}/cgo:${LD_LIBRARY_PATH}" WORKDIR /matrixone COPY . . @@ -52,6 +52,7 @@ FROM nvidia/cuda:13.0.2-cudnn-runtime-ubuntu24.04 COPY --from=builder /matrixone/mo-service /mo-service COPY --from=builder /matrixone/etc /etc COPY --from=builder /matrixone/thirdparties/install/lib/*.so /usr/local/lib +COPY --from=builder /matrixone/cgo/*.so /usr/local/lib COPY --from=builder /root/miniconda/envs/go/lib /root/miniconda/envs/go/lib ENV PATH="/usr/local/cuda/bin:${PATH}" diff --git a/optools/run_ut.sh b/optools/run_ut.sh index a8a8205891efe..aa7307fd3c424 100755 --- a/optools/run_ut.sh +++ b/optools/run_ut.sh @@ -47,6 +47,27 @@ UT_COUNT="$G_WKSP/$G_TS-UT-Count.out" CODE_COVERAGE="$G_WKSP/$G_TS-UT-Coverage.html" RAW_COVERAGE="coverage.out" IS_BUILD_FAIL="" +TAGS="matrixone_test" + +THIRDPARTIES_INSTALL_DIR=${BUILD_WKSP}/thirdparties/install +CGO_CFLAGS="-I${BUILD_WKSP}/cgo -I${THIRDPARTIES_INSTALL_DIR}/include" +CGO_LDFLAGS="-Wl,-rpath,${THIRDPARTIES_INSTALL_DIR}/lib:${BUILD_WKSP}/cgo -L${THIRDPARTIES_INSTALL_DIR}/lib -L${BUILD_WKSP}/cgo -lmo -lusearch_c -lm" +LD_LIBRARY_PATH="${THIRDPARTIES_INSTALL_DIR}/lib:${BUILD_WKSP}/cgo" + +if [[ -n "${MO_CL_CUDA:-}" ]] ; then + if [[ ${MO_CL_CUDA} == "1" ]] ; then + if [[ -z "${CONDA_PREFIX:-}" ]] ; then + echo "CONDA_PREFIX environment variable not found" + exit 1 + fi + + CUDA_HOME=/usr/local/cuda + CGO_CFLAGS="${CGO_CFLAGS} -I${CUDA_HOME}/include -I${CONDA_PREFIX}/include" + CGO_LDFLAGS="${CGO_LDFLAGS} -L${CUDA_HOME}/lib64/stubs -lcuda -L${CUDA_HOME}/lib64 
-lcudart -L${CONDA_PREFIX}/lib -lcuvs -lcuvs_c -lstdc++" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${CONDA_PREFIX}/lib" + TAGS="${TAGS},gpu" + fi +fi if [[ -f $SCA_REPORT ]]; then rm $SCA_REPORT; fi if [[ -f $UT_REPORT ]]; then rm $UT_REPORT; fi @@ -70,7 +91,7 @@ function run_vet(){ if [[ -f $SCA_REPORT ]]; then rm $SCA_REPORT; fi logger "INF" "Test is in progress... " - go vet -tags matrixone_test -unsafeptr=false ./pkg/... 2>&1 | tee $SCA_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go vet -tags "${TAGS}" -unsafeptr=false ./pkg/... 2>&1 | tee $SCA_REPORT logger "INF" "Refer to $SCA_REPORT for details" } @@ -95,18 +116,14 @@ function run_tests(){ local cover_profile='profile.raw' make cgo make thirdparties - THIRDPARTIES_INSTALL_DIR=${BUILD_WKSP}/thirdparties/install - - local CGO_CFLAGS="-I${BUILD_WKSP}/cgo -I${THIRDPARTIES_INSTALL_DIR}/include" - local CGO_LDFLAGS="-Wl,-rpath,${THIRDPARTIES_INSTALL_DIR}/lib -L${THIRDPARTIES_INSTALL_DIR}/lib -L${BUILD_WKSP}/cgo -lmo -lm" if [[ $SKIP_TESTS == 'race' ]]; then logger "INF" "Run UT without race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags "${TAGS}" -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT else logger "INF" "Run UT with race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags "${TAGS}" -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT fi } diff --git 
a/pkg/common/concurrent/asyncworkerpool.go b/pkg/common/concurrent/asyncworkerpool.go new file mode 100644 index 0000000000000..844e3cd31a7a3 --- /dev/null +++ b/pkg/common/concurrent/asyncworkerpool.go @@ -0,0 +1,351 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "os" + "os/signal" + "runtime" + "sync" + "sync/atomic" + "syscall" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/logutil" + "go.uber.org/zap" +) + +// AsyncTask represents a task to be executed by the AsyncWorkerPool. +type AsyncTask struct { + ID uint64 + Fn func(res any) (any, error) +} + +// AsyncTaskResult holds the result of a AsyncTask execution. +type AsyncTaskResult struct { + ID uint64 + Result any + Error error +} + +// AsyncTaskResultStore manages the storage and retrieval of AsyncTaskResults. +type AsyncTaskResultStore struct { + states map[uint64]*taskState + mu sync.Mutex + nextJobID uint64 + stopCh chan struct{} + stopped atomic.Bool +} + +type taskState struct { + done chan struct{} + result *AsyncTaskResult +} + +// NewAsyncTaskResultStore creates a new AsyncTaskResultStore. 
+func NewAsyncTaskResultStore() *AsyncTaskResultStore { + return &AsyncTaskResultStore{ + states: make(map[uint64]*taskState), + nextJobID: 0, + stopCh: make(chan struct{}), + stopped: atomic.Bool{}, + } +} + +// Store saves a AsyncTaskResult in the store and signals any waiting goroutines. +func (s *AsyncTaskResultStore) Store(result *AsyncTaskResult) { + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.states[result.ID] + if !ok { + state = &taskState{done: make(chan struct{})} + s.states[result.ID] = state + } + state.result = result + close(state.done) +} + +// Wait blocks until the result for the given jobID is available and returns it. +// The result is removed from the internal map after being retrieved. +func (s *AsyncTaskResultStore) Wait(jobID uint64) (*AsyncTaskResult, error) { + s.mu.Lock() + state, ok := s.states[jobID] + if !ok { + // If task was not submitted yet, create state and wait. + state = &taskState{done: make(chan struct{})} + s.states[jobID] = state + s.mu.Unlock() // Release lock before blocking + } else if state.result != nil { + // If result is already available, return it immediately without blocking. + delete(s.states, jobID) // Remove after retrieval + s.mu.Unlock() + return state.result, nil + } else { + // Task was submitted, but result not yet available. Release lock and wait. + s.mu.Unlock() // Release lock before blocking + } + + select { + case <-state.done: + s.mu.Lock() + delete(s.states, jobID) + s.mu.Unlock() + return state.result, nil + case <-s.stopCh: + return nil, moerr.NewInternalErrorNoCtx("AsyncTaskResultStore stopped before result was available") + } +} + +// GetNextJobID atomically increments and returns a new unique job ID. +func (s *AsyncTaskResultStore) GetNextJobID() uint64 { + return atomic.AddUint64(&s.nextJobID, 1) +} + +// Stop signals the AsyncTaskResultStore to stop processing new waits. 
+func (s *AsyncTaskResultStore) Stop() { + if s.stopped.CompareAndSwap(false, true) { + close(s.stopCh) + } +} + +// AsyncWorkerPool runs tasks in a dedicated OS thread with a CUDA context. +type AsyncWorkerPool struct { + tasks chan *AsyncTask + stopCh chan struct{} + wg sync.WaitGroup + stopped atomic.Bool // Indicates if the worker has been stopped + firstError atomic.Value + *AsyncTaskResultStore // Embed the result store + nthread uint + sigc chan os.Signal // Add this field + errch chan error + createResource func() (any, error) + cleanupResource func(any) +} + +// NewAsyncWorkerPool creates a new AsyncWorkerPool. +func NewAsyncWorkerPool(nthread uint, createResource func() (any, error), cleanupResource func(any)) *AsyncWorkerPool { + return &AsyncWorkerPool{ + tasks: make(chan *AsyncTask, nthread), + stopCh: make(chan struct{}), + stopped: atomic.Bool{}, // Initialize to false + AsyncTaskResultStore: NewAsyncTaskResultStore(), + nthread: nthread, + sigc: make(chan os.Signal, 1), // Initialize sigc + errch: make(chan error, nthread), // Initialize errch + createResource: createResource, + cleanupResource: cleanupResource, + } +} + +// handleAndStoreTask processes a single AsyncTask and stores its result. +func (w *AsyncWorkerPool) handleAndStoreTask(task *AsyncTask, resource any) { + result, err := task.Fn(resource) + asyncResult := &AsyncTaskResult{ + ID: task.ID, + Result: result, + Error: err, + } + w.AsyncTaskResultStore.Store(asyncResult) +} + +// drainAndProcessTasks drains the w.tasks channel and processes each task. +// It stops when the channel is empty or closed. +func (w *AsyncWorkerPool) drainAndProcessTasks(resource any) { + for { + select { + case task, ok := <-w.tasks: + if !ok { + return // Channel closed, no more tasks. Exit. + } + w.handleAndStoreTask(task, resource) + default: + return // All tasks drained, or channel is empty. + } + } +} + +// Start begins the worker's execution loop. 
+func (w *AsyncWorkerPool) Start(initFn func(res any) error, stopFn func(resource any) error) { + w.wg.Add(1) // for w.run + go w.run(initFn, stopFn) + + signal.Notify(w.sigc, syscall.SIGTERM, syscall.SIGINT) // Notify signals to sigc + + w.wg.Add(1) // for the signal handler goroutine + go func() { + defer w.wg.Done() // Ensure wg.Done() is called when this goroutine exits + select { + case <-w.sigc: // Wait for a signal + logutil.Info("AsyncWorkerPool received shutdown signal, stopping...") + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + case err := <-w.errch: // Listen for errors from worker goroutines + logutil.Error("AsyncWorkerPool received internal error, stopping...", zap.Error(err)) + if w.firstError.Load() == nil { + w.firstError.Store(err) + } + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + case <-w.stopCh: // Listen for internal stop signal from w.Stop() + logutil.Info("AsyncWorkerPool signal handler received internal stop signal, exiting...") + // Do nothing, just exit. w.Stop() will handle the rest. + } + }() +} + +// Stop signals the worker to terminate. +func (w *AsyncWorkerPool) Stop() { + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + w.wg.Wait() + w.AsyncTaskResultStore.Stop() // Signal the result store to stop +} + +// Submit sends a task to the worker. 
+func (w *AsyncWorkerPool) Submit(fn func(res any) (any, error)) (uint64, error) { + if w.stopped.Load() { + return 0, moerr.NewInternalErrorNoCtx("cannot submit task: worker is stopped") + } + jobID := w.GetNextJobID() + task := &AsyncTask{ + ID: jobID, + Fn: fn, + } + w.tasks <- task + return jobID, nil +} + +func (w *AsyncWorkerPool) workerLoop(wg *sync.WaitGroup) { + defer wg.Done() + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var resource any + var err error + if w.createResource != nil { + resource, err = w.createResource() + if err != nil { + w.errch <- err + return + } + } + if w.cleanupResource != nil { + defer w.cleanupResource(resource) + } + + for { + select { + case task, ok := <-w.tasks: + if !ok { // tasks channel closed + return // No more tasks, and channel is closed. Exit. + } + w.handleAndStoreTask(task, resource) // Pass resource directly + case <-w.stopCh: + // stopCh signaled. Drain remaining tasks from w.tasks then exit. + w.drainAndProcessTasks(resource) // Pass resource directly + return + } + } +} + +func (w *AsyncWorkerPool) run(initFn func(res any) error, stopFn func(resource any) error) { + defer w.wg.Done() + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var parentResource any + var err error + if w.createResource != nil { + parentResource, err = w.createResource() + if err != nil { + w.errch <- err + return + } + } + if w.cleanupResource != nil { + defer w.cleanupResource(parentResource) + } + + // Execute initFn once. 
+ if initFn != nil { + if err := initFn(parentResource); err != nil { + logutil.Error("failed to initialize async resource with provided function", zap.Error(err)) + w.errch <- err + + return + } + } + + if stopFn != nil { + defer func() { + if err := stopFn(parentResource); err != nil { + logutil.Error("error during async resource stop function", zap.Error(err)) + w.errch <- err + } + }() + } + + if w.nthread == 1 { + // Special case: nthread is 1, process tasks directly in this goroutine + for { + select { + case task, ok := <-w.tasks: + if !ok { // tasks channel closed + return // Channel closed, no more tasks. Exit. + } + w.handleAndStoreTask(task, parentResource) + case <-w.stopCh: + // Drain the tasks channel before exiting + w.drainAndProcessTasks(parentResource) + return + } + } + } else { + // General case: nthread > 1, create worker goroutines + var workerWg sync.WaitGroup + workerWg.Add(int(w.nthread)) + for i := 0; i < int(w.nthread); i++ { + go w.workerLoop(&workerWg) + } + + // Wait for stop signal + <-w.stopCh + + // Signal workers to stop and wait for them to finish. + workerWg.Wait() + } +} + +// Wait blocks until the result for the given jobID is available and returns it. +// The result is removed from the internal map after being retrieved. +func (w *AsyncWorkerPool) Wait(jobID uint64) (*AsyncTaskResult, error) { + return w.AsyncTaskResultStore.Wait(jobID) +} + +// GetFirstError returns the first internal error encountered by the worker. 
+func (w *AsyncWorkerPool) GetFirstError() error { + err := w.firstError.Load() + if err == nil { + return nil + } + return err.(error) +} diff --git a/pkg/common/concurrent/asyncworkerpool_test.go b/pkg/common/concurrent/asyncworkerpool_test.go new file mode 100644 index 0000000000000..76c78314d17c3 --- /dev/null +++ b/pkg/common/concurrent/asyncworkerpool_test.go @@ -0,0 +1,509 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "fmt" + "sync" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewAsyncTaskResultStore(t *testing.T) { + store := NewAsyncTaskResultStore() + assert.NotNil(t, store) + assert.NotNil(t, store.states) + assert.Equal(t, uint64(0), store.nextJobID) +} + +func TestAsyncTaskResultStore_GetNextJobID(t *testing.T) { + store := NewAsyncTaskResultStore() + id1 := store.GetNextJobID() + id2 := store.GetNextJobID() + id3 := store.GetNextJobID() + + assert.Equal(t, uint64(1), id1) + assert.Equal(t, uint64(2), id2) + assert.Equal(t, uint64(3), id3) +} + +func TestAsyncTaskResultStore_StoreAndWait(t *testing.T) { + store := NewAsyncTaskResultStore() + jobID := store.GetNextJobID() + expectedResult := "task completed" + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + time.Sleep(10 * time.Millisecond) // Simulate some work before storing + store.Store(&AsyncTaskResult{ + ID: 
jobID, + Result: expectedResult, + Error: nil, + }) + }() + + result, err := store.Wait(jobID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, jobID, result.ID) + assert.Equal(t, expectedResult, result.Result) + assert.Nil(t, result.Error) + + wg.Wait() + + // Verify that the result is removed after retrieval + store.mu.Lock() + _, ok := store.states[jobID] + store.mu.Unlock() + assert.False(t, ok, "Result should be removed from store after Wait") +} + +func TestAsyncTaskResultStore_ConcurrentStoreAndWait(t *testing.T) { + store := NewAsyncTaskResultStore() + numTasks := 100 + + var submitWg sync.WaitGroup + var waitWg sync.WaitGroup + submitWg.Add(numTasks) + waitWg.Add(numTasks) + + results := make(chan *AsyncTaskResult, numTasks) + + // Launch goroutines to wait for results + for i := 0; i < numTasks; i++ { + jobID := store.GetNextJobID() // Pre-generate job IDs + go func(id uint64) { + defer waitWg.Done() + result, err := store.Wait(id) + assert.NoError(t, err) + results <- result + }(jobID) + } + + // Launch goroutines to store results + for i := 1; i <= numTasks; i++ { + go func(id uint64) { + defer submitWg.Done() + // Simulate random delay + time.Sleep(time.Duration(id%10) * time.Millisecond) + store.Store(&AsyncTaskResult{ + ID: id, + Result: fmt.Sprintf("result-%d", id), + Error: nil, + }) + }(uint64(i)) + } + + submitWg.Wait() + waitWg.Wait() // Ensure all waiters have completed + close(results) + + receivedResults := make(map[uint64]string) + for r := range results { + receivedResults[r.ID] = r.Result.(string) + } + + assert.Len(t, receivedResults, numTasks) + for i := 1; i <= numTasks; i++ { + assert.Equal(t, fmt.Sprintf("result-%d", i), receivedResults[uint64(i)]) + } +} + +type dummyResource struct { + closed bool +} + +func (m *dummyResource) Close() { + m.closed = true +} + +func testCreateResource() (any, error) { + return &dummyResource{}, nil +} + +func testCleanupResource(res any) { + if res == nil { + return + } + 
resource := res.(*dummyResource) + resource.Close() +} + +func TestAsyncWorkerPool_LifecycleAndTaskExecution(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + require.NotNil(t, worker) + + // Start the worker + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit a task + expectedTaskResult := "processed by CUDA (mocked)" + taskID, err := worker.Submit(func(res any) (any, error) { + // In a real scenario, this would use the real resource + // For testing, we just return a value. + // Assert that res is not nil, even if it's a dummy one. + assert.NotNil(t, res) + return expectedTaskResult, nil + }) + require.NoError(t, err) + + // Wait for the result + result, err := worker.Wait(taskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, taskID, result.ID) + assert.Equal(t, expectedTaskResult, result.Result) + assert.Nil(t, result.Error) + + // Submit another task + expectedTaskResult2 := 123 + taskID2, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return expectedTaskResult2, nil + }) + require.NoError(t, err) + + result2, err := worker.Wait(taskID2) + assert.NoError(t, err) + assert.NotNil(t, result2) + assert.Equal(t, taskID2, result2.ID) + assert.Equal(t, expectedTaskResult2, result2.Result) + assert.Nil(t, result2.Error) + + // Test a task that returns an error + expectedError := fmt.Errorf("cuda operation failed") + taskID3, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return nil, expectedError + }) + require.NoError(t, err) + + result3, err := worker.Wait(taskID3) + assert.NoError(t, err) // Error is returned in AsyncTaskResult, not as return value of Wait + assert.NotNil(t, result3) + assert.Equal(t, taskID3, result3.ID) + assert.Nil(t, result3.Result) + assert.Equal(t, expectedError, result3.Error) + + // Stop the worker + worker.Stop() + + t.Log("AsyncWorkerPool stopped. 
Further submissions would block or panic.") +} + +func TestAsyncWorkerPool_StopDuringTaskProcessing(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit a long-running task + longTaskSignal := make(chan struct{}) + longTaskID, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + <-longTaskSignal // Block until signaled + return "long task done", nil + }) + require.NoError(t, err) + + // Give the worker a moment to pick up the task + time.Sleep(50 * time.Millisecond) + + // Stop the worker while the task is running + doneStopping := make(chan struct{}) + go func() { + worker.Stop() + close(doneStopping) + }() + + // Wait for a short period to see if Stop is blocked by the task + select { + case <-doneStopping: + t.Fatal("Worker stopped too quickly, long task might not have started blocking") + case <-time.After(100 * time.Millisecond): + // This means Stop is likely waiting for the `run` goroutine, which is blocked by the task. 
+ t.Log("Worker.Stop is blocked by the long-running task as expected.") + } + + // Now unblock the long-running task + close(longTaskSignal) + + // The worker should now be able to stop + select { + case <-doneStopping: + t.Log("Worker successfully stopped after long task completed.") + case <-time.After(500 * time.Millisecond): + t.Fatal("Worker did not stop even after long task completed.") + } + + // Verify that the long task result was stored + result, err := worker.Wait(longTaskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, longTaskID, result.ID) + assert.Equal(t, "long task done", result.Result) +} + +func TestAsyncWorkerPool_MultipleSubmitsBeforeStart(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + + // Start the worker - now takes initFn + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit multiple tasks before starting the worker + numTasks := 5 + taskIDs := make([]uint64, numTasks) // Still need to collect IDs + for i := 0; i < numTasks; i++ { + var err error + taskIDs[i], err = worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return fmt.Sprintf("result-%d", i), nil + }) + require.NoError(t, err) + } + + // Start the worker + // worker.Start() // Already started above, remove duplicate + + // Wait for all results + for i, id := range taskIDs { + result, err := worker.Wait(id) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, id, result.ID) + assert.Equal(t, fmt.Sprintf("result-%d", i), result.Result) + } + + worker.Stop() +} + +func TestAsyncWorkerPool_GracefulShutdown(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + var wg sync.WaitGroup + numTasks := 10 + results := make(chan *AsyncTaskResult, numTasks) // Changed type + + // Submit tasks + for i := 0; i < numTasks; i++ { + wg.Add(1) + // 
Capture loop index for the anonymous function + loopIndex := i + + var submitErr error + taskID, submitErr := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + time.Sleep(10 * time.Millisecond) // Simulate work + return fmt.Sprintf("final-result-%d", loopIndex), nil // Use captured loop index + }) + require.NoError(t, submitErr) + + go func(id uint64) { + defer wg.Done() + r, waitErr := worker.Wait(id) + assert.NoError(t, waitErr) + results <- r + }(taskID) + } + + // Give some time for tasks to be submitted and processed + time.Sleep(50 * time.Millisecond) + + // Stop the worker + worker.Stop() + + // All tasks submitted before Stop should complete and their results should be retrievable + wg.Wait() + close(results) + + assert.Len(t, results, numTasks) + for r := range results { + assert.Contains(t, r.Result.(string), "final-result-") + } + + // Ensure new tasks cannot be submitted after stop + _, err := worker.Submit(func(res any) (any, error) { // Use := for first declaration of err in this scope + return "should not be processed", nil + }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") +} + +func TestAsyncWorkerPool_SignalTermination(t *testing.T) { + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) // Use 1 thread for easier control and observation + require.NotNil(t, worker) + + worker.Start(nil, func(_ any) error { return nil }) + + // Submit a task that will complete after the signal, to ensure graceful processing + taskDone := make(chan struct{}) + taskID1, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + <-taskDone // Wait for signal to complete + return "task1 processed", nil + }) + require.NoError(t, err) + + // Submit a second quick task that should complete before or around the signal + taskID2, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return "task2 processed", nil + }) + require.NoError(t, err) + + // Give the 
worker a moment to pick up the tasks + time.Sleep(50 * time.Millisecond) + + // Simulate SIGTERM by sending to the signal channel + t.Log("Simulating SIGTERM to AsyncWorkerPool") + worker.sigc <- syscall.SIGTERM + + // Allow some time for the signal handler to process and call worker.Stop() + time.Sleep(100 * time.Millisecond) + + // Unblock the long-running task to allow it to finish and the worker to fully stop + close(taskDone) + + // Wait for all worker goroutines to finish + // The worker.Stop() method, which is called by the signal handler, + // internally waits for worker.wg.Wait(). + // So, we can verify by checking if new submissions fail and if old tasks results are available. + + // Check if previously submitted tasks completed + result1, err := worker.Wait(taskID1) + assert.NoError(t, err) + assert.NotNil(t, result1) + assert.Equal(t, taskID1, result1.ID) + assert.Equal(t, "task1 processed", result1.Result) + + result2, err := worker.Wait(taskID2) + assert.NoError(t, err) + assert.NotNil(t, result2) + assert.Equal(t, taskID2, result2.ID) + assert.Equal(t, "task2 processed", result2.Result) + + // Attempt to submit a new task after termination. It should fail. 
+ _, err = worker.Submit(func(res any) (any, error) { + return "should not be processed", nil + }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") +} + +func TestAsyncWorkerPool_GetFirstError(t *testing.T) { + + var err error // Explicitly declare err here + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) + assert.Nil(t, worker.GetFirstError(), "GetFirstError should be nil initially") + + // Trigger an error in initFn, which will be pushed to w.errch + expectedErr1 := fmt.Errorf("simulated init error 1") + initFn1 := func(resource any) error { + return expectedErr1 + } + stopFn := func(_ any) error { return nil } + + worker.Start(initFn1, stopFn) + + // Give the `run` goroutine and the signal handler a moment to process initFn and store the first error. + time.Sleep(50 * time.Millisecond) + + // GetFirstError should now return the expected error + assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should return the first recorded error") + + // Submit a task that causes an error (this error won't be saved as firstError via w.errch) + // This ensures that only errors propagated through w.errch are considered. + _, err = worker.Submit(func(res any) (any, error) { // Use = for assignment + assert.NotNil(t, res) + return nil, fmt.Errorf("task error, should not affect GetFirstError()") + }) + require.Error(t, err) // Expect an error because the worker should be stopped + assert.Contains(t, err.Error(), "worker is stopped") + + // Give some time for the task to be processed, if it affects anything + time.Sleep(50 * time.Millisecond) + + // Ensure GetFirstError remains the same even if other errors (from tasks) occur. + assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should not change after the first error is set") + + worker.Stop() + + // After stop, GetFirstError should still be the same. 
+ assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should retain the first error after stopping") +} + +func TestAsyncWorkerPool_MultipleStopCalls(t *testing.T) { + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) // Use 1 thread + require.NotNil(t, worker) + + worker.Start(nil, func(_ any) error { return nil }) + + // Call Stop multiple times from the main goroutine + worker.Stop() + worker.Stop() + worker.Stop() + + // Call Stop from another goroutine + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + worker.Stop() + }() + wg.Wait() + + // Ensure no panics occurred during multiple Stop calls + // (Go's testing framework will catch panics) + + // Optionally, try submitting a task again to ensure it's truly stopped + _, err := worker.Submit(func(res any) (any, error) { return nil, nil }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") + + t.Log("Successfully called Stop multiple times without panic.") +} + +func TestAsyncWorkerPool_NilCallbacks(t *testing.T) { + worker := NewAsyncWorkerPool(2, nil, nil) + require.NotNil(t, worker) + + worker.Start(nil, nil) + + expectedResult := "no resource needed" + taskID, err := worker.Submit(func(res any) (any, error) { + assert.Nil(t, res) + return expectedResult, nil + }) + require.NoError(t, err) + + result, err := worker.Wait(taskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, expectedResult, result.Result) + + worker.Stop() +} diff --git a/pkg/common/concurrent/executor.go b/pkg/common/concurrent/executor.go index 1cc21cf82cdaf..0eac95c6f5a4c 100644 --- a/pkg/common/concurrent/executor.go +++ b/pkg/common/concurrent/executor.go @@ -37,6 +37,14 @@ func (e ThreadPoolExecutor) Execute( nitems int, fn func(ctx context.Context, thread_id int, start, end int) error) (err error) { + if nitems <= 0 { + return nil + } + + if e.nthreads <= 1 { + return fn(ctx, 0, 0, nitems) + } + g, ctx := 
errgroup.WithContext(ctx) q := nitems / e.nthreads diff --git a/pkg/common/concurrent/executor_test.go b/pkg/common/concurrent/executor_test.go index 61f4856f15e88..50ef97b2df16e 100644 --- a/pkg/common/concurrent/executor_test.go +++ b/pkg/common/concurrent/executor_test.go @@ -87,3 +87,40 @@ func TestExecutorDistribution(t *testing.T) { require.Equal(t, 9, count) } + +func TestExecutorSingleThread(t *testing.T) { + ctx := context.Background() + nitems := 10 + nthreads := 1 + + e := NewThreadPoolExecutor(nthreads) + + called := false + err := e.Execute(ctx, nitems, func(ctx context.Context, thread_id int, start, end int) error { + called = true + require.Equal(t, 0, thread_id) + require.Equal(t, 0, start) + require.Equal(t, nitems, end) + return nil + }) + + require.NoError(t, err) + require.True(t, called) +} + +func TestExecutorZeroItems(t *testing.T) { + ctx := context.Background() + nitems := 0 + nthreads := 4 + + e := NewThreadPoolExecutor(nthreads) + + called := false + err := e.Execute(ctx, nitems, func(ctx context.Context, thread_id int, start, end int) error { + called = true + return nil + }) + + require.NoError(t, err) + require.False(t, called) +} diff --git a/pkg/cuvs/adhoc.go b/pkg/cuvs/adhoc.go new file mode 100644 index 0000000000000..6ca8e4c2a11fa --- /dev/null +++ b/pkg/cuvs/adhoc.go @@ -0,0 +1,74 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package cuvs + +/* +#include "../../cgo/cuvs/adhoc_c.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "unsafe" +) + +// AdhocBruteForceSearch performs an ad-hoc brute-force search on GPU without using a worker thread. +func AdhocBruteForceSearch[T VectorType]( + dataset []T, + nRows uint64, + dim uint32, + queries []T, + nQueries uint64, + limit uint32, + metric DistanceType, + deviceID int, +) ([]int64, []float32, error) { + if len(dataset) == 0 || len(queries) == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("empty dataset or queries") + } + + qtype := GetQuantization[T]() + + neighbors := make([]int64, nQueries*uint64(limit)) + distances := make([]float32, nQueries*uint64(limit)) + + var errmsg *C.char + C.gpu_adhoc_brute_force_search( + unsafe.Pointer(&dataset[0]), + C.uint64_t(nRows), + C.uint32_t(dim), + unsafe.Pointer(&queries[0]), + C.uint64_t(nQueries), + C.uint32_t(limit), + C.distance_type_t(metric), + C.quantization_t(qtype), + C.int(deviceID), + (*C.int64_t)(unsafe.Pointer(&neighbors[0])), + (*C.float)(unsafe.Pointer(&distances[0])), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + + return neighbors, distances, nil +} diff --git a/pkg/cuvs/adhoc_test.go b/pkg/cuvs/adhoc_test.go new file mode 100644 index 0000000000000..dec4b48fa8f94 --- /dev/null +++ b/pkg/cuvs/adhoc_test.go @@ -0,0 +1,60 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cuvs + +import ( + "testing" +) + +func TestAdhocBruteForceSearch(t *testing.T) { + dim := uint32(3) + nRows := uint64(2) + nQueries := uint64(1) + limit := uint32(1) + + dataset := []float32{ + 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, + } + queries := []float32{ + 1.1, 2.1, 3.1, + } + + neighbors, distances, err := AdhocBruteForceSearch[float32]( + dataset, nRows, dim, + queries, nQueries, limit, + L2Expanded, 0, + ) + + if err != nil { + t.Fatalf("AdhocBruteForceSearch failed: %v", err) + } + + if len(neighbors) != int(nQueries*uint64(limit)) { + t.Errorf("Expected %d neighbors, got %d", nQueries*uint64(limit), len(neighbors)) + } + + if neighbors[0] != 0 { + t.Errorf("Expected neighbor 0, got %d", neighbors[0]) + } + + if distances[0] > 0.1 { + t.Errorf("Expected small distance, got %f", distances[0]) + } +} diff --git a/pkg/cuvs/brute_force.go b/pkg/cuvs/brute_force.go new file mode 100644 index 0000000000000..ea3914fd8d855 --- /dev/null +++ b/pkg/cuvs/brute_force.go @@ -0,0 +1,317 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/brute_force_c.h" +#include +*/ +import "C" +import ( + "runtime" + "unsafe" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuBruteForce represents the C++ gpu_brute_force_t object +type GpuBruteForce[T VectorType] struct { + cIndex C.gpu_brute_force_c +} + +// NewGpuBruteForce creates a new GpuBruteForce instance +func NewGpuBruteForce[T VectorType](dataset []T, count_vectors uint64, dimension uint32, metric DistanceType, nthread uint32, device_id int) (*GpuBruteForce[T], error) { + if len(dataset) == 0 || count_vectors == 0 || dimension == 0 { + return nil, moerr.NewInternalErrorNoCtx("dataset, count_vectors, and dimension cannot be zero") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cIndex := C.gpu_brute_force_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count_vectors), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.uint32_t(nthread), + C.int(device_id), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIndex == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuBruteForce") + } + return &GpuBruteForce[T]{cIndex: cIndex}, nil +} + +// NewGpuBruteForceEmpty creates a new GpuBruteForce instance with pre-allocated buffer but no data yet. 
+func NewGpuBruteForceEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + nthread uint32, deviceID int) (*GpuBruteForce[T], error) { + + qtype := GetQuantization[T]() + var errmsg *C.char + + cBruteForce := C.gpu_brute_force_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.uint32_t(nthread), + C.int(deviceID), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cBruteForce == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuBruteForce") + } + + return &GpuBruteForce[T]{cIndex: cBruteForce}, nil +} + +// Start initializes the worker and resources +func (gb *GpuBruteForce[T]) Start() error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + C.gpu_brute_force_start(gb.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the dataset loading to GPU +func (gb *GpuBruteForce[T]) Build() error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + C.gpu_brute_force_build(gb.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gb *GpuBruteForce[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_brute_force_add_chunk( + gb.cIndex, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly conversion if needed. +func (gb *GpuBruteForce[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gb.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_brute_force_add_chunk_float( + gb.cIndex, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a search operation +func (gb *GpuBruteForce[T]) Search(queries []T, num_queries uint64, query_dimension uint32, limit uint32) ([]int64, []float32, error) { + if gb.cIndex == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(queries) == 0 || num_queries == 0 || query_dimension == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("queries, num_queries, and query_dimension cannot be zero") + } + + var errmsg *C.char + cResult := C.gpu_brute_force_search( + gb.cIndex, + unsafe.Pointer(&queries[0]), + C.uint64_t(num_queries), + C.uint32_t(query_dimension), + C.uint32_t(limit), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := 
C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + if cResult == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + // Allocate slices for results + neighbors := make([]int64, num_queries*uint64(limit)) + distances := make([]float32, num_queries*uint64(limit)) + + C.gpu_brute_force_get_results(cResult, C.uint64_t(num_queries), C.uint32_t(limit), (*C.int64_t)(unsafe.Pointer(&neighbors[0])), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_brute_force_free_search_result(cResult) + + return neighbors, distances, nil +} + +// SearchFloat performs a search operation with float32 queries +func (gb *GpuBruteForce[T]) SearchFloat(queries []float32, num_queries uint64, query_dimension uint32, limit uint32) ([]int64, []float32, error) { + if gb.cIndex == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(queries) == 0 || num_queries == 0 || query_dimension == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("queries, num_queries, and query_dimension cannot be zero") + } + + var errmsg *C.char + cResult := C.gpu_brute_force_search_float( + gb.cIndex, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(num_queries), + C.uint32_t(query_dimension), + C.uint32_t(limit), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + if cResult == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + // Allocate slices for results + neighbors := make([]int64, num_queries*uint64(limit)) + distances := make([]float32, num_queries*uint64(limit)) + + C.gpu_brute_force_get_results(cResult, C.uint64_t(num_queries), C.uint32_t(limit), (*C.int64_t)(unsafe.Pointer(&neighbors[0])), 
(*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_brute_force_free_search_result(cResult) + + return neighbors, distances, nil +} + +// Cap returns the capacity of the index buffer +func (gb *GpuBruteForce[T]) Cap() uint32 { + if gb.cIndex == nil { + return 0 + } + return uint32(C.gpu_brute_force_cap(gb.cIndex)) +} + +// Len returns current number of vectors in index +func (gb *GpuBruteForce[T]) Len() uint32 { + if gb.cIndex == nil { + return 0 + } + return uint32(C.gpu_brute_force_len(gb.cIndex)) +} + +// Info returns detailed information about the index as a JSON string. +func (gb *GpuBruteForce[T]) Info() (string, error) { + if gb.cIndex == nil { + return "", moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_brute_force_info(gb.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// Destroy frees the C++ GpuBruteForce instance +func (gb *GpuBruteForce[T]) Destroy() error { + if gb.cIndex == nil { + return nil + } + var errmsg *C.char + C.gpu_brute_force_destroy(gb.cIndex, unsafe.Pointer(&errmsg)) + gb.cIndex = nil // Mark as destroyed + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} diff --git a/pkg/cuvs/brute_force_test.go b/pkg/cuvs/brute_force_test.go new file mode 100644 index 0000000000000..2ebbe0261024c --- /dev/null +++ b/pkg/cuvs/brute_force_test.go @@ -0,0 +1,220 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "math/rand" + "testing" +) + +func TestGpuBruteForce(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + index, err := NewGpuBruteForce[float32](dataset, n_vectors, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create GpuBruteForce: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Failed to load GpuBruteForce: %v", err) + } + + queries := []float32{1.0, 1.0, 100.0, 100.0} + neighbors, distances, err := index.Search(queries, 2, dimension, 1) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", neighbors, distances) + if neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", neighbors[0]) + } + if neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", neighbors[1]) + } +} + +func TestGpuBruteForceChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + + // Create empty index (target type half) + index, err := NewGpuBruteForceEmpty[Float16](totalCount, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create GpuBruteForceEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + if index.Cap() != 
uint32(totalCount) { + t.Errorf("Expected capacity %d, got %d", totalCount, index.Cap()) + } + if index.Len() != 0 { + t.Errorf("Expected length 0, got %d", index.Len()) + } + + // Add data in chunks (from float32, triggers on-the-fly conversion to half) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + + expectedLen := uint32(i + chunkSize) + if index.Len() != expectedLen { + t.Errorf("Expected length %d, got %d", expectedLen, index.Len()) + } + } + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search + query := make([]Float16, dimension) + for i := range query { + query[i] = Float16(1) // matches first chunk + } + neighbors, _, err := index.Search(query, 1, dimension, 1) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + if neighbors[0] < 0 || neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", neighbors[0]) + } +} + +func TestGpuBruteForceFloat16(t *testing.T) { + dimension := uint32(2) + count := uint64(2) + dataset := []float32{1.0, 1.0, 2.0, 2.0} + + // Convert to Float16 on GPU + hDataset := make([]Float16, len(dataset)) + err := GpuConvertF32ToF16(dataset, hDataset, 0) + if err != nil { + t.Fatalf("Failed to convert dataset to F16: %v", err) + } + + index, err := NewGpuBruteForce(hDataset, count, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create F16 GpuBruteForce: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Failed to load: %v", err) + } + + queries := []float32{1.0, 1.0} + hQueries := make([]Float16, len(queries)) + GpuConvertF32ToF16(queries, hQueries, 0) + + neighbors, 
distances, err := index.Search(hQueries, 1, dimension, 1) + if err != nil { + t.Fatalf("Failed to search F16: %v", err) + } + + if neighbors[0] != 0 { + t.Errorf("Expected first neighbor 0, got %d", neighbors[0]) + } + if distances[0] != 0.0 { + t.Errorf("Expected distance 0.0, got %f", distances[0]) + } +} + +func BenchmarkGpuAddChunkAndSearchBruteForceF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + // Use Float16 as internal type + index, err := NewGpuBruteForceEmpty[Float16](uint64(totalCount), dimension, L2Expanded, 8, 0) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, _, err := index.SearchFloat(queries, 1, dimension, 10) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + neighbors, _, err := index.SearchFloat(queries, numQueries, dimension, limit) + if err != nil { + return nil, err + } + return neighbors, nil + }) +} diff --git a/pkg/cuvs/cagra.go b/pkg/cuvs/cagra.go new file mode 100644 index 
0000000000000..7de30613dc299 --- /dev/null +++ b/pkg/cuvs/cagra.go @@ -0,0 +1,639 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/cagra_c.h" +#include +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// GpuCagra represents the C++ gpu_cagra_t object. +type GpuCagra[T VectorType] struct { + cCagra C.gpu_cagra_c + dimension uint32 + nthread uint32 + distMode DistributionMode + useBatching bool +} + +// SetUseBatching enables or disables dynamic batching for search operations. +func (gi *GpuCagra[T]) SetUseBatching(enable bool) error { + gi.useBatching = enable + if gi.cCagra != nil { + var errmsg *C.char + C.gpu_cagra_set_use_batching(gi.cCagra, C.bool(enable), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + return nil +} + +// NewGpuCagra creates a new GpuCagra instance from a dataset. 
+func NewGpuCagra[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuCagra") + } + + return &GpuCagra[T]{ + cCagra: cCagra, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuCagraFromFile creates a new GpuCagra instance by loading from a file. 
+func NewGpuCagraFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuCagra from file") + } + + return &GpuCagra[T]{ + cCagra: cCagra, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// Destroy frees the C++ gpu_cagra_t instance +func (gi *GpuCagra[T]) Destroy() error { + if gi.cCagra == nil { + return nil + } + var errmsg *C.char + C.gpu_cagra_destroy(gi.cCagra, unsafe.Pointer(&errmsg)) + gi.cCagra = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gi *GpuCagra[T]) Start() error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + + if gi.distMode == 
Replicated && gi.nthread > 1 { + var errmsg *C.char + C.gpu_cagra_set_per_thread_device(gi.cCagra, C.bool(true), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + + if gi.useBatching { + if err := gi.SetUseBatching(true); err != nil { + return err + } + } + + var errmsg *C.char + C.gpu_cagra_start(gi.cCagra, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the build or file loading process +func (gi *GpuCagra[T]) Build() error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + C.gpu_cagra_build(gi.cCagra, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// NewGpuCagraEmpty creates a new GpuCagra instance with pre-allocated buffer but no data yet. 
+func NewGpuCagraEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create empty GpuCagra") + } + + return &GpuCagra[T]{ + cCagra: cCagra, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gi *GpuCagra[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_add_chunk( + gi.cCagra, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly quantization if needed. +func (gi *GpuCagra[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_add_chunk_float( + gi.cCagra, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the scalar quantizer (if T is 1-byte) +func (gi *GpuCagra[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_train_quantizer( + gi.cCagra, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuCagra[T]) SetQuantizer(min, max float32) error { + if 
gi.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + + var errmsg *C.char + C.gpu_cagra_set_quantizer( + gi.cCagra, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuCagra[T]) GetQuantizer() (float32, float32, error) { + if gi.cCagra == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_cagra_get_quantizer( + gi.cCagra, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// Save serializes the index to a file +func (gc *GpuCagra[T]) Save(filename string) error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_cagra_save(gc.cCagra, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gc *GpuCagra[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp CagraSearchParams) (SearchResult, error) { + if gc.cCagra == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResult{}, nil + } + + var errmsg *C.char + cSP := C.cagra_search_params_t{ + itopk_size: C.size_t(sp.ItopkSize), + search_width: C.size_t(sp.SearchWidth), + } + + res := C.gpu_cagra_search( 
+ gc.cCagra, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResult{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]uint32, totalElements) + distances := make([]float32, totalElements) + + C.gpu_cagra_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.uint32_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_cagra_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_cagra_free_result(res.result_ptr) + + return SearchResult{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// SearchFloat performs a K-Nearest Neighbor search with float32 queries +func (gc *GpuCagra[T]) SearchFloat(queries []float32, numQueries uint64, dimension uint32, limit uint32, sp CagraSearchParams) (SearchResult, error) { + if gc.cCagra == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResult{}, nil + } + + var errmsg *C.char + cSP := C.cagra_search_params_t{ + itopk_size: C.size_t(sp.ItopkSize), + search_width: C.size_t(sp.SearchWidth), + } + + res := C.gpu_cagra_search_float( + gc.cCagra, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResult{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + 
return SearchResult{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]uint32, totalElements) + distances := make([]float32, totalElements) + + C.gpu_cagra_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.uint32_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_cagra_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_cagra_free_result(res.result_ptr) + + return SearchResult{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Cap returns the capacity of the index buffer +func (gc *GpuCagra[T]) Cap() uint32 { + if gc.cCagra == nil { + return 0 + } + return uint32(C.gpu_cagra_cap(gc.cCagra)) +} + +// Len returns current number of vectors in index +func (gc *GpuCagra[T]) Len() uint32 { + if gc.cCagra == nil { + return 0 + } + return uint32(C.gpu_cagra_len(gc.cCagra)) +} + +// Info returns detailed information about the index as a JSON string. 
+func (gc *GpuCagra[T]) Info() (string, error) { + if gc.cCagra == nil { + return "", moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_cagra_info(gc.cCagra, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// Extend adds more vectors to the index (single-GPU only) +func (gc *GpuCagra[T]) Extend(additionalData []T, numVectors uint64) error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(additionalData) == 0 || numVectors == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_extend( + gc.cCagra, + unsafe.Pointer(&additionalData[0]), + C.uint64_t(numVectors), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(additionalData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Merge combines multiple single-GPU GpuCagra indices into a new one. 
+func MergeGpuCagra[T VectorType](indices []*GpuCagra[T], nthread uint32, devices []int) (*GpuCagra[T], error) { + if len(indices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("no indices to merge") + } + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + cIndices := make([]C.gpu_cagra_c, len(indices)) + for i, idx := range indices { + cIndices[i] = idx.cCagra + } + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + var errmsg *C.char + cCagra := C.gpu_cagra_merge( + &cIndices[0], + C.int(len(indices)), + C.uint32_t(nthread), + &cDevices[0], + C.int(len(devices)), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cIndices) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to merge GpuCagra indices") + } + + return &GpuCagra[T]{cCagra: cCagra, dimension: indices[0].dimension}, nil +} + +// SearchResult contains the neighbors and distances from a search. +type SearchResult struct { + Neighbors []uint32 + Distances []float32 +} diff --git a/pkg/cuvs/cagra_test.go b/pkg/cuvs/cagra_test.go new file mode 100644 index 0000000000000..fb9a88c470e5d --- /dev/null +++ b/pkg/cuvs/cagra_test.go @@ -0,0 +1,714 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "fmt" + "math/rand" + "os" + "testing" +) + +func TestGpuCagra(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Failed to load/build GpuCagra: %v", err) + } + + queries := []float32{1.0, 1.0, 100.0, 100.0} + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } + if result.Neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", result.Neighbors[1]) + } +} + +func TestGpuCagraSaveLoad(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + 
if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + index.Build() + + filename := "test_cagra.idx" + err = index.Save(filename) + if err != nil { + t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuCagraFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra from file: %v", err) + } + defer index2.Destroy() + + if err := index2.Start(); err != nil { + t.Fatalf("index2 Start failed: %v", err) + } + err = index2.Build() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + queries := []float32{0.0, 0.0} + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index2.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuShardedCagra(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for sharded CAGRA test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultCagraSearchParams() + sp.ItopkSize 
= 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func TestGpuCagraChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + + // Create empty index (target type int8) + index, err := NewGpuCagraEmpty[int8](totalCount, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagraEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Add data in chunks (from float32, triggers on-the-fly quantization) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) // 1.0 for first chunk, 101.0 for second + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + } + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search for first chunk + query1 := make([]int8, dimension) + for i := range query1 { + query1[i] = -128 // matches first chunk (1.0) + } + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result1, err := index.Search(query1, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 1 failed: %v", err) + } + if result1.Neighbors[0] < 0 || result1.Neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", result1.Neighbors[0]) + } + + // Search for second chunk + query2 := make([]int8, dimension) + for i := range query2 { + query2[i] = 127 // matches second chunk (101.0) + } + result2, 
err := index.Search(query2, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 2 failed: %v", err) + } + if result2.Neighbors[0] < 50 || result2.Neighbors[0] >= 100 { + t.Errorf("Expected neighbor from second chunk (50-99), got %d", result2.Neighbors[0]) + } +} + +func TestGpuCagraExtend(t *testing.T) { + dimension := uint32(16) + count := uint64(100) + dataset := make([]float32, count*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + defer index.Destroy() + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + index.Build() + + extra := make([]float32, 10*dimension) + for i := range extra { + extra[i] = 1000.0 + } + err = index.Extend(extra, 10) + if err != nil { + t.Fatalf("Extend failed: %v", err) + } + + queries := make([]float32, dimension) + for i := range queries { + queries[i] = 1000.0 + } + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] < 100 { + t.Errorf("Expected neighbor from extended data, got %d", result.Neighbors[0]) + } +} + +func TestGpuCagraMerge(t *testing.T) { + dimension := uint32(16) + count := uint64(200) + + // Cluster 1: values around 0 + ds1 := make([]float32, count*uint64(dimension)) + for i := range ds1 { + ds1[i] = float32(i % 10) + } + // Cluster 2: values around 1000 + ds2 := make([]float32, count*uint64(dimension)) + for i := range ds2 { + ds2[i] = float32(1000 + (i % 10)) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + + idx1, err := 
NewGpuCagra[float32](ds1, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create idx1: %v", err) + } + idx2, err := NewGpuCagra[float32](ds2, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create idx2: %v", err) + } + if err := idx1.Start(); err != nil { + t.Fatalf("idx1 Start failed: %v", err) + } + idx1.Build() + if err := idx2.Start(); err != nil { + t.Fatalf("idx2 Start failed: %v", err) + } + idx2.Build() + defer idx1.Destroy() + defer idx2.Destroy() + + merged, err := MergeGpuCagra([]*GpuCagra[float32]{idx1, idx2}, 1, devices) + if err != nil { + t.Fatalf("Merge failed: %v", err) + } + defer merged.Destroy() + + if err := merged.Start(); err != nil { + t.Fatalf("merged Start failed: %v", err) + } + + // Query near Cluster 2 + queries := make([]float32, dimension) + for i := range queries { + queries[i] = 1000.0 + } + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := merged.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + // Result should be from second index (index >= 200) + if result.Neighbors[0] < 200 { + t.Errorf("Expected neighbor from second index (>=200), got %d", result.Neighbors[0]) + } +} + +func TestGpuReplicatedCagra(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for replicated CAGRA test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Replicated) + if err != nil { + t.Fatalf("Failed to create 
replicated CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load replicated failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search replicated failed: %v", err) + } + t.Logf("Replicated Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func BenchmarkGpuShardedCagra(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for sharded CAGRA benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Sharded) + if err != nil { + b.Fatalf("Failed to create sharded CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.Search(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: 
%v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + }) + } +} + +func BenchmarkGpuSingleCagra(b *testing.B) { + devices := []int{0} + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create single CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.Search(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + }) + } +} + +func 
BenchmarkGpuReplicatedCagra(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for replicated CAGRA benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Replicated) + if err != nil { + b.Fatalf("Failed to create replicated CAGRA: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.Search(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + }) + } +} + +func BenchmarkGpuAddChunkAndSearchCagraF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } 
+ + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + // Use Float16 as internal type + index, err := NewGpuCagraEmpty[Float16](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} + +func BenchmarkGpuAddChunkAndSearchCagraInt8(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + // Use int8 as internal type + index, err := 
NewGpuCagraEmpty[int8](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultCagraSearchParams() + sp.ItopkSize = 128 + sp.SearchWidth = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]uint32, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} diff --git a/pkg/cuvs/distance.go b/pkg/cuvs/distance.go new file mode 100644 index 0000000000000..2f29921b9212e --- /dev/null +++ b/pkg/cuvs/distance.go @@ -0,0 +1,73 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cuvs + +/* +#include "../../cgo/cuvs/distance_c.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// PairwiseDistance performs a pairwise distance calculation on GPU. +func PairwiseDistance[T VectorType]( + x []T, + nX uint64, + y []T, + nY uint64, + dim uint32, + metric DistanceType, + deviceID int, +) ([]float32, error) { + if len(x) == 0 || len(y) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty x or y") + } + + qtype := GetQuantization[T]() + dist := make([]float32, nX*nY) + + var errmsg *C.char + C.gpu_pairwise_distance( + unsafe.Pointer(&x[0]), + C.uint64_t(nX), + unsafe.Pointer(&y[0]), + C.uint64_t(nY), + C.uint32_t(dim), + C.distance_type_t(metric), + C.quantization_t(qtype), + C.int(deviceID), + (*C.float)(unsafe.Pointer(&dist[0])), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(x) + runtime.KeepAlive(y) + runtime.KeepAlive(dist) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + return dist, nil +} diff --git a/pkg/cuvs/distance_test.go b/pkg/cuvs/distance_test.go new file mode 100644 index 0000000000000..de63ac79f6f79 --- /dev/null +++ b/pkg/cuvs/distance_test.go @@ -0,0 +1,66 @@ +//go:build gpu + +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package cuvs + +import ( + "testing" +) + +func TestPairwiseDistance(t *testing.T) { + dim := uint32(3) + nX := uint64(2) + nY := uint64(2) + + x := []float32{ + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + } + y := []float32{ + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + } + + dist, err := PairwiseDistance[float32]( + x, nX, + y, nY, + dim, + L2Expanded, 0, + ) + + if err != nil { + t.Fatalf("PairwiseDistance failed: %v", err) + } + + if len(dist) != int(nX*nY) { + t.Errorf("Expected %d distances, got %d", nX*nY, len(dist)) + } + + // Expected results for L2Squared: + // dist[0,0] = (1-1)^2 + (0-0)^2 + (0-0)^2 = 0 + // dist[0,1] = (1-0)^2 + (0-1)^2 + (0-0)^2 = 2 + // dist[1,0] = (0-1)^2 + (1-0)^2 + (0-0)^2 = 2 + // dist[1,1] = (0-0)^2 + (1-1)^2 + (0-0)^2 = 0 + + expected := []float32{0.0, 2.0, 2.0, 0.0} + for i := 0; i < len(expected); i++ { + if dist[i] != expected[i] { + t.Errorf("Expected dist[%d] = %f, got %f", i, expected[i], dist[i]) + } + } +} diff --git a/pkg/cuvs/get_centers_test.go b/pkg/cuvs/get_centers_test.go new file mode 100644 index 0000000000000..eedadfeeac28f --- /dev/null +++ b/pkg/cuvs/get_centers_test.go @@ -0,0 +1,137 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "testing" +) + +func testIvfFlatGetCenters[T VectorType](t *testing.T, name string) { + t.Run(name, func(t *testing.T) { + dimension := uint32(16) + n_vectors := uint64(1000) + dataset := make([]T, n_vectors*uint64(dimension)) + // Fill some data + for i := range dataset { + dataset[i] = T(i % 127) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 16 + index, err := NewGpuIvfFlat[T](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + t.Fatalf("Build failed: %v", err) + } + + nLists := index.GetNList() + centers, err := index.GetCenters(nLists) + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + + expectedLen := int(nLists * dimension) + if len(centers) != expectedLen { + t.Errorf("Expected centers length %d, got %d", expectedLen, len(centers)) + } + + // Check that centers are not all zeros (simple sanity check) + allZeros := true + for _, v := range centers { + if v != 0 { + allZeros = false + break + } + } + if allZeros { + t.Errorf("Centers are all zeros") + } + }) +} + +func TestIvfFlatGetCentersAllTypes(t *testing.T) { + testIvfFlatGetCenters[float32](t, "float32") + testIvfFlatGetCenters[Float16](t, "Float16") + // testIvfFlatGetCenters[int8](t, "int8") + // testIvfFlatGetCenters[uint8](t, "uint8") +} + +func testIvfPqGetCenters[T 
VectorType](t *testing.T, name string) { + t.Run(name, func(t *testing.T) { + dimension := uint32(16) + n_vectors := uint64(1000) + dataset := make([]T, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = T(i % 127) + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 16 + bp.M = 8 + index, err := NewGpuIvfPq[T](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + t.Fatalf("Build failed: %v", err) + } + + centers, err := index.GetCenters() + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + + nLists := index.GetNList() + rotDim := index.GetRotDim() + expectedLen := int(nLists * rotDim) + if len(centers) != expectedLen { + t.Errorf("Expected centers length %d, got %d", expectedLen, len(centers)) + } + + allZeros := true + for _, v := range centers { + if v != 0 { + allZeros = false + break + } + } + if allZeros { + t.Errorf("Centers are all zeros") + } + }) +} + +func TestIvfPqGetCentersAllTypes(t *testing.T) { + testIvfPqGetCenters[float32](t, "float32") + testIvfPqGetCenters[Float16](t, "Float16") + // testIvfPqGetCenters[int8](t, "int8") + // testIvfPqGetCenters[uint8](t, "uint8") +} diff --git a/pkg/cuvs/helper.go b/pkg/cuvs/helper.go new file mode 100644 index 0000000000000..1b00267be4d67 --- /dev/null +++ b/pkg/cuvs/helper.go @@ -0,0 +1,256 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/helper.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// DistanceType maps to C.distance_type_t +type DistanceType C.distance_type_t + +const ( + L2Expanded DistanceType = C.DistanceType_L2Expanded + L2SqrtExpanded DistanceType = C.DistanceType_L2SqrtExpanded + CosineExpanded DistanceType = C.DistanceType_CosineExpanded + L1 DistanceType = C.DistanceType_L1 + L2Unexpanded DistanceType = C.DistanceType_L2Unexpanded + L2SqrtUnexpanded DistanceType = C.DistanceType_L2SqrtUnexpanded + InnerProduct DistanceType = C.DistanceType_InnerProduct + Linf DistanceType = C.DistanceType_Linf + Canberra DistanceType = C.DistanceType_Canberra + LpUnexpanded DistanceType = C.DistanceType_LpUnexpanded + CorrelationExpanded DistanceType = C.DistanceType_CorrelationExpanded + JaccardExpanded DistanceType = C.DistanceType_JaccardExpanded + HellingerExpanded DistanceType = C.DistanceType_HellingerExpanded + Haversine DistanceType = C.DistanceType_Haversine + BrayCurtis DistanceType = C.DistanceType_BrayCurtis + JensenShannon DistanceType = C.DistanceType_JensenShannon + HammingUnexpanded DistanceType = C.DistanceType_HammingUnexpanded + KLDivergence DistanceType = C.DistanceType_KLDivergence + RusselRaoExpanded DistanceType = C.DistanceType_RusselRaoExpanded + DiceExpanded DistanceType = C.DistanceType_DiceExpanded + BitwiseHamming DistanceType = C.DistanceType_BitwiseHamming + Precomputed DistanceType = C.DistanceType_Precomputed + // Aliases + 
CosineSimilarity DistanceType = C.DistanceType_CosineSimilarity + Jaccard DistanceType = C.DistanceType_Jaccard + Hamming DistanceType = C.DistanceType_Hamming + Unknown DistanceType = C.DistanceType_Unknown +) + +// Quantization maps to C.quantization_t +type Quantization C.quantization_t + +const ( + F32 Quantization = C.Quantization_F32 + F16 Quantization = C.Quantization_F16 + INT8 Quantization = C.Quantization_INT8 + UINT8 Quantization = C.Quantization_UINT8 +) + +// DistributionMode maps to C.distribution_mode_t +type DistributionMode C.distribution_mode_t + +const ( + SingleGpu DistributionMode = C.DistributionMode_SINGLE_GPU + Sharded DistributionMode = C.DistributionMode_SHARDED + Replicated DistributionMode = C.DistributionMode_REPLICATED +) + +// CagraBuildParams maps to C.cagra_build_params_t +type CagraBuildParams struct { + IntermediateGraphDegree uint64 + GraphDegree uint64 + AttachDatasetOnBuild bool +} + +func DefaultCagraBuildParams() CagraBuildParams { + return CagraBuildParams{ + IntermediateGraphDegree: 128, + GraphDegree: 64, + AttachDatasetOnBuild: true, + } +} + +// CagraSearchParams maps to C.cagra_search_params_t +type CagraSearchParams struct { + ItopkSize uint64 + SearchWidth uint64 +} + +func DefaultCagraSearchParams() CagraSearchParams { + return CagraSearchParams{ + ItopkSize: 64, + SearchWidth: 1, + } +} + +// IvfFlatBuildParams maps to C.ivf_flat_build_params_t +type IvfFlatBuildParams struct { + NLists uint32 + AddDataOnBuild bool + KmeansTrainsetFraction float64 +} + +func DefaultIvfFlatBuildParams() IvfFlatBuildParams { + return IvfFlatBuildParams{ + NLists: 1024, + AddDataOnBuild: true, + KmeansTrainsetFraction: 0.5, + } +} + +// IvfFlatSearchParams maps to C.ivf_flat_search_params_t +type IvfFlatSearchParams struct { + NProbes uint32 +} + +func DefaultIvfFlatSearchParams() IvfFlatSearchParams { + return IvfFlatSearchParams{ + NProbes: 20, + } +} + +// IvfPqBuildParams maps to C.ivf_pq_build_params_t +type IvfPqBuildParams 
struct { + NLists uint32 + M uint32 + BitsPerCode uint32 + AddDataOnBuild bool + KmeansTrainsetFraction float64 +} + +func DefaultIvfPqBuildParams() IvfPqBuildParams { + return IvfPqBuildParams{ + NLists: 1024, + M: 16, + BitsPerCode: 8, + AddDataOnBuild: true, + KmeansTrainsetFraction: 0.5, + } +} + +// IvfPqSearchParams maps to C.ivf_pq_search_params_t +type IvfPqSearchParams struct { + NProbes uint32 +} + +func DefaultIvfPqSearchParams() IvfPqSearchParams { + return IvfPqSearchParams{ + NProbes: 20, + } +} + +// Float16 is a 16-bit floating point type (IEEE 754-2008). +// Go does not have a native float16 type, so we use uint16 to represent its memory layout. +type Float16 uint16 + +// VectorType is a constraint for types that can be used as vector data. +type VectorType interface { + float32 | Float16 | int8 | uint8 +} + +// GpuIndex is an interface for all GPU-accelerated indexes. +type GpuIndex interface { + Start() error + Build() error + Destroy() error + Info() (string, error) +} + +// GetQuantization returns the Quantization enum for a given VectorType. +func GetQuantization[T VectorType]() Quantization { + var zero T + switch any(zero).(type) { + case float32: + return F32 + case Float16: + return F16 + case int8: + return INT8 + case uint8: + return UINT8 + default: + panic("unsupported vector type") + } +} + +// GpuConvertF32ToF16 converts a float32 slice to a Float16 slice using the GPU. 
+func GpuConvertF32ToF16(src []float32, dst []Float16, deviceID int) error { + if len(src) == 0 { + return nil + } + if len(src) != len(dst) { + return moerr.NewInternalErrorNoCtx("source and destination slices must have the same length") + } + + var errmsg *C.char + C.gpu_convert_f32_to_f16( + (*C.float)(unsafe.Pointer(&src[0])), + unsafe.Pointer(&dst[0]), + C.uint64_t(len(src)), + C.int(deviceID), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(src) + runtime.KeepAlive(dst) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetGpuDeviceCount returns the number of available CUDA devices. +func GetGpuDeviceCount() (int, error) { + count := int(C.gpu_get_device_count()) + if count < 0 { + return 0, moerr.NewInternalErrorNoCtx("failed to get GPU device count") + } + return count, nil +} + +// GetGpuDeviceList returns a slice of available CUDA device IDs. +func GetGpuDeviceList() ([]int, error) { + count, err := GetGpuDeviceCount() + if err != nil { + return nil, err + } + if count == 0 { + return []int{}, nil + } + + cDevices := make([]C.int, count) + actualCount := int(C.gpu_get_device_list(&cDevices[0], C.int(count))) + + devices := make([]int, actualCount) + for i := 0; i < actualCount; i++ { + devices[i] = int(cDevices[i]) + } + runtime.KeepAlive(cDevices) + return devices, nil +} diff --git a/pkg/cuvs/helper_test.go b/pkg/cuvs/helper_test.go new file mode 100644 index 0000000000000..1b4def55e94a5 --- /dev/null +++ b/pkg/cuvs/helper_test.go @@ -0,0 +1,48 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "testing" +) + +func TestGpuHelpers(t *testing.T) { + count, err := GetGpuDeviceCount() + if err != nil { + t.Fatalf("GetGpuDeviceCount failed: %v", err) + } + t.Logf("GPU Device Count: %d", count) + + devices, err := GetGpuDeviceList() + if err != nil { + t.Fatalf("GetGpuDeviceList failed: %v", err) + } + t.Logf("GPU Device List: %v", devices) +} + +func TestGpuConvertF32ToF16(t *testing.T) { + src := []float32{1.0, 2.0, 3.0, 4.0} + deviceID := 0 + + // Test conversion to F16 + dstF16 := make([]Float16, len(src)) + if err := GpuConvertF32ToF16(src, dstF16, deviceID); err != nil { + t.Fatalf("GpuConvertF32ToF16 failed: %v", err) + } + // We can't easily verify the value without a float16 decoder, + // but we can check it didn't error. +} diff --git a/pkg/cuvs/info_test.go b/pkg/cuvs/info_test.go new file mode 100644 index 0000000000000..b52b647aec8ba --- /dev/null +++ b/pkg/cuvs/info_test.go @@ -0,0 +1,212 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "encoding/json" + "fmt" + "math/rand" + "testing" +) + +type commonInfo struct { + ElementSize int `json:"element_size"` + Dimension int `json:"dimension"` + Metric int `json:"metric"` + Status string `json:"status"` + Capacity int `json:"capacity"` + CurrentLength int `json:"current_length"` + Devices []int `json:"devices"` + Type string `json:"type"` +} + +func verifyCommonInfo(t *testing.T, infoStr string, expectedType string, expectedDim int, expectedElemSize int) { + var info commonInfo + err := json.Unmarshal([]byte(infoStr), &info) + if err != nil { + t.Fatalf("Failed to parse info JSON: %v\nJSON: %s", err, infoStr) + } + + if info.Type != expectedType { + t.Errorf("Expected type %s, got %s", expectedType, info.Type) + } + if info.Dimension != expectedDim { + t.Errorf("Expected dimension %d, got %d", expectedDim, info.Dimension) + } + if info.ElementSize != expectedElemSize { + t.Errorf("Expected element size %d, got %d", expectedElemSize, info.ElementSize) + } + if info.Status != "Loaded" { + t.Errorf("Expected status Loaded, got %s", info.Status) + } +} + +func TestIndexInfoComprehensive(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil { + t.Fatalf("Failed to get GPU devices: %v", err) + } + if len(devices) == 0 { + t.Skip("No GPU devices available") + } + + dimension := uint32(128) + n_vectors := uint64(10000) + + // Test combinations of Index Type, Distribution Mode, and Data Type + + testCases := []struct { + indexType string + distMode DistributionMode + modeName string + }{ + {"CAGRA", SingleGpu, "SingleGPU"}, + {"CAGRA", Sharded, "Sharded"}, + {"CAGRA", Replicated, "Replicated"}, + {"IVF-Flat", SingleGpu, "SingleGPU"}, + {"IVF-Flat", Sharded, "Sharded"}, + {"IVF-Flat", Replicated, "Replicated"}, + {"IVF-PQ", SingleGpu, "SingleGPU"}, + {"IVF-PQ", Sharded, "Sharded"}, + {"IVF-PQ", Replicated, 
"Replicated"}, + } + + runTest := func(t *testing.T, indexType string, distMode DistributionMode, modeName string, dataType string) { + name := fmt.Sprintf("%s/%s/%s", indexType, modeName, dataType) + t.Run(name, func(t *testing.T) { + var index GpuIndex + var err error + var elemSize int + + // We use a large dataset + switch dataType { + case "float32": + dataset := GenerateRandomDataset(n_vectors, dimension) + elemSize = 4 + switch indexType { + case "CAGRA": + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + case "Float16": + dataset := make([]Float16, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = Float16(rand.Uint32()) + } + elemSize = 2 + switch indexType { + case "CAGRA": + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[Float16](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[Float16](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[Float16](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + case "int8": + dataset := make([]int8, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = int8(rand.Intn(256) - 128) + } + elemSize = 1 + switch indexType { + case "CAGRA": + 
bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[int8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[int8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[int8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + case "uint8": + dataset := make([]uint8, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = uint8(rand.Intn(256)) + } + elemSize = 1 + switch indexType { + case "CAGRA": + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 256 + bp.GraphDegree = 128 + index, err = NewGpuCagra[uint8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-Flat": + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err = NewGpuIvfFlat[uint8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + case "IVF-PQ": + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 16 + index, err = NewGpuIvfPq[uint8](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, distMode) + } + } + + if err != nil { + t.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Failed to start index: %v", err) + } + if err := index.Build(); err != nil { + t.Fatalf("Failed to build index: %v", err) + } + + infoStr, err := index.Info() + if err != nil { + t.Fatalf("Failed to get info: %v", err) + } + + verifyCommonInfo(t, infoStr, indexType, int(dimension), elemSize) + }) + } + + dataTypes := []string{"float32", "Float16", "int8", "uint8"} + + for _, tc := range testCases { + for _, dt := range dataTypes { + runTest(t, tc.indexType, tc.distMode, tc.modeName, dt) + } + } +} diff --git 
a/pkg/cuvs/ivf_flat.go b/pkg/cuvs/ivf_flat.go new file mode 100644 index 0000000000000..0741b4b52eb2c --- /dev/null +++ b/pkg/cuvs/ivf_flat.go @@ -0,0 +1,593 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/ivf_flat_c.h" +#include +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// GpuIvfFlat represents the C++ gpu_ivf_flat_t object. +type GpuIvfFlat[T VectorType] struct { + cIvfFlat C.gpu_ivf_flat_c + dimension uint32 + nthread uint32 + distMode DistributionMode + useBatching bool +} + +// SetUseBatching enables or disables dynamic batching for search operations. +func (gi *GpuIvfFlat[T]) SetUseBatching(enable bool) error { + gi.useBatching = enable + if gi.cIvfFlat != nil { + var errmsg *C.char + C.gpu_ivf_flat_set_use_batching(gi.cIvfFlat, C.bool(enable), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + return nil +} + +// NewGpuIvfFlat creates a new GpuIvfFlat instance from a dataset. 
+func NewGpuIvfFlat[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfFlat") + } + + return &GpuIvfFlat[T]{ + cIvfFlat: cIvfFlat, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuIvfFlatFromFile creates a new GpuIvfFlat instance by loading from a file. 
+func NewGpuIvfFlatFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuIvfFlat from file") + } + + return &GpuIvfFlat[T]{ + cIvfFlat: cIvfFlat, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// Destroy frees the C++ gpu_ivf_flat_t instance +func (gi *GpuIvfFlat[T]) Destroy() error { + if gi.cIvfFlat == nil { + return nil + } + var errmsg *C.char + C.gpu_ivf_flat_destroy(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + gi.cIvfFlat = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gi *GpuIvfFlat[T]) Start() error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + + 
if gi.distMode == Replicated && gi.nthread > 1 { + var errmsg *C.char + C.gpu_ivf_flat_set_per_thread_device(gi.cIvfFlat, C.bool(true), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + + if gi.useBatching { + if err := gi.SetUseBatching(true); err != nil { + return err + } + } + + var errmsg *C.char + C.gpu_ivf_flat_start(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the build or file loading process +func (gi *GpuIvfFlat[T]) Build() error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + C.gpu_ivf_flat_build(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// NewGpuIvfFlatEmpty creates a new GpuIvfFlat instance with pre-allocated buffer but no data yet. 
+func NewGpuIvfFlatEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create empty GpuIvfFlat") + } + + return &GpuIvfFlat[T]{ + cIvfFlat: cIvfFlat, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gi *GpuIvfFlat[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_flat_add_chunk( + gi.cIvfFlat, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly quantization if needed. +func (gi *GpuIvfFlat[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_flat_add_chunk_float( + gi.cIvfFlat, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the scalar quantizer (if T is 1-byte) +func (gi *GpuIvfFlat[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_flat_train_quantizer( + gi.cIvfFlat, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfFlat[T]) 
SetQuantizer(min, max float32) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + + var errmsg *C.char + C.gpu_ivf_flat_set_quantizer( + gi.cIvfFlat, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfFlat[T]) GetQuantizer() (float32, float32, error) { + if gi.cIvfFlat == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_ivf_flat_get_quantizer( + gi.cIvfFlat, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// Save serializes the index to a file +func (gi *GpuIvfFlat[T]) Save(filename string) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_ivf_flat_save(gi.cIvfFlat, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gi *GpuIvfFlat[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp IvfFlatSearchParams) (SearchResultIvfFlat, error) { + if gi.cIvfFlat == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfFlat{}, nil + } + + var errmsg *C.char + cSP := C.ivf_flat_search_params_t{ + 
n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_flat_search( + gi.cIvfFlat, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_flat_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_flat_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_flat_free_result(res.result_ptr) + + return SearchResultIvfFlat{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// SearchFloat performs a K-Nearest Neighbor search with float32 queries +func (gi *GpuIvfFlat[T]) SearchFloat(queries []float32, numQueries uint64, dimension uint32, limit uint32, sp IvfFlatSearchParams) (SearchResultIvfFlat, error) { + if gi.cIvfFlat == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfFlat{}, nil + } + + var errmsg *C.char + cSP := C.ivf_flat_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_flat_search_float( + gi.cIvfFlat, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + 
return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_flat_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_flat_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_flat_free_result(res.result_ptr) + + return SearchResultIvfFlat{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Cap returns the capacity of the index buffer +func (gi *GpuIvfFlat[T]) Cap() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_cap(gi.cIvfFlat)) +} + +// Len returns current number of vectors in index +func (gi *GpuIvfFlat[T]) Len() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_len(gi.cIvfFlat)) +} + +// Info returns detailed information about the index as a JSON string. +func (gi *GpuIvfFlat[T]) Info() (string, error) { + if gi.cIvfFlat == nil { + return "", moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_ivf_flat_info(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// GetCenters retrieves the trained centroids. 
+func (gi *GpuIvfFlat[T]) GetCenters(nLists uint32) ([]T, error) { + if gi.cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + centers := make([]T, nLists*gi.dimension) + var errmsg *C.char + C.gpu_ivf_flat_get_centers(gi.cIvfFlat, unsafe.Pointer(&centers[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centers) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centers, nil +} + +// GetNList retrieves the number of lists (centroids) in the index. +func (gi *GpuIvfFlat[T]) GetNList() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_get_n_list(gi.cIvfFlat)) +} + +// SearchResultIvfFlat contains the neighbors and distances from an IVF-Flat search. +type SearchResultIvfFlat struct { + Neighbors []int64 + Distances []float32 +} diff --git a/pkg/cuvs/ivf_flat_test.go b/pkg/cuvs/ivf_flat_test.go new file mode 100644 index 0000000000000..f1e9b5e3a1d1e --- /dev/null +++ b/pkg/cuvs/ivf_flat_test.go @@ -0,0 +1,580 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cuvs + +import ( + "fmt" + "math/rand" + "os" + "testing" +) + +func TestGpuIvfFlat(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Failed to load/build GpuIvfFlat: %v", err) + } + + centers, err := index.GetCenters(10) + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + t.Logf("Centers: %v", centers[:4]) + + queries := []float32{1.0, 1.0, 100.0, 100.0} + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } + if result.Neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", result.Neighbors[1]) + } +} + +func TestGpuIvfFlatSaveLoad(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 2 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + index.Start() + index.Build() + + filename := "test_ivf_flat.idx" + err = index.Save(filename) + if err != nil { + 
t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuIvfFlatFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat from file: %v", err) + } + defer index2.Destroy() + + index2.Start() + err = index2.Build() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + queries := []float32{0.0, 0.0} + sp := DefaultIvfFlatSearchParams() + result, err := index2.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuShardedIvfFlat(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for sharded IVF-Flat test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded IVF-Flat: %v", err) + } + defer index.Destroy() + + index.Start() + err = index.Build() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultIvfFlatSearchParams() + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func TestGpuReplicatedIvfFlat(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for replicated 
IVF-Flat test") + } + + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Replicated) + if err != nil { + t.Fatalf("Failed to create replicated IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load replicated failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultIvfFlatSearchParams() + result, err := index.Search(queries, 5, dimension, 1, sp) + if err != nil { + t.Fatalf("Search replicated failed: %v", err) + } + t.Logf("Replicated Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func BenchmarkGpuShardedIvfFlat(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for sharded IVF-Flat benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Sharded) + if err != nil { + b.Fatalf("Failed to create sharded IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + for _, useBatching := range 
[]bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuSingleIvfFlat(b *testing.B) { + devices := []int{0} + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create single IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + 
} + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuReplicatedIvfFlat(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for replicated IVF-Flat benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Replicated) + if err != nil { + b.Fatalf("Failed to create replicated IVF-Flat: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + 
return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuAddChunkAndSearchIvfFlatF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + // Use Float16 as internal type + index, err := NewGpuIvfFlatEmpty[Float16](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + +} + +func BenchmarkGpuAddChunkAndSearchIvfFlatInt8(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + 
for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 1000 + // Use int8 as internal type + index, err := NewGpuIvfFlatEmpty[int8](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + +} + +func TestGpuIvfFlatChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + + // Create empty index (target type int8) + index, err := NewGpuIvfFlatEmpty[int8](totalCount, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlatEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + 
if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Add data in chunks (from float32, triggers on-the-fly quantization) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) // 1.0 for first chunk, 101.0 for second + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + } + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search for first chunk + query1 := make([]int8, dimension) + for i := range query1 { + query1[i] = -128 // matches first chunk (1.0) + } + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 3 + result1, err := index.Search(query1, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 1 failed: %v", err) + } + if result1.Neighbors[0] < 0 || result1.Neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", result1.Neighbors[0]) + } + + // Search for second chunk + query2 := make([]int8, dimension) + for i := range query2 { + query2[i] = 127 // matches second chunk (101.0) + } + result2, err := index.Search(query2, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 2 failed: %v", err) + } + if result2.Neighbors[0] < 50 || result2.Neighbors[0] >= 100 { + t.Errorf("Expected neighbor from second chunk (50-99), got %d", result2.Neighbors[0]) + } +} diff --git a/pkg/cuvs/ivf_pq.go b/pkg/cuvs/ivf_pq.go new file mode 100644 index 0000000000000..ae5165c390165 --- /dev/null +++ b/pkg/cuvs/ivf_pq.go @@ -0,0 +1,694 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +/* +#include "../../cgo/cuvs/ivf_pq_c.h" +#include <stdlib.h> +#include <stdbool.h> +*/ +import "C" +import ( + "runtime" + "unsafe" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuIvfPq represents the C++ gpu_ivf_pq_t object. +type GpuIvfPq[T VectorType] struct { + cIvfPq C.gpu_ivf_pq_c + dimension uint32 + nthread uint32 + distMode DistributionMode + useBatching bool +} + +// SetUseBatching enables or disables dynamic batching for search operations. +func (gi *GpuIvfPq[T]) SetUseBatching(enable bool) error { + gi.useBatching = enable + if gi.cIvfPq != nil { + var errmsg *C.char + C.gpu_ivf_pq_set_use_batching(gi.cIvfPq, C.bool(enable), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + return nil +} + +// NewGpuIvfPq creates a new GpuIvfPq instance from a dataset. 
+func NewGpuIvfPq[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfPq") + } + + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuIvfPqFromDataFile creates a new GpuIvfPq instance from a MODF datafile. 
+func NewGpuIvfPqFromDataFile[T VectorType](datafilename string, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(datafilename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_new_from_data_file( + cFilename, + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfPq from data file") + } + + // dimension will be updated when GetDim() is called, but we can set it to 0 for now + // or ideally GetDim() should be used. + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: 0, + nthread: nthread, + distMode: mode, + }, nil +} + +// NewGpuIvfPqEmpty creates a new GpuIvfPq instance with pre-allocated buffer but no data yet. 
+func NewGpuIvfPqEmpty[T VectorType](totalCount uint64, dimension uint32, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_new_empty( + C.uint64_t(totalCount), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create empty GpuIvfPq") + } + + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// AddChunk adds a chunk of data to the pre-allocated buffer. 
+func (gi *GpuIvfPq[T]) AddChunk(chunk []T, chunkCount uint64) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_pq_add_chunk( + gi.cIvfPq, + unsafe.Pointer(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// AddChunkFloat adds a chunk of float32 data, performing on-the-fly quantization if needed. +func (gi *GpuIvfPq[T]) AddChunkFloat(chunk []float32, chunkCount uint64) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(chunk) == 0 || chunkCount == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_pq_add_chunk_float( + gi.cIvfPq, + (*C.float)(&chunk[0]), + C.uint64_t(chunkCount), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(chunk) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the scalar quantizer (if T is 1-byte) +func (gi *GpuIvfPq[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_ivf_pq_train_quantizer( + gi.cIvfPq, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfPq[T]) SetQuantizer(min, max float32) error { + if 
gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + + var errmsg *C.char + C.gpu_ivf_pq_set_quantizer( + gi.cIvfPq, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gi *GpuIvfPq[T]) GetQuantizer() (float32, float32, error) { + if gi.cIvfPq == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_ivf_pq_get_quantizer( + gi.cIvfPq, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// NewGpuIvfPqFromFile creates a new GpuIvfPq instance by loading from a file. 
+func NewGpuIvfPqFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp IvfPqBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfPq[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_pq_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + m: C.uint32_t(bp.M), + bits_per_code: C.uint32_t(bp.BitsPerCode), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfPq := C.gpu_ivf_pq_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuIvfPq from file") + } + + return &GpuIvfPq[T]{ + cIvfPq: cIvfPq, + dimension: dimension, + nthread: nthread, + distMode: mode, + }, nil +} + +// Destroy frees the C++ gpu_ivf_pq_t instance +func (gi *GpuIvfPq[T]) Destroy() error { + if gi.cIvfPq == nil { + return nil + } + var errmsg *C.char + C.gpu_ivf_pq_destroy(gi.cIvfPq, unsafe.Pointer(&errmsg)) + gi.cIvfPq = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gi *GpuIvfPq[T]) Start() error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is 
not initialized") + } + + if gi.distMode == Replicated && gi.nthread > 1 { + var errmsg *C.char + C.gpu_ivf_pq_set_per_thread_device(gi.cIvfPq, C.bool(true), unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + } + + if gi.useBatching { + if err := gi.SetUseBatching(true); err != nil { + return err + } + } + + var errmsg *C.char + C.gpu_ivf_pq_start(gi.cIvfPq, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Build triggers the build or file loading process +func (gi *GpuIvfPq[T]) Build() error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + var errmsg *C.char + C.gpu_ivf_pq_build(gi.cIvfPq, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Save serializes the index to a file +func (gi *GpuIvfPq[T]) Save(filename string) error { + if gi.cIvfPq == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_ivf_pq_save(gi.cIvfPq, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gi *GpuIvfPq[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp IvfPqSearchParams) (SearchResultIvfPq, error) { + if gi.cIvfPq == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfPq{}, nil + } + + var errmsg *C.char + cSP 
:= C.ivf_pq_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_pq_search( + gi.cIvfPq, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_pq_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_pq_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_pq_free_result(res.result_ptr) + + return SearchResultIvfPq{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// SearchFloat performs an IVF-PQ search operation with float32 queries +func (gi *GpuIvfPq[T]) SearchFloat(queries []float32, numQueries uint64, dimension uint32, limit uint32, sp IvfPqSearchParams) (SearchResultIvfPq, error) { + if gi.cIvfPq == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfPq{}, nil + } + + var errmsg *C.char + cSP := C.ivf_pq_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_pq_search_float( + gi.cIvfPq, + (*C.float)(unsafe.Pointer(&queries[0])), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 
SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfPq{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_pq_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_pq_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_pq_free_result(res.result_ptr) + + return SearchResultIvfPq{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Cap returns the capacity of the index buffer +func (gi *GpuIvfPq[T]) Cap() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_cap(gi.cIvfPq)) +} + +// Len returns current number of vectors in index +func (gi *GpuIvfPq[T]) Len() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_len(gi.cIvfPq)) +} + +// Info returns detailed information about the index as a JSON string. +func (gi *GpuIvfPq[T]) Info() (string, error) { + if gi.cIvfPq == nil { + return "", moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_ivf_pq_info(gi.cIvfPq, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} + +// GetCenters retrieves the trained centroids. 
+func (gi *GpuIvfPq[T]) GetCenters() ([]T, error) { + if gi.cIvfPq == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuIvfPq is not initialized") + } + nList := gi.GetNList() + dim := gi.GetRotDim() + centers := make([]T, nList*dim) + var errmsg *C.char + C.gpu_ivf_pq_get_centers(gi.cIvfPq, unsafe.Pointer(¢ers[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centers) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centers, nil +} + +// GetNList retrieves the number of lists (centroids) in the index. +func (gi *GpuIvfPq[T]) GetNList() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_n_list(gi.cIvfPq)) +} + +// GetDim retrieves the dimension of the index. +func (gi *GpuIvfPq[T]) GetDim() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_dim(gi.cIvfPq)) +} + +// GetRotDim retrieves the rotated dimension of the index. +func (gi *GpuIvfPq[T]) GetRotDim() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_rot_dim(gi.cIvfPq)) +} + +// GetDimExt retrieves the extended dimension of the index (including norms and padding). +func (gi *GpuIvfPq[T]) GetDimExt() uint32 { + if gi.cIvfPq == nil { + return 0 + } + return uint32(C.gpu_ivf_pq_get_dim_ext(gi.cIvfPq)) +} + +// GetDataset retrieves the flattened host dataset (for debugging). +func (gi *GpuIvfPq[T]) GetDataset(totalElements uint64) []T { + if gi.cIvfPq == nil { + return nil + } + data := make([]T, totalElements) + C.gpu_ivf_pq_get_dataset(gi.cIvfPq, unsafe.Pointer(&data[0])) + return data +} + +// SearchResultIvfPq contains the neighbors and distances from an IVF-PQ search. 
+type SearchResultIvfPq struct { + Neighbors []int64 + Distances []float32 +} diff --git a/pkg/cuvs/ivf_pq_test.go b/pkg/cuvs/ivf_pq_test.go new file mode 100644 index 0000000000000..7998559210e64 --- /dev/null +++ b/pkg/cuvs/ivf_pq_test.go @@ -0,0 +1,605 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "fmt" + "math/rand" + "os" + "testing" +) + +func TestGpuIvfPq(t *testing.T) { + dimension := uint32(16) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + for j := uint32(0); j < dimension; j++ { + dataset[i*uint64(dimension)+uint64(j)] = float32(i) + } + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 8 // dimension 16 is divisible by 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + err = index.Build() + if err != nil { + t.Fatalf("Failed to load/build GpuIvfPq: %v", err) + } + + centers, err := index.GetCenters() + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + t.Logf("Centers count: %d, dim_ext: %d", len(centers)/int(index.GetDimExt()), index.GetDimExt()) + + query := make([]float32, dimension) + 
for i := uint32(0); i < dimension; i++ { + query[i] = 1.0 + } + sp := DefaultIvfPqSearchParams() + sp.NProbes = 5 + result, err := index.Search(query, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } +} + +func TestGpuIvfPqSaveLoad(t *testing.T) { + dimension := uint32(4) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i / int(dimension)) + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 2 + bp.M = 2 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq: %v", err) + } + index.Start() + index.Build() + + filename := "test_ivf_pq.idx" + err = index.Save(filename) + if err != nil { + t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuIvfPqFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPq from file: %v", err) + } + defer index2.Destroy() + + err = index2.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + err = index2.Build() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + query := make([]float32, dimension) // all zeros + sp := DefaultIvfPqSearchParams() + result, err := index2.Search(query, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuIvfPqChunked(t *testing.T) { + dimension := uint32(8) + totalCount := uint64(100) + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 4 + + // Create empty index 
(target type int8) + index, err := NewGpuIvfPqEmpty[int8](totalCount, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfPqEmpty: %v", err) + } + defer index.Destroy() + + err = index.Start() + if err != nil { + t.Fatalf("Start failed: %v", err) + } + + // Add data in chunks (from float32, triggers on-the-fly quantization) + chunkSize := uint64(50) + for i := uint64(0); i < totalCount; i += chunkSize { + chunk := make([]float32, chunkSize*uint64(dimension)) + val := float32(i/chunkSize*100 + 1) // 1.0 for first chunk, 101.0 for second + for j := range chunk { + chunk[j] = val + } + err = index.AddChunkFloat(chunk, chunkSize) + if err != nil { + t.Fatalf("AddChunkFloat failed at offset %d: %v", i, err) + } + } + + // Debug: check dataset + ds := index.GetDataset(totalCount * uint64(dimension)) + t.Logf("Dataset[0]: %v, Dataset[50*dim]: %v", ds[0], ds[50*uint64(dimension)]) + + // Build index + err = index.Build() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + // Search for first chunk + query1 := make([]int8, dimension) + for i := range query1 { + query1[i] = -128 // matches first chunk (1.0) + } + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + result1, err := index.Search(query1, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 1 failed: %v", err) + } + if result1.Neighbors[0] < 0 || result1.Neighbors[0] >= 50 { + t.Errorf("Expected neighbor from first chunk (0-49), got %d", result1.Neighbors[0]) + } + + // Search for second chunk + query2 := make([]int8, dimension) + for i := range query2 { + query2[i] = 127 // matches second chunk (101.0) + } + result2, err := index.Search(query2, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search 2 failed: %v", err) + } + if result2.Neighbors[0] < 50 || result2.Neighbors[0] >= 100 { + t.Errorf("Expected neighbor from second chunk (50-99), got %d", result2.Neighbors[0]) + } +} + +func TestGpuShardedIvfPq(t *testing.T) { + devices, err := 
GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for sharded IVF-PQ test") + } + + dimension := uint32(4) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + for j := uint32(0); j < dimension; j++ { + dataset[i*uint64(dimension)+uint64(j)] = float32(i) + } + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 2 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.1, 0.1, 10.1, 10.1, 10.1, 10.1} + sp := DefaultIvfPqSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func TestGpuReplicatedIvfPq(t *testing.T) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + t.Skip("Need at least 1 GPU for replicated IVF-PQ test") + } + + dimension := uint32(4) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + for j := uint32(0); j < dimension; j++ { + dataset[i*uint64(dimension)+uint64(j)] = float32(i) + } + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 10 + bp.M = 2 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Replicated) + if err != nil { + t.Fatalf("Failed to create replicated IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + err = index.Build() + if err != nil { + 
t.Fatalf("Load replicated failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.1, 0.1, 10.1, 10.1, 10.1, 10.1} + sp := DefaultIvfPqSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search replicated failed: %v", err) + } + t.Logf("Replicated Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} + +func BenchmarkGpuShardedIvfPq(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for sharded IVF-PQ benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 128 // 1024 / 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Sharded) + if err != nil { + b.Fatalf("Failed to create sharded IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, 
dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuSingleIvfPq(b *testing.B) { + devices := []int{0} + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 128 // 1024 / 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create single IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuReplicatedIvfPq(b *testing.B) { + devices, err := GetGpuDeviceList() + if err != nil || len(devices) < 1 { + b.Skip("Need at least 1 GPU for replicated IVF-PQ benchmark") + } + + dimension := uint32(1024) + n_vectors := uint64(100000) + dataset := make([]float32, 
n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + bp.M = 128 // 1024 / 8 + index, err := NewGpuIvfPq[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 8, Replicated) + if err != nil { + b.Fatalf("Failed to create replicated IVF-PQ: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + for _, useBatching := range []bool{false, true} { + b.Run(fmt.Sprintf("Batching%v", useBatching), func(b *testing.B) { + index.SetUseBatching(useBatching) + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(n_vectors), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) + + }) + } +} + +func BenchmarkGpuAddChunkAndSearchIvfPqF16(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + // Use Float16 as internal type + index, err := NewGpuIvfPqEmpty[Float16](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer 
index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} + +func BenchmarkGpuAddChunkAndSearchIvfPqInt8(b *testing.B) { + const dimension = 1024 + const totalCount = 100000 + const chunkSize = 10000 + + dataset := make([]float32, totalCount*dimension) + for i := range dataset { + dataset[i] = rand.Float32() + } + + devices := []int{0} + bp := DefaultIvfPqBuildParams() + bp.NLists = 1000 + // Use int8 as internal type + index, err := NewGpuIvfPqEmpty[int8](uint64(totalCount), dimension, L2Expanded, bp, devices, 8, SingleGpu) + if err != nil { + b.Fatalf("Failed to create index: %v", err) + } + defer index.Destroy() + + if err := index.Start(); err != nil { + b.Fatalf("Start failed: %v", err) + } + + // Add data in chunks using AddChunkFloat + for i := 0; i < totalCount; i += chunkSize { + chunk := dataset[i*dimension : (i+chunkSize)*dimension] + if err := 
index.AddChunkFloat(chunk, uint64(chunkSize)); err != nil { + b.Fatalf("AddChunkFloat failed at %d: %v", i, err) + } + } + + if err := index.Build(); err != nil { + b.Fatalf("Build failed: %v", err) + } + // info, _ := index.Info() + // fmt.Println(info) + + sp := DefaultIvfPqSearchParams() + sp.NProbes = 3 + + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { + queries := make([]float32, dimension) + for i := range queries { + queries[i] = rand.Float32() + } + for pb.Next() { + _, err := index.SearchFloat(queries, 1, dimension, 10, sp) + if err != nil { + b.Fatalf("Search failed: %v", err) + } + } + }) + b.StopTimer() + ReportRecall(b, dataset, uint64(totalCount), uint32(dimension), 10, func(queries []float32, numQueries uint64, limit uint32) ([]int64, error) { + res, err := index.SearchFloat(queries, numQueries, dimension, limit, sp) + if err != nil { + return nil, err + } + return res.Neighbors, nil + }) +} diff --git a/pkg/cuvs/kmeans.go b/pkg/cuvs/kmeans.go new file mode 100644 index 0000000000000..1c07ea350f2d0 --- /dev/null +++ b/pkg/cuvs/kmeans.go @@ -0,0 +1,381 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cuvs + +/* +#include "../../cgo/cuvs/kmeans_c.h" +#include +*/ +import "C" +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "runtime" + "unsafe" +) + +// GpuKMeans represents the C++ gpu_kmeans_t object +type GpuKMeans[T VectorType] struct { + cKMeans C.gpu_kmeans_c + nClusters uint32 + dimension uint32 +} + +// NewGpuKMeans creates a new GpuKMeans instance +func NewGpuKMeans[T VectorType](nClusters uint32, dimension uint32, metric DistanceType, maxIter int, deviceID int, nthread uint32) (*GpuKMeans[T], error) { + qtype := GetQuantization[T]() + var errmsg *C.char + cKMeans := C.gpu_kmeans_new( + C.uint32_t(nClusters), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.int(maxIter), + C.int(deviceID), + C.uint32_t(nthread), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cKMeans == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuKMeans") + } + + return &GpuKMeans[T]{cKMeans: cKMeans, nClusters: nClusters, dimension: dimension}, nil +} + +// Destroy frees the C++ gpu_kmeans_t instance +func (gk *GpuKMeans[T]) Destroy() error { + if gk.cKMeans == nil { + return nil + } + var errmsg *C.char + C.gpu_kmeans_destroy(gk.cKMeans, unsafe.Pointer(&errmsg)) + gk.cKMeans = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Start initializes the worker and resources +func (gk *GpuKMeans[T]) Start() error { + if gk.cKMeans == nil { + return moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + var errmsg *C.char + C.gpu_kmeans_start(gk.cKMeans, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// TrainQuantizer trains the 
scalar quantizer (if T is 1-byte) +func (gk *GpuKMeans[T]) TrainQuantizer(trainData []float32, nSamples uint64) error { + if gk.cKMeans == nil { + return moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(trainData) == 0 || nSamples == 0 { + return nil + } + + var errmsg *C.char + C.gpu_kmeans_train_quantizer( + gk.cKMeans, + (*C.float)(&trainData[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(trainData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// SetQuantizer sets the scalar quantizer parameters (if T is 1-byte) +func (gk *GpuKMeans[T]) SetQuantizer(min, max float32) error { + if gk.cKMeans == nil { + return moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + + var errmsg *C.char + C.gpu_kmeans_set_quantizer( + gk.cKMeans, + C.float(min), + C.float(max), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetQuantizer gets the scalar quantizer parameters (if T is 1-byte) +func (gk *GpuKMeans[T]) GetQuantizer() (float32, float32, error) { + if gk.cKMeans == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + + var errmsg *C.char + var cMin, cMax C.float + C.gpu_kmeans_get_quantizer( + gk.cKMeans, + &cMin, + &cMax, + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + return float32(cMin), float32(cMax), nil +} + +// Fit computes the cluster centroids +func (gk *GpuKMeans[T]) Fit(dataset []T, nSamples uint64) (float32, int64, error) { + if gk.cKMeans == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return 0, 0, nil + 
} + + var errmsg *C.char + res := C.gpu_kmeans_fit( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + return float32(res.inertia), int64(res.n_iter), nil +} + +// Predict assigns labels to new data based on existing centroids. +func (gk *GpuKMeans[T]) Predict(dataset []T, nSamples uint64) ([]int64, float32, error) { + if gk.cKMeans == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_predict( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), nil +} + +// PredictFloat assigns labels to new float32 data based on existing centroids. 
+func (gk *GpuKMeans[T]) PredictFloat(dataset []float32, nSamples uint64) ([]int64, float32, error) { + if gk.cKMeans == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_predict_float( + gk.cKMeans, + (*C.float)(unsafe.Pointer(&dataset[0])), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), nil +} + +// FitPredict performs both fitting and labeling in one step. 
+func (gk *GpuKMeans[T]) FitPredict(dataset []T, nSamples uint64) ([]int64, float32, int64, error) { + if gk.cKMeans == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_fit_predict( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("fit_predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), int64(res.n_iter), nil +} + +// FitPredictFloat performs both fitting and labeling in one step for float32 data. 
+func (gk *GpuKMeans[T]) FitPredictFloat(dataset []float32, nSamples uint64) ([]int64, float32, int64, error) { + if gk.cKMeans == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_fit_predict_float( + gk.cKMeans, + (*C.float)(unsafe.Pointer(&dataset[0])), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("fit_predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), int64(res.n_iter), nil +} + +// GetCentroids retrieves the trained centroids. +func (gk *GpuKMeans[T]) GetCentroids() ([]T, error) { + if gk.cKMeans == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + centroids := make([]T, gk.nClusters*gk.dimension) + var errmsg *C.char + C.gpu_kmeans_get_centroids(gk.cKMeans, unsafe.Pointer(¢roids[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centroids) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centroids, nil +} + +// Info returns detailed information about the index as a JSON string. 
+func (gk *GpuKMeans[T]) Info() (string, error) { + if gk.cKMeans == nil { + return "", moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + var errmsg *C.char + infoPtr := C.gpu_kmeans_info(gk.cKMeans, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + if infoPtr != nil { + C.free(unsafe.Pointer(infoPtr)) + } + return "", moerr.NewInternalErrorNoCtx(errStr) + } + if infoPtr == nil { + return "{}", nil + } + info := C.GoString(infoPtr) + C.free(unsafe.Pointer(infoPtr)) + return info, nil +} diff --git a/pkg/cuvs/kmeans_test.go b/pkg/cuvs/kmeans_test.go new file mode 100644 index 0000000000000..a14044ac6a6ca --- /dev/null +++ b/pkg/cuvs/kmeans_test.go @@ -0,0 +1,172 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cuvs + +import ( + "fmt" + "testing" +) + +func TestGpuKMeans_Float32(t *testing.T) { + nClusters := uint32(3) + dimension := uint32(2) + nSamples := uint64(9) + + // Create 3 clusters + dataset := []float32{ + 0.1, 0.1, 0.0, 0.2, 0.2, 0.0, // Cluster 0 + 10.1, 10.1, 10.0, 10.2, 10.2, 10.0, // Cluster 1 + 20.1, 20.1, 20.0, 20.2, 20.2, 20.0, // Cluster 2 + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[float32](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + inertia, nIter, err := kmeans.Fit(dataset, nSamples) + if err != nil { + t.Fatalf("Fit failed: %v", err) + } + fmt.Printf("Fit: inertia=%f, nIter=%d\n", inertia, nIter) + + labels, pInertia, err := kmeans.Predict(dataset, nSamples) + if err != nil { + t.Fatalf("Predict failed: %v", err) + } + fmt.Printf("Predict labels: %v, inertia=%f\n", labels, pInertia) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } + + // Since we use balanced_params, it might prioritize balancing cluster sizes over spatial distance + // on very small datasets. We just check that all labels are within range [0, nClusters). 
+ for i, l := range labels { + if l < 0 || l >= int64(nClusters) { + t.Errorf("Label at index %d is out of range: %d", i, l) + } + } + + centroids, err := kmeans.GetCentroids() + if err != nil { + t.Fatalf("GetCentroids failed: %v", err) + } + if len(centroids) != int(nClusters*dimension) { + t.Errorf("Expected %d centroid elements, got %d", nClusters*dimension, len(centroids)) + } +} + +func TestGpuKMeans_FitPredict_Float16(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(4) + nSamples := uint64(10) + + dataset := make([]float32, nSamples*uint64(dimension)) + for i := range dataset { + dataset[i] = 0.5 + } + + // Convert to F16 + datasetF16 := make([]Float16, len(dataset)) + err := GpuConvertF32ToF16(dataset, datasetF16, 0) + if err != nil { + t.Fatalf("F32 to F16 conversion failed: %v", err) + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[Float16](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + labels, inertia, nIter, err := kmeans.FitPredict(datasetF16, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("FitPredict: inertia=%f, nIter=%d\n", inertia, nIter) + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } +} + +func TestGpuKMeans_Int8(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(2) + nSamples := uint64(4) + + dataset := []int8{ + 0, 0, + 1, 1, + 10, 10, + 11, 11, + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[int8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + labels, _, _, err := kmeans.FitPredict(dataset, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("Int8 Predict labels: %v\n", labels) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d 
labels, got %d", nSamples, len(labels)) + } +} + +func TestGpuKMeans_Uint8(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(2) + nSamples := uint64(4) + + dataset := []uint8{ + 0, 0, + 1, 1, + 10, 10, + 11, 11, + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[uint8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + kmeans.Start() + labels, _, _, err := kmeans.FitPredict(dataset, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("Uint8 Predict labels: %v\n", labels) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } +} diff --git a/pkg/cuvs/recall_test.go b/pkg/cuvs/recall_test.go new file mode 100644 index 0000000000000..e5c6676531d5a --- /dev/null +++ b/pkg/cuvs/recall_test.go @@ -0,0 +1,76 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + //"fmt" + "math/rand" + "testing" +) + +type NeighborType interface { + uint32 | int64 +} + +// GenerateRandomDataset generates a random float32 dataset. +func GenerateRandomDataset(n_vectors uint64, dimension uint32) []float32 { + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = rand.Float32() + } + return dataset +} + +// ReportRecall reports the self-recall for an index. 
+// It verifies that querying with a point already in the index returns that point's ID. +func ReportRecall[T NeighborType](b *testing.B, dataset []float32, n_vectors uint64, dimension uint32, limit uint32, searchFunc func(queries []float32, numQueries uint64, limit uint32) ([]T, error)) { + numQueries := uint64(100) + if n_vectors < numQueries { + numQueries = n_vectors + } + + // Use the first numQueries vectors from the dataset as queries. + // Since these are the first vectors, we expect their IDs to be 0, 1, 2, ..., numQueries-1. + recallQueries := dataset[:numQueries*uint64(dimension)] + + // Search approximate index + approxNeighbors, err := searchFunc(recallQueries, numQueries, limit) + if err != nil { + b.Logf("Warning: Approximate search failed: %v", err) + return + } + + hitCount := 0 + for i := uint64(0); i < numQueries; i++ { + // For query i (which is dataset[i]), we expect ID 'i' to be in the results + expectedID := int64(i) + found := false + for j := uint32(0); j < limit; j++ { + if int64(approxNeighbors[i*uint64(limit)+uint64(j)]) == expectedID { + found = true + break + } + } + if found { + hitCount++ + } + } + + recall := float64(hitCount) / float64(numQueries) + //fmt.Printf("Benchmark %s: self_recall_at_%d = %.4f\n", b.Name(), int(limit), recall) + b.ReportMetric(recall*float64(b.N), "recall") +} diff --git a/pkg/cuvs/search_float_test.go b/pkg/cuvs/search_float_test.go new file mode 100644 index 0000000000000..2abdef34c37cc --- /dev/null +++ b/pkg/cuvs/search_float_test.go @@ -0,0 +1,169 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cuvs + +import ( + "testing" +) + +func TestGpuSearchFloatAll(t *testing.T) { + dimension := uint32(8) + n_vectors := uint64(100) + deviceID := 0 + + // 1. Test IVF-PQ SearchFloat (with int8 quantization) + t.Run("IVF-PQ", func(t *testing.T) { + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i % 10) + } + bp := IvfPqBuildParams{NLists: 10, M: 4, BitsPerCode: 8, AddDataOnBuild: true} + // Create empty index + index, err := NewGpuIvfPqEmpty[int8](n_vectors, dimension, L2Expanded, bp, []int{deviceID}, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create IVF-PQ: %v", err) + } + defer index.Destroy() + index.Start() + + // Explicitly train quantizer before adding data + err = index.TrainQuantizer(dataset[:dimension*10], 10) + if err != nil { + t.Fatalf("TrainQuantizer failed: %v", err) + } + + err = index.AddChunkFloat(dataset, n_vectors) + if err != nil { + t.Fatalf("AddChunkFloat failed: %v", err) + } + index.Build() + + queries := make([]float32, 2*uint64(dimension)) + for i := range queries { + queries[i] = float32(i % 10) + } + res, err := index.SearchFloat(queries, 2, dimension, 1, IvfPqSearchParams{NProbes: 1}) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(res.Neighbors) != 2 { + t.Errorf("Expected 2 neighbors, got %d", len(res.Neighbors)) + } + }) + + // 2. 
Test IVF-Flat SearchFloat (with half quantization) + t.Run("IVF-Flat", func(t *testing.T) { + dataset := make([]Float16, n_vectors*uint64(dimension)) + bp := IvfFlatBuildParams{NLists: 10, AddDataOnBuild: true} + index, err := NewGpuIvfFlat[Float16](dataset, n_vectors, dimension, L2Expanded, bp, []int{deviceID}, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create IVF-Flat: %v", err) + } + defer index.Destroy() + index.Start() + index.Build() + + queries := make([]float32, uint64(dimension)) + res, err := index.SearchFloat(queries, 1, dimension, 1, IvfFlatSearchParams{NProbes: 1}) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(res.Neighbors) != 1 { + t.Errorf("Expected 1 neighbor, got %d", len(res.Neighbors)) + } + }) + + // 3. Test CAGRA SearchFloat (with float32) + t.Run("CAGRA", func(t *testing.T) { + dataset := make([]float32, n_vectors*uint64(dimension)) + bp := CagraBuildParams{IntermediateGraphDegree: 64, GraphDegree: 32, AttachDatasetOnBuild: true} + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, []int{deviceID}, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create CAGRA: %v", err) + } + defer index.Destroy() + index.Start() + index.Build() + + queries := make([]float32, uint64(dimension)) + res, err := index.SearchFloat(queries, 1, dimension, 1, CagraSearchParams{ItopkSize: 64, SearchWidth: 1}) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(res.Neighbors) != 1 { + t.Errorf("Expected 1 neighbor, got %d", len(res.Neighbors)) + } + }) + + // 4. 
Test Brute-Force SearchFloat (with half) + t.Run("Brute-Force", func(t *testing.T) { + dataset := make([]Float16, n_vectors*uint64(dimension)) + index, err := NewGpuBruteForce[Float16](dataset, n_vectors, dimension, L2Expanded, 1, deviceID) + if err != nil { + t.Fatalf("Failed to create Brute-Force: %v", err) + } + defer index.Destroy() + index.Start() + index.Build() + + queries := make([]float32, uint64(dimension)) + neighbors, _, err := index.SearchFloat(queries, 1, dimension, 1) + if err != nil { + t.Fatalf("SearchFloat failed: %v", err) + } + if len(neighbors) != 1 { + t.Errorf("Expected 1 neighbor, got %d", len(neighbors)) + } + }) + + // 5. Test KMeans PredictFloat (with uint8) + t.Run("KMeans", func(t *testing.T) { + nClusters := uint32(5) + km, err := NewGpuKMeans[uint8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create KMeans: %v", err) + } + defer km.Destroy() + km.Start() + + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i % 10) + } + + // Explicitly train quantizer + err = km.TrainQuantizer(dataset[:dimension*10], 10) + if err != nil { + t.Fatalf("TrainQuantizer failed: %v", err) + } + + // FitPredictFloat + labels, _, _, err := km.FitPredictFloat(dataset, n_vectors) + if err != nil { + t.Fatalf("FitPredictFloat failed: %v", err) + } + + queries := make([]float32, 2*uint64(dimension)) + labels, _, err = km.PredictFloat(queries, 2) + if err != nil { + t.Fatalf("PredictFloat failed: %v", err) + } + if len(labels) != 2 { + t.Errorf("Expected 2 labels, got %d", len(labels)) + } + }) +} diff --git a/pkg/vectorindex/brute_force/benchmark_test.go b/pkg/vectorindex/brute_force/benchmark_test.go new file mode 100644 index 0000000000000..bfa2782154525 --- /dev/null +++ b/pkg/vectorindex/brute_force/benchmark_test.go @@ -0,0 +1,105 @@ +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use 
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/mpool" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" +) + +func benchmarkBruteForceGeneric(b *testing.B, dsize, qsize int, dimension uint, ncpu uint, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + b.Helper() + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(b, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + limit := uint(10) + elemsz := uint(4) // float32 + + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := make([][]float32, qsize) + for i := range query { + query[i] = make([]float32, dimension) + for j := range query[i] { + query[i][j] = rand.Float32() + } + } + + idx, err := createFn(dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) + if err != nil { + b.Fatal(err) + } + defer idx.Destroy() + + err = idx.Load(sqlproc) + if err != nil { + b.Fatal(err) + } + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _, err := idx.Search(sqlproc, query, rt) + if err != nil 
{ + b.Fatal(err) + } + } +} + +func benchmarkBruteForce(b *testing.B, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + benchmarkBruteForceGeneric(b, 10000, 100, 1024, 8, createFn) +} + +func benchmarkCentroidSearch(b *testing.B, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + benchmarkBruteForceGeneric(b, 18000, 1, 1024, 1, createFn) +} + +func BenchmarkGoBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGoBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkUsearchBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewUsearchBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchGoBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGoBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchUsearchBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewUsearchBruteForceIndex[float32](dataset, dim, m, es) + }) +} diff --git a/pkg/vectorindex/brute_force/brute_force.go b/pkg/vectorindex/brute_force/brute_force.go index bdf217dd75433..84b529b04bbdf 100644 --- a/pkg/vectorindex/brute_force/brute_force.go +++ b/pkg/vectorindex/brute_force/brute_force.go @@ -136,44 +136,73 @@ func NewUsearchBruteForceIndex[T types.RealNumbers](dataset [][]T, return idx, nil } +func NewUsearchBruteForceIndexFlattened[T types.RealNumbers](dataset []T, + count uint, + dimension uint, + m metric.MetricType, + elemsz 
uint) (cache.VectorIndexSearchIf, error) { + var err error + + idx := &UsearchBruteForceIndex[T]{} + idx.Metric = metric.MetricTypeToUsearchMetric[m] + idx.Quantization, err = GetUsearchQuantizationFromType(T(0)) + if err != nil { + return nil, err + } + idx.Dimension = dimension + idx.Count = count + idx.ElementSize = elemsz + idx.Dataset = &dataset + + return idx, nil +} + func (idx *UsearchBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { return nil } func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (keys any, distances []float64, err error) { - queries, ok := _queries.([][]T) - if !ok { - return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") - } - var flatten []T var queryDeallocator malloc.Deallocator - - reqSize := len(queries) * int(idx.Dimension) - allocator := malloc.NewCAllocator() - var _t T - switch any(_t).(type) { - case float32: - slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) - if err2 != nil { - return nil, nil, err2 + var nQueries int + + switch queries := _queries.(type) { + case []T: + flatten = queries + nQueries = len(queries) / int(idx.Dimension) + case [][]T: + if len(queries) == 0 { + return nil, nil, nil } - queryDeallocator = dealloc - f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) - flatten = any(f32Slice).([]T) - case float64: - slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*8, malloc.NoClear) - if err2 != nil { - return nil, nil, err2 + nQueries = len(queries) + reqSize := nQueries * int(idx.Dimension) + allocator := malloc.NewCAllocator() + var _t T + switch any(_t).(type) { + case float32: + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + flatten = any(f32Slice).([]T) + case float64: + slice, dealloc, err2 := 
allocator.Allocate(uint64(reqSize)*8, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f64Slice := util.UnsafeSliceCastToLength[float64](slice, reqSize) + flatten = any(f64Slice).([]T) } - queryDeallocator = dealloc - f64Slice := util.UnsafeSliceCastToLength[float64](slice, reqSize) - flatten = any(f64Slice).([]T) - } - for i := 0; i < len(queries); i++ { - offset := i * int(idx.Dimension) - copy(flatten[offset:], queries[i]) + for i := 0; i < nQueries; i++ { + offset := i * int(idx.Dimension) + copy(flatten[offset:], queries[i]) + } + default: + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } if queryDeallocator != nil { @@ -191,7 +220,7 @@ func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries util.UnsafePointer(&((*idx.Dataset)[0])), util.UnsafePointer(&(flatten[0])), uint(idx.Count), - uint(len(queries)), + uint(nQueries), idx.Dimension*idx.ElementSize, idx.Dimension*idx.ElementSize, idx.Dimension, diff --git a/pkg/vectorindex/brute_force/cpu.go b/pkg/vectorindex/brute_force/cpu.go index b5c65f96cf614..c403cbb9c5181 100644 --- a/pkg/vectorindex/brute_force/cpu.go +++ b/pkg/vectorindex/brute_force/cpu.go @@ -30,3 +30,20 @@ func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) } + +func NewAdhocBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + return NewUsearchBruteForceIndex[T](dataset, dimension, m, elemsz) +} + +func NewAdhocBruteForceIndexFlattened[T types.RealNumbers](dataset []T, + count uint, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + + return NewUsearchBruteForceIndexFlattened[T](dataset, count, dimension, m, elemsz) +} diff --git a/pkg/vectorindex/brute_force/gpu.go b/pkg/vectorindex/brute_force/gpu.go index 416c2a75d9a75..4c44be80ca0dc 100644 --- 
a/pkg/vectorindex/brute_force/gpu.go +++ b/pkg/vectorindex/brute_force/gpu.go @@ -17,171 +17,339 @@ package brute_force import ( - // "fmt" + "github.com/matrixorigin/matrixone/pkg/common/malloc" + "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/brute_force" ) -type GpuBruteForceIndex[T cuvs.TensorNumberType] struct { - Resource *cuvs.Resource // shared resource for read-only index - Dataset *cuvs.Tensor[T] - Index *brute_force.BruteForceIndex - Metric cuvs.Distance - Dimension uint - Count uint - ElementSize uint +type GpuAdhocBruteForceIndex[T cuvs.VectorType] struct { + dataset []T + dimension uint + count uint + metric metric.MetricType } -var _ cache.VectorIndexSearchIf = &GpuBruteForceIndex[float32]{} +var _ cache.VectorIndexSearchIf = &GpuAdhocBruteForceIndex[float32]{} -// cuvs library has bug. comment out the GPU version until cuvs fix the bug -func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, +func NewAdhocBruteForceIndex[T types.RealNumbers](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint, - nthread uint) (cache.VectorIndexSearchIf, error) { + elemsz uint) (cache.VectorIndexSearchIf, error) { + + // Threshold for switching between CPU and GPU for adhoc search. + // For small datasets, CPU (usearch) is much faster due to lower overhead. 
+ const cpuThreshold = 5000 + if len(dataset) < cpuThreshold { + return NewUsearchBruteForceIndex[T](dataset, dimension, m, elemsz) + } switch dset := any(dataset).(type) { - case [][]float64: - return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) case [][]float32: - return NewCpuBruteForceIndex[float32](dset, dimension, m, elemsz) - //return NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz, nthread) + return NewGpuAdhocBruteForceIndex[float32](dset, dimension, m, elemsz) + case [][]uint16: + // Convert [][]uint16 to [][]cuvs.Float16 to pass to NewGpuAdhocBruteForceIndex + f16dset := make([][]cuvs.Float16, len(dset)) + for i, v := range dset { + f16dset[i] = util.UnsafeSliceCast[cuvs.Float16](v) + } + return NewGpuAdhocBruteForceIndex[cuvs.Float16](f16dset, dimension, m, elemsz) default: - return nil, moerr.NewInternalErrorNoCtx("type not supported for BruteForceIndex") + return NewUsearchBruteForceIndex[T](dataset, dimension, m, elemsz) } +} + +func NewAdhocBruteForceIndexFlattened[T types.RealNumbers](dataset []T, + count uint, + dimension uint, + m metric.MetricType, + elemsz uint) (cache.VectorIndexSearchIf, error) { + const cpuThreshold = 5000 + if count < cpuThreshold { + return NewUsearchBruteForceIndexFlattened[T](dataset, count, dimension, m, elemsz) + } + + switch dset := any(dataset).(type) { + case []float32: + return &GpuAdhocBruteForceIndex[float32]{ + dataset: dset, + dimension: dimension, + count: count, + metric: m, + }, nil + case []cuvs.Float16: + return &GpuAdhocBruteForceIndex[cuvs.Float16]{ + dataset: dset, + dimension: dimension, + count: count, + metric: m, + }, nil + default: + return NewUsearchBruteForceIndexFlattened[T](dataset, count, dimension, m, elemsz) + } } -func NewGpuBruteForceIndex[T cuvs.TensorNumberType](dataset [][]T, +func NewGpuAdhocBruteForceIndex[T cuvs.VectorType](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint, - nthread uint) (cache.VectorIndexSearchIf, error) { + elemsz uint) 
(cache.VectorIndexSearchIf, error) { - idx := &GpuBruteForceIndex[T]{} - resource, _ := cuvs.NewResource(nil) - idx.Resource = &resource - tensor, err := cuvs.NewTensor(dataset) - if err != nil { - return nil, err + if len(dataset) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - idx.Dataset = &tensor - idx.Metric = metric.MetricTypeToCuvsMetric[m] - idx.Dimension = dimension - idx.Count = uint(len(dataset)) - idx.ElementSize = elemsz - return idx, nil + dim := int(dimension) + reqSize := len(dataset) * dim + flattened := make([]T, reqSize) + + for i, v := range dataset { + copy(flattened[i*dim:(i+1)*dim], v) + } + return &GpuAdhocBruteForceIndex[T]{ + dataset: flattened, + dimension: dimension, + count: uint(len(dataset)), + metric: m, + }, nil } -func (idx *GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { - if _, err = idx.Dataset.ToDevice(idx.Resource); err != nil { - return err +func (idx *GpuAdhocBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { + return nil +} + +func (idx *GpuAdhocBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { + var flattenedQueries []T + var nQueries uint64 + + switch queries := _queries.(type) { + case []T: + flattenedQueries = queries + nQueries = uint64(len(queries) / int(idx.dimension)) + case [][]T: + if len(queries) == 0 { + return nil, nil, nil + } + dim := int(idx.dimension) + reqSize := len(queries) * dim + flattenedQueries = make([]T, reqSize) + for i, v := range queries { + copy(flattenedQueries[i*dim:(i+1)*dim], v) + } + nQueries = uint64(len(queries)) + default: + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } - idx.Index, err = brute_force.CreateIndex() - if err != nil { - return + if nQueries == 0 { + return nil, nil, nil } - err = brute_force.BuildIndex[T](*idx.Resource, idx.Dataset, idx.Metric, 0, idx.Index) + deviceID := 0 + neighbors, distances, 
err := cuvs.AdhocBruteForceSearch[T]( + idx.dataset, uint64(idx.count), uint32(idx.dimension), + flattenedQueries, nQueries, uint32(rt.Limit), + resolveCuvsDistance(idx.metric), deviceID, + ) if err != nil { - return + return nil, nil, err } - if err = idx.Resource.Sync(); err != nil { - return + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } + retkeys = neighbors return } -func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { - queriesvec, ok := _queries.([][]T) - if !ok { - return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") +func (idx *GpuAdhocBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) error { + return nil +} + +func (idx *GpuAdhocBruteForceIndex[T]) Destroy() { + idx.dataset = nil +} + +type GpuBruteForceIndex[T cuvs.VectorType] struct { + index *cuvs.GpuBruteForce[T] + dimension uint + count uint +} + +var _ cache.VectorIndexSearchIf = &GpuBruteForceIndex[float32]{} + +func resolveCuvsDistance(m metric.MetricType) cuvs.DistanceType { + switch m { + case metric.Metric_L2sqDistance: + return cuvs.L2Expanded + case metric.Metric_L2Distance: + return cuvs.L2Expanded + case metric.Metric_InnerProduct: + return cuvs.InnerProduct + case metric.Metric_CosineDistance: + return cuvs.CosineSimilarity + case metric.Metric_L1Distance: + return cuvs.L1 + default: + return cuvs.L2Expanded } +} - // local resource for concurrent search - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, nil, err +func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { + + switch dset := any(dataset).(type) { + case [][]float64: + return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) + case [][]float32: + return 
NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz, nthread) + case [][]uint16: + // Convert [][]uint16 to [][]cuvs.Float16 to pass to NewGpuBruteForceIndex + f16dset := make([][]cuvs.Float16, len(dset)) + for i, v := range dset { + f16dset[i] = util.UnsafeSliceCast[cuvs.Float16](v) + } + return NewGpuBruteForceIndex[cuvs.Float16](f16dset, dimension, m, elemsz, nthread) + default: + return nil, moerr.NewInternalErrorNoCtx("type not supported for BruteForceIndex") } - defer resource.Close() +} - queries, err := cuvs.NewTensor(queriesvec) - if err != nil { - return nil, nil, err +func NewGpuBruteForceIndex[T cuvs.VectorType](dataset [][]T, + dimension uint, + m metric.MetricType, + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { + + if len(dataset) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - defer queries.Close() - neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) - if err != nil { - return nil, nil, err + dim := int(dimension) + reqSize := len(dataset) * dim + var flattened []T + + var _t T + switch any(_t).(type) { + case float32: + allocator := malloc.NewCAllocator() + slice, deallocator, err := allocator.Allocate(uint64(reqSize*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer deallocator.Deallocate() + flattened = any(util.UnsafeSliceCast[float32](slice)).([]T) + case cuvs.Float16: + allocator := malloc.NewCAllocator() + slice, deallocator, err := allocator.Allocate(uint64(reqSize*2), malloc.NoClear) + if err != nil { + return nil, err + } + defer deallocator.Deallocate() + flattened = any(util.UnsafeSliceCast[cuvs.Float16](slice)).([]T) + default: + ds := make([]T, reqSize) + flattened = ds } - defer neighbors.Close() - distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) + for i, v := range dataset { + copy(flattened[i*dim:(i+1)*dim], v) + } + + deviceID := 0 // Default to 
device 0 + km, err := cuvs.NewGpuBruteForce[T](flattened, uint64(len(dataset)), uint32(dimension), resolveCuvsDistance(m), uint32(nthread), deviceID) if err != nil { - return nil, nil, err + return nil, err } - defer distances.Close() - if _, err = queries.ToDevice(&resource); err != nil { - return nil, nil, err + km.Start() + return &GpuBruteForceIndex[T]{ + index: km, + dimension: dimension, + count: uint(len(dataset)), + }, nil +} + +func (idx *GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { + if idx.index == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce not initialized") } + return idx.index.Build() +} - err = brute_force.SearchIndex(resource, *idx.Index, &queries, &neighbors, &distances) - if err != nil { - return nil, nil, err +func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { + queriesvec, ok := _queries.([][]T) + if !ok { + return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } - if _, err = neighbors.ToHost(&resource); err != nil { - return nil, nil, err + if len(queriesvec) == 0 { + return nil, nil, nil } - if _, err = distances.ToHost(&resource); err != nil { - return nil, nil, err + dim := int(idx.dimension) + reqSize := len(queriesvec) * dim + + var flattenedQueries []T + var queryDeallocator malloc.Deallocator + + var _t T + switch any(_t).(type) { + case float32: + allocator := malloc.NewCAllocator() + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + flattenedQueries = any(f32Slice).([]T) + case cuvs.Float16: + allocator := malloc.NewCAllocator() + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*2, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f16Slice := 
util.UnsafeSliceCastToLength[cuvs.Float16](slice, reqSize) + flattenedQueries = any(f16Slice).([]T) + default: + // Not pooling other types, although T is likely only float32 for CUVS + ds := make([]T, reqSize) + flattenedQueries = ds } - if err = resource.Sync(); err != nil { - return nil, nil, err + for i, v := range queriesvec { + copy(flattenedQueries[i*dim:(i+1)*dim], v) } - neighborsSlice, err := neighbors.Slice() - if err != nil { - return nil, nil, err + if queryDeallocator != nil { + defer queryDeallocator.Deallocate() } - distancesSlice, err := distances.Slice() + neighbors, distances, err := idx.index.Search(flattenedQueries, uint64(len(queriesvec)), uint32(idx.dimension), uint32(rt.Limit)) if err != nil { return nil, nil, err } - //fmt.Printf("flattened %v\n", flatten) - retdistances = make([]float64, len(distancesSlice)*int(rt.Limit)) - for i := range distancesSlice { - for j, dist := range distancesSlice[i] { - retdistances[i*int(rt.Limit)+j] = float64(dist) - } + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } - keys := make([]int64, len(neighborsSlice)*int(rt.Limit)) - for i := range neighborsSlice { - for j, key := range neighborsSlice[i] { - keys[i*int(rt.Limit)+j] = int64(key) - } - } - retkeys = keys + retkeys = neighbors return } @@ -190,13 +358,7 @@ func (idx *GpuBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) er } func (idx *GpuBruteForceIndex[T]) Destroy() { - if idx.Dataset != nil { - idx.Dataset.Close() - } - if idx.Resource != nil { - idx.Resource.Close() - } - if idx.Index != nil { - idx.Index.Close() + if idx.index != nil { + idx.index.Destroy() } } diff --git a/pkg/vectorindex/brute_force/gpu_benchmark_test.go b/pkg/vectorindex/brute_force/gpu_benchmark_test.go new file mode 100644 index 0000000000000..9c6166b95dbed --- /dev/null +++ b/pkg/vectorindex/brute_force/gpu_benchmark_test.go @@ -0,0 +1,82 @@ +//go:build gpu + +// Copyright 2022 Matrix Origin +// 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" +) + +func BenchmarkGpuBruteForce(b *testing.B) { + benchmarkBruteForce(b, NewGpuBruteForceIndex[float32]) +} + +func BenchmarkCentroidSearchGpuBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, NewGpuBruteForceIndex[float32]) +} + +func BenchmarkGpuAdhocBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGpuAdhocBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchGpuAdhocBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGpuAdhocBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkGpuAdhocBruteForceSingle(b *testing.B) { + dsize := 10000 + dimension := uint(1024) + limit := uint(10) + elemsz := uint(4) // float32 + + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := make([][]float32, 1) + query[0] = make([]float32, dimension) + for j := 
range query[0] { + query[0][j] = rand.Float32() + } + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + idx, err := NewGpuAdhocBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + if err != nil { + b.Fatal(err) + } + _, _, err = idx.Search(nil, query, rt) + if err != nil { + b.Fatal(err) + } + idx.Destroy() + } +} diff --git a/pkg/vectorindex/brute_force/gpu_test.go b/pkg/vectorindex/brute_force/gpu_test.go index d9b024f5444cd..d1b341d797c21 100644 --- a/pkg/vectorindex/brute_force/gpu_test.go +++ b/pkg/vectorindex/brute_force/gpu_test.go @@ -17,7 +17,6 @@ package brute_force import ( - //"fmt" "math/rand/v2" "sync" "testing" @@ -35,22 +34,22 @@ func TestGpuBruteForce(t *testing.T) { dataset := [][]float32{{1, 2, 3}, {3, 4, 5}} query := [][]float32{{1, 2, 3}, {3, 4, 5}} dimension := uint(3) - ncpu := uint(1) + ncpu := uint(8) limit := uint(1) elemsz := uint(4) // float32 - idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() err = idx.Load(nil) require.NoError(t, err) - rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} var wg sync.WaitGroup - for n := 0; n < 4; n++ { + for n := 0; n < 8; n++ { wg.Add(1) go func() { @@ -66,7 +65,6 @@ func TestGpuBruteForce(t *testing.T) { require.Equal(t, key, int64(j)) require.Equal(t, distances[j], float64(0)) } - // fmt.Printf("keys %v, dist %v\n", keys, distances) } }() } @@ -81,7 +79,7 @@ func TestGpuBruteForceConcurrent(t *testing.T) { proc := testutil.NewProcessWithMPool(t, "", m) sqlproc := sqlexec.NewSqlProcess(proc) dimension := uint(128) - ncpu := uint(4) + ncpu := uint(8) limit := uint(3) elemsz := uint(4) // float32 @@ -96,7 +94,7 @@ func 
TestGpuBruteForceConcurrent(t *testing.T) { query := dataset - idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() @@ -105,13 +103,12 @@ func TestGpuBruteForceConcurrent(t *testing.T) { // limit 3 { - rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} anykeys, distances, err := idx.Search(sqlproc, query, rt) require.NoError(t, err) keys := anykeys.([]int64) - // fmt.Printf("keys %v, dist %v\n", keys, distances) require.Equal(t, int(rt.Limit)*len(query), len(keys)) for i := range query { offset := i * int(rt.Limit) @@ -122,13 +119,12 @@ func TestGpuBruteForceConcurrent(t *testing.T) { // limit 1 { - rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: 1} anykeys, distances, err := idx.Search(sqlproc, query, rt) require.NoError(t, err) keys := anykeys.([]int64) - // fmt.Printf("keys %v, dist %v\n", keys, distances) require.Equal(t, int(rt.Limit)*len(query), len(keys)) for i := range query { offset := i * int(rt.Limit) diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go index ed7eecfd58cf9..357a9bd89f24b 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go @@ -17,84 +17,48 @@ package device import ( - //"os" - "context" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/elkans" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" - cuvs "github.com/rapidsai/cuvs/go" - 
"github.com/rapidsai/cuvs/go/ivf_flat" ) -type GpuClusterer[T cuvs.TensorNumberType] struct { - indexParams *ivf_flat.IndexParams - nlist int - dim int - vectors [][]T +type GpuClusterer[T cuvs.VectorType] struct { + kmeans *cuvs.GpuKMeans[T] + nlist int + dim int + vectors []T } func (c *GpuClusterer[T]) InitCentroids(ctx context.Context) error { - return nil } func (c *GpuClusterer[T]) Cluster(ctx context.Context) (any, error) { - - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, err + if c.kmeans == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuKMeans not initialized") } - defer resource.Close() - dataset, err := cuvs.NewTensor(c.vectors) + nSamples := uint64(len(c.vectors) / c.dim) + _, _, err := c.kmeans.Fit(c.vectors, nSamples) if err != nil { return nil, err } - defer dataset.Close() - index, err := ivf_flat.CreateIndex(c.indexParams, &dataset) + centroids, err := c.kmeans.GetCentroids() if err != nil { return nil, err } - defer index.Close() - if _, err := dataset.ToDevice(&resource); err != nil { - return nil, err - } - - centers, err := cuvs.NewTensorOnDevice[T](&resource, []int64{int64(c.nlist), int64(c.dim)}) - if err != nil { - return nil, err - } - defer centers.Close() - - if err := ivf_flat.BuildIndex(resource, c.indexParams, &dataset, index); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - if err := ivf_flat.GetCenters(index, ¢ers); err != nil { - return nil, err - } - - if _, err := centers.ToHost(&resource); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - result, err := centers.Slice() - if err != nil { - return nil, err + // Reshape centroids back to [][]T + result := make([][]T, c.nlist) + for i := 0; i < c.nlist; i++ { + result[i] = make([]T, c.dim) + copy(result[i], centroids[i*c.dim:(i+1)*c.dim]) } return result, nil @@ -105,26 +69,26 @@ func (c *GpuClusterer[T]) SSE() (float64, error) { } func (c 
*GpuClusterer[T]) Close() error { - if c.indexParams != nil { - c.indexParams.Close() + if c.kmeans != nil { + return c.kmeans.Destroy() } return nil } -func resolveCuvsDistanceForDense(distance metric.MetricType) cuvs.Distance { +func resolveCuvsDistanceForDense(distance metric.MetricType) cuvs.DistanceType { switch distance { case metric.Metric_L2sqDistance: - return cuvs.DistanceL2 + return cuvs.L2Expanded case metric.Metric_L2Distance: - return cuvs.DistanceL2 + return cuvs.L2Expanded case metric.Metric_InnerProduct: - return cuvs.DistanceL2 + return cuvs.InnerProduct case metric.Metric_CosineDistance: - return cuvs.DistanceL2 + return cuvs.CosineSimilarity case metric.Metric_L1Distance: - return cuvs.DistanceL2 + return cuvs.L1 default: - return cuvs.DistanceL2 + return cuvs.L2Expanded } } @@ -136,27 +100,36 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, switch vecs := any(vectors).(type) { case [][]float32: - - c := &GpuClusterer[float32]{} - c.nlist = clusterCnt - if len(vectors) == 0 { + if len(vecs) == 0 { return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - c.vectors = vecs - c.dim = len(vecs[0]) - indexParams, err := ivf_flat.CreateIndexParams() + dim := len(vecs[0]) + // Flatten vectors for pkg/cuvs + flattened := make([]float32, len(vecs)*dim) + for i, v := range vecs { + copy(flattened[i*dim:(i+1)*dim], v) + } + + // cuVS K-Means is currently single-GPU focused in our wrapper + deviceID := 0 + nthread := uint32(1) + + km, err := cuvs.NewGpuKMeans[float32](uint32(clusterCnt), uint32(dim), resolveCuvsDistanceForDense(distanceType), maxIterations, deviceID, nthread) if err != nil { return nil, err } - indexParams.SetNLists(uint32(clusterCnt)) - indexParams.SetMetric(resolveCuvsDistanceForDense(distanceType)) - indexParams.SetKMeansNIters(uint32(maxIterations)) - indexParams.SetKMeansTrainsetFraction(1) // train all sample - c.indexParams = indexParams + km.Start() + + c := &GpuClusterer[float32]{ + kmeans: km, + nlist: 
clusterCnt, + dim: dim, + vectors: flattened, + } return c, nil + default: return elkans.NewKMeans(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType, spherical, nworker) - } } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go index 1132ef924c17b..72fe4108ca9c7 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go @@ -17,8 +17,8 @@ package device import ( - //"fmt" "context" + //"fmt" "math/rand/v2" "sync" "testing" @@ -33,7 +33,7 @@ import ( ) func TestGpu(t *testing.T) { - + ctx := context.Background() dim := 128 dsize := 1024 nlist := 128 @@ -48,7 +48,11 @@ func TestGpu(t *testing.T) { c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) require.NoError(t, err) - centers, err := c.Cluster(context.Background()) + defer c.Close() + + c.InitCentroids(ctx) + + centers, err := c.Cluster(ctx) require.NoError(t, err) _, ok := centers.([][]float32) @@ -63,6 +67,7 @@ func TestGpu(t *testing.T) { func TestIVFAndBruteForce(t *testing.T) { + ctx := context.Background() m := mpool.MustNewZero() proc := testutil.NewProcessWithMPool(t, "", m) sqlproc := sqlexec.NewSqlProcess(proc) @@ -83,8 +88,10 @@ func TestIVFAndBruteForce(t *testing.T) { c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) require.NoError(t, err) + defer c.Close() - centers, err := c.Cluster(context.Background()) + c.InitCentroids(ctx) + centers, err := c.Cluster(ctx) require.NoError(t, err) centroids, ok := centers.([][]float32) @@ -97,7 +104,7 @@ func TestIVFAndBruteForce(t *testing.T) { */ queries := vecs[:8192] - idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() @@ -116,21 +123,9 
@@ func TestIVFAndBruteForce(t *testing.T) { for i := 0; i < 1000; i++ { _, _, err := idx.Search(sqlproc, queries, rt) require.NoError(t, err) - /* - - keys_i64, ok := keys.([]int64) - require.Equal(t, ok, true) - - for j, key := range keys_i64 { - require.Equal(t, key, int64(j)) - require.Equal(t, distances[j], float64(0)) - } - */ - // fmt.Printf("keys %v, dist %v\n", keys, distances) } }() } wg.Wait() - } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go index 17d89be59a97a..8202874c783f0 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go @@ -17,248 +17,172 @@ package device import ( - //"fmt" + "fmt" "math/rand/v2" + "runtime" "sync" "testing" - //"os" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/stretchr/testify/require" - - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/brute_force" - "github.com/rapidsai/cuvs/go/ivf_flat" ) -func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.Distance, maxIterations int) ([][]float32, error) { - - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, err +func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.DistanceType, maxIterations int) ([][]float32, error) { + if len(vecs) == 0 { + return nil, fmt.Errorf("empty dataset") } - defer resource.Close() - indexParams, err := ivf_flat.CreateIndexParams() - if err != nil { - return nil, err + // Flatten vectors + flattened := make([]float32, len(vecs)*dim) + for i, v := range vecs { + copy(flattened[i*dim:(i+1)*dim], v) } - defer indexParams.Close() - - indexParams.SetNLists(uint32(clusterCnt)) - indexParams.SetMetric(distanceType) - indexParams.SetKMeansNIters(uint32(maxIterations)) - indexParams.SetKMeansTrainsetFraction(1) // train all sample - dataset, err := cuvs.NewTensor(vecs) + deviceID := 0 + nthread := uint32(1) + km, err := 
cuvs.NewGpuKMeans[float32](uint32(clusterCnt), uint32(dim), distanceType, maxIterations, deviceID, nthread) if err != nil { return nil, err } - defer dataset.Close() - - index, _ := ivf_flat.CreateIndex(indexParams, &dataset) - defer index.Close() - - if _, err := dataset.ToDevice(&resource); err != nil { - return nil, err - } + defer km.Destroy() + km.Start() - centers, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(clusterCnt), int64(dim)}) + _, _, err = km.Fit(flattened, uint64(len(vecs))) if err != nil { return nil, err } - if err := ivf_flat.BuildIndex(resource, indexParams, &dataset, index); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - if err := ivf_flat.GetCenters(index, ¢ers); err != nil { - return nil, err - } - - if _, err := centers.ToHost(&resource); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { + centroids, err := km.GetCentroids() + if err != nil { return nil, err } - result, err := centers.Slice() - if err != nil { - return nil, err + // Reshape centroids + result := make([][]float32, clusterCnt) + for i := 0; i < clusterCnt; i++ { + result[i] = make([]float32, dim) + copy(result[i], centroids[i*dim:(i+1)*dim]) } return result, nil - } -func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.Distance) (retkeys any, retdistances []float64, err error) { - //os.Stderr.WriteString(fmt.Sprintf("probe set %d\n", len(queriesvec))) - //os.Stderr.WriteString("brute force index search start\n") - - resource, err := cuvs.NewResource(nil) - if err != nil { - return +func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.DistanceType) (retkeys any, retdistances []float64, err error) { + if len(datasetvec) == 0 || len(queriesvec) == 0 { + return nil, nil, nil } - defer resource.Close() - dataset, err := cuvs.NewTensor(datasetvec) - if err != nil { - return + dim := len(datasetvec[0]) + 
flattenedDataset := make([]float32, len(datasetvec)*dim) + for i, v := range datasetvec { + copy(flattenedDataset[i*dim:(i+1)*dim], v) } - defer dataset.Close() - index, err := brute_force.CreateIndex() - if err != nil { - return + flattenedQueries := make([]float32, len(queriesvec)*dim) + for i, v := range queriesvec { + copy(flattenedQueries[i*dim:(i+1)*dim], v) } - defer index.Close() - queries, err := cuvs.NewTensor(queriesvec) + deviceID := 0 + nthread := uint32(1) + bf, err := cuvs.NewGpuBruteForce[float32](flattenedDataset, uint64(len(datasetvec)), uint32(dim), distanceType, nthread, deviceID) if err != nil { - return + return nil, nil, err } - defer queries.Close() + defer bf.Destroy() + bf.Start() - neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + err = bf.Build() if err != nil { - return + return nil, nil, err } - defer neighbors.Close() - distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + neighbors, distances, err := bf.Search(flattenedQueries, uint64(len(queriesvec)), uint32(dim), uint32(limit)) if err != nil { - return + return nil, nil, err } - defer distances.Close() - if _, err = dataset.ToDevice(&resource); err != nil { - return - } - - if err = resource.Sync(); err != nil { - return - } - - err = brute_force.BuildIndex(resource, &dataset, distanceType, 2.0, index) - if err != nil { - //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed %v\n", err)) - //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed centers %v\n", datasetvec)) - return + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } - if err = resource.Sync(); err != nil { - return - } - //os.Stderr.WriteString("built brute force index\n") + retkeys = neighbors + return +} - if _, err = queries.ToDevice(&resource); err != nil { - return - } +func TestIssueGpu(t *testing.T) { 
+ var wg sync.WaitGroup + wg.Add(1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + defer wg.Done() + + dimension := uint(128) + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } - //os.Stderr.WriteString("brute force index search Runing....\n") - err = brute_force.SearchIndex(resource, *index, &queries, &neighbors, &distances) - if err != nil { - return - } - //os.Stderr.WriteString("brute force index search finished Runing....\n") + _, err := getCenters(vecs, int(dimension), nlist, cuvs.L2Expanded, 10) + require.NoError(t, err) + }() + wg.Wait() +} - if _, err = neighbors.ToHost(&resource); err != nil { - return - } - //os.Stderr.WriteString("brute force index search neighbour to host done....\n") +func TestIssueIvfAndBruteForceForIssue(t *testing.T) { + var wg1 sync.WaitGroup + wg1.Add(1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + defer wg1.Done() + + dimension := uint(128) + limit := uint(1) + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } + queries := vecs[:8192] - if _, err = distances.ToHost(&resource); err != nil { - return - } - //os.Stderr.WriteString("brute force index search distances to host done....\n") + centers, err := getCenters(vecs, int(dimension), nlist, cuvs.L2Expanded, 10) + require.NoError(t, err) - if err = resource.Sync(); err != nil { - return - } + fmt.Println("centers DONE") - //os.Stderr.WriteString("brute force index search return result....\n") - neighborsSlice, err := neighbors.Slice() - if err != nil { - return - } + var wg sync.WaitGroup - distancesSlice, err := distances.Slice() - if err != nil { - return - } + for n := 0; n < 8; n++ { + wg.Add(1) + go func() { + defer wg.Done() - 
//fmt.Printf("flattened %v\n", flatten) - retdistances = make([]float64, len(distancesSlice)*int(limit)) - for i := range distancesSlice { - for j, dist := range distancesSlice[i] { - retdistances[i*int(limit)+j] = float64(dist) - } - } + runtime.LockOSThread() + defer runtime.UnlockOSThread() - keys := make([]int64, len(neighborsSlice)*int(limit)) - for i := range neighborsSlice { - for j, key := range neighborsSlice[i] { - keys[i*int(limit)+j] = int64(key) + for i := 0; i < 100; i++ { // Reduced iteration count for faster test run + _, _, err := Search(centers, queries, limit, cuvs.L2Expanded) + require.NoError(t, err) + } + }() } - } - retkeys = keys - //os.Stderr.WriteString("brute force index search RETURN NOW....\n") - return -} -func TestIvfAndBruteForceForIssue(t *testing.T) { - - dimension := uint(128) - limit := uint(1) - /* - ncpu := uint(1) - elemsz := uint(4) // float32 - */ - - dsize := 100000 - nlist := 128 - vecs := make([][]float32, dsize) - for i := range vecs { - vecs[i] = make([]float32, dimension) - for j := range vecs[i] { - vecs[i][j] = rand.Float32() - } - } - queries := vecs[:8192] - - centers, err := getCenters(vecs, int(dimension), nlist, cuvs.DistanceL2, 10) - require.NoError(t, err) - - var wg sync.WaitGroup - - for n := 0; n < 4; n++ { - - wg.Add(1) - go func() { - defer wg.Done() - for i := 0; i < 1000; i++ { - _, _, err := Search(centers, queries, limit, cuvs.DistanceL2) - require.NoError(t, err) - - /* - keys_i64, ok := keys.([]int64) - require.Equal(t, ok, true) - - for j, key := range keys_i64 { - require.Equal(t, key, int64(j)) - require.Equal(t, distances[j], float64(0)) - } - */ - // fmt.Printf("keys %v, dist %v\n", keys, distances) - } - }() - } - - wg.Wait() + wg.Wait() + }() + wg1.Wait() } diff --git a/pkg/vectorindex/metric/cpu.go b/pkg/vectorindex/metric/cpu.go new file mode 100644 index 0000000000000..716092f44c349 --- /dev/null +++ b/pkg/vectorindex/metric/cpu.go @@ -0,0 +1,30 @@ +//go:build !gpu + +// Copyright 2022 
Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "github.com/matrixorigin/matrixone/pkg/container/types" +) + +func PairWiseDistance[T types.RealNumbers]( + x [][]T, + y [][]T, + metric MetricType, + _ int, +) ([]float32, error) { + return GoPairWiseDistance(x, y, metric) +} diff --git a/pkg/vectorindex/metric/distance_func.go b/pkg/vectorindex/metric/distance_func.go index cf8ffae96fb22..370c5cc80b61d 100644 --- a/pkg/vectorindex/metric/distance_func.go +++ b/pkg/vectorindex/metric/distance_func.go @@ -522,3 +522,35 @@ func ResolveDistanceFn[T types.RealNumbers](metric MetricType) (DistanceFunction } return distanceFunction, nil } + +func GoPairWiseDistance[T types.RealNumbers]( + x [][]T, + y [][]T, + metric MetricType, +) ([]float32, error) { + distFn, err := ResolveDistanceFn[T](metric) + if err != nil { + return nil, err + } + + nX := len(x) + nY := len(y) + res := make([]float32, nX*nY) + for i := 0; i < nX; i++ { + for j := 0; j < nY; j++ { + d, err := distFn(x[i], y[j]) + if err != nil { + return nil, err + } + res[i*nY+j] = float32(d) + } + } + + if metric == Metric_L2Distance { + for i := range res { + res[i] = float32(math.Sqrt(float64(res[i]))) + } + } + + return res, nil +} diff --git a/pkg/vectorindex/metric/gpu.go b/pkg/vectorindex/metric/gpu.go index d0ad025c1f3f0..9d8365d92049f 100644 --- a/pkg/vectorindex/metric/gpu.go +++ b/pkg/vectorindex/metric/gpu.go @@ -17,15 +17,85 @@ package metric import 
( - cuvs "github.com/rapidsai/cuvs/go" + "math" + + "github.com/matrixorigin/matrixone/pkg/common/malloc" + "github.com/matrixorigin/matrixone/pkg/common/util" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" ) var ( - MetricTypeToCuvsMetric = map[MetricType]cuvs.Distance{ - Metric_L2sqDistance: cuvs.DistanceSQEuclidean, - Metric_L2Distance: cuvs.DistanceSQEuclidean, - Metric_InnerProduct: cuvs.DistanceInnerProduct, - Metric_CosineDistance: cuvs.DistanceCosine, - Metric_L1Distance: cuvs.DistanceL1, + MetricTypeToCuvsMetric = map[MetricType]cuvs.DistanceType{ + Metric_L2sqDistance: cuvs.L2Expanded, + Metric_L2Distance: cuvs.L2Expanded, + Metric_InnerProduct: cuvs.InnerProduct, + Metric_CosineDistance: cuvs.CosineExpanded, + Metric_L1Distance: cuvs.L1, } ) + +func PairWiseDistance[T types.RealNumbers]( + x [][]T, + y [][]T, + metric MetricType, + deviceID int, +) ([]float32, error) { + nX := len(x) + nY := len(y) + if nX == 0 || nY == 0 { + return nil, nil + } + dim := len(x[0]) + + cuvsMetric, ok := MetricTypeToCuvsMetric[metric] + if !ok || nX*nY*dim < 40000*1024 { + return GoPairWiseDistance(x, y, metric) + } + + // T must be float32 for cuvs.PairwiseDistance as per VectorType constraint + // RealNumbers only includes float32/float64. cuvs.VectorType includes float32, Float16, int8, uint8. + // For now we only support float32 on GPU via this interface if T is float32. 
+ var zero T + if any(zero).(interface{}) == any(float32(0)).(interface{}) { + allocator := malloc.NewCAllocator() + + xf32Slice, xDeallocator, err := allocator.Allocate(uint64(nX*dim*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer xDeallocator.Deallocate() + xf32 := util.UnsafeSliceCast[float32](xf32Slice) + for i, v := range x { + copy(xf32[i*dim:(i+1)*dim], any(v).([]float32)) + } + + yf32Slice, yDeallocator, err := allocator.Allocate(uint64(nY*dim*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer yDeallocator.Deallocate() + yf32 := util.UnsafeSliceCast[float32](yf32Slice) + for i, v := range y { + copy(yf32[i*dim:(i+1)*dim], any(v).([]float32)) + } + + res, err := cuvs.PairwiseDistance(xf32, uint64(nX), yf32, uint64(nY), uint32(dim), cuvsMetric, deviceID) + if err != nil { + return nil, err + } + + if metric == Metric_L2Distance { + for i := range res { + res[i] = float32(math.Sqrt(float64(res[i]))) + } + } else if metric == Metric_InnerProduct { + for i := range res { + res[i] = -res[i] + } + } + return res, nil + } + + return GoPairWiseDistance(x, y, metric) +} diff --git a/pkg/vectorindex/metric/pairwise_bench_test.go b/pkg/vectorindex/metric/pairwise_bench_test.go new file mode 100644 index 0000000000000..dd91c06810df5 --- /dev/null +++ b/pkg/vectorindex/metric/pairwise_bench_test.go @@ -0,0 +1,80 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metric + +import ( + "math/rand" + "testing" +) + +func BenchmarkPairWiseDistance(b *testing.B) { + nX, nY, dim := 100, 100, 128 + x := make([][]float32, nX) + y := make([][]float32, nY) + for i := range x { + x[i] = make([]float32, dim) + for j := range x[i] { + x[i][j] = rand.Float32() + } + } + for i := range y { + y[i] = make([]float32, dim) + for j := range y[i] { + y[i][j] = rand.Float32() + } + } + + b.Run("PairWiseDistance", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = PairWiseDistance(x, y, Metric_L2sqDistance, 0) + } + }) + + b.Run("GoPairWiseDistance", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = GoPairWiseDistance(x, y, Metric_L2sqDistance) + } + }) +} + +func BenchmarkPairWiseDistanceLarge(b *testing.B) { + nX, nY, dim := 10000, 5, 1024 + x := make([][]float32, nX) + y := make([][]float32, nY) + for i := range x { + x[i] = make([]float32, dim) + for j := range x[i] { + x[i][j] = rand.Float32() + } + } + for i := range y { + y[i] = make([]float32, dim) + for j := range y[i] { + y[i][j] = rand.Float32() + } + } + + b.Run("PairWiseDistance-Large", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = PairWiseDistance(x, y, Metric_L2sqDistance, 0) + } + }) + + b.Run("GoPairWiseDistance-Large", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _ = GoPairWiseDistance(x, y, Metric_L2sqDistance) + } + }) +} diff --git a/pkg/vectorindex/metric/pairwise_test.go b/pkg/vectorindex/metric/pairwise_test.go new file mode 100644 index 0000000000000..a9487beb46f84 --- /dev/null +++ b/pkg/vectorindex/metric/pairwise_test.go @@ -0,0 +1,87 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPairWiseDistance(t *testing.T) { + nX, nY := 3, 2 + x := [][]float32{ + {1, 0, 0, 0}, + {0, 1, 0, 0}, + {0, 0, 1, 0}, + } + y := [][]float32{ + {1, 0, 0, 0}, + {0, 1, 1, 0}, + } + + metrics := []MetricType{ + Metric_L2sqDistance, + Metric_L2Distance, + Metric_InnerProduct, + Metric_CosineDistance, + Metric_L1Distance, + } + + for _, m := range metrics { + t.Run(MetricTypeToDistFuncName[m], func(t *testing.T) { + dist, err := PairWiseDistance(x, y, m, 0) + require.NoError(t, err) + require.Equal(t, nX*nY, len(dist)) + + // Verify against direct calls + distFn, err := ResolveDistanceFn[float32](m) + require.NoError(t, err) + + for i := 0; i < nX; i++ { + for j := 0; j < nY; j++ { + expected, err := distFn(x[i], y[j]) + require.NoError(t, err) + + val := dist[i*nY+j] + if m == Metric_L2Distance { + require.InDelta(t, math.Sqrt(float64(expected)), float64(val), 1e-5) + } else { + require.InDelta(t, float64(expected), float64(val), 1e-5) + } + } + } + }) + } +} + +func TestGoPairWiseDistance(t *testing.T) { + x := [][]float64{{1, 0}, {0, 1}} + y := [][]float64{{1, 0}, {1, 1}} + + dist, err := GoPairWiseDistance(x, y, Metric_L2sqDistance) + require.NoError(t, err) + require.Equal(t, 4, len(dist)) + + // (1,0) to (1,0) -> 0 + require.InDelta(t, 0.0, float64(dist[0]), 1e-5) + // (1,0) to (1,1) -> 1 + require.InDelta(t, 1.0, float64(dist[1]), 1e-5) + // (0,1) to (1,0) -> 2 + require.InDelta(t, 2.0, float64(dist[2]), 1e-5) + // (0,1) to (1,1) -> 1 + 
require.InDelta(t, 1.0, float64(dist[3]), 1e-5) +} diff --git a/pkg/vm/engine/tae/blockio/read.go b/pkg/vm/engine/tae/blockio/read.go index a0152bc9db10b..2db8f3482697a 100644 --- a/pkg/vm/engine/tae/blockio/read.go +++ b/pkg/vm/engine/tae/blockio/read.go @@ -34,7 +34,6 @@ import ( "github.com/matrixorigin/matrixone/pkg/pb/plan" "github.com/matrixorigin/matrixone/pkg/pb/timestamp" v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2" - "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vm/engine" "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/containers" @@ -394,23 +393,34 @@ func HandleOrderByLimitOnIVFFlatIndex( return nullsBm.Contains(uint64(row)) }) - searchResults := make([]vectorindex.SearchResult, 0, len(selectRows)) - switch orderByLimit.Typ { case types.T_array_float32: - distFunc, err := metric.ResolveDistanceFn[float32](orderByLimit.MetricType) + rhs := types.BytesToArray[float32](orderByLimit.NumVec) + dim := len(rhs) + if dim == 0 { + return nil, nil, moerr.NewInternalError(ctx, "empty query vector") + } + nX := len(selectRows) + if nX == 0 { + return nil, nil, nil + } + + lhs := make([][]float32, nX) + for i, row := range selectRows { + lhs[i] = types.BytesToArray[float32](vecCol.GetBytesAt(int(row))) + } + + pairwiseDists, err := metric.PairWiseDistance(lhs, [][]float32{rhs}, orderByLimit.MetricType, 0) if err != nil { return nil, nil, err } - rhs := types.BytesToArray[float32](orderByLimit.NumVec) + resIdx := 0 + sels := make([]int64, nX) + dists := make([]float64, nX) - for _, row := range selectRows { - dist, err := distFunc(types.BytesToArray[float32](vecCol.GetBytesAt(int(row))), rhs) - if err != nil { - return nil, nil, err - } - dist64 := float64(dist) + for i, row := range selectRows { + dist64 := float64(pairwiseDists[i]) if orderByLimit.LowerBoundType == plan.BoundType_INCLUSIVE { if dist64 < orderByLimit.LowerBound { @@ -442,25 
+452,50 @@ func HandleOrderByLimitOnIVFFlatIndex( heap.Push(&orderByLimit.DistHeap, dist64) } - searchResults = append(searchResults, vectorindex.SearchResult{ - Id: row, - Distance: dist64, - }) + sels[resIdx] = row + dists[resIdx] = dist64 + resIdx++ } + sels = sels[:resIdx] + dists = dists[:resIdx] + + finalIdx := 0 + for i := 0; i < len(sels); i++ { + if dists[i] <= orderByLimit.DistHeap[0] { + sels[finalIdx] = sels[i] + dists[finalIdx] = dists[i] + finalIdx++ + } + } + return sels[:finalIdx], dists[:finalIdx], nil case types.T_array_float64: - distFunc, err := metric.ResolveDistanceFn[float64](orderByLimit.MetricType) + rhs := types.BytesToArray[float64](orderByLimit.NumVec) + dim := len(rhs) + if dim == 0 { + return nil, nil, moerr.NewInternalError(ctx, "empty query vector") + } + nX := len(selectRows) + if nX == 0 { + return nil, nil, nil + } + + lhs := make([][]float64, nX) + for i, row := range selectRows { + lhs[i] = types.BytesToArray[float64](vecCol.GetBytesAt(int(row))) + } + + pairwiseDists, err := metric.PairWiseDistance(lhs, [][]float64{rhs}, orderByLimit.MetricType, 0) if err != nil { return nil, nil, err } - rhs := types.BytesToArray[float64](orderByLimit.NumVec) + resIdx := 0 + sels := make([]int64, nX) + dists := make([]float64, nX) - for _, row := range selectRows { - dist64, err := distFunc(types.BytesToArray[float64](vecCol.GetBytesAt(int(row))), rhs) - if err != nil { - return nil, nil, err - } + for i, row := range selectRows { + dist64 := float64(pairwiseDists[i]) if orderByLimit.LowerBoundType == plan.BoundType_INCLUSIVE { if dist64 < orderByLimit.LowerBound { @@ -492,28 +527,26 @@ func HandleOrderByLimitOnIVFFlatIndex( heap.Push(&orderByLimit.DistHeap, dist64) } - searchResults = append(searchResults, vectorindex.SearchResult{ - Id: row, - Distance: dist64, - }) + sels[resIdx] = row + dists[resIdx] = dist64 + resIdx++ } + sels = sels[:resIdx] + dists = dists[:resIdx] + + finalIdx := 0 + for i := 0; i < len(sels); i++ { + if dists[i] <= 
orderByLimit.DistHeap[0] { + sels[finalIdx] = sels[i] + dists[finalIdx] = dists[i] + finalIdx++ + } + } + return sels[:finalIdx], dists[:finalIdx], nil default: return nil, nil, moerr.NewInternalError(ctx, fmt.Sprintf("only support float32/float64 type for topn: %s", orderByLimit.Typ)) } - - searchResults = slices.DeleteFunc(searchResults, func(res vectorindex.SearchResult) bool { - return res.Distance > orderByLimit.DistHeap[0] - }) - - sels := make([]int64, len(searchResults)) - dists := make([]float64, len(searchResults)) - for i, res := range searchResults { - sels[i] = res.Id - dists[i] = res.Distance - } - - return sels, dists, nil } func fillOutputBatchBySelectedRows( diff --git a/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result b/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result index 256e4dcea08e2..3e4b3fe0183a5 100644 --- a/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result +++ b/test/distributed/cases/vector/vector_ivfflat_null_entry_panic_minimal.result @@ -58,7 +58,7 @@ set @q_sql = concat( prepare p_q from @q_sql; execute p_q; ➤ __mo_index_pri_col[12,-1,0] ¦ d[8,54,0] 𝄀 -r_1 ¦ 0.64000004529953 +r_1 ¦ 0.800000011920929 deallocate prepare p_q; DROP TABLE IF EXISTS t1; DROP DATABASE vec_null_panic_db;