
Commit bfead9b

flow tests of tiered with dbpedia to run bm (including flat buffer limit)

1 parent: 9abd6b4

File tree: 4 files changed, +578 −0 lines

src/python_bindings/bindings.cpp

Lines changed: 117 additions & 0 deletions
@@ -16,8 +16,10 @@
 #include <thread>
 #include <VecSim/algorithms/hnsw/hnsw_single.h>
 #include <VecSim/algorithms/brute_force/brute_force_single.h>
+#include "tiered_index_mock.h"

 namespace py = pybind11;
+using namespace tiered_index_mock;

 // Helper function that iterates query results and wraps them in a python numpy object -
 // a tuple of two 2D arrays: (labels, distances)
@@ -174,6 +176,13 @@ class PyVecSimIndex {

     size_t indexSize() { return VecSimIndex_IndexSize(index.get()); }

+    size_t indexMemory() { return this->index->getAllocationSize(); }
+
+    double getGetDistanceFrom(size_t id, const py::object &input) {
+        py::array query(input);
+        return this->index->getDistanceFrom(id, (const char *)query.data(0));
+    }
+
     PyBatchIterator createBatchIterator(const py::object &input, VecSimQueryParams *query_params) {
         py::array query(input);
         return PyBatchIterator(
@@ -360,6 +369,94 @@ class PyHNSWLibIndex : public PyVecSimIndex {
     }
 };

+class PyTIEREDIndex : public PyVecSimIndex {
+
+protected:
+    JobQueue jobQueue;          // External queue that holds the jobs.
+    IndexExtCtx jobQueueCtx;    // External context to be sent to the submit callback.
+    SubmitCB submitCb;          // A callback that submits an array of jobs into a given jobQueue.
+    size_t memoryCtx;           // External context that stores the index memory consumption.
+    UpdateMemoryCB UpdateMemCb; // A callback that updates the memoryCtx
+                                // with a given memory (number).
+    size_t flatBufferLimit;     // Maximum size allowed for the flat buffer. If the flat buffer is
+                                // full, insertion is done in-place.
+    bool run_thread;
+    std::bitset<MAX_POOL_SIZE> executions_status;
+
+    TieredIndexParams TieredIndexParams_Init() {
+        TieredIndexParams ret = {
+            .jobQueue = &this->jobQueue,
+            .jobQueueCtx = &this->jobQueueCtx,
+            .submitCb = this->submitCb,
+            .memoryCtx = &this->memoryCtx,
+            .UpdateMemCb = this->UpdateMemCb,
+            .flatBufferLimit = this->flatBufferLimit,
+        };
+        return ret;
+    }
+
+public:
+    explicit PyTIEREDIndex(size_t BufferLimit = 20000000)
+        : submitCb(submit_callback), memoryCtx(0), UpdateMemCb(update_mem_callback),
+          flatBufferLimit(BufferLimit), run_thread(true) {
+
+        for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
+            ThreadParams params(run_thread, executions_status, i, jobQueue);
+            thread_pool.emplace_back(thread_main_loop, params);
+        }
+    }
+
+    virtual ~PyTIEREDIndex() = 0;
+
+    void WaitForIndex(size_t waiting_duration = 10) {
+        bool keep_waiting = true;
+        while (keep_waiting) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(waiting_duration));
+            std::unique_lock<std::mutex> lock(queue_guard);
+            if (jobQueue.empty()) {
+                // The queue is drained; wait until the in-flight jobs finish as well.
+                while (true) {
+                    if (executions_status.count() == 0) {
+                        keep_waiting = false;
+                        break;
+                    }
+                    std::this_thread::sleep_for(std::chrono::milliseconds(waiting_duration));
+                }
+            }
+        }
+    }
+
+    static size_t GetThreadsNum() { return THREAD_POOL_SIZE; }
+
+    size_t getBufferLimit() { return flatBufferLimit; }
+};
+
+PyTIEREDIndex::~PyTIEREDIndex() { thread_pool_terminate(jobQueue, run_thread); }
+
+class PyTIERED_HNSWIndex : public PyTIEREDIndex {
+public:
+    explicit PyTIERED_HNSWIndex(const HNSWParams &hnsw_params,
+                                const TieredHNSWParams &tiered_hnsw_params) {
+
+        // Create primaryIndexParams and specific params for the hnsw tiered index.
+        VecSimParams primary_index_params = {.algo = VecSimAlgo_HNSWLIB, .hnswParams = hnsw_params};
+
+        // Create the TieredIndexParams.
+        TieredIndexParams tiered_params = TieredIndexParams_Init();
+        tiered_params.primaryIndexParams = &primary_index_params;
+        tiered_params.specificParams.tieredHnswParams = tiered_hnsw_params;
+
+        // Create the VecSimParams that wrap the TieredIndexParams.
+        VecSimParams params = {.algo = VecSimAlgo_TIERED, .tieredParams = tiered_params};
+
+        this->index = std::shared_ptr<VecSimIndex>(VecSimIndex_New(&params), VecSimIndex_Free);
+        // Set the created tiered index in the index external context.
+        this->jobQueueCtx.index_strong_ref = this->index;
+    }
+
+    size_t HNSWLabelCount() { return this->index->info().hnswInfo.indexLabelCount; }
+};
+
 class PyBFIndex : public PyVecSimIndex {
 public:
     explicit PyBFIndex(const BFParams &bf_params) {
@@ -413,6 +510,10 @@ PYBIND11_MODULE(VecSim, m) {
         .def_readwrite("initialCapacity", &BFParams::initialCapacity)
         .def_readwrite("blockSize", &BFParams::blockSize);

+    py::class_<TieredHNSWParams>(m, "TieredHNSWParams")
+        .def(py::init())
+        .def_readwrite("swapJobThreshold", &TieredHNSWParams::swapJobThreshold);
+
     py::class_<VecSimParams>(m, "VecSimParams")
         .def(py::init())
         .def_readwrite("algo", &VecSimParams::algo)
@@ -439,8 +540,11 @@ PYBIND11_MODULE(VecSim, m) {
         .def("range_query", &PyVecSimIndex::range, py::arg("vector"), py::arg("radius"),
              py::arg("query_param") = nullptr)
         .def("index_size", &PyVecSimIndex::indexSize)
+        .def("index_memory", &PyVecSimIndex::indexMemory)
         .def("create_batch_iterator", &PyVecSimIndex::createBatchIterator, py::arg("query_blob"),
              py::arg("query_param") = nullptr)
+        .def("get_distance_from", &PyVecSimIndex::getGetDistanceFrom, py::arg("label"),
+             py::arg("blob"))
         .def("get_vector", &PyVecSimIndex::getVector);

     py::class_<PyHNSWLibIndex, PyVecSimIndex>(m, "HNSWIndex")
@@ -460,6 +564,19 @@ PYBIND11_MODULE(VecSim, m) {
         .def("range_parallel", &PyHNSWLibIndex::searchRangeParallel, py::arg("queries"),
              py::arg("radius"), py::arg("query_param") = nullptr, py::arg("num_threads") = -1);

+    py::class_<PyTIEREDIndex, PyVecSimIndex>(m, "TIEREDIndex")
+        .def("wait_for_index", &PyTIERED_HNSWIndex::WaitForIndex, py::arg("waiting_duration") = 10)
+        .def("get_buffer_limit", &PyTIERED_HNSWIndex::getBufferLimit)
+        .def_static("get_threads_num", &PyTIEREDIndex::GetThreadsNum);
+
+    py::class_<PyTIERED_HNSWIndex, PyTIEREDIndex>(m, "TIERED_HNSWIndex")
+        .def(
+            py::init([](const HNSWParams &hnsw_params, const TieredHNSWParams &tiered_hnsw_params) {
+                return new PyTIERED_HNSWIndex(hnsw_params, tiered_hnsw_params);
+            }),
+            py::arg("hnsw_params"), py::arg("tiered_hnsw_params"))
+        .def("hnsw_label_count", &PyTIERED_HNSWIndex::HNSWLabelCount);
+
     py::class_<PyBFIndex, PyVecSimIndex>(m, "BFIndex")
         .def(py::init([](const BFParams &params) { return new PyBFIndex(params); }),
             py::arg("params"));
src/python_bindings/tiered_index_mock.h (new file)

Lines changed: 152 additions & 0 deletions
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#pragma once

#include <thread>
#include <condition_variable>
#include <bitset>
#include <queue>
#include <atomic>
#include <iostream>

#include "VecSim/vec_sim.h"
#include "VecSim/algorithms/hnsw/hnsw_tiered.h"
#include "pybind11/pybind11.h"

namespace tiered_index_mock {

typedef struct RefManagedJob {
    AsyncJob *job;
    std::weak_ptr<VecSimIndex> index_weak_ref;
} RefManagedJob;

struct SearchJobMock : public AsyncJob {
    void *query; // The query vector. Ownership is passed to the job in the constructor.
    size_t k;    // The number of results to return.
    size_t n;    // The number of vectors in the index (might be useful for the mock).
    size_t dim;  // The dimension of the vectors in the index (might be useful for the mock).
    std::atomic_int &successful_searches; // A reference to a shared counter that counts the
                                          // number of successful searches.
    SearchJobMock(std::shared_ptr<VecSimAllocator> allocator, JobCallback searchCB,
                  VecSimIndex *index_, void *query_, size_t k_, size_t n_, size_t dim_,
                  std::atomic_int &successful_searches_)
        : AsyncJob(allocator, HNSW_SEARCH_JOB, searchCB, index_), query(query_), k(k_), n(n_),
          dim(dim_), successful_searches(successful_searches_) {}
    ~SearchJobMock() { this->allocator->free_allocation(query); }
};

using JobQueue = std::queue<RefManagedJob>;
int submit_callback(void *job_queue, AsyncJob **jobs, size_t len, void *index_ctx);
int update_mem_callback(void *mem_ctx, size_t mem);

typedef struct IndexExtCtx {
    std::shared_ptr<VecSimIndex> index_strong_ref;
    ~IndexExtCtx() { std::cout << "ctx dtor" << std::endl; }
} IndexExtCtx;

static const size_t MAX_POOL_SIZE = 16;
static const size_t THREAD_POOL_SIZE = MIN(MAX_POOL_SIZE, std::thread::hardware_concurrency());
extern std::vector<std::thread> thread_pool;
extern std::mutex queue_guard;
extern std::condition_variable queue_cond;

void thread_pool_terminate(JobQueue &jobQ, bool &run_thread);

class ThreadParams {
public:
    bool &run_thread;
    std::bitset<MAX_POOL_SIZE> &executions_status;
    const unsigned int thread_index;
    JobQueue &jobQ;
    ThreadParams(bool &run_thread, std::bitset<MAX_POOL_SIZE> &executions_status,
                 const unsigned int thread_index, JobQueue &jobQ)
        : run_thread(run_thread), executions_status(executions_status), thread_index(thread_index),
          jobQ(jobQ) {}

    ThreadParams(const ThreadParams &other) = default;
};

inline void MarkExecuteInProcess(std::bitset<MAX_POOL_SIZE> &executions_status,
                                 size_t thread_index) {
    executions_status.set(thread_index);
}

inline void MarkExecuteDone(std::bitset<MAX_POOL_SIZE> &executions_status, size_t thread_index) {
    executions_status.reset(thread_index);
}

// Main loop for the background worker threads that execute jobs from the job queue.
// run_thread is used as a signal that tells each thread whether it should keep running
// or stop and terminate.
void thread_main_loop(ThreadParams params) {
    while (params.run_thread) {
        std::unique_lock<std::mutex> lock(queue_guard);
        // Wake up and acquire the lock (atomically) ONLY if the job queue is not empty at that
        // point, or if the thread should not run anymore (and quit in that case).
        queue_cond.wait(lock, [&params]() { return !(params.jobQ.empty()) || !params.run_thread; });
        if (!params.run_thread)
            return;
        auto managed_job = params.jobQ.front();
        MarkExecuteInProcess(params.executions_status, params.thread_index);
        params.jobQ.pop();

        lock.unlock();
        // Upgrade the index weak reference to a strong ref while we run the job over the index.
        if (auto temp_ref = managed_job.index_weak_ref.lock()) {
            managed_job.job->Execute(managed_job.job);
            MarkExecuteDone(params.executions_status, params.thread_index);
        }
    }
}

/*
 * Mock callbacks for testing the async tiered index. We use a simple std::queue to simulate
 * the job queue.
 */

std::mutex queue_guard;
std::condition_variable queue_cond;
std::vector<std::thread> thread_pool;

int submit_callback(void *job_queue, AsyncJob **jobs, size_t len, void *index_ctx) {
    {
        std::unique_lock<std::mutex> lock(queue_guard);
        for (size_t i = 0; i < len; i++) {
            // Wrap the job with a struct that contains a weak reference to the related index.
            auto owned_job = RefManagedJob{
                .job = jobs[i],
                .index_weak_ref = reinterpret_cast<IndexExtCtx *>(index_ctx)->index_strong_ref};
            static_cast<JobQueue *>(job_queue)->push(owned_job);
        }
    }
    if (len == 1) {
        queue_cond.notify_one();
    } else {
        queue_cond.notify_all();
    }
    return VecSim_OK;
}

int update_mem_callback(void *mem_ctx, size_t mem) {
    *(size_t *)mem_ctx = mem;
    return VecSim_OK;
}

void thread_pool_terminate(JobQueue &jobQ, bool &run_thread) {
    // Check every 10 ms whether the queue is empty, and if so, terminate the threads' loops.
    while (true) {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        std::unique_lock<std::mutex> lock(queue_guard);
        if (jobQ.empty()) {
            run_thread = false;
            queue_cond.notify_all();
            break;
        }
    }
    for (size_t i = 0; i < THREAD_POOL_SIZE; i++) {
        thread_pool[i].join();
    }
    thread_pool.clear();
}
} // namespace tiered_index_mock
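
The synchronization contract this mock gives the tests: submit_callback only enqueues jobs and wakes workers, each worker sets its executions_status bit while a job is in flight, and WaitForIndex returns only once the queue is empty and every bit is clear. A hedged sketch of what a flow test can therefore rely on (index and data as in the previous snippet):

from VecSim import TIEREDIndex

n_threads = TIEREDIndex.get_threads_num()  # THREAD_POOL_SIZE above
assert n_threads <= 16                     # MIN(MAX_POOL_SIZE, hardware_concurrency())

index.wait_for_index(waiting_duration=5)   # poll the queue every 5 ms
# After this point no job is queued or in flight, so the counters are stable.
assert index.hnsw_label_count() == n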

tests/flow/common.py

Lines changed: 18 additions & 0 deletions
@@ -7,7 +7,23 @@
 from scipy import spatial
 from numpy.testing import assert_allclose
 import time
+import math

+# Helper function for creating HNSW params; uses the default HNSW parameters if not specified.
+def create_hnsw_params(dim, num_elements, metric, data_type, ef_construction=200, m=16,
+                       ef_runtime=10, epsilon=0.01, is_multi=False):
+    hnsw_params = HNSWParams()
+
+    hnsw_params.dim = dim
+    hnsw_params.metric = metric
+    hnsw_params.type = data_type
+    hnsw_params.M = m
+    hnsw_params.efConstruction = ef_construction
+    hnsw_params.initialCapacity = num_elements
+    hnsw_params.efRuntime = ef_runtime
+    hnsw_params.epsilon = epsilon
+    hnsw_params.multi = is_multi
+
+    return hnsw_params
 # Helper function for creating an index; uses the default HNSW parameters if not specified.
 def create_hnsw_index(dim, num_elements, metric, data_type, ef_construction=200, m=16, ef_runtime=10, epsilon=0.01,
                       is_multi=False):
@@ -24,3 +40,5 @@ def create_hnsw_index(dim, num_elements, metric, data_type, ef_construction=200,
     hnsw_params.multi = is_multi

     return HNSWIndex(hnsw_params)
+
+
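Splitting the parameter construction out of create_hnsw_index lets the tiered flow tests reuse the same defaults while building the index through the new TIERED_HNSWIndex binding. A sketch of the intended call pattern (constants and imports as in the existing tests):

from common import create_hnsw_params, create_hnsw_index

hnsw_params = create_hnsw_params(dim=128, num_elements=100000,
                                 metric=VecSimMetric_Cosine,
                                 data_type=VecSimType_FLOAT32)
tiered_index = TIERED_HNSWIndex(hnsw_params, TieredHNSWParams())

# The original helper still covers the non-tiered case:
hnsw_index = create_hnsw_index(128, 100000, VecSimMetric_Cosine, VecSimType_FLOAT32)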