Implement rangeQuery for VecSimTieredIndex - [MOD-5164] (#360)

GuyAv46 · web-flow · commit 3a49c70e5132 · 2023-05-07T13:49:07.000+03:00
* implemented `rangeQuery` for VecSimTieredIndex,
... including needed utility functions

* renaming `merge_results.h` to `query_result_utils.h` and moving `filter_results` to it

* fix build

* first test and some fixes

* improved test and added a parallel test

* fix a bug where we safely read the entry point (via `safeGetEntryPointCopy`)
but then read the max level separately, so a search could pair the old
entry point with the new max level; both are now read under the same lock

* fix tests

* Update comments

* review fixes

* after rebase fixes

* added a general comment on tiered index's guarantees
diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h
@@ -231,7 +231,7 @@ class HNSWIndex : public VecSimIndexAbstract<DistType>,
     // (this option is used currently for tests).
     virtual inline bool safeCheckIfLabelExistsInIndex(labelType label,
                                                       bool also_done_processing = false) const = 0;
-    inline idType safeGetEntryPointCopy() const;
+    inline auto safeGetEntryPointState() const;
     inline void lockIndexDataGuard() const;
     inline void unlockIndexDataGuard() const;
     inline void lockNodeLinks(idType node_id) const;
@@ -1901,22 +1901,22 @@ void HNSWIndex<DataType, DistType>::appendVector(const void *vector_data, const
 }
 
 template <typename DataType, typename DistType>
-idType HNSWIndex<DataType, DistType>::safeGetEntryPointCopy() const {
+auto HNSWIndex<DataType, DistType>::safeGetEntryPointState() const {
     std::shared_lock<std::shared_mutex> lock(index_data_guard_);
-    return entrypoint_node_;
+    return std::make_pair(entrypoint_node_, max_level_);
 }
 
 template <typename DataType, typename DistType>
 idType HNSWIndex<DataType, DistType>::searchBottomLayerEP(const void *query_data, void *timeoutCtx,
                                                           VecSimQueryResult_Code *rc) const {
     *rc = VecSim_QueryResult_OK;
 
-    idType curr_element = safeGetEntryPointCopy();
+    auto [curr_element, max_level] = safeGetEntryPointState();
     if (curr_element == INVALID_ID)
         return curr_element; // index is empty.
 
     DistType cur_dist = this->dist_func(query_data, getDataByInternalId(curr_element), this->dim);
-    for (size_t level = max_level_; level > 0 && curr_element != INVALID_ID; level--) {
+    for (size_t level = max_level; level > 0 && curr_element != INVALID_ID; level--) {
         greedySearchLevel<true>(query_data, level, curr_element, cur_dist, timeoutCtx, rc);
     }
     return curr_element;
@@ -2127,7 +2127,10 @@ VecSimQueryResult_List HNSWIndex<DataType, DistType>::rangeQuery(const void *que
     }
 
     idType bottom_layer_ep = searchBottomLayerEP(query_data, timeoutCtx, &rl.code);
-    if (VecSim_OK != rl.code) {
+    // Although we already checked for an empty index (cur_element_count == 0), it might be
+    // that another thread deleted all the elements or didn't finish inserting the first element
+    // yet. Either way, we observed an empty index, so we return an empty result list.
+    if (VecSim_OK != rl.code || bottom_layer_ep == INVALID_ID) {
         rl.results = array_new<VecSimQueryResult>(0);
         return rl;
     }
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h
@@ -179,10 +179,6 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
     void increaseCapacity() override {}
 
     // TODO: Implement the actual methods instead of these temporary ones.
-    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
-                                      VecSimQueryParams *queryParams) override {
-        return this->backendIndex->rangeQuery(queryBlob, radius, queryParams);
-    }
     VecSimIndexInfo info() const override { return this->backendIndex->info(); }
     VecSimInfoIterator *infoIterator() const override { return this->backendIndex->infoIterator(); }
     VecSimBatchIterator *newBatchIterator(const void *queryBlob,
@@ -766,8 +762,7 @@ double TieredHNSWIndex<DataType, DistType>::getDistanceFrom(labelType label,
 
     // Try to get the distance from the Main index.
     this->mainIndexGuard.lock_shared();
-    auto hnsw = getHNSWIndex();
-    auto hnsw_dist = hnsw->safeGetDistanceFrom(label, blob);
+    auto hnsw_dist = getHNSWIndex()->safeGetDistanceFrom(label, blob);
     this->mainIndexGuard.unlock_shared();
 
     // Return the minimum distance that is not NaN.
@@ -856,7 +851,6 @@ TieredHNSWIndex<DataType, DistType>::TieredHNSW_BatchIterator::getNextResults(
             auto tail = this->flat_iterator->getNextResults(
                 n_res - VecSimQueryResult_Len(this->flat_results), BY_SCORE_THEN_ID);
             concat_results(this->flat_results, tail);
-            VecSimQueryResult_Free(tail);
 
             if (!isMulti) {
                 // On single-value indexes, duplicates will never appear in the hnsw results before
diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h
@@ -26,6 +26,8 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_writeInPlaceMode_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_switchWriteModes_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_bufferLimit_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_bufferLimitAsync_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_RangeSearch_Test)
+INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTest_parallelRangeSearch_Test)
 
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_insertJobAsync_Test)
 INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_insertJobAsyncMulti_Test)
diff --git a/src/VecSim/utils/query_result_utils.h b/src/VecSim/utils/query_result_utils.h
@@ -33,7 +33,10 @@ template <bool withSet>
 VecSimQueryResult *merge_results(VecSimQueryResult *&first, const VecSimQueryResult *first_end,
                                  VecSimQueryResult *&second, const VecSimQueryResult *second_end,
                                  size_t limit) {
-    VecSimQueryResult *results = array_new<VecSimQueryResult>(limit);
+    // Allocate the merged results array with the minimum size needed:
+    // the minimum of the limit and the sum of the lengths of the two arrays.
+    VecSimQueryResult *results = array_new<VecSimQueryResult>(
+        std::min(limit, (size_t)(first_end - first) + (size_t)(second_end - second)));
     // Will hold the ids of the results we've already added to the merged results.
     // Will be used only if withSet is true.
     std::unordered_set<size_t> ids;
@@ -92,9 +95,54 @@ VecSimQueryResult_List merge_result_lists(VecSimQueryResult_List first,
     return mergedResults;
 }
 
+// Concatenate the results of two queries into the results of the first query, consuming the second.
 static inline void concat_results(VecSimQueryResult_List &first, VecSimQueryResult_List &second) {
     auto &dst = first.results;
     auto &src = second.results;
 
     dst = array_concat(dst, src);
+    VecSimQueryResult_Free(second);
+}
+
+// Sorts the results by id and removes duplicates in place (the results array is shrunk).
+// Assumes that a result can appear at most twice in the results list.
+// On multi-value indexes, the duplicate with the lower score is the one that is kept.
+template <bool IsMulti>
+void filter_results_by_id(VecSimQueryResult_List results) {
+    if (VecSimQueryResult_Len(results) < 2) {
+        return;
+    }
+    sort_results_by_id(results);
+
+    size_t i, cur_end;
+    for (i = 0, cur_end = 0; i < VecSimQueryResult_Len(results) - 1; i++, cur_end++) {
+        const VecSimQueryResult *cur_res = results.results + i;
+        const VecSimQueryResult *next_res = cur_res + 1;
+        if (VecSimQueryResult_GetId(cur_res) == VecSimQueryResult_GetId(next_res)) {
+            if (IsMulti) {
+                // On multi value index, scores might be different and we want to keep the lower
+                // score.
+                if (VecSimQueryResult_GetScore(cur_res) < VecSimQueryResult_GetScore(next_res)) {
+                    results.results[cur_end] = *cur_res;
+                } else {
+                    results.results[cur_end] = *next_res;
+                }
+            } else {
+                // On single value index, scores are the same so we can keep any of the results.
+                results.results[cur_end] = *cur_res;
+            }
+            // Assuming every id can appear at most twice, we can skip the next comparison between
+            // the current and the next result.
+            i++;
+        } else {
+            results.results[cur_end] = *cur_res;
+        }
+    }
+    // If the last result is unique, we need to add it to the results.
+    if (i == VecSimQueryResult_Len(results) - 1) {
+        results.results[cur_end] = results.results[i];
+        // Logically, we should increment cur_end and i here, but we don't need to because only
+        // their difference (i - cur_end) is used below.
+    }
+    array_pop_back_n(results.results, i - cur_end);
 }
diff --git a/src/VecSim/utils/vec_utils.cpp b/src/VecSim/utils/vec_utils.cpp
@@ -65,6 +65,17 @@ void sort_results_by_score_then_id(VecSimQueryResult_List rl) {
           (__compar_fn_t)cmpVecSimQueryResultByScoreThenId);
 }
 
+void sort_results(VecSimQueryResult_List rl, VecSimQueryResult_Order order) {
+    switch (order) {
+    case BY_ID:
+        return sort_results_by_id(rl);
+    case BY_SCORE:
+        return sort_results_by_score(rl);
+    case BY_SCORE_THEN_ID:
+        return sort_results_by_score_then_id(rl);
+    }
+}
+
 VecSimResolveCode validate_positive_integer_param(VecSimRawParam rawParam, long long *val) {
     char *ep; // For checking that strtoll used all rawParam.valLen chars.
     errno = 0;
diff --git a/src/VecSim/utils/vec_utils.h b/src/VecSim/utils/vec_utils.h
@@ -83,6 +83,8 @@ void sort_results_by_score(VecSimQueryResult_List results);
 
 void sort_results_by_score_then_id(VecSimQueryResult_List results);
 
+void sort_results(VecSimQueryResult_List results, VecSimQueryResult_Order order);
+
 VecSimResolveCode validate_positive_integer_param(VecSimRawParam rawParam, long long *val);
 
 VecSimResolveCode validate_positive_double_param(VecSimRawParam rawParam, double *val);
diff --git a/src/VecSim/vec_sim.cpp b/src/VecSim/vec_sim.cpp
@@ -221,14 +221,7 @@ extern "C" VecSimQueryResult_List VecSimIndex_RangeQuery(VecSimIndex *index, con
     if (radius < 0) {
         throw std::runtime_error("radius must be non-negative");
     }
-    VecSimQueryResult_List results = index->rangeQueryWrapper(queryBlob, radius, queryParams);
-
-    if (order == BY_SCORE) {
-        sort_results_by_score(results);
-    } else {
-        sort_results_by_id(results);
-    }
-    return results;
+    return index->rangeQueryWrapper(queryBlob, radius, queryParams, order);
 }
 
 extern "C" void VecSimIndex_Free(VecSimIndex *index) {
diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h
@@ -88,6 +88,16 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
     inline VecSimMetric getMetric() const { return metric; }
     inline size_t getDataSize() const { return data_size; }
 
+    virtual VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                              VecSimQueryParams *queryParams) = 0;
+    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                      VecSimQueryParams *queryParams,
+                                      VecSimQueryResult_Order order) override {
+        auto results = rangeQuery(queryBlob, radius, queryParams);
+        sort_results(results, order);
+        return results;
+    }
+
     void log(const char *fmt, ...) const {
         if (VecSimIndexInterface::logCallback) {
             // Format the message and call the callback
@@ -136,11 +146,12 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
     }
 
     virtual VecSimQueryResult_List rangeQueryWrapper(const void *queryBlob, double radius,
-                                                     VecSimQueryParams *queryParams) override {
+                                                     VecSimQueryParams *queryParams,
+                                                     VecSimQueryResult_Order order) override {
         char processed_blob[this->data_size];
         const void *query_to_send = processBlob(queryBlob, processed_blob);
 
-        return this->rangeQuery(query_to_send, radius, queryParams);
+        return this->rangeQuery(query_to_send, radius, queryParams, order);
     }
 
     virtual VecSimBatchIterator *
diff --git a/src/VecSim/vec_sim_interface.h b/src/VecSim/vec_sim_interface.h
@@ -138,7 +138,8 @@ struct VecSimIndexInterface : public VecsimBaseObject {
      * blob.
      */
     virtual VecSimQueryResult_List rangeQueryWrapper(const void *queryBlob, double radius,
-                                                     VecSimQueryParams *queryParams) = 0;
+                                                     VecSimQueryParams *queryParams,
+                                                     VecSimQueryResult_Order order) = 0;
     /**
      * @brief Search for the vectors that are in a given range in the index with respect to a given
      * vector. The results can be ordered by their score or id.
@@ -153,7 +154,9 @@ struct VecSimIndexInterface : public VecsimBaseObject {
      * VecSimQueryResult_Iterator.
      */
     virtual VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
-                                              VecSimQueryParams *queryParams) = 0;
+                                              VecSimQueryParams *queryParams,
+                                              VecSimQueryResult_Order order) = 0;
+
     /**
      * @brief Return index information.
      *
diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h
@@ -3,7 +3,7 @@
 #include "vec_sim_index.h"
 #include "algorithms/brute_force/brute_force.h"
 #include "VecSim/batch_iterator.h"
-#include "VecSim/utils/merge_results.h"
+#include "VecSim/utils/query_result_utils.h"
 
 #include <shared_mutex>
 
@@ -20,6 +20,9 @@ struct AsyncJob : public VecsimBaseObject {
         : VecsimBaseObject(allocator), jobType(type), Execute(callback), index(index_ref) {}
 };
 
+// All read operations (including KNN, range, batch iterators and get-distance-from) are guaranteed
+// to consider all vectors that were added to the index before the query was submitted. The results
+// may include vectors that were added after the query was submitted, with no guarantees.
 template <typename DataType, typename DistType>
 class VecSimTieredIndex : public VecSimIndexInterface {
 protected:
@@ -62,6 +65,9 @@ class VecSimTieredIndex : public VecSimIndexInterface {
 
     VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
                                      VecSimQueryParams *queryParams) override;
+    VecSimQueryResult_List rangeQuery(const void *queryBlob, double radius,
+                                      VecSimQueryParams *queryParams,
+                                      VecSimQueryResult_Order order) override;
 
     // Return the current state of the global write mode (async/in-place).
     static VecSimWriteMode getWriteMode() { return VecSimIndexInterface::asyncWriteMode; }
@@ -83,12 +89,13 @@ class VecSimTieredIndex : public VecSimIndexInterface {
     }
 
     virtual VecSimQueryResult_List rangeQueryWrapper(const void *queryBlob, double radius,
-                                                     VecSimQueryParams *queryParams) override {
+                                                     VecSimQueryParams *queryParams,
+                                                     VecSimQueryResult_Order order) override {
         // Will be used only if a processing stage is needed
         char processed_blob[this->backendIndex->getDataSize()];
         const void *query_to_send = this->backendIndex->processBlob(queryBlob, processed_blob);
 
-        return this->rangeQuery(query_to_send, radius, queryParams);
+        return this->rangeQuery(query_to_send, radius, queryParams, order);
     }
 
     virtual VecSimBatchIterator *
@@ -151,3 +158,74 @@ VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k
         }
     }
 }
+
+template <typename DataType, typename DistType>
+VecSimQueryResult_List
+VecSimTieredIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double radius,
+                                                  VecSimQueryParams *queryParams,
+                                                  VecSimQueryResult_Order order) {
+    this->flatIndexGuard.lock_shared();
+
+    // If the flat buffer is empty, we can simply query the main index.
+    if (this->frontendIndex->indexSize() == 0) {
+        // Release the flat lock and acquire the main lock.
+        this->flatIndexGuard.unlock_shared();
+
+        // Simply query the main index and return the results while holding the lock.
+        this->mainIndexGuard.lock_shared();
+        auto res = this->backendIndex->rangeQuery(queryBlob, radius, queryParams);
+        this->mainIndexGuard.unlock_shared();
+
+        // We could have passed the order to the main index, but we can sort them here after
+        // unlocking it instead.
+        sort_results(res, order);
+        return res;
+    } else {
+        // No luck... first query the flat buffer and release the lock.
+        auto flat_results = this->frontendIndex->rangeQuery(queryBlob, radius, queryParams);
+        this->flatIndexGuard.unlock_shared();
+
+        // If the query failed (currently only on timeout), return the error code and the partial
+        // results.
+        if (flat_results.code != VecSim_QueryResult_OK) {
+            return flat_results;
+        }
+
+        // Lock the main index and query it.
+        this->mainIndexGuard.lock_shared();
+        auto main_results = this->backendIndex->rangeQuery(queryBlob, radius, queryParams);
+        this->mainIndexGuard.unlock_shared();
+
+        // Merge the results and return, avoiding duplicates.
+        // At this point, the return code of the FLAT index is OK, and the return code of the MAIN
+        // index is either OK or TIMEOUT. Make sure to return the return code of the MAIN index.
+        if (BY_SCORE == order) {
+            sort_results_by_score_then_id(main_results);
+            sort_results_by_score_then_id(flat_results);
+
+            // Keep the return code of the main index.
+            auto code = main_results.code;
+
+            // Merge the sorted results with no limit (all the results are valid).
+            VecSimQueryResult_List ret;
+            if (this->backendIndex->isMultiValue()) {
+                ret = merge_result_lists<true>(main_results, flat_results, -1);
+            } else {
+                ret = merge_result_lists<false>(main_results, flat_results, -1);
+            }
+            // Restore the return code and return.
+            ret.code = code;
+            return ret;
+
+        } else { // BY_ID
+            // Notice that we don't modify the return code of the main index in any step.
+            concat_results(main_results, flat_results);
+            if (this->backendIndex->isMultiValue()) {
+                filter_results_by_id<true>(main_results);
+            } else {
+                filter_results_by_id<false>(main_results);
+            }
+            return main_results;
+        }
+    }
+}
diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp
diff --git a/tests/unit/test_utils.cpp b/tests/unit/test_utils.cpp