Skip to content

Commit c0497c0

Browse files
authored
Tiered HNSW - preferAdHocSearch fix (#368)
* fixed a bug in preferAdHocSearch of the tiered index * added a test * made it possible to query the preference with a subset size larger than the index size * fixed tests * review fixes
1 parent 2e0098b commit c0497c0

File tree

9 files changed

+65
-33
lines changed

9 files changed

+65
-33
lines changed

src/VecSim/algorithms/brute_force/brute_force.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -425,9 +425,9 @@ bool BruteForceIndex<DataType, DistType>::preferAdHocSearch(size_t subsetSize, s
425425
// This heuristic is based on sklearn decision tree classifier (with 10 leaves nodes) -
426426
// see scripts/BF_batches_clf.py
427427
size_t index_size = this->indexSize();
428-
if (subsetSize > index_size) {
429-
throw std::runtime_error("internal error: subset size cannot be larger than index size");
430-
}
428+
// Referring to too large subset size as if it was the maximum possible size.
429+
subsetSize = std::min(subsetSize, index_size);
430+
431431
size_t d = this->dim;
432432
float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount();
433433
bool res;

src/VecSim/algorithms/hnsw/hnsw.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2249,9 +2249,9 @@ bool HNSWIndex<DataType, DistType>::preferAdHocSearch(size_t subsetSize, size_t
22492249
// This heuristic is based on sklearn decision tree classifier (with 20 leaves nodes) -
22502250
// see scripts/HNSW_batches_clf.py
22512251
size_t index_size = this->indexSize();
2252-
if (subsetSize > index_size) {
2253-
throw std::runtime_error("internal error: subset size cannot be larger than index size");
2254-
}
2252+
// Referring to too large subset size as if it was the maximum possible size.
2253+
subsetSize = std::min(subsetSize, index_size);
2254+
22552255
size_t d = this->dim;
22562256
size_t M = this->getM();
22572257
float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount();

src/VecSim/algorithms/hnsw/hnsw_tiered.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,6 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
189189
return new (this->allocator)
190190
TieredHNSW_BatchIterator(queryBlobCopy, this, queryParams, this->allocator);
191191
}
192-
bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override {
193-
// For now, delegate the decision to whichever tier reports the larger indexSize(): the backend if it is strictly larger, otherwise (including ties) the frontend.
194-
return this->backendIndex->indexSize() > this->frontendIndex->indexSize()
195-
? this->backendIndex->preferAdHocSearch(subsetSize, k, initial_check)
196-
: this->frontendIndex->preferAdHocSearch(subsetSize, k, initial_check);
197-
}
198192
inline void setLastSearchMode(VecSearchMode mode) override {
199193
return this->backendIndex->setLastSearchMode(mode); // Forwarded to the backend index only.
200194
}

src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,4 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMulti_Test)
4444
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMultiFromFlatAdvanced_Test)
4545
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_overwriteVectorBasic_Test)
4646
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_overwriteVectorAsync_Test)
47+
INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_preferAdHocOptimization_Test)

src/VecSim/vec_sim_tiered_index.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,13 @@ class VecSimTieredIndex : public VecSimIndexInterface {
6969
VecSimQueryParams *queryParams,
7070
VecSimQueryResult_Order order) override;
7171

72+
bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override {
73+
// For now, delegate the decision to whichever tier reports the larger indexSize(): the backend if it is strictly larger, otherwise (including ties) the frontend.
74+
return this->backendIndex->indexSize() > this->frontendIndex->indexSize()
75+
? this->backendIndex->preferAdHocSearch(subsetSize, k, initial_check)
76+
: this->frontendIndex->preferAdHocSearch(subsetSize, k, initial_check);
77+
}
78+
7279
// Return the current global write mode (async/in-place), as stored in VecSimIndexInterface::asyncWriteMode.
7380
static VecSimWriteMode getWriteMode() { return VecSimIndexInterface::asyncWriteMode; }
7481

tests/unit/test_bruteforce.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,13 +1246,9 @@ TYPED_TEST(BruteForceTest, preferAdHocOptimization) {
12461246
ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
12471247

12481248
// Corner cases - subset size is greater than index size.
1249-
try {
1250-
VecSimIndex_PreferAdHocSearch(index, 1, 50, true);
1251-
FAIL() << "Expected std::runtime error";
1252-
} catch (std::runtime_error const &err) {
1253-
EXPECT_EQ(err.what(),
1254-
std::string("internal error: subset size cannot be larger than index size"));
1255-
}
1249+
ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
1250+
VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
1251+
12561252
VecSimIndex_Free(index);
12571253
}
12581254

tests/unit/test_hnsw.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1503,13 +1503,9 @@ TYPED_TEST(HNSWTest, preferAdHocOptimization) {
15031503
ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
15041504

15051505
// Corner cases - subset size is greater than index size.
1506-
try {
1507-
VecSimIndex_PreferAdHocSearch(index, 1, 50, true);
1508-
FAIL() << "Expected std::runtime error";
1509-
} catch (std::runtime_error const &err) {
1510-
EXPECT_EQ(err.what(),
1511-
std::string("internal error: subset size cannot be larger than index size"));
1512-
}
1506+
ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
1507+
VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
1508+
15131509
VecSimIndex_Free(index);
15141510
}
15151511

tests/unit/test_hnsw_multi.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -627,13 +627,9 @@ TYPED_TEST(HNSWMultiTest, preferAdHocOptimization) {
627627
ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
628628

629629
// Corner cases - subset size is greater than index size.
630-
try {
631-
VecSimIndex_PreferAdHocSearch(index, 1, 50, true);
632-
FAIL() << "Expected std::runtime error";
633-
} catch (std::runtime_error const &err) {
634-
EXPECT_EQ(err.what(),
635-
std::string("internal error: subset size cannot be larger than index size"));
636-
}
630+
ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
631+
VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
632+
637633
VecSimIndex_Free(index);
638634
}
639635
TYPED_TEST(HNSWMultiTest, search_empty_index) {

tests/unit/test_hnsw_tiered.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3374,3 +3374,45 @@ TYPED_TEST(HNSWTieredIndexTest, parallelRangeSearch) {
33743374
// Cleanup.
33753375
delete index_ctx;
33763376
}
3377+
3378+
TYPED_TEST(HNSWTieredIndexTestBasic, preferAdHocOptimization) {
3379+
size_t dim = 4;
3380+
3381+
HNSWParams params = {
3382+
.type = TypeParam::get_index_type(),
3383+
.dim = dim,
3384+
.metric = VecSimMetric_L2,
3385+
};
3386+
VecSimParams hnsw_params = CreateParams(params);
3387+
auto jobQ = JobQueue();
3388+
auto index_ctx = new IndexExtCtx();
3389+
size_t memory_ctx = 0;
3390+
3391+
// Create the tiered index. NOTE(review): originally described as "buffer limit set to 0", but no limit is passed in this call - confirm against CreateTieredHNSWIndex's defaults.
3392+
auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, &memory_ctx);
3393+
auto allocator = tiered_index->getAllocator(); // NOTE(review): not referenced again in this test.
3394+
3395+
auto hnsw = tiered_index->backendIndex;
3396+
auto flat = tiered_index->frontendIndex;
3397+
3398+
// Insert 5 vectors (labels 0-4) directly into the backend (HNSW) index.
3399+
for (size_t i = 0; i < 5; i++) {
3400+
GenerateAndAddVector<TEST_DATA_T>(hnsw, dim, i, i);
3401+
}
3402+
// Sanity check: nothing was added to the flat index yet in this test, so the tiered decision should match the HNSW one.
3403+
ASSERT_EQ(tiered_index->preferAdHocSearch(5, 5, true), hnsw->preferAdHocSearch(5, 5, true));
3404+
3405+
// Insert 6 vectors (labels 0-5) directly into the frontend (flat) index.
3406+
for (size_t i = 0; i < 6; i++) {
3407+
GenerateAndAddVector<TEST_DATA_T>(flat, dim, i, i);
3408+
}
3409+
// Sanity check: the flat index now holds more vectors (6 vs. 5), so the tiered decision should match the flat one.
3410+
ASSERT_EQ(tiered_index->preferAdHocSearch(5, 5, true), flat->preferAdHocSearch(5, 5, true));
3411+
3412+
// Query the tiered index with a subset size (10) that is smaller than the combined number of
3413+
// inserted vectors (11) but larger than either underlying index (5 and 6); this must not throw.
3414+
ASSERT_NO_THROW(tiered_index->preferAdHocSearch(10, 5, false));
3415+
3416+
// Cleanup.
3417+
delete index_ctx;
3418+
}

0 commit comments

Comments
 (0)