Skip to content

Commit 2e0098b

Browse files
authored
let the capacity of idToLabel in flat index be tight to the size. This requires changing a bit the logic of updating jobs id after removing a vector from flat buffer and swap ids. (#367)
1 parent 3a49c70 commit 2e0098b

File tree

7 files changed

+56
-34
lines changed

7 files changed

+56
-34
lines changed

src/VecSim/algorithms/brute_force/brute_force.h

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,14 @@ class BruteForceIndex : public VecSimIndexAbstract<DistType> {
5757

5858
inline vecsim_stl::vector<VectorBlock *> getVectorBlocks() const { return vectorBlocks; }
5959
inline const labelType getLabelByInternalId(idType internal_id) const {
60-
return idToLabelMapping[internal_id];
60+
return idToLabelMapping.at(internal_id);
6161
}
6262
// Remove a specific vector that is stored under a label from the index by its internal id.
6363
virtual int deleteVectorById(labelType label, idType id) = 0;
64-
// Remove a vector and return a map between internal ids and the original internal ids that they
65-
// hold AFTER the removals and swaps.
66-
virtual std::unordered_map<idType, idType> deleteVectorAndGetUpdatedIds(labelType label) = 0;
64+
// Remove a vector and return a map between internal ids and the original internal ids of the
65+
// vector that they hold as a result of the overall removals and swaps, along with its label.
66+
virtual std::unordered_map<idType, std::pair<idType, labelType>>
67+
deleteVectorAndGetUpdatedIds(labelType label) = 0;
6768
// Check if a certain label exists in the index.
6869
virtual inline bool isLabelExists(labelType label) = 0;
6970
// Return a set of all labels that are stored in the index (helper for computing label count
@@ -160,6 +161,7 @@ void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data,
160161
size_t last_block_vectors_count = id % this->blockSize;
161162
this->idToLabelMapping.resize(
162163
idToLabelMapping_size + this->blockSize - last_block_vectors_count, 0);
164+
this->idToLabelMapping.shrink_to_fit();
163165
}
164166

165167
// add label to idToLabelMapping
@@ -210,10 +212,11 @@ void BruteForceIndex<DataType, DistType>::removeVector(idType id_to_delete) {
210212
// Resize and align the idToLabelMapping.
211213
size_t idToLabel_size = idToLabelMapping.size();
212214
// If the new size is smaller by at least one block comparing to the idToLabelMapping
213-
// align to be a multiplication of blocksize and resize by one block.
215+
// align to be a multiplication of block size and resize by one block.
214216
if (this->count + this->blockSize <= idToLabel_size) {
215217
size_t vector_to_align_count = idToLabel_size % this->blockSize;
216218
this->idToLabelMapping.resize(idToLabel_size - this->blockSize - vector_to_align_count);
219+
this->idToLabelMapping.shrink_to_fit();
217220
}
218221
}
219222
}

src/VecSim/algorithms/brute_force/brute_force_multi.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
3434
return std::unique_ptr<vecsim_stl::abstract_results_container>(
3535
new (this->allocator) vecsim_stl::unique_results_container(cap, this->allocator));
3636
}
37-
std::unordered_map<idType, idType> deleteVectorAndGetUpdatedIds(labelType label) override;
37+
std::unordered_map<idType, std::pair<idType, labelType>>
38+
deleteVectorAndGetUpdatedIds(labelType label) override;
3839
#ifdef BUILD_TESTS
3940
void getDataByLabel(labelType label,
4041
std::vector<std::vector<DataType>> &vectors_output) const override {
@@ -117,15 +118,15 @@ int BruteForceIndex_Multi<DataType, DistType>::deleteVector(labelType label) {
117118
}
118119

119120
template <typename DataType, typename DistType>
120-
std::unordered_map<idType, idType>
121+
std::unordered_map<idType, std::pair<idType, labelType>>
121122
BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelType label) {
122123
// Hold a mapping from ids that are removed and changed to the original ids that were swapped
123124
// into it. For example, if we have ids 0, 1, 2, 3, 4 and are about to remove ids 1, 3, 4, we
124125
// should get the following scenario: {1->4} => {1->4} => {1->2}.
125126
// Explanation: first we delete 1 and swap it with 4. Then, we remove 3 and have no swap since 3
126127
// is the last id. Lastly, we delete the original 4 which is now in id 1, and swap it with 2.
127128
// Eventually, in id 1 we should have the original vector whose id was 2.
128-
std::unordered_map<idType, idType> updated_ids;
129+
std::unordered_map<idType, std::pair<idType, labelType>> updated_ids;
129130

130131
// Find the id to delete.
131132
auto deleted_label_ids_pair = this->labelToIdsLookup.find(label);
@@ -140,6 +141,7 @@ BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelTyp
140141
// The removal take into consideration the current internal id to remove, even if it is not
141142
// the original id, and it has swapped into this id after previous swap of another id that
142143
// belongs to this label.
144+
labelType last_id_label = this->idToLabelMapping[this->count - 1];
143145
this->removeVector(cur_id_to_delete);
144146
// If cur_id_to_delete exists in the map, remove it as it is no longer valid, whether it
145147
// will get a new value due to a swap, or it is the last element in the index.
@@ -151,7 +153,7 @@ BruteForceIndex_Multi<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelTyp
151153
updated_ids.erase(this->count);
152154
} else {
153155
// Otherwise, the last id now resides where the deleted id was.
154-
updated_ids[cur_id_to_delete] = this->count;
156+
updated_ids[cur_id_to_delete] = {this->count, last_id_label};
155157
}
156158
}
157159
}

src/VecSim/algorithms/brute_force/brute_force_single.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
3333
}
3434

3535
inline size_t indexLabelCount() const override { return this->count; }
36-
std::unordered_map<idType, idType> deleteVectorAndGetUpdatedIds(labelType label) override;
36+
std::unordered_map<idType, std::pair<idType, labelType>>
37+
deleteVectorAndGetUpdatedIds(labelType label) override;
3738

3839
// We call this when we KNOW that the label exists in the index.
3940
idType getIdOfLabel(labelType label) const { return labelToIdLookup.find(label)->second; }
@@ -149,10 +150,10 @@ int BruteForceIndex_Single<DataType, DistType>::deleteVector(labelType label) {
149150
}
150151

151152
template <typename DataType, typename DistType>
152-
std::unordered_map<idType, idType>
153+
std::unordered_map<idType, std::pair<idType, labelType>>
153154
BruteForceIndex_Single<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelType label) {
154155

155-
std::unordered_map<idType, idType> updated_ids;
156+
std::unordered_map<idType, std::pair<idType, labelType>> updated_ids;
156157
// Find the id to delete.
157158
auto deleted_label_id_pair = this->labelToIdLookup.find(label);
158159
if (deleted_label_id_pair == this->labelToIdLookup.end()) {
@@ -165,10 +166,10 @@ BruteForceIndex_Single<DataType, DistType>::deleteVectorAndGetUpdatedIds(labelTy
165166

166167
// Remove the pair of the deleted vector.
167168
labelToIdLookup.erase(label);
168-
169-
this->removeVector(id_to_delete);
169+
labelType last_id_label = this->idToLabelMapping[this->count - 1];
170+
this->removeVector(id_to_delete); // this will decrease this->count and make the swap
170171
if (id_to_delete != this->count) {
171-
updated_ids[id_to_delete] = this->count;
172+
updated_ids[id_to_delete] = {this->count, last_id_label};
172173
}
173174
return updated_ids;
174175
}

src/VecSim/algorithms/hnsw/hnsw_tiered.h

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
100100
// label-to-insert-jobs lookup. Also, since deletion a vector triggers swapping of the
101101
// internal last id with the deleted vector id, here we update the pending insert job(s) for the
102102
// last id (if needed). This should be called while *flat lock is held* (exclusive lock).
103-
void updateInsertJobInternalId(idType prev_id, idType new_id);
103+
void updateInsertJobInternalId(idType prev_id, idType new_id, labelType label);
104104

105105
// Helper function for performing in place mark delete of vector(s) associated with a label
106106
// and creating the appropriate repair jobs for the effected connections. This should be called
@@ -356,11 +356,11 @@ int TieredHNSWIndex<DataType, DistType>::deleteLabelFromHNSW(labelType label) {
356356
}
357357

358358
template <typename DataType, typename DistType>
359-
void TieredHNSWIndex<DataType, DistType>::updateInsertJobInternalId(idType prev_id, idType new_id) {
359+
void TieredHNSWIndex<DataType, DistType>::updateInsertJobInternalId(idType prev_id, idType new_id,
360+
labelType label) {
360361
// Update the pending job id, due to a swap that was caused after the removal of new_id.
361362
assert(new_id != INVALID_ID && prev_id != INVALID_ID);
362-
labelType last_idx_label = this->frontendIndex->getLabelByInternalId(prev_id);
363-
auto it = this->labelToInsertJobs.find(last_idx_label);
363+
auto it = this->labelToInsertJobs.find(label);
364364
if (it != this->labelToInsertJobs.end()) {
365365
// There is a pending job for the label of the swapped last id - update its id.
366366
for (HNSWInsertJob *job_it : it->second) {
@@ -471,11 +471,17 @@ void TieredHNSWIndex<DataType, DistType>::executeInsertJob(HNSWInsertJob *job) {
471471
if (labelToInsertJobs.at(job->label).empty()) {
472472
labelToInsertJobs.erase(job->label);
473473
}
474-
// Remove the vector from the flat buffer.
474+
// Remove the vector from the flat buffer. This may cause the last vector id to swap with
475+
// the deleted id. Hold the label for the last id, so we can later on update its
476+
// corresponding job id. Note that after calling deleteVectorById, the last id's label
477+
// shouldn't be available, since it is removed from the lookup.
478+
labelType last_vec_label =
479+
this->frontendIndex->getLabelByInternalId(this->frontendIndex->indexSize() - 1);
475480
int deleted = this->frontendIndex->deleteVectorById(job->label, job->id);
476481
if (deleted && job->id != this->frontendIndex->indexSize()) {
477482
// If the vector removal caused a swap with the last id, update the relevant insert job.
478-
this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id);
483+
this->updateInsertJobInternalId(this->frontendIndex->indexSize(), job->id,
484+
last_vec_label);
479485
}
480486
}
481487
this->flatIndexGuard.unlock();
@@ -700,7 +706,9 @@ int TieredHNSWIndex<DataType, DistType>::deleteVector(labelType label) {
700706
// an example in this function implementation in MULTI index).
701707
auto updated_ids = this->frontendIndex->deleteVectorAndGetUpdatedIds(label);
702708
for (auto &it : updated_ids) {
703-
this->updateInsertJobInternalId(it.second, it.first);
709+
idType prev_id = it.second.first;
710+
labelType updated_vec_label = it.second.second;
711+
this->updateInsertJobInternalId(prev_id, it.first, updated_vec_label);
704712
}
705713
}
706714
this->flatIndexGuard.unlock();

tests/unit/test_allocator.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,8 @@ TYPED_TEST(IndexAllocatorTest, test_bf_index_block_size_1) {
164164
// collection allocate additional structures for their internal implementation.
165165
ASSERT_EQ(allocator->getAllocationSize(),
166166
expectedAllocationSize + deleteCommandAllocationDelta);
167-
ASSERT_LE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
168-
ASSERT_LE(expectedAllocationDelta, deleteCommandAllocationDelta);
167+
ASSERT_GE(expectedAllocationSize + expectedAllocationDelta, allocator->getAllocationSize());
168+
ASSERT_GE(expectedAllocationDelta, deleteCommandAllocationDelta);
169169

170170
info = bfIndex->info();
171171
ASSERT_EQ(allocator->getAllocationSize(), info.bfInfo.memory);

tests/unit/test_bruteforce.cpp

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_update_test) {
108108

109109
TYPED_TEST(BruteForceTest, resize_and_align_index) {
110110
size_t dim = 4;
111-
size_t n = 15;
111+
size_t n = 14;
112112
size_t blockSize = 10;
113113

114114
BFParams params = {
@@ -134,24 +134,26 @@ TYPED_TEST(BruteForceTest, resize_and_align_index) {
134134

135135
// Add another vector, since index size equals to the capacity, this should cause resizing
136136
// (to fit a multiplication of block_size).
137-
GenerateAndAddVector<TEST_DATA_T>(index, dim, n + 1);
137+
GenerateAndAddVector<TEST_DATA_T>(index, dim, n);
138138
ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1);
139139
// Check new capacity size, should be blockSize * 2.
140140
ASSERT_EQ(bf_index->idToLabelMapping.size(), 2 * blockSize);
141+
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 2 * blockSize);
141142

142-
// Now size = n + 1 = 16, capacity = 2* bs = 20. Test capacity overflow again
143-
// to check that it stays aligned with blocksize.
143+
// Now size = n + 1 (= 15), capacity = 2 * bs (= 20). Test capacity overflow again
144+
// to check that it stays aligned with block size.
144145

145146
size_t add_vectors_count = 8;
146147
for (size_t i = 0; i < add_vectors_count; i++) {
147148
GenerateAndAddVector<TEST_DATA_T>(index, dim, n + 2 + i, i);
148149
}
149150

150-
// Size should be n + 1 + 8 = 24.
151+
// Size should be n + 1 + 8 (= 25).
151152
ASSERT_EQ(VecSimIndex_IndexSize(index), n + 1 + add_vectors_count);
152153

153154
// Check new capacity size, should be blockSize * 3.
154155
ASSERT_EQ(bf_index->idToLabelMapping.size(), 3 * blockSize);
156+
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 3 * blockSize);
155157

156158
VecSimIndex_Free(index);
157159
}
@@ -170,7 +172,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
170172
BruteForceIndex<TEST_DATA_T, TEST_DIST_T> *bf_index = this->CastToBF(index);
171173
ASSERT_EQ(VecSimIndex_IndexSize(index), 0);
172174

173-
// add up to blocksize + 1 = 3 + 1 = 4
175+
// Add up to block size + 1 = 3 + 1 = 4
174176
for (size_t i = 0; i < bs + 1; i++) {
175177
GenerateAndAddVector<TEST_DATA_T>(index, dim, i, i);
176178
}
@@ -190,6 +192,7 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
190192
// 10 - 3 - 10 % 3 (1) = 6
191193
idToLabelMapping_size = bf_index->idToLabelMapping.size();
192194
ASSERT_EQ(idToLabelMapping_size, n - bs - n % bs);
195+
ASSERT_EQ(idToLabelMapping_size, bf_index->idToLabelMapping.capacity());
193196

194197
// Delete all the vectors to decrease idToLabelMapping size by another bs.
195198
size_t i = 0;
@@ -198,19 +201,24 @@ TYPED_TEST(BruteForceTest, resize_and_align_index_largeInitialCapacity) {
198201
++i;
199202
}
200203
ASSERT_EQ(bf_index->idToLabelMapping.size(), bs);
204+
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs);
205+
201206
// Add and delete a vector to achieve:
202207
// size % block_size == 0 && size + bs <= idToLabelMapping_size(3).
203208
// idToLabelMapping_size should be resized to zero.
204209
GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
205210
VecSimIndex_DeleteVector(index, 0);
206211
ASSERT_EQ(bf_index->idToLabelMapping.size(), 0);
212+
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0);
207213

208214
// Do it again. This time after adding a vector idToLabelMapping_size is increased by bs.
209215
// Upon deletion it will be resized to zero again.
210216
GenerateAndAddVector<TEST_DATA_T>(index, dim, 0);
211217
ASSERT_EQ(bf_index->idToLabelMapping.size(), bs);
218+
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), bs);
212219
VecSimIndex_DeleteVector(index, 0);
213220
ASSERT_EQ(bf_index->idToLabelMapping.size(), 0);
221+
ASSERT_EQ(bf_index->idToLabelMapping.capacity(), 0);
214222

215223
VecSimIndex_Free(index);
216224
}

tests/unit/test_hnsw_tiered.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,9 +1601,9 @@ TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) {
16011601
ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1);
16021602
auto updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second);
16031603
ASSERT_EQ(updated_ids.size(), 1);
1604-
ASSERT_EQ(updated_ids.at(1), 2);
1604+
ASSERT_EQ(updated_ids.at(1).first, 2);
16051605
for (auto &it : updated_ids) {
1606-
tiered_index->updateInsertJobInternalId(it.second, it.first);
1606+
tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second);
16071607
}
16081608
ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
16091609
ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);
@@ -1633,9 +1633,9 @@ TYPED_TEST(HNSWTieredIndexTestBasic, deleteVectorMultiFromFlatAdvanced) {
16331633
ASSERT_EQ(tiered_index->labelToInsertJobs.erase(vec_label_second), 1);
16341634
updated_ids = tiered_index->frontendIndex->deleteVectorAndGetUpdatedIds(vec_label_second);
16351635
ASSERT_EQ(updated_ids.size(), 1);
1636-
ASSERT_EQ(updated_ids.at(1), 3);
1636+
ASSERT_EQ(updated_ids.at(1).first, 3);
16371637
for (auto &it : updated_ids) {
1638-
tiered_index->updateInsertJobInternalId(it.second, it.first);
1638+
tiered_index->updateInsertJobInternalId(it.second.first, it.first, it.second.second);
16391639
}
16401640
ASSERT_EQ(tiered_index->labelToInsertJobs.size(), 1);
16411641
ASSERT_EQ(tiered_index->labelToInsertJobs.at(vec_label_first).size(), 2);

0 commit comments

Comments
 (0)