Skip to content

Commit 3ce2aca

Browse files
authored
Timeout Enabling - Brute Force (#161)
* enable the timeout function and implement its use in the brute force algorithm
* free the result list if it is not null
* added tests
* review changes
* moved compute block scores to the BF algorithm
* added `vec_sim_index.cpp` file
* more review fixes
* comment fix
1 parent 2100fd7 commit 3ce2aca

21 files changed

+276
-83
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010
/1/
1111
**/build/
1212

13+
# Ignore benchmark fetched data but not the source file
14+
/tests/benchmark/data/*
15+
!/tests/benchmark/data/hnsw_indices.txt
16+
1317
# Prerequisites
1418
*.d
1519

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ add_library(VectorSimilarity ${VECSIM_LIBTYPE}
5555
VecSim/algorithms/brute_force/vector_block.cpp
5656
VecSim/algorithms/hnsw/visited_nodes_handler.cpp
5757
VecSim/algorithms/hnsw/hnsw_wrapper.cpp
58+
VecSim/vec_sim_index.cpp
5859
VecSim/vec_sim.cpp
5960
VecSim/query_results.cpp
6061
VecSim/info_iterator.cpp

src/VecSim/algorithms/brute_force/bf_batch_iterator.cpp

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,21 +12,21 @@ unsigned char BF_BatchIterator::next_id = 0;
1212

1313
// heuristics: decide if using heap or select search, based on the ratio between the
1414
// number of remaining results and the index size.
15-
VecSimQueryResult *BF_BatchIterator::searchByHeuristics(size_t n_res,
16-
VecSimQueryResult_Order order) {
15+
VecSimQueryResult_List BF_BatchIterator::searchByHeuristics(size_t n_res,
16+
VecSimQueryResult_Order order) {
1717
if ((this->index->indexSize() - this->getResultsCount()) / 1000 > n_res) {
1818
// Heap based search always returns the results ordered by score
1919
return this->heapBasedSearch(n_res);
2020
}
21-
VecSimQueryResult *res = this->selectBasedSearch(n_res);
21+
VecSimQueryResult_List rl = this->selectBasedSearch(n_res);
2222
if (order == BY_SCORE) {
23-
sort_results_by_score(res);
23+
sort_results_by_score(rl);
2424
}
25-
return res;
25+
return rl;
2626
}
2727

28-
void BF_BatchIterator::swapScores(const unordered_map<size_t, size_t> &TopCandidatesIndices,
29-
size_t res_num) {
28+
void BF_BatchIterator::swapScores(
29+
const vecsim_stl::unordered_map<size_t, size_t> &TopCandidatesIndices, size_t res_num) {
3030
// Create a set of the indices in the scores array for every result that we return.
3131
set<size_t> indices;
3232
for (auto pos : TopCandidatesIndices) {
@@ -55,11 +55,12 @@ void BF_BatchIterator::swapScores(const unordered_map<size_t, size_t> &TopCandid
5555
this->scores_valid_start_pos = next_scores_valid_start_pos;
5656
}
5757

58-
VecSimQueryResult *BF_BatchIterator::heapBasedSearch(size_t n_res) {
58+
VecSimQueryResult_List BF_BatchIterator::heapBasedSearch(size_t n_res) {
59+
VecSimQueryResult_List rl = {0};
5960
float upperBound = std::numeric_limits<float>::lowest();
6061
vecsim_stl::max_priority_queue<pair<float, labelType>> TopCandidates(this->allocator);
6162
// map vector's label to its index in the scores vector.
62-
unordered_map<size_t, size_t> TopCandidatesIndices(n_res);
63+
vecsim_stl::unordered_map<size_t, size_t> TopCandidatesIndices(n_res, this->allocator);
6364
for (size_t i = this->scores_valid_start_pos; i < this->scores.size(); i++) {
6465
if (TopCandidates.size() < n_res) {
6566
TopCandidates.emplace(this->scores[i].first, this->scores[i].second);
@@ -69,28 +70,30 @@ VecSimQueryResult *BF_BatchIterator::heapBasedSearch(size_t n_res) {
6970
if (this->scores[i].first >= upperBound) {
7071
continue;
7172
} else {
72-
TopCandidates.emplace(this->scores[i].first, this->scores[i].second);
73-
TopCandidatesIndices[this->scores[i].second] = i;
7473
// remove the furthest vector from the candidates and from the label->index mappings
74+
// We first remove the worst candidate so we won't exceed the allocated size.
7575
TopCandidatesIndices.erase(TopCandidates.top().second);
7676
TopCandidates.pop();
77+
TopCandidatesIndices[this->scores[i].second] = i;
78+
TopCandidates.emplace(this->scores[i].first, this->scores[i].second);
7779
upperBound = TopCandidates.top().first;
7880
}
7981
}
8082
}
8183

8284
// Save the top results to return.
83-
auto *results = array_new_len<VecSimQueryResult>(TopCandidates.size(), TopCandidates.size());
85+
rl.results = array_new_len<VecSimQueryResult>(TopCandidates.size(), TopCandidates.size());
8486
for (int i = (int)TopCandidates.size() - 1; i >= 0; --i) {
85-
VecSimQueryResult_SetId(results[i], TopCandidates.top().second);
86-
VecSimQueryResult_SetScore(results[i], TopCandidates.top().first);
87+
VecSimQueryResult_SetId(rl.results[i], TopCandidates.top().second);
88+
VecSimQueryResult_SetScore(rl.results[i], TopCandidates.top().first);
8789
TopCandidates.pop();
8890
}
89-
swapScores(TopCandidatesIndices, array_len(results));
90-
return results;
91+
swapScores(TopCandidatesIndices, array_len(rl.results));
92+
return rl;
9193
}
9294

93-
VecSimQueryResult *BF_BatchIterator::selectBasedSearch(size_t n_res) {
95+
VecSimQueryResult_List BF_BatchIterator::selectBasedSearch(size_t n_res) {
96+
VecSimQueryResult_List rl = {0};
9497
size_t remaining_vectors_count = this->scores.size() - this->scores_valid_start_pos;
9598
// Get an iterator to the effective first element in the scores array, which is the first
9699
// element that hasn't been returned in previous iterations.
@@ -105,20 +108,22 @@ VecSimQueryResult *BF_BatchIterator::selectBasedSearch(size_t n_res) {
105108
// will be placed before it, and all the rest will be placed after.
106109
std::nth_element(valid_begin_it, n_th_element_pos, this->scores.end());
107110

108-
auto *results = array_new<VecSimQueryResult>(n_res);
111+
rl.results = array_new<VecSimQueryResult>(n_res);
109112
for (size_t i = this->scores_valid_start_pos; i < this->scores_valid_start_pos + n_res; i++) {
110-
results = array_append(results, VecSimQueryResult{});
111-
VecSimQueryResult_SetId(results[array_len(results) - 1], this->scores[i].second);
112-
VecSimQueryResult_SetScore(results[array_len(results) - 1], this->scores[i].first);
113+
rl.results = array_append(rl.results, VecSimQueryResult{});
114+
VecSimQueryResult_SetId(rl.results[array_len(rl.results) - 1], this->scores[i].second);
115+
VecSimQueryResult_SetScore(rl.results[array_len(rl.results) - 1], this->scores[i].first);
113116
}
114117
// Update the valid results start position after returning the results.
115-
this->scores_valid_start_pos += array_len(results);
116-
return results;
118+
this->scores_valid_start_pos += array_len(rl.results);
119+
return rl;
117120
}
118121

119122
BF_BatchIterator::BF_BatchIterator(void *query_vector, const BruteForceIndex *bf_index,
123+
VecSimQueryParams *queryParams,
120124
std::shared_ptr<VecSimAllocator> allocator)
121-
: VecSimBatchIterator(query_vector, allocator), index(bf_index), scores_valid_start_pos(0) {
125+
: VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr, allocator),
126+
index(bf_index), scores_valid_start_pos(0) {
122127
BF_BatchIterator::next_id++;
123128
}
124129

@@ -131,20 +136,29 @@ VecSimQueryResult_List BF_BatchIterator::getNextResults(size_t n_res,
131136
assert(getResultsCount() == 0);
132137
this->scores.reserve(this->index->indexSize());
133138
vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
139+
VecSimQueryResult_Code rc;
134140
for (auto &block : blocks) {
135141
// compute the scores for the vectors in every block and extend the scores array.
136-
vecsim_stl::vector<std::pair<float, labelType>> block_scores =
137-
block->computeBlockScores(getIndex()->distFunc(), getQueryBlob());
138-
this->scores.insert(this->scores.end(), block_scores.begin(), block_scores.end());
142+
auto block_scores = this->index->computeBlockScores(block, this->getQueryBlob(),
143+
this->getTimeoutCtx(), &rc);
144+
if (VecSim_OK != rc) {
145+
return {NULL, rc};
146+
}
147+
for (size_t i = 0; i < block_scores.size(); i++) {
148+
this->scores.emplace_back(block_scores[i], block->getMember(i)->label);
149+
}
139150
}
140151
}
141-
VecSimQueryResult *results = searchByHeuristics(n_res, order);
152+
if (__builtin_expect(VecSimIndex::timeoutCallback(this->getTimeoutCtx()), 0)) {
153+
return {NULL, VecSim_QueryResult_TimedOut};
154+
}
155+
VecSimQueryResult_List rl = searchByHeuristics(n_res, order);
142156

143-
this->updateResultsCount(array_len(results));
157+
this->updateResultsCount(array_len(rl.results));
144158
if (order == BY_ID) {
145-
sort_results_by_id(results);
159+
sort_results_by_id(rl);
146160
}
147-
return results;
161+
return rl;
148162
}
149163

150164
bool BF_BatchIterator::isDepleted() {

src/VecSim/algorithms/brute_force/bf_batch_iterator.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ class BF_BatchIterator : public VecSimBatchIterator {
1414
// that hasn't been returned already.
1515
static unsigned char next_id; // this holds the next available id to be used by a new instance.
1616

17-
VecSimQueryResult *searchByHeuristics(size_t n_res, VecSimQueryResult_Order order);
18-
VecSimQueryResult *selectBasedSearch(size_t n_res);
19-
VecSimQueryResult *heapBasedSearch(size_t n_res);
20-
void swapScores(const unordered_map<size_t, size_t> &TopCandidatesIndices, size_t res_num);
17+
VecSimQueryResult_List searchByHeuristics(size_t n_res, VecSimQueryResult_Order order);
18+
VecSimQueryResult_List selectBasedSearch(size_t n_res);
19+
VecSimQueryResult_List heapBasedSearch(size_t n_res);
20+
void swapScores(const vecsim_stl::unordered_map<size_t, size_t> &TopCandidatesIndices,
21+
size_t res_num);
2122

2223
public:
2324
BF_BatchIterator(void *query_vector, const BruteForceIndex *index,
24-
std::shared_ptr<VecSimAllocator> allocator);
25+
VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator);
2526

2627
inline const BruteForceIndex *getIndex() const { return index; };
2728

src/VecSim/algorithms/brute_force/brute_force.cpp

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,30 @@ double BruteForceIndex::getDistanceFrom(size_t label, const void *vector_data) {
180180

181181
size_t BruteForceIndex::indexSize() const { return this->count; }
182182

183+
// Compute the score for every vector in the block by using the given distance function.
184+
vecsim_stl::vector<float> BruteForceIndex::computeBlockScores(VectorBlock *block,
185+
const void *queryBlob,
186+
void *timeoutCtx,
187+
VecSimQueryResult_Code *rc) const {
188+
size_t len = block->getLength();
189+
vecsim_stl::vector<float> scores(len, this->allocator);
190+
for (size_t i = 0; i < len; i++) {
191+
if (__builtin_expect(VecSimIndex::timeoutCallback(timeoutCtx), 0)) {
192+
*rc = VecSim_QueryResult_TimedOut;
193+
return scores;
194+
}
195+
scores[i] = this->dist_func(block->getVector(i), queryBlob, &this->dim);
196+
}
197+
*rc = VecSim_QueryResult_OK;
198+
return scores;
199+
}
200+
183201
VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t k,
184202
VecSimQueryParams *queryParams) {
185203

204+
VecSimQueryResult_List rl = {0};
205+
void *timeoutCtx = queryParams ? queryParams->timeoutCtx : NULL;
206+
186207
this->last_mode = STANDARD_KNN;
187208
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
188209
if (this->metric == VecSimMetric_Cosine) {
@@ -196,10 +217,9 @@ VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t
196217
vecsim_stl::max_priority_queue<pair<float, labelType>> TopCandidates(this->allocator);
197218
// For every block, compute its vectors scores and update the Top candidates max heap
198219
for (auto vectorBlock : this->vectorBlocks) {
199-
size_t block_size = vectorBlock->getLength();
200-
vecsim_stl::vector<float> scores(block_size, this->allocator);
201-
for (size_t i = 0; i < block_size; i++) {
202-
scores[i] = this->dist_func(vectorBlock->getVector(i), queryBlob, &this->dim);
220+
auto scores = computeBlockScores(vectorBlock, queryBlob, timeoutCtx, &rl.code);
221+
if (VecSim_OK != rl.code) {
222+
return rl;
203223
}
204224
for (size_t i = 0; i < scores.size(); i++) {
205225
// Always choose the current candidate if we have less than k.
@@ -219,13 +239,14 @@ VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t
219239
}
220240
}
221241
}
222-
auto *results = array_new_len<VecSimQueryResult>(TopCandidates.size(), TopCandidates.size());
242+
rl.results = array_new_len<VecSimQueryResult>(TopCandidates.size(), TopCandidates.size());
223243
for (int i = (int)TopCandidates.size() - 1; i >= 0; --i) {
224-
VecSimQueryResult_SetId(results[i], TopCandidates.top().second);
225-
VecSimQueryResult_SetScore(results[i], TopCandidates.top().first);
244+
VecSimQueryResult_SetId(rl.results[i], TopCandidates.top().second);
245+
VecSimQueryResult_SetScore(rl.results[i], TopCandidates.top().first);
226246
TopCandidates.pop();
227247
}
228-
return results;
248+
rl.code = VecSim_QueryResult_OK;
249+
return rl;
229250
}
230251

231252
VecSimIndexInfo BruteForceIndex::info() const {
@@ -290,7 +311,8 @@ VecSimBatchIterator *BruteForceIndex::newBatchIterator(const void *queryBlob,
290311
float_vector_normalize((float *)queryBlobCopy, dim);
291312
}
292313
// Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end.
293-
return new (this->allocator) BF_BatchIterator(queryBlobCopy, this, this->allocator);
314+
return new (this->allocator)
315+
BF_BatchIterator(queryBlobCopy, this, queryParams, this->allocator);
294316
}
295317

296318
bool BruteForceIndex::preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) {

src/VecSim/algorithms/brute_force/brute_force.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ class BruteForceIndex : public VecSimIndex {
2121
virtual int deleteVector(size_t id) override;
2222
virtual double getDistanceFrom(size_t label, const void *vector_data) override;
2323
virtual size_t indexSize() const override;
24+
vecsim_stl::vector<float> computeBlockScores(VectorBlock *block, const void *queryBlob,
25+
void *timeoutCtx,
26+
VecSimQueryResult_Code *rc) const;
2427
virtual VecSimQueryResult_List topKQuery(const void *queryBlob, size_t k,
2528
VecSimQueryParams *queryParams) override;
2629
virtual VecSimIndexInfo info() const override;

src/VecSim/algorithms/brute_force/vector_block.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,3 @@ void VectorBlock::addVector(VectorBlockMember *vectorBlockMember, const void *ve
3131
memcpy(this->vectors + (this->length * this->dim), vectorData, this->dim * sizeof(float));
3232
this->length++;
3333
}
34-
35-
vecsim_stl::vector<std::pair<float, labelType>>
36-
VectorBlock::computeBlockScores(DISTFUNC<float> DistFunc, const void *queryBlob) {
37-
vecsim_stl::vector<std::pair<float, labelType>> scores(this->length, this->allocator);
38-
for (size_t i = 0; i < this->length; i++) {
39-
scores[i] = {DistFunc(this->getVector(i), queryBlob, &this->dim),
40-
this->getMember(i)->label};
41-
}
42-
return scores;
43-
}

src/VecSim/algorithms/brute_force/vector_block.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,6 @@ struct VectorBlock : public VecsimBaseObject {
4242
member->block = this;
4343
}
4444

45-
// Compute the score for every vector in the block by using the given distance function.
46-
// Return a collection of (score, label) pairs for every vector in the block.
47-
vecsim_stl::vector<std::pair<float, labelType>> computeBlockScores(DISTFUNC<float> DistFunc,
48-
const void *queryBlob);
49-
5045
virtual ~VectorBlock();
5146

5247
private:

src/VecSim/algorithms/hnsw/hnsw_batch_iterator.cpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,24 +16,25 @@ inline bool HNSW_BatchIterator::hasVisitedNode(idType node_id) const {
1616

1717
VecSimQueryResult_List HNSW_BatchIterator::prepareResults(candidatesMaxHeap top_candidates,
1818
size_t n_res) {
19+
VecSimQueryResult_List rl = {0};
1920
// size_t initial_results_num = array_len(batch_results);
2021
// Put the "spare" results (if exist) in the results heap.
2122
while (top_candidates.size() > n_res) {
2223
this->top_candidates_extras.emplace(top_candidates.top().first,
2324
top_candidates.top().second); // (distance, id)
2425
top_candidates.pop();
2526
}
26-
auto *batch_results =
27-
array_new_len<VecSimQueryResult>(top_candidates.size(), top_candidates.size());
27+
rl.results = array_new_len<VecSimQueryResult>(top_candidates.size(), top_candidates.size());
2828
// Return results from the top candidates heap, put them in reverse order in the batch results
2929
// array.
3030
for (int i = (int)(top_candidates.size() - 1); i >= 0; i--) {
3131
labelType label = this->hnsw_index->getExternalLabel(top_candidates.top().second);
32-
VecSimQueryResult_SetId(batch_results[i], label);
33-
VecSimQueryResult_SetScore(batch_results[i], top_candidates.top().first);
32+
VecSimQueryResult_SetId(rl.results[i], label);
33+
VecSimQueryResult_SetScore(rl.results[i], top_candidates.top().first);
3434
top_candidates.pop();
3535
}
36-
return batch_results;
36+
rl.code = VecSim_QueryResult_OK;
37+
return rl;
3738
}
3839

3940
candidatesMaxHeap HNSW_BatchIterator::scanGraph(candidatesMinHeap &candidates,
@@ -131,8 +132,10 @@ candidatesMaxHeap HNSW_BatchIterator::scanGraph(candidatesMinHeap &candidates,
131132
HNSW_BatchIterator::HNSW_BatchIterator(void *query_vector, HNSWIndex *index_wrapper,
132133
VecSimQueryParams *queryParams,
133134
std::shared_ptr<VecSimAllocator> allocator)
134-
: VecSimBatchIterator(query_vector, std::move(allocator)), index_wrapper(index_wrapper),
135-
depleted(false), top_candidates_extras(this->allocator), candidates(this->allocator) {
135+
: VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr,
136+
std::move(allocator)),
137+
index_wrapper(index_wrapper), depleted(false), top_candidates_extras(this->allocator),
138+
candidates(this->allocator) {
136139
this->space = index_wrapper->getSpace();
137140

138141
this->hnsw_index = index_wrapper->getHNSWIndex();
@@ -169,7 +172,7 @@ VecSimQueryResult_List HNSW_BatchIterator::getNextResults(size_t n_res,
169172
// Move the spare results to the "extras" queue if needed, and create the batch results array.
170173
auto batch_results = this->prepareResults(top_candidates, n_res);
171174

172-
this->updateResultsCount(array_len(batch_results));
175+
this->updateResultsCount(VecSimQueryResult_Len(batch_results));
173176
if (this->getResultsCount() == this->index_wrapper->indexSize()) {
174177
this->depleted = true;
175178
}

src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ void HNSWIndex::setEf(size_t ef) { this->hnsw->setEf(ef); }
9494

9595
VecSimQueryResult_List HNSWIndex::topKQuery(const void *query_data, size_t k,
9696
VecSimQueryParams *queryParams) {
97+
VecSimQueryResult_List rl = {0};
9798
try {
9899
this->last_mode = STANDARD_KNN;
99100
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
@@ -112,20 +113,21 @@ VecSimQueryResult_List HNSWIndex::topKQuery(const void *query_data, size_t k,
112113
}
113114
}
114115
auto knn_res = hnsw->searchKnn(query_data, k);
115-
auto *results = array_new_len<VecSimQueryResult>(knn_res.size(), knn_res.size());
116+
rl.results = array_new_len<VecSimQueryResult>(knn_res.size(), knn_res.size());
116117
for (int i = (int)knn_res.size() - 1; i >= 0; --i) {
117-
VecSimQueryResult_SetId(results[i], knn_res.top().second);
118-
VecSimQueryResult_SetScore(results[i], knn_res.top().first);
118+
VecSimQueryResult_SetId(rl.results[i], knn_res.top().second);
119+
VecSimQueryResult_SetScore(rl.results[i], knn_res.top().first);
119120
knn_res.pop();
120121
}
121122
// Restore efRuntime
122123
hnsw->setEf(originalEF);
123124
assert(hnsw->getEf() == originalEF);
124125

125-
return results;
126+
rl.code = VecSim_QueryResult_OK;
126127
} catch (...) {
127-
return NULL;
128+
rl.code = VecSim_QueryResult_Err;
128129
}
130+
return rl;
129131
}
130132

131133
VecSimIndexInfo HNSWIndex::info() const {

0 commit comments

Comments
 (0)