Skip to content

Commit e134c63

Browse files
authored
CP tiered index 0.7 (#383)
1 parent 75f2c93 commit e134c63

File tree

83 files changed

+11113
-2056
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+11113
-2056
lines changed

.github/wordlist.txt

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,51 @@
1+
AVX
2+
BatchIterator
3+
DQ
4+
Datatypes
5+
FP
16
HDF
27
HNSW
8+
KNN
9+
RediSearch
10+
RedisAI
11+
SIMD
312
TBD
13+
TopK
414
VSCode
15+
VecSimBasics
16+
VecSimGeneral
17+
VecSimUpdatedIndex
518
VectorSimilarity
619
ZSH
20+
allocators
21+
ann
722
benchmarked
823
benchmarking
9-
byndings
24+
bm
1025
cmake
26+
cpp
27+
dataset
28+
datasets
29+
destructor
1130
devcontainer
1231
dir
32+
enum
33+
fp
34+
frac
1335
gcc
1436
github
1537
gnist
1638
hnsw
39+
hnswlib
1740
mnist
1841
neighbor
1942
pre
2043
py
2144
repo
45+
runtime
46+
templated
2247
tox
2348
valgrind
49+
vecsim
2450
virtualenv
2551
whl
26-
datasets
27-
runtime
28-
RedisAI
29-
dataset

.github/workflows/arm.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
2525
# Ubuntu 22.04 region AMI for ARM
2626
ec2-image-id: ami-062b37d89f25c958f
27-
ec2-instance-type: t4g.small
27+
ec2-instance-type: t4g.medium
2828
subnet-id: ${{ secrets.AWS_EC2_SUBNET_ID }}
2929
security-group-id: ${{ secrets.AWS_EC2_SG_ID }}
3030

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ make clean # remove binary files
7171
make unit_test # run unit tests
7272
CTEST_ARGS=args # extra CTest arguments
7373
VG|VALGRIND=1 # run tests with valgrind
74+
FP_64=1 # run tests with 64-bit floating point
7475
make valgrind # build for Valgrind and run tests
7576
make flow_test # run flow tests (with pytest)
7677
TEST=file::name # run specific test
@@ -124,6 +125,11 @@ ifeq ($(VERBOSE),1)
124125
CMAKE_FLAGS += -DCMAKE_VERBOSE_MAKEFILE=on
125126
endif
126127

128+
# CMake flags for fp64 unit tests
129+
ifeq ($(FP_64),1)
130+
CMAKE_FLAGS += -DFP64_TESTS=on
131+
endif
132+
127133
CMAKE_FLAGS += \
128134
-Wno-deprecated \
129135
-DCMAKE_WARN_DEPRECATED=OFF \

setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,5 @@ def build_extension(self, ext):
9292
description="Python library around collection of vector similarity algorithm",
9393
long_description="",
9494
ext_modules=[CMakeExtension("VecSim", "src/python_bindings")],
95-
py_modules=['src/python_bindings/Mybytearray'],
9695
cmdclass={"build_ext": CMakeBuild}
9796
)

src/VecSim/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ add_subdirectory(spaces)
1515
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall")
1616

1717
add_library(VectorSimilarity ${VECSIM_LIBTYPE}
18-
algorithms/brute_force/brute_force_factory.cpp
19-
algorithms/hnsw/hnsw_factory.cpp
18+
index_factories/brute_force_factory.cpp
19+
index_factories/hnsw_factory.cpp
20+
index_factories/tiered_factory.cpp
21+
index_factories/index_factory.cpp
2022
algorithms/brute_force/vector_block.cpp
2123
algorithms/hnsw/visited_nodes_handler.cpp
2224
vec_sim.cpp

src/VecSim/algorithms/brute_force/bf_batch_iterator.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ template <typename DataType, typename DistType>
2323
class BF_BatchIterator : public VecSimBatchIterator {
2424
protected:
2525
const BruteForceIndex<DataType, DistType> *index;
26+
size_t index_label_count; // number of labels in the index when calculating the scores,
27+
// which is the only time we access the index.
2628
vecsim_stl::vector<pair<DistType, labelType>> scores; // vector of scores for every label.
2729
size_t scores_valid_start_pos; // the first index in the scores vector that contains a vector
2830
// that hasn't been returned already.
@@ -56,13 +58,15 @@ template <typename DataType, typename DistType>
5658
VecSimQueryResult_List
5759
BF_BatchIterator<DataType, DistType>::searchByHeuristics(size_t n_res,
5860
VecSimQueryResult_Order order) {
59-
if ((this->index->indexLabelCount() - this->getResultsCount()) / 1000 > n_res) {
61+
if ((this->index_label_count - this->getResultsCount()) / 1000 > n_res) {
6062
// Heap based search always returns the results ordered by score
6163
return this->heapBasedSearch(n_res);
6264
}
6365
VecSimQueryResult_List rl = this->selectBasedSearch(n_res);
6466
if (order == BY_SCORE) {
6567
sort_results_by_score(rl);
68+
} else if (order == BY_SCORE_THEN_ID) {
69+
sort_results_by_score_then_id(rl);
6670
}
6771
return rl;
6872
}
@@ -167,17 +171,17 @@ BF_BatchIterator<DataType, DistType>::BF_BatchIterator(
167171
void *query_vector, const BruteForceIndex<DataType, DistType> *bf_index,
168172
VecSimQueryParams *queryParams, std::shared_ptr<VecSimAllocator> allocator)
169173
: VecSimBatchIterator(query_vector, queryParams ? queryParams->timeoutCtx : nullptr, allocator),
170-
index(bf_index), scores(allocator), scores_valid_start_pos(0) {}
174+
index(bf_index), index_label_count(index->indexLabelCount()), scores(allocator),
175+
scores_valid_start_pos(0) {}
171176

172177
template <typename DataType, typename DistType>
173178
VecSimQueryResult_List
174179
BF_BatchIterator<DataType, DistType>::getNextResults(size_t n_res, VecSimQueryResult_Order order) {
175-
assert((order == BY_ID || order == BY_SCORE) &&
176-
"Possible order values are only 'BY_ID' or 'BY_SCORE'");
177180
// Only in the first iteration we need to compute all the scores
178181
if (this->scores.empty()) {
179182
assert(getResultsCount() == 0);
180183

184+
// The only time we access the index. This function also updates the iterator's label count.
181185
auto rc = calculateScores();
182186

183187
if (VecSim_OK != rc) {
@@ -198,8 +202,8 @@ BF_BatchIterator<DataType, DistType>::getNextResults(size_t n_res, VecSimQueryRe
198202

199203
template <typename DataType, typename DistType>
200204
bool BF_BatchIterator<DataType, DistType>::isDepleted() {
201-
assert(this->getResultsCount() <= this->index->indexLabelCount());
202-
bool depleted = this->getResultsCount() == this->index->indexLabelCount();
205+
assert(this->getResultsCount() <= this->index_label_count);
206+
bool depleted = this->getResultsCount() == this->index_label_count;
203207
return depleted;
204208
}
205209

src/VecSim/algorithms/brute_force/bfm_batch_iterator.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ class BFM_BatchIterator : public BF_BatchIterator<DataType, DistType> {
2020

2121
private:
2222
inline VecSimQueryResult_Code calculateScores() override {
23-
24-
this->scores.reserve(this->index->indexLabelCount());
25-
vecsim_stl::unordered_map<labelType, DistType> tmp_scores(this->index->indexLabelCount(),
23+
this->index_label_count = this->index->indexLabelCount();
24+
this->scores.reserve(this->index_label_count);
25+
vecsim_stl::unordered_map<labelType, DistType> tmp_scores(this->index_label_count,
2626
this->allocator);
2727
vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
2828
VecSimQueryResult_Code rc;

src/VecSim/algorithms/brute_force/bfs_batch_iterator.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ class BFS_BatchIterator : public BF_BatchIterator<DataType, DistType> {
2020

2121
private:
2222
inline VecSimQueryResult_Code calculateScores() override {
23-
24-
this->scores.reserve(this->index->indexLabelCount());
23+
this->index_label_count = this->index->indexLabelCount();
24+
this->scores.reserve(this->index_label_count);
2525
vecsim_stl::vector<VectorBlock *> blocks = this->index->getVectorBlocks();
2626
VecSimQueryResult_Code rc;
2727

0 commit comments

Comments
 (0)