Skip to content

Commit 7bbe870

Browse files
alonre24 and DvirDukhan
authored
Cp 0.3.1 (#172)
* CP "HNSW- account for memory allocation of incoming edges set creation (#169)" * CP Hnsw reclaim memory (#168) * CP version h (#171) * CP "HNSW delete improvements (#170)" * version bump Co-authored-by: DvirDukhan <[email protected]>
1 parent 3d3bbe4 commit 7bbe870

19 files changed

+778
-380
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ ifeq ($(wildcard $(ROOT)/deps/readies/mk),)
6161
$(shell mkdir -p deps; cd deps; git clone https://github.com/RedisLabsModules/readies.git)
6262
endif
6363
include $(ROOT)/deps/readies/mk/main
64+
export ROOT
6465

6566
#----------------------------------------------------------------------------------------------
6667

src/VecSim/algorithms/brute_force/brute_force.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,12 @@ BruteForceIndex::~BruteForceIndex() {
3939
/******************** Implementation **************/
4040
size_t BruteForceIndex::estimateInitialSize(const BFParams *params) {
4141
// Constant part (not affected by parameters).
42-
size_t est = sizeof(BruteForceIndex);
43-
est += sizeof(*allocator);
44-
est += sizeof(*space);
42+
size_t est = sizeof(VecSimAllocator) + sizeof(BruteForceIndex) + sizeof(size_t);
43+
est += (params->metric == VecSimMetric_L2 ? sizeof(L2Space) : sizeof(InnerProductSpace)) +
44+
sizeof(size_t);
4545
// Parameters related part.
46-
est += params->initialCapacity * sizeof(decltype(idToVectorBlockMemberMapping)::value_type);
46+
est += params->initialCapacity * sizeof(decltype(idToVectorBlockMemberMapping)::value_type) +
47+
sizeof(size_t);
4748

4849
return est;
4950
}

src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,35 +32,63 @@ HNSWIndex::HNSWIndex(const HNSWParams *params, std::shared_ptr<VecSimAllocator>
3232

3333
/******************** Implementation **************/
3434
size_t HNSWIndex::estimateInitialSize(const HNSWParams *params) {
35-
size_t est = sizeof(HNSWIndex);
36-
est += sizeof(*space);
37-
est += sizeof(*hnsw);
38-
est += sizeof(VisitedNodesHandler);
35+
size_t est = sizeof(VecSimAllocator) + sizeof(HNSWIndex) + sizeof(size_t);
36+
est += (params->metric == VecSimMetric_L2 ? sizeof(L2Space) : sizeof(InnerProductSpace)) +
37+
sizeof(size_t);
38+
est += sizeof(*hnsw) + sizeof(size_t);
39+
est += sizeof(VisitedNodesHandler) + sizeof(size_t);
3940
// used for synchronization only when parallel indexing / searching is enabled.
4041
#ifdef ENABLE_PARALLELIZATION
4142
est += sizeof(VisitedNodesHandlerPool);
4243
#endif
43-
est += params->initialCapacity * sizeof(tag_t);
44+
est += sizeof(tag_t) * params->initialCapacity + sizeof(size_t); // visited nodes
4445

45-
est += sizeof(void *) * params->initialCapacity; // link lists
46-
est += sizeof(size_t) * params->initialCapacity; // element level
46+
est += sizeof(void *) * params->initialCapacity + sizeof(size_t); // link lists (for levels > 0)
47+
est += sizeof(size_t) * params->initialCapacity + sizeof(size_t); // element level
48+
est += sizeof(size_t) * params->initialCapacity +
49+
sizeof(size_t); // labels lookup hash table buckets
4750

4851
size_t size_links_level0 =
4952
sizeof(linklistsizeint) + params->M * 2 * sizeof(tableint) + sizeof(void *);
50-
size_t size_data_per_element =
53+
size_t size_total_data_per_element =
5154
size_links_level0 + params->dim * sizeof(float) + sizeof(labeltype);
52-
est += params->initialCapacity * size_data_per_element;
55+
est += params->initialCapacity * size_total_data_per_element + sizeof(size_t);
5356

5457
return est;
5558
}
5659

5760
size_t HNSWIndex::estimateElementMemory(const HNSWParams *params) {
58-
size_t size_links_level0 =
59-
sizeof(linklistsizeint) + params->M * 2 * sizeof(tableint) + sizeof(void *);
60-
size_t size_data_per_element =
61-
size_links_level0 + params->dim * sizeof(float) + sizeof(labeltype);
61+
size_t size_links_level0 = sizeof(linklistsizeint) + params->M * 2 * sizeof(tableint) +
62+
sizeof(void *) + sizeof(vecsim_stl::vector<tableint>);
63+
size_t size_links_higher_level = sizeof(linklistsizeint) + params->M * sizeof(tableint) +
64+
sizeof(void *) + sizeof(vecsim_stl::vector<tableint>);
65+
// The expected value of the random variable that is the number of levels per element equals
66+
// 1/ln(M). Since the max_level is rounded to the "floor" integer, the actual average number
67+
// of levels is lower (intuitively, we "lose" a level every time the randomly generated number
68+
// should have been rounded up to the larger integer). So, we "fix" the expectancy and take
69+
// 1/(2*ln(M)) instead as an approximation.
70+
size_t expected_size_links_higher_levels =
71+
ceil((1 / (2 * log(params->M))) * (float)size_links_higher_level);
72+
73+
size_t size_total_data_per_element = size_links_level0 + expected_size_links_higher_levels +
74+
params->dim * sizeof(float) + sizeof(labeltype);
6275

63-
return size_data_per_element + sizeof(tag_t) + sizeof(size_t) + sizeof(void *);
76+
// For every new vector, a new node of size 24 is allocated in a bucket of the hash table.
77+
size_t size_label_lookup_node =
78+
24 + sizeof(size_t); // 24 + VecSimAllocator::allocation_header_size
79+
// 1 entry in visited nodes + 1 entry in element levels + (approximately) 1 bucket in labels
80+
// lookup hash map.
81+
size_t size_meta_data =
82+
sizeof(tag_t) + sizeof(size_t) + sizeof(size_t) + size_label_lookup_node;
83+
84+
/* Disclaimer: we are neglecting two additional factors that consume memory:
85+
* 1. The overall bucket size in labels_lookup hash table is usually higher than the number of
86+
* requested buckets (which is the index capacity), and it is auto selected according to the
87+
* hashing policy and the max load factor.
88+
* 2. The incoming edges that aren't bidirectional are stored in a dynamic array
89+
* (vecsim_stl::vector). Those edges' memory *is omitted completely* from this estimation.
90+
*/
91+
return size_meta_data + size_total_data_per_element;
6492
}
6593

6694
int HNSWIndex::addVector(const void *vector_data, size_t id) {
@@ -82,7 +110,13 @@ int HNSWIndex::addVector(const void *vector_data, size_t id) {
82110
}
83111
}
84112

85-
int HNSWIndex::deleteVector(size_t id) { return this->hnsw->removePoint(id); }
113+
int HNSWIndex::deleteVector(size_t id) {
114+
bool res = this->hnsw->removePoint(id);
115+
if (hnsw->getIndexSize() + this->blockSize <= this->hnsw->getIndexCapacity()) {
116+
this->hnsw->resizeIndex(this->hnsw->getIndexCapacity() - this->blockSize);
117+
}
118+
return res;
119+
}
86120

87121
VecSimResolveCode HNSWIndex::resolveParams(VecSimRawParam *rparams, int paramNum,
88122
VecSimQueryParams *qparams) {

0 commit comments

Comments
 (0)