-
Notifications
You must be signed in to change notification settings - Fork 14
Hybrid Query #55
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: attribute_filter
Are you sure you want to change the base?
Hybrid Query #55
Changes from all commits
3a46c22
2063b5d
e1aa5ac
c0aece3
147dbe1
29e4614
fbedcac
b8012db
8281295
3db5f45
35dfe11
318bb8c
3bc3e5e
bf953a2
e70db43
1d8a5b5
f7eb486
f12d21e
ac50b7f
1288863
711e414
d530d46
332ea7a
68706a6
f6305c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1,4 @@ | ||
| __pycache__ | ||
| __pycache__ | ||
| .vscode | ||
| build/ | ||
| quake.egg-info/ |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ dependencies: | |
| - faiss-cpu | ||
| - matplotlib | ||
| - pytest | ||
| - libarrow-all=19.0.1 | ||
| - pip | ||
| - pip: | ||
| - sphinx | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ class IndexPartition { | |
|
|
||
| uint8_t* codes_ = nullptr; ///< Pointer to the encoded vectors (raw memory block) | ||
| idx_t* ids_ = nullptr; ///< Pointer to the vector IDs | ||
| std::shared_ptr<arrow::Table> attributes_table_ = {}; | ||
|
|
||
| std::unordered_map<idx_t, int64_t> id_to_index_; ///< Map of vector ID to index | ||
|
|
||
|
|
@@ -88,7 +89,7 @@ class IndexPartition { | |
| * @param new_ids Pointer to the new vector IDs. | ||
| * @param new_codes Pointer to the new encoded vectors. | ||
| */ | ||
| void append(int64_t n_entry, const idx_t* new_ids, const uint8_t* new_codes); | ||
| void append(int64_t n_entry, const idx_t* new_ids, const uint8_t* new_codes, std::shared_ptr<arrow::Table> attributes_table=nullptr); | ||
|
|
||
| /** | ||
| * @brief Update existing entries in place. | ||
|
|
@@ -111,6 +112,15 @@ class IndexPartition { | |
| */ | ||
| void remove(int64_t index); | ||
|
|
||
| /** | ||
| * @brief Remove the associated attribute of an entry from the partition. Used in conjunction with the remove(index) function. | ||
| * | ||
| * Removes the attribute by performing masking & filtering | ||
| * | ||
| * @param index Index of the vector to remove. | ||
| */ | ||
| void removeAttribute(int64_t index); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this function needed? It seems unnecessary. |
||
|
|
||
| /** | ||
| * @brief Resize the partition. | ||
| * | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -148,6 +148,10 @@ class TypedTopKBuffer { | |
| partitions_scanned_.fetch_add(1, std::memory_order_relaxed); | ||
| } | ||
|
|
||
| void remove(int rejected_index) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would avoid modifying the topkbuffer class. Just make sure that the elements you add to the buffer pass the filter (in the case of pre-filtering). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added this for the post-filtering case. After we get the top-k buffer from one partition, we need to remove whatever doesn't pass the filter. This function serves that purpose. |
||
| topk_[rejected_index] = topk_[--curr_offset_]; | ||
| } | ||
|
|
||
| DistanceType flush() { | ||
| std::lock_guard<std::recursive_mutex> buffer_lock(buffer_mutex_); | ||
| if (curr_offset_ > k_) { | ||
|
|
@@ -280,11 +284,22 @@ inline void scan_list_with_ids_l2(const float *query_vec, | |
| const int64_t *list_ids, | ||
| int list_size, | ||
| int d, | ||
| TopkBuffer &buffer) { | ||
| TopkBuffer &buffer, | ||
| bool* bitmap = nullptr) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Switch to a vector so we avoid memory leaks. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure. |
||
| const float *vec = list_vecs; | ||
| for (int l = 0; l < list_size; l++) { | ||
| buffer.add(sqrt(faiss::fvec_L2sqr(query_vec, vec, d)), list_ids[l]); | ||
| vec += d; | ||
|
|
||
| if (bitmap == nullptr) { | ||
| for (int l = 0; l < list_size; l++) { | ||
| buffer.add(sqrt(faiss::fvec_L2sqr(query_vec, vec, d)), list_ids[l]); | ||
| vec += d; | ||
| } | ||
| } else { | ||
| for (int l = 0; l < list_size; l++) { | ||
| if (bitmap[l]) { | ||
| buffer.add(sqrt(faiss::fvec_L2sqr(query_vec, vec, d)), list_ids[l]); | ||
| } | ||
| vec += d; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -295,7 +310,8 @@ inline void scan_list(const float *query_vec, | |
| int list_size, | ||
| int d, | ||
| TopkBuffer &buffer, | ||
| faiss::MetricType metric = faiss::METRIC_L2) { | ||
| faiss::MetricType metric = faiss::METRIC_L2, | ||
| bool* bitmap = nullptr) { | ||
| // Dispatch based on metric type and whether list_ids is provided. | ||
| if (metric == faiss::METRIC_INNER_PRODUCT) { | ||
| if (list_ids == nullptr) | ||
|
|
@@ -306,7 +322,7 @@ inline void scan_list(const float *query_vec, | |
| if (list_ids == nullptr) | ||
| scan_list_no_ids_l2(query_vec, list_vecs, list_size, d, buffer); | ||
| else | ||
| scan_list_with_ids_l2(query_vec, list_vecs, list_ids, list_size, d, buffer); | ||
| scan_list_with_ids_l2(query_vec, list_vecs, list_ids, list_size, d, buffer, bitmap); | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what is this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the price threshold used when no value is given by the user. I will modify this while making the filtering column-name agnostic.