Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions src/core/search/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ struct BaseSortIndex : BaseIndex {
See index_result.h for more details. */
struct SeekableTag {};

// Advances *it until it either equals end or refers to an entry whose document id is
// >= min_doc_id. The definition lives in the implementation section further down this header.
template <typename Iterator> void BasicSeekGE(DocId min_doc_id, const Iterator& end, Iterator* it);

/* Used for converting field values to double. Returns std::nullopt if the conversion fails */
std::optional<double> ParseNumericField(std::string_view value);

Expand All @@ -114,4 +116,43 @@ template <typename InlinedVector> std::optional<InlinedVector> EmptyAccessResult
#endif
}

// Implementation
/******************************************************************/
namespace details {
// Returns the largest power of two that is <= n, or 0 when n == 0.
//
// Zero is handled explicitly because the count-leading-zeros builtin has an undefined result for
// a zero argument. The operand is widened to unsigned long long so that the builtin's operand
// width matches on every platform (size_t is not `unsigned long` everywhere).
inline size_t GetHighestPowerOfTwo(size_t n) {
  if (n == 0) {
    return 0;
  }

  static constexpr size_t kBitsNumber = sizeof(unsigned long long) * 8;
  const auto wide = static_cast<unsigned long long>(n);
  const auto leading_zeros = static_cast<size_t>(__builtin_clzll(wide));
  return size_t(1) << (kBitsNumber - 1 - leading_zeros);
}
}  // namespace details

// Advances *it until it either equals end or refers to an entry whose document id is
// >= min_doc_id. Entries may be plain DocId values or pair-like values whose `first` member is
// the document id.
//
// For random access iterators most of the distance is covered with power-of-two sized jumps; a
// final linear scan (the only phase for weaker iterator categories) settles on the result.
// NOTE(review): the jump phase presumably relies on the range being sorted by ascending document
// id - confirm with callers.
template <typename Iterator> void BasicSeekGE(DocId min_doc_id, const Iterator& end, Iterator* it) {
  using Category = typename std::iterator_traits<Iterator>::iterator_category;

  // Reads the document id out of either supported entry representation.
  auto extract_doc_id = [](const auto& value) {
    using T = std::decay_t<decltype(value)>;
    if constexpr (std::is_same_v<T, DocId>) {
      return value;
    } else {
      return value.first;
    }
  };

  if constexpr (std::is_base_of_v<std::random_access_iterator_tag, Category>) {
    size_t length = std::distance(*it, end);
    // An empty range has nothing to jump over; skipping it also avoids requesting the highest
    // power of two of zero.
    if (length > 0) {
      for (size_t step = details::GetHighestPowerOfTwo(length); step > 0; step >>= 1) {
        // Only jump when the landing position is still inside the remaining range.
        if (step >= length) {
          continue;
        }
        auto next_it = *it + step;
        if (extract_doc_id(*next_it) < min_doc_id) {
          *it = next_it;
          length -= step;
        }
      }
    }
  }

  // Finish (or, for non random access iterators, perform) the seek one entry at a time.
  while (*it != end && extract_doc_id(**it) < min_doc_id) {
    ++(*it);
  }
}

} // namespace dfly::search
40 changes: 40 additions & 0 deletions src/core/search/block_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,46 @@ typename BlockList<C>::BlockListIterator& BlockList<C>::BlockListIterator::opera
return *this;
}

// Positions the iterator on the first entry whose document id is >= min_doc_id.
// When the block iterator is already at it_end, or no later block can hold such an entry, the
// in-block iterators are reset to their default values and the function returns.
template <typename C> void BlockList<C>::BlockListIterator::SeekGE(DocId min_doc_id) {
  if (it == it_end) {
    block_it = {};
    block_end = {};
    return;
  }

  // Entries are either bare DocId values or pair-like values keyed by `first`.
  auto doc_id_of = [](const auto& entry) {
    using Entry = std::decay_t<decltype(entry)>;
    if constexpr (std::is_same_v<Entry, DocId>) {
      return entry;
    } else {
      return entry.first;
    }
  };

  // A block is a candidate only if it is non-empty and its last entry is >= min_doc_id.
  auto may_hold_target = [&](const auto& block) {
    return block->begin() != block->end() && min_doc_id <= doc_id_of(block->Back());
  };

  if (!may_hold_target(it)) {
    // The current block is exhausted for this target: skip ahead to the first candidate block.
    do {
      ++it;
    } while (it != it_end && !may_hold_target(it));

    if (it == it_end) {
      block_it = {};
      block_end = {};
      return;
    }

    block_it = it->begin();
    block_end = it->end();
  }

  // Finish the seek inside the chosen block.
  BasicSeekGE(min_doc_id, block_end, &block_it);
  DCHECK(block_it != block_end && min_doc_id <= doc_id_of(*block_it));
}

// Explicit instantiations of BlockList for the container types listed here.
template class BlockList<CompressedSortedSet>;
template class BlockList<SortedVector<DocId>>;
template class BlockList<SortedVector<std::pair<DocId, double>>>;
Expand Down
7 changes: 6 additions & 1 deletion src/core/search/block_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ template <typename Container /* underlying container */> class BlockList {
blocks_.clear();
}

struct BlockListIterator {
struct BlockListIterator : public SeekableTag {
// To make it work with std container constructors
using iterator_category = std::forward_iterator_tag;
using difference_type = std::ptrdiff_t;
Expand All @@ -97,6 +97,7 @@ template <typename Container /* underlying container */> class BlockList {
}

BlockListIterator& operator++();
void SeekGE(DocId min_doc_id);

friend class BlockList;

Expand Down Expand Up @@ -169,6 +170,10 @@ template <typename T> class SortedVector {
entries_.clear();
}

// Returns a reference to the last element of entries_.
// NOTE(review): there is no emptiness check here - presumably callers must make sure the
// container is non-empty before calling; confirm.
const T& Back() const {
return entries_.back();
}

using iterator = typename PMR_NS::vector<T>::const_iterator;

iterator begin() const {
Expand Down
8 changes: 6 additions & 2 deletions src/core/search/compressed_sorted_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,12 @@ std::pair<CompressedSortedSet, CompressedSortedSet> CompressedSortedSet::Split()

// Move iterator to middle position and save size of diffs tail
auto it = begin();
std::advance(it, size_ / 2);
std::advance(it, (size_ - 1) / 2);

// Save last value in the first set
tail_value_ = *it;
++it;

size_t keep_bytes = it.last_read_.data() - diffs_.data();

// Copy second half into second set
Expand All @@ -215,7 +220,6 @@ std::pair<CompressedSortedSet, CompressedSortedSet> CompressedSortedSet::Split()

// Erase diffs tail
diffs_.resize(keep_bytes);
tail_value_ = std::nullopt;
size_ -= second.Size();

return std::make_pair(std::move(*this), std::move(second));
Expand Down
5 changes: 5 additions & 0 deletions src/core/search/compressed_sorted_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ class CompressedSortedSet {
// Split into two equally sized halves
std::pair<CompressedSortedSet, CompressedSortedSet> Split() &&;

// Returns the value stored in tail_value_ (presumably the last element of the set - confirm).
// The debug check requires the set to be non-empty and tail_value_ to be populated.
IntType Back() const {
DCHECK(!Empty() && tail_value_.has_value());
return tail_value_.value();
}

private:
struct EntryLocation {
IntType value; // Value or 0
Expand Down
27 changes: 23 additions & 4 deletions src/core/search/index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,31 @@ using BackInserter = std::back_insert_iterator<std::vector<DocId>>;
// True when T derives from SeekableTag. Seek() calls SeekGE(DocId) on such iterators.
template <typename T> constexpr bool IsSeekableIterator = std::is_base_of_v<SeekableTag, T>;

template <typename Iterator> void Seek(DocId min_doc_id, const Iterator& end, Iterator* it) {
if constexpr (IsSeekableIterator<Iterator>) {
it->SeekGE(min_doc_id);
static constexpr DocId kFastSeekThreshold = 15;

auto extract_doc_id = [](const auto& value) {
using T = std::decay_t<decltype(value)>;
if constexpr (std::is_same_v<T, DocId>) {
return value;
} else {
return value.first;
}
};

DocId current_value = extract_doc_id(**it);
DCHECK(current_value < min_doc_id);

if (min_doc_id - current_value > kFastSeekThreshold) { // If the gap is large, use a fast seek
if constexpr (IsSeekableIterator<Iterator>) {
it->SeekGE(min_doc_id);
} else {
BasicSeekGE(min_doc_id, end, it);
}
} else {
while (*it != end && **it < min_doc_id) {
// If the gap is small, just iterate
do {
++(*it);
}
} while (*it != end && extract_doc_id(**it) < min_doc_id);
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/core/search/range_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,9 @@ inline RangeFilterIterator& RangeFilterIterator::operator++() {
}

inline void RangeFilterIterator::SeekGE(DocId min_doc_id) {
while (current_ != end_ && (!InRange(current_) || (*current_).first < min_doc_id)) {
current_.SeekGE(min_doc_id);
while (current_ != end_ && !InRange(current_)) {
DCHECK((*current_).first >= min_doc_id);
++current_;
}
}
Expand Down
50 changes: 50 additions & 0 deletions src/core/search/search_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1678,6 +1678,56 @@ BENCHMARK(BM_SearchSeveralNumericAndTagIndexes)
->Arg(1000000)
->ArgNames({"num_docs"});

// Benchmarks a query made of two full-range numeric clauses. Every indexed document has both
// fields, so each iteration checks that all num_docs documents are returned.
static void BM_SearchMergeEqualSets(benchmark::State& state) {
  using NumericType = long long;
  constexpr NumericType kLowest = std::numeric_limits<NumericType>::min();
  constexpr NumericType kHighest = std::numeric_limits<NumericType>::max();

  auto schema = MakeSimpleSchema({
      {"numeric1", SchemaField::NUMERIC,
       SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
      {"numeric2", SchemaField::NUMERIC,
       SchemaField::NumericParams{.block_size = kMaxRangeBlockSize}},
  });

  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  // Fill the index with documents carrying two independent random numeric values.
  std::default_random_engine rnd;
  uniform_int_distribution<NumericType> dist1(kLowest, kHighest);
  uniform_int_distribution<NumericType> dist2(kLowest, kHighest);

  const size_t num_docs = state.range(0);
  for (size_t doc_id = 0; doc_id < num_docs; ++doc_id) {
    MockedDocument doc{Map{
        {"numeric1", std::to_string(dist1(rnd))},
        {"numeric2", std::to_string(dist2(rnd))},
    }};
    indices.Add(doc_id, doc);
  }

  const std::string query = "@numeric1:[-inf +inf] @numeric2:[-inf +inf]";

  SearchAlgorithm algo;
  QueryParams params;

  while (state.KeepRunning()) {
    CHECK(algo.Init(query, &params));
    auto result = algo.Search(&indices);
    CHECK(result.error.empty());

    // Both clauses cover the whole value range, so every document is expected to match.
    CHECK_EQ(result.total, num_docs);
    CHECK_EQ(result.ids.size(), num_docs);
  }
}

BENCHMARK(BM_SearchMergeEqualSets)
    ->Arg(100)
    ->Arg(1000)
    ->Arg(10000)
    ->Arg(100000)
    ->Arg(1000000)
    ->ArgNames({"num_docs"});

#ifdef USE_SIMSIMD

#define SIMSIMD_NATIVE_F16 0
Expand Down
Loading