Skip to content
This repository was archived by the owner on Jul 31, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions crates/meilisearch/tests/search/hybrid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -625,3 +625,109 @@ async fn retrieve_vectors() {
]
"###);
}
// Test for hybrid search distinct bug
static DOCUMENTS_WITH_DUPLICATES: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Captain Marvel",
"desc": "First Captain Marvel movie",
"id": "1",
"category": "superhero",
"_vectors": {"default": [1.0, 2.0]},
},
{
"title": "Captain Marvel Returns",
"desc": "Second Captain Marvel movie",
"id": "2",
"category": "superhero",
"_vectors": {"default": [1.1, 2.1]},
},
{
"title": "Shazam!",
"desc": "DC's Captain Marvel",
"id": "3",
"category": "superhero",
"_vectors": {"default": [2.0, 3.0]},
},
{
"title": "Iron Man",
"desc": "Tony Stark's story",
"id": "4",
"category": "action",
"_vectors": {"default": [3.0, 1.0]},
}
])
});

async fn index_with_distinct_and_vectors<'a>(
server: &'a Server,
documents: &Value,
) -> Index<'a> {
let index = server.index("test");

// Set up embedders
let (response, code) = index
.update_settings(json!({
"embedders": {"default": {
"source": "userProvided",
"dimensions": 2
}},
"distinctAttribute": "category"
}))
.await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await.succeeded();

// Add documents
let (response, code) = index.add_documents(documents.clone(), None).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await.succeeded();

index
}

#[actix_rt::test]
async fn test_hybrid_search_respects_distinct_attribute() {
let server = Server::new().await;
let index = index_with_distinct_and_vectors(&server, &DOCUMENTS_WITH_DUPLICATES).await;

// First test: pure keyword search with distinct - should only return 2 documents (one per category)
let (response, code) = index
.search_post(
json!({"q": "Captain", "showRankingScore": true}),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will return a single document because the one in the action category is not matched by the search

)
.await;
snapshot!(code, @"200 OK");
// Should only return 2 hits: one from "superhero" category and one from "action" category
// But due to the bug, it returns more than expected

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this comment, the bug is on the semantic search, not on the hybrid search

let keyword_hits = response["hits"].as_array().unwrap().len();
println!("Keyword search hits: {}", keyword_hits);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we do not print results in tests; please use snapshots to assert the number of results


// Now test: hybrid search with distinct - this should also respect the distinct constraint
let (response, code) = index
.search_post(
json!({
"q": "Captain",
"vector": [1.5, 2.5],
"hybrid": {"semanticRatio": 0.5, "embedder": "default"},
"showRankingScore": true
}),
)
.await;
snapshot!(code, @"200 OK");

let hybrid_hits = response["hits"].as_array().unwrap().len();
println!("Hybrid search hits: {}", hybrid_hits);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same remark as above


// Check that we only get one document per distinct category
let mut categories_seen = std::collections::HashSet::new();
for hit in response["hits"].as_array().unwrap() {
let category = hit["category"].as_str().unwrap();
assert!(!categories_seen.contains(category),
"Duplicate category '{}' found in hybrid search results - distinct constraint violated!", category);
categories_seen.insert(category);
}

// This assertion will fail due to the bug - hybrid search ignores distinct constraint
assert!(hybrid_hits <= 2, "Hybrid search should respect distinct constraint and return at most 2 documents (one per category), but got {}", hybrid_hits);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests in Meilisearch are written to avoid accidental changes, please snapshot on the exact number of hits rather than asserting the logical condition here.

}
77 changes: 58 additions & 19 deletions crates/milli/src/search/hybrid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ use roaring::RoaringBitmap;

use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::search::SemanticSearch;
use crate::{MatchingWords, Result, Search, SearchResult};
use crate::search::new::distinct::apply_distinct_rule;
use crate::{MatchingWords, Result, Search, SearchResult, SearchContext};

struct ScoreWithRatioResult {
matching_words: MatchingWords,
Expand Down Expand Up @@ -89,9 +90,10 @@ impl ScoreWithRatioResult {
fn merge(
vector_results: Self,
keyword_results: Self,
search: &Search<'_>,
from: usize,
length: usize,
) -> (SearchResult, u32) {
) -> Result<(SearchResult, u32)> {
#[derive(Clone, Copy)]
enum ResultSource {
Semantic,
Expand Down Expand Up @@ -123,10 +125,6 @@ impl ScoreWithRatioResult {
)
// remove documents we already saw
.filter(|((docid, _), _)| documents_seen.insert(*docid))
// start skipping **after** the filter
.skip(from)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the filters at this point breaks the semantic_hit_count, which is no longer representative of the number of semantic hits in these results.

// take **after** skipping
.take(length)
{
if let ResultSource::Semantic = source {
semantic_hit_count += 1;
Expand All @@ -136,18 +134,59 @@ impl ScoreWithRatioResult {
document_scores.push(main_score);
}

(
SearchResult {
matching_words: keyword_results.matching_words,
candidates: vector_results.candidates | keyword_results.candidates,
documents_ids,
document_scores,
degraded: vector_results.degraded | keyword_results.degraded,
used_negative_operator: vector_results.used_negative_operator
| keyword_results.used_negative_operator,
},
semantic_hit_count,
)
let mut final_result = SearchResult {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: we usually build an immutable object from its final constituents, rather than a mutable object and then edit its fields.

matching_words: keyword_results.matching_words,
candidates: vector_results.candidates | keyword_results.candidates,
documents_ids,
document_scores,
degraded: vector_results.degraded | keyword_results.degraded,
used_negative_operator: vector_results.used_negative_operator
| keyword_results.used_negative_operator,
};

// Apply distinct rule if a distinct field is configured, similar to bucket_sort
let distinct_field = match search.distinct.as_deref() {
Some(distinct) => Some(distinct),
None => search.index.distinct_field(search.rtxn)?,
};

if let Some(f) = distinct_field {
let fields_ids_map = search.index.fields_ids_map(search.rtxn)?;
if let Some(distinct_fid) = fields_ids_map.id(f) {
let mut ctx = SearchContext::new(search.index, search.rtxn)?;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

performance: SearchContext is a slightly heavyweight object that we'd rather not rebuild here.

let all_candidates: RoaringBitmap = final_result.documents_ids.iter().copied().collect();
let remaining_candidates = apply_distinct_rule(&mut ctx, distinct_fid, &all_candidates)?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, this call will give priority to distinct documents with a lower document id, breaking the order of documents returned by the search.

.remaining;

// Filter documents_ids and document_scores to only include remaining candidates
let mut filtered_docs = Vec::new();
let mut filtered_scores = Vec::new();
for (i, &docid) in final_result.documents_ids.iter().enumerate() {
if remaining_candidates.contains(docid) {
filtered_docs.push(docid);
filtered_scores.push(final_result.document_scores[i].clone());
}
}
final_result.documents_ids = filtered_docs;
final_result.document_scores = filtered_scores;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the candidates from the final result are not updated, meaning that the field distribution and the total hits will be wrong

}
}

// Apply offset and limit after distinct filtering
let (documents_ids, document_scores) = if from >= final_result.documents_ids.len() {
(vec![], vec![])
} else {
let end = std::cmp::min(from + length, final_result.documents_ids.len());
(
final_result.documents_ids[from..end].to_vec(),
final_result.document_scores[from..end].to_vec(),
)
};

final_result.documents_ids = documents_ids;
final_result.document_scores = document_scores;

Ok((final_result, semantic_hit_count))
}
}

Expand Down Expand Up @@ -227,7 +266,7 @@ impl Search<'_> {
let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio);

let (merge_results, semantic_hit_count) =
ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit);
ScoreWithRatioResult::merge(vector_results, keyword_results, self, self.offset, self.limit)?;
assert!(merge_results.documents_ids.len() <= self.limit);
Ok((merge_results, Some(semantic_hit_count)))
}
Expand Down
2 changes: 1 addition & 1 deletion crates/milli/src/search/new/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
mod bucket_sort;
mod db_cache;
mod distinct;
pub mod distinct;
mod geo_sort;
mod graph_based_ranking_rule;
mod interner;
Expand Down