From 4b39e577f95143b165c7ca210079d912bdeaba54 Mon Sep 17 00:00:00 2001 From: Priyank Chodisetti Date: Tue, 27 May 2025 15:46:13 -0700 Subject: [PATCH] fix: apply distinct constraint in hybrid search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hybrid search was not respecting the distinct attribute configuration, returning duplicate documents from the same distinct group. This fix ensures distinct filtering is applied after merging vector and keyword results but before pagination. Changes: - Apply distinct rule in hybrid search merge logic - Add comprehensive test covering the distinct constraint bug - Make distinct module public to enable reuse in hybrid search - Update function signature to handle distinct filtering errors The fix maintains the correct order of operations: merge → distinct → paginate, ensuring hybrid search behaves consistently with regular keyword search regarding distinct attributes. --- crates/meilisearch/tests/search/hybrid.rs | 106 ++++++++++++++++++++++ crates/milli/src/search/hybrid.rs | 77 ++++++++++++---- crates/milli/src/search/new/mod.rs | 2 +- 3 files changed, 165 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index 3282a357a0..1dccd74be7 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ b/crates/meilisearch/tests/search/hybrid.rs @@ -625,3 +625,109 @@ async fn retrieve_vectors() { ] "###); } +// Test for hybrid search distinct bug +static DOCUMENTS_WITH_DUPLICATES: Lazy<Value> = Lazy::new(|| { + json!([ + { + "title": "Captain Marvel", + "desc": "First Captain Marvel movie", + "id": "1", + "category": "superhero", + "_vectors": {"default": [1.0, 2.0]}, + }, + { + "title": "Captain Marvel Returns", + "desc": "Second Captain Marvel movie", + "id": "2", + "category": "superhero", + "_vectors": {"default": [1.1, 2.1]}, + }, + { + "title": "Shazam!", + "desc": "DC's Captain Marvel", + "id": "3", + "category": 
"superhero", + "_vectors": {"default": [2.0, 3.0]}, + }, + { + "title": "Iron Man", + "desc": "Tony Stark's story", + "id": "4", + "category": "action", + "_vectors": {"default": [3.0, 1.0]}, + } + ]) +}); + +async fn index_with_distinct_and_vectors<'a>( + server: &'a Server, + documents: &Value, +) -> Index<'a> { + let index = server.index("test"); + + // Set up embedders + let (response, code) = index + .update_settings(json!({ + "embedders": {"default": { + "source": "userProvided", + "dimensions": 2 + }}, + "distinctAttribute": "category" + })) + .await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await.succeeded(); + + // Add documents + let (response, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await.succeeded(); + + index +} + +#[actix_rt::test] +async fn test_hybrid_search_respects_distinct_attribute() { + let server = Server::new().await; + let index = index_with_distinct_and_vectors(&server, &DOCUMENTS_WITH_DUPLICATES).await; + + // First test: pure keyword search with distinct - should only return 2 documents (one per category) + let (response, code) = index + .search_post( + json!({"q": "Captain", "showRankingScore": true}), + ) + .await; + snapshot!(code, @"200 OK"); + // Should only return 2 hits: one from "superhero" category and one from "action" category + // But due to the bug, it returns more than expected + let keyword_hits = response["hits"].as_array().unwrap().len(); + println!("Keyword search hits: {}", keyword_hits); + + // Now test: hybrid search with distinct - this should also respect the distinct constraint + let (response, code) = index + .search_post( + json!({ + "q": "Captain", + "vector": [1.5, 2.5], + "hybrid": {"semanticRatio": 0.5, "embedder": "default"}, + "showRankingScore": true + }), + ) + .await; + snapshot!(code, @"200 OK"); + + let hybrid_hits = response["hits"].as_array().unwrap().len(); + 
println!("Hybrid search hits: {}", hybrid_hits); + + // Check that we only get one document per distinct category + let mut categories_seen = std::collections::HashSet::new(); + for hit in response["hits"].as_array().unwrap() { + let category = hit["category"].as_str().unwrap(); + assert!(!categories_seen.contains(category), + "Duplicate category '{}' found in hybrid search results - distinct constraint violated!", category); + categories_seen.insert(category); + } + + // This assertion will fail due to the bug - hybrid search ignores distinct constraint + assert!(hybrid_hits <= 2, "Hybrid search should respect distinct constraint and return at most 2 documents (one per category), but got {}", hybrid_hits); +} diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index e07f886c9c..408e7ff853 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -5,7 +5,8 @@ use roaring::RoaringBitmap; use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy}; use crate::search::SemanticSearch; -use crate::{MatchingWords, Result, Search, SearchResult}; +use crate::search::new::distinct::apply_distinct_rule; +use crate::{MatchingWords, Result, Search, SearchResult, SearchContext}; struct ScoreWithRatioResult { matching_words: MatchingWords, @@ -89,9 +90,10 @@ impl ScoreWithRatioResult { fn merge( vector_results: Self, keyword_results: Self, + search: &Search<'_>, from: usize, length: usize, - ) -> (SearchResult, u32) { + ) -> Result<(SearchResult, u32)> { #[derive(Clone, Copy)] enum ResultSource { Semantic, @@ -123,10 +125,6 @@ impl ScoreWithRatioResult { ) // remove documents we already saw .filter(|((docid, _), _)| documents_seen.insert(*docid)) - // start skipping **after** the filter - .skip(from) - // take **after** skipping - .take(length) { if let ResultSource::Semantic = source { semantic_hit_count += 1; @@ -136,18 +134,59 @@ impl ScoreWithRatioResult { document_scores.push(main_score); } - ( - 
SearchResult { - matching_words: keyword_results.matching_words, - candidates: vector_results.candidates | keyword_results.candidates, - documents_ids, - document_scores, - degraded: vector_results.degraded | keyword_results.degraded, - used_negative_operator: vector_results.used_negative_operator - | keyword_results.used_negative_operator, - }, - semantic_hit_count, - ) + let mut final_result = SearchResult { + matching_words: keyword_results.matching_words, + candidates: vector_results.candidates | keyword_results.candidates, + documents_ids, + document_scores, + degraded: vector_results.degraded | keyword_results.degraded, + used_negative_operator: vector_results.used_negative_operator + | keyword_results.used_negative_operator, + }; + + // Apply distinct rule if a distinct field is configured, similar to bucket_sort + let distinct_field = match search.distinct.as_deref() { + Some(distinct) => Some(distinct), + None => search.index.distinct_field(search.rtxn)?, + }; + + if let Some(f) = distinct_field { + let fields_ids_map = search.index.fields_ids_map(search.rtxn)?; + if let Some(distinct_fid) = fields_ids_map.id(f) { + let mut ctx = SearchContext::new(search.index, search.rtxn)?; + let all_candidates: RoaringBitmap = final_result.documents_ids.iter().copied().collect(); + let remaining_candidates = apply_distinct_rule(&mut ctx, distinct_fid, &all_candidates)? 
+ .remaining; + + // Filter documents_ids and document_scores to only include remaining candidates + let mut filtered_docs = Vec::new(); + let mut filtered_scores = Vec::new(); + for (i, &docid) in final_result.documents_ids.iter().enumerate() { + if remaining_candidates.contains(docid) { + filtered_docs.push(docid); + filtered_scores.push(final_result.document_scores[i].clone()); + } + } + final_result.documents_ids = filtered_docs; + final_result.document_scores = filtered_scores; + } + } + + // Apply offset and limit after distinct filtering + let (documents_ids, document_scores) = if from >= final_result.documents_ids.len() { + (vec![], vec![]) + } else { + let end = std::cmp::min(from + length, final_result.documents_ids.len()); + ( + final_result.documents_ids[from..end].to_vec(), + final_result.document_scores[from..end].to_vec(), + ) + }; + + final_result.documents_ids = documents_ids; + final_result.document_scores = document_scores; + + Ok((final_result, semantic_hit_count)) } } @@ -227,7 +266,7 @@ impl Search<'_> { let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio); let (merge_results, semantic_hit_count) = - ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit); + ScoreWithRatioResult::merge(vector_results, keyword_results, self, self.offset, self.limit)?; assert!(merge_results.documents_ids.len() <= self.limit); Ok((merge_results, Some(semantic_hit_count))) } diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index 0a3bc1b04c..91d6663270 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -1,6 +1,6 @@ mod bucket_sort; mod db_cache; -mod distinct; +pub mod distinct; mod geo_sort; mod graph_based_ranking_rule; mod interner;