diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs
index 3282a357a0..1dccd74be7 100644
--- a/crates/meilisearch/tests/search/hybrid.rs
+++ b/crates/meilisearch/tests/search/hybrid.rs
@@ -625,3 +625,109 @@ async fn retrieve_vectors() {
     ]
     "###);
 }
+// Regression test: hybrid search must respect the distinct attribute
+static DOCUMENTS_WITH_DUPLICATES: Lazy<Value> = Lazy::new(|| {
+    json!([
+        {
+            "title": "Captain Marvel",
+            "desc": "First Captain Marvel movie",
+            "id": "1",
+            "category": "superhero",
+            "_vectors": {"default": [1.0, 2.0]},
+        },
+        {
+            "title": "Captain Marvel Returns",
+            "desc": "Second Captain Marvel movie",
+            "id": "2",
+            "category": "superhero",
+            "_vectors": {"default": [1.1, 2.1]},
+        },
+        {
+            "title": "Shazam!",
+            "desc": "DC's Captain Marvel",
+            "id": "3",
+            "category": "superhero",
+            "_vectors": {"default": [2.0, 3.0]},
+        },
+        {
+            "title": "Iron Man",
+            "desc": "Tony Stark's story",
+            "id": "4",
+            "category": "action",
+            "_vectors": {"default": [3.0, 1.0]},
+        }
+    ])
+});
+
+async fn index_with_distinct_and_vectors<'a>(
+    server: &'a Server,
+    documents: &Value,
+) -> Index<'a> {
+    let index = server.index("test");
+
+    // Configure a user-provided embedder and the distinct attribute
+    let (response, code) = index
+        .update_settings(json!({
+            "embedders": {"default": {
+                "source": "userProvided",
+                "dimensions": 2
+            }},
+            "distinctAttribute": "category"
+        }))
+        .await;
+    assert_eq!(202, code, "{:?}", response);
+    index.wait_task(response.uid()).await.succeeded();
+
+    // Add the documents
+    let (response, code) = index.add_documents(documents.clone(), None).await;
+    assert_eq!(202, code, "{:?}", response);
+    index.wait_task(response.uid()).await.succeeded();
+
+    index
+}
+
+#[actix_rt::test]
+async fn test_hybrid_search_respects_distinct_attribute() {
+    let server = Server::new().await;
+    let index = index_with_distinct_and_vectors(&server, &DOCUMENTS_WITH_DUPLICATES).await;
+
+    // Baseline: pure keyword search with a distinct attribute returns at most one document per category
+    let (response, code) = index
+        .search_post(
+            json!({"q": "Captain", "showRankingScore": true}),
+        )
+        .await;
+    snapshot!(code, @"200 OK");
+    // Only the "superhero" documents match "Captain", so the keyword-only path is expected to
+    // return a single hit here: bucket_sort already applies the distinct rule
+    let keyword_hits = response["hits"].as_array().unwrap().len();
+    println!("Keyword search hits: {}", keyword_hits);
+
+    // Now the hybrid search: it must respect the distinct constraint as well
+    let (response, code) = index
+        .search_post(
+            json!({
+                "q": "Captain",
+                "vector": [1.5, 2.5],
+                "hybrid": {"semanticRatio": 0.5, "embedder": "default"},
+                "showRankingScore": true
+            }),
+        )
+        .await;
+    snapshot!(code, @"200 OK");
+
+    let hybrid_hits = response["hits"].as_array().unwrap().len();
+    println!("Hybrid search hits: {}", hybrid_hits);
+
+    // Check that we only get one document per distinct category
+    let mut categories_seen = std::collections::HashSet::new();
+    for hit in response["hits"].as_array().unwrap() {
+        let category = hit["category"].as_str().unwrap();
+        assert!(!categories_seen.contains(category),
+            "Duplicate category '{}' found in hybrid search results - distinct constraint violated!", category);
+        categories_seen.insert(category);
+    }
+
+    // Without the fix, this assertion fails: hybrid search ignored the distinct attribute
+    assert!(hybrid_hits <= 2, "Hybrid search should respect distinct constraint and return at most 2 documents (one per category), but got {}", hybrid_hits);
+}
diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs
index e07f886c9c..408e7ff853 100644
--- a/crates/milli/src/search/hybrid.rs
+++ b/crates/milli/src/search/hybrid.rs
@@ -5,7 +5,8 @@ use roaring::RoaringBitmap;
 
 use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
 use crate::search::SemanticSearch;
-use crate::{MatchingWords, Result, Search, SearchResult};
+use crate::search::new::distinct::apply_distinct_rule;
+use crate::{MatchingWords, Result, Search, SearchContext, SearchResult};
 
 struct ScoreWithRatioResult {
     matching_words: MatchingWords,
@@ -89,9 +90,10 @@ impl ScoreWithRatioResult {
     fn merge(
         vector_results: Self,
         keyword_results: Self,
+        search: &Search<'_>,
         from: usize,
         length: usize,
-    ) -> (SearchResult, u32) {
+    ) -> Result<(SearchResult, u32)> {
         #[derive(Clone, Copy)]
         enum ResultSource {
             Semantic,
@@ -123,10 +125,6 @@
             )
             // remove documents we already saw
            .filter(|((docid, _), _)| documents_seen.insert(*docid))
-            // start skipping **after** the filter
-            .skip(from)
-            // take **after** skipping
-            .take(length)
         {
             if let ResultSource::Semantic = source {
                 semantic_hit_count += 1;
@@ -136,18 +134,59 @@
             document_scores.push(main_score);
         }
 
-        (
-            SearchResult {
-                matching_words: keyword_results.matching_words,
-                candidates: vector_results.candidates | keyword_results.candidates,
-                documents_ids,
-                document_scores,
-                degraded: vector_results.degraded | keyword_results.degraded,
-                used_negative_operator: vector_results.used_negative_operator
-                    | keyword_results.used_negative_operator,
-            },
-            semantic_hit_count,
-        )
+        let mut final_result = SearchResult {
+            matching_words: keyword_results.matching_words,
+            candidates: vector_results.candidates | keyword_results.candidates,
+            documents_ids,
+            document_scores,
+            degraded: vector_results.degraded | keyword_results.degraded,
+            used_negative_operator: vector_results.used_negative_operator
+                | keyword_results.used_negative_operator,
+        };
+
+        // Apply distinct rule if a distinct field is configured, similar to bucket_sort
+        let distinct_field = match search.distinct.as_deref() {
+            Some(distinct) => Some(distinct),
+            None => search.index.distinct_field(search.rtxn)?,
+        };
+
+        if let Some(f) = distinct_field {
+            let fields_ids_map = search.index.fields_ids_map(search.rtxn)?;
+            if let Some(distinct_fid) = fields_ids_map.id(f) {
+                let mut ctx = SearchContext::new(search.index, search.rtxn)?;
+                let all_candidates: RoaringBitmap = final_result.documents_ids.iter().copied().collect();
+                let remaining_candidates = apply_distinct_rule(&mut ctx, distinct_fid, &all_candidates)?
+                    .remaining;
+
+                // Filter documents_ids and document_scores to only include remaining candidates
+                let mut filtered_docs = Vec::new();
+                let mut filtered_scores = Vec::new();
+                for (i, &docid) in final_result.documents_ids.iter().enumerate() {
+                    if remaining_candidates.contains(docid) {
+                        filtered_docs.push(docid);
+                        filtered_scores.push(final_result.document_scores[i].clone());
+                    }
+                }
+                final_result.documents_ids = filtered_docs;
+                final_result.document_scores = filtered_scores;
+            }
+        }
+
+        // Apply offset and limit after distinct filtering
+        let (documents_ids, document_scores) = if from >= final_result.documents_ids.len() {
+            (vec![], vec![])
+        } else {
+            let end = std::cmp::min(from + length, final_result.documents_ids.len());
+            (
+                final_result.documents_ids[from..end].to_vec(),
+                final_result.document_scores[from..end].to_vec(),
+            )
+        };
+
+        final_result.documents_ids = documents_ids;
+        final_result.document_scores = document_scores;
+
+        Ok((final_result, semantic_hit_count))
     }
 }
 
@@ -227,7 +266,7 @@ impl Search<'_> {
         let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio);
 
         let (merge_results, semantic_hit_count) =
-            ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit);
+            ScoreWithRatioResult::merge(vector_results, keyword_results, self, self.offset, self.limit)?;
         assert!(merge_results.documents_ids.len() <= self.limit);
         Ok((merge_results, Some(semantic_hit_count)))
     }
diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs
index 0a3bc1b04c..91d6663270 100644
--- a/crates/milli/src/search/new/mod.rs
+++ b/crates/milli/src/search/new/mod.rs
@@ -1,6 +1,6 @@
 mod bucket_sort;
 mod db_cache;
-mod distinct;
+pub mod distinct;
 mod geo_sort;
 mod graph_based_ranking_rule;
 mod interner;