-
Notifications
You must be signed in to change notification settings - Fork 0
fix: apply distinct constraint in hybrid search #5
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -625,3 +625,109 @@ async fn retrieve_vectors() { | |
| ] | ||
| "###); | ||
| } | ||
| // Test for hybrid search distinct bug | ||
| static DOCUMENTS_WITH_DUPLICATES: Lazy<Value> = Lazy::new(|| { | ||
| json!([ | ||
| { | ||
| "title": "Captain Marvel", | ||
| "desc": "First Captain Marvel movie", | ||
| "id": "1", | ||
| "category": "superhero", | ||
| "_vectors": {"default": [1.0, 2.0]}, | ||
| }, | ||
| { | ||
| "title": "Captain Marvel Returns", | ||
| "desc": "Second Captain Marvel movie", | ||
| "id": "2", | ||
| "category": "superhero", | ||
| "_vectors": {"default": [1.1, 2.1]}, | ||
| }, | ||
| { | ||
| "title": "Shazam!", | ||
| "desc": "DC's Captain Marvel", | ||
| "id": "3", | ||
| "category": "superhero", | ||
| "_vectors": {"default": [2.0, 3.0]}, | ||
| }, | ||
| { | ||
| "title": "Iron Man", | ||
| "desc": "Tony Stark's story", | ||
| "id": "4", | ||
| "category": "action", | ||
| "_vectors": {"default": [3.0, 1.0]}, | ||
| } | ||
| ]) | ||
| }); | ||
|
|
||
| async fn index_with_distinct_and_vectors<'a>( | ||
| server: &'a Server, | ||
| documents: &Value, | ||
| ) -> Index<'a> { | ||
| let index = server.index("test"); | ||
|
|
||
| // Set up embedders | ||
| let (response, code) = index | ||
| .update_settings(json!({ | ||
| "embedders": {"default": { | ||
| "source": "userProvided", | ||
| "dimensions": 2 | ||
| }}, | ||
| "distinctAttribute": "category" | ||
| })) | ||
| .await; | ||
| assert_eq!(202, code, "{:?}", response); | ||
| index.wait_task(response.uid()).await.succeeded(); | ||
|
|
||
| // Add documents | ||
| let (response, code) = index.add_documents(documents.clone(), None).await; | ||
| assert_eq!(202, code, "{:?}", response); | ||
| index.wait_task(response.uid()).await.succeeded(); | ||
|
|
||
| index | ||
| } | ||
|
|
||
| #[actix_rt::test] | ||
| async fn test_hybrid_search_respects_distinct_attribute() { | ||
| let server = Server::new().await; | ||
| let index = index_with_distinct_and_vectors(&server, &DOCUMENTS_WITH_DUPLICATES).await; | ||
|
|
||
| // First test: pure keyword search with distinct - should only return 2 documents (one per category) | ||
| let (response, code) = index | ||
| .search_post( | ||
| json!({"q": "Captain", "showRankingScore": true}), | ||
| ) | ||
| .await; | ||
| snapshot!(code, @"200 OK"); | ||
| // Should only return 2 hits: one from "superhero" category and one from "action" category | ||
| // But due to the bug, it returns more than expected | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand this comment, the bug is on the semantic search, not on the hybrid search |
||
| let keyword_hits = response["hits"].as_array().unwrap().len(); | ||
| println!("Keyword search hits: {}", keyword_hits); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we do not print results in tests; please use snapshots to assert the number of results |
||
|
|
||
| // Now test: hybrid search with distinct - this should also respect the distinct constraint | ||
| let (response, code) = index | ||
| .search_post( | ||
| json!({ | ||
| "q": "Captain", | ||
| "vector": [1.5, 2.5], | ||
| "hybrid": {"semanticRatio": 0.5, "embedder": "default"}, | ||
| "showRankingScore": true | ||
| }), | ||
| ) | ||
| .await; | ||
| snapshot!(code, @"200 OK"); | ||
|
|
||
| let hybrid_hits = response["hits"].as_array().unwrap().len(); | ||
| println!("Hybrid search hits: {}", hybrid_hits); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same remark as above |
||
|
|
||
| // Check that we only get one document per distinct category | ||
| let mut categories_seen = std::collections::HashSet::new(); | ||
| for hit in response["hits"].as_array().unwrap() { | ||
| let category = hit["category"].as_str().unwrap(); | ||
| assert!(!categories_seen.contains(category), | ||
| "Duplicate category '{}' found in hybrid search results - distinct constraint violated!", category); | ||
| categories_seen.insert(category); | ||
| } | ||
|
|
||
| // This assertion will fail due to the bug - hybrid search ignores distinct constraint | ||
| assert!(hybrid_hits <= 2, "Hybrid search should respect distinct constraint and return at most 2 documents (one per category), but got {}", hybrid_hits); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tests in Meilisearch are written to avoid accidental changes, please snapshot on the exact number of hits rather than asserting the logical condition here. |
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,7 +5,8 @@ use roaring::RoaringBitmap; | |
|
|
||
| use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy}; | ||
| use crate::search::SemanticSearch; | ||
| use crate::{MatchingWords, Result, Search, SearchResult}; | ||
| use crate::search::new::distinct::apply_distinct_rule; | ||
| use crate::{MatchingWords, Result, Search, SearchResult, SearchContext}; | ||
|
|
||
| struct ScoreWithRatioResult { | ||
| matching_words: MatchingWords, | ||
|
|
@@ -89,9 +90,10 @@ impl ScoreWithRatioResult { | |
| fn merge( | ||
| vector_results: Self, | ||
| keyword_results: Self, | ||
| search: &Search<'_>, | ||
| from: usize, | ||
| length: usize, | ||
| ) -> (SearchResult, u32) { | ||
| ) -> Result<(SearchResult, u32)> { | ||
| #[derive(Clone, Copy)] | ||
| enum ResultSource { | ||
| Semantic, | ||
|
|
@@ -123,10 +125,6 @@ impl ScoreWithRatioResult { | |
| ) | ||
| // remove documents we already saw | ||
| .filter(|((docid, _), _)| documents_seen.insert(*docid)) | ||
| // start skipping **after** the filter | ||
| .skip(from) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing the filters at this point breaks the |
||
| // take **after** skipping | ||
| .take(length) | ||
| { | ||
| if let ResultSource::Semantic = source { | ||
| semantic_hit_count += 1; | ||
|
|
@@ -136,18 +134,59 @@ impl ScoreWithRatioResult { | |
| document_scores.push(main_score); | ||
| } | ||
|
|
||
| ( | ||
| SearchResult { | ||
| matching_words: keyword_results.matching_words, | ||
| candidates: vector_results.candidates | keyword_results.candidates, | ||
| documents_ids, | ||
| document_scores, | ||
| degraded: vector_results.degraded | keyword_results.degraded, | ||
| used_negative_operator: vector_results.used_negative_operator | ||
| | keyword_results.used_negative_operator, | ||
| }, | ||
| semantic_hit_count, | ||
| ) | ||
| let mut final_result = SearchResult { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. style: we usually build an immutable object from its final constituents, rather than a mutable object and then edit its fields. |
||
| matching_words: keyword_results.matching_words, | ||
| candidates: vector_results.candidates | keyword_results.candidates, | ||
| documents_ids, | ||
| document_scores, | ||
| degraded: vector_results.degraded | keyword_results.degraded, | ||
| used_negative_operator: vector_results.used_negative_operator | ||
| | keyword_results.used_negative_operator, | ||
| }; | ||
|
|
||
| // Apply distinct rule if a distinct field is configured, similar to bucket_sort | ||
| let distinct_field = match search.distinct.as_deref() { | ||
| Some(distinct) => Some(distinct), | ||
| None => search.index.distinct_field(search.rtxn)?, | ||
| }; | ||
|
|
||
| if let Some(f) = distinct_field { | ||
| let fields_ids_map = search.index.fields_ids_map(search.rtxn)?; | ||
| if let Some(distinct_fid) = fields_ids_map.id(f) { | ||
| let mut ctx = SearchContext::new(search.index, search.rtxn)?; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. performance: |
||
| let all_candidates: RoaringBitmap = final_result.documents_ids.iter().copied().collect(); | ||
| let remaining_candidates = apply_distinct_rule(&mut ctx, distinct_fid, &all_candidates)? | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, this call will give priority to distinct documents with a lower document id, breaking the order of documents returned by the search. |
||
| .remaining; | ||
|
|
||
| // Filter documents_ids and document_scores to only include remaining candidates | ||
| let mut filtered_docs = Vec::new(); | ||
| let mut filtered_scores = Vec::new(); | ||
| for (i, &docid) in final_result.documents_ids.iter().enumerate() { | ||
| if remaining_candidates.contains(docid) { | ||
| filtered_docs.push(docid); | ||
| filtered_scores.push(final_result.document_scores[i].clone()); | ||
| } | ||
| } | ||
| final_result.documents_ids = filtered_docs; | ||
| final_result.document_scores = filtered_scores; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the |
||
| } | ||
| } | ||
|
|
||
| // Apply offset and limit after distinct filtering | ||
| let (documents_ids, document_scores) = if from >= final_result.documents_ids.len() { | ||
| (vec![], vec![]) | ||
| } else { | ||
| let end = std::cmp::min(from + length, final_result.documents_ids.len()); | ||
| ( | ||
| final_result.documents_ids[from..end].to_vec(), | ||
| final_result.document_scores[from..end].to_vec(), | ||
| ) | ||
| }; | ||
|
|
||
| final_result.documents_ids = documents_ids; | ||
| final_result.document_scores = document_scores; | ||
|
|
||
| Ok((final_result, semantic_hit_count)) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -227,7 +266,7 @@ impl Search<'_> { | |
| let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio); | ||
|
|
||
| let (merge_results, semantic_hit_count) = | ||
| ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit); | ||
| ScoreWithRatioResult::merge(vector_results, keyword_results, self, self.offset, self.limit)?; | ||
| assert!(merge_results.documents_ids.len() <= self.limit); | ||
| Ok((merge_results, Some(semantic_hit_count))) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this will return a single document because the one in the
actioncategory is not matched by the search