diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs
index 3282a357a0..ea13c3048b 100644
--- a/crates/meilisearch/tests/search/hybrid.rs
+++ b/crates/meilisearch/tests/search/hybrid.rs
@@ -625,3 +625,96 @@ async fn retrieve_vectors() {
     ]
     "###);
 }
+
+/// Regression test: hybrid search must honour the distinct attribute.
+///
+/// Two documents share one `product_id`: document 1 is the better keyword match and
+/// document 2 the better vector match. With `product_id` set as the distinct attribute,
+/// the merged hybrid results must not contain the same `product_id` twice.
+#[actix_rt::test]
+async fn reproduce_hybrid_distinct_bug() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    // Embedder with user-provided vectors: documents and the query supply them explicitly.
+    let (response, code) = index
+        .update_settings(json!({
+            "embedders": {
+                "default": {
+                    "source": "userProvided",
+                    "dimensions": 8
+                }
+            }
+        }))
+        .await;
+    assert_eq!(202, code);
+    index.wait_task(response.uid()).await.succeeded();
+
+    // Same distinct value, opposite content: each document favours exactly one ranking.
+    let test_documents = json!([
+        {
+            "id": 1,
+            // Matches the query text; its vector is the opposite of the query vector.
+            "title": "red nike running shoes athletic footwear",
+            "product_id": "DUPLICATE_PRODUCT_ID",
+            "_vectors": {"default": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]}
+        },
+        {
+            "id": 2,
+            // Shares no word with the query text; its vector equals the query vector.
+            "title": "blue adidas casual shirt clothing apparel",
+            "product_id": "DUPLICATE_PRODUCT_ID",
+            "_vectors": {"default": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}
+        }
+    ]);
+
+    let (response, code) = index.add_documents(test_documents, Some("id")).await;
+    assert_eq!(202, code);
+    index.wait_task(response.uid()).await.succeeded();
+
+    // Deduplicate results on `product_id`.
+    let (task, code) = index.update_distinct_attribute(json!("product_id")).await;
+    assert_eq!(202, code);
+    index.wait_task(task.uid()).await.succeeded();
+
+    // Hybrid search: `q` favours document 1, `vector` favours document 2.
+    let (response, code) = index
+        .search_post(json!({
+            "q": "red nike running shoes",
+            "vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            "hybrid": {
+                "embedder": "default",
+                "semanticRatio": 0.5
+            },
+            "limit": 10
+        }))
+        .await;
+
+    assert_eq!(200, code);
+    let hits = response["hits"].as_array().unwrap();
+
+    println!("SEARCH RESULTS:");
+    for (i, hit) in hits.iter().enumerate() {
+        println!(" Hit {}: id={}, product_id={}", i, hit["id"], hit["product_id"]);
+    }
+
+    // Collect the string `product_id` of each hit; a repeated value means the distinct
+    // attribute was not applied to the merged results.
+    let product_ids: Vec<&str> = hits.iter().filter_map(|hit| hit["product_id"].as_str()).collect();
+    let unique_count = product_ids.iter().collect::<std::collections::HashSet<_>>().len();
+
+    assert!(
+        product_ids.len() <= unique_count,
+        "BUG REPRODUCED! Found {} total results but only {} unique product_ids. \
+         This means distinct filtering failed in hybrid search! Results: {:?}",
+        product_ids.len(),
+        unique_count,
+        product_ids
+    );
+
+    // Informational only: report which of the passing outcomes occurred.
+    if hits.len() == 1 {
+        println!("Only one result returned - distinct filtering working or different issue");
+    } else {
+        println!("Multiple results with unique product_ids - distinct filtering appears to work");
+    }
+}