Skip to content
This repository was archived by the owner on Jul 31, 2025. It is now read-only.
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions crates/meilisearch/tests/search/hybrid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -625,3 +625,90 @@ async fn retrieve_vectors() {
]
"###);
}

/// Regression test: a hybrid (keyword + vector) search must honour the index's
/// distinct attribute.
///
/// Two documents share the same `product_id`. The query text is written to match
/// document 1 on keywords, while the query vector is identical to document 2's
/// embedding, so each document is favoured by a different side of the hybrid
/// search. With `product_id` configured as the distinct attribute, the returned
/// hits must never contain the same `product_id` twice.
#[actix_rt::test]
async fn reproduce_hybrid_distinct_bug() {
    let server = Server::new().await;
    let index = server.index("test");

    // User-provided embedder: the vectors below are supplied verbatim with the
    // documents, so the test fully controls the semantic side of the search.
    let (response, code) = index
        .update_settings(json!({
            "embedders": {
                "default": {
                    "source": "userProvided",
                    "dimensions": 8
                }
            }
        }))
        .await;
    assert_eq!(202, code);
    index.wait_task(response.uid()).await.succeeded();

    // Same distinct value, opposite keywords and opposite embeddings.
    let test_documents = json!([
        {
            "id": 1,
            // Matches the query text.
            "title": "red nike running shoes athletic footwear",
            "product_id": "DUPLICATE_PRODUCT_ID",
            // Points in the opposite direction of the query vector.
            "_vectors": {"default": [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]}
        },
        {
            "id": 2,
            // Shares no keyword with the query text.
            "title": "blue adidas casual shirt clothing apparel",
            "product_id": "DUPLICATE_PRODUCT_ID",
            // Identical to the query vector.
            "_vectors": {"default": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]}
        }
    ]);

    let (response, code) = index.add_documents(test_documents, Some("id")).await;
    assert_eq!(202, code);
    index.wait_task(response.uid()).await.succeeded();

    // Deduplicate results on `product_id`.
    let (task, _) = index.update_distinct_attribute(json!("product_id")).await;
    index.wait_task(task.uid()).await.succeeded();

    // Hybrid search with an even keyword/semantic split so both sides contribute.
    let (response, code) = index
        .search_post(json!({
            "q": "red nike running shoes",
            "vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
            "hybrid": {
                "embedder": "default",
                "semanticRatio": 0.5
            },
            "limit": 10
        }))
        .await;

    assert_eq!(200, code);
    let hits = response["hits"].as_array().expect("search response must contain a `hits` array");

    // Guard against a vacuous pass: with zero hits the uniqueness check below
    // would trivially succeed without exercising distinct filtering at all.
    assert!(!hits.is_empty(), "hybrid search returned no hits; distinct filtering was not exercised");

    // Every hit must expose its distinct value; silently skipping hits without
    // one could hide duplicates.
    let product_ids: Vec<&str> = hits
        .iter()
        .map(|hit| hit["product_id"].as_str().expect("every hit must expose a string `product_id`"))
        .collect();

    let unique_count = product_ids.iter().collect::<std::collections::HashSet<_>>().len();

    assert_eq!(
        product_ids.len(),
        unique_count,
        "distinct filtering failed in hybrid search: {} hits but only {} unique product_ids: {:?}",
        product_ids.len(),
        unique_count,
        product_ids
    );
}