|
9 | 9 | //! - Performance tests for large datasets |
10 | 10 |
|
11 | 11 | use super::{ |
12 | | - metadata::{StorageMetadata, VectorEndianness, NATIVE_VECTOR_ENDIANNESS}, |
| 12 | + HelixGraphStorage, |
| 13 | + metadata::{NATIVE_VECTOR_ENDIANNESS, StorageMetadata, VectorEndianness}, |
13 | 14 | storage_migration::{ |
14 | 15 | convert_all_vector_properties, convert_old_vector_properties_to_new_format, |
15 | 16 | convert_vector_endianness, migrate, |
16 | 17 | }, |
17 | | - HelixGraphStorage, |
18 | 18 | }; |
19 | 19 | use crate::{ |
20 | 20 | helix_engine::{ |
21 | | - storage_core::version_info::VersionInfo, traversal_core::config::Config, types::GraphError, |
| 21 | + bm25::bm25::{ |
| 22 | + BM25, BM25_SCHEMA_VERSION, BM25_SCHEMA_VERSION_KEY, BM25Metadata, METADATA_KEY, |
| 23 | + }, |
| 24 | + storage_core::version_info::VersionInfo, |
| 25 | + traversal_core::{ |
| 26 | + config::Config, |
| 27 | + ops::{g::G, source::add_n::AddNAdapter}, |
| 28 | + }, |
| 29 | + types::GraphError, |
22 | 30 | }, |
23 | 31 | protocol::value::Value, |
| 32 | + utils::{items::Node, properties::ImmutablePropertiesMap}, |
24 | 33 | }; |
| 34 | +use bumpalo::Bump; |
25 | 35 | use std::collections::HashMap; |
26 | 36 | use tempfile::TempDir; |
27 | 37 |
|
@@ -169,6 +179,45 @@ fn clear_metadata(storage: &mut HelixGraphStorage) -> Result<(), GraphError> { |
169 | 179 | Ok(()) |
170 | 180 | } |
171 | 181 |
|
| 182 | +fn add_test_node( |
| 183 | + storage: &HelixGraphStorage, |
| 184 | + label: &'static str, |
| 185 | + properties: &[(&'static str, Value)], |
| 186 | +) -> u128 { |
| 187 | + let arena = Bump::new(); |
| 188 | + let properties = if properties.is_empty() { |
| 189 | + None |
| 190 | + } else { |
| 191 | + Some(ImmutablePropertiesMap::new( |
| 192 | + properties.len(), |
| 193 | + properties.iter().map(|(key, value)| (*key, value.clone())), |
| 194 | + &arena, |
| 195 | + )) |
| 196 | + }; |
| 197 | + let mut txn = storage.graph_env.write_txn().unwrap(); |
| 198 | + let node = G::new_mut(storage, &arena, &mut txn) |
| 199 | + .add_n(label, properties, None) |
| 200 | + .collect_to_obj() |
| 201 | + .unwrap(); |
| 202 | + let node_id = node.id(); |
| 203 | + txn.commit().unwrap(); |
| 204 | + node_id |
| 205 | +} |
| 206 | + |
| 207 | +fn bm25_search_ids(storage: &HelixGraphStorage, query: &str) -> Vec<u128> { |
| 208 | + let arena = Bump::new(); |
| 209 | + let txn = storage.graph_env.read_txn().unwrap(); |
| 210 | + storage |
| 211 | + .bm25 |
| 212 | + .as_ref() |
| 213 | + .unwrap() |
| 214 | + .search(&txn, query, 10, &arena) |
| 215 | + .unwrap() |
| 216 | + .into_iter() |
| 217 | + .map(|(id, _)| id) |
| 218 | + .collect() |
| 219 | +} |
| 220 | + |
172 | 221 | // ============================================================================ |
173 | 222 | // Unit Tests: Endianness Conversion |
174 | 223 | // ============================================================================ |
@@ -960,6 +1009,140 @@ fn test_error_handling_graceful_failure() { |
960 | 1009 | assert_eq!(count, 11); // 10 valid + 1 invalid |
961 | 1010 | } |
962 | 1011 |
|
/// Running `migrate` on storage whose BM25 schema version is already current must leave
/// the raw BM25 metadata bytes and the search results untouched.
#[test]
fn test_bm25_migration_rerun_is_noop_once_schema_written() {
    let (mut storage, _temp_dir) = setup_test_storage();
    let node_id = add_test_node(&storage, "person", &[("name", Value::from("stable_term"))]);

    // Asserts that the stored schema version is the current one, then returns an owned
    // copy of the raw bytes stored under `METADATA_KEY` (or `None` if the key is absent).
    let snapshot_metadata = |store: &HelixGraphStorage| {
        let txn = store.graph_env.read_txn().unwrap();
        let bm25 = store.bm25.as_ref().unwrap();
        assert_eq!(
            bm25.schema_version(&txn).unwrap(),
            Some(BM25_SCHEMA_VERSION)
        );
        bm25.metadata_db
            .get(&txn, METADATA_KEY)
            .unwrap()
            .map(|bytes| bytes.to_vec())
    };

    // State before the extra migration run.
    let metadata_before = snapshot_metadata(&storage);
    let hits_before = bm25_search_ids(&storage, "stable_term");
    assert_eq!(hits_before, vec![node_id]);

    migrate(&mut storage).unwrap();

    // State after the extra migration run.
    let metadata_after = snapshot_metadata(&storage);
    let hits_after = bm25_search_ids(&storage, "stable_term");

    assert_eq!(hits_after, vec![node_id]);
    assert_eq!(hits_before, hits_after);
    assert_eq!(metadata_before, metadata_after);
}
| 1052 | + |
/// `migrate` must bring the BM25 index back in line with the node data actually stored.
///
/// The test overwrites the node's serialized bytes directly in `nodes_db` and sets the
/// stored BM25 schema version to 0. Before `migrate` the index still answers for the old
/// term; afterwards it must answer only for the new one and report the current schema
/// version.
#[test]
fn test_bm25_migration_repairs_stale_node_index() {
    let (mut storage, _temp_dir) = setup_test_storage();
    let node_id = add_test_node(&storage, "person", &[("name", Value::from("legacyalpha"))]);

    // Freshly inserted node: only the original term matches.
    assert_eq!(bm25_search_ids(&storage, "legacyalpha"), vec![node_id]);
    assert!(bm25_search_ids(&storage, "freshomega").is_empty());

    {
        let arena = Bump::new();
        let mut write_txn = storage.graph_env.write_txn().unwrap();

        // Take an owned copy of the stored bytes before writing through the same transaction.
        let stored_bytes = storage
            .nodes_db
            .get(&write_txn, &node_id)
            .unwrap()
            .unwrap()
            .to_vec();
        let mut node = Node::from_bincode_bytes(node_id, &stored_bytes, &arena).unwrap();

        // Swap the node's properties for a single `name` entry carrying the new term.
        let replacement = ImmutablePropertiesMap::new(
            1,
            std::iter::once(("name", Value::from("freshomega"))),
            &arena,
        );
        node.properties = Some(replacement);

        // Write the node straight into `nodes_db`; no BM25 call is made here.
        let rewritten_bytes = node.to_bincode_bytes().unwrap();
        storage
            .nodes_db
            .put(&mut write_txn, &node_id, &rewritten_bytes)
            .unwrap();

        // Record schema version 0 so the stored version differs from `BM25_SCHEMA_VERSION`
        // (the final assertion expects `migrate` to restore it).
        let stale_version = 0u64.to_le_bytes();
        let bm25 = storage.bm25.as_ref().unwrap();
        bm25.metadata_db
            .put(&mut write_txn, BM25_SCHEMA_VERSION_KEY, &stale_version)
            .unwrap();

        write_txn.commit().unwrap();
    }

    // The index is now stale: it still reflects the old property value.
    assert_eq!(bm25_search_ids(&storage, "legacyalpha"), vec![node_id]);
    assert!(bm25_search_ids(&storage, "freshomega").is_empty());

    migrate(&mut storage).unwrap();

    // After migration the index reflects the rewritten node.
    assert!(bm25_search_ids(&storage, "legacyalpha").is_empty());
    assert_eq!(bm25_search_ids(&storage, "freshomega"), vec![node_id]);

    let read_txn = storage.graph_env.read_txn().unwrap();
    let bm25 = storage.bm25.as_ref().unwrap();
    assert_eq!(
        bm25.schema_version(&read_txn).unwrap(),
        Some(BM25_SCHEMA_VERSION)
    );
}
| 1106 | + |
/// `migrate` must discard BM25 documents that were inserted directly into the index,
/// while keeping the entry for the node created through `add_test_node`.
///
/// After migration the directly inserted term must no longer match, the node's term must
/// still match, and the stored `BM25Metadata::total_docs` must equal 1.
#[test]
fn test_bm25_migration_drops_legacy_direct_docs() {
    let (mut storage, _temp_dir) = setup_test_storage();
    let node_id = add_test_node(&storage, "person", &[("name", Value::from("nodeonlyterm"))]);

    // Id used for the document that is pushed into BM25 via `insert_doc` only.
    let legacy_doc_id = 999u128;

    {
        let mut write_txn = storage.graph_env.write_txn().unwrap();
        let bm25 = storage.bm25.as_ref().unwrap();

        bm25.insert_doc(&mut write_txn, legacy_doc_id, "legacyvectorterm")
            .unwrap();

        // Record schema version 0 so the stored version differs from `BM25_SCHEMA_VERSION`.
        let stale_version = 0u64.to_le_bytes();
        bm25.metadata_db
            .put(&mut write_txn, BM25_SCHEMA_VERSION_KEY, &stale_version)
            .unwrap();

        write_txn.commit().unwrap();
    }

    // Both documents are searchable before migration.
    assert_eq!(
        bm25_search_ids(&storage, "legacyvectorterm"),
        vec![legacy_doc_id]
    );
    assert_eq!(bm25_search_ids(&storage, "nodeonlyterm"), vec![node_id]);

    migrate(&mut storage).unwrap();

    // Only the node-backed document is left afterwards.
    assert!(bm25_search_ids(&storage, "legacyvectorterm").is_empty());
    assert_eq!(bm25_search_ids(&storage, "nodeonlyterm"), vec![node_id]);

    let read_txn = storage.graph_env.read_txn().unwrap();
    let bm25 = storage.bm25.as_ref().unwrap();
    let raw_metadata = bm25
        .metadata_db
        .get(&read_txn, METADATA_KEY)
        .unwrap()
        .unwrap();
    let metadata: BM25Metadata = bincode::deserialize(raw_metadata).unwrap();
    assert_eq!(metadata.total_docs, 1);
}
| 1145 | + |
963 | 1146 | // ============================================================================ |
964 | 1147 | // Performance Tests |
965 | 1148 | // ============================================================================ |
|
0 commit comments