@@ -75,18 +75,19 @@ def __init__(self, data_size = 0, initialCap = 0, M = 32, ef_c = 512, ef_r = 10,
7575
7676 data = load_data ("dbpedia-768" )
7777 self .num_elements = data_size if data_size != 0 else data .shape [0 ]
78- self .initialCap = initialCap if initialCap != 0 else 2 * self .num_elements
78+ #self.initialCap = initialCap if initialCap != 0 else 2 * self.num_elements
79+ self .initialCap = initialCap if initialCap != 0 else self .num_elements
7980
8081 self .data = data [:self .num_elements ]
8182 self .dim = len (self .data [0 ])
8283 self .metric = metric
83- self .type = data_type
84+ self .data_type = data_type
8485 self .is_multi = is_multi
8586
8687 self .hnsw_params = create_hnsw_params (dim = self .dim ,
8788 num_elements = self .initialCap ,
8889 metric = self .metric ,
89- data_type = self .type ,
90+ data_type = self .data_type ,
9091 ef_construction = ef_c ,
9192 m = M ,
9293 ef_runtime = ef_r ,
@@ -102,22 +103,18 @@ def create_tiered(self):
102103
def create_hnsw(self):
    """Return a new ``HNSWIndex`` built from ``self.hnsw_params``."""
    params = self.hnsw_params
    return HNSWIndex(params)
105-
106- def set_num_vectors_per_label (self , num_per_label = 1 ):
107- self .num_per_label = num_per_label
108106
def init_and_populate_flat_index(self):
    """Create a brute-force (flat) index and load every dataset vector into it.

    The index is configured from this context's element count, dimension,
    data type, metric and multi-value flag. Each vector in ``self.data`` is
    added once, using its position in the dataset as its label.

    Returns:
        The populated index, which is also stored on ``self.flat_index``.
    """
    params = BFParams()
    params.initialCapacity = self.num_elements
    params.dim = self.dim
    params.type = self.data_type
    params.metric = self.metric
    params.multi = self.is_multi

    flat = BFIndex(params)
    self.flat_index = flat

    for label, vec in enumerate(self.data):
        flat.add_vector(vec, label)

    return flat
123120
@@ -129,6 +126,16 @@ def init_and_populate_hnsw_index(self):
129126 self .hnsw_index = hnsw_index
130127 return hnsw_index
131128
def populate_index(self, index):
    """Add every vector of ``self.data`` to ``index`` and time the insertion.

    Each vector is labelled with its position in the dataset.

    Args:
        index: any object exposing ``add_vector(vector, label)``.

    Returns:
        A ``(start, duration, end)`` tuple: the wall-clock timestamp taken
        before the first insertion, the summed time spent inside the
        ``add_vector`` calls, and the timestamp taken after the last one.
    """
    started_at = time.time()
    time_in_add = 0
    for label, vector in enumerate(self.data):
        # Time only the add_vector call itself, not the loop overhead.
        tick = time.time()
        index.add_vector(vector, label)
        time_in_add += time.time() - tick
    finished_at = time.time()
    return (started_at, time_in_add, finished_at)
138+
132139 def generate_random_vectors (self , num_vectors ):
133140 vectors = 0
134141 np_file_path = os .path .join (f'np_{ num_vectors } vec_dim{ self .dim } .npy' )
@@ -154,7 +161,12 @@ def insert_in_batch(self, index, data, data_first_idx, batch_size, first_label):
154161 duration += time .time () - start_add
155162 end = time .time ()
156163 return (duration , end )
164+
def generate_queries(self, num_queries):
    """Generate ``num_queries`` random query vectors of dimension ``self.dim``.

    The generator is re-created with a fixed seed (47) on every call and
    stored on ``self.rng``, so repeated calls with the same arguments yield
    the same queries.

    Returns:
        A ``(num_queries, self.dim)`` array, converted to float32 when the
        context's data type is ``VecSimType_FLOAT32`` and left as produced
        by the generator otherwise.
    """
    self.rng = np.random.default_rng(seed=47)
    shape = (num_queries, self.dim)
    queries = self.rng.random(shape)

    if self.data_type == VecSimType_FLOAT32:
        return np.float32(queries)
    return queries
158170
159171def create_dbpedia ():
160172 indices_ctx = DBPediaIndexCtx ()
@@ -192,7 +204,7 @@ def create_tiered():
192204 create_tiered ()
193205
194206def create_dbpedia_graph ():
195- indices_ctx = DBPediaIndexCtx (data_size = 100000 )
207+ indices_ctx = DBPediaIndexCtx ()
196208
197209 threads_num = TIEREDIndex .get_threads_num ()
198210 print (f"thread num = { threads_num } " )
@@ -283,9 +295,68 @@ def create_hnsw():
283295 print (f"Start hnsw creation" )
284296
285297 create_hnsw ()
298+
def search_insert(is_multi: bool, num_per_label=1):
    """Query a tiered index while its vectors are still being indexed.

    Builds a 1000-vector tiered index context, loads the same vectors into a
    flat (brute-force) index to serve as ground truth, then inserts them into
    the tiered index. While the tiered index's HNSW label count is below the
    number of elements, one KNN query is run about once a second against both
    indexes. Query latency, flat-buffer size, total indexing time, average
    recall and queries-per-second are printed.

    Args:
        is_multi: forwarded to ``DBPediaIndexCtx`` as its ``is_multi`` flag.
        num_per_label: currently unused; kept so existing callers keep working.

    Raises:
        AssertionError: if the flat-buffer size does not strictly decrease
            between two consecutive queries.
    """
    indices_ctx = DBPediaIndexCtx(data_size=1000, mode=CreationMode.CREATE_TIERED_INDEX, is_multi=is_multi)
    index = indices_ctx.tiered_index

    num_elements = indices_ctx.num_elements

    query_data = indices_ctx.generate_queries(num_queries=1)

    # Add vectors to the flat index (ground truth for recall).
    bf_index = indices_ctx.init_and_populate_flat_index()

    # Start background insertion to the tiered index.
    index_start, _, _ = indices_ctx.populate_index(index)

    correct = 0
    k = 10
    searches_number = 0

    # config knn log
    index.start_knn_log()

    # Run a knn query every 1 s until all labels have reached HNSW.
    total_tiered_search_time = 0
    prev_bf_size = num_elements
    while index.hnsw_label_count() < num_elements:
        # For each run get the current flat-buffer size and the query time.
        bf_curr_size = index.get_curr_bf_size(mode='insert_and_knn')
        query_start = time.time()
        tiered_labels, _ = index.knn_query(query_data, k)
        query_dur = time.time() - query_start
        total_tiered_search_time += query_dur

        print(f"query time = {round_ms(query_dur)} ms")

        # BF size should decrease.
        print(f"bf size = {bf_curr_size} ")
        assert bf_curr_size < prev_bf_size

        # Run the query also in the bf index to get the ground truth results.
        bf_labels, _ = bf_index.knn_query(query_data, k)
        correct += len(np.intersect1d(tiered_labels[0], bf_labels[0]))
        time.sleep(1)
        searches_number += 1
        prev_bf_size = bf_curr_size

    index.reset_log()

    # HNSW labels count updates before the job is done, so we need to wait for the queue to be empty.
    index.wait_for_index(1)
    index_dur = time.time() - index_start
    print(f"indexing during search in tiered took {round_(index_dur)} s")

    # With a small dataset, indexing can finish before the first loop check;
    # there is then nothing to average and the divisions below would raise
    # ZeroDivisionError.
    if searches_number == 0:
        print("No queries ran before indexing completed; skipping recall and throughput report")
        return

    # Measure recall.
    recall = float(correct) / (k * searches_number)
    print("Average recall is:", round_(recall, 3))

    # A coarse clock can report zero elapsed time for very fast queries.
    if total_tiered_search_time > 0:
        print("tiered query per seconds: ", round_(searches_number / total_tiered_search_time))
    else:
        print("tiered query per seconds: ", "n/a (measured query time was 0)")
286355
def test_main():
    """Entry point: run the DBPedia index creation flow.

    The graph-creation and insert-and-search flows are currently disabled;
    re-enable them by uncommenting the calls below.
    """
    print("Test creation")
    create_dbpedia()
    # create_dbpedia_graph()

    print("\n Start insert & search test")
    # search_insert(is_multi=False)
291362
0 commit comments