
Commit 7f25d8e

Incorporate ANN (#119)
## Description

This PR introduces approximate nearest neighbor (ANN) search to our performance metrics notebook. It also adds a script that runs a grid search over search-time EF values to optimize the parameters for HNSW evaluation.

## Related Issues

Closes #106
1 parent 6f0cbca commit 7f25d8e

File tree

5 files changed: +279, -15 lines


model_tuning/build_hnsw_index.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
"""
build_hnsw_index.py

Simple script for creating an HNSW index for a specific set of model
vector embeddings. This index can be persisted to disk for faster
instantiation during performance metric computation.
"""

import os
import pickle

import hnswlib

# MODEL VARIABLES
MODEL_NAME = "intfloat/e5-base-v2"
EMBEDDING_SIZE = 768

# EMBEDDING VARIABLES
EMBEDDING_CACHE_DIR = "../data/training_files/embeddings/"
EMBEDDING_FILE = "loinc_lab_names_intfloat_e5-base-v2_20251007"

# ANN INDEX VARIABLES
INDEX_FP = f"hnswlib_index_{MODEL_NAME.replace('/', '_')}.index"
EF_VALUE = 200
M_VALUE = 64


if __name__ == "__main__":
    print("Checking for cached embeddings...")
    if os.path.exists(EMBEDDING_CACHE_DIR + EMBEDDING_FILE):
        print("  Found cached embeddings. Loading them...")
        with open(EMBEDDING_CACHE_DIR + EMBEDDING_FILE, "rb") as fp:
            cache_data = pickle.load(fp)
            name_codes = cache_data["codes"]
            embeddings = cache_data["embeddings"]
        embeddings = embeddings.cpu().numpy()

        index = hnswlib.Index(space="cosine", dim=EMBEDDING_SIZE)
        print("Checking for cached ANN index...")
        if os.path.exists(INDEX_FP):
            print("  Cached index already exists.")
        else:
            print(f"No local index found. Creating index for {MODEL_NAME}...")
            index.init_index(max_elements=len(embeddings), ef_construction=EF_VALUE, M=M_VALUE)
            print("  Index created, adding vectors...")
            index.add_items(embeddings, list(range(len(embeddings))))
            print("  Vectors embedded, saving index...")
            index.save_index(INDEX_FP)
    else:
        print("No embeddings found, please run embedding.py to compute vectors first.")
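Note: an index persisted by this script can later be reloaded without rebuilding the graph. A minimal sketch of that round trip, assuming the `INDEX_FP` and `EMBEDDING_SIZE` values above (the random query vector is a stand-in for a real `model.encode(...)` output):

import hnswlib
import numpy as np

# Re-create an empty index with the same space/dim used at build time,
# then load the persisted graph from disk.
index = hnswlib.Index(space="cosine", dim=768)
index.load_index("hnswlib_index_intfloat_e5-base-v2.index")
index.set_ef(200)  # search-time EF; see hnsw_estimator.py for tuning

# Stand-in query vector; in practice this comes from model.encode(...)
query = np.random.rand(768).astype(np.float32)
labels, distances = index.knn_query(query, k=10)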

model_tuning/cpu_convert.py

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
"""
cpu_convert.py

Simple script for converting collections of embedded vectors that were
built using GPU / tensor optimization into a purely CPU-compatible format.

Vector embeddings *must* be CPU-formatted for use with Azure ML Studio's
copy of the `performance.ipynb` notebook.
"""

import pickle

# Directory in which the embeddings are saved
EMBEDDING_CACHE_DIR = "../data/training_files/embeddings/"

# The original embedding file that may have been saved in a GPU-based
# format
GPU_PICKLE_FILE = "loinc_lab_names_intfloat_e5-base-v2_20251007"

# The new embedding file to write after conversion to pure CPU formatting
CPU_PICKLE_FILE = "loinc_lab_names_intfloat_e5-base-v2_20251007_cpu"


if __name__ == "__main__":
    print("Loading pickled tensor embeddings...")
    with open(EMBEDDING_CACHE_DIR + GPU_PICKLE_FILE, "rb") as fp:
        cache_data = pickle.load(fp)
        name_codes = cache_data["codes"]
        embeddings = cache_data["embeddings"]

    print("Converting to CPU and writing back...")
    embeddings = embeddings.cpu()
    with open(EMBEDDING_CACHE_DIR + CPU_PICKLE_FILE, "wb") as fp:
        pickle.dump({"codes": name_codes, "embeddings": embeddings}, fp)
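Note: a quick way to confirm the conversion worked (a sketch, not part of this commit) is to unpickle the new file on a CPU-only host. Loading the original GPU-formatted pickle there raises a RuntimeError while deserializing the CUDA tensors, whereas the converted file should load cleanly:

import pickle

# Path built from the constants above
CPU_FILE = "../data/training_files/embeddings/loinc_lab_names_intfloat_e5-base-v2_20251007_cpu"

# Must succeed without CUDA available
with open(CPU_FILE, "rb") as fp:
    data = pickle.load(fp)
print(data["embeddings"].device)  # expected: cpu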

model_tuning/hnsw_estimator.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
import os
import pickle
import random
import time
from typing import List

import hnswlib
from sentence_transformers import SentenceTransformer

# MODEL VARIABLES
MODEL_NAME = "intfloat/e5-base-v2"
EMBEDDING_SIZE = 768

# EMBEDDING VARIABLES
EMBEDDING_CACHE_DIR = "../data/training_files/embeddings/"
EMBEDDING_FILE = "loinc_lab_names_intfloat_e5-base-v2_20251007"

# GRID-SEARCH ANN PARAMS
# EF-value is described as the "speed/accuracy" tradeoff metric for HNSW
# search. EF typically ranges from 50 to 1000, with a default of 200.
# Higher values of EF will increase recall relative to exact search
# (i.e. results will tend to look more like exact kNN), but will increase
# search time in a nonlinear fashion.
EF_CONSTRUCTION = 200
# M-value is the number of connections/neighbors made per "node" in the
# search graph. It represents how many embedded vectors are considered to
# be in the "small world" defined around each other vector. Higher values
# of M increase recall relative to exact search, but also slow down
# the search time.
M_VALUE = 48
# This is the range of EF values we want to test during our grid search.
# The EF-value that an HNSW index is constructed with *does not* need to be
# the EF-value that the index is searched with. The search EF can range from
# 0 to 1000, just like the initial EF used during construction. The initial
# EF controls how many "small worlds" get attached as branches in the
# search graph, while this "search EF" controls how many actually get
# explored by the algorithm during ANN.
EFS_TO_TEST = [50, 100, 200, 400, 600, 800, 1000]

# VALIDATION VARIABLES
VALIDATION_FILE = "../data/training_files/validation_set_positive_pairs.txt"
# This is the "k" value in kNN, i.e. how many approximate neighbors we'll
# retrieve. The script does not optimize a search over K, but the choice of
# K does directly influence the ordered-recall calculation (e.g. more neighbors
# means a better sample to compare ANN to exact kNN).
NUM_NEIGHBORS_TO_SEARCH = 10

# IMPORTANT: Change this value to calculate stats using more or fewer
# examples drawn from the validation set.
NUM_EXAMPLES_TO_VALIDATE = 10000


def run_recall_trial(
    model: SentenceTransformer,
    hnsw_index: hnswlib.Index,
    bf_index: hnswlib.BFIndex,
    examples: List[List[str]],
    k: int,
    ef: int,
) -> None:
    """
    Perform a single search in a grid of trials to compare approximate search
    with exact search. Importantly, the goal of a recall trial is *not* to
    maximize accuracy. Model analysis is a separate task. The goal of ANN
    hyperparameter optimization is to get the approximate search to behave
    as closely as possible to exact search in terms of which results are
    retrieved and the relative rankings of those results. This allows other
    notebooks to optimize for Top-K performance.

    :param model: The sentence transformers model to evaluate.
    :param hnsw_index: An HNSW index computed over the embeddings.
    :param bf_index: A brute-force index computed over the embeddings.
    :param examples: A list of validation samples on which to evaluate recall.
    :param k: The number of search results to retrieve.
    :param ef: The search depth to use as part of this optimization.
    """
    num_correct = 0.0
    search_times = []

    for e in examples:
        nonstandard_in = e[1].strip()

        # Unlike embedding, which can convert to tensor on GPU, HNSW exists in
        # CPU memory, so we leave it as is
        enc = model.encode(nonstandard_in)
        start = time.time()
        labels_hnsw, _ = hnsw_index.knn_query(enc, k=k)
        search_times.append(time.time() - start)
        labels_bf, _ = bf_index.knn_query(enc, k=k)

        for label in labels_hnsw[0]:
            for correct_label in labels_bf[0]:
                # We're counting only the instances where the elements between
                # HNSW and brute force match
                if label == correct_label:
                    num_correct += 1
                    break

    recall = round(num_correct / float(k * len(examples)), 3)
    mean_search_time = round(float(sum(search_times)) / float(len(search_times)), 3)

    print(f"Speed/Accuracy Tradeoff for K = {k}, EF = {ef}")
    print(f"  Recall: {recall}")
    print(f"  Mean Search Time: {mean_search_time}")


if __name__ == "__main__":
    print("Instantiating language model...")
    model = SentenceTransformer(MODEL_NAME)

    print("Checking for cached embeddings...")
    if os.path.exists(EMBEDDING_CACHE_DIR + EMBEDDING_FILE):
        print("  Found cached embeddings. Loading them...")
        with open(EMBEDDING_CACHE_DIR + EMBEDDING_FILE, "rb") as fp:
            cache_data = pickle.load(fp)
            name_codes = cache_data["codes"]
            embeddings = cache_data["embeddings"]
        embeddings = embeddings.cpu().numpy()

        print("Loading validation set...")
        examples = []
        with open(VALIDATION_FILE, "r") as fp:
            for line in fp:
                if line.strip() != "":
                    examples.append(line.strip().split("|"))
        random.shuffle(examples)
        examples = examples[:NUM_EXAMPLES_TO_VALIDATE]

        print("Initializing Indices: Regular and Brute Force")
        hnsw_index = hnswlib.Index(space="cosine", dim=EMBEDDING_SIZE)
        bf_index = hnswlib.BFIndex(space="cosine", dim=EMBEDDING_SIZE)
        hnsw_index.init_index(
            max_elements=len(embeddings), ef_construction=EF_CONSTRUCTION, M=M_VALUE
        )
        bf_index.init_index(max_elements=len(embeddings))

        hnsw_index.add_items(embeddings)
        bf_index.add_items(embeddings)

        print("Performing grid-search on EF to identify optimal value...")
        for ef in EFS_TO_TEST:
            hnsw_index.set_ef(ef)
            run_recall_trial(model, hnsw_index, bf_index, examples, NUM_NEIGHBORS_TO_SEARCH, ef)

    else:
        print("No embeddings found, please run embedding.py to compute vectors first.")
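Note on the recall calculation in run_recall_trial: because labels within a single hnswlib result list are unique, the nested loop is equivalent to a set intersection between the HNSW results and the brute-force results. A minimal sketch of the same per-query computation under that uniqueness assumption:

# Recall@k for one query: the fraction of exact-kNN labels that ANN also returned.
def recall_at_k(ann_labels, exact_labels):
    return len(set(ann_labels) & set(exact_labels)) / len(exact_labels)

print(recall_at_k([3, 7, 9, 12], [3, 9, 12, 40]))  # 0.75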

model_tuning/performance.py

Lines changed: 43 additions & 15 deletions
@@ -4,13 +4,24 @@
 import time
 from typing import List

+import hnswlib
 from sentence_transformers import SentenceTransformer
-from sentence_transformers import util
-from torch import Tensor

+# MODEL VARIABLES
 MODEL_NAME = "intfloat/e5-base-v2"
+EMBEDDING_SIZE = 768
+
+# EMBEDDING VARIABLES
 EMBEDDING_CACHE_DIR = "../data/training_files/embeddings/"
 EMBEDDING_FILE = "loinc_lab_names_intfloat_e5-base-v2_20251007"
+
+# ANN INDEX VARIABLES
+INDEX_FP = "./hnswlib.index"
+EF_CONSTRUCTION = 200
+M_VALUE = 64
+EF_SEARCH = 100
+
+# VALIDATION VARIABLES
 VALIDATION_FILE = "../data/training_files/validation_set_positive_pairs.txt"
 K_VALUES = [1, 3, 5, 10]

@@ -21,7 +32,7 @@

 def predict_and_evaluate_validation_set(
     model: SentenceTransformer,
-    vector_db: Tensor,
+    ann_index: hnswlib.Index,
     standard_loinc_names: List[str],
     examples: List[List[str]],
     k_vals: List[int],

@@ -35,14 +46,15 @@ def predict_and_evaluate_validation_set(
     scoring result, and mean time to encode an input and perform semantic search.

     :param model: The sentence transformer model to evaluate.
-    :param vector_db: A list of pre-computed embeddings on the corpus in which
-        to semantic search (these are the embedded standard LOINC codes).
+    :param ann_index: A pre-computed HNSW index file over the embeddings that
+        we want to match nonstandard inputs to.
     :param standard_loinc_names: A list of strings representing the names of
         the LOINC codes embedded in the `vector_db`. Note that the order of
         strings in the list should match the order of embeddings in the DB.
     :param examples: A list of lists of strings representing the experimental
         examples to evaluate.
-    :param k: An integer for how many neighbors to retrieve from the DB.
+    :param k_vals: A list of integers indicating how many neighbors should be
+        retrieved from the DB across a range of trials.
     :returns: None
     """
     encoding_times = []

@@ -57,20 +69,19 @@ def predict_and_evaluate_validation_set(
         correct_code = e[0].strip()
         nonstandard_in = e[1].strip()

-        # This utility performs exact neighbor semantic search
-        # If approximate is desired, see
-        # https://sbert.net/examples/sentence_transformer/applications/semantic-search/README.html#approximate-nearest-neighbor  # noqa
-        # for details
         start = time.time()
-        enc = model.encode(nonstandard_in, convert_to_tensor=True)
+        enc = model.encode(nonstandard_in)
         encoding_times.append(time.time() - start)

         for k in k_vals:
             start = time.time()
-            hits = util.semantic_search(enc, vector_db, top_k=k)
-            hits = hits[0]
+            embedding_ids, distances = ann_index.knn_query(enc, k=k)
+            hits = [
+                {"corpus_id": id, "score": 1 - dist}
+                for id, dist in zip(embedding_ids[0], distances[0])
+            ]
+            hits = sorted(hits, key=lambda x: x["score"], reverse=True)

-            # Store some metrics
             times[k].append(time.time() - start)
             cosine_sims[k].append(hits[0]["score"])

@@ -110,6 +121,21 @@ def predict_and_evaluate_validation_set(
             cache_data = pickle.load(fp)
             name_codes = cache_data["codes"]
             embeddings = cache_data["embeddings"]
+        embeddings = embeddings.cpu().numpy()
+
+        index = hnswlib.Index(space="cosine", dim=EMBEDDING_SIZE)
+        print("Checking for cached ANN index...")
+        if os.path.exists(INDEX_FP):
+            print("  Found cached index. Loading it...")
+            index.load_index(INDEX_FP)
+        else:
+            print("No locally cached index found. Creating hierarchical index...")
+            index.init_index(
+                max_elements=len(embeddings), ef_construction=EF_CONSTRUCTION, M=M_VALUE
+            )
+            index.add_items(embeddings, list(range(len(embeddings))))
+            index.save_index(INDEX_FP)
+        index.set_ef(EF_SEARCH)

         print("Loading validation set...")
         examples = []

@@ -119,7 +145,9 @@ def predict_and_evaluate_validation_set(
                 examples.append(line.strip().split("|"))

         print("Predicting and computing stats for validation set...")
-        predict_and_evaluate_validation_set(model, embeddings, name_codes, examples, K_VALUES)
+        predict_and_evaluate_validation_set(
+            model, index, name_codes, examples, K_VALUES
+        )

    else:
        print("No embeddings found, please run embedding.py to compute vectors first.")
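Note on the `1 - dist` conversion above: hnswlib's "cosine" space returns cosine distance (1 minus cosine similarity), so subtracting from 1 recovers the similarity score that `util.semantic_search` previously reported. The explicit sort is technically redundant, since `knn_query` already returns neighbors in ascending distance order, but it preserves the ordering contract of the old `hits` structure. A quick sketch (not part of the commit) checking the equivalence on random vectors:

import hnswlib
import numpy as np

rng = np.random.default_rng(0)
corpus = rng.random((100, 768)).astype(np.float32)
query = rng.random(768, dtype=np.float32)

index = hnswlib.Index(space="cosine", dim=768)
index.init_index(max_elements=len(corpus))
index.add_items(corpus)

ids, dists = index.knn_query(query, k=1)
neighbor = corpus[ids[0][0]]
# Direct cosine similarity against the same returned neighbor
direct_sim = np.dot(query, neighbor) / (np.linalg.norm(query) * np.linalg.norm(neighbor))
print(1 - dists[0][0], direct_sim)  # should agree to float32 precision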

pyproject.toml

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ dependencies = [
     "spacy-lookups-data",
     "sentence-transformers",
     "scikit-learn",
+    # HNSWLIB might actually not be needed in our main dependencies, but we won't know that
+    # until we get into AWS and see how OpenSearch structures things. We'll leave this in
+    # main for now, but will remove in prod if we're able to migrate it to a dev dependency.
+    "hnswlib",
     "pydantic-settings",
     # Typing
     "aws-lambda-typing",
