
Commit 9adbeb5

Merge pull request #101 from YosefLab/faiss_nn
Faiss NN
2 parents: 598da03 + 4403e75

22 files changed: +408 -265 lines

.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions

@@ -15,10 +15,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - name: Set up Python 3.10
+      - name: Set up Python 3.12
        uses: actions/setup-python@v4
        with:
-          python-version: "3.10"
+          python-version: "3.12"
          cache: "pip"
          cache-dependency-path: "**/pyproject.toml"
      - name: Install build dependencies

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions

@@ -112,7 +112,7 @@ jobs:

       - uses: actions/setup-python@v4
         with:
-          python-version: "3.11"
+          python-version: "3.12"

       - run: pip install build

@@ -158,6 +158,6 @@ jobs:
           cache-from: type=registry,ref=ghcr.io/yoseflab/popv:buildcache
           cache-to: type=inline,ref=ghcr.io/yoseflab/popv:buildcache
           target: build
-          tags: ghcr.io/yoseflab/popv:py3.11-cu12-${{ inputs.tag }}-${{ matrix.dependencies }}
+          tags: ghcr.io/yoseflab/popv:py3.12-cu12-${{ inputs.tag }}-${{ matrix.dependencies }}
           build-args: |
             DEPENDENCIES=${{ matrix.dependencies }}

.github/workflows/test_linux_cuda.yml

Lines changed: 2 additions & 1 deletion

@@ -30,7 +30,7 @@ jobs:
         shell: bash -e {0} # -e to fail on error

     container:
-      image: ghcr.io/yoseflab/popv:py3.11-cu12-0.5.2.post1-
+      image: ghcr.io/yoseflab/popv:py3.12-cu12-0.6.0-
       options: --user root --gpus all --pull always

     name: integration

@@ -54,6 +54,7 @@ jobs:
         python -m uv pip install --system "PopV[tests] @ ."
         python -m pip install jax[cuda]
         python -m pip install nvidia-nccl-cu12
+        python -m pip install faiss-gpu-cu12

       - name: Run pytest
         env:
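Usage note: the CUDA test image now installs the GPU FAISS wheel alongside JAX and NCCL. A minimal sketch (not part of the workflow; it assumes faiss-gpu-cu12 imported cleanly inside the container) for confirming that FAISS can see a device before the GPU code paths in this PR are exercised:

import faiss

# Number of CUDA devices visible to FAISS; the GPU branches added in this PR need at least one.
n_gpus = faiss.get_num_gpus()
print(f"FAISS sees {n_gpus} GPU(s)")
if n_gpus > 0:
    res = faiss.StandardGpuResources()  # per-process GPU scratch resources used when moving an index to the GPU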

docs/tutorials/notebooks/tabula_sapiens_tutorial.ipynb

Lines changed: 1 addition & 1 deletion

@@ -1829,4 +1829,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}

popv/_faiss_knn_classifier.py

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+import os
+
+import faiss
+import numpy as np
+import pandas as pd
+
+from popv import settings
+
+
+class FAISSKNNProba:
+    def __init__(self, n_neighbors=5):
+        self.n_neighbors = n_neighbors
+        self.index = None
+        if settings.cuml and faiss.get_num_gpus() > 0:
+            self.res = faiss.StandardGpuResources()
+            self.use_gpu = True
+        else:
+            self.res = None
+            self.use_gpu = False
+
+    def fit(self, X, labels):
+        X = X.astype("float32")
+        self.labels = labels
+        d = X.shape[1]
+
+        cpu_index = faiss.IndexFlatL2(d)
+
+        if self.use_gpu:
+            gpu_index = faiss.index_cpu_to_gpu(self.res, settings.device, cpu_index)
+            gpu_index.add(X)
+            self.index = faiss.index_gpu_to_cpu(gpu_index)
+        else:
+            cpu_index.add(X)
+            self.index = cpu_index
+
+        return self
+
+    def query(self, X, n_neighbors):
+        X = X.astype("float32")
+        if self.use_gpu:
+            index = faiss.index_cpu_to_gpu(self.res, settings.device, self.index)
+        else:
+            index = self.index
+        _, I = index.search(X, n_neighbors)
+        return I
+
+    def predict(self, X, classes):
+        X = X.astype("float32")
+        if self.use_gpu:
+            index = faiss.index_cpu_to_gpu(self.res, settings.device, self.index)
+        else:
+            index = self.index
+        _, I = index.search(X, self.n_neighbors)
+        preds = classes[np.array([np.bincount(self.labels[i], minlength=len(classes)).argmax() for i in I])]
+        return preds
+
+    def predict_proba(self, X, classes):
+        X = X.astype("float32")
+        if self.use_gpu:
+            index = faiss.index_cpu_to_gpu(self.res, settings.device, self.index)
+        else:
+            index = self.index
+        _, I = index.search(X, self.n_neighbors)
+        probas = []
+        for neighbors in I:
+            counts = np.bincount(self.labels[neighbors], minlength=len(classes))
+            probas.append(counts / counts.sum())
+        return np.array(probas)
+
+    def save(self, path_prefix):
+        """
+        Save the FAISS index to disk.
+
+        Parameters
+        ----------
+        path_prefix : str
+            Path prefix, e.g. "models/faiss_knn"
+        """
+        faiss.write_index(self.index, f"{path_prefix}.index")
+
+    @classmethod
+    def load(cls, path_prefix, index, n_neighbors=5):
+        """
+        Load FAISS index and metadata from disk.
+
+        Parameters
+        ----------
+        path_prefix : str
+            Directory containing "<index>.index" and "ref_labels.csv"
+        n_neighbors : int
+            Number of neighbors to use
+        """
+        obj = cls(n_neighbors=n_neighbors)
+        obj.index = faiss.read_index(os.path.join(path_prefix, f"{index}.index"))
+        labels = pd.read_csv(os.path.join(path_prefix, "ref_labels.csv"), index_col=0)
+        obj.labels = labels.iloc[:, 0].to_numpy()
+        return obj
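Usage note: a minimal sketch of how the new classifier fits together (the data, sizes, and label names below are illustrative, not taken from the PR). Because predict and predict_proba vote with np.bincount, fit() expects labels as non-negative integer codes that index into the classes array passed at prediction time:

import numpy as np
import pandas as pd

from popv._faiss_knn_classifier import FAISSKNNProba

# Illustrative reference/query embeddings (e.g. PCA coordinates) and string cell-type labels.
X_ref = np.random.rand(1000, 50).astype("float32")
X_query = np.random.rand(200, 50).astype("float32")
ref_labels = pd.Categorical(np.random.choice(["B cell", "T cell", "NK cell"], size=1000))

classes = np.asarray(ref_labels.categories)  # position -> class name
codes = ref_labels.codes.astype(np.int64)    # integer codes consumed by the bincount voting

knn = FAISSKNNProba(n_neighbors=5).fit(X_ref, codes)
predictions = knn.predict(X_query, classes)          # majority vote over the 5 nearest reference cells
probabilities = knn.predict_proba(X_query, classes)  # per-class neighbor vote fractions, shape (200, 3)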

popv/_settings.py

Lines changed: 11 additions & 0 deletions

@@ -68,6 +68,7 @@ def __init__(
         recompute_embeddings: bool = False,
         return_probabilities: bool = True,
         compute_umap_embedding: bool = True,
+        device: int | None = 0,
     ):
         """Set up Config manager for PopV."""
         self.seed = seed

@@ -80,6 +81,7 @@ def __init__(
         self.recompute_embeddings = recompute_embeddings
         self.return_probabilities = return_probabilities
         self.compute_umap_embedding = compute_umap_embedding
+        self.device = device

     @property
     def logging_dir(self) -> Path:

@@ -198,5 +200,14 @@ def return_probabilities(self) -> bool:
     def return_probabilities(self, return_probabilities: bool):
         self._return_probabilities = return_probabilities

+    @property
+    def device(self) -> int | None:
+        """GPU device to use for acceleration."""
+        return self._device
+
+    @device.setter
+    def device(self, device: int | None):
+        self._device = device
+

 settings = Config()
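Usage note: settings.device selects the CUDA device used by the GPU code paths; the FAISS classifier above moves its index to this device when settings.cuml is enabled and a GPU is visible. A small sketch, assuming a GPU-enabled install:

import popv

# Run FAISS/RAPIDS work on the second CUDA device; the default is device 0.
popv.settings.device = 1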

popv/algorithms/_base_algorithm.py

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,8 @@ def __init__(
         umap_key
             Key in obsm in which UMAP embedding of integrated data is stored.
         """
+        if settings.cuml:
+            import rapids_singlecell as rsc  # noqa: F401
         self.batch_key = batch_key
         self.labels_key = labels_key
         if seen_result_key is None:

popv/algorithms/_bbknn.py

Lines changed: 17 additions & 45 deletions

@@ -1,18 +1,12 @@
 from __future__ import annotations

 import logging
-import os

-import joblib
 import numpy as np
 import scanpy as sc
-from scipy.stats import mode
 from sklearn.neighbors import KNeighborsClassifier

 from popv import settings
-
-if settings.cuml:
-    import rapids_singlecell as rsc
 from popv.algorithms._base_algorithm import BaseAlgorithm


@@ -95,48 +89,22 @@ def compute_integration(self, adata):
         AnnData object. Modified inplace.
         """
         logging.info("Integrating data with bbknn")
-        if (
-            adata.uns["_prediction_mode"] == "inference"
-            and "X_umap_bbknn" in adata.obsm
-            and not settings.recompute_embeddings
-        ):
-            index = joblib.load(os.path.join(adata.uns["_save_path_trained_models"], "pynndescent_index.joblib"))
-            query_features = adata.obsm["X_pca"][adata.obs["_dataset"] == "query", :]
-            indices, _ = index.query(query_features.astype(np.float32), k=5)
-
-            neighbor_embedding = adata.obsm["X_umap_bbknn"][adata.obs["_dataset"] == "ref", :][indices].astype(
-                np.float32
-            )
-            adata.obsm[self.umap_key][adata.obs["_dataset"] == "query", :] = np.mean(neighbor_embedding, axis=1)
-            adata.obsm[self.umap_key] = adata.obsm[self.umap_key].astype(np.float32)
-
-            neighbor_probabilities = adata.obs[f"{self.result_key}_probabilities"][adata.obs["_dataset"] == "ref", :][
-                indices
-            ].astype(np.float32)
-            adata.obs.loc[adata.obs["_dataset"] == "query", f"{self.result_key}_probabilities"] = np.mean(
-                neighbor_probabilities, axis=1
-            )
-
-            neighbor_prediction = adata.obs[f"{self.result_key}"][adata.obs["_dataset"] == "ref", :][indices].astype(
-                np.float32
+        if len(adata.obs[self.batch_key].unique()) > 100:
+            self.method_kwargs["neighbors_within_batch"] = 1
+        if settings.cuml:
+            import rapids_singlecell as rsc
+
+            self.method_kwargs.pop("approx", None)  # approx not supported in rsc
+            self.method_kwargs.pop("use_annoy", None)  # use_annoy not supported in rsc
+            rsc.pp.bbknn(
+                adata, batch_key=self.batch_key, use_rep="X_pca", algorithm="ivfflat", **self.method_kwargs, trim=0
             )
-            adata.obs.loc[adata.obs["_dataset"] == "query", f"{self.result_key}"] = mode(neighbor_prediction, axis=1)
         else:
-            if len(adata.obs[self.batch_key].unique()) > 100:
-                logging.warning("Using PyNNDescent instead of FAISS as high number of batches leads to OOM.")
-                self.method_kwargs["neighbors_within_batch"] = 1  # Reduce memory usage.
-                self.method_kwargs["pynndescent_n_neighbors"] = 10  # Reduce memory usage.
-                sc.external.pp.bbknn(
-                    adata, batch_key=self.batch_key, use_faiss=False, use_rep="X_pca", **self.method_kwargs
-                )
-            else:
-                sc.external.pp.bbknn(
-                    adata, batch_key=self.batch_key, use_faiss=True, use_rep="X_pca", **self.method_kwargs
-                )
+            sc.external.pp.bbknn(adata, batch_key=self.batch_key, use_rep="X_pca", **self.method_kwargs)

     def predict(self, adata):
         """
-        Predict celltypes using Celltypist.
+        Predict celltypes using BBKNN kNN.

         Parameters
         ----------

@@ -168,7 +136,9 @@ def predict(self, adata):
         adata.obs[self.result_key] = adata.uns["label_categories"][knn.predict(test_distances)]

         if self.return_probabilities:
-            adata.obs[f"{self.result_key}_probabilities"] = np.max(knn.predict_proba(test_distances), axis=1)
+            probabilities = knn.predict_proba(test_distances)
+            adata.obs[f"{self.result_key}_probabilities"] = np.max(probabilities, axis=1)
+            adata.obsm[f"{self.result_key}_probabilities"] = probabilities

     def compute_umap(self, adata):
         """

@@ -180,8 +150,10 @@
         AnnData object. Results are stored in adata.obsm[self.umap_key].
         """
         if self.compute_umap_embedding:
-            logging.info(f'Saving UMAP of bbknn results to adata.obs["{self.embedding_key}"]')
+            logging.info(f'Saving UMAP of BBKNN results to adata.obsm["{self.umap_key}"]')
             if settings.cuml:
+                import rapids_singlecell as rsc
+
                 rsc.pp.neighbors(adata, use_rep=self.embedding_key)
                 adata.obsm[self.umap_key] = rsc.tl.umap(adata, copy=True, **self.embedding_kwargs).obsm["X_umap"]
             else:
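Usage note: predict() now keeps the full per-class probability matrix in adata.obsm in addition to the per-cell maximum in adata.obs. A small sketch of reading both back after running the algorithm (the helper name is illustrative; result_key is whatever key the algorithm instance was configured with):

import numpy as np
from anndata import AnnData


def read_bbknn_probabilities(adata: AnnData, result_key: str):
    """Return (per-cell max, per-class matrix) as stored by the updated predict()."""
    per_cell_max = adata.obs[f"{result_key}_probabilities"].to_numpy()   # unchanged: max probability per cell
    full_matrix = np.asarray(adata.obsm[f"{result_key}_probabilities"])  # new: one column per label category
    return per_cell_max, full_matrix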

popv/algorithms/_celltypist.py

Lines changed: 16 additions & 6 deletions

@@ -4,16 +4,13 @@
 import os

 import celltypist
-import joblib
 import numpy as np
 import pandas as pd
 import scanpy as sc
 from scipy.stats import mode

 from popv import settings
-
-if settings.cuml:
-    import rapids_singlecell as rsc
+from popv._faiss_knn_classifier import FAISSKNNProba
 from popv.algorithms._base_algorithm import BaseAlgorithm


@@ -86,16 +83,20 @@ def predict(self, adata):
             and "over_clustering" in adata.obs
             and not settings.recompute_embeddings
         ):
-            index = joblib.load(os.path.join(adata.uns["_save_path_trained_models"], "pynndescent_index.joblib"))
+            knn = FAISSKNNProba(n_neighbors=5)
+            knn = knn.load(adata.uns["_save_path_trained_models"], "faiss_index")
+
             query_features = adata.obsm["X_pca"][adata.obs["_dataset"] == "query", :]
-            indices, _ = index.query(query_features.astype(np.float32), k=5)
+            indices = knn.query(query_features.astype(np.float32), n_neighbors=5)
             neighbor_values = adata.obs.loc[adata.obs["_dataset"] == "ref", "over_clustering"].cat.codes.values[indices]
             adata.obs.loc[adata.obs["_dataset"] == "query", "over_clustering"] = adata.obs[
                 "over_clustering"
             ].cat.categories[mode(neighbor_values, axis=1).mode.flatten()]
             over_clustering = adata.obs.loc[adata.obs["_predict_cells"] == "relabel", "over_clustering"]
         else:
             if settings.cuml:
+                import rapids_singlecell as rsc
+
                 rsc.pp.neighbors(adata, n_neighbors=15, use_rep="X_pca")
                 rsc.tl.leiden(adata, resolution=25.0, key_added="over_clustering")
             else:

@@ -136,7 +137,16 @@
         if self.return_probabilities:
             if f"{self.result_key}_probabilities" not in adata.obs.columns:
                 adata.obs[f"{self.result_key}_probabilities"] = pd.Series(dtype="float64")
+            if f"{self.result_key}_probabilities" not in adata.obsm:
+                adata.obsm[f"{self.result_key}_probabilities"] = pd.DataFrame(
+                    np.nan,
+                    index=adata.obs_names,
+                    columns=adata.uns["label_categories"],
+                )
             adata.obs.loc[
                 adata.obs["_predict_cells"] == "relabel",
                 f"{self.result_key}_probabilities",
             ] = predictions.probability_matrix.max(axis=1).values
+            adata.obsm[f"{self.result_key}_probabilities"].loc[adata.obs["_predict_cells"] == "relabel", :] = (
+                predictions.probability_matrix
+            )