Backport PR #2414 on branch 1.9.x (matplotlib 3.7 compat) (#2419)

meeseeksmachine · ivirshup · adamgayoso · web-flow · commit 1fbbfcdbb53d · 2023-02-16T16:25:34.000+01:00
* Backport PR #2414: matplotlib 3.7 compat * fix scrublet * Update visium default plot for matplotlib 3.7 * Update hashsolo docstrings * skip plotting test that changed on mpl 3.7 if mpl < 3.7 is installed * Fix hashsolo docs (again) * update anndata-dev tests to install anndata test deps * Temporarily set warnings as errors to False for doc builds * Release notes * Fix using custom layer with highly_variable_genes (#2302) * Fix using custom layer with highly_variable_genes * Add tests * Add release note * Move release note to correct section * Format release notes * Add check for number of normalized dispersions (#2231) * Add check for number of normalized dispersions In sc.pp.highly_variable_genes() when flavor='cell_ranger' and n_top_genes is set check that enough normalized dispersions have been calculated and if not raise a warning and set n_top_genes to the number of calculated dispersions. Fixes #2230 * Use .size instead of len() * Add test for n_top_genes warning * Add release note * Remove blank line Co-authored-by: Isaac Virshup <ivirshup@gmail.com> --------- Co-authored-by: Isaac Virshup <ivirshup@gmail.com> Co-authored-by: adamgayoso <adamgayoso@users.noreply.github.com> Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Co-authored-by: Luke Zappia <lazappi@users.noreply.github.com>
diff --git a/.azure-pipelines.yml b/.azure-pipelines.yml
@@ -50,7 +50,7 @@ jobs:
     displayName: 'Install dependencies'
 
   - script: |
-      pip install -v git+https://github.com/scverse/anndata
+      pip install -v "anndata[dev,test] @ git+https://github.com/scverse/anndata"
     displayName: 'Install development anndata'
     condition: eq(variables['ANNDATA_DEV'], 'yes')
 
diff --git a/docs/conf.py b/docs/conf.py
@@ -128,7 +128,9 @@
 
 
 def setup(app):
-    app.warningiserror = on_rtd
+    app.warningiserror = (
+        False  # Temporarily disable warnings as errors to get 1.9.2 out
+    )
 
 
 # -- Options for other output formats ------------------------------------------
diff --git a/docs/release-notes/1.9.2.md b/docs/release-notes/1.9.2.md
@@ -1,11 +1,9 @@
-### 1.9.2 {small}`the future`
-
-
-```{rubric} Documentation
-```
+### 1.9.2 {small}`2023-02-16`
 
 ```{rubric} Bug fixes
 ```
 
-```{rubric} Performance
-```
+* {func}`~scanpy.pp.highly_variable_genes` `layer` argument now works in tandem with `batch_key` {pr}`2302` {smaller}`D Schaumont`
+* {func}`~scanpy.pp.highly_variable_genes` with `flavor='cell_ranger'` now handles the case in {issue}`2230` where the number of calculated dispersions is less than `n_top_genes` {pr}`2231` {smaller}`L Zappia`
+* Fix compatibility with matplotlib 3.7 {pr}`2414` {smaller}`I Virshup` {smaller}`P Fisher`
+* Fix scrublet numpy matrix compatibility issue {pr}`2395` {smaller}`A Gayoso`
diff --git a/scanpy/external/pp/_hashsolo.py b/scanpy/external/pp/_hashsolo.py
@@ -25,7 +25,8 @@
 
 
 def _calculate_log_likelihoods(data, number_of_noise_barcodes):
-    """Calculate log likelihoods for each hypothesis, negative, singlet, doublet
+    """\
+    Calculate log likelihoods for each hypothesis, negative, singlet, doublet
 
     Parameters
     ----------
@@ -43,8 +44,8 @@ def _calculate_log_likelihoods(data, number_of_noise_barcodes):
     """
 
     def gaussian_updates(data, mu_o, std_o):
-        """Update parameters of your gaussian
-        https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf
+        """\
+        Update parameters of your gaussian https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf
 
         Parameters
         ----------
@@ -210,7 +211,7 @@ def gaussian_updates(data, mu_o, std_o):
 
 
 def _calculate_bayes_rule(data, priors, number_of_noise_barcodes):
-    """
+    """\
     Calculate bayes rule from log likelihoods
 
     Parameters
@@ -263,7 +264,8 @@ def hashsolo(
     number_of_noise_barcodes: int = None,
     inplace: bool = True,
 ):
-    """Probabilistic demultiplexing of cell hashing data using HashSolo [Bernstein20]_.
+    """\
+    Probabilistic demultiplexing of cell hashing data using HashSolo [Bernstein20]_.
 
     .. note::
         More information and bug reports `here <https://github.com/calico/solo>`__.
@@ -294,9 +296,8 @@ def hashsolo(
 
     Returns
     -------
-    adata
-        if inplace is False returns AnnData with demultiplexing results
-        in .obs attribute otherwise does is in place
+    If `inplace` is False, returns AnnData with demultiplexing results
+    in the `.obs` attribute; otherwise does it in place.
 
     Examples
     -------
diff --git a/scanpy/external/pp/_scrublet.py b/scanpy/external/pp/_scrublet.py
@@ -431,6 +431,11 @@ def _scrublet_call_doublets(
 
     if mean_center:
         logg.info('Embedding transcriptomes using PCA...')
+        # Sklearn PCA doesn't like matrices, so convert to arrays
+        if isinstance(scrub._E_obs_norm, np.matrix):
+            scrub._E_obs_norm = np.asarray(scrub._E_obs_norm)
+        if isinstance(scrub._E_sim_norm, np.matrix):
+            scrub._E_sim_norm = np.asarray(scrub._E_sim_norm)
         sl.pipeline_pca(
             scrub, n_prin_comps=n_prin_comps, random_state=scrub.random_state
         )
diff --git a/scanpy/plotting/_utils.py b/scanpy/plotting/_utils.py
@@ -32,7 +32,7 @@
 VBound = Union[str, float, Callable[[Sequence[float]], float]]
 
 
-class _AxesSubplot(Axes, axes.SubplotBase, ABC):
+class _AxesSubplot(Axes, axes.SubplotBase):
     """Intersection between Axes and SubplotBase: Has methods of both"""
 
 
diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py
@@ -265,6 +265,12 @@ def _highly_variable_genes_single_batch(
         if n_top_genes > adata.n_vars:
             logg.info('`n_top_genes` > `adata.n_var`, returning all genes.')
             n_top_genes = adata.n_vars
+        if n_top_genes > dispersion_norm.size:
+            warnings.warn(
+                '`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.',
+                UserWarning,
+            )
+            n_top_genes = dispersion_norm.size
         disp_cut_off = dispersion_norm[n_top_genes - 1]
         gene_subset = np.nan_to_num(df['dispersions_norm'].values) >= disp_cut_off
         logg.debug(
@@ -458,6 +464,7 @@ def highly_variable_genes(
 
             hvg = _highly_variable_genes_single_batch(
                 adata_subset,
+                layer=layer,
                 min_disp=min_disp,
                 max_disp=max_disp,
                 min_mean=min_mean,
diff --git a/scanpy/tests/_images/master_spatial_visium_default.png b/scanpy/tests/_images/master_spatial_visium_default.png
diff --git a/scanpy/tests/test_embedding_plots.py b/scanpy/tests/test_embedding_plots.py
@@ -1,6 +1,7 @@
 from functools import partial
 from pathlib import Path
 
+import matplotlib as mpl
 import matplotlib.pyplot as plt
 from matplotlib.colors import Normalize
 from matplotlib.testing.compare import compare_images
@@ -304,6 +305,11 @@ def test_visium_circles(image_comparer):  # standard visium data
 
 
 def test_visium_default(image_comparer):  # default values
+    from packaging.version import parse as parse_version
+
+    if parse_version(mpl.__version__) < parse_version("3.7.0"):
+        pytest.xfail("Matplotlib 3.7.0+ required for this test")
+
     save_and_compare_images = image_comparer(ROOT, FIGS, tol=5)
     adata = sc.read_visium(HERE / '_data' / 'visium_data' / '1.0.0')
     adata.obs = adata.obs.astype({'array_row': 'str'})
diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py
@@ -36,11 +36,26 @@ def test_highly_variable_genes_basic():
     assert 'highly_variable_intersection' in adata.var.columns
 
     adata = sc.datasets.blobs()
-    adata.obs['batch'] = np.random.binomial(4, 0.5, size=(adata.n_obs))
+    batch = np.random.binomial(4, 0.5, size=(adata.n_obs))
+    adata.obs['batch'] = batch
     adata.obs['batch'] = adata.obs['batch'].astype('category')
     sc.pp.highly_variable_genes(adata, batch_key='batch', n_top_genes=3)
     assert 'highly_variable_nbatches' in adata.var.columns
     assert adata.var['highly_variable'].sum() == 3
+    highly_var_first_layer = adata.var['highly_variable']
+
+    adata = sc.datasets.blobs()
+    new_layer = adata.X.copy()
+    np.random.shuffle(new_layer)
+    adata.layers['test_layer'] = new_layer
+    adata.obs['batch'] = batch
+    adata.obs['batch'] = adata.obs['batch'].astype('category')
+    sc.pp.highly_variable_genes(
+        adata, batch_key='batch', n_top_genes=3, layer='test_layer'
+    )
+    assert 'highly_variable_nbatches' in adata.var.columns
+    assert adata.var['highly_variable'].sum() == 3
+    assert (highly_var_first_layer != adata.var['highly_variable']).any()
 
     sc.pp.highly_variable_genes(adata)
     no_batch_hvg = adata.var.highly_variable.copy()
@@ -491,3 +506,16 @@ def test_seurat_v3_mean_var_output_with_batchkey():
     )
     np.testing.assert_allclose(true_mean, result_df['means'], rtol=2e-05, atol=2e-05)
     np.testing.assert_allclose(true_var, result_df['variances'], rtol=2e-05, atol=2e-05)
+
+
+def test_cellranger_n_top_genes_warning():
+    X = np.random.poisson(2, (100, 30))
+    adata = sc.AnnData(X, dtype=X.dtype)
+    sc.pp.normalize_total(adata)
+    sc.pp.log1p(adata)
+
+    with pytest.warns(
+        UserWarning,
+        match="`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.",
+    ):
+        sc.pp.highly_variable_genes(adata, n_top_genes=1000, flavor="cell_ranger")