diff --git a/benchmarks/benchmarks/backed_hdf5.py b/benchmarks/benchmarks/backed_hdf5.py
index e8d2e346b..1bf63e95d 100644
--- a/benchmarks/benchmarks/backed_hdf5.py
+++ b/benchmarks/benchmarks/backed_hdf5.py
@@ -1,62 +1,68 @@
 from __future__ import annotations

-import tempfile
-from pathlib import Path
-
 import numpy as np
 import pandas as pd
 from scipy import sparse

 import anndata as ad

+file_paths = {"sparse": "adata_sparse.h5ad"}

-class BackedHDF5:
-    def setup(self):
-        n_obs, n_vars = 10000, 5000
-        self.n_obs = n_obs
-        self.n_vars = n_vars
-
-        # Create count matrix
-        X = sparse.random(n_obs, n_vars, density=0.1, format="csr")
-        X.data = np.random.poisson(5, X.data.shape).astype(np.float32)
-
-        # Create obs and var dataframes
-        obs = pd.DataFrame(
-            {
-                "cell_type": pd.Categorical(
-                    np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
-                ),
-                "total_counts": np.random.randint(1000, 5000, n_obs),
-            },
-            index=[f"cell_{i}" for i in range(n_obs)],
-        )
-
-        var = pd.DataFrame(
-            {
-                "gene_name": [f"gene_{i}" for i in range(n_vars)],
-            },
-            index=[f"ENSG_{i:08d}" for i in range(n_vars)],
-        )
-
-        # Create AnnData object and save to HDF5
-        adata = ad.AnnData(X=X, obs=obs, var=var)
-
-        # Create temporary file
-        self.temp_path = Path(tempfile.mkdtemp()) / "test_data.h5ad"
-        adata.write_h5ad(self.temp_path)

+class BackedHDF5Indexing:
+    param_names = ("arr_type",)
+    params = ("sparse",)
+
+    def setup_cache(self):
+        X_sparse = sparse.random(
+            10000,
+            50000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        for X, arr_type in [
+            (X_sparse, "sparse"),
+        ]:
+            n_obs, n_var = X.shape
+
+            # Create obs and var dataframes
+            obs = pd.DataFrame(
+                {
+                    "cell_type": pd.Categorical(
+                        np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
+                    ),
+                    "total_counts": np.random.randint(1000, 5000, n_obs),
+                },
+                index=[f"cell_{i}" for i in range(n_obs)],
+            )
+
+            var = pd.DataFrame(
+                {
+                    "gene_name": [f"gene_{i}" for i in range(n_var)],
+                },
+                index=[f"ENSG_{i:08d}" for i in range(n_var)],
+            )
+
+            # Create AnnData object and save to HDF5
+            adata = ad.AnnData(X=X, obs=obs, var=var)
+
+            # Create temporary file
+            adata.write_h5ad(file_paths[arr_type])
+
+    def setup(self, arr_type):
         # Open as backed
-        self.adata_backed = ad.read_h5ad(self.temp_path, backed="r")
-
+        self.adata_backed = ad.read_h5ad(file_paths[arr_type], backed="r")
+        self.n_obs, self.n_var = self.adata_backed.shape
         # Prepare indices for duplicate index testing
-        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (n_obs // 100 + 1))[
-            : (n_obs // 10)
+        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (self.n_obs // 100 + 1))[
+            : (self.n_obs // 10)
         ]
-        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (n_vars // 100 + 1))[
-            : (n_vars // 10)
+        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (self.n_var // 100 + 1))[
+            : (self.n_var // 10)
         ]
-        self.obs_idx_no_dupes = np.arange(0, n_obs, 10)
-        self.var_idx_no_dupes = np.arange(0, n_vars, 10)
+        self.obs_idx_no_dupes = np.arange(0, self.n_obs, 10)
+        self.var_idx_no_dupes = np.arange(0, self.n_var, 10)

     def time_slice_obs(self, *_):
         """Time slicing observations from backed HDF5"""
@@ -92,18 +98,15 @@ def peakmem_index_with_dupes_obs(self, *_):

     def time_to_memory_subset(self, *_):
         """Time converting subset to memory"""
-        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_vars // 4)]
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
         subset.to_memory()

     def peakmem_to_memory_subset(self, *_):
         """Peak memory for converting subset to memory"""
-        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_vars // 4)]
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
         subset.to_memory()

     def teardown(self, *_):
         """Clean up temporary files"""
         if hasattr(self, "adata_backed"):
             self.adata_backed.file.close()
-        if hasattr(self, "temp_path") and self.temp_path.exists():
-            self.temp_path.unlink()
-            self.temp_path.parent.rmdir()
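Context for the setup()/setup_cache() split used throughout this patch: asv runs setup_cache once per benchmark class and environment, and files it writes to the benchmark working directory persist for every later setup()/time_*()/peakmem_*() call, so the expensive data generation stays out of the measured path. A minimal sketch of the pattern, with hypothetical class and file names:

    import numpy as np

    import anndata as ad


    class CachedFileExample:
        def setup_cache(self):
            # Runs once per environment; the written file persists in asv's
            # working directory for all subsequent calls.
            adata = ad.AnnData(X=np.ones((10, 10), dtype=np.float32))
            adata.write_h5ad("cached.h5ad")

        def setup(self, *_):
            # Runs before each measurement; only the cheap reopen happens here.
            self.adata = ad.read_h5ad("cached.h5ad", backed="r")

        def time_shape(self, *_):
            self.adata.shape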
diff --git a/benchmarks/benchmarks/dataset2d.py b/benchmarks/benchmarks/dataset2d.py
index 7dfe04713..fc679ebf7 100644
--- a/benchmarks/benchmarks/dataset2d.py
+++ b/benchmarks/benchmarks/dataset2d.py
@@ -1,7 +1,5 @@
 from __future__ import annotations

-import tempfile
-from pathlib import Path
 from typing import TYPE_CHECKING

 import h5py
@@ -12,35 +10,39 @@
 import anndata as ad

 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from typing import Literal


 class Dataset2D:
-    param_names = ("gen_store", "chunks")
+    param_names = ("store_type", "chunks")
     params = (
-        (
-            lambda: h5py.File(Path(tempfile.mkdtemp()) / "data.h5ad", mode="w"),
-            lambda: zarr.open(
-                Path(tempfile.mkdtemp()) / "data.zarr", mode="w", zarr_version=2
-            ),
-        ),
+        ("zarr", "h5ad"),
         ((-1,), None),
     )

-    def setup(
-        self, gen_store: Callable[[], zarr.Group | h5py.File], chunks: None | tuple[int]
-    ):
-        self.n_obs = 100000
+    def setup_cache(self):
+        n_obs = 100000
         df = pd.DataFrame(
             {
-                "a": pd.Categorical(np.array(["a"] * self.n_obs)),
-                "b": np.arange(self.n_obs),
+                "a": pd.Categorical(np.array(["a"] * n_obs)),
+                "b": np.arange(n_obs),
             },
-            index=[f"cell{i}" for i in range(self.n_obs)],
+            index=[f"cell{i}" for i in range(n_obs)],
+        )
+        for store in [
+            h5py.File("data.h5ad", mode="w"),
+            zarr.open("data.zarr", mode="w", zarr_version=2),
+        ]:
+            ad.io.write_elem(store, "obs", df)
+
+    def setup(self, store_type: Literal["zarr", "h5ad"], chunks: None | tuple[int]):
+        store = (
+            h5py.File("data.h5ad", mode="r")
+            if store_type == "h5ad"
+            else zarr.open("data.zarr")
         )
-        store = gen_store()
-        ad.io.write_elem(store, "obs", df)
         self.ds = ad.experimental.read_elem_lazy(store["obs"], chunks=chunks)
+        self.n_obs = self.ds.shape[0]

     def time_getitem_slice(self, *_):
         self.ds.iloc[0 : (self.n_obs // 2)].to_memory()
diff --git a/benchmarks/benchmarks/readwrite.py b/benchmarks/benchmarks/readwrite.py
index f2f1289b9..e3e5930ab 100644
--- a/benchmarks/benchmarks/readwrite.py
+++ b/benchmarks/benchmarks/readwrite.py
@@ -38,52 +38,15 @@
 PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"

-# PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad"
-# PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad"
-# BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad"
-# BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad"
-
-
-# class ZarrReadSuite:
-#     params = []
-#     param_names = ["input_url"]
-
-#     def setup(self, input_url):
-#         self.filepath = pooch.retrieve(url=input_url, known_hash=None)
-
-#     def time_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def peakmem_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def mem_readfull_object(self, input_url):
-#         return anndata.read_zarr(self.filepath)
-
-#     def track_read_full_memratio(self, input_url):
-#         mem_recording = memory_usage(
-#             (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001
-#         )
-#         adata = anndata.read_zarr(self.filepath)
-#         base_size = mem_recording[-1] - mem_recording[0]
-#         print(np.max(mem_recording) - np.min(mem_recording))
-#         print(base_size)
-#         return (np.max(mem_recording) - np.min(mem_recording)) / base_size
-
-#     def peakmem_read_backed(self, input_url):
-#         anndata.read_zarr(self.filepath, backed="r")
-
-#     def mem_read_backed_object(self, input_url):
-#         return anndata.read_zarr(self.filepath, backed="r")
-

 class H5ADInMemorySizeSuite:
-    _urls = MappingProxyType(dict(pbmc3k=PBMC_3K_URL))
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_in_mem.h5ad"

-    def setup(self, input_data: str):
-        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )

     def track_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
@@ -99,12 +62,13 @@ def track_actual_in_memory_size(self, *_):


 class H5ADReadSuite:
-    _urls = MappingProxyType(dict(pbmc3k=PBMC_3K_URL))
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_read.h5ad"

-    def setup(self, input_data: str):
-        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )

     def time_read_full(self, *_):
         anndata.read_h5ad(self.filepath)
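A note on the pooch.retrieve calls above: passing path and fname pins the download location, since by default pooch saves into a per-user cache directory under an automatically derived name that the bare relative filepath used by the timed methods would never resolve, and known_hash=None skips checksum verification. Roughly, as a standalone sketch (the final assert is only illustrative):

    from pathlib import Path

    import pooch

    local = pooch.retrieve(
        url="https://falexwolf.de/data/pbmc3k_raw.h5ad",
        known_hash=None,  # no checksum verification
        path=Path.cwd(),  # save next to the benchmark, not in the pooch cache
        fname="pbmc_in_mem.h5ad",  # fixed name so the relative path resolves
    )
    assert local == str(Path.cwd() / "pbmc_in_mem.h5ad")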
diff --git a/benchmarks/benchmarks/sparse_dataset.py b/benchmarks/benchmarks/sparse_dataset.py
index c40e99b73..fd11065e6 100644
--- a/benchmarks/benchmarks/sparse_dataset.py
+++ b/benchmarks/benchmarks/sparse_dataset.py
@@ -21,7 +21,7 @@ def make_alternating_mask(n):


 class SparseCSRContiguousSlice:
-    _slices = MappingProxyType({
+    _indexers = MappingProxyType({
         "0:1000": slice(0, 1000),
         "0:9000": slice(0, 9000),
         ":9000:-1": slice(None, 9000, -1),
@@ -31,42 +31,49 @@ class SparseCSRContiguousSlice:
         "first": 0,
         "alternating": make_alternating_mask(10),
     })
+    filepath = "data.zarr"
     params = (
-        [
-            (10_000, 10_000),
-            # (10_000, 500)
-        ],
-        _slices.keys(),
+        list(_indexers.keys()),
         [True, False],
     )
-    param_names = ("shape", "slice", "use_dask")
+    param_names = (
+        "index",
+        "use_dask",
+    )

-    def setup(self, shape: tuple[int, int], slice: str, use_dask: bool):  # noqa: FBT001
+    def setup_cache(self):
         X = sparse.random(
-            *shape, density=0.01, format="csr", random_state=np.random.default_rng(42)
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
         )
-        self.slice = self._slices[slice]
-        g = zarr.group()
+        g = zarr.group(self.filepath)
         write_elem(g, "X", X)
+
+    def setup(self, index: str, use_dask: bool):  # noqa: FBT001
+        g = zarr.open(self.filepath)
         self.x = read_elem_lazy(g["X"]) if use_dask else sparse_dataset(g["X"])
         self.adata = AnnData(self.x)
+        self.index = self._indexers[index]

     def time_getitem(self, *_):
-        res = self.x[self.slice]
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def peakmem_getitem(self, *_):
-        res = self.x[self.slice]
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def time_getitem_adata(self, *_):
-        res = self.adata[self.slice]
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()

     def peakmem_getitem_adata(self, *_):
-        res = self.adata[self.slice]
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
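Seeding sparse.random with np.random.default_rng(42), here and in the other suites, makes the cached matrix identical across commits and machines, so timing differences reflect code changes rather than data drift. The determinism is easy to verify in isolation:

    import numpy as np
    from scipy import sparse

    a = sparse.random(
        100, 100, density=0.01, format="csr", random_state=np.random.default_rng(42)
    )
    b = sparse.random(
        100, 100, density=0.01, format="csr", random_state=np.random.default_rng(42)
    )
    # Same sparsity pattern and values: the elementwise difference is empty.
    assert (a != b).nnz == 0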
diff --git a/benchmarks/benchmarks/utils.py b/benchmarks/benchmarks/utils.py
index 9e983498f..2c4da9cd6 100644
--- a/benchmarks/benchmarks/utils.py
+++ b/benchmarks/benchmarks/utils.py
@@ -95,13 +95,31 @@ def gen_indexer(adata, dim, index_kind, ratio):

 def gen_adata(n_obs, n_var, attr_set):
     if "X-csr" in attr_set:
-        X = sparse.random(n_obs, n_var, density=0.1, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     elif "X-dense" in attr_set:
-        X = sparse.random(n_obs, n_var, density=0.1, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
         X = X.toarray()
     else:
         # TODO: There's probably a better way to do this
-        X = sparse.random(n_obs, n_var, density=0, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     adata = AnnData(X)
     if "obs,var" in attr_set:
         adata.obs = pd.DataFrame(
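Regarding the TODO on the density=0 branch: if the goal is just an all-zero placeholder X, the CSR constructor can build one directly, skipping the random sampler entirely. A possible alternative under that assumption:

    import numpy as np
    from scipy import sparse

    n_obs, n_var = 100, 50
    # Directly allocate an empty (all-zero) CSR matrix; no RNG involved.
    X = sparse.csr_matrix((n_obs, n_var), dtype=np.float64)
    assert X.nnz == 0 and X.shape == (n_obs, n_var)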