101 changes: 52 additions & 49 deletions benchmarks/benchmarks/backed_hdf5.py
@@ -1,62 +1,68 @@
 from __future__ import annotations
 
-import tempfile
-from pathlib import Path
-
 import numpy as np
 import pandas as pd
 from scipy import sparse
 
 import anndata as ad
 
+file_paths = {"sparse": "adata_sparse.h5ad"}
 
-class BackedHDF5:
-    def setup(self):
-        n_obs, n_vars = 10000, 5000
-        self.n_obs = n_obs
-        self.n_vars = n_vars
-
-        # Create count matrix
-        X = sparse.random(n_obs, n_vars, density=0.1, format="csr")
-        X.data = np.random.poisson(5, X.data.shape).astype(np.float32)
-
-        # Create obs and var dataframes
-        obs = pd.DataFrame(
-            {
-                "cell_type": pd.Categorical(
-                    np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
-                ),
-                "total_counts": np.random.randint(1000, 5000, n_obs),
-            },
-            index=[f"cell_{i}" for i in range(n_obs)],
-        )
-
-        var = pd.DataFrame(
-            {
-                "gene_name": [f"gene_{i}" for i in range(n_vars)],
-            },
-            index=[f"ENSG_{i:08d}" for i in range(n_vars)],
-        )
-
-        # Create AnnData object and save to HDF5
-        adata = ad.AnnData(X=X, obs=obs, var=var)
-
-        # Create temporary file
-        self.temp_path = Path(tempfile.mkdtemp()) / "test_data.h5ad"
-        adata.write_h5ad(self.temp_path)
+
+class BackedHDF5Indexing:
+    param_names = ("arr_type",)
+    params = ("sparse",)
+
+    def setup_cache(self):
+        X_sparse = sparse.random(
+            10000,
+            50000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
+        for X, arr_type in [
+            (X_sparse, "sparse"),
+        ]:
+            n_obs, n_var = X.shape
+
+            # Create obs and var dataframes
+            obs = pd.DataFrame(
+                {
+                    "cell_type": pd.Categorical(
+                        np.random.choice(["TypeA", "TypeB", "TypeC"], n_obs)
+                    ),
+                    "total_counts": np.random.randint(1000, 5000, n_obs),
+                },
+                index=[f"cell_{i}" for i in range(n_obs)],
+            )
+
+            var = pd.DataFrame(
+                {
+                    "gene_name": [f"gene_{i}" for i in range(n_var)],
+                },
+                index=[f"ENSG_{i:08d}" for i in range(n_var)],
+            )
+
+            # Create AnnData object and save to HDF5
+            adata = ad.AnnData(X=X, obs=obs, var=var)
+
+            # Create temporary file
+            adata.write_h5ad(file_paths[arr_type])
 
+    def setup(self, arr_type):
-        # Open as backed
-        self.adata_backed = ad.read_h5ad(self.temp_path, backed="r")
-
+        self.adata_backed = ad.read_h5ad(file_paths[arr_type], backed="r")
+        self.n_obs, self.n_var = self.adata_backed.shape
         # Prepare indices for duplicate index testing
-        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (n_obs // 100 + 1))[
-            : (n_obs // 10)
+        self.obs_idx_with_dupes = np.array([0, 1, 0, 2, 1] * (self.n_obs // 100 + 1))[
+            : (self.n_obs // 10)
         ]
-        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (n_vars // 100 + 1))[
-            : (n_vars // 10)
+        self.var_idx_with_dupes = np.array([0, 1, 2, 0, 3] * (self.n_var // 100 + 1))[
+            : (self.n_var // 10)
         ]
-        self.obs_idx_no_dupes = np.arange(0, n_obs, 10)
-        self.var_idx_no_dupes = np.arange(0, n_vars, 10)
+        self.obs_idx_no_dupes = np.arange(0, self.n_obs, 10)
+        self.var_idx_no_dupes = np.arange(0, self.n_var, 10)
 
     def time_slice_obs(self, *_):
         """Time slicing observations from backed HDF5"""
@@ -92,18 +98,15 @@ def peakmem_index_with_dupes_obs(self, *_):
 
     def time_to_memory_subset(self, *_):
         """Time converting subset to memory"""
-        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_vars // 4)]
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
         subset.to_memory()
 
     def peakmem_to_memory_subset(self, *_):
         """Peak memory for converting subset to memory"""
-        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_vars // 4)]
+        subset = self.adata_backed[0 : (self.n_obs // 4), 0 : (self.n_var // 4)]
         subset.to_memory()
 
     def teardown(self, *_):
         """Clean up temporary files"""
         if hasattr(self, "adata_backed"):
             self.adata_backed.file.close()
-        if hasattr(self, "temp_path") and self.temp_path.exists():
-            self.temp_path.unlink()
-            self.temp_path.parent.rmdir()
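The change above moves data generation out of `setup` and into asv's `setup_cache` hook, which runs once per environment rather than before every benchmark; files written to the benchmark working directory are then visible to each subsequent `setup` call. A minimal sketch of the pattern, with illustrative names (`ExampleSuite` and `example.h5ad` are not from this PR):

```python
import numpy as np
from scipy import sparse

import anndata as ad

FILE = "example.h5ad"  # hypothetical file name


class ExampleSuite:
    def setup_cache(self):
        # Expensive, one-time work: build the test data and persist it.
        X = sparse.random(
            1_000, 500, density=0.01, format="csr",
            random_state=np.random.default_rng(0),
        )
        ad.AnnData(X=X).write_h5ad(FILE)

    def setup(self):
        # Cheap, per-benchmark work: reopen the cached file in backed mode.
        self.adata = ad.read_h5ad(FILE, backed="r")

    def time_slice_obs(self):
        self.adata[0:100]
```

This also explains the simpler `teardown`: the cached file is managed by asv, so only the open HDF5 handle needs closing.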
40 changes: 21 additions & 19 deletions benchmarks/benchmarks/dataset2d.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-import tempfile
-from pathlib import Path
 from typing import TYPE_CHECKING
 
 import h5py
@@ -12,35 +10,39 @@
 import anndata as ad
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from typing import Literal
 
 
 class Dataset2D:
-    param_names = ("gen_store", "chunks")
+    param_names = ("store_type", "chunks")
     params = (
-        (
-            lambda: h5py.File(Path(tempfile.mkdtemp()) / "data.h5ad", mode="w"),
-            lambda: zarr.open(
-                Path(tempfile.mkdtemp()) / "data.zarr", mode="w", zarr_version=2
-            ),
-        ),
+        ("zarr", "h5ad"),
        ((-1,), None),
     )
 
-    def setup(
-        self, gen_store: Callable[[], zarr.Group | h5py.File], chunks: None | tuple[int]
-    ):
-        self.n_obs = 100000
+    def setup_cache(self):
+        n_obs = 100000
         df = pd.DataFrame(
             {
-                "a": pd.Categorical(np.array(["a"] * self.n_obs)),
-                "b": np.arange(self.n_obs),
+                "a": pd.Categorical(np.array(["a"] * n_obs)),
+                "b": np.arange(n_obs),
             },
-            index=[f"cell{i}" for i in range(self.n_obs)],
+            index=[f"cell{i}" for i in range(n_obs)],
         )
+        for store in [
+            h5py.File("data.h5ad", mode="w"),
+            zarr.open("data.zarr", mode="w", zarr_version=2),
+        ]:
+            ad.io.write_elem(store, "obs", df)
+
+    def setup(self, store_type: Literal["zarr", "h5ad"], chunks: None | tuple[int]):
+        store = (
+            h5py.File("data.h5ad", mode="r")
+            if store_type == "h5ad"
+            else zarr.open("data.zarr")
+        )
-        store = gen_store()
-        ad.io.write_elem(store, "obs", df)
         self.ds = ad.experimental.read_elem_lazy(store["obs"], chunks=chunks)
+        self.n_obs = self.ds.shape[0]
 
     def time_getitem_slice(self, *_):
         self.ds.iloc[0 : (self.n_obs // 2)].to_memory()
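Replacing the lambda store factories with plain string parameters keeps asv's parameter labels readable in reports and avoids passing callables between asv processes. A sketch of the dispatch idiom, assuming the same `data.h5ad`/`data.zarr` files written once by `setup_cache`:

```python
from typing import Literal

import h5py
import zarr


def open_store(store_type: Literal["zarr", "h5ad"]):
    # Dispatch on a string parameter instead of invoking a lambda factory;
    # the stores were already written by setup_cache.
    if store_type == "h5ad":
        return h5py.File("data.h5ad", mode="r")
    return zarr.open("data.zarr")
```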
60 changes: 12 additions & 48 deletions benchmarks/benchmarks/readwrite.py
@@ -38,52 +38,15 @@
 
 PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"
 
-# PBMC_3K_PATH = Path(__file__).parent / "data/pbmc3k_raw.h5ad"
-# PBMC_REDUCED_PATH = Path(__file__).parent / "10x_pbmc68k_reduced.h5ad"
-# BM_43K_CSR_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells.h5ad"
-# BM_43K_CSC_PATH = Path(__file__).parent.parent / "datasets/BM2_43k-cells_CSC.h5ad"
-
-
-# class ZarrReadSuite:
-#     params = []
-#     param_names = ["input_url"]
-
-#     def setup(self, input_url):
-#         self.filepath = pooch.retrieve(url=input_url, known_hash=None)
-
-#     def time_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def peakmem_read_full(self, input_url):
-#         anndata.read_zarr(self.filepath)
-
-#     def mem_readfull_object(self, input_url):
-#         return anndata.read_zarr(self.filepath)
-
-#     def track_read_full_memratio(self, input_url):
-#         mem_recording = memory_usage(
-#             (sedate(anndata.read_zarr, 0.005), (self.filepath,)), interval=0.001
-#         )
-#         adata = anndata.read_zarr(self.filepath)
-#         base_size = mem_recording[-1] - mem_recording[0]
-#         print(np.max(mem_recording) - np.min(mem_recording))
-#         print(base_size)
-#         return (np.max(mem_recording) - np.min(mem_recording)) / base_size
-
-#     def peakmem_read_backed(self, input_url):
-#         anndata.read_zarr(self.filepath, backed="r")
-
-#     def mem_read_backed_object(self, input_url):
-#         return anndata.read_zarr(self.filepath, backed="r")
-
 
 class H5ADInMemorySizeSuite:
-    _urls = MappingProxyType(dict(pbmc3k=PBMC_3K_URL))
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_in_mem.h5ad"
 
-    def setup(self, input_data: str):
-        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )
 
     def track_in_memory_size(self, *_):
         adata = anndata.read_h5ad(self.filepath)
@@ -99,12 +62,13 @@ def track_actual_in_memory_size(self, *_):
 
 
 class H5ADReadSuite:
-    _urls = MappingProxyType(dict(pbmc3k=PBMC_3K_URL))
-    params = _urls.keys()
-    param_names = ("input_data",)
+    filepath = "pbmc_read.h5ad"
 
-    def setup(self, input_data: str):
-        self.filepath = pooch.retrieve(url=self._urls[input_data], known_hash=None)
+    def setup_cache(self):
+        # Need to specify path because the working directory is special for asv
+        pooch.retrieve(
+            url=PBMC_3K_URL, known_hash=None, path=Path.cwd(), fname=self.filepath
+        )
 
     def time_read_full(self, *_):
         anndata.read_h5ad(self.filepath)
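Both suites now download the PBMC file once in `setup_cache`, pinning `path` and `fname` so the file lands in asv's per-benchmark working directory instead of pooch's default OS cache. A sketch of that call (the `fname` below is illustrative):

```python
from pathlib import Path

import pooch

PBMC_3K_URL = "https://falexwolf.de/data/pbmc3k_raw.h5ad"

filepath = pooch.retrieve(
    url=PBMC_3K_URL,
    known_hash=None,          # skip checksum verification
    path=Path.cwd(),          # asv gives each benchmark its own working directory
    fname="pbmc3k_raw.h5ad",  # deterministic name instead of a hash-derived one
)
```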
37 changes: 22 additions & 15 deletions benchmarks/benchmarks/sparse_dataset.py
@@ -21,7 +21,7 @@ def make_alternating_mask(n):
 
 
 class SparseCSRContiguousSlice:
-    _slices = MappingProxyType({
+    _indexers = MappingProxyType({
         "0:1000": slice(0, 1000),
         "0:9000": slice(0, 9000),
         ":9000:-1": slice(None, 9000, -1),
@@ -31,42 +31,49 @@ class SparseCSRContiguousSlice:
         "first": 0,
         "alternating": make_alternating_mask(10),
     })
+    filepath = "data.zarr"
     params = (
-        [
-            (10_000, 10_000),
-            # (10_000, 500)
-        ],
-        _slices.keys(),
+        list(_indexers.keys()),
         [True, False],
     )
-    param_names = ("shape", "slice", "use_dask")
+    param_names = (
+        "index",
+        "use_dask",
+    )
 
-    def setup(self, shape: tuple[int, int], slice: str, use_dask: bool):  # noqa: FBT001
+    def setup_cache(self):
         X = sparse.random(
-            *shape, density=0.01, format="csr", random_state=np.random.default_rng(42)
+            10_000,
+            10_000,
+            density=0.01,
+            format="csr",
+            random_state=np.random.default_rng(42),
         )
-        self.slice = self._slices[slice]
-        g = zarr.group()
+        g = zarr.group(self.filepath)
         write_elem(g, "X", X)
 
+    def setup(self, index: str, use_dask: bool):  # noqa: FBT001
+        g = zarr.open(self.filepath)
         self.x = read_elem_lazy(g["X"]) if use_dask else sparse_dataset(g["X"])
         self.adata = AnnData(self.x)
+        self.index = self._indexers[index]
 
     def time_getitem(self, *_):
-        res = self.x[self.slice]
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()
 
     def peakmem_getitem(self, *_):
-        res = self.x[self.slice]
+        res = self.x[self.index]
         if isinstance(res, DaskArray):
             res.compute()
 
     def time_getitem_adata(self, *_):
-        res = self.adata[self.slice]
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
 
     def peakmem_getitem_adata(self, *_):
-        res = self.adata[self.slice]
+        res = self.adata[self.index]
         if isinstance(res, DaskArray):
             res.compute()
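The `use_dask` branch matters because the two access paths differ in laziness: `read_elem_lazy` returns a Dask-backed array whose indexing only builds a task graph, while `sparse_dataset` reads from the store as soon as it is indexed. Hence the `isinstance(res, DaskArray)` guard before `compute()`. A sketch of the distinction (import paths assume a recent anndata and are my assumption; adjust for older versions):

```python
import zarr
from anndata.experimental import read_elem_lazy
from anndata.io import sparse_dataset

g = zarr.open("data.zarr")  # group written by setup_cache above

lazy = read_elem_lazy(g["X"])   # Dask-backed: indexing is deferred
eager = sparse_dataset(g["X"])  # backed dataset: indexing reads now

sub = lazy[0:1000]    # no I/O yet, just a task graph
sub = sub.compute()   # I/O happens here on the Dask path

sub2 = eager[0:1000]  # I/O happens immediately; nothing to compute()
```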
24 changes: 21 additions & 3 deletions benchmarks/benchmarks/utils.py
@@ -95,13 +95,31 @@ def gen_indexer(adata, dim, index_kind, ratio):
 
 def gen_adata(n_obs, n_var, attr_set):
     if "X-csr" in attr_set:
-        X = sparse.random(n_obs, n_var, density=0.1, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     elif "X-dense" in attr_set:
-        X = sparse.random(n_obs, n_var, density=0.1, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0.1,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
         X = X.toarray()
     else:
         # TODO: There's probably a better way to do this
-        X = sparse.random(n_obs, n_var, density=0, format="csr")
+        X = sparse.random(
+            n_obs,
+            n_var,
+            density=0,
+            format="csr",
+            random_state=np.random.default_rng(42),
+        )
     adata = AnnData(X)
     if "obs,var" in attr_set:
         adata.obs = pd.DataFrame(
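Seeding `scipy.sparse.random` with `np.random.default_rng(42)` makes `gen_adata` deterministic, so successive asv runs time operations on the same matrix rather than a fresh random one. A quick illustration of the effect:

```python
import numpy as np
from scipy import sparse


def make_x(seed: int = 42):
    # Same seed -> identical matrix on every call.
    return sparse.random(
        100, 50, density=0.1, format="csr",
        random_state=np.random.default_rng(seed),
    )


assert (make_x() != make_x()).nnz == 0  # no differing entries
```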