Merged
50 commits
4479b94
move core xarray support to _core
ilia-kats Apr 9, 2025
098beb0
allow XArray Datasets in obs, var, obsm, and varm
ilia-kats Apr 9, 2025
8a60353
reorganize to register all singledispatches at anndata import
ilia-kats Apr 9, 2025
3ce1624
fix remaining bugs to make tests pass
ilia-kats Apr 10, 2025
02c8355
add in-memory xarray Dataset to tests and make tests pass
ilia-kats Apr 10, 2025
3862745
add tests for var/obs as in-memory XDatasets and make them pass
ilia-kats Apr 11, 2025
9592203
move fake index handling into Dataset2D as much as possible
ilia-kats Apr 11, 2025
1a0b3f9
add release notes
ilia-kats Apr 11, 2025
3eb33bb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 11, 2025
801902c
fix linter errors
ilia-kats Apr 11, 2025
4b7867e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 11, 2025
bc88f12
more linter fixes
ilia-kats Apr 11, 2025
5344e80
add xarray to test dependencies
ilia-kats Apr 11, 2025
80563bb
fix docs
ilia-kats Apr 11, 2025
938f0b3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 11, 2025
01386a4
bump min. xarray version to 2024.10.0
ilia-kats Apr 14, 2025
ff21784
bump min pandas version for tests to 2.1.0 to satisfy CI
ilia-kats Apr 14, 2025
522d679
fix min-deps.py script
ilia-kats Apr 14, 2025
8e5d122
set the true index to the column specified in the _index attr
ilia-kats Apr 14, 2025
d00b7a0
fix min-deps.py docstring
ilia-kats Apr 14, 2025
bb41f0a
attempt to fix docs build
ilia-kats Apr 14, 2025
d7f3b40
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 14, 2025
8e501fb
type_checking fixes
ilia-kats Apr 14, 2025
07b63d0
more docs fixes
ilia-kats Apr 14, 2025
6cbbaf8
s/XArray/XDataArray/
ilia-kats Apr 24, 2025
e63a17f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 24, 2025
881a46e
Merge branch 'main' into xarray_dataset
ilan-gold Apr 24, 2025
efba502
add force_lazy argument to concat
ilia-kats Apr 24, 2025
0f506f2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 24, 2025
6998f23
fix linter warnings
ilia-kats Apr 25, 2025
d6b8f7f
add tests for Dataset2D
ilia-kats Apr 25, 2025
4a8c4fc
fix minimum awkward version
ilia-kats Apr 25, 2025
9235625
remove unreachable code
ilia-kats May 6, 2025
f1f0d6e
properly version-gate xfailing test
ilia-kats May 6, 2025
8c34999
(fix): small fix for new xarray + test deps
ilan-gold May 6, 2025
71e378c
Merge pull request #1 from scverse/ig/xarray_dataset_updates
ilia-kats May 6, 2025
e8631a1
add back min. pandas version for tests
ilia-kats May 6, 2025
dc1805d
(chore): add `obsm` access test
ilan-gold May 6, 2025
98922b2
Merge pull request #2 from scverse/ig/obsm_access_test
ilia-kats May 6, 2025
a235fcb
Merge branch 'main' into xarray_dataset
ilan-gold May 7, 2025
cce1275
(fix): only use minimal dependencies for min
ilan-gold May 7, 2025
7384e22
(fix): add pandas as `test` min dep
ilan-gold May 7, 2025
c248e5c
(fix): docs
ilan-gold May 9, 2025
f4e332b
Merge pull request #4 from scverse/ig/fix_docs
ilia-kats May 9, 2025
caf42c2
Merge pull request #3 from scverse/ig/change_pandas_min
ilia-kats May 9, 2025
97cbacc
ci/min-deps.py: correctly handle same dependency with different extras
ilia-kats May 9, 2025
01194d9
reorganize concat label handling
ilia-kats May 12, 2025
3a12723
Update src/anndata/_core/merge.py
ilan-gold May 12, 2025
40567aa
Apply suggestions from code review
ilia-kats May 12, 2025
6c081c7
Merge branch 'main' into xarray_dataset
ilan-gold May 12, 2025
2 changes: 1 addition & 1 deletion .github/workflows/test-cpu.yml
@@ -62,7 +62,7 @@ jobs:
        if: matrix.dependencies-version == 'minimum'
        run: |
          uv pip install --system --compile tomli packaging
          deps=$(python3 ci/scripts/min-deps.py pyproject.toml --extra dev test)
          deps=$(python3 ci/scripts/min-deps.py pyproject.toml --extra dev test-min)
          uv pip install --system --compile $deps "anndata @ ."

      - name: Install dependencies release candidates
11 changes: 8 additions & 3 deletions ci/scripts/min-deps.py
@@ -34,7 +34,7 @@ def min_dep(req: Requirement) -> Requirement:
    -------

    >>> min_dep(Requirement("numpy>=1.0"))
    <Requirement('numpy==1.0.*')>
    <Requirement('numpy~=1.0.0')>
    >>> min_dep(Requirement("numpy<3.0"))
    <Requirement('numpy<3.0')>
    """
@@ -55,7 +55,7 @@ def min_dep(req: Requirement) -> Requirement:
    elif spec.operator == "==":
        min_version = Version(spec.version)

    return Requirement(f"{req_name}=={min_version}.*")
    return Requirement(f"{req_name}~={min_version}.0")
Member:
The == was intentional. If a patch version has issues, the minimum version should be bumped to a version that makes the tests pass.

Not everything uses semver, especially not in Python land.

We can discuss this of course; if there's a good reason to make this change we can make it. I'm just saying that this needs to be motivated.
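
For context, a quick sketch (not part of the diff) of how the two pin styles differ under `packaging`'s specifier rules:

```python
from packaging.specifiers import SpecifierSet

# For a declared minimum of 1.26.2: `==1.26.*` admits any 1.26.x,
# while `~=1.26.2` (compatible release) expands to ">=1.26.2, ==1.26.*",
# keeping the declared patch version as a lower bound.
print("1.26.0" in SpecifierSet("==1.26.*"))  # True
print("1.26.0" in SpecifierSet("~=1.26.2"))  # False
print("1.26.4" in SpecifierSet("~=1.26.2"))  # True
print("1.27.0" in SpecifierSet("~=1.26.2"))  # False
```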



def extract_min_deps(
@@ -64,6 +64,7 @@ def extract_min_deps(
    dependencies = deque(dependencies)  # We'll be mutating this
    project_name = pyproject["project"]["name"]

    deps = {}
    while len(dependencies) > 0:
        req = dependencies.pop()

@@ -76,7 +77,11 @@ def extract_min_deps(
            extra_deps = pyproject["project"]["optional-dependencies"][extra]
            dependencies += map(Requirement, extra_deps)
        else:
            yield min_dep(req)
            if req.name in deps:
                req.specifier &= deps[req.name].specifier
                req.extras |= deps[req.name].extras
            deps[req.name] = min_dep(req)
    yield from deps.values()


class Args(argparse.Namespace):
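
The merge step is easier to see outside the diff; a standalone sketch (illustrative only, using `packaging` directly, with made-up requirements):

```python
from packaging.requirements import Requirement

# Two occurrences of the same package, e.g. pulled in via different
# extras, are folded into one requirement before the minimum is pinned.
req = Requirement("dask[distributed]>=2023.5.1,<2025.2.0")
seen = Requirement("dask[array]>=2023.5.1")
req.specifier &= seen.specifier  # intersect the version constraints
req.extras |= seen.extras        # union the extras
print(req)  # dask[array,distributed] with the combined specifier set
```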
2 changes: 1 addition & 1 deletion docs/api.md
@@ -180,7 +180,7 @@ Types used by the former:
experimental.StorageType
experimental.backed._lazy_arrays.MaskedArray
experimental.backed._lazy_arrays.CategoricalArray
experimental.backed._xarray.Dataset2D
_core.xarray.Dataset2D
Member:
No public exports should start with an underscore; this should be exported somewhere else.

We talked about

  • anndata.types for types that are not ABCs but intended to be used in isinstance checks
  • anndata.typing for types that are only for annotations

Member:
I already had this comment as a pending review but never submitted it :(

Contributor:
Yes, apologies you are 100% right.

```

(extensions-api)=
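
A hedged sketch of the `types`/`typing` split proposed in the thread above (the naming is the reviewer's proposal, not an existing anndata API):

```python
from typing import Union

import numpy as np
import pandas as pd

# "types"-style: concrete classes, suitable for isinstance checks;
# Dataset2D would join this tuple once it has a public home.
DATAFRAME_TYPES: tuple[type, ...] = (pd.DataFrame,)

# "typing"-style: annotation-only union (forward ref, never isinstance'd)
DataFrameLike = Union[pd.DataFrame, "Dataset2D"]

def is_dataframe_like(x: object) -> bool:
    return isinstance(x, DATAFRAME_TYPES)

print(is_dataframe_like(pd.DataFrame({"a": [1]})))  # True
print(is_dataframe_like(np.zeros(3)))               # False
```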
1 change: 1 addition & 0 deletions docs/conf.py
@@ -142,6 +142,7 @@ def setup(app: Sphinx):
    "anndata.compat.DaskArray": "dask.array.Array",
    "anndata.compat.CupyArray": "cupy.ndarray",
    "anndata.compat.CupySparseMatrix": "cupyx.scipy.sparse.spmatrix",
    "anndata.compat.XDataArray": "xarray.DataArray",
    "awkward.highlevel.Array": "ak.Array",
    "numpy.int64": ("py:attr", "numpy.int64"),
    "pandas.DataFrame.iloc": ("py:attr", "pandas.DataFrame.iloc"),
2 changes: 1 addition & 1 deletion docs/release-notes/0.12.0rc1.md
@@ -10,7 +10,7 @@

#### Bug fixes

- Disallow writing of {class}`~anndata.experimental.backed._xarray.Dataset2D` objects {user}`ilan-gold` ({pr}`1887`)
- Disallow writing of {class}`~anndata._core.xarray.Dataset2D` objects {user}`ilan-gold` ({pr}`1887`)
- Upgrade old deprecation warning to a `FutureWarning` on `BaseCompressedSparseDataset.__setitem__`, showing our intent to remove the feature in the next release. {user}`ilan-gold` ({pr}`1928`)
- Don't use {func}`asyncio.run` internally for any operations {user}`ilan-gold` ({pr}`1933`)
- Disallow forward slashes in keys for writing {user}`ilan-gold` ({pr}`1940`)
1 change: 1 addition & 0 deletions docs/release-notes/1966.feature.md
@@ -0,0 +1 @@
Allow xarray Datasets to be used for obs/var/obsm/varm. {user}`ilia-kats`
12 changes: 7 additions & 5 deletions pyproject.toml
@@ -79,11 +79,9 @@ doc = [
    "sphinx_design>=0.5.0",
    # for unreleased changes
    "anndata[dev-doc,dask]",
    "awkward>=2.3",
]
dev-doc = [ "towncrier>=24.8.0" ] # release notes tool
test-full = [ "anndata[test,lazy]" ]
test = [
test-min = [
    "loompy>=3.0.5",
    "pytest>=8.2,<8.3.4",
    "pytest-cov",
@@ -100,15 +98,19 @@ test = [
    "scanpy>=1.10",
    "httpx", # For data downloading
    "dask[distributed]",
    "awkward>=2.3",
    "awkward>=2.3.2",
    "pyarrow",
    "anndata[dask]",
]
test = [
    "anndata[test-min,lazy]",
    "pandas>=2.1.0",
] # pandas 2.1.0 needs to be specified for xarray to work with min-deps script
gpu = [ "cupy" ]
cu12 = [ "cupy-cuda12x" ]
cu11 = [ "cupy-cuda11x" ]
# requests and aiohttp needed for zarr remote data
lazy = [ "xarray>=2024.06.0", "aiohttp", "requests", "anndata[dask]" ]
lazy = [ "xarray>=2025.04.0", "aiohttp", "requests", "anndata[dask]" ]
# https://github.com/dask/dask/issues/11290
# https://github.com/dask/dask/issues/11752
dask = [ "dask[array]>=2023.5.1,!=2024.8.*,!=2024.9.*,<2025.2.0" ]
30 changes: 28 additions & 2 deletions src/anndata/_core/aligned_df.py
@@ -9,6 +9,8 @@
from pandas.api.types import is_string_dtype

from .._warnings import ImplicitModificationWarning
from ..compat import XDataset
from .xarray import Dataset2D

if TYPE_CHECKING:
    from collections.abc import Iterable
@@ -108,15 +110,39 @@ def _mk_df_error(
    expected: int,
    actual: int,
):
    what = "row" if attr == "obs" else "column"
    if source == "X":
        what = "row" if attr == "obs" else "column"
        msg = (
            f"Observations annot. `{attr}` must have as many rows as `X` has {what}s "
            f"({expected}), but has {actual} rows."
        )
    else:
        msg = (
            f"`shape` is inconsistent with `{attr}` "
            "({actual} {what}s instead of {expected})"
            f"({actual} {what}s instead of {expected})"
        )
    return ValueError(msg)


@_gen_dataframe.register(Dataset2D)
def _gen_dataframe_xr(
    anno: Dataset2D,
    index_names: Iterable[str],
    *,
    source: Literal["X", "shape"],
    attr: Literal["obs", "var"],
    length: int | None = None,
):
    return anno


@_gen_dataframe.register(XDataset)
def _gen_dataframe_xdataset(
    anno: XDataset,
    index_names: Iterable[str],
    *,
    source: Literal["X", "shape"],
    attr: Literal["obs", "var"],
    length: int | None = None,
):
    return Dataset2D(anno)
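
Taken together, these registrations are what let an `xarray.Dataset` pass through AnnData's dataframe validation. A rough usage sketch (assuming this branch; the `obs_names` dimension/coordinate naming is illustrative):

```python
import numpy as np
import xarray as xr
from anndata import AnnData

# An xarray Dataset used directly as `obs`: _gen_dataframe now wraps
# it in a Dataset2D instead of raising.
obs = xr.Dataset(
    {"cell_type": ("obs_names", np.array(["B", "T", "T"]))},
    coords={"obs_names": np.array(["c0", "c1", "c2"])},
)
adata = AnnData(X=np.eye(3, dtype=np.float32), obs=obs)
print(type(adata.obs))  # Dataset2D (wrapping the Dataset)
```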
10 changes: 8 additions & 2 deletions src/anndata/_core/aligned_mapping.py
@@ -11,7 +11,7 @@
import pandas as pd

from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning
from ..compat import AwkArray, CSArray, CSMatrix, CupyArray
from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset
from ..utils import (
    axis_len,
    convert_to_dict,
@@ -23,6 +23,7 @@
from .index import _subset
from .storage import coerce_array
from .views import as_view, view_update
from .xarray import Dataset2D

if TYPE_CHECKING:
    from collections.abc import Callable, Iterable, Iterator, Mapping
@@ -75,8 +76,10 @@ def _validate_value(self, val: Value, key: str) -> Value:
                ExperimentalFeatureWarning,
                # stacklevel=3,
            )
        if isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1:
        elif isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1:
            val = val.reshape((val.shape[0], 1))
        elif isinstance(val, XDataset):
            val = Dataset2D(data_vars=val.data_vars, coords=val.coords, attrs=val.attrs)
        for i, axis in enumerate(self.axes):
            if self.parent.shape[axis] == axis_len(val, i):
                continue
@@ -275,6 +278,9 @@ def _validate_value(self, val: Value, key: str) -> Value:
        else:
            msg = "Index.equals and pd.testing.assert_index_equal disagree"
            raise AssertionError(msg)
        val.index.name = (
            self.dim_names.name
        )  # this is consistent with AnnData.obsm.setter and AnnData.varm.setter
        return super()._validate_value(val, key)

    @property
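
A sketch of the new `obsm` path enabled by the `XDataset` branch above (again assuming this branch; names are illustrative, and validation aligns the index name with the parent's dim names as the code shows):

```python
import numpy as np
import xarray as xr
from anndata import AnnData

adata = AnnData(X=np.zeros((3, 2), dtype=np.float32))
# An xarray Dataset assigned to obsm is converted to Dataset2D in
# _validate_value; its first axis must match n_obs.
adata.obsm["qc"] = xr.Dataset(
    {"score": ("obs_names", np.random.default_rng(0).random(3))},
    coords={"obs_names": adata.obs_names.to_numpy()},
)
```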
31 changes: 22 additions & 9 deletions src/anndata/_core/anndata.py
@@ -47,6 +47,7 @@
    _resolve_idxs,
    as_view,
)
from .xarray import Dataset2D

if TYPE_CHECKING:
    from collections.abc import Iterable
@@ -55,7 +56,7 @@

    from zarr.storage import StoreLike

    from ..compat import Index1D
    from ..compat import Index1D, XDataset
    from ..typing import XDataType
    from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView
    from .index import Index
@@ -746,10 +747,14 @@ def n_vars(self) -> int:
        """Number of variables/features."""
        return len(self.var_names)

    def _set_dim_df(self, value: pd.DataFrame, attr: Literal["obs", "var"]):
        if not isinstance(value, pd.DataFrame):
            msg = f"Can only assign pd.DataFrame to {attr}."
            raise ValueError(msg)
    def _set_dim_df(self, value: pd.DataFrame | XDataset, attr: Literal["obs", "var"]):
        value = _gen_dataframe(
            value,
            [f"{attr}_names", f"{'row' if attr == 'obs' else 'col'}_names"],
            source="shape",
            attr=attr,
            length=self.n_obs if attr == "obs" else self.n_vars,
        )
        raise_value_error_if_multiindex_columns(value, attr)
        value_idx = self._prep_dim_index(value.index, attr)
        if self.is_view:
@@ -804,12 +809,12 @@ def _set_dim_index(self, value: pd.Index, attr: str):
            v.index = value

    @property
    def obs(self) -> pd.DataFrame:
    def obs(self) -> pd.DataFrame | Dataset2D:
        """One-dimensional annotation of observations (`pd.DataFrame`)."""
        return self._obs

    @obs.setter
    def obs(self, value: pd.DataFrame):
    def obs(self, value: pd.DataFrame | XDataset):
        self._set_dim_df(value, "obs")

    @obs.deleter
@@ -827,12 +832,12 @@ def obs_names(self, names: Sequence[str]):
        self._set_dim_index(names, "obs")

    @property
    def var(self) -> pd.DataFrame:
    def var(self) -> pd.DataFrame | Dataset2D:
        """One-dimensional annotation of variables/ features (`pd.DataFrame`)."""
        return self._var

    @var.setter
    def var(self, value: pd.DataFrame):
    def var(self, value: pd.DataFrame | XDataset):
        self._set_dim_df(value, "var")

    @var.deleter
@@ -2079,6 +2084,14 @@ def _get_and_delete_multicol_field(self, a, key_multicol):
        return values


@AnnData._remove_unused_categories.register(Dataset2D)
@staticmethod
def _remove_unused_categories_xr(
    df_full: Dataset2D, df_sub: Dataset2D, uns: dict[str, Any]
):
    pass  # this is handled automatically by the categorical arrays themselves, i.e. they dedup upon access.


def _check_2d_shape(X):
    """\
    Check shape of array or sparse matrix.
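
Since `obs`/`var` can now return either a `pandas.DataFrame` or a `Dataset2D`, downstream code that needs pandas semantics has to branch. A defensive sketch (the `to_memory()` call mirrors the `file_backing.py` registration below; that it materializes to something pandas-compatible is an assumption):

```python
import pandas as pd

from anndata._core.xarray import Dataset2D  # private location as of this PR

def obs_as_pandas(adata) -> pd.DataFrame:
    # Dataset2D exposes to_memory(); we assume the result is an
    # in-memory, pandas-compatible object.
    if isinstance(adata.obs, Dataset2D):
        return adata.obs.to_memory()
    return adata.obs
```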
6 changes: 6 additions & 0 deletions src/anndata/_core/file_backing.py
@@ -10,6 +10,7 @@

from ..compat import AwkArray, DaskArray, ZarrArray, ZarrGroup
from .sparse_dataset import BaseCompressedSparseDataset
from .xarray import Dataset2D

if TYPE_CHECKING:
    from collections.abc import Iterator
@@ -162,6 +163,11 @@ def _(x: AwkArray, *, copy: bool = False):
    return x


@to_memory.register(Dataset2D)
def _(x: Dataset2D, *, copy: bool = False):
    return x.to_memory(copy=copy)


@singledispatch
def filename(x):
    msg = f"Not implemented for {type(x)}"
16 changes: 12 additions & 4 deletions src/anndata/_core/index.py
@@ -10,7 +10,8 @@
import pandas as pd
from scipy.sparse import issparse

from ..compat import AwkArray, CSArray, CSMatrix, DaskArray
from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray
from .xarray import Dataset2D

if TYPE_CHECKING:
    from ..compat import Index, Index1D
@@ -44,8 +45,6 @@ def _normalize_index(  # noqa: PLR0911, PLR0912
    | pd.Index,
    index: pd.Index,
) -> slice | int | np.ndarray:  # ndarray of int or bool
    from ..experimental.backed._compat import DataArray

    # TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough.
    if not isinstance(index, pd.RangeIndex) and index.dtype in (np.float64, np.int64):
        msg = f"Don’t call _normalize_index with non-categorical/string names and non-range index {index}"
@@ -112,7 +111,7 @@ def name_idx(i):
            )
            raise KeyError(msg)
        return positions  # np.ndarray[int]
    elif isinstance(indexer, DataArray):
    elif isinstance(indexer, XDataArray):
        if isinstance(indexer.data, DaskArray):
            return indexer.data.compute()
        return indexer.data
@@ -210,6 +209,15 @@ def _subset_awkarray(a: AwkArray, subset_idx: Index):
    return a[subset_idx]


@_subset.register(Dataset2D)
def _(a: Dataset2D, subset_idx: Index):
    key = a.index_dim
    # xarray seems to have some code looking for a second entry in tuples
    if isinstance(subset_idx, tuple) and len(subset_idx) == 1:
        subset_idx = subset_idx[0]
    return a.isel(**{key: subset_idx})


# Registration for SparseDataset occurs in sparse_dataset.py
@_subset.register(h5py.Dataset)
def _subset_dataset(d, subset_idx):
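
Finally, a small sketch of what the `_subset` registration delegates to: plain `xarray.Dataset.isel` along the single index dimension (pure xarray, runnable without this branch; names are illustrative):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"cell_type": ("obs_names", np.array(["B", "T", "NK", "T"]))},
    coords={"obs_names": ["c0", "c1", "c2", "c3"]},
)
# Positional subsetting, as in a.isel(**{key: subset_idx}) above
sub = ds.isel(obs_names=np.array([0, 2]))
print(sub["cell_type"].values)  # ['B' 'NK']
```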