From 01e75181ee904282e656ee01180c2d1d3e679239 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 24 Oct 2024 17:48:00 -0400 Subject: [PATCH 001/112] new blank whatsnew --- doc/whats-new.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9a451a836ad..18fae4e0151 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,34 @@ What's New np.random.seed(123456) +.. _whats-new.2024.10.1: + +v.2024.10.1 (unreleased) +------------------------ + +New Features +~~~~~~~~~~~~ + + +Breaking changes +~~~~~~~~~~~~~~~~ + + +Deprecations +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + .. _whats-new.2024.10.0: v2024.10.0 (Oct 24th, 2024) From e6b3b3bd777b423435241c67a9482187a86a2256 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 15 May 2025 17:55:20 -0400 Subject: [PATCH 002/112] test async load using special zarr LatencyStore --- pyproject.toml | 1 + xarray/tests/test_async.py | 106 +++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 xarray/tests/test_async.py diff --git a/pyproject.toml b/pyproject.toml index fa087abbc13..7dc784f170f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ dev = [ "pytest-mypy-plugins", "pytest-timeout", "pytest-xdist", + "pytest-asyncio", "ruff>=0.8.0", "sphinx", "sphinx_autosummary_accessors", diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py new file mode 100644 index 00000000000..78a78a95c4c --- /dev/null +++ b/xarray/tests/test_async.py @@ -0,0 +1,106 @@ +from typing import TypeVar, Iterable +import asyncio +import time + +import pytest +import numpy as np + +from xarray.tests import has_zarr_v3, requires_zarr_v3 +import xarray as xr + + +if has_zarr_v3: + import zarr + from zarr.abc.store import Store + from zarr.storage import MemoryStore + from zarr.storage._wrapper import WrapperStore + + from 
zarr.abc.store import ByteRequest + from zarr.core.buffer import Buffer, BufferPrototype + + T_Store = TypeVar("T_Store", bound=Store) + + + class LatencyStore(WrapperStore[T_Store]): + """Works the same way as the zarr LoggingStore""" + latency: float + + def __init__( + self, + store: T_Store, + latency: float = 0.0, + ) -> None: + """ + Store wrapper that adds artificial latency to each get call. + + Parameters + ---------- + store : Store + Store to wrap + latency : float + Amount of artificial latency to add to each get call, in seconds. + """ + super().__init__(store) + self.latency = latency + + def __str__(self) -> str: + return f"latency-{self._store}" + + def __repr__(self) -> str: + return f"LatencyStore({self._store.__class__.__name__}, '{self._store}', latency={self.latency})" + + async def get( + self, + key: str, + prototype: BufferPrototype, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + await asyncio.sleep(self.latency) + return await self._store.get(key=key, prototype=prototype, byte_range=byte_range) + + async def get_partial_values( + self, + prototype: BufferPrototype, + key_ranges: Iterable[tuple[str, ByteRequest | None]], + ) -> list[Buffer | None]: + await asyncio.sleep(self.latency) + return await self._store.get_partial_values(prototype=prototype, key_ranges=key_ranges) +else: + LatencyStore = {} + + +@pytest.fixture +def memorystore() -> "MemoryStore": + memorystore = zarr.storage.MemoryStore({}) + z = zarr.create_array( + store=memorystore, + name="foo", + shape=(10, 10), + chunks=(5, 5), + dtype="f4", + dimension_names=["x", "y"] + ) + z[:, :] = np.random.random((10, 10)) + + return memorystore + + +@requires_zarr_v3 +@pytest.mark.asyncio +async def test_async_load(memorystore): + N_DATASETS = 3 + LATENCY = 1.0 + + latencystore = LatencyStore(memorystore, latency=LATENCY) + datasets = [xr.open_zarr(latencystore, zarr_format=3, consolidated=False) for _ in range(N_DATASETS)] + + start_time = time.time() + # TODO 
actually implement the async.load method + #tasks = [ds.async.load() for ds in datasets] + #results = await asyncio.gather(*tasks) + results = [ds.load() for ds in datasets] + total_time = time.time() - start_time + + assert total_time > LATENCY # Cannot possibly be quicker than this + assert total_time < LATENCY * N_DATASETS # If this isn't true we're gaining nothing from async + assert abs(total_time - LATENCY) < 0.5 # Should take approximately LATENCY seconds, but allow some buffer From 3ceab60f79201cfd46c0381731d1f188eda816cf Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 16 May 2025 11:53:32 -0400 Subject: [PATCH 003/112] don't use dask --- xarray/tests/test_async.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 78a78a95c4c..c28680cd1dd 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -88,19 +88,22 @@ def memorystore() -> "MemoryStore": @requires_zarr_v3 @pytest.mark.asyncio async def test_async_load(memorystore): - N_DATASETS = 3 + N_DATASETS = 10 LATENCY = 1.0 latencystore = LatencyStore(memorystore, latency=LATENCY) - datasets = [xr.open_zarr(latencystore, zarr_format=3, consolidated=False) for _ in range(N_DATASETS)] + datasets = [xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) for _ in range(N_DATASETS)] + # TODO add async load to Dataset and DataArray as well as to Variable start_time = time.time() - # TODO actually implement the async.load method - #tasks = [ds.async.load() for ds in datasets] - #results = await asyncio.gather(*tasks) - results = [ds.load() for ds in datasets] + tasks = [ds['foo'].variable.async_load() for ds in datasets] + results = await asyncio.gather(*tasks) + #results = [ds['foo'].variable.load() for ds in datasets] total_time = time.time() - start_time assert total_time > LATENCY # Cannot possibly be quicker than this assert total_time < LATENCY * N_DATASETS # If this isn't 
true we're gaining nothing from async assert abs(total_time - LATENCY) < 0.5 # Should take approximately LATENCY seconds, but allow some buffer + + print(total_time) + assert False \ No newline at end of file From 071c35a19914b2974523e46fa43598b97b7d5777 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 16 May 2025 11:53:44 -0400 Subject: [PATCH 004/112] async all the way down --- xarray/backends/common.py | 6 +++++ xarray/backends/zarr.py | 22 +++++++++++++++++ xarray/core/indexing.py | 46 +++++++++++++++++++++++++++++++++++ xarray/core/variable.py | 6 +++++ xarray/namedarray/pycompat.py | 23 ++++++++++++++++++ 5 files changed, 103 insertions(+) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 58a98598a5b..c31a3caaf81 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -273,6 +273,12 @@ class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): def get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) return self[key] # type: ignore[index] + + async def async_get_duck_array(self, dtype: np.typing.DTypeLike = None): + key = indexing.BasicIndexer((slice(None),) * self.ndim) + # TODO use zarr-python async get method here? + print("async inside BackendArray") + return await self.getitem(key) # type: ignore[index] class AbstractDataStore: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1a46346dda7..dbbf93d125f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -185,6 +185,8 @@ class ZarrArrayWrapper(BackendArray): def __init__(self, zarr_array): # some callers attempt to evaluate an array if an `array` property exists on the object. # we prefix with _ to avoid this inference. + + # TODO type hint this? 
self._array = zarr_array self.shape = self._array.shape @@ -211,6 +213,10 @@ def _vindex(self, key): def _getitem(self, key): return self._array[key] + + async def _async_getitem(self, key): + async_array = self._array._async_array + return await async_array.getitem(key) def __getitem__(self, key): array = self._array @@ -227,6 +233,22 @@ def __getitem__(self, key): # if self.ndim == 0: # could possibly have a work-around for 0d data here + async def async_getitem(self, key): + # this doesn't need to be async + array = self._array + if isinstance(key, indexing.BasicIndexer): + method = self._async_getitem + elif isinstance(key, indexing.VectorizedIndexer): + # TODO + method = self._vindex + elif isinstance(key, indexing.OuterIndexer): + # TODO + method = self._oindex + + print("did an async get") + return await indexing.async_explicit_indexing_adapter( + key, array.shape, indexing.IndexingSupport.VECTORIZED, method + ) def _determine_zarr_chunks( enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index c1b847202c7..2adea07e96c 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -523,6 +523,10 @@ def get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) return self[key] + async def async_get_duck_array(self): + key = BasicIndexer((slice(None),) * self.ndim) + return self[key] + def _oindex_get(self, indexer: OuterIndexer): raise NotImplementedError( f"{self.__class__.__name__}._oindex_get method should be overridden" @@ -661,6 +665,22 @@ def get_duck_array(self): array = array.get_duck_array() return _wrap_numpy_scalars(array) + async def async_get_duck_array(self): + if isinstance(self.array, ExplicitlyIndexedNDArrayMixin): + array = apply_indexer(self.array, self.key) + else: + # If the array is not an ExplicitlyIndexedNDArrayMixin, + # it may wrap a BackendArray so use its (async) getitem + array = await self.array.async_getitem(self.key) + + # 
self.array[self.key] is now a numpy array when + # self.array is a BackendArray subclass + # and self.key is BasicIndexer((slice(None, None, None),)) + # so we need the explicit check for ExplicitlyIndexed + if isinstance(array, ExplicitlyIndexed): + array = await array.async_get_duck_array() + return _wrap_numpy_scalars(array) + def transpose(self, order): return LazilyVectorizedIndexedArray(self.array, self.key).transpose(order) @@ -797,6 +817,9 @@ def _ensure_copied(self): def get_duck_array(self): return self.array.get_duck_array() + async def async_get_duck_array(self): + return await self.array.async_get_duck_array() + def _oindex_get(self, indexer: OuterIndexer): return type(self)(_wrap_numpy_scalars(self.array.oindex[indexer])) @@ -839,10 +862,18 @@ def __init__(self, array): def _ensure_cached(self): self.array = as_indexable(self.array.get_duck_array()) + + async def _async_ensure_cached(self): + duck_array = await self.array.async_get_duck_array() + self.array = as_indexable(duck_array) def get_duck_array(self): self._ensure_cached() return self.array.get_duck_array() + + async def async_get_duck_array(self): + await self._async_ensure_cached() + return await self.array.async_get_duck_array() def _oindex_get(self, indexer: OuterIndexer): return type(self)(_wrap_numpy_scalars(self.array.oindex[indexer])) @@ -1027,6 +1058,21 @@ def explicit_indexing_adapter( return result +async def async_explicit_indexing_adapter( + key: ExplicitIndexer, + shape: _Shape, + indexing_support: IndexingSupport, + raw_indexing_method: Callable[..., Any], +) -> Any: + raw_key, numpy_indices = decompose_indexer(key, shape, indexing_support) + result = await raw_indexing_method(raw_key.tuple) + if numpy_indices.tuple: + # index the loaded duck array + indexable = as_indexable(result) + result = apply_indexer(indexable, numpy_indices) + return result + + def apply_indexer(indexable, indexer: ExplicitIndexer): """Apply an indexer to an indexable object.""" if isinstance(indexer, 
VectorizedIndexer): diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 4e58b0d4b20..9b184d1069e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -51,6 +51,7 @@ is_0d_dask_array, is_chunked_array, to_duck_array, + async_to_duck_array, ) from xarray.namedarray.utils import module_available from xarray.util.deprecation_helpers import _deprecate_positional_args, deprecate_dims @@ -956,6 +957,11 @@ def load(self, **kwargs): """ self._data = to_duck_array(self._data, **kwargs) return self + + async def async_load(self, **kwargs): + print("async inside Variable") + self._data = await async_to_duck_array(self._data, **kwargs) + return self def compute(self, **kwargs): """Manually trigger loading of this variable's data from disk or a diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index 68b6a7853bf..527b83fed15 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -145,3 +145,26 @@ def to_duck_array(data: Any, **kwargs: dict[str, Any]) -> duckarray[_ShapeType, return data else: return np.asarray(data) # type: ignore[return-value] + + +async def async_to_duck_array(data: Any, **kwargs: dict[str, Any]) -> duckarray[_ShapeType, _DType]: + from xarray.core.indexing import ( + ExplicitlyIndexed, + ImplicitToExplicitIndexingAdapter, + ) + from xarray.namedarray.parallelcompat import get_chunked_array_type + + print(type(data)) + + if is_chunked_array(data): + chunkmanager = get_chunked_array_type(data) + loaded_data, *_ = chunkmanager.compute(data, **kwargs) # type: ignore[var-annotated] + return loaded_data + + if isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): + print("async inside to_duck_array") + return await data.async_get_duck_array() # type: ignore[no-untyped-call, no-any-return] + elif is_duck_array(data): + return data + else: + return np.asarray(data) # type: ignore[return-value] From 29374f9e4f20056690c9a8ac6330f1a00ecaba59 Mon Sep 17 00:00:00 2001 
From: Tom Nicholas Date: Fri, 16 May 2025 12:02:09 -0400 Subject: [PATCH 005/112] remove assert False --- xarray/tests/test_async.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index c28680cd1dd..05422beea1f 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -88,22 +88,19 @@ def memorystore() -> "MemoryStore": @requires_zarr_v3 @pytest.mark.asyncio async def test_async_load(memorystore): - N_DATASETS = 10 + N_LOADS= 10 LATENCY = 1.0 latencystore = LatencyStore(memorystore, latency=LATENCY) - datasets = [xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) for _ in range(N_DATASETS)] + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) # TODO add async load to Dataset and DataArray as well as to Variable + # TODO change the syntax to `.async.load()`? start_time = time.time() - tasks = [ds['foo'].variable.async_load() for ds in datasets] + tasks = [ds['foo'].variable.async_load() for _ in range(N_LOADS)] results = await asyncio.gather(*tasks) - #results = [ds['foo'].variable.load() for ds in datasets] total_time = time.time() - start_time assert total_time > LATENCY # Cannot possibly be quicker than this - assert total_time < LATENCY * N_DATASETS # If this isn't true we're gaining nothing from async + assert total_time < LATENCY * N_LOADS # If this isn't true we're gaining nothing from async assert abs(total_time - LATENCY) < 0.5 # Should take approximately LATENCY seconds, but allow some buffer - - print(total_time) - assert False \ No newline at end of file From ab12bb8d01cabd910ac909aad4509b1ee70dfa4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 16:07:10 +0000 Subject: [PATCH 006/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/common.py | 
2 +- xarray/backends/zarr.py | 5 ++-- xarray/core/indexing.py | 4 +-- xarray/core/variable.py | 4 +-- xarray/namedarray/pycompat.py | 6 +++-- xarray/tests/test_async.py | 46 ++++++++++++++++++++--------------- 6 files changed, 38 insertions(+), 29 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index c31a3caaf81..edda5cff429 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -273,7 +273,7 @@ class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): def get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) return self[key] # type: ignore[index] - + async def async_get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) # TODO use zarr-python async get method here? diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index dbbf93d125f..f068826eef2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -185,7 +185,7 @@ class ZarrArrayWrapper(BackendArray): def __init__(self, zarr_array): # some callers attempt to evaluate an array if an `array` property exists on the object. # we prefix with _ to avoid this inference. - + # TODO type hint this? 
self._array = zarr_array self.shape = self._array.shape @@ -213,7 +213,7 @@ def _vindex(self, key): def _getitem(self, key): return self._array[key] - + async def _async_getitem(self, key): async_array = self._array._async_array return await async_array.getitem(key) @@ -250,6 +250,7 @@ async def async_getitem(self, key): key, array.shape, indexing.IndexingSupport.VECTORIZED, method ) + def _determine_zarr_chunks( enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape ): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 2adea07e96c..53f9bd12088 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -862,7 +862,7 @@ def __init__(self, array): def _ensure_cached(self): self.array = as_indexable(self.array.get_duck_array()) - + async def _async_ensure_cached(self): duck_array = await self.array.async_get_duck_array() self.array = as_indexable(duck_array) @@ -870,7 +870,7 @@ async def _async_ensure_cached(self): def get_duck_array(self): self._ensure_cached() return self.array.get_duck_array() - + async def async_get_duck_array(self): await self._async_ensure_cached() return await self.array.async_get_duck_array() diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9b184d1069e..cd3af386b95 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -47,11 +47,11 @@ from xarray.namedarray.core import NamedArray, _raise_if_any_duplicate_dimensions from xarray.namedarray.parallelcompat import get_chunked_array_type from xarray.namedarray.pycompat import ( + async_to_duck_array, integer_types, is_0d_dask_array, is_chunked_array, to_duck_array, - async_to_duck_array, ) from xarray.namedarray.utils import module_available from xarray.util.deprecation_helpers import _deprecate_positional_args, deprecate_dims @@ -957,7 +957,7 @@ def load(self, **kwargs): """ self._data = to_duck_array(self._data, **kwargs) return self - + async def async_load(self, **kwargs): print("async inside Variable") self._data = 
await async_to_duck_array(self._data, **kwargs) diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index 527b83fed15..c6a07e5963f 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -145,9 +145,11 @@ def to_duck_array(data: Any, **kwargs: dict[str, Any]) -> duckarray[_ShapeType, return data else: return np.asarray(data) # type: ignore[return-value] - -async def async_to_duck_array(data: Any, **kwargs: dict[str, Any]) -> duckarray[_ShapeType, _DType]: + +async def async_to_duck_array( + data: Any, **kwargs: dict[str, Any] +) -> duckarray[_ShapeType, _DType]: from xarray.core.indexing import ( ExplicitlyIndexed, ImplicitToExplicitIndexingAdapter, diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 05422beea1f..8523c41662a 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -1,28 +1,26 @@ -from typing import TypeVar, Iterable import asyncio import time +from collections.abc import Iterable +from typing import TypeVar -import pytest import numpy as np +import pytest -from xarray.tests import has_zarr_v3, requires_zarr_v3 import xarray as xr - +from xarray.tests import has_zarr_v3, requires_zarr_v3 if has_zarr_v3: import zarr - from zarr.abc.store import Store + from zarr.abc.store import ByteRequest, Store + from zarr.core.buffer import Buffer, BufferPrototype from zarr.storage import MemoryStore from zarr.storage._wrapper import WrapperStore - from zarr.abc.store import ByteRequest - from zarr.core.buffer import Buffer, BufferPrototype - T_Store = TypeVar("T_Store", bound=Store) - class LatencyStore(WrapperStore[T_Store]): """Works the same way as the zarr LoggingStore""" + latency: float def __init__( @@ -42,7 +40,7 @@ def __init__( """ super().__init__(store) self.latency = latency - + def __str__(self) -> str: return f"latency-{self._store}" @@ -56,15 +54,19 @@ async def get( byte_range: ByteRequest | None = None, ) -> Buffer | None: await 
asyncio.sleep(self.latency) - return await self._store.get(key=key, prototype=prototype, byte_range=byte_range) - + return await self._store.get( + key=key, prototype=prototype, byte_range=byte_range + ) + async def get_partial_values( self, prototype: BufferPrototype, key_ranges: Iterable[tuple[str, ByteRequest | None]], ) -> list[Buffer | None]: await asyncio.sleep(self.latency) - return await self._store.get_partial_values(prototype=prototype, key_ranges=key_ranges) + return await self._store.get_partial_values( + prototype=prototype, key_ranges=key_ranges + ) else: LatencyStore = {} @@ -76,9 +78,9 @@ def memorystore() -> "MemoryStore": store=memorystore, name="foo", shape=(10, 10), - chunks=(5, 5), + chunks=(5, 5), dtype="f4", - dimension_names=["x", "y"] + dimension_names=["x", "y"], ) z[:, :] = np.random.random((10, 10)) @@ -88,7 +90,7 @@ def memorystore() -> "MemoryStore": @requires_zarr_v3 @pytest.mark.asyncio async def test_async_load(memorystore): - N_LOADS= 10 + N_LOADS = 10 LATENCY = 1.0 latencystore = LatencyStore(memorystore, latency=LATENCY) @@ -97,10 +99,14 @@ async def test_async_load(memorystore): # TODO add async load to Dataset and DataArray as well as to Variable # TODO change the syntax to `.async.load()`? 
start_time = time.time() - tasks = [ds['foo'].variable.async_load() for _ in range(N_LOADS)] + tasks = [ds["foo"].variable.async_load() for _ in range(N_LOADS)] results = await asyncio.gather(*tasks) total_time = time.time() - start_time - + assert total_time > LATENCY # Cannot possibly be quicker than this - assert total_time < LATENCY * N_LOADS # If this isn't true we're gaining nothing from async - assert abs(total_time - LATENCY) < 0.5 # Should take approximately LATENCY seconds, but allow some buffer + assert ( + total_time < LATENCY * N_LOADS + ) # If this isn't true we're gaining nothing from async + assert ( + abs(total_time - LATENCY) < 0.5 + ) # Should take approximately LATENCY seconds, but allow some buffer From 62aa39dd81a64005891dea403eb7f778b2b67669 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 16 May 2025 14:30:46 -0400 Subject: [PATCH 007/112] add pytest-asyncio to CI envs --- ci/minimum_versions.py | 1 + ci/requirements/all-but-dask.yml | 1 + ci/requirements/all-but-numba.yml | 1 + ci/requirements/bare-minimum.yml | 1 + ci/requirements/environment-3.14.yml | 1 + ci/requirements/environment-windows-3.14.yml | 1 + ci/requirements/environment-windows.yml | 1 + ci/requirements/environment.yml | 1 + ci/requirements/min-all-deps.yml | 1 + 9 files changed, 9 insertions(+) diff --git a/ci/minimum_versions.py b/ci/minimum_versions.py index 08808d002d9..c4816c39a74 100644 --- a/ci/minimum_versions.py +++ b/ci/minimum_versions.py @@ -30,6 +30,7 @@ "coveralls", "pip", "pytest", + "pytest-asyncio" "pytest-cov", "pytest-env", "pytest-mypy-plugins", diff --git a/ci/requirements/all-but-dask.yml b/ci/requirements/all-but-dask.yml index ca4943bddb1..987adc7dfdd 100644 --- a/ci/requirements/all-but-dask.yml +++ b/ci/requirements/all-but-dask.yml @@ -28,6 +28,7 @@ dependencies: - pip - pydap - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/all-but-numba.yml b/ci/requirements/all-but-numba.yml index 
fa7ad81f198..1d49f92133c 100644 --- a/ci/requirements/all-but-numba.yml +++ b/ci/requirements/all-but-numba.yml @@ -41,6 +41,7 @@ dependencies: - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/bare-minimum.yml b/ci/requirements/bare-minimum.yml index 02e99d34af2..cc34a6e4824 100644 --- a/ci/requirements/bare-minimum.yml +++ b/ci/requirements/bare-minimum.yml @@ -7,6 +7,7 @@ dependencies: - coveralls - pip - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/environment-3.14.yml b/ci/requirements/environment-3.14.yml index 1e6ee7ff5f9..bfbeababa56 100644 --- a/ci/requirements/environment-3.14.yml +++ b/ci/requirements/environment-3.14.yml @@ -37,6 +37,7 @@ dependencies: - pyarrow # pandas raises a deprecation warning without this, breaking doctests - pydap - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/environment-windows-3.14.yml b/ci/requirements/environment-windows-3.14.yml index 4eb2049f2e6..d5143470614 100644 --- a/ci/requirements/environment-windows-3.14.yml +++ b/ci/requirements/environment-windows-3.14.yml @@ -32,6 +32,7 @@ dependencies: - pyarrow # importing dask.dataframe raises an ImportError without this - pydap - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 45cbebd38db..6aeca2cb0ab 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -32,6 +32,7 @@ dependencies: - pyarrow # importing dask.dataframe raises an ImportError without this - pydap - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index b4354b14f40..9c253d5d489 
100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -38,6 +38,7 @@ dependencies: - pydap - pydap-server - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins diff --git a/ci/requirements/min-all-deps.yml b/ci/requirements/min-all-deps.yml index 03e14773d53..1293f4d78d6 100644 --- a/ci/requirements/min-all-deps.yml +++ b/ci/requirements/min-all-deps.yml @@ -44,6 +44,7 @@ dependencies: - pip - pydap=3.5 - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins From a906decee5e8d08d605d671f373d59a233745917 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 18:31:20 +0000 Subject: [PATCH 008/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ci/minimum_versions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/minimum_versions.py b/ci/minimum_versions.py index c4816c39a74..4cc0b76916b 100644 --- a/ci/minimum_versions.py +++ b/ci/minimum_versions.py @@ -30,8 +30,7 @@ "coveralls", "pip", "pytest", - "pytest-asyncio" - "pytest-cov", + "pytest-asynciopytest-cov", "pytest-env", "pytest-mypy-plugins", "pytest-timeout", From 629ab31e2b13f51cd74acc0e4174959c8a85f3c1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 16 May 2025 16:15:09 -0400 Subject: [PATCH 009/112] assert results are identical --- xarray/tests/test_async.py | 57 ++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 8523c41662a..4a03ed738e3 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -7,6 +7,7 @@ import pytest import xarray as xr +import xarray.testing as xrt from xarray.tests import has_zarr_v3, requires_zarr_v3 if has_zarr_v3: @@ -84,29 +85,43 @@ def memorystore() -> "MemoryStore": ) z[:, :] = np.random.random((10, 10)) + z = 
zarr.create_array( + store=memorystore, + name="bar", + shape=(10,), + chunks=(5), + dtype="f4", + dimension_names=["x"], + ) + z[:] = np.random.random((10,)) + return memorystore @requires_zarr_v3 @pytest.mark.asyncio -async def test_async_load(memorystore): - N_LOADS = 10 - LATENCY = 1.0 - - latencystore = LatencyStore(memorystore, latency=LATENCY) - ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - - # TODO add async load to Dataset and DataArray as well as to Variable - # TODO change the syntax to `.async.load()`? - start_time = time.time() - tasks = [ds["foo"].variable.async_load() for _ in range(N_LOADS)] - results = await asyncio.gather(*tasks) - total_time = time.time() - start_time - - assert total_time > LATENCY # Cannot possibly be quicker than this - assert ( - total_time < LATENCY * N_LOADS - ) # If this isn't true we're gaining nothing from async - assert ( - abs(total_time - LATENCY) < 0.5 - ) # Should take approximately LATENCY seconds, but allow some buffer +class TestAsyncLoad: + async def test_async_load_variable(self, memorystore): + N_LOADS = 5 + LATENCY = 1.0 + + latencystore = LatencyStore(memorystore, latency=LATENCY) + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + + # TODO add async load to Dataset and DataArray as well as to Variable + # TODO change the syntax to `.async.load()`? 
+ start_time = time.time() + tasks = [ds["foo"].variable.async_load() for _ in range(N_LOADS)] + results = await asyncio.gather(*tasks) + total_time = time.time() - start_time + + for result in results: + xrt.assert_identical(result, ds["foo"].variable.load()) + + assert total_time > LATENCY # Cannot possibly be quicker than this + assert ( + total_time < LATENCY * N_LOADS + ) # If this isn't true we're gaining nothing from async + assert ( + abs(total_time - LATENCY) < 0.5 + ) # Should take approximately LATENCY seconds, but allow some buffer From 7e9ae0fa20a736a3d0baf8c0c6a8e10c263fdf2d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 17 May 2025 22:41:56 +0300 Subject: [PATCH 010/112] implement async load for dataarray and dataset --- xarray/core/dataarray.py | 8 ++++++ xarray/core/dataset.py | 25 ++++++++++++++++ xarray/tests/test_async.py | 59 ++++++++++++++++++++++++++++++++------ 3 files changed, 84 insertions(+), 8 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 1e7e1069076..808f39d8c03 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1160,6 +1160,14 @@ def load(self, **kwargs) -> Self: self._coords = new._coords return self + async def async_load(self, **kwargs) -> Self: + temp_ds = self._to_temp_dataset() + ds = await temp_ds.async_load(**kwargs) + new = self._from_temp_dataset(ds) + self._variable = new._variable + self._coords = new._coords + return self + def compute(self, **kwargs) -> Self: """Manually trigger loading of this array's data from disk or a remote source into memory and return a new array. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5a7f757ba8a..33e37e1ca4c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -552,6 +552,31 @@ def load(self, **kwargs) -> Self: return self + async def async_load(self, **kwargs) -> Self: + # this blocks on chunked arrays but not on lazily indexed arrays + + # access .data to coerce everything to numpy or dask arrays + lazy_data = { + k: v._data for k, v in self.variables.items() if is_chunked_array(v._data) + } + if lazy_data: + chunkmanager = get_chunked_array_type(*lazy_data.values()) + + # evaluate all the chunked arrays simultaneously + evaluated_data: tuple[np.ndarray[Any, Any], ...] = chunkmanager.compute( + *lazy_data.values(), **kwargs + ) + + for k, data in zip(lazy_data, evaluated_data, strict=False): + self.variables[k].data = data + + # load everything else sequentially + for k, v in self.variables.items(): + if k not in lazy_data: + await v.async_load() + + return self + def __dask_tokenize__(self) -> object: from dask.base import normalize_token diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 4a03ed738e3..37491619af1 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -101,27 +101,70 @@ def memorystore() -> "MemoryStore": @requires_zarr_v3 @pytest.mark.asyncio class TestAsyncLoad: - async def test_async_load_variable(self, memorystore): - N_LOADS = 5 - LATENCY = 1.0 + N_LOADS = 10 + LATENCY = 1.0 - latencystore = LatencyStore(memorystore, latency=LATENCY) + # TODO refactor these tests + async def test_async_load_variable(self, memorystore): + latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) # TODO add async load to Dataset and DataArray as well as to Variable # TODO change the syntax to `.async.load()`? 
start_time = time.time() - tasks = [ds["foo"].variable.async_load() for _ in range(N_LOADS)] + tasks = [ds["foo"].variable.async_load() for _ in range(self.N_LOADS)] results = await asyncio.gather(*tasks) total_time = time.time() - start_time for result in results: xrt.assert_identical(result, ds["foo"].variable.load()) - assert total_time > LATENCY # Cannot possibly be quicker than this + assert total_time > self.LATENCY # Cannot possibly be quicker than this + assert ( + total_time < self.LATENCY * self.N_LOADS + ) # If this isn't true we're gaining nothing from async + assert ( + abs(total_time - self.LATENCY) < 0.5 + ) # Should take approximately LATENCY seconds, but allow some buffer + + async def test_async_load_dataarray(self, memorystore): + latencystore = LatencyStore(memorystore, latency=self.LATENCY) + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + + # TODO change the syntax to `.async.load()`? + start_time = time.time() + tasks = [ds["foo"].async_load() for _ in range(self.N_LOADS)] + results = await asyncio.gather(*tasks) + total_time = time.time() - start_time + + for result in results: + xrt.assert_identical(result, ds["foo"].load()) + + assert total_time > self.LATENCY # Cannot possibly be quicker than this + assert ( + total_time < self.LATENCY * self.N_LOADS + ) # If this isn't true we're gaining nothing from async + assert ( + abs(total_time - self.LATENCY) < 0.5 + ) # Should take approximately LATENCY seconds, but allow some buffer + + async def test_async_load_dataset(self, memorystore): + latencystore = LatencyStore(memorystore, latency=self.LATENCY) + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + + # TODO change the syntax to `.async.load()`? 
+ start_time = time.time() + tasks = [ds.async_load() for _ in range(self.N_LOADS)] + results = await asyncio.gather(*tasks) + total_time = time.time() - start_time + + for result in results: + xrt.assert_identical(result, ds.load()) + + assert total_time > self.LATENCY # Cannot possibly be quicker than this assert ( - total_time < LATENCY * N_LOADS + total_time < self.LATENCY * self.N_LOADS ) # If this isn't true we're gaining nothing from async assert ( - abs(total_time - LATENCY) < 0.5 + abs(total_time - self.LATENCY) < 0.5 ) # Should take approximately LATENCY seconds, but allow some buffer From d288351df45300a260e17c7de3f710687510e8cd Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 17 May 2025 23:27:07 +0300 Subject: [PATCH 011/112] factor out common logic --- xarray/tests/test_async.py | 79 ++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 37491619af1..9e9bbefe56b 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -1,6 +1,7 @@ import asyncio import time from collections.abc import Iterable +from contextlib import asynccontextmanager from typing import TypeVar import numpy as np @@ -98,73 +99,77 @@ def memorystore() -> "MemoryStore": return memorystore +class AsyncTimer: + """Context manager for timing async operations and making assertions about their execution time.""" + + start_time: float + end_time: float + total_time: float + + @asynccontextmanager + async def measure(self): + """Measure the execution time of the async code within this context.""" + self.start_time = time.time() + try: + yield self + finally: + self.end_time = time.time() + self.total_time = self.end_time - self.start_time + + @requires_zarr_v3 @pytest.mark.asyncio class TestAsyncLoad: - N_LOADS = 10 - LATENCY = 1.0 + N_LOADS: int = 5 + LATENCY: float = 1.0 + + def assert_time_as_expected(self, total_time: float) -> None: + assert total_time > 
self.LATENCY # Cannot possibly be quicker than this + assert ( + total_time < self.LATENCY * self.N_LOADS + ) # If this isn't true we're gaining nothing from async + assert ( + abs(total_time - self.LATENCY) < 0.5 + ) # Should take approximately LATENCY seconds, but allow some buffer - # TODO refactor these tests async def test_async_load_variable(self, memorystore): latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - # TODO add async load to Dataset and DataArray as well as to Variable # TODO change the syntax to `.async.load()`? - start_time = time.time() - tasks = [ds["foo"].variable.async_load() for _ in range(self.N_LOADS)] - results = await asyncio.gather(*tasks) - total_time = time.time() - start_time + async with AsyncTimer().measure() as timer: + tasks = [ds["foo"].variable.async_load() for _ in range(self.N_LOADS)] + results = await asyncio.gather(*tasks) for result in results: xrt.assert_identical(result, ds["foo"].variable.load()) - assert total_time > self.LATENCY # Cannot possibly be quicker than this - assert ( - total_time < self.LATENCY * self.N_LOADS - ) # If this isn't true we're gaining nothing from async - assert ( - abs(total_time - self.LATENCY) < 0.5 - ) # Should take approximately LATENCY seconds, but allow some buffer + self.assert_time_as_expected(timer.total_time) async def test_async_load_dataarray(self, memorystore): latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) # TODO change the syntax to `.async.load()`? 
- start_time = time.time() - tasks = [ds["foo"].async_load() for _ in range(self.N_LOADS)] - results = await asyncio.gather(*tasks) - total_time = time.time() - start_time + async with AsyncTimer().measure() as timer: + tasks = [ds["foo"].async_load() for _ in range(self.N_LOADS)] + results = await asyncio.gather(*tasks) for result in results: xrt.assert_identical(result, ds["foo"].load()) - assert total_time > self.LATENCY # Cannot possibly be quicker than this - assert ( - total_time < self.LATENCY * self.N_LOADS - ) # If this isn't true we're gaining nothing from async - assert ( - abs(total_time - self.LATENCY) < 0.5 - ) # Should take approximately LATENCY seconds, but allow some buffer + self.assert_time_as_expected(timer.total_time) async def test_async_load_dataset(self, memorystore): latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) # TODO change the syntax to `.async.load()`? - start_time = time.time() - tasks = [ds.async_load() for _ in range(self.N_LOADS)] - results = await asyncio.gather(*tasks) - total_time = time.time() - start_time + async with AsyncTimer().measure() as timer: + tasks = [ds.async_load() for _ in range(self.N_LOADS)] + results = await asyncio.gather(*tasks) for result in results: xrt.assert_identical(result, ds.load()) - assert total_time > self.LATENCY # Cannot possibly be quicker than this - assert ( - total_time < self.LATENCY * self.N_LOADS - ) # If this isn't true we're gaining nothing from async - assert ( - abs(total_time - self.LATENCY) < 0.5 - ) # Should take approximately LATENCY seconds, but allow some buffer + self.assert_time_as_expected(timer.total_time) From e0731a08c563b04a01c56ff288ae7eea6e5d7a4b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sun, 18 May 2025 00:01:29 +0300 Subject: [PATCH 012/112] consolidate tests via a parametrized fixture --- xarray/tests/test_async.py | 52 +++++++++++++------------------------- 1 file 
changed, 17 insertions(+), 35 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 9e9bbefe56b..109deba78d3 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -123,53 +123,35 @@ class TestAsyncLoad: N_LOADS: int = 5 LATENCY: float = 1.0 + @pytest.fixture(params=["ds", "da", "var"]) + def xr_obj(self, request, memorystore) -> xr.Dataset | xr.DataArray | xr.Variable: + latencystore = LatencyStore(memorystore, latency=self.LATENCY) + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + + match request.param: + case "var": + return ds["foo"].variable + case "da": + return ds["foo"] + case "ds": + return ds + def assert_time_as_expected(self, total_time: float) -> None: assert total_time > self.LATENCY # Cannot possibly be quicker than this assert ( total_time < self.LATENCY * self.N_LOADS ) # If this isn't true we're gaining nothing from async assert ( - abs(total_time - self.LATENCY) < 0.5 + abs(total_time - self.LATENCY) < 2.0 ) # Should take approximately LATENCY seconds, but allow some buffer - async def test_async_load_variable(self, memorystore): - latencystore = LatencyStore(memorystore, latency=self.LATENCY) - ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - - # TODO change the syntax to `.async.load()`? - async with AsyncTimer().measure() as timer: - tasks = [ds["foo"].variable.async_load() for _ in range(self.N_LOADS)] - results = await asyncio.gather(*tasks) - - for result in results: - xrt.assert_identical(result, ds["foo"].variable.load()) - - self.assert_time_as_expected(timer.total_time) - - async def test_async_load_dataarray(self, memorystore): - latencystore = LatencyStore(memorystore, latency=self.LATENCY) - ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - - # TODO change the syntax to `.async.load()`? 
- async with AsyncTimer().measure() as timer: - tasks = [ds["foo"].async_load() for _ in range(self.N_LOADS)] - results = await asyncio.gather(*tasks) - - for result in results: - xrt.assert_identical(result, ds["foo"].load()) - - self.assert_time_as_expected(timer.total_time) - - async def test_async_load_dataset(self, memorystore): - latencystore = LatencyStore(memorystore, latency=self.LATENCY) - ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - + async def test_async_load(self, xr_obj): # TODO change the syntax to `.async.load()`? async with AsyncTimer().measure() as timer: - tasks = [ds.async_load() for _ in range(self.N_LOADS)] + tasks = [xr_obj.async_load() for _ in range(self.N_LOADS)] results = await asyncio.gather(*tasks) for result in results: - xrt.assert_identical(result, ds.load()) + xrt.assert_identical(result, xr_obj.load()) self.assert_time_as_expected(timer.total_time) From 9b41e78daafc42ca32d6961a95540ec8cee15457 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sun, 18 May 2025 01:24:13 +0300 Subject: [PATCH 013/112] async_load -> load_async --- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 4 ++-- xarray/core/variable.py | 2 +- xarray/tests/test_async.py | 3 +-- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 808f39d8c03..05f5d4c7fa8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1160,9 +1160,9 @@ def load(self, **kwargs) -> Self: self._coords = new._coords return self - async def async_load(self, **kwargs) -> Self: + async def load_async(self, **kwargs) -> Self: temp_ds = self._to_temp_dataset() - ds = await temp_ds.async_load(**kwargs) + ds = await temp_ds.load_async(**kwargs) new = self._from_temp_dataset(ds) self._variable = new._variable self._coords = new._coords diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 33e37e1ca4c..26441256a4a 100644 --- a/xarray/core/dataset.py +++ 
b/xarray/core/dataset.py @@ -552,7 +552,7 @@ def load(self, **kwargs) -> Self: return self - async def async_load(self, **kwargs) -> Self: + async def load_async(self, **kwargs) -> Self: # this blocks on chunked arrays but not on lazily indexed arrays # access .data to coerce everything to numpy or dask arrays @@ -573,7 +573,7 @@ async def async_load(self, **kwargs) -> Self: # load everything else sequentially for k, v in self.variables.items(): if k not in lazy_data: - await v.async_load() + await v.load_async() return self diff --git a/xarray/core/variable.py b/xarray/core/variable.py index cd3af386b95..e45987bca35 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -958,7 +958,7 @@ def load(self, **kwargs): self._data = to_duck_array(self._data, **kwargs) return self - async def async_load(self, **kwargs): + async def load_async(self, **kwargs): print("async inside Variable") self._data = await async_to_duck_array(self._data, **kwargs) return self diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 109deba78d3..d87208c3e59 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -146,9 +146,8 @@ def assert_time_as_expected(self, total_time: float) -> None: ) # Should take approximately LATENCY seconds, but allow some buffer async def test_async_load(self, xr_obj): - # TODO change the syntax to `.async.load()`? 
async with AsyncTimer().measure() as timer: - tasks = [xr_obj.async_load() for _ in range(self.N_LOADS)] + tasks = [xr_obj.load_async() for _ in range(self.N_LOADS)] results = await asyncio.gather(*tasks) for result in results: From 67ba26a1ee1704ab274dc80bcb10f8635a97caf1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sun, 18 May 2025 03:18:24 +0300 Subject: [PATCH 014/112] make BackendArray an ABC --- xarray/backends/common.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index edda5cff429..ce0ff3c323e 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -4,6 +4,7 @@ import os import time import traceback +from abc import ABC, abstractmethod from collections.abc import Hashable, Iterable, Mapping, Sequence from glob import glob from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload @@ -267,18 +268,22 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500 time.sleep(1e-3 * next_delay) -class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): +class BackendArray(ABC, NdimSizeLenMixin, indexing.ExplicitlyIndexed): __slots__ = () + @abstractmethod + def __getitem__(key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: ... + + async def async_getitem(key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: + raise NotImplementedError("Backend does not not support asynchronous loading") + def get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) return self[key] # type: ignore[index] async def async_get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) - # TODO use zarr-python async get method here? 
- print("async inside BackendArray") - return await self.getitem(key) # type: ignore[index] + return await self.async_getitem(key) # type: ignore[index] class AbstractDataStore: From 9344e2e78ecb5ca3cadf436fe8a446325b73b13f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sun, 18 May 2025 03:19:10 +0300 Subject: [PATCH 015/112] explain how to add async support for any backend in the docs --- doc/internals/how-to-add-new-backend.rst | 49 ++++++++++++++++-------- xarray/backends/zarr.py | 1 - 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/doc/internals/how-to-add-new-backend.rst b/doc/internals/how-to-add-new-backend.rst index e4f6d54f75c..a6858b35775 100644 --- a/doc/internals/how-to-add-new-backend.rst +++ b/doc/internals/how-to-add-new-backend.rst @@ -325,39 +325,42 @@ information on plugins. How to support lazy loading +++++++++++++++++++++++++++ -If you want to make your backend effective with big datasets, then you should -support lazy loading. -Basically, you shall replace the :py:class:`numpy.ndarray` inside the -variables with a custom class that supports lazy loading indexing. +If you want to make your backend effective with big datasets, then you should take advantage of xarray's +support for lazy loading and indexing. + +Basically, when your backend constructs the ``Variable`` objects, +you need to replace the :py:class:`numpy.ndarray` inside the +variables with a custom :py:class:`~xarray.backends.BackendArray` subclass that supports lazy loading and indexing. See the example below: .. code-block:: python - backend_array = MyBackendArray() data = indexing.LazilyIndexedArray(backend_array) var = xr.Variable(dims, data, attrs=attrs, encoding=encoding) Where: -- :py:class:`~xarray.core.indexing.LazilyIndexedArray` is a class - provided by Xarray that manages the lazy loading. 
-- ``MyBackendArray`` shall be implemented by the backend and shall inherit +- :py:class:`~xarray.core.indexing.LazilyIndexedArray` is a wrapper class + provided by Xarray that manages the lazy loading and indexing. +- ``MyBackendArray`` should be implemented by the backend and must inherit from :py:class:`~xarray.backends.BackendArray`. BackendArray subclassing ^^^^^^^^^^^^^^^^^^^^^^^^ -The BackendArray subclass shall implement the following method and attributes: +The BackendArray subclass must implement the following method and attributes: -- the ``__getitem__`` method that takes in input an index and returns a - `NumPy `__ array -- the ``shape`` attribute +- the ``__getitem__`` method that takes an index as an input and returns a + `NumPy `__ array, +- the ``shape`` attribute, - the ``dtype`` attribute. -Xarray supports different type of :doc:`/user-guide/indexing`, that can be -grouped in three types of indexes +It may also optionally implement an additional ``async_getitem`` method. + +Xarray supports different types of :doc:`/user-guide/indexing`, that can be +grouped in three types of indexes: :py:class:`~xarray.core.indexing.BasicIndexer`, -:py:class:`~xarray.core.indexing.OuterIndexer` and +:py:class:`~xarray.core.indexing.OuterIndexer`, and :py:class:`~xarray.core.indexing.VectorizedIndexer`. This implies that the implementation of the method ``__getitem__`` can be tricky. In order to simplify this task, Xarray provides a helper function, @@ -413,8 +416,22 @@ input the ``key``, the array ``shape`` and the following parameters: For more details see :py:class:`~xarray.core.indexing.IndexingSupport` and :ref:`RST indexing`. +Async support +^^^^^^^^^^^^^ + +Backends can also optionally support loading data asynchronously via xarray's asynchronous loading methods +(e.g. ``~xarray.Dataset.load_async``). +To support async loading the `BackendArray` subclass must additionally implement the ``BackendArray.async_getitem`` method. 
+ +Note that implementing this method is only necessary if you want to be able to load data from different xarray objects concurrently. +Even without this method your ``BackendArray`` implementation is still free to concurrently load chunks of data for a single ``Variable`` itself, +so long as it does so behind the synchronous ``__getitem__`` interface. + +Dask support +^^^^^^^^^^^^ + In order to support `Dask Distributed `__ and -:py:mod:`multiprocessing`, ``BackendArray`` subclass should be serializable +:py:mod:`multiprocessing`, the ``BackendArray`` subclass should be serializable either with :ref:`io.pickle` or `cloudpickle `__. That implies that all the reference to open files should be dropped. For diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f068826eef2..9e36e8198c7 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -234,7 +234,6 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here async def async_getitem(self, key): - # this doesn't need to be async array = self._array if isinstance(key, indexing.BasicIndexer): method = self._async_getitem From f8f8563586b58e1825d70e6f115966216065c46b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 19 May 2025 09:46:23 +0700 Subject: [PATCH 016/112] add new methods to api docs --- doc/api-hidden.rst | 1 + doc/api.rst | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 9a6037cf3c4..98d3704de9b 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -228,6 +228,7 @@ Variable.isnull Variable.item Variable.load + Variable.load_async Variable.max Variable.mean Variable.median diff --git a/doc/api.rst b/doc/api.rst index b6023866eb8..80715555e56 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1122,6 +1122,7 @@ Dataset methods Dataset.filter_by_attrs Dataset.info Dataset.load + Dataset.load_async Dataset.persist Dataset.unify_chunks @@ -1154,6 +1155,7 @@ DataArray methods DataArray.compute 
DataArray.persist DataArray.load + DataArray.load_async DataArray.unify_chunks DataTree methods From 30ce9bea5f21c722f15f4cf45ea8ca4e617cdf71 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 19 May 2025 09:54:15 +0700 Subject: [PATCH 017/112] whatsnew --- doc/whats-new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c8fbecf82af..97dc3096fde 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,8 @@ v2025.05.0 (unreleased) New Features ~~~~~~~~~~~~ +- Added new asynchronous loading methods :py:meth:`~xarray.Dataset.load_async`, :py:meth:`~xarray.DataArray.load_async`, :py:meth:`~xarray.Variable.load_async`. + (:issue:`10326`, :pull:`10327`) By `Tom Nicholas `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -38,7 +40,6 @@ Bug fixes ~~~~~~~~~ - Fix :py:class:`~xarray.groupers.BinGrouper` when ``labels`` is not specified (:issue:`10284`). By `Deepak Cherian `_. - - Allow accessing arbitrary attributes on Pandas ExtensionArrays. By `Deepak Cherian `_. From 2342b50b6459197542dab6bc87697060f2314d90 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 May 2025 02:55:54 +0000 Subject: [PATCH 018/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/internals/how-to-add-new-backend.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/internals/how-to-add-new-backend.rst b/doc/internals/how-to-add-new-backend.rst index a6858b35775..f5ee255c420 100644 --- a/doc/internals/how-to-add-new-backend.rst +++ b/doc/internals/how-to-add-new-backend.rst @@ -325,7 +325,7 @@ information on plugins. 
How to support lazy loading +++++++++++++++++++++++++++ -If you want to make your backend effective with big datasets, then you should take advantage of xarray's +If you want to make your backend effective with big datasets, then you should take advantage of xarray's support for lazy loading and indexing. Basically, when your backend constructs the ``Variable`` objects, @@ -334,6 +334,7 @@ variables with a custom :py:class:`~xarray.backends.BackendArray` subclass that See the example below: .. code-block:: python + backend_array = MyBackendArray() data = indexing.LazilyIndexedArray(backend_array) var = xr.Variable(dims, data, attrs=attrs, encoding=encoding) @@ -424,7 +425,7 @@ Backends can also optionally support loading data asynchronously via xarray's as To support async loading the `BackendArray` subclass must additionally implement the ``BackendArray.async_getitem`` method. Note that implementing this method is only necessary if you want to be able to load data from different xarray objects concurrently. -Even without this method your ``BackendArray`` implementation is still free to concurrently load chunks of data for a single ``Variable`` itself, +Even without this method your ``BackendArray`` implementation is still free to concurrently load chunks of data for a single ``Variable`` itself, so long as it does so behind the synchronous ``__getitem__`` interface. 
Dask support From b6d4a824c5414b76a77b34ffdc18dcd858364f6e Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sun, 18 May 2025 19:56:51 -0700 Subject: [PATCH 019/112] Fix ci/minimum_versions.py --- ci/minimum_versions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/minimum_versions.py b/ci/minimum_versions.py index 4cc0b76916b..21123bffcd6 100644 --- a/ci/minimum_versions.py +++ b/ci/minimum_versions.py @@ -30,7 +30,8 @@ "coveralls", "pip", "pytest", - "pytest-asynciopytest-cov", + "pytest-asyncio", + "pytest-cov", "pytest-env", "pytest-mypy-plugins", "pytest-timeout", From 2079d7e5f703fd21bd4aad615b985f3b2ddb2729 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 21 May 2025 08:43:24 +0700 Subject: [PATCH 020/112] fix formatting --- doc/internals/how-to-add-new-backend.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internals/how-to-add-new-backend.rst b/doc/internals/how-to-add-new-backend.rst index f5ee255c420..883c817dccc 100644 --- a/doc/internals/how-to-add-new-backend.rst +++ b/doc/internals/how-to-add-new-backend.rst @@ -422,7 +422,7 @@ Async support Backends can also optionally support loading data asynchronously via xarray's asynchronous loading methods (e.g. ``~xarray.Dataset.load_async``). -To support async loading the `BackendArray` subclass must additionally implement the ``BackendArray.async_getitem`` method. +To support async loading the ``BackendArray`` subclass must additionally implement the ``BackendArray.async_getitem`` method. Note that implementing this method is only necessary if you want to be able to load data from different xarray objects concurrently. 
Even without this method your ``BackendArray`` implementation is still free to concurrently load chunks of data for a single ``Variable`` itself, From 48e453434593281ae5be6f6d5962bf9e7d7cd6f0 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 21 May 2025 08:45:34 +0700 Subject: [PATCH 021/112] concurrently load different variables in ds.load_async using asyncio.gather --- xarray/core/dataset.py | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 26441256a4a..5d5abd27987 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import copy import datetime import math @@ -531,49 +532,50 @@ def load(self, **kwargs) -> Self: dask.compute """ # access .data to coerce everything to numpy or dask arrays - lazy_data = { + chunked_data = { k: v._data for k, v in self.variables.items() if is_chunked_array(v._data) } - if lazy_data: - chunkmanager = get_chunked_array_type(*lazy_data.values()) + if chunked_data: + chunkmanager = get_chunked_array_type(*chunked_data.values()) # evaluate all the chunked arrays simultaneously evaluated_data: tuple[np.ndarray[Any, Any], ...] 
= chunkmanager.compute( - *lazy_data.values(), **kwargs + *chunked_data.values(), **kwargs ) - for k, data in zip(lazy_data, evaluated_data, strict=False): + for k, data in zip(chunked_data, evaluated_data, strict=False): self.variables[k].data = data # load everything else sequentially - for k, v in self.variables.items(): - if k not in lazy_data: - v.load() + [v.load_async() for k, v in self.variables.items() if k not in chunked_data] return self async def load_async(self, **kwargs) -> Self: + # TODO refactor this to pul out the common chunked_data codepath + # this blocks on chunked arrays but not on lazily indexed arrays # access .data to coerce everything to numpy or dask arrays - lazy_data = { + chunked_data = { k: v._data for k, v in self.variables.items() if is_chunked_array(v._data) } - if lazy_data: - chunkmanager = get_chunked_array_type(*lazy_data.values()) + if chunked_data: + chunkmanager = get_chunked_array_type(*chunked_data.values()) # evaluate all the chunked arrays simultaneously evaluated_data: tuple[np.ndarray[Any, Any], ...] 
= chunkmanager.compute( - *lazy_data.values(), **kwargs + *chunked_data.values(), **kwargs ) - for k, data in zip(lazy_data, evaluated_data, strict=False): + for k, data in zip(chunked_data, evaluated_data, strict=False): self.variables[k].data = data - # load everything else sequentially - for k, v in self.variables.items(): - if k not in lazy_data: - await v.load_async() + # load everything else concurrently + tasks = [ + v.load_async() for k, v in self.variables.items() if k not in chunked_data + ] + await asyncio.gather(*tasks) return self From cca758931dd46d2b59756843086db19b97f9449e Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 21 May 2025 08:46:07 +0700 Subject: [PATCH 022/112] test concurrent loading of multiple variables in one dataset --- xarray/tests/test_async.py | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index d87208c3e59..d8e91c4aca3 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -120,7 +120,6 @@ async def measure(self): @requires_zarr_v3 @pytest.mark.asyncio class TestAsyncLoad: - N_LOADS: int = 5 LATENCY: float = 1.0 @pytest.fixture(params=["ds", "da", "var"]) @@ -136,21 +135,42 @@ def xr_obj(self, request, memorystore) -> xr.Dataset | xr.DataArray | xr.Variabl case "ds": return ds - def assert_time_as_expected(self, total_time: float) -> None: - assert total_time > self.LATENCY # Cannot possibly be quicker than this + def assert_time_as_expected( + self, total_time: float, latency: float, n_loads: int + ) -> None: + assert total_time > latency # Cannot possibly be quicker than this assert ( - total_time < self.LATENCY * self.N_LOADS + total_time < latency * n_loads ) # If this isn't true we're gaining nothing from async assert ( - abs(total_time - self.LATENCY) < 2.0 - ) # Should take approximately LATENCY seconds, but allow some buffer + abs(total_time - latency) < 2.0 + ) # Should take 
approximately `latency` seconds, but allow some buffer + + async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: + N_OBJECTS = 5 - async def test_async_load(self, xr_obj): async with AsyncTimer().measure() as timer: - tasks = [xr_obj.load_async() for _ in range(self.N_LOADS)] + tasks = [xr_obj.load_async() for _ in range(N_OBJECTS)] results = await asyncio.gather(*tasks) for result in results: xrt.assert_identical(result, xr_obj.load()) - self.assert_time_as_expected(timer.total_time) + self.assert_time_as_expected( + total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS + ) + + async def test_concurrent_load_multiple_variables(self, memorystore) -> None: + latencystore = LatencyStore(memorystore, latency=self.LATENCY) + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + + # TODO up the number of variables in the dataset? + async with AsyncTimer().measure() as timer: + result_ds = await ds.load_async() + + xrt.assert_identical(result_ds, ds.load()) + + # 2 because there are 2 lazy variables in the dataset + self.assert_time_as_expected( + total_time=timer.total_time, latency=self.LATENCY, n_loads=2 + ) From dfe9b87d7267c66fd77975f845e298dc2131ffdb Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 21 May 2025 11:04:34 +0700 Subject: [PATCH 023/112] fix non-awaited load_async --- xarray/core/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5d5abd27987..8a4e7177caa 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -547,12 +547,12 @@ def load(self, **kwargs) -> Self: self.variables[k].data = data # load everything else sequentially - [v.load_async() for k, v in self.variables.items() if k not in chunked_data] + [v.load() for k, v in self.variables.items() if k not in chunked_data] return self async def load_async(self, **kwargs) -> Self: - # TODO refactor this to pul out the common chunked_data 
codepath + # TODO refactor this to pull out the common chunked_data codepath # this blocks on chunked arrays but not on lazily indexed arrays @@ -572,10 +572,10 @@ async def load_async(self, **kwargs) -> Self: self.variables[k].data = data # load everything else concurrently - tasks = [ + coros = [ v.load_async() for k, v in self.variables.items() if k not in chunked_data ] - await asyncio.gather(*tasks) + await asyncio.gather(*coros) return self From 84099f3f164c2e097c71a6050027eb3990e6cd7b Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 21 May 2025 11:05:37 +0700 Subject: [PATCH 024/112] rearrange test order --- xarray/tests/test_async.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index d8e91c4aca3..d17ccf9e7e5 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -11,6 +11,7 @@ import xarray.testing as xrt from xarray.tests import has_zarr_v3, requires_zarr_v3 + if has_zarr_v3: import zarr from zarr.abc.store import ByteRequest, Store @@ -146,20 +147,6 @@ def assert_time_as_expected( abs(total_time - latency) < 2.0 ) # Should take approximately `latency` seconds, but allow some buffer - async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: - N_OBJECTS = 5 - - async with AsyncTimer().measure() as timer: - tasks = [xr_obj.load_async() for _ in range(N_OBJECTS)] - results = await asyncio.gather(*tasks) - - for result in results: - xrt.assert_identical(result, xr_obj.load()) - - self.assert_time_as_expected( - total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS - ) - async def test_concurrent_load_multiple_variables(self, memorystore) -> None: latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) @@ -174,3 +161,17 @@ async def test_concurrent_load_multiple_variables(self, memorystore) -> None: 
self.assert_time_as_expected( total_time=timer.total_time, latency=self.LATENCY, n_loads=2 ) + + async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: + N_OBJECTS = 5 + + async with AsyncTimer().measure() as timer: + coros = [xr_obj.load_async() for _ in range(N_OBJECTS)] + results = await asyncio.gather(*coros) + + for result in results: + xrt.assert_identical(result, xr_obj.load()) + + self.assert_time_as_expected( + total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS + ) From ab000c86c463dbbad906b433e15a8dd29abc6073 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 21 May 2025 04:06:00 +0000 Subject: [PATCH 025/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_async.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index d17ccf9e7e5..2d5a7f027d6 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -11,7 +11,6 @@ import xarray.testing as xrt from xarray.tests import has_zarr_v3, requires_zarr_v3 - if has_zarr_v3: import zarr from zarr.abc.store import ByteRequest, Store From a8b7b466abbc511a6fdf045b459c6f0d66107bd1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 12:36:15 +0700 Subject: [PATCH 026/112] add test for orthogonal indexing --- xarray/tests/test_async.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index d17ccf9e7e5..45240a66bd4 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -11,7 +11,6 @@ import xarray.testing as xrt from xarray.tests import has_zarr_v3, requires_zarr_v3 - if has_zarr_v3: import zarr from zarr.abc.store import ByteRequest, Store @@ -77,7 +76,7 @@ async def get_partial_values( @pytest.fixture def memorystore() -> "MemoryStore": 
memorystore = zarr.storage.MemoryStore({}) - z = zarr.create_array( + z1 = zarr.create_array( store=memorystore, name="foo", shape=(10, 10), @@ -85,17 +84,17 @@ def memorystore() -> "MemoryStore": dtype="f4", dimension_names=["x", "y"], ) - z[:, :] = np.random.random((10, 10)) + z1[:, :] = np.random.random((10, 10)) - z = zarr.create_array( + z2 = zarr.create_array( store=memorystore, - name="bar", + name="x", shape=(10,), chunks=(5), dtype="f4", dimension_names=["x"], ) - z[:] = np.random.random((10,)) + z2[:] = np.arange(10) return memorystore @@ -123,7 +122,7 @@ async def measure(self): class TestAsyncLoad: LATENCY: float = 1.0 - @pytest.fixture(params=["ds", "da", "var"]) + @pytest.fixture(params=["var", "ds", "da"]) def xr_obj(self, request, memorystore) -> xr.Dataset | xr.DataArray | xr.Variable: latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) @@ -175,3 +174,17 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: self.assert_time_as_expected( total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS ) + + @pytest.mark.xfail(reason="not implemented") + async def test_indexing(self, memorystore) -> None: + latencystore = LatencyStore(memorystore, latency=self.LATENCY) + ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + + # TODO test basic indexing + + # test orthogonal indexing + indexer = {"x": [2, 3]} + result = await ds.sel(indexer).load_async() + xrt.assert_identical(result, ds.sel(indexer).load()) + + # TODO test vectorized indexing From 82c7654bbc28a66965a195c6fc36131d948b3c2d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 12:36:32 +0700 Subject: [PATCH 027/112] explicitly forbid orthogonal indexing --- xarray/backends/zarr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 
9e36e8198c7..981c41d828a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -238,15 +238,15 @@ async def async_getitem(self, key): if isinstance(key, indexing.BasicIndexer): method = self._async_getitem elif isinstance(key, indexing.VectorizedIndexer): - # TODO - method = self._vindex + # method = self._vindex + raise NotImplementedError("async lazy vectorized indexing is not supported") elif isinstance(key, indexing.OuterIndexer): - # TODO - method = self._oindex + # method = self._oindex + raise NotImplementedError("async lazy orthogonal indexing is not supported") print("did an async get") return await indexing.async_explicit_indexing_adapter( - key, array.shape, indexing.IndexingSupport.VECTORIZED, method + key, array.shape, indexing.IndexingSupport.BASIC, method ) From 5eacdb0eead3e09cbf888a0be656d90845acb30a Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 13:36:31 +0700 Subject: [PATCH 028/112] support async orthogonal indexing via https://github.com/zarr-developers/zarr-python/pull/3083 --- xarray/backends/zarr.py | 9 ++++++--- xarray/tests/test_async.py | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 981c41d828a..e3f825f2b64 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -218,6 +218,10 @@ async def _async_getitem(self, key): async_array = self._array._async_array return await async_array.getitem(key) + async def _async_oindex(self, key): + async_array = self._array._async_array + return await async_array.oindex.getitem(key) + def __getitem__(self, key): array = self._array if isinstance(key, indexing.BasicIndexer): @@ -241,12 +245,11 @@ async def async_getitem(self, key): # method = self._vindex raise NotImplementedError("async lazy vectorized indexing is not supported") elif isinstance(key, indexing.OuterIndexer): - # method = self._oindex - raise NotImplementedError("async lazy orthogonal indexing is not supported") + 
method = self._async_oindex print("did an async get") return await indexing.async_explicit_indexing_adapter( - key, array.shape, indexing.IndexingSupport.BASIC, method + key, array.shape, indexing.IndexingSupport.OUTER, method ) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 45240a66bd4..9ea15468fce 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -25,6 +25,9 @@ class LatencyStore(WrapperStore[T_Store]): latency: float + # TODO only have to add this because of dumb behaviour in zarr where it raises with "ValueError: Store is not read-only but mode is 'r'" + read_only = True + def __init__( self, store: T_Store, @@ -175,7 +178,6 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS ) - @pytest.mark.xfail(reason="not implemented") async def test_indexing(self, memorystore) -> None: latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) From 093bf50275700e67560787181adc55ec61ffb49d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 14:12:27 +0700 Subject: [PATCH 029/112] add test for vectorized indexing (even if it doesn't work) --- xarray/tests/test_async.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 9ea15468fce..ddced7423ab 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -189,4 +189,8 @@ async def test_indexing(self, memorystore) -> None: result = await ds.sel(indexer).load_async() xrt.assert_identical(result, ds.sel(indexer).load()) - # TODO test vectorized indexing + # test vectorized indexing + # TODO this shouldn't pass! I haven't implemented async vectorized indexing yet... 
+ indexer = xr.DataArray([2, 3], dims=['x']) + result = await ds.foo[indexer].load_async() + xrt.assert_identical(result, ds.foo[indexer].load()) From 4073a24f563c1010d6fb50b6757b38196909d02c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 May 2025 07:12:53 +0000 Subject: [PATCH 030/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index ddced7423ab..81493bff30c 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -191,6 +191,6 @@ async def test_indexing(self, memorystore) -> None: # test vectorized indexing # TODO this shouldn't pass! I haven't implemented async vectorized indexing yet... - indexer = xr.DataArray([2, 3], dims=['x']) + indexer = xr.DataArray([2, 3], dims=["x"]) result = await ds.foo[indexer].load_async() xrt.assert_identical(result, ds.foo[indexer].load()) From 842a06cd99b9d5d1112fb6a5a2de58912fe8a96d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 20:32:15 +0700 Subject: [PATCH 031/112] add test for basic indexing --- xarray/tests/test_async.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 81493bff30c..10c78089914 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -182,7 +182,10 @@ async def test_indexing(self, memorystore) -> None: latencystore = LatencyStore(memorystore, latency=self.LATENCY) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - # TODO test basic indexing + # test basic indexing + indexer = {"x": 2} + result = await ds.sel(indexer).load_async() + xrt.assert_identical(result, ds.sel(indexer).load()) # test orthogonal indexing indexer = {"x": [2, 3]} From 
e19ab55e82645c7ace96e8f1472207b3f5a8fae9 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 22:27:06 +0700 Subject: [PATCH 032/112] correct test to actually use vectorized indexing --- xarray/tests/test_async.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 10c78089914..3bc27d82c44 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -179,7 +179,8 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: ) async def test_indexing(self, memorystore) -> None: - latencystore = LatencyStore(memorystore, latency=self.LATENCY) + # TODO we don't need a LatencyStore for this test + latencystore = LatencyStore(memorystore, latency=0.0) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) # test basic indexing @@ -193,7 +194,6 @@ async def test_indexing(self, memorystore) -> None: xrt.assert_identical(result, ds.sel(indexer).load()) # test vectorized indexing - # TODO this shouldn't pass! I haven't implemented async vectorized indexing yet... 
- indexer = xr.DataArray([2, 3], dims=["x"]) - result = await ds.foo[indexer].load_async() - xrt.assert_identical(result, ds.foo[indexer].load()) + indexer = {"x": xr.DataArray([2, 3], dims="points"), "y": xr.DataArray([2, 3], dims="points")} + result = await ds.isel(indexer).load_async() + xrt.assert_identical(result, ds.isel(indexer).load()) From b9e8e0631f19ed42a3894efab95c4f93012fcb93 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 23 May 2025 23:02:35 +0700 Subject: [PATCH 033/112] refactor to parametrize indexing test --- xarray/tests/test_async.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 3bc27d82c44..7a8b0190298 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -178,22 +178,26 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS ) - async def test_indexing(self, memorystore) -> None: + @pytest.mark.parametrize( + "method,indexer", + [ + ("sel", {"x": 2}), + ("sel", {"x": [2, 3]}), + ( + "isel", + { + "x": xr.DataArray([2, 3], dims="points"), + "y": xr.DataArray([2, 3], dims="points"), + }, + ), + ], + ids=["basic", "outer", "vectorized"], + ) + async def test_indexing(self, memorystore, method, indexer) -> None: # TODO we don't need a LatencyStore for this test latencystore = LatencyStore(memorystore, latency=0.0) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - # test basic indexing - indexer = {"x": 2} - result = await ds.sel(indexer).load_async() - xrt.assert_identical(result, ds.sel(indexer).load()) - - # test orthogonal indexing - indexer = {"x": [2, 3]} - result = await ds.sel(indexer).load_async() - xrt.assert_identical(result, ds.sel(indexer).load()) - - # test vectorized indexing - indexer = {"x": xr.DataArray([2, 3], dims="points"), "y": xr.DataArray([2, 3], dims="points")} 
- result = await ds.isel(indexer).load_async() - xrt.assert_identical(result, ds.isel(indexer).load()) + result = await getattr(ds, method)(**indexer).load_async() + expected = getattr(ds, method)(**indexer).load() + xrt.assert_identical(result, expected) From 8bc7bea9b1ee8ee6a24870bd1d36f516d7c8da30 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 29 May 2025 17:18:00 +0700 Subject: [PATCH 034/112] implement async vectorized indexing --- xarray/backends/zarr.py | 11 ++++++----- xarray/core/indexing.py | 37 ++++++++++++++++++++++++++++++++++++- xarray/core/variable.py | 1 - xarray/tests/test_async.py | 3 ++- 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e3f825f2b64..9c3d0dc7d63 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -222,6 +222,10 @@ async def _async_oindex(self, key): async_array = self._array._async_array return await async_array.oindex.getitem(key) + async def _async_vindex(self, key): + async_array = self._array._async_array + return await async_array.vindex.getitem(key) + def __getitem__(self, key): array = self._array if isinstance(key, indexing.BasicIndexer): @@ -242,14 +246,11 @@ async def async_getitem(self, key): if isinstance(key, indexing.BasicIndexer): method = self._async_getitem elif isinstance(key, indexing.VectorizedIndexer): - # method = self._vindex - raise NotImplementedError("async lazy vectorized indexing is not supported") + method = self._async_vindex elif isinstance(key, indexing.OuterIndexer): method = self._async_oindex - - print("did an async get") return await indexing.async_explicit_indexing_adapter( - key, array.shape, indexing.IndexingSupport.OUTER, method + key, array.shape, indexing.IndexingSupport.VECTORIZED, method ) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 53f9bd12088..22220184cb8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -525,7 +525,7 @@ def get_duck_array(self): 
async def async_get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) - return self[key] + return await self.async_getitem(key) def _oindex_get(self, indexer: OuterIndexer): raise NotImplementedError( @@ -756,6 +756,22 @@ def get_duck_array(self): array = array.get_duck_array() return _wrap_numpy_scalars(array) + async def async_get_duck_array(self): + print("inside LazilyVectorizedIndexedArray.async_get_duck_array") + if isinstance(self.array, ExplicitlyIndexedNDArrayMixin): + array = apply_indexer(self.array, self.key) + else: + # If the array is not an ExplicitlyIndexedNDArrayMixin, + # it may wrap a BackendArray so use its __getitem__ + array = await self.array.async_getitem(self.key) + # self.array[self.key] is now a numpy array when + # self.array is a BackendArray subclass + # and self.key is BasicIndexer((slice(None, None, None),)) + # so we need the explicit check for ExplicitlyIndexed + if isinstance(array, ExplicitlyIndexed): + array = await array.async_get_duck_array() + return _wrap_numpy_scalars(array) + def _updated_key(self, new_key: ExplicitIndexer): return _combine_indexers(self.key, self.shape, new_key) @@ -1608,6 +1624,16 @@ def __getitem__(self, indexer: ExplicitIndexer): key = indexer.tuple + (Ellipsis,) return array[key] + async def async_getitem(self, indexer: ExplicitIndexer): + self._check_and_raise_if_non_basic_indexer(indexer) + + array = self.array + # We want 0d slices rather than scalars. This is achieved by + # appending an ellipsis (see + # https://numpy.org/doc/stable/reference/arrays.indexing.html#detailed-notes). 
+ key = indexer.tuple + (Ellipsis,) + return array[key] + def _safe_setitem(self, array, key: tuple[Any, ...], value: Any) -> None: try: array[key] = value @@ -1855,6 +1881,15 @@ def get_duck_array(self) -> np.ndarray | PandasExtensionArray: return PandasExtensionArray(self.array.array) return np.asarray(self) + async def async_get_duck_array(self) -> np.ndarray | PandasExtensionArray: + # TODO this must surely be wrong - it's not async yet + print("in PandasIndexingAdapter") + if pd.api.types.is_extension_array_dtype(self.array): + from xarray.core.extension_array import PandasExtensionArray + + return PandasExtensionArray(self.array.array) + return np.asarray(self) + @property def shape(self) -> _Shape: return (len(self.array),) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e45987bca35..38f2676ec52 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -959,7 +959,6 @@ def load(self, **kwargs): return self async def load_async(self, **kwargs): - print("async inside Variable") self._data = await async_to_duck_array(self._data, **kwargs) return self diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 7a8b0190298..7ec1967cf86 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -184,7 +184,7 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: ("sel", {"x": 2}), ("sel", {"x": [2, 3]}), ( - "isel", + "sel", { "x": xr.DataArray([2, 3], dims="points"), "y": xr.DataArray([2, 3], dims="points"), @@ -198,6 +198,7 @@ async def test_indexing(self, memorystore, method, indexer) -> None: latencystore = LatencyStore(memorystore, latency=0.0) ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + # TODO we're not actually testing that these indexing methods are not blocking... 
result = await getattr(ds, method)(**indexer).load_async() expected = getattr(ds, method)(**indexer).load() xrt.assert_identical(result, expected) From 6c47e3f41bb4934959df6eb14acbb5dc0caf15d1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 29 May 2025 18:00:00 +0700 Subject: [PATCH 035/112] revert breaking change to BackendArray --- xarray/backends/common.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index ce0ff3c323e..b6cb1660c44 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -268,12 +268,9 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500 time.sleep(1e-3 * next_delay) -class BackendArray(ABC, NdimSizeLenMixin, indexing.ExplicitlyIndexed): +class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): __slots__ = () - @abstractmethod - def __getitem__(key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: ... - async def async_getitem(key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: raise NotImplementedError("Backend does not not support asynchronous loading") From a86f6465ea6e4f39304c6b9f4db31b4ee05ed30f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 May 2025 11:00:28 +0000 Subject: [PATCH 036/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index b6cb1660c44..10a698ac329 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -4,7 +4,6 @@ import os import time import traceback -from abc import ABC, abstractmethod from collections.abc import Hashable, Iterable, Mapping, Sequence from glob import glob from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload From 884ce139acfe3c6b1f38ce1eb8ac03370fd4e9dd Mon Sep 17 
00:00:00 2001 From: Tom Nicholas Date: Thu, 29 May 2025 18:13:44 +0700 Subject: [PATCH 037/112] remove indirection in _ensure_cached method --- xarray/core/indexing.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 22220184cb8..2497b0e71bc 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -876,19 +876,15 @@ class MemoryCachedArray(ExplicitlyIndexedNDArrayMixin): def __init__(self, array): self.array = _wrap_numpy_scalars(as_indexable(array)) - def _ensure_cached(self): - self.array = as_indexable(self.array.get_duck_array()) - - async def _async_ensure_cached(self): - duck_array = await self.array.async_get_duck_array() - self.array = as_indexable(duck_array) - def get_duck_array(self): - self._ensure_cached() + # first ensure the array object is cached + self.array = as_indexable(self.array.get_duck_array()) return self.array.get_duck_array() async def async_get_duck_array(self): - await self._async_ensure_cached() + # first ensure the array object is cached + duck_array = await self.array.async_get_duck_array() + self.array = as_indexable(duck_array) return await self.array.async_get_duck_array() def _oindex_get(self, indexer: OuterIndexer): From a43af86a13a10b13e408146f562dce473105e4c8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 29 May 2025 15:33:27 -0600 Subject: [PATCH 038/112] IndexingAdapters don't need async get --- xarray/core/indexing.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 2497b0e71bc..1198f835789 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -678,7 +678,9 @@ async def async_get_duck_array(self): # and self.key is BasicIndexer((slice(None, None, None),)) # so we need the explicit check for ExplicitlyIndexed if isinstance(array, ExplicitlyIndexed): - array = await array.async_get_duck_array() + # 
At this point, we have issued completed the possible async load from disk + # and array is in-memory. So use the sync get + array = array.get_duck_array() return _wrap_numpy_scalars(array) def transpose(self, order): @@ -769,7 +771,9 @@ async def async_get_duck_array(self): # and self.key is BasicIndexer((slice(None, None, None),)) # so we need the explicit check for ExplicitlyIndexed if isinstance(array, ExplicitlyIndexed): - array = await array.async_get_duck_array() + # At this point, we have issued completed the possible async load from disk + # and array is in-memory. So use the sync get + array = array.get_duck_array() return _wrap_numpy_scalars(array) def _updated_key(self, new_key: ExplicitIndexer): @@ -877,15 +881,16 @@ def __init__(self, array): self.array = _wrap_numpy_scalars(as_indexable(array)) def get_duck_array(self): - # first ensure the array object is cached - self.array = as_indexable(self.array.get_duck_array()) - return self.array.get_duck_array() + duck_array = self.array.get_duck_array() + # ensure the array object is cached in-memory + self.array = as_indexable(duck_array) + return duck_array async def async_get_duck_array(self): - # first ensure the array object is cached duck_array = await self.array.async_get_duck_array() + # ensure the array object is cached in-memory self.array = as_indexable(duck_array) - return await self.array.async_get_duck_array() + return duck_array def _oindex_get(self, indexer: OuterIndexer): return type(self)(_wrap_numpy_scalars(self.array.oindex[indexer])) @@ -1620,16 +1625,6 @@ def __getitem__(self, indexer: ExplicitIndexer): key = indexer.tuple + (Ellipsis,) return array[key] - async def async_getitem(self, indexer: ExplicitIndexer): - self._check_and_raise_if_non_basic_indexer(indexer) - - array = self.array - # We want 0d slices rather than scalars. This is achieved by - # appending an ellipsis (see - # https://numpy.org/doc/stable/reference/arrays.indexing.html#detailed-notes). 
- key = indexer.tuple + (Ellipsis,) - return array[key] - def _safe_setitem(self, array, key: tuple[Any, ...], value: Any) -> None: try: array[key] = value From 17d7a0efe98dd5ee797420747612f578657626d2 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 29 May 2025 16:00:35 -0600 Subject: [PATCH 039/112] Add tests --- xarray/tests/test_async.py | 3 ++- xarray/tests/test_indexing.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 7ec1967cf86..99f4619e736 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -182,6 +182,7 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: "method,indexer", [ ("sel", {"x": 2}), + ("sel", {"x": slice(2, 4)}), ("sel", {"x": [2, 3]}), ( "sel", @@ -191,7 +192,7 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: }, ), ], - ids=["basic", "outer", "vectorized"], + ids=["basic-int", "basic-slice", "outer", "vectorized"], ) async def test_indexing(self, memorystore, method, indexer) -> None: # TODO we don't need a LatencyStore for this test diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 6dd75b58c6a..d308844c6fa 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -490,6 +490,23 @@ def test_sub_array(self) -> None: assert isinstance(child.array, indexing.NumpyIndexingAdapter) assert isinstance(wrapped.array, indexing.LazilyIndexedArray) + async def test_async_wrapper(self) -> None: + original = indexing.LazilyIndexedArray(np.arange(10)) + wrapped = indexing.MemoryCachedArray(original) + await wrapped.async_get_duck_array() + assert_array_equal(wrapped, np.arange(10)) + assert isinstance(wrapped.array, indexing.NumpyIndexingAdapter) + + async def test_async_sub_array(self) -> None: + original = indexing.LazilyIndexedArray(np.arange(10)) + wrapped = indexing.MemoryCachedArray(original) + child = wrapped[B[:5]] + 
assert isinstance(child, indexing.MemoryCachedArray) + await child.async_get_duck_array() + assert_array_equal(child, np.arange(5)) + assert isinstance(child.array, indexing.NumpyIndexingAdapter) + assert isinstance(wrapped.array, indexing.LazilyIndexedArray) + def test_setitem(self) -> None: original = np.arange(10) wrapped = indexing.MemoryCachedArray(original) From d824a2d3898f86fca471163c275e8bb72521e3d5 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 29 May 2025 17:02:29 -0600 Subject: [PATCH 040/112] Add decoding test --- xarray/coding/common.py | 3 +++ xarray/tests/test_async.py | 1 + 2 files changed, 4 insertions(+) diff --git a/xarray/coding/common.py b/xarray/coding/common.py index 1b455009668..8093827138b 100644 --- a/xarray/coding/common.py +++ b/xarray/coding/common.py @@ -75,6 +75,9 @@ def __getitem__(self, key): def get_duck_array(self): return self.func(self.array.get_duck_array()) + async def async_get_duck_array(self): + return self.func(await self.array.async_get_duck_array()) + def __repr__(self) -> str: return f"{type(self).__name__}({self.array!r}, func={self.func!r}, dtype={self.dtype!r})" diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 99f4619e736..d2beb353123 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -86,6 +86,7 @@ def memorystore() -> "MemoryStore": chunks=(5, 5), dtype="f4", dimension_names=["x", "y"], + attributes={"add_offset": 1, "scale_factor": 2}, ) z1[:, :] = np.random.random((10, 10)) From 6a136118efcab55c5a88203a618a62c5b6652314 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 29 May 2025 15:33:27 -0600 Subject: [PATCH 041/112] Add IndexingAdapter mixin --- xarray/backends/zarr.py | 1 + xarray/coding/variables.py | 1 + xarray/core/indexing.py | 121 +++++++++++++++------------------- xarray/namedarray/pycompat.py | 17 ++--- 4 files changed, 61 insertions(+), 79 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 
9c3d0dc7d63..8f814c7f1f3 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -242,6 +242,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here async def async_getitem(self, key): + print("async getting") array = self._array if isinstance(key, indexing.BasicIndexer): method = self._async_getitem diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 1b7bc95e2b4..911c532f7bd 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -105,6 +105,7 @@ def __getitem__(self, key) -> np.ndarray: return np.asarray(self.array[key], dtype=self.dtype) + def _apply_mask( data: np.ndarray, encoded_fill_values: list, diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 1198f835789..824558010e1 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -516,16 +516,30 @@ def get_duck_array(self): return self.array -class ExplicitlyIndexedNDArrayMixin(NDArrayMixin, ExplicitlyIndexed): - __slots__ = () +class IndexingAdapter: + """Marker class for indexing adapters. + + These classes translate between Xarray's indexing semantics and the underlying array's + indexing semantics. 
+ """ def get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) return self[key] async def async_get_duck_array(self): - key = BasicIndexer((slice(None),) * self.ndim) - return await self.async_getitem(key) + """These classes are applied to in-memory arrays, so specific async support isn't needed.""" + return self.get_duck_array() + + +class ExplicitlyIndexedNDArrayMixin(NDArrayMixin, ExplicitlyIndexed): + __slots__ = () + + def get_duck_array(self): + raise NotImplementedError + + async def async_get_duck_array(self): + raise NotImplementedError def _oindex_get(self, indexer: OuterIndexer): raise NotImplementedError( @@ -650,37 +664,25 @@ def shape(self) -> _Shape: return self._shape def get_duck_array(self): - if isinstance(self.array, ExplicitlyIndexedNDArrayMixin): - array = apply_indexer(self.array, self.key) - else: - # If the array is not an ExplicitlyIndexedNDArrayMixin, - # it may wrap a BackendArray so use its __getitem__ - array = self.array[self.key] + from xarray.backends.common import BackendArray - # self.array[self.key] is now a numpy array when - # self.array is a BackendArray subclass - # and self.key is BasicIndexer((slice(None, None, None),)) - # so we need the explicit check for ExplicitlyIndexed - if isinstance(array, ExplicitlyIndexed): - array = array.get_duck_array() + if isinstance(self.array, BackendArray): + array = self.array[self.key] + else: + array = apply_indexer(self.array, self.key) + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() return _wrap_numpy_scalars(array) async def async_get_duck_array(self): - if isinstance(self.array, ExplicitlyIndexedNDArrayMixin): - array = apply_indexer(self.array, self.key) - else: - # If the array is not an ExplicitlyIndexedNDArrayMixin, - # it may wrap a BackendArray so use its (async) getitem - array = await self.array.async_getitem(self.key) + from xarray.backends.common import BackendArray - # self.array[self.key] is now a numpy array when - # 
self.array is a BackendArray subclass - # and self.key is BasicIndexer((slice(None, None, None),)) - # so we need the explicit check for ExplicitlyIndexed - if isinstance(array, ExplicitlyIndexed): - # At this point, we have issued completed the possible async load from disk - # and array is in-memory. So use the sync get - array = array.get_duck_array() + if isinstance(self.array, BackendArray): + array = await self.array.async_getitem(self.key) + else: + array = apply_indexer(self.array, self.key) + if isinstance(array, ExplicitlyIndexed): + array = await array.async_get_duck_array() return _wrap_numpy_scalars(array) def transpose(self, order): @@ -744,36 +746,26 @@ def shape(self) -> _Shape: return np.broadcast(*self.key.tuple).shape def get_duck_array(self): - if isinstance(self.array, ExplicitlyIndexedNDArrayMixin): - array = apply_indexer(self.array, self.key) - else: - # If the array is not an ExplicitlyIndexedNDArrayMixin, - # it may wrap a BackendArray so use its __getitem__ + from xarray.backends.common import BackendArray + + if isinstance(self.array, BackendArray): array = self.array[self.key] - # self.array[self.key] is now a numpy array when - # self.array is a BackendArray subclass - # and self.key is BasicIndexer((slice(None, None, None),)) - # so we need the explicit check for ExplicitlyIndexed - if isinstance(array, ExplicitlyIndexed): - array = array.get_duck_array() + else: + array = apply_indexer(self.array, self.key) + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() return _wrap_numpy_scalars(array) async def async_get_duck_array(self): print("inside LazilyVectorizedIndexedArray.async_get_duck_array") - if isinstance(self.array, ExplicitlyIndexedNDArrayMixin): - array = apply_indexer(self.array, self.key) - else: - # If the array is not an ExplicitlyIndexedNDArrayMixin, - # it may wrap a BackendArray so use its __getitem__ + from xarray.backends.common import BackendArray + + if isinstance(self.array, BackendArray): 
array = await self.array.async_getitem(self.key) - # self.array[self.key] is now a numpy array when - # self.array is a BackendArray subclass - # and self.key is BasicIndexer((slice(None, None, None),)) - # so we need the explicit check for ExplicitlyIndexed - if isinstance(array, ExplicitlyIndexed): - # At this point, we have issued completed the possible async load from disk - # and array is in-memory. So use the sync get - array = array.get_duck_array() + else: + array = apply_indexer(self.array, self.key) + if isinstance(array, ExplicitlyIndexed): + array = await array.async_get_duck_array() return _wrap_numpy_scalars(array) def _updated_key(self, new_key: ExplicitIndexer): @@ -1589,7 +1581,7 @@ def is_fancy_indexer(indexer: Any) -> bool: return True -class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin): +class NumpyIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): """Wrap a NumPy array to use explicit indexing.""" __slots__ = ("array",) @@ -1668,7 +1660,7 @@ def __init__(self, array): self.array = array -class ArrayApiIndexingAdapter(ExplicitlyIndexedNDArrayMixin): +class ArrayApiIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): """Wrap an array API array to use explicit indexing.""" __slots__ = ("array",) @@ -1733,7 +1725,7 @@ def _assert_not_chunked_indexer(idxr: tuple[Any, ...]) -> None: ) -class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin): +class DaskIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): """Wrap a dask array to support explicit indexing.""" __slots__ = ("array",) @@ -1809,7 +1801,7 @@ def transpose(self, order): return self.array.transpose(order) -class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin): +class PandasIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" __slots__ = ("_dtype", "array") @@ -1872,15 +1864,6 @@ def get_duck_array(self) -> np.ndarray | PandasExtensionArray: return 
PandasExtensionArray(self.array.array) return np.asarray(self) - async def async_get_duck_array(self) -> np.ndarray | PandasExtensionArray: - # TODO this must surely be wrong - it's not async yet - print("in PandasIndexingAdapter") - if pd.api.types.is_extension_array_dtype(self.array): - from xarray.core.extension_array import PandasExtensionArray - - return PandasExtensionArray(self.array.array) - return np.asarray(self) - @property def shape(self) -> _Shape: return (len(self.array),) @@ -2135,7 +2118,9 @@ def copy(self, deep: bool = True) -> Self: return type(self)(array, self._dtype, self.level) -class CoordinateTransformIndexingAdapter(ExplicitlyIndexedNDArrayMixin): +class CoordinateTransformIndexingAdapter( + IndexingAdapter, ExplicitlyIndexedNDArrayMixin +): """Wrap a CoordinateTransform as a lazy coordinate array. Supports explicit indexing (both outer and vectorized). diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index c6a07e5963f..6e61d3445ab 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -153,20 +153,15 @@ async def async_to_duck_array( from xarray.core.indexing import ( ExplicitlyIndexed, ImplicitToExplicitIndexingAdapter, + IndexingAdapter, ) - from xarray.namedarray.parallelcompat import get_chunked_array_type print(type(data)) - - if is_chunked_array(data): - chunkmanager = get_chunked_array_type(data) - loaded_data, *_ = chunkmanager.compute(data, **kwargs) # type: ignore[var-annotated] - return loaded_data - - if isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): + if isinstance(data, IndexingAdapter): + # These wrap in-memory arrays, and async isn't needed + return data.get_duck_array() + elif isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): print("async inside to_duck_array") return await data.async_get_duck_array() # type: ignore[no-untyped-call, no-any-return] - elif is_duck_array(data): - return data else: - return np.asarray(data) # 
type: ignore[return-value] + return to_duck_array(data, **kwargs) From d79ed54724644419758226f795bf4ac1bedd7233 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 29 May 2025 21:58:52 -0600 Subject: [PATCH 042/112] [cherry] Making decoding arrays lazy too --- xarray/coding/strings.py | 9 ++++++--- xarray/coding/variables.py | 17 +++++++++++------ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/xarray/coding/strings.py b/xarray/coding/strings.py index 4ca6a3f0a46..a2295c218a6 100644 --- a/xarray/coding/strings.py +++ b/xarray/coding/strings.py @@ -250,14 +250,17 @@ def __repr__(self): return f"{type(self).__name__}({self.array!r})" def _vindex_get(self, key): - return _numpy_char_to_bytes(self.array.vindex[key]) + return type(self)(self.array.vindex[key]) def _oindex_get(self, key): - return _numpy_char_to_bytes(self.array.oindex[key]) + return type(self)(self.array.oindex[key]) def __getitem__(self, key): # require slicing the last dimension completely key = type(key)(indexing.expanded_indexer(key.tuple, self.array.ndim)) if key.tuple[-1] != slice(None): raise IndexError("too many indices") - return _numpy_char_to_bytes(self.array[key]) + return type(self)(self.array[key]) + + def get_duck_array(self): + return _numpy_char_to_bytes(self.array.get_duck_array()) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 911c532f7bd..f82f0c65768 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -58,13 +58,16 @@ def dtype(self) -> np.dtype: return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize)) def _oindex_get(self, key): - return np.asarray(self.array.oindex[key], dtype=self.dtype) + return type(self)(self.array.oindex[key]) def _vindex_get(self, key): - return np.asarray(self.array.vindex[key], dtype=self.dtype) + return type(self)(self.array.vindex[key]) def __getitem__(self, key) -> np.ndarray: - return np.asarray(self.array[key], dtype=self.dtype) + return type(self)(self.array[key]) + 
+ def get_duck_array(self): + return duck_array_ops.astype(self.array.get_duck_array(), dtype=self.dtype) class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin): @@ -96,14 +99,16 @@ def dtype(self) -> np.dtype: return np.dtype("bool") def _oindex_get(self, key): - return np.asarray(self.array.oindex[key], dtype=self.dtype) + return type(self)(self.array.oindex[key]) def _vindex_get(self, key): - return np.asarray(self.array.vindex[key], dtype=self.dtype) + return type(self)(self.array.vindex[key]) def __getitem__(self, key) -> np.ndarray: - return np.asarray(self.array[key], dtype=self.dtype) + return type(self)(self.array[key]) + def get_duck_array(self): + return duck_array_ops.astype(self.array.get_duck_array(), dtype=self.dtype) def _apply_mask( From 1da335991fc1c86c1a79f706402c3f762ea15e04 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 30 May 2025 21:27:05 +0700 Subject: [PATCH 043/112] parametrized over isel and sel --- xarray/tests/test_async.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index d2beb353123..ff6998de51c 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -179,19 +179,17 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS ) + @pytest.mark.parametrize("method", ["sel", "isel"]) @pytest.mark.parametrize( - "method,indexer", + "indexer", [ - ("sel", {"x": 2}), - ("sel", {"x": slice(2, 4)}), - ("sel", {"x": [2, 3]}), - ( - "sel", - { - "x": xr.DataArray([2, 3], dims="points"), - "y": xr.DataArray([2, 3], dims="points"), - }, - ), + {"x": 2}, + {"x": slice(2, 4)}, + {"x": [2, 3]}, + { + "x": xr.DataArray([2, 3], dims="points"), + "y": xr.DataArray([2, 3], dims="points"), + }, ], ids=["basic-int", "basic-slice", "outer", "vectorized"], ) From dded9e04ba2abdb01a1923afd450f8ae23ebe08b Mon Sep 17 00:00:00 2001 From: Tom Nicholas 
Date: Fri, 30 May 2025 22:50:27 +0700 Subject: [PATCH 044/112] mock zarr.AsyncArray.getitem in test --- xarray/tests/test_async.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index ff6998de51c..9def990f353 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -3,6 +3,7 @@ from collections.abc import Iterable from contextlib import asynccontextmanager from typing import TypeVar +from unittest.mock import patch import numpy as np import pytest @@ -196,9 +197,27 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: async def test_indexing(self, memorystore, method, indexer) -> None: # TODO we don't need a LatencyStore for this test latencystore = LatencyStore(memorystore, latency=0.0) - ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - # TODO we're not actually testing that these indexing methods are not blocking... - result = await getattr(ds, method)(**indexer).load_async() + original_getitem = zarr.AsyncArray.getitem + + async def wrapper(instance, selection): + # Call the original method with proper self + result = await original_getitem(instance, selection) + return result + + with patch.object( + zarr.AsyncArray, "getitem", side_effect=wrapper, autospec=True + ) as mocked_meth: + ds = xr.open_zarr( + latencystore, zarr_format=3, consolidated=False, chunks=None + ) + + # TODO we're not actually testing that these indexing methods are not blocking... 
+ result = await getattr(ds, method)(**indexer).load_async() + + assert mocked_meth.call_count > 0 + mocked_meth.assert_called() + mocked_meth.assert_awaited() + expected = getattr(ds, method)(**indexer).load() xrt.assert_identical(result, expected) From 4c347ad0a2225214f8932afd926dfbf53f51bd06 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 30 May 2025 23:06:27 +0700 Subject: [PATCH 045/112] tidy up the mocking --- xarray/tests/test_async.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 9def990f353..918a2508ea0 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -182,31 +182,29 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: @pytest.mark.parametrize("method", ["sel", "isel"]) @pytest.mark.parametrize( - "indexer", + "indexer, zarr_getitem_method", [ - {"x": 2}, - {"x": slice(2, 4)}, - {"x": [2, 3]}, - { - "x": xr.DataArray([2, 3], dims="points"), - "y": xr.DataArray([2, 3], dims="points"), - }, + ({"x": 2}, zarr.AsyncArray.getitem), + ({"x": slice(2, 4)}, zarr.AsyncArray.getitem), + ({"x": [2, 3]}, zarr.core.indexing.AsyncOIndex.getitem), + ( + { + "x": xr.DataArray([2, 3], dims="points"), + "y": xr.DataArray([2, 3], dims="points"), + }, + zarr.core.indexing.AsyncVIndex.getitem, + ), ], ids=["basic-int", "basic-slice", "outer", "vectorized"], ) - async def test_indexing(self, memorystore, method, indexer) -> None: + async def test_indexing( + self, memorystore, method, indexer, zarr_getitem_method + ) -> None: # TODO we don't need a LatencyStore for this test latencystore = LatencyStore(memorystore, latency=0.0) - original_getitem = zarr.AsyncArray.getitem - - async def wrapper(instance, selection): - # Call the original method with proper self - result = await original_getitem(instance, selection) - return result - with patch.object( - zarr.AsyncArray, "getitem", side_effect=wrapper, 
autospec=True + zarr.AsyncArray, "getitem", wraps=zarr_getitem_method, autospec=True ) as mocked_meth: ds = xr.open_zarr( latencystore, zarr_format=3, consolidated=False, chunks=None From 4018e285dc227b18d229bf6d9077bcc08083f3d4 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 15:17:23 +0700 Subject: [PATCH 046/112] ensure the correct zarr class's method is patched for each test --- xarray/tests/test_async.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 918a2508ea0..d5b6c6aee0d 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -182,29 +182,32 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: @pytest.mark.parametrize("method", ["sel", "isel"]) @pytest.mark.parametrize( - "indexer, zarr_getitem_method", + "indexer, zarr_class_and_method", [ - ({"x": 2}, zarr.AsyncArray.getitem), - ({"x": slice(2, 4)}, zarr.AsyncArray.getitem), - ({"x": [2, 3]}, zarr.core.indexing.AsyncOIndex.getitem), + ({"x": 2}, (zarr.AsyncArray, "getitem")), + ({"x": slice(2, 4)}, (zarr.AsyncArray, "getitem")), + ({"x": [2, 3]}, (zarr.core.indexing.AsyncOIndex, "getitem")), ( { "x": xr.DataArray([2, 3], dims="points"), "y": xr.DataArray([2, 3], dims="points"), }, - zarr.core.indexing.AsyncVIndex.getitem, + (zarr.core.indexing.AsyncVIndex, "getitem"), ), ], ids=["basic-int", "basic-slice", "outer", "vectorized"], ) async def test_indexing( - self, memorystore, method, indexer, zarr_getitem_method + self, memorystore, method, indexer, zarr_class_and_method ) -> None: # TODO we don't need a LatencyStore for this test latencystore = LatencyStore(memorystore, latency=0.0) + target_class, method_name = zarr_class_and_method + original_method = getattr(target_class, method_name) + with patch.object( - zarr.AsyncArray, "getitem", wraps=zarr_getitem_method, autospec=True + target_class, method_name, wraps=original_method, autospec=True ) 
as mocked_meth: ds = xr.open_zarr( latencystore, zarr_format=3, consolidated=False, chunks=None @@ -213,9 +216,9 @@ async def test_indexing( # TODO we're not actually testing that these indexing methods are not blocking... result = await getattr(ds, method)(**indexer).load_async() - assert mocked_meth.call_count > 0 mocked_meth.assert_called() mocked_meth.assert_awaited() + assert mocked_meth.call_count > 0 expected = getattr(ds, method)(**indexer).load() xrt.assert_identical(result, expected) From 6da81ce0980854cd2bbe655900da14fbe6018580 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Jun 2025 08:17:48 +0000 Subject: [PATCH 047/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index d5b6c6aee0d..c575f2ce136 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -205,7 +205,7 @@ async def test_indexing( target_class, method_name = zarr_class_and_method original_method = getattr(target_class, method_name) - + with patch.object( target_class, method_name, wraps=original_method, autospec=True ) as mocked_meth: From 7972164feb00b47e884a0981397429335ea12003 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 15:36:15 +0700 Subject: [PATCH 048/112] add degenerate test case of no indexing --- xarray/tests/test_async.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index c575f2ce136..a8ec3665e27 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -184,6 +184,7 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: @pytest.mark.parametrize( "indexer, zarr_class_and_method", [ + ({}, (zarr.AsyncArray, "getitem")), ({"x": 2}, (zarr.AsyncArray, 
"getitem")), ({"x": slice(2, 4)}, (zarr.AsyncArray, "getitem")), ({"x": [2, 3]}, (zarr.core.indexing.AsyncOIndex, "getitem")), @@ -195,7 +196,7 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: (zarr.core.indexing.AsyncVIndex, "getitem"), ), ], - ids=["basic-int", "basic-slice", "outer", "vectorized"], + ids=["no-indexing", "basic-int", "basic-slice", "outer", "vectorized"], ) async def test_indexing( self, memorystore, method, indexer, zarr_class_and_method @@ -203,6 +204,7 @@ async def test_indexing( # TODO we don't need a LatencyStore for this test latencystore = LatencyStore(memorystore, latency=0.0) + # each type of indexing ends up calling a different zarr indexing method target_class, method_name = zarr_class_and_method original_method = getattr(target_class, method_name) From 618424a7154ef71e594bbdd8620b6c9597f390ff Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 16:34:08 +0700 Subject: [PATCH 049/112] factor out the Latency part of LatencyStore --- xarray/tests/test_async.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index a8ec3665e27..a91fbfbf2fe 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -104,6 +104,12 @@ def memorystore() -> "MemoryStore": return memorystore +@pytest.fixture +def store(memorystore) -> "zarr.abc.Store": + # TODO we shouldn't need a LatencyStore at all for the patched tests, but we currently use it just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 + return LatencyStore(memorystore, latency=0.0) + + class AsyncTimer: """Context manager for timing async operations and making assertions about their execution time.""" @@ -199,11 +205,8 @@ async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: ids=["no-indexing", "basic-int", "basic-slice", "outer", "vectorized"], ) async def test_indexing( - self, 
memorystore, method, indexer, zarr_class_and_method + self, store, method, indexer, zarr_class_and_method ) -> None: - # TODO we don't need a LatencyStore for this test - latencystore = LatencyStore(memorystore, latency=0.0) - # each type of indexing ends up calling a different zarr indexing method target_class, method_name = zarr_class_and_method original_method = getattr(target_class, method_name) @@ -212,7 +215,10 @@ async def test_indexing( target_class, method_name, wraps=original_method, autospec=True ) as mocked_meth: ds = xr.open_zarr( - latencystore, zarr_format=3, consolidated=False, chunks=None + store, + zarr_format=3, + consolidated=False, + chunks=None, ) # TODO we're not actually testing that these indexing methods are not blocking... From cd97481aeec6d072ab8763c77e2aaf191cc21f98 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 18:31:22 +0700 Subject: [PATCH 050/112] use mocks in multiple objects test --- xarray/tests/test_async.py | 41 ++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index a91fbfbf2fe..9e3c613ab73 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -2,7 +2,7 @@ import time from collections.abc import Iterable from contextlib import asynccontextmanager -from typing import TypeVar +from typing import Literal, TypeVar from unittest.mock import patch import numpy as np @@ -105,7 +105,7 @@ def memorystore() -> "MemoryStore": @pytest.fixture -def store(memorystore) -> "zarr.abc.Store": +def store(memorystore) -> "zarr.abc.store.Store": # TODO we shouldn't need a LatencyStore at all for the patched tests, but we currently use it just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 return LatencyStore(memorystore, latency=0.0) @@ -128,6 +128,20 @@ async def measure(self): self.total_time = self.end_time - self.start_time +def get_xr_obj( + 
store: "zarr.abc.store.Store", cls_name: Literal["Variable", "DataArray", "Dataset"] +): + ds = xr.open_zarr(store, zarr_format=3, consolidated=False, chunks=None) + + match cls_name: + case "Variable": + return ds["foo"].variable + case "DataArray": + return ds["foo"] + case "Dataset": + return ds + + @requires_zarr_v3 @pytest.mark.asyncio class TestAsyncLoad: @@ -172,20 +186,31 @@ async def test_concurrent_load_multiple_variables(self, memorystore) -> None: total_time=timer.total_time, latency=self.LATENCY, n_loads=2 ) - async def test_concurrent_load_multiple_objects(self, xr_obj) -> None: + # TODO apply this parametrization to the other test too? + @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) + async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 - async with AsyncTimer().measure() as timer: + target_class = zarr.AsyncArray + method_name = "getitem" + original_method = getattr(target_class, method_name) + + with patch.object( + target_class, method_name, wraps=original_method, autospec=True + ) as mocked_meth: + xr_obj = get_xr_obj(store, cls_name) + + # TODO we're not actually testing that these indexing methods are not blocking... 
coros = [xr_obj.load_async() for _ in range(N_OBJECTS)] results = await asyncio.gather(*coros) + mocked_meth.assert_called() + assert mocked_meth.call_count >= N_OBJECTS + mocked_meth.assert_awaited() + for result in results: xrt.assert_identical(result, xr_obj.load()) - self.assert_time_as_expected( - total_time=timer.total_time, latency=self.LATENCY, n_loads=N_OBJECTS - ) - @pytest.mark.parametrize("method", ["sel", "isel"]) @pytest.mark.parametrize( "indexer, zarr_class_and_method", From 75abdec7174e4eed8417177022a34386bd254201 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 18:59:38 +0700 Subject: [PATCH 051/112] use mocks in multiple variables test --- xarray/tests/test_async.py | 72 ++++++++++---------------------------- 1 file changed, 19 insertions(+), 53 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 9e3c613ab73..04d674bd13d 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -110,24 +110,6 @@ def store(memorystore) -> "zarr.abc.store.Store": return LatencyStore(memorystore, latency=0.0) -class AsyncTimer: - """Context manager for timing async operations and making assertions about their execution time.""" - - start_time: float - end_time: float - total_time: float - - @asynccontextmanager - async def measure(self): - """Measure the execution time of the async code within this context.""" - self.start_time = time.time() - try: - yield self - finally: - self.end_time = time.time() - self.total_time = self.end_time - self.start_time - - def get_xr_obj( store: "zarr.abc.store.Store", cls_name: Literal["Variable", "DataArray", "Dataset"] ): @@ -145,52 +127,36 @@ def get_xr_obj( @requires_zarr_v3 @pytest.mark.asyncio class TestAsyncLoad: - LATENCY: float = 1.0 - - @pytest.fixture(params=["var", "ds", "da"]) - def xr_obj(self, request, memorystore) -> xr.Dataset | xr.DataArray | xr.Variable: - latencystore = LatencyStore(memorystore, latency=self.LATENCY) - ds = 
xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) - - match request.param: - case "var": - return ds["foo"].variable - case "da": - return ds["foo"] - case "ds": - return ds - - def assert_time_as_expected( - self, total_time: float, latency: float, n_loads: int - ) -> None: - assert total_time > latency # Cannot possibly be quicker than this - assert ( - total_time < latency * n_loads - ) # If this isn't true we're gaining nothing from async - assert ( - abs(total_time - latency) < 2.0 - ) # Should take approximately `latency` seconds, but allow some buffer - - async def test_concurrent_load_multiple_variables(self, memorystore) -> None: - latencystore = LatencyStore(memorystore, latency=self.LATENCY) - ds = xr.open_zarr(latencystore, zarr_format=3, consolidated=False, chunks=None) + async def test_concurrent_load_multiple_variables(self, store) -> None: + target_class = zarr.AsyncArray + method_name = "getitem" + original_method = getattr(target_class, method_name) # TODO up the number of variables in the dataset? - async with AsyncTimer().measure() as timer: + # the coordinate variable is not lazy + N_LAZY_VARS = 1 + + with patch.object( + target_class, method_name, wraps=original_method, autospec=True + ) as mocked_meth: + # blocks upon loading the coordinate variables here + ds = xr.open_zarr(store, zarr_format=3, consolidated=False, chunks=None) + + # TODO we're not actually testing that these indexing methods are not blocking... result_ds = await ds.load_async() - xrt.assert_identical(result_ds, ds.load()) + mocked_meth.assert_called() + assert mocked_meth.call_count >= N_LAZY_VARS + mocked_meth.assert_awaited() - # 2 because there are 2 lazy variables in the dataset - self.assert_time_as_expected( - total_time=timer.total_time, latency=self.LATENCY, n_loads=2 - ) + xrt.assert_identical(result_ds, ds.load()) # TODO apply this parametrization to the other test too? 
@pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 + # factor this mocking out of all tests as a fixture? target_class = zarr.AsyncArray method_name = "getitem" original_method = getattr(target_class, method_name) From 74093723c7445021c615320ec7039fb5dba4aa86 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 19:04:32 +0700 Subject: [PATCH 052/112] trim latencystore down to just what's needed to dodge https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 --- xarray/tests/test_async.py | 46 +++++++------------------------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 04d674bd13d..26b039675dd 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -1,7 +1,4 @@ import asyncio -import time -from collections.abc import Iterable -from contextlib import asynccontextmanager from typing import Literal, TypeVar from unittest.mock import patch @@ -21,37 +18,20 @@ T_Store = TypeVar("T_Store", bound=Store) - class LatencyStore(WrapperStore[T_Store]): - """Works the same way as the zarr LoggingStore""" + class ReadOnlyStore(WrapperStore[T_Store]): + """ + We shouldn't need this - but we currently do just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 - latency: float + Works the same way as the zarr LoggingStore. + """ - # TODO only have to add this because of dumb behaviour in zarr where it raises with "ValueError: Store is not read-only but mode is 'r'" read_only = True def __init__( self, store: T_Store, - latency: float = 0.0, ) -> None: - """ - Store wrapper that adds artificial latency to each get call. - - Parameters - ---------- - store : Store - Store to wrap - latency : float - Amount of artificial latency to add to each get call, in seconds. 
- """ super().__init__(store) - self.latency = latency - - def __str__(self) -> str: - return f"latency-{self._store}" - - def __repr__(self) -> str: - return f"LatencyStore({self._store.__class__.__name__}, '{self._store}', latency={self.latency})" async def get( self, @@ -59,22 +39,12 @@ async def get( prototype: BufferPrototype, byte_range: ByteRequest | None = None, ) -> Buffer | None: - await asyncio.sleep(self.latency) return await self._store.get( key=key, prototype=prototype, byte_range=byte_range ) - async def get_partial_values( - self, - prototype: BufferPrototype, - key_ranges: Iterable[tuple[str, ByteRequest | None]], - ) -> list[Buffer | None]: - await asyncio.sleep(self.latency) - return await self._store.get_partial_values( - prototype=prototype, key_ranges=key_ranges - ) else: - LatencyStore = {} + ReadOnlyStore = {} @pytest.fixture @@ -106,8 +76,8 @@ def memorystore() -> "MemoryStore": @pytest.fixture def store(memorystore) -> "zarr.abc.store.Store": - # TODO we shouldn't need a LatencyStore at all for the patched tests, but we currently use it just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 - return LatencyStore(memorystore, latency=0.0) + # TODO we shouldn't this Store at all for the patched tests, but we currently use it just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 + return ReadOnlyStore(memorystore) def get_xr_obj( From 1f7903441bee65c58a8b745ca7a43aee501d2127 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 19:24:20 +0700 Subject: [PATCH 053/112] parametrizing indexing test over xarray classes --- xarray/tests/test_async.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 26b039675dd..8392ee05855 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -121,12 +121,10 @@ async def 
test_concurrent_load_multiple_variables(self, store) -> None: xrt.assert_identical(result_ds, ds.load()) - # TODO apply this parametrization to the other test too? @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 - # factor this mocking out of all tests as a fixture? target_class = zarr.AsyncArray method_name = "getitem" original_method = getattr(target_class, method_name) @@ -147,6 +145,7 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: for result in results: xrt.assert_identical(result, xr_obj.load()) + @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) @pytest.mark.parametrize("method", ["sel", "isel"]) @pytest.mark.parametrize( "indexer, zarr_class_and_method", @@ -166,8 +165,16 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: ids=["no-indexing", "basic-int", "basic-slice", "outer", "vectorized"], ) async def test_indexing( - self, store, method, indexer, zarr_class_and_method + self, + store, + cls_name, + method, + indexer, + zarr_class_and_method, ) -> None: + if cls_name == "Variable" and method == "sel": + pytest.skip("Variable doesn't have a .sel method") + # each type of indexing ends up calling a different zarr indexing method target_class, method_name = zarr_class_and_method original_method = getattr(target_class, method_name) @@ -175,19 +182,14 @@ async def test_indexing( with patch.object( target_class, method_name, wraps=original_method, autospec=True ) as mocked_meth: - ds = xr.open_zarr( - store, - zarr_format=3, - consolidated=False, - chunks=None, - ) + xr_obj = get_xr_obj(store, cls_name) # TODO we're not actually testing that these indexing methods are not blocking... 
- result = await getattr(ds, method)(**indexer).load_async() + result = await getattr(xr_obj, method)(**indexer).load_async() mocked_meth.assert_called() mocked_meth.assert_awaited() assert mocked_meth.call_count > 0 - expected = getattr(ds, method)(**indexer).load() + expected = getattr(xr_obj, method)(**indexer).load() xrt.assert_identical(result, expected) From 9881e8dd0f08ed35419d449ac3e3a17aef8123b6 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 20 Jun 2025 23:53:20 +0700 Subject: [PATCH 054/112] ensure we actually test vectorized indexing for Variable --- xarray/tests/test_async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 8392ee05855..3d514dae113 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -156,8 +156,8 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: ({"x": [2, 3]}, (zarr.core.indexing.AsyncOIndex, "getitem")), ( { - "x": xr.DataArray([2, 3], dims="points"), - "y": xr.DataArray([2, 3], dims="points"), + "x": xr.Variable(data=[2, 3], dims="points"), + "y": xr.Variable(data=[2, 3], dims="points"), }, (zarr.core.indexing.AsyncVIndex, "getitem"), ), From 119779873ab44e9d6460e5a16a85dd16f97bf449 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Sat, 21 Jun 2025 00:31:35 +0700 Subject: [PATCH 055/112] use create_test_data --- xarray/tests/test_async.py | 70 +++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 3d514dae113..8a3ca76e16d 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -2,12 +2,12 @@ from typing import Literal, TypeVar from unittest.mock import patch -import numpy as np import pytest import xarray as xr import xarray.testing as xrt from xarray.tests import has_zarr_v3, requires_zarr_v3 +from xarray.tests.test_dataset import create_test_data if 
has_zarr_v3: import zarr @@ -50,26 +50,9 @@ async def get( @pytest.fixture def memorystore() -> "MemoryStore": memorystore = zarr.storage.MemoryStore({}) - z1 = zarr.create_array( - store=memorystore, - name="foo", - shape=(10, 10), - chunks=(5, 5), - dtype="f4", - dimension_names=["x", "y"], - attributes={"add_offset": 1, "scale_factor": 2}, - ) - z1[:, :] = np.random.random((10, 10)) - - z2 = zarr.create_array( - store=memorystore, - name="x", - shape=(10,), - chunks=(5), - dtype="f4", - dimension_names=["x"], - ) - z2[:] = np.arange(10) + + ds = create_test_data() + ds.to_zarr(memorystore, zarr_format=3, consolidated=False) return memorystore @@ -87,9 +70,9 @@ def get_xr_obj( match cls_name: case "Variable": - return ds["foo"].variable + return ds["var1"].variable case "DataArray": - return ds["foo"] + return ds["var1"] case "Dataset": return ds @@ -146,23 +129,46 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: xrt.assert_identical(result, xr_obj.load()) @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) - @pytest.mark.parametrize("method", ["sel", "isel"]) @pytest.mark.parametrize( - "indexer, zarr_class_and_method", + "indexer, method, zarr_class_and_method", [ - ({}, (zarr.AsyncArray, "getitem")), - ({"x": 2}, (zarr.AsyncArray, "getitem")), - ({"x": slice(2, 4)}, (zarr.AsyncArray, "getitem")), - ({"x": [2, 3]}, (zarr.core.indexing.AsyncOIndex, "getitem")), + ({}, "sel", (zarr.AsyncArray, "getitem")), + ({}, "isel", (zarr.AsyncArray, "getitem")), + ({"dim2": 1.0}, "sel", (zarr.AsyncArray, "getitem")), + ({"dim2": 2}, "isel", (zarr.AsyncArray, "getitem")), + ({"dim2": slice(1.0, 3.0)}, "sel", (zarr.AsyncArray, "getitem")), + ({"dim2": slice(1, 3)}, "isel", (zarr.AsyncArray, "getitem")), + ({"dim2": [1.0, 3.0]}, "sel", (zarr.core.indexing.AsyncOIndex, "getitem")), + ({"dim2": [1, 3]}, "isel", (zarr.core.indexing.AsyncOIndex, "getitem")), ( { - "x": xr.Variable(data=[2, 3], dims="points"), - "y": 
xr.Variable(data=[2, 3], dims="points"), + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1.0, 2.0], dims="points"), }, + "sel", (zarr.core.indexing.AsyncVIndex, "getitem"), ), + ( + { + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1, 3], dims="points"), + }, + "isel", + (zarr.core.indexing.AsyncVIndex, "getitem"), + ), + ], + ids=[ + "no-indexing-sel", + "no-indexing-isel", + "basic-int-sel", + "basic-int-isel", + "basic-slice-sel", + "basic-slice-isel", + "outer-sel", + "outer-isel", + "vectorized-sel", + "vectorized-isel", ], - ids=["no-indexing", "basic-int", "basic-slice", "outer", "vectorized"], ) async def test_indexing( self, From 642fd48ac71527dd7f8c47259a0a88ab3d03eea9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 20 Jun 2025 14:21:22 -0600 Subject: [PATCH 056/112] add @pytest.mark.asyncio --- xarray/tests/test_indexing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index d308844c6fa..010987337a6 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -490,6 +490,7 @@ def test_sub_array(self) -> None: assert isinstance(child.array, indexing.NumpyIndexingAdapter) assert isinstance(wrapped.array, indexing.LazilyIndexedArray) + @pytest.mark.asyncio async def test_async_wrapper(self) -> None: original = indexing.LazilyIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) @@ -497,6 +498,7 @@ async def test_async_wrapper(self) -> None: assert_array_equal(wrapped, np.arange(10)) assert isinstance(wrapped.array, indexing.NumpyIndexingAdapter) + @pytest.mark.asyncio async def test_async_sub_array(self) -> None: original = indexing.LazilyIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) From f22b56b98ba19168e48ba1945aeb1a77a8b520a3 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 22 Jul 2025 13:33:40 +0100 Subject: [PATCH 057/112] remove outdated 
readonly_store --- xarray/tests/test_async.py | 43 +++----------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 8a3ca76e16d..ca30c2bb81f 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -11,44 +11,13 @@ if has_zarr_v3: import zarr - from zarr.abc.store import ByteRequest, Store - from zarr.core.buffer import Buffer, BufferPrototype - from zarr.storage import MemoryStore - from zarr.storage._wrapper import WrapperStore - - T_Store = TypeVar("T_Store", bound=Store) - - class ReadOnlyStore(WrapperStore[T_Store]): - """ - We shouldn't need this - but we currently do just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 - - Works the same way as the zarr LoggingStore. - """ - - read_only = True - - def __init__( - self, - store: T_Store, - ) -> None: - super().__init__(store) - - async def get( - self, - key: str, - prototype: BufferPrototype, - byte_range: ByteRequest | None = None, - ) -> Buffer | None: - return await self._store.get( - key=key, prototype=prototype, byte_range=byte_range - ) - else: - ReadOnlyStore = {} + # TODO what should we test when async loading not available? 
+ pytest.mark.skip(reason="async loading from zarr requires zarr-python v3") @pytest.fixture -def memorystore() -> "MemoryStore": +def store() -> "MemoryStore": memorystore = zarr.storage.MemoryStore({}) ds = create_test_data() @@ -57,12 +26,6 @@ def memorystore() -> "MemoryStore": return memorystore -@pytest.fixture -def store(memorystore) -> "zarr.abc.store.Store": - # TODO we shouldn't this Store at all for the patched tests, but we currently use it just as a way around https://github.com/zarr-developers/zarr-python/issues/3105#issuecomment-2990367167 - return ReadOnlyStore(memorystore) - - def get_xr_obj( store: "zarr.abc.store.Store", cls_name: Literal["Variable", "DataArray", "Dataset"] ): From ebfede5d1686939885cc11587ed3a474ec87f7b0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:34:23 +0000 Subject: [PATCH 058/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index ca30c2bb81f..5e79df2cf2d 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -1,5 +1,5 @@ import asyncio -from typing import Literal, TypeVar +from typing import Literal from unittest.mock import patch import pytest From 87c7fcb519ec8c8ab8744f2e50d3c3d1ee76f049 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 31 Jul 2025 12:22:11 +0100 Subject: [PATCH 059/112] enable tests to run when recent version of zarr-python is not available --- xarray/tests/__init__.py | 1 + xarray/tests/test_async.py | 45 +++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4de9e422761..9bfd943e9cd 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -143,6 +143,7 @@ def 
_importorskip( requires_zarr_v3_dtypes = pytest.mark.skipif( not has_zarr_v3_dtypes, reason="requires zarr>3.1.0" ) +has_zarr_v3_async_index, requires_zarr_v3_async_index = _importorskip("zarr", "3.1.2") has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 5e79df2cf2d..6f57a94aee1 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -1,20 +1,21 @@ import asyncio -from typing import Literal +from importlib import import_module +from typing import Any, Literal from unittest.mock import patch import pytest import xarray as xr import xarray.testing as xrt -from xarray.tests import has_zarr_v3, requires_zarr_v3 +from xarray.tests import has_zarr_v3_async_index, requires_zarr_v3_async_index from xarray.tests.test_dataset import create_test_data -if has_zarr_v3: +if has_zarr_v3_async_index: import zarr else: # TODO what should we test when async loading not available? 
pytest.mark.skip(reason="async loading from zarr requires zarr-python v3") - + zarr = None @pytest.fixture def store() -> "MemoryStore": @@ -40,11 +41,18 @@ def get_xr_obj( return ds -@requires_zarr_v3 +def _resolve_class_from_string(class_path: str) -> type[Any]: + """Resolve a string class path like 'zarr.AsyncArray' to the actual class.""" + module_path, class_name = class_path.rsplit('.', 1) + module = import_module(module_path) + return getattr(module, class_name) + + +@requires_zarr_v3_async_index @pytest.mark.asyncio class TestAsyncLoad: async def test_concurrent_load_multiple_variables(self, store) -> None: - target_class = zarr.AsyncArray + target_class = _resolve_class_from_string("zarr.AsyncArray") method_name = "getitem" original_method = getattr(target_class, method_name) @@ -71,7 +79,7 @@ async def test_concurrent_load_multiple_variables(self, store) -> None: async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 - target_class = zarr.AsyncArray + target_class = _resolve_class_from_string("zarr.AsyncArray") method_name = "getitem" original_method = getattr(target_class, method_name) @@ -95,21 +103,21 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: @pytest.mark.parametrize( "indexer, method, zarr_class_and_method", [ - ({}, "sel", (zarr.AsyncArray, "getitem")), - ({}, "isel", (zarr.AsyncArray, "getitem")), - ({"dim2": 1.0}, "sel", (zarr.AsyncArray, "getitem")), - ({"dim2": 2}, "isel", (zarr.AsyncArray, "getitem")), - ({"dim2": slice(1.0, 3.0)}, "sel", (zarr.AsyncArray, "getitem")), - ({"dim2": slice(1, 3)}, "isel", (zarr.AsyncArray, "getitem")), - ({"dim2": [1.0, 3.0]}, "sel", (zarr.core.indexing.AsyncOIndex, "getitem")), - ({"dim2": [1, 3]}, "isel", (zarr.core.indexing.AsyncOIndex, "getitem")), + ({}, "sel", ("zarr.AsyncArray", "getitem")), + ({}, "isel", ("zarr.AsyncArray", "getitem")), + ({"dim2": 1.0}, "sel", ("zarr.AsyncArray", "getitem")), + ({"dim2": 2}, "isel", 
("zarr.AsyncArray", "getitem")), + ({"dim2": slice(1.0, 3.0)}, "sel", ("zarr.AsyncArray", "getitem")), + ({"dim2": slice(1, 3)}, "isel", ("zarr.AsyncArray", "getitem")), + ({"dim2": [1.0, 3.0]}, "sel", ("zarr.core.indexing.AsyncOIndex", "getitem")), + ({"dim2": [1, 3]}, "isel", ("zarr.core.indexing.AsyncOIndex", "getitem")), ( { "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1.0, 2.0], dims="points"), }, "sel", - (zarr.core.indexing.AsyncVIndex, "getitem"), + ("zarr.core.indexing.AsyncVIndex", "getitem"), ), ( { @@ -117,7 +125,7 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: "dim2": xr.Variable(data=[1, 3], dims="points"), }, "isel", - (zarr.core.indexing.AsyncVIndex, "getitem"), + ("zarr.core.indexing.AsyncVIndex", "getitem"), ), ], ids=[ @@ -145,7 +153,8 @@ async def test_indexing( pytest.skip("Variable doesn't have a .sel method") # each type of indexing ends up calling a different zarr indexing method - target_class, method_name = zarr_class_and_method + target_class_path, method_name = zarr_class_and_method + target_class = _resolve_class_from_string(target_class_path) original_method = getattr(target_class, method_name) with patch.object( From 67c77cc8c8bbdef1d4b11077dc0f2ba73d2f642e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:24:44 +0000 Subject: [PATCH 060/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_async.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 6f57a94aee1..82ab12ce5a7 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -17,6 +17,7 @@ pytest.mark.skip(reason="async loading from zarr requires zarr-python v3") zarr = None + @pytest.fixture def store() -> "MemoryStore": memorystore = 
zarr.storage.MemoryStore({}) @@ -43,7 +44,7 @@ def get_xr_obj( def _resolve_class_from_string(class_path: str) -> type[Any]: """Resolve a string class path like 'zarr.AsyncArray' to the actual class.""" - module_path, class_name = class_path.rsplit('.', 1) + module_path, class_name = class_path.rsplit(".", 1) module = import_module(module_path) return getattr(module, class_name) @@ -109,7 +110,11 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: ({"dim2": 2}, "isel", ("zarr.AsyncArray", "getitem")), ({"dim2": slice(1.0, 3.0)}, "sel", ("zarr.AsyncArray", "getitem")), ({"dim2": slice(1, 3)}, "isel", ("zarr.AsyncArray", "getitem")), - ({"dim2": [1.0, 3.0]}, "sel", ("zarr.core.indexing.AsyncOIndex", "getitem")), + ( + {"dim2": [1.0, 3.0]}, + "sel", + ("zarr.core.indexing.AsyncOIndex", "getitem"), + ), ({"dim2": [1, 3]}, "isel", ("zarr.core.indexing.AsyncOIndex", "getitem")), ( { From 0d4bb0f371969b4649a95616b1ed4294cbcf5688 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 1 Aug 2025 11:04:48 +0100 Subject: [PATCH 061/112] separate tests to only run on correct versions of zarr --- xarray/tests/__init__.py | 7 ++++++- xarray/tests/test_async.py | 42 ++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 9bfd943e9cd..9437f10f979 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -131,6 +131,7 @@ def _importorskip( has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.1.0") +has_zarr_v3_async_index, requires_zarr_v3_async_index = _importorskip("zarr", "3.1.2") if has_zarr_v3: import zarr @@ -139,11 +140,15 @@ def _importorskip( # installing from git main is giving me a lower version than the # most recently released zarr has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") + 
has_zarr_v3_async_index = hasattr(zarr.AsyncArray, "oindex") requires_zarr_v3_dtypes = pytest.mark.skipif( not has_zarr_v3_dtypes, reason="requires zarr>3.1.0" ) -has_zarr_v3_async_index, requires_zarr_v3_async_index = _importorskip("zarr", "3.1.2") + requires_zarr_v3_async_index = pytest.mark.skipif( + not has_zarr_v3_async_index, reason="requires zarr>3.1.1" + ) + has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 6f57a94aee1..fb27fc4a55d 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -7,18 +7,17 @@ import xarray as xr import xarray.testing as xrt -from xarray.tests import has_zarr_v3_async_index, requires_zarr_v3_async_index +from xarray.tests import has_zarr, requires_zarr, has_zarr_v3_async_index, requires_zarr_v3_async_index from xarray.tests.test_dataset import create_test_data -if has_zarr_v3_async_index: +if has_zarr: import zarr else: - # TODO what should we test when async loading not available? 
- pytest.mark.skip(reason="async loading from zarr requires zarr-python v3") zarr = None + @pytest.fixture -def store() -> "MemoryStore": +def store() -> "zarr.storage.MemoryStore": memorystore = zarr.storage.MemoryStore({}) ds = create_test_data() @@ -43,14 +42,14 @@ def get_xr_obj( def _resolve_class_from_string(class_path: str) -> type[Any]: """Resolve a string class path like 'zarr.AsyncArray' to the actual class.""" - module_path, class_name = class_path.rsplit('.', 1) + module_path, class_name = class_path.rsplit(".", 1) module = import_module(module_path) return getattr(module, class_name) -@requires_zarr_v3_async_index @pytest.mark.asyncio class TestAsyncLoad: + @requires_zarr_v3_async_index async def test_concurrent_load_multiple_variables(self, store) -> None: target_class = _resolve_class_from_string("zarr.AsyncArray") method_name = "getitem" @@ -75,6 +74,7 @@ async def test_concurrent_load_multiple_variables(self, store) -> None: xrt.assert_identical(result_ds, ds.load()) + @requires_zarr_v3_async_index @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 @@ -99,6 +99,7 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: for result in results: xrt.assert_identical(result, xr_obj.load()) + @requires_zarr_v3_async_index @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) @pytest.mark.parametrize( "indexer, method, zarr_class_and_method", @@ -109,7 +110,11 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: ({"dim2": 2}, "isel", ("zarr.AsyncArray", "getitem")), ({"dim2": slice(1.0, 3.0)}, "sel", ("zarr.AsyncArray", "getitem")), ({"dim2": slice(1, 3)}, "isel", ("zarr.AsyncArray", "getitem")), - ({"dim2": [1.0, 3.0]}, "sel", ("zarr.core.indexing.AsyncOIndex", "getitem")), + ( + {"dim2": [1.0, 3.0]}, + "sel", + ("zarr.core.indexing.AsyncOIndex", "getitem"), + ), 
({"dim2": [1, 3]}, "isel", ("zarr.core.indexing.AsyncOIndex", "getitem")), ( { @@ -171,3 +176,24 @@ async def test_indexing( expected = getattr(xr_obj, method)(**indexer).load() xrt.assert_identical(result, expected) + + # TODO generalize store to a v2 store? + @requires_zarr + @pytest.mark.skipif(has_zarr_v3_async_index, reason="newer version of zarr has async indexing") + @pytest.mark.parametrize( + "indexer", + [ + {"dim2": [1, 3]}, # tests oindexing + { # test vindexing + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1, 3], dims="points"), + }, + ], + ) + async def test_raise_on_older_zarr_version(self, store, indexer): + """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" + + ds = xr.open_zarr(store, zarr_format=3, consolidated=False, chunks=None) + + with pytest.raises(NotImplementedError, match="async indexing"): + await ds.isel(**indexer).load_async() From ed2c8088e6ced7c2baebdccd843615d123b62220 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 1 Aug 2025 11:21:20 +0100 Subject: [PATCH 062/112] clear error message if async oindexing not available --- xarray/backends/zarr.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f39a5400e3f..28097b065ae 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -180,6 +180,14 @@ def encode_zarr_attr_value(value): return encoded +def has_zarr_async_index() -> bool: + try: + import zarr + return hasattr(zarr.AsyncArray, 'oindex') + except (ImportError, AttributeError): + return False + + class ZarrArrayWrapper(BackendArray): __slots__ = ("_array", "dtype", "shape") @@ -216,14 +224,21 @@ def _getitem(self, key): return self._array[key] async def _async_getitem(self, key): + # TODO requires zarr-python v3.0.0 async_array = self._array._async_array return await async_array.getitem(key) async def _async_oindex(self, key): + if not 
has_zarr_async_index(): + raise NotImplementedError("For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required") + async_array = self._array._async_array return await async_array.oindex.getitem(key) async def _async_vindex(self, key): + if not has_zarr_async_index(): + raise NotImplementedError("For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required") + async_array = self._array._async_array return await async_array.vindex.getitem(key) From df32020706e8355aff736f80b88fb5d89a9c79ac Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 1 Aug 2025 12:47:32 +0100 Subject: [PATCH 063/112] parametrize over zarr_format --- xarray/tests/test_async.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index fb27fc4a55d..ee8679b13d0 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -9,6 +9,8 @@ import xarray.testing as xrt from xarray.tests import has_zarr, requires_zarr, has_zarr_v3_async_index, requires_zarr_v3_async_index from xarray.tests.test_dataset import create_test_data +from xarray.tests.test_backends import ZARR_FORMATS + if has_zarr: import zarr @@ -16,12 +18,12 @@ zarr = None -@pytest.fixture -def store() -> "zarr.storage.MemoryStore": +@pytest.fixture(scope="module", params=ZARR_FORMATS) +def store(request) -> "zarr.storage.MemoryStore": memorystore = zarr.storage.MemoryStore({}) ds = create_test_data() - ds.to_zarr(memorystore, zarr_format=3, consolidated=False) + ds.to_zarr(memorystore, zarr_format=request.param, consolidated=False) return memorystore @@ -29,7 +31,7 @@ def store() -> "zarr.storage.MemoryStore": def get_xr_obj( store: "zarr.abc.store.Store", cls_name: Literal["Variable", "DataArray", "Dataset"] ): - ds = xr.open_zarr(store, zarr_format=3, consolidated=False, chunks=None) + ds = xr.open_zarr(store, consolidated=False, chunks=None) match cls_name: case "Variable": @@ -63,7 +65,7 @@ async 
def test_concurrent_load_multiple_variables(self, store) -> None: target_class, method_name, wraps=original_method, autospec=True ) as mocked_meth: # blocks upon loading the coordinate variables here - ds = xr.open_zarr(store, zarr_format=3, consolidated=False, chunks=None) + ds = xr.open_zarr(store, consolidated=False, chunks=None) # TODO we're not actually testing that these indexing methods are not blocking... result_ds = await ds.load_async() @@ -177,7 +179,6 @@ async def test_indexing( expected = getattr(xr_obj, method)(**indexer).load() xrt.assert_identical(result, expected) - # TODO generalize store to a v2 store? @requires_zarr @pytest.mark.skipif(has_zarr_v3_async_index, reason="newer version of zarr has async indexing") @pytest.mark.parametrize( @@ -193,7 +194,9 @@ async def test_indexing( async def test_raise_on_older_zarr_version(self, store, indexer): """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" - ds = xr.open_zarr(store, zarr_format=3, consolidated=False, chunks=None) + ds = xr.open_zarr(store, consolidated=False, chunks=None) with pytest.raises(NotImplementedError, match="async indexing"): await ds.isel(**indexer).load_async() + + # TODO also test raising informative error if attempting to do basic async indexing with 3.0.0 <= zarr <= 3.1.1? 
From d3e6a64a06fedfc3acb7571fd5ee89758aeac917 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Aug 2025 11:48:49 +0000 Subject: [PATCH 064/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/zarr.py | 11 ++++++++--- xarray/tests/test_async.py | 22 ++++++++++++++-------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 28097b065ae..0645a0fae29 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -183,7 +183,8 @@ def encode_zarr_attr_value(value): def has_zarr_async_index() -> bool: try: import zarr - return hasattr(zarr.AsyncArray, 'oindex') + + return hasattr(zarr.AsyncArray, "oindex") except (ImportError, AttributeError): return False @@ -230,14 +231,18 @@ async def _async_getitem(self, key): async def _async_oindex(self, key): if not has_zarr_async_index(): - raise NotImplementedError("For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required") + raise NotImplementedError( + "For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required" + ) async_array = self._array._async_array return await async_array.oindex.getitem(key) async def _async_vindex(self, key): if not has_zarr_async_index(): - raise NotImplementedError("For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required") + raise NotImplementedError( + "For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required" + ) async_array = self._array._async_array return await async_array.vindex.getitem(key) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index ee8679b13d0..5097108c5a7 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -7,10 +7,14 @@ import xarray as xr import xarray.testing as xrt -from xarray.tests import has_zarr, requires_zarr, 
has_zarr_v3_async_index, requires_zarr_v3_async_index -from xarray.tests.test_dataset import create_test_data +from xarray.tests import ( + has_zarr, + has_zarr_v3_async_index, + requires_zarr, + requires_zarr_v3_async_index, +) from xarray.tests.test_backends import ZARR_FORMATS - +from xarray.tests.test_dataset import create_test_data if has_zarr: import zarr @@ -180,15 +184,17 @@ async def test_indexing( xrt.assert_identical(result, expected) @requires_zarr - @pytest.mark.skipif(has_zarr_v3_async_index, reason="newer version of zarr has async indexing") + @pytest.mark.skipif( + has_zarr_v3_async_index, reason="newer version of zarr has async indexing" + ) @pytest.mark.parametrize( "indexer", [ {"dim2": [1, 3]}, # tests oindexing - { # test vindexing - "dim1": xr.Variable(data=[2, 3], dims="points"), - "dim2": xr.Variable(data=[1, 3], dims="points"), - }, + { # test vindexing + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1, 3], dims="points"), + }, ], ) async def test_raise_on_older_zarr_version(self, store, indexer): From 4570aed94de25eefed5526ba931d286155312155 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 1 Aug 2025 16:15:35 +0100 Subject: [PATCH 065/112] add pytest-asyncio to other test CI env --- ci/requirements/bare-min-and-scipy.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/requirements/bare-min-and-scipy.yml b/ci/requirements/bare-min-and-scipy.yml index bb25af67651..d4a61586d82 100644 --- a/ci/requirements/bare-min-and-scipy.yml +++ b/ci/requirements/bare-min-and-scipy.yml @@ -7,6 +7,7 @@ dependencies: - coveralls - pip - pytest + - pytest-asyncio - pytest-cov - pytest-env - pytest-mypy-plugins From 0ec670efe0fcb51ec2d95cdedc3885b169633eba Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Fri, 1 Aug 2025 16:33:08 +0100 Subject: [PATCH 066/112] fix some mypy errors --- xarray/tests/test_async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_async.py 
b/xarray/tests/test_async.py index 5097108c5a7..3dac448de17 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -19,7 +19,7 @@ if has_zarr: import zarr else: - zarr = None + zarr = None # type: ignore[assignment] @pytest.fixture(scope="module", params=ZARR_FORMATS) @@ -27,7 +27,7 @@ def store(request) -> "zarr.storage.MemoryStore": memorystore = zarr.storage.MemoryStore({}) ds = create_test_data() - ds.to_zarr(memorystore, zarr_format=request.param, consolidated=False) + ds.to_zarr(memorystore, zarr_format=request.param, consolidated=False) # type: ignore[call-overload] return memorystore From a3a3b62ac88f1f12652533925219143b0f8bbe31 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 14:53:12 +0100 Subject: [PATCH 067/112] use method directly when possible --- xarray/tests/test_async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 3dac448de17..83886e42579 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -57,7 +57,7 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: class TestAsyncLoad: @requires_zarr_v3_async_index async def test_concurrent_load_multiple_variables(self, store) -> None: - target_class = _resolve_class_from_string("zarr.AsyncArray") + target_class = zarr.AsyncArray method_name = "getitem" original_method = getattr(target_class, method_name) @@ -85,7 +85,7 @@ async def test_concurrent_load_multiple_variables(self, store) -> None: async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 - target_class = _resolve_class_from_string("zarr.AsyncArray") + target_class = zarr.AsyncArray method_name = "getitem" original_method = getattr(target_class, method_name) From a28a6a9e44b4cf4a4fe8398fbd8f17373eeecf96 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 14:57:34 +0100 Subject: [PATCH 068/112] remove repeated API docs from bad merge --- 
doc/api.rst | 1760 --------------------------------------------------- 1 file changed, 1760 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 7cf8e999257..f7bb382e922 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -11,1763 +11,3 @@ and examples, refer to the relevant chapters in the main part of the documentation. See also: :ref:`public-api` and :ref:`api-stability`. - -.. autosummary:: - :toctree: generated/ - - apply_ufunc - align - broadcast - concat - merge - combine_by_coords - combine_nested - where - infer_freq - full_like - zeros_like - ones_like - cov - corr - cross - dot - polyval - map_blocks - show_versions - set_options - get_options - unify_chunks - -Dataset -======= - -Creating a dataset ------------------- - -.. autosummary:: - :toctree: generated/ - - Dataset - decode_cf - -Attributes ----------- - -.. autosummary:: - :toctree: generated/ - - Dataset.dims - Dataset.sizes - Dataset.dtypes - Dataset.data_vars - Dataset.coords - Dataset.attrs - Dataset.encoding - Dataset.indexes - Dataset.xindexes - Dataset.chunks - Dataset.chunksizes - Dataset.nbytes - -Dictionary interface --------------------- - -Datasets implement the mapping interface with keys given by variable names -and values given by ``DataArray`` objects. - -.. autosummary:: - :toctree: generated/ - - Dataset.__getitem__ - Dataset.__setitem__ - Dataset.__delitem__ - Dataset.update - Dataset.get - Dataset.items - Dataset.keys - Dataset.values - -Dataset contents ----------------- - -.. 
autosummary:: - :toctree: generated/ - - Dataset.copy - Dataset.assign - Dataset.assign_coords - Dataset.assign_attrs - Dataset.pipe - Dataset.merge - Dataset.rename - Dataset.rename_vars - Dataset.rename_dims - Dataset.swap_dims - Dataset.expand_dims - Dataset.drop_vars - Dataset.drop_indexes - Dataset.drop_duplicates - Dataset.drop_dims - Dataset.drop_encoding - Dataset.drop_attrs - Dataset.set_coords - Dataset.reset_coords - Dataset.convert_calendar - Dataset.interp_calendar - Dataset.get_index - -Comparisons ------------ - -.. autosummary:: - :toctree: generated/ - - Dataset.equals - Dataset.identical - Dataset.broadcast_equals - -Indexing --------- - -.. autosummary:: - :toctree: generated/ - - Dataset.loc - Dataset.isel - Dataset.sel - Dataset.drop_sel - Dataset.drop_isel - Dataset.head - Dataset.tail - Dataset.thin - Dataset.squeeze - Dataset.interp - Dataset.interp_like - Dataset.reindex - Dataset.reindex_like - Dataset.set_index - Dataset.reset_index - Dataset.set_xindex - Dataset.reorder_levels - Dataset.query - -Missing value handling ----------------------- - -.. autosummary:: - :toctree: generated/ - - Dataset.isnull - Dataset.notnull - Dataset.combine_first - Dataset.count - Dataset.dropna - Dataset.fillna - Dataset.ffill - Dataset.bfill - Dataset.interpolate_na - Dataset.where - Dataset.isin - -Computation ------------ - -.. autosummary:: - :toctree: generated/ - - Dataset.map - Dataset.reduce - Dataset.groupby - Dataset.groupby_bins - Dataset.rolling - Dataset.rolling_exp - Dataset.cumulative - Dataset.weighted - Dataset.coarsen - Dataset.resample - Dataset.diff - Dataset.quantile - Dataset.differentiate - Dataset.integrate - Dataset.map_blocks - Dataset.polyfit - Dataset.curvefit - Dataset.eval - -Aggregation ------------ - -.. 
autosummary:: - :toctree: generated/ - - Dataset.all - Dataset.any - Dataset.argmax - Dataset.argmin - Dataset.count - Dataset.idxmax - Dataset.idxmin - Dataset.max - Dataset.min - Dataset.mean - Dataset.median - Dataset.prod - Dataset.sum - Dataset.std - Dataset.var - Dataset.cumsum - Dataset.cumprod - -ndarray methods ---------------- - -.. autosummary:: - :toctree: generated/ - - Dataset.argsort - Dataset.astype - Dataset.clip - Dataset.conj - Dataset.conjugate - Dataset.imag - Dataset.round - Dataset.real - Dataset.rank - -Reshaping and reorganizing --------------------------- - -.. autosummary:: - :toctree: generated/ - - Dataset.transpose - Dataset.stack - Dataset.unstack - Dataset.to_stacked_array - Dataset.shift - Dataset.roll - Dataset.pad - Dataset.sortby - Dataset.broadcast_like - -DataArray -========= - -.. autosummary:: - :toctree: generated/ - - DataArray - -Attributes ----------- - -.. autosummary:: - :toctree: generated/ - - DataArray.values - DataArray.data - DataArray.coords - DataArray.dims - DataArray.sizes - DataArray.name - DataArray.attrs - DataArray.encoding - DataArray.indexes - DataArray.xindexes - DataArray.chunksizes - -ndarray attributes ------------------- - -.. autosummary:: - :toctree: generated/ - - DataArray.ndim - DataArray.nbytes - DataArray.shape - DataArray.size - DataArray.dtype - DataArray.chunks - - -DataArray contents ------------------- - -.. autosummary:: - :toctree: generated/ - - DataArray.assign_coords - DataArray.assign_attrs - DataArray.pipe - DataArray.rename - DataArray.swap_dims - DataArray.expand_dims - DataArray.drop_vars - DataArray.drop_indexes - DataArray.drop_duplicates - DataArray.drop_encoding - DataArray.drop_attrs - DataArray.reset_coords - DataArray.copy - DataArray.convert_calendar - DataArray.interp_calendar - DataArray.get_index - DataArray.astype - DataArray.item - -Indexing --------- - -.. 
autosummary:: - :toctree: generated/ - - DataArray.__getitem__ - DataArray.__setitem__ - DataArray.loc - DataArray.isel - DataArray.sel - DataArray.drop_sel - DataArray.drop_isel - DataArray.head - DataArray.tail - DataArray.thin - DataArray.squeeze - DataArray.interp - DataArray.interp_like - DataArray.reindex - DataArray.reindex_like - DataArray.set_index - DataArray.reset_index - DataArray.set_xindex - DataArray.reorder_levels - DataArray.query - -Missing value handling ----------------------- - -.. autosummary:: - :toctree: generated/ - - DataArray.isnull - DataArray.notnull - DataArray.combine_first - DataArray.count - DataArray.dropna - DataArray.fillna - DataArray.ffill - DataArray.bfill - DataArray.interpolate_na - DataArray.where - DataArray.isin - -Comparisons ------------ - -.. autosummary:: - :toctree: generated/ - - DataArray.equals - DataArray.identical - DataArray.broadcast_equals - -Computation ------------ - -.. autosummary:: - :toctree: generated/ - - DataArray.reduce - DataArray.groupby - DataArray.groupby_bins - DataArray.rolling - DataArray.rolling_exp - DataArray.cumulative - DataArray.weighted - DataArray.coarsen - DataArray.resample - DataArray.get_axis_num - DataArray.diff - DataArray.dot - DataArray.quantile - DataArray.differentiate - DataArray.integrate - DataArray.polyfit - DataArray.map_blocks - DataArray.curvefit - -Aggregation ------------ - -.. autosummary:: - :toctree: generated/ - - DataArray.all - DataArray.any - DataArray.argmax - DataArray.argmin - DataArray.count - DataArray.idxmax - DataArray.idxmin - DataArray.max - DataArray.min - DataArray.mean - DataArray.median - DataArray.prod - DataArray.sum - DataArray.std - DataArray.var - DataArray.cumsum - DataArray.cumprod - -ndarray methods ---------------- - -.. 
autosummary:: - :toctree: generated/ - - DataArray.argsort - DataArray.clip - DataArray.conj - DataArray.conjugate - DataArray.imag - DataArray.searchsorted - DataArray.round - DataArray.real - DataArray.T - DataArray.rank - - -String manipulation -------------------- - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor.rst - - DataArray.str - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_method.rst - - DataArray.str.capitalize - DataArray.str.casefold - DataArray.str.cat - DataArray.str.center - DataArray.str.contains - DataArray.str.count - DataArray.str.decode - DataArray.str.encode - DataArray.str.endswith - DataArray.str.extract - DataArray.str.extractall - DataArray.str.find - DataArray.str.findall - DataArray.str.format - DataArray.str.get - DataArray.str.get_dummies - DataArray.str.index - DataArray.str.isalnum - DataArray.str.isalpha - DataArray.str.isdecimal - DataArray.str.isdigit - DataArray.str.islower - DataArray.str.isnumeric - DataArray.str.isspace - DataArray.str.istitle - DataArray.str.isupper - DataArray.str.join - DataArray.str.len - DataArray.str.ljust - DataArray.str.lower - DataArray.str.lstrip - DataArray.str.match - DataArray.str.normalize - DataArray.str.pad - DataArray.str.partition - DataArray.str.repeat - DataArray.str.replace - DataArray.str.rfind - DataArray.str.rindex - DataArray.str.rjust - DataArray.str.rpartition - DataArray.str.rsplit - DataArray.str.rstrip - DataArray.str.slice - DataArray.str.slice_replace - DataArray.str.split - DataArray.str.startswith - DataArray.str.strip - DataArray.str.swapcase - DataArray.str.title - DataArray.str.translate - DataArray.str.upper - DataArray.str.wrap - DataArray.str.zfill - -Datetimelike properties ------------------------ - -**Datetime properties**: - -.. 
autosummary:: - :toctree: generated/ - :template: autosummary/accessor_attribute.rst - - DataArray.dt.year - DataArray.dt.month - DataArray.dt.day - DataArray.dt.hour - DataArray.dt.minute - DataArray.dt.second - DataArray.dt.microsecond - DataArray.dt.nanosecond - DataArray.dt.dayofweek - DataArray.dt.weekday - DataArray.dt.dayofyear - DataArray.dt.quarter - DataArray.dt.days_in_month - DataArray.dt.daysinmonth - DataArray.dt.days_in_year - DataArray.dt.season - DataArray.dt.time - DataArray.dt.date - DataArray.dt.decimal_year - DataArray.dt.calendar - DataArray.dt.is_month_start - DataArray.dt.is_month_end - DataArray.dt.is_quarter_end - DataArray.dt.is_year_start - DataArray.dt.is_leap_year - -**Datetime methods**: - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_method.rst - - DataArray.dt.floor - DataArray.dt.ceil - DataArray.dt.isocalendar - DataArray.dt.round - DataArray.dt.strftime - -**Timedelta properties**: - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_attribute.rst - - DataArray.dt.days - DataArray.dt.seconds - DataArray.dt.microseconds - DataArray.dt.nanoseconds - DataArray.dt.total_seconds - -**Timedelta methods**: - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_method.rst - - DataArray.dt.floor - DataArray.dt.ceil - DataArray.dt.round - - -Reshaping and reorganizing --------------------------- - -.. autosummary:: - :toctree: generated/ - - DataArray.transpose - DataArray.stack - DataArray.unstack - DataArray.to_unstacked_dataset - DataArray.shift - DataArray.roll - DataArray.pad - DataArray.sortby - DataArray.broadcast_like - -DataTree -======== - -Creating a DataTree -------------------- - -Methods of creating a ``DataTree``. - -.. autosummary:: - :toctree: generated/ - - DataTree - DataTree.from_dict - -Tree Attributes ---------------- - -Attributes relating to the recursive tree-like structure of a ``DataTree``. - -.. 
autosummary:: - :toctree: generated/ - - DataTree.parent - DataTree.children - DataTree.name - DataTree.path - DataTree.root - DataTree.is_root - DataTree.is_leaf - DataTree.leaves - DataTree.level - DataTree.depth - DataTree.width - DataTree.subtree - DataTree.subtree_with_keys - DataTree.descendants - DataTree.siblings - DataTree.lineage - DataTree.parents - DataTree.ancestors - DataTree.groups - DataTree.xindexes - -Data Contents -------------- - -Interface to the data objects (optionally) stored inside a single ``DataTree`` node. -This interface echoes that of ``xarray.Dataset``. - -.. autosummary:: - :toctree: generated/ - - DataTree.dims - DataTree.sizes - DataTree.data_vars - DataTree.ds - DataTree.coords - DataTree.attrs - DataTree.encoding - DataTree.indexes - DataTree.nbytes - DataTree.dataset - DataTree.to_dataset - DataTree.has_data - DataTree.has_attrs - DataTree.is_empty - DataTree.is_hollow - DataTree.chunksizes - -Dictionary Interface --------------------- - -``DataTree`` objects also have a dict-like interface mapping keys to either ``xarray.DataArray``\s or to child ``DataTree`` nodes. - -.. autosummary:: - :toctree: generated/ - - DataTree.__getitem__ - DataTree.__setitem__ - DataTree.__delitem__ - DataTree.update - DataTree.get - DataTree.items - DataTree.keys - DataTree.values - -Tree Manipulation ------------------ - -For manipulating, traversing, navigating, or mapping over the tree structure. - -.. autosummary:: - :toctree: generated/ - - DataTree.orphan - DataTree.same_tree - DataTree.relative_to - DataTree.iter_lineage - DataTree.find_common_ancestor - DataTree.map_over_datasets - DataTree.pipe - DataTree.match - DataTree.filter - DataTree.filter_like - -Pathlib-like Interface ----------------------- - -``DataTree`` objects deliberately echo some of the API of :py:class:`pathlib.PurePath`. - -.. autosummary:: - :toctree: generated/ - - DataTree.name - DataTree.parent - DataTree.parents - DataTree.relative_to - -.. Missing: - -.. .. - -.. 
``DataTree.glob`` -.. ``DataTree.joinpath`` -.. ``DataTree.with_name`` -.. ``DataTree.walk`` -.. ``DataTree.rename`` -.. ``DataTree.replace`` - -DataTree Contents ------------------ - -Manipulate the contents of all nodes in a ``DataTree`` simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.copy - - .. DataTree.assign_coords - .. DataTree.merge - .. DataTree.rename - .. DataTree.rename_vars - .. DataTree.rename_dims - .. DataTree.swap_dims - .. DataTree.expand_dims - .. DataTree.drop_vars - .. DataTree.drop_dims - .. DataTree.set_coords - .. DataTree.reset_coords - -DataTree Node Contents ----------------------- - -Manipulate the contents of a single ``DataTree`` node. - -.. autosummary:: - :toctree: generated/ - - DataTree.assign - DataTree.drop_nodes - -DataTree Operations -------------------- - -Apply operations over multiple ``DataTree`` objects. - -.. autosummary:: - :toctree: generated/ - - map_over_datasets - group_subtrees - -Comparisons ------------ - -Compare one ``DataTree`` object to another. - -.. autosummary:: - :toctree: generated/ - - DataTree.isomorphic - DataTree.equals - DataTree.identical - -Indexing --------- - -Index into all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.isel - DataTree.sel - -.. DataTree.drop_sel -.. DataTree.drop_isel -.. DataTree.head -.. DataTree.tail -.. DataTree.thin -.. DataTree.squeeze -.. DataTree.interp -.. DataTree.interp_like -.. DataTree.reindex -.. DataTree.reindex_like -.. DataTree.set_index -.. DataTree.reset_index -.. DataTree.reorder_levels -.. DataTree.query - -.. .. - -.. Missing: -.. ``DataTree.loc`` - - -.. Missing Value Handling -.. ---------------------- - -.. .. autosummary:: -.. :toctree: generated/ - -.. DataTree.isnull -.. DataTree.notnull -.. DataTree.combine_first -.. DataTree.dropna -.. DataTree.fillna -.. DataTree.ffill -.. DataTree.bfill -.. DataTree.interpolate_na -.. DataTree.where -.. DataTree.isin - -.. Computation -.. 
----------- - -.. Apply a computation to the data in all nodes in the subtree simultaneously. - -.. .. autosummary:: -.. :toctree: generated/ - -.. DataTree.map -.. DataTree.reduce -.. DataTree.diff -.. DataTree.quantile -.. DataTree.differentiate -.. DataTree.integrate -.. DataTree.map_blocks -.. DataTree.polyfit -.. DataTree.curvefit - -Aggregation ------------ - -Aggregate data in all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.all - DataTree.any - DataTree.max - DataTree.min - DataTree.mean - DataTree.median - DataTree.prod - DataTree.sum - DataTree.std - DataTree.var - DataTree.cumsum - DataTree.cumprod - -ndarray methods ---------------- - -Methods copied from :py:class:`numpy.ndarray` objects, here applying to the data in all nodes in the subtree. - -.. autosummary:: - :toctree: generated/ - - DataTree.argsort - DataTree.conj - DataTree.conjugate - DataTree.round -.. DataTree.astype -.. DataTree.clip -.. DataTree.rank - -.. Reshaping and reorganising -.. -------------------------- - -.. Reshape or reorganise the data in all nodes in the subtree. - -.. .. autosummary:: -.. :toctree: generated/ - -.. DataTree.transpose -.. DataTree.stack -.. DataTree.unstack -.. DataTree.shift -.. DataTree.roll -.. DataTree.pad -.. DataTree.sortby -.. DataTree.broadcast_like - -Coordinates -=========== - -Creating coordinates --------------------- - -.. autosummary:: - :toctree: generated/ - - Coordinates - Coordinates.from_xindex - Coordinates.from_pandas_multiindex - -Attributes ----------- - -.. autosummary:: - :toctree: generated/ - - Coordinates.dims - Coordinates.sizes - Coordinates.dtypes - Coordinates.variables - Coordinates.indexes - Coordinates.xindexes - -Dictionary Interface --------------------- - -Coordinates implement the mapping interface with keys given by variable names -and values given by ``DataArray`` objects. - -.. 
autosummary:: - :toctree: generated/ - - Coordinates.__getitem__ - Coordinates.__setitem__ - Coordinates.__delitem__ - Coordinates.update - Coordinates.get - Coordinates.items - Coordinates.keys - Coordinates.values - -Coordinates contents --------------------- - -.. autosummary:: - :toctree: generated/ - - Coordinates.to_dataset - Coordinates.to_index - Coordinates.assign - Coordinates.merge - Coordinates.copy - -Comparisons ------------ - -.. autosummary:: - :toctree: generated/ - - Coordinates.equals - Coordinates.identical - -Proxies -------- - -Coordinates that are accessed from the ``coords`` property of Dataset, DataArray -and DataTree objects, respectively. - -.. autosummary:: - :toctree: generated/ - - core.coordinates.DatasetCoordinates - core.coordinates.DataArrayCoordinates - core.coordinates.DataTreeCoordinates - -Universal functions -=================== - -These functions are equivalent to their NumPy versions, but for xarray -objects backed by non-NumPy array types (e.g. ``cupy``, ``sparse``, or ``jax``), -they will ensure that the computation is dispatched to the appropriate -backend. You can find them in the ``xarray.ufuncs`` module: - -.. 
autosummary:: - :toctree: generated/ - - ufuncs.abs - ufuncs.absolute - ufuncs.acos - ufuncs.acosh - ufuncs.arccos - ufuncs.arccosh - ufuncs.arcsin - ufuncs.arcsinh - ufuncs.arctan - ufuncs.arctanh - ufuncs.asin - ufuncs.asinh - ufuncs.atan - ufuncs.atanh - ufuncs.bitwise_count - ufuncs.bitwise_invert - ufuncs.bitwise_not - ufuncs.cbrt - ufuncs.ceil - ufuncs.conj - ufuncs.conjugate - ufuncs.cos - ufuncs.cosh - ufuncs.deg2rad - ufuncs.degrees - ufuncs.exp - ufuncs.exp2 - ufuncs.expm1 - ufuncs.fabs - ufuncs.floor - ufuncs.invert - ufuncs.isfinite - ufuncs.isinf - ufuncs.isnan - ufuncs.isnat - ufuncs.log - ufuncs.log10 - ufuncs.log1p - ufuncs.log2 - ufuncs.logical_not - ufuncs.negative - ufuncs.positive - ufuncs.rad2deg - ufuncs.radians - ufuncs.reciprocal - ufuncs.rint - ufuncs.sign - ufuncs.signbit - ufuncs.sin - ufuncs.sinh - ufuncs.spacing - ufuncs.sqrt - ufuncs.square - ufuncs.tan - ufuncs.tanh - ufuncs.trunc - ufuncs.add - ufuncs.arctan2 - ufuncs.atan2 - ufuncs.bitwise_and - ufuncs.bitwise_left_shift - ufuncs.bitwise_or - ufuncs.bitwise_right_shift - ufuncs.bitwise_xor - ufuncs.copysign - ufuncs.divide - ufuncs.equal - ufuncs.float_power - ufuncs.floor_divide - ufuncs.fmax - ufuncs.fmin - ufuncs.fmod - ufuncs.gcd - ufuncs.greater - ufuncs.greater_equal - ufuncs.heaviside - ufuncs.hypot - ufuncs.lcm - ufuncs.ldexp - ufuncs.left_shift - ufuncs.less - ufuncs.less_equal - ufuncs.logaddexp - ufuncs.logaddexp2 - ufuncs.logical_and - ufuncs.logical_or - ufuncs.logical_xor - ufuncs.maximum - ufuncs.minimum - ufuncs.mod - ufuncs.multiply - ufuncs.nextafter - ufuncs.not_equal - ufuncs.pow - ufuncs.power - ufuncs.remainder - ufuncs.right_shift - ufuncs.subtract - ufuncs.true_divide - ufuncs.angle - ufuncs.isreal - ufuncs.iscomplex - -IO / Conversion -=============== - -Dataset methods ---------------- - -.. 
autosummary:: - :toctree: generated/ - - load_dataset - open_dataset - open_mfdataset - open_zarr - save_mfdataset - Dataset.as_numpy - Dataset.from_dataframe - Dataset.from_dict - Dataset.to_dataarray - Dataset.to_dataframe - Dataset.to_dask_dataframe - Dataset.to_dict - Dataset.to_netcdf - Dataset.to_pandas - Dataset.to_zarr - Dataset.chunk - Dataset.close - Dataset.compute - Dataset.filter_by_attrs - Dataset.info - Dataset.load - Dataset.load_async - Dataset.persist - Dataset.unify_chunks - -DataArray methods ------------------ - -.. autosummary:: - :toctree: generated/ - - load_dataarray - open_dataarray - DataArray.as_numpy - DataArray.from_dict - DataArray.from_iris - DataArray.from_series - DataArray.to_dask_dataframe - DataArray.to_dataframe - DataArray.to_dataset - DataArray.to_dict - DataArray.to_index - DataArray.to_iris - DataArray.to_masked_array - DataArray.to_netcdf - DataArray.to_numpy - DataArray.to_pandas - DataArray.to_series - DataArray.to_zarr - DataArray.chunk - DataArray.close - DataArray.compute - DataArray.persist - DataArray.load - DataArray.load_async - DataArray.unify_chunks - -DataTree methods ----------------- - -.. autosummary:: - :toctree: generated/ - - open_datatree - open_groups - DataTree.to_dict - DataTree.to_netcdf - DataTree.to_zarr - DataTree.chunk - DataTree.load - DataTree.compute - DataTree.persist - -.. .. - -.. Missing: -.. ``open_mfdatatree`` - -Encoding/Decoding -================= - -Coder objects -------------- - -.. autosummary:: - :toctree: generated/ - - coders.CFDatetimeCoder - -Plotting -======== - -Dataset -------- - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_method.rst - - Dataset.plot.scatter - Dataset.plot.quiver - Dataset.plot.streamplot - -DataArray ---------- - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_callable.rst - - DataArray.plot - -.. 
autosummary:: - :toctree: generated/ - :template: autosummary/accessor_method.rst - - DataArray.plot.contourf - DataArray.plot.contour - DataArray.plot.hist - DataArray.plot.imshow - DataArray.plot.line - DataArray.plot.pcolormesh - DataArray.plot.step - DataArray.plot.scatter - DataArray.plot.surface - - -Faceting --------- -.. autosummary:: - :toctree: generated/ - - plot.FacetGrid - plot.FacetGrid.add_colorbar - plot.FacetGrid.add_legend - plot.FacetGrid.add_quiverkey - plot.FacetGrid.map - plot.FacetGrid.map_dataarray - plot.FacetGrid.map_dataarray_line - plot.FacetGrid.map_dataset - plot.FacetGrid.map_plot1d - plot.FacetGrid.set_axis_labels - plot.FacetGrid.set_ticks - plot.FacetGrid.set_titles - plot.FacetGrid.set_xlabels - plot.FacetGrid.set_ylabels - - - -GroupBy objects -=============== - -.. currentmodule:: xarray.core.groupby - -Dataset -------- - -.. autosummary:: - :toctree: generated/ - - DatasetGroupBy - DatasetGroupBy.map - DatasetGroupBy.reduce - DatasetGroupBy.assign - DatasetGroupBy.assign_coords - DatasetGroupBy.first - DatasetGroupBy.last - DatasetGroupBy.fillna - DatasetGroupBy.quantile - DatasetGroupBy.where - DatasetGroupBy.all - DatasetGroupBy.any - DatasetGroupBy.count - DatasetGroupBy.cumsum - DatasetGroupBy.cumprod - DatasetGroupBy.max - DatasetGroupBy.mean - DatasetGroupBy.median - DatasetGroupBy.min - DatasetGroupBy.prod - DatasetGroupBy.std - DatasetGroupBy.sum - DatasetGroupBy.var - DatasetGroupBy.dims - DatasetGroupBy.groups - DatasetGroupBy.shuffle_to_chunks - -DataArray ---------- - -.. 
autosummary:: - :toctree: generated/ - - DataArrayGroupBy - DataArrayGroupBy.map - DataArrayGroupBy.reduce - DataArrayGroupBy.assign_coords - DataArrayGroupBy.first - DataArrayGroupBy.last - DataArrayGroupBy.fillna - DataArrayGroupBy.quantile - DataArrayGroupBy.where - DataArrayGroupBy.all - DataArrayGroupBy.any - DataArrayGroupBy.count - DataArrayGroupBy.cumsum - DataArrayGroupBy.cumprod - DataArrayGroupBy.max - DataArrayGroupBy.mean - DataArrayGroupBy.median - DataArrayGroupBy.min - DataArrayGroupBy.prod - DataArrayGroupBy.std - DataArrayGroupBy.sum - DataArrayGroupBy.var - DataArrayGroupBy.dims - DataArrayGroupBy.groups - DataArrayGroupBy.shuffle_to_chunks - -Grouper Objects ---------------- - -.. currentmodule:: xarray - -.. autosummary:: - :toctree: generated/ - - groupers.BinGrouper - groupers.UniqueGrouper - groupers.TimeResampler - groupers.SeasonGrouper - groupers.SeasonResampler - - -Rolling objects -=============== - -.. currentmodule:: xarray.computation.rolling - -Dataset -------- - -.. autosummary:: - :toctree: generated/ - - DatasetRolling - DatasetRolling.construct - DatasetRolling.reduce - DatasetRolling.argmax - DatasetRolling.argmin - DatasetRolling.count - DatasetRolling.max - DatasetRolling.mean - DatasetRolling.median - DatasetRolling.min - DatasetRolling.prod - DatasetRolling.std - DatasetRolling.sum - DatasetRolling.var - -DataArray ---------- - -.. autosummary:: - :toctree: generated/ - - DataArrayRolling - DataArrayRolling.__iter__ - DataArrayRolling.construct - DataArrayRolling.reduce - DataArrayRolling.argmax - DataArrayRolling.argmin - DataArrayRolling.count - DataArrayRolling.max - DataArrayRolling.mean - DataArrayRolling.median - DataArrayRolling.min - DataArrayRolling.prod - DataArrayRolling.std - DataArrayRolling.sum - DataArrayRolling.var - -Coarsen objects -=============== - -Dataset -------- - -.. 
autosummary:: - :toctree: generated/ - - DatasetCoarsen - DatasetCoarsen.all - DatasetCoarsen.any - DatasetCoarsen.construct - DatasetCoarsen.count - DatasetCoarsen.max - DatasetCoarsen.mean - DatasetCoarsen.median - DatasetCoarsen.min - DatasetCoarsen.prod - DatasetCoarsen.reduce - DatasetCoarsen.std - DatasetCoarsen.sum - DatasetCoarsen.var - -DataArray ---------- - -.. autosummary:: - :toctree: generated/ - - DataArrayCoarsen - DataArrayCoarsen.all - DataArrayCoarsen.any - DataArrayCoarsen.construct - DataArrayCoarsen.count - DataArrayCoarsen.max - DataArrayCoarsen.mean - DataArrayCoarsen.median - DataArrayCoarsen.min - DataArrayCoarsen.prod - DataArrayCoarsen.reduce - DataArrayCoarsen.std - DataArrayCoarsen.sum - DataArrayCoarsen.var - -Exponential rolling objects -=========================== - -.. currentmodule:: xarray.computation.rolling_exp - -.. autosummary:: - :toctree: generated/ - - RollingExp - RollingExp.mean - RollingExp.sum - -Weighted objects -================ - -.. currentmodule:: xarray.computation.weighted - -Dataset -------- - -.. autosummary:: - :toctree: generated/ - - DatasetWeighted - DatasetWeighted.mean - DatasetWeighted.quantile - DatasetWeighted.sum - DatasetWeighted.std - DatasetWeighted.var - DatasetWeighted.sum_of_weights - DatasetWeighted.sum_of_squares - -DataArray ---------- - -.. autosummary:: - :toctree: generated/ - - DataArrayWeighted - DataArrayWeighted.mean - DataArrayWeighted.quantile - DataArrayWeighted.sum - DataArrayWeighted.std - DataArrayWeighted.var - DataArrayWeighted.sum_of_weights - DataArrayWeighted.sum_of_squares - -Resample objects -================ - -.. currentmodule:: xarray.core.resample - -Dataset -------- - -.. 
autosummary:: - :toctree: generated/ - - DatasetResample - DatasetResample.asfreq - DatasetResample.backfill - DatasetResample.interpolate - DatasetResample.nearest - DatasetResample.pad - DatasetResample.all - DatasetResample.any - DatasetResample.apply - DatasetResample.assign - DatasetResample.assign_coords - DatasetResample.bfill - DatasetResample.count - DatasetResample.ffill - DatasetResample.fillna - DatasetResample.first - DatasetResample.last - DatasetResample.map - DatasetResample.max - DatasetResample.mean - DatasetResample.median - DatasetResample.min - DatasetResample.prod - DatasetResample.quantile - DatasetResample.reduce - DatasetResample.std - DatasetResample.sum - DatasetResample.var - DatasetResample.where - DatasetResample.dims - DatasetResample.groups - - -DataArray ---------- - -.. autosummary:: - :toctree: generated/ - - DataArrayResample - DataArrayResample.asfreq - DataArrayResample.backfill - DataArrayResample.interpolate - DataArrayResample.nearest - DataArrayResample.pad - DataArrayResample.all - DataArrayResample.any - DataArrayResample.apply - DataArrayResample.assign_coords - DataArrayResample.bfill - DataArrayResample.count - DataArrayResample.ffill - DataArrayResample.fillna - DataArrayResample.first - DataArrayResample.last - DataArrayResample.map - DataArrayResample.max - DataArrayResample.mean - DataArrayResample.median - DataArrayResample.min - DataArrayResample.prod - DataArrayResample.quantile - DataArrayResample.reduce - DataArrayResample.std - DataArrayResample.sum - DataArrayResample.var - DataArrayResample.where - DataArrayResample.dims - DataArrayResample.groups - -Accessors -========= - -.. currentmodule:: xarray.core - -.. autosummary:: - :toctree: generated/ - - accessor_dt.DatetimeAccessor - accessor_dt.TimedeltaAccessor - accessor_str.StringAccessor - - -Custom Indexes -============== -.. currentmodule:: xarray - -.. 
autosummary:: - :toctree: generated/ - - CFTimeIndex - indexes.RangeIndex - -Creating custom indexes ------------------------ -.. autosummary:: - :toctree: generated/ - - cftime_range - date_range - date_range_like - indexes.RangeIndex.arange - indexes.RangeIndex.linspace - -Tutorial -======== - -.. autosummary:: - :toctree: generated/ - - tutorial.open_dataset - tutorial.load_dataset - tutorial.open_datatree - tutorial.load_datatree - -Testing -======= - -.. autosummary:: - :toctree: generated/ - - testing.assert_equal - testing.assert_identical - testing.assert_allclose - testing.assert_chunks_equal - -Test that two ``DataTree`` objects are similar. - -.. autosummary:: - :toctree: generated/ - - testing.assert_isomorphic - testing.assert_equal - testing.assert_identical - -Hypothesis Testing Strategies -============================= - -.. currentmodule:: xarray - -See the :ref:`documentation page on testing ` for a guide on how to use these strategies. - -.. warning:: - These strategies should be considered highly experimental, and liable to change at any time. - -.. autosummary:: - :toctree: generated/ - - testing.strategies.supported_dtypes - testing.strategies.names - testing.strategies.dimension_names - testing.strategies.dimension_sizes - testing.strategies.attrs - testing.strategies.variables - testing.strategies.unique_subset_of - -Exceptions -========== - -.. autosummary:: - :toctree: generated/ - - AlignmentError - CoordinateValidationError - MergeError - SerializationWarning - -DataTree --------- - -Exceptions raised when manipulating trees. - -.. autosummary:: - :toctree: generated/ - - TreeIsomorphismError - InvalidTreeError - NotFoundInTreeError - -Advanced API -============ - -.. 
autosummary:: - :toctree: generated/ - - Coordinates - Dataset.variables - DataArray.variable - DataTree.variables - Variable - IndexVariable - as_variable - Index - IndexSelResult - Context - register_dataset_accessor - register_dataarray_accessor - register_datatree_accessor - Dataset.set_close - backends.BackendArray - backends.BackendEntrypoint - backends.list_engines - backends.refresh_engines - -.. .. - -.. Missing: -.. ``DataTree.set_close`` - -Default, pandas-backed indexes built-in Xarray: - - indexes.PandasIndex - indexes.PandasMultiIndex - -These backends provide a low-level interface for lazily loading data from -external file-formats or protocols, and can be manually invoked to create -arguments for the ``load_store`` and ``dump_to_store`` Dataset methods: - -.. autosummary:: - :toctree: generated/ - - backends.NetCDF4DataStore - backends.H5NetCDFStore - backends.PydapDataStore - backends.ScipyDataStore - backends.ZarrStore - backends.FileManager - backends.CachingFileManager - backends.DummyFileManager - -These BackendEntrypoints provide a basic interface to the most commonly -used filetypes in the xarray universe. - -.. autosummary:: - :toctree: generated/ - - backends.NetCDF4BackendEntrypoint - backends.H5netcdfBackendEntrypoint - backends.PydapBackendEntrypoint - backends.ScipyBackendEntrypoint - backends.StoreBackendEntrypoint - backends.ZarrBackendEntrypoint - -Deprecated / Pending Deprecation -================================ - -.. autosummary:: - :toctree: generated/ - - Dataset.drop - DataArray.drop - Dataset.apply - core.groupby.DataArrayGroupBy.apply - core.groupby.DatasetGroupBy.apply - -.. autosummary:: - :toctree: generated/ - :template: autosummary/accessor_attribute.rst - - DataArray.dt.weekofyear - DataArray.dt.week -======= -.. 
toctree:: - :maxdepth: 1 - - api/top-level - api/dataset - api/dataarray - api/datatree - api/coordinates - api/indexes - api/ufuncs - api/io - api/encoding - api/plotting - api/groupby - api/rolling - api/coarsen - api/rolling-exp - api/weighted - api/resample - api/accessors - api/tutorial - api/testing - api/backends - api/exceptions - api/advanced - api/deprecated From 577cc72dc4720e1b54b14c65f9e117a09558064d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 18:57:22 +0100 Subject: [PATCH 069/112] fix bad merge in release note --- doc/whats-new.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 64f20a7e257..1d6dc821e7e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -15,11 +15,6 @@ New Features - Added new asynchronous loading methods :py:meth:`~xarray.Dataset.load_async`, :py:meth:`~xarray.DataArray.load_async`, :py:meth:`~xarray.Variable.load_async`. (:issue:`10326`, :pull:`10327`) By `Tom Nicholas `_. -- Allow an Xarray index that uses multiple dimensions checking equality with another - index for only a subset of those dimensions (i.e., ignoring the dimensions - that are excluded from alignment). - (:issue:`10243`, :pull:`10293`) - By `Benoit Bovy `_. Breaking changes ~~~~~~~~~~~~~~~~ From a68579fe8d9f7d82c99bece7ee7570e8dd7e6ef9 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 18:58:37 +0100 Subject: [PATCH 070/112] fix other bad merge in whatsnew --- doc/whats-new.rst | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1d6dc821e7e..d5d13e6a32b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,20 +38,10 @@ Deprecations Bug fixes ~~~~~~~~~ -- Fix Pydap test_cmp_local_file for numpy 2.3.0 changes, 1. do always return arrays for all versions and 2. skip astype(str) for numpy >= 2.3.0 for expected data. (:pull:`10421`) - By `Kai Mühlbauer `_. -- Fix the SciPy backend for netCDF3 files . 
(:issue:`8909`, :pull:`10376`) - By `Deepak Cherian `_. -- Allow accessing arbitrary attributes on Pandas ExtensionArrays. - By `Deepak Cherian `_. -- Check and fix character array string dimension names, issue warnings as needed (:issue:`6352`, :pull:`10395`). - By `Kai Mühlbauer `_. - - Fix Pydap Datatree backend testing. Testing now compares elements of (unordered) two sets (before, lists) (:pull:`10525`). By `Miguel Jimenez-Urias `_. - Fix ``KeyError`` when passing a ``dim`` argument different from the default to ``convert_calendar`` (:pull:`10544`). By `Eric Jansen `_. - - Fix transpose of boolean arrays read from disk. (:issue:`10536`) By `Deepak Cherian `_. - Fix detection of the ``h5netcdf`` backend. Xarray now selects ``h5netcdf`` if the default ``netCDF4`` engine is not available (:issue:`10401`, :pull:`10557`). From acc5c94327046296d61313dbd3cd6512c414796e Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 11:04:01 -0700 Subject: [PATCH 071/112] remove prints Co-authored-by: Deepak Cherian --- xarray/backends/zarr.py | 1 - xarray/core/indexing.py | 1 - xarray/namedarray/pycompat.py | 1 - 3 files changed, 3 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0645a0fae29..d136b54e581 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -263,7 +263,6 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here async def async_getitem(self, key): - print("async getting") array = self._array if isinstance(key, indexing.BasicIndexer): method = self._async_getitem diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 6d7f2fe44e3..4a41fa4f269 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -757,7 +757,6 @@ def get_duck_array(self): return _wrap_numpy_scalars(array) async def async_get_duck_array(self): - print("inside LazilyVectorizedIndexedArray.async_get_duck_array") from xarray.backends.common import BackendArray if isinstance(self.array, 
BackendArray): diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index 6e61d3445ab..997fe152f42 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -156,7 +156,6 @@ async def async_to_duck_array( IndexingAdapter, ) - print(type(data)) if isinstance(data, IndexingAdapter): # These wrap in-memory arrays, and async isn't needed return data.get_duck_array() From 7776d41b288f7518c64b10c5db96eb01c7f60723 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 19:05:07 +0100 Subject: [PATCH 072/112] remove last print statement --- xarray/namedarray/pycompat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index 997fe152f42..0fe5cfdf3b5 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -160,7 +160,6 @@ async def async_to_duck_array( # These wrap in-memory arrays, and async isn't needed return data.get_duck_array() elif isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): - print("async inside to_duck_array") return await data.async_get_duck_array() # type: ignore[no-untyped-call, no-any-return] else: return to_duck_array(data, **kwargs) From 0b1ebb51f2684ac4e392759dd317e4e8974716c1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:02:44 +0100 Subject: [PATCH 073/112] test async basic indexing raises informative error before zarr-python v3.0.0 --- xarray/backends/zarr.py | 6 +++++- xarray/tests/test_async.py | 36 ++++++++++++++++++++++++++---------- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d136b54e581..6ff272a31b3 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -225,7 +225,11 @@ def _getitem(self, key): return self._array[key] async def _async_getitem(self, key): - # TODO requires zarr-python v3.0.0 + if not _zarr_v3(): + raise NotImplementedError( + "For lazy basic async 
indexing with zarr, zarr-python=>v3.0.0 is required" + ) + async_array = self._array._async_array return await async_array.getitem(key) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 83886e42579..868fda05ca0 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -9,6 +9,8 @@ import xarray.testing as xrt from xarray.tests import ( has_zarr, + has_zarr_v3, + # TODO rename this to show it's specifically for o/vindexing has_zarr_v3_async_index, requires_zarr, requires_zarr_v3_async_index, @@ -184,17 +186,32 @@ async def test_indexing( xrt.assert_identical(result, expected) @requires_zarr - @pytest.mark.skipif( - has_zarr_v3_async_index, reason="newer version of zarr has async indexing" - ) @pytest.mark.parametrize( "indexer", [ - {"dim2": [1, 3]}, # tests oindexing - { # test vindexing - "dim1": xr.Variable(data=[2, 3], dims="points"), - "dim2": xr.Variable(data=[1, 3], dims="points"), - }, + pytest.param( + {"dim2": 2}, + marks=pytest.mark.skipif( + has_zarr_v3, reason="current version of zarr has basic async indexing" + ), + ), # tests basic indexing + pytest.param( + {"dim2": [1, 3]}, + marks=pytest.mark.skipif( + has_zarr_v3_async_index, + reason="current version of zarr has async orthogonal indexing", + ), + ), # tests oindexing + pytest.param( + { + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1, 3], dims="points"), + }, + marks=pytest.mark.skipif( + has_zarr_v3_async_index, + reason="current version of zarr has async vectorized indexing", + ), + ), # tests vindexing ], ) async def test_raise_on_older_zarr_version(self, store, indexer): @@ -202,7 +219,6 @@ async def test_raise_on_older_zarr_version(self, store, indexer): ds = xr.open_zarr(store, consolidated=False, chunks=None) + # TODO match the correct error message in each case with pytest.raises(NotImplementedError, match="async indexing"): await ds.isel(**indexer).load_async() - - # TODO also test raising informative error if 
attempting to do basic async indexing with 3.0.0 <= zarr <= 3.1.1? From df097800adca30d1d3cffe1bedc5740ed36ce633 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:05:56 +0100 Subject: [PATCH 074/112] test correct error message is raised for each indexing case --- xarray/backends/zarr.py | 2 +- xarray/tests/test_async.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 6ff272a31b3..af5e395cd72 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -245,7 +245,7 @@ async def _async_oindex(self, key): async def _async_vindex(self, key): if not has_zarr_async_index(): raise NotImplementedError( - "For lazy orthogonal async indexing with zarr, zarr-python=>v3.1.2 is required" + "For lazy vectorized async indexing with zarr, zarr-python=>v3.1.2 is required" ) async_array = self._array._async_array diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 868fda05ca0..ec9c9bb8112 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -187,16 +187,18 @@ async def test_indexing( @requires_zarr @pytest.mark.parametrize( - "indexer", + ("indexer", "expected_err_msg"), [ pytest.param( {"dim2": 2}, + "basic async indexing", marks=pytest.mark.skipif( has_zarr_v3, reason="current version of zarr has basic async indexing" ), ), # tests basic indexing pytest.param( {"dim2": [1, 3]}, + "orthogonal async indexing", marks=pytest.mark.skipif( has_zarr_v3_async_index, reason="current version of zarr has async orthogonal indexing", @@ -207,6 +209,7 @@ async def test_indexing( "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1, 3], dims="points"), }, + "vectorized async indexing", marks=pytest.mark.skipif( has_zarr_v3_async_index, reason="current version of zarr has async vectorized indexing", @@ -214,11 +217,10 @@ async def test_indexing( ), # tests vindexing ], ) - async def 
test_raise_on_older_zarr_version(self, store, indexer): + async def test_raise_on_older_zarr_version(self, store, indexer, expected_err_msg): """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" ds = xr.open_zarr(store, consolidated=False, chunks=None) - # TODO match the correct error message in each case - with pytest.raises(NotImplementedError, match="async indexing"): + with pytest.raises(NotImplementedError, match=expected_err_msg): await ds.isel(**indexer).load_async() From 84f8e30b5d6eec71981c1264d5d920521480ece7 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:19:49 +0100 Subject: [PATCH 075/112] ensure each test runs on the earliest version of zarr it can --- xarray/tests/test_async.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index ec9c9bb8112..c9d0eaca21e 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -13,6 +13,7 @@ # TODO rename this to show it's specifically for o/vindexing has_zarr_v3_async_index, requires_zarr, + requires_zarr_v3, requires_zarr_v3_async_index, ) from xarray.tests.test_backends import ZARR_FORMATS @@ -57,7 +58,7 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: @pytest.mark.asyncio class TestAsyncLoad: - @requires_zarr_v3_async_index + @requires_zarr_v3 async def test_concurrent_load_multiple_variables(self, store) -> None: target_class = zarr.AsyncArray method_name = "getitem" @@ -82,7 +83,7 @@ async def test_concurrent_load_multiple_variables(self, store) -> None: xrt.assert_identical(result_ds, ds.load()) - @requires_zarr_v3_async_index + @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: N_OBJECTS = 5 @@ -107,8 +108,9 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: for 
result in results: xrt.assert_identical(result, xr_obj.load()) - @requires_zarr_v3_async_index + @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) + # TODO remove the method part as it's always the same @pytest.mark.parametrize( "indexer, method, zarr_class_and_method", [ @@ -162,6 +164,10 @@ async def test_indexing( indexer, zarr_class_and_method, ) -> None: + + if not has_zarr_v3_async_index and zarr_class_and_method[0] in ("zarr.core.indexing.AsyncOIndex", "zarr.core.indexing.AsyncVIndex"): + pytest.skip("current version of zarr does not support orthogonal or vectorized async indexing") + if cls_name == "Variable" and method == "sel": pytest.skip("Variable doesn't have a .sel method") From 19090b0cb89458e1facc7c544cc36011de23db0e Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:30:10 +0100 Subject: [PATCH 076/112] remove pointless repeated getitem --- xarray/tests/test_async.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index c9d0eaca21e..280bfb449d5 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -110,29 +110,28 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) - # TODO remove the method part as it's always the same @pytest.mark.parametrize( - "indexer, method, zarr_class_and_method", + "indexer, method, target_zarr_class", [ - ({}, "sel", ("zarr.AsyncArray", "getitem")), - ({}, "isel", ("zarr.AsyncArray", "getitem")), - ({"dim2": 1.0}, "sel", ("zarr.AsyncArray", "getitem")), - ({"dim2": 2}, "isel", ("zarr.AsyncArray", "getitem")), - ({"dim2": slice(1.0, 3.0)}, "sel", ("zarr.AsyncArray", "getitem")), - ({"dim2": slice(1, 3)}, "isel", ("zarr.AsyncArray", "getitem")), + ({}, "sel", "zarr.AsyncArray"), + ({}, "isel", "zarr.AsyncArray"), 
+ ({"dim2": 1.0}, "sel", "zarr.AsyncArray"), + ({"dim2": 2}, "isel", "zarr.AsyncArray"), + ({"dim2": slice(1.0, 3.0)}, "sel", "zarr.AsyncArray"), + ({"dim2": slice(1, 3)}, "isel", "zarr.AsyncArray"), ( {"dim2": [1.0, 3.0]}, "sel", - ("zarr.core.indexing.AsyncOIndex", "getitem"), + "zarr.core.indexing.AsyncOIndex", ), - ({"dim2": [1, 3]}, "isel", ("zarr.core.indexing.AsyncOIndex", "getitem")), + ({"dim2": [1, 3]}, "isel", "zarr.core.indexing.AsyncOIndex"), ( { "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1.0, 2.0], dims="points"), }, "sel", - ("zarr.core.indexing.AsyncVIndex", "getitem"), + "zarr.core.indexing.AsyncVIndex", ), ( { @@ -140,7 +139,7 @@ async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: "dim2": xr.Variable(data=[1, 3], dims="points"), }, "isel", - ("zarr.core.indexing.AsyncVIndex", "getitem"), + "zarr.core.indexing.AsyncVIndex", ), ], ids=[ @@ -162,18 +161,19 @@ async def test_indexing( cls_name, method, indexer, - zarr_class_and_method, + target_zarr_class, ) -> None: - if not has_zarr_v3_async_index and zarr_class_and_method[0] in ("zarr.core.indexing.AsyncOIndex", "zarr.core.indexing.AsyncVIndex"): + if not has_zarr_v3_async_index and target_zarr_class in ("zarr.core.indexing.AsyncOIndex", "zarr.core.indexing.AsyncVIndex"): pytest.skip("current version of zarr does not support orthogonal or vectorized async indexing") if cls_name == "Variable" and method == "sel": pytest.skip("Variable doesn't have a .sel method") - # each type of indexing ends up calling a different zarr indexing method - target_class_path, method_name = zarr_class_and_method - target_class = _resolve_class_from_string(target_class_path) + # Each type of indexing ends up calling a different zarr indexing method + # They all use a method named .getitem, but on a different internal zarr class + target_class = _resolve_class_from_string(target_zarr_class) + method_name = "getitem" original_method = getattr(target_class, 
method_name) with patch.object( From 49416db5d2e60e966b89a2619c8a00637304fd34 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:34:37 +0100 Subject: [PATCH 077/112] set N_LAZY_VARS correctly in test --- xarray/tests/test_async.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 280bfb449d5..4ccd3ddfa0a 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -30,6 +30,7 @@ def store(request) -> "zarr.storage.MemoryStore": memorystore = zarr.storage.MemoryStore({}) ds = create_test_data() + print(ds) ds.to_zarr(memorystore, zarr_format=request.param, consolidated=False) # type: ignore[call-overload] return memorystore @@ -64,9 +65,8 @@ async def test_concurrent_load_multiple_variables(self, store) -> None: method_name = "getitem" original_method = getattr(target_class, method_name) - # TODO up the number of variables in the dataset? - # the coordinate variable is not lazy - N_LAZY_VARS = 1 + # the indexed coordinate variables is not lazy, so the create_test_dataset has 4 lazy variables in total + N_LAZY_VARS = 4 with patch.object( target_class, method_name, wraps=original_method, autospec=True From 2ed8455c4c94c27a3cc18f4437e2c1923f589f17 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:35:16 +0100 Subject: [PATCH 078/112] remove unused import --- xarray/tests/test_async.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 4ccd3ddfa0a..3e517af41a0 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -14,7 +14,6 @@ has_zarr_v3_async_index, requires_zarr, requires_zarr_v3, - requires_zarr_v3_async_index, ) from xarray.tests.test_backends import ZARR_FORMATS from xarray.tests.test_dataset import create_test_data From a8a2860f461b6ad7a1214caffb40b733f5969210 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 4 Aug 2025 22:39:57 +0100 Subject: 
[PATCH 079/112] rename flag to make it more clear its only for orthogonal and vectorized indexing --- xarray/tests/__init__.py | 8 ++++---- xarray/tests/test_async.py | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 652f26ede30..3b4e49c64d8 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -132,7 +132,7 @@ def _importorskip( has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.1.0") -has_zarr_v3_async_index, requires_zarr_v3_async_index = _importorskip("zarr", "3.1.2") +has_zarr_v3_async_oindex, requires_zarr_v3_async_oindex = _importorskip("zarr", "3.1.2") if has_zarr_v3: import zarr @@ -141,13 +141,13 @@ def _importorskip( # installing from git main is giving me a lower version than the # most recently released zarr has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") - has_zarr_v3_async_index = hasattr(zarr.AsyncArray, "oindex") + has_zarr_v3_async_oindex = hasattr(zarr.AsyncArray, "oindex") requires_zarr_v3_dtypes = pytest.mark.skipif( not has_zarr_v3_dtypes, reason="requires zarr>3.1.0" ) - requires_zarr_v3_async_index = pytest.mark.skipif( - not has_zarr_v3_async_index, reason="requires zarr>3.1.1" + requires_zarr_v3_async_oindex = pytest.mark.skipif( + not has_zarr_v3_async_oindex, reason="requires zarr>3.1.1" ) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 3e517af41a0..34e5dedb93a 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -10,8 +10,7 @@ from xarray.tests import ( has_zarr, has_zarr_v3, - # TODO rename this to show it's specifically for o/vindexing - has_zarr_v3_async_index, + has_zarr_v3_async_oindex, requires_zarr, requires_zarr_v3, ) @@ -163,7 +162,7 @@ async def test_indexing( target_zarr_class, ) -> None: - if not has_zarr_v3_async_index and target_zarr_class in 
("zarr.core.indexing.AsyncOIndex", "zarr.core.indexing.AsyncVIndex"): + if not has_zarr_v3_async_oindex and target_zarr_class in ("zarr.core.indexing.AsyncOIndex", "zarr.core.indexing.AsyncVIndex"): pytest.skip("current version of zarr does not support orthogonal or vectorized async indexing") if cls_name == "Variable" and method == "sel": @@ -205,7 +204,7 @@ async def test_indexing( {"dim2": [1, 3]}, "orthogonal async indexing", marks=pytest.mark.skipif( - has_zarr_v3_async_index, + has_zarr_v3_async_oindex, reason="current version of zarr has async orthogonal indexing", ), ), # tests oindexing @@ -216,7 +215,7 @@ async def test_indexing( }, "vectorized async indexing", marks=pytest.mark.skipif( - has_zarr_v3_async_index, + has_zarr_v3_async_oindex, reason="current version of zarr has async vectorized indexing", ), ), # tests vindexing From ef6afdfd0400656cc0b9f2cc4e20fd02c0a5b3fc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Aug 2025 21:41:39 +0000 Subject: [PATCH 080/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_async.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 34e5dedb93a..0f0c97a0ec8 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -161,9 +161,13 @@ async def test_indexing( indexer, target_zarr_class, ) -> None: - - if not has_zarr_v3_async_oindex and target_zarr_class in ("zarr.core.indexing.AsyncOIndex", "zarr.core.indexing.AsyncVIndex"): - pytest.skip("current version of zarr does not support orthogonal or vectorized async indexing") + if not has_zarr_v3_async_oindex and target_zarr_class in ( + "zarr.core.indexing.AsyncOIndex", + "zarr.core.indexing.AsyncVIndex", + ): + pytest.skip( + "current version of zarr does not support orthogonal or vectorized async indexing" + ) if 
cls_name == "Variable" and method == "sel": pytest.skip("Variable doesn't have a .sel method") @@ -197,7 +201,8 @@ async def test_indexing( {"dim2": 2}, "basic async indexing", marks=pytest.mark.skipif( - has_zarr_v3, reason="current version of zarr has basic async indexing" + has_zarr_v3, + reason="current version of zarr has basic async indexing", ), ), # tests basic indexing pytest.param( From de98308f3d3a0be77282836326216f2bb46916c5 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 6 Aug 2025 13:37:00 +0100 Subject: [PATCH 081/112] remove IndexingAdapter special case --- xarray/namedarray/pycompat.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index 0fe5cfdf3b5..7f80135eb20 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -153,13 +153,9 @@ async def async_to_duck_array( from xarray.core.indexing import ( ExplicitlyIndexed, ImplicitToExplicitIndexingAdapter, - IndexingAdapter, ) - if isinstance(data, IndexingAdapter): - # These wrap in-memory arrays, and async isn't needed - return data.get_duck_array() - elif isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): + if isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): return await data.async_get_duck_array() # type: ignore[no-untyped-call, no-any-return] else: return to_duck_array(data, **kwargs) From e32ea133a2251245936d31b5e6461ee5d13acf55 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 6 Aug 2025 08:53:53 -0600 Subject: [PATCH 082/112] type fixes --- xarray/backends/common.py | 4 ++-- xarray/core/indexing.py | 7 +++++++ xarray/namedarray/pycompat.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index a9f21e9a3bd..fe73fd4e1c2 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -270,7 +270,7 @@ def robust_getitem(array, key, 
catch=Exception, max_retries=6, initial_delay=500 class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): __slots__ = () - async def async_getitem(key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: + async def async_getitem(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: raise NotImplementedError("Backend does not not support asynchronous loading") def get_duck_array(self, dtype: np.typing.DTypeLike = None): @@ -279,7 +279,7 @@ def get_duck_array(self, dtype: np.typing.DTypeLike = None): async def async_get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) - return await self.async_getitem(key) # type: ignore[index] + return await self.async_getitem(key) class AbstractDataStore: diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 4a41fa4f269..487662f73c5 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -523,6 +523,13 @@ class IndexingAdapter: indexing semantics. """ + @property + def ndim(self): + raise NotImplementedError + + def __getitem__(self, indexer: ExplicitIndexer): + raise NotImplementedError + def get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) return self[key] diff --git a/xarray/namedarray/pycompat.py b/xarray/namedarray/pycompat.py index 7f80135eb20..5832f7cc9e7 100644 --- a/xarray/namedarray/pycompat.py +++ b/xarray/namedarray/pycompat.py @@ -156,6 +156,6 @@ async def async_to_duck_array( ) if isinstance(data, ExplicitlyIndexed | ImplicitToExplicitIndexingAdapter): - return await data.async_get_duck_array() # type: ignore[no-untyped-call, no-any-return] + return await data.async_get_duck_array() # type: ignore[union-attr, no-any-return] else: return to_duck_array(data, **kwargs) From da2d43cc31b454a5ae2f782fbcecd6ed4307ab56 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 6 Aug 2025 15:54:20 +0100 Subject: [PATCH 083/112] return a deepcopy --- xarray/core/indexing.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 4a41fa4f269..ab0a2cb9cdf 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -7,6 +7,7 @@ from collections import Counter, defaultdict from collections.abc import Callable, Hashable, Iterable, Mapping from contextlib import suppress +from copy import deepcopy from dataclasses import dataclass, field from datetime import timedelta from typing import TYPE_CHECKING, Any, cast, overload @@ -876,13 +877,13 @@ def get_duck_array(self): duck_array = self.array.get_duck_array() # ensure the array object is cached in-memory self.array = as_indexable(duck_array) - return duck_array + return deepcopy(self.array) async def async_get_duck_array(self): duck_array = await self.array.async_get_duck_array() # ensure the array object is cached in-memory self.array = as_indexable(duck_array) - return duck_array + return deepcopy(self.array) def _oindex_get(self, indexer: OuterIndexer): return type(self)(_wrap_numpy_scalars(self.array.oindex[indexer])) From d46fc3f39cbd63a6314bef38493deb4b83e2c5e6 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 6 Aug 2025 09:13:59 -0600 Subject: [PATCH 084/112] try again --- xarray/core/indexing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index e1aade7eb1f..547c24d1140 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1589,7 +1589,7 @@ def is_fancy_indexer(indexer: Any) -> bool: return True -class NumpyIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): +class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): """Wrap a NumPy array to use explicit indexing.""" __slots__ = ("array",) @@ -1668,7 +1668,7 @@ def __init__(self, array): self.array = array -class ArrayApiIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): +class ArrayApiIndexingAdapter(ExplicitlyIndexedNDArrayMixin, 
IndexingAdapter): """Wrap an array API array to use explicit indexing.""" __slots__ = ("array",) @@ -1809,7 +1809,7 @@ def transpose(self, order): return self.array.transpose(order) -class PandasIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): +class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" __slots__ = ("_dtype", "array") @@ -2067,7 +2067,7 @@ def copy(self, deep: bool = True) -> Self: class CoordinateTransformIndexingAdapter( - IndexingAdapter, ExplicitlyIndexedNDArrayMixin + ExplicitlyIndexedNDArrayMixin, IndexingAdapter ): """Wrap a CoordinateTransform as a lazy coordinate array. From cc253c7d69342a944ab068ac737d149b9483d475 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 6 Aug 2025 09:15:22 -0600 Subject: [PATCH 085/112] one more --- xarray/core/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 547c24d1140..79da94e19a8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1733,7 +1733,7 @@ def _assert_not_chunked_indexer(idxr: tuple[Any, ...]) -> None: ) -class DaskIndexingAdapter(IndexingAdapter, ExplicitlyIndexedNDArrayMixin): +class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): """Wrap a dask array to support explicit indexing.""" __slots__ = ("array",) From 78c9116864e9036b9cbb9be4efc944240face0d3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 6 Aug 2025 09:48:55 -0600 Subject: [PATCH 086/112] Try again --- xarray/core/indexing.py | 51 +++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 79da94e19a8..ef2923bbe1f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -517,29 +517,6 @@ def get_duck_array(self): return self.array -class IndexingAdapter: - """Marker class for 
indexing adapters. - - These classes translate between Xarray's indexing semantics and the underlying array's - indexing semantics. - """ - - @property - def ndim(self): - raise NotImplementedError - - def __getitem__(self, indexer: ExplicitIndexer): - raise NotImplementedError - - def get_duck_array(self): - key = BasicIndexer((slice(None),) * self.ndim) - return self[key] - - async def async_get_duck_array(self): - """These classes are applied to in-memory arrays, so specific async support isn't needed.""" - return self.get_duck_array() - - class ExplicitlyIndexedNDArrayMixin(NDArrayMixin, ExplicitlyIndexed): __slots__ = () @@ -585,6 +562,22 @@ def vindex(self) -> IndexCallable: return IndexCallable(self._vindex_get, self._vindex_set) +class IndexingAdapter(ExplicitlyIndexedNDArrayMixin): + """Marker class for indexing adapters. + + These classes translate between Xarray's indexing semantics and the underlying array's + indexing semantics. + """ + + def get_duck_array(self): + key = BasicIndexer((slice(None),) * self.ndim) + return self[key] + + async def async_get_duck_array(self): + """These classes are applied to in-memory arrays, so specific async support isn't needed.""" + return self.get_duck_array() + + class ImplicitToExplicitIndexingAdapter(NDArrayMixin): """Wrap an array, converting tuples into the indicated explicit indexer.""" @@ -1589,7 +1582,7 @@ def is_fancy_indexer(indexer: Any) -> bool: return True -class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): +class NumpyIndexingAdapter(IndexingAdapter): """Wrap a NumPy array to use explicit indexing.""" __slots__ = ("array",) @@ -1668,7 +1661,7 @@ def __init__(self, array): self.array = array -class ArrayApiIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): +class ArrayApiIndexingAdapter(IndexingAdapter): """Wrap an array API array to use explicit indexing.""" __slots__ = ("array",) @@ -1733,7 +1726,7 @@ def _assert_not_chunked_indexer(idxr: tuple[Any, ...]) -> None: 
) -class DaskIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): +class DaskIndexingAdapter(IndexingAdapter): """Wrap a dask array to support explicit indexing.""" __slots__ = ("array",) @@ -1809,7 +1802,7 @@ def transpose(self, order): return self.array.transpose(order) -class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin, IndexingAdapter): +class PandasIndexingAdapter(IndexingAdapter): """Wrap a pandas.Index to preserve dtypes and handle explicit indexing.""" __slots__ = ("_dtype", "array") @@ -2066,9 +2059,7 @@ def copy(self, deep: bool = True) -> Self: return type(self)(array, self._dtype, self.level) -class CoordinateTransformIndexingAdapter( - ExplicitlyIndexedNDArrayMixin, IndexingAdapter -): +class CoordinateTransformIndexingAdapter(IndexingAdapter): """Wrap a CoordinateTransform as a lazy coordinate array. Supports explicit indexing (both outer and vectorized). From a727ecb6d92e81df263cbc2cb53305288e18889e Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 11 Aug 2025 13:31:25 +0100 Subject: [PATCH 087/112] try fixing _in_memory error by not returning the adapter class --- xarray/core/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index ef2923bbe1f..839fc63926d 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -877,13 +877,13 @@ def get_duck_array(self): duck_array = self.array.get_duck_array() # ensure the array object is cached in-memory self.array = as_indexable(duck_array) - return deepcopy(self.array) + return duck_array async def async_get_duck_array(self): duck_array = await self.array.async_get_duck_array() # ensure the array object is cached in-memory self.array = as_indexable(duck_array) - return deepcopy(self.array) + return duck_array def _oindex_get(self, indexer: OuterIndexer): return type(self)(_wrap_numpy_scalars(self.array.oindex[indexer])) From 9b7afc24863a1669f5f03f42ca84038d713e5ca4 Mon Sep 17 00:00:00 2001 
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:34:13 +0000 Subject: [PATCH 088/112] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 839fc63926d..2876476474e 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -7,7 +7,6 @@ from collections import Counter, defaultdict from collections.abc import Callable, Hashable, Iterable, Mapping from contextlib import suppress -from copy import deepcopy from dataclasses import dataclass, field from datetime import timedelta from typing import TYPE_CHECKING, Any, cast, overload From b4ef26fb2a86cdce3b776756f9515efe4b3319f0 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 11 Aug 2025 14:49:02 +0100 Subject: [PATCH 089/112] remove scope=module from fixture for robustness --- xarray/tests/test_async.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 0f0c97a0ec8..3c8a0fb66fd 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -23,12 +23,11 @@ zarr = None # type: ignore[assignment] -@pytest.fixture(scope="module", params=ZARR_FORMATS) +@pytest.fixture(params=ZARR_FORMATS) def store(request) -> "zarr.storage.MemoryStore": memorystore = zarr.storage.MemoryStore({}) ds = create_test_data() - print(ds) ds.to_zarr(memorystore, zarr_format=request.param, consolidated=False) # type: ignore[call-overload] return memorystore From a7918e4f82aa7972bb448dcbf4110c14a1c0e930 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 11 Aug 2025 17:42:51 +0100 Subject: [PATCH 090/112] modify test to be happy with either error message --- xarray/tests/test_async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_async.py 
b/xarray/tests/test_async.py index 3c8a0fb66fd..1da28f81027 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -206,7 +206,7 @@ async def test_indexing( ), # tests basic indexing pytest.param( {"dim2": [1, 3]}, - "orthogonal async indexing", + "(orthogonal|vectorized) async indexing", # see https://github.com/pydata/xarray/pull/10327#issuecomment-3175821377 marks=pytest.mark.skipif( has_zarr_v3_async_oindex, reason="current version of zarr has async orthogonal indexing", @@ -217,7 +217,7 @@ async def test_indexing( "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1, 3], dims="points"), }, - "vectorized async indexing", + "(orthogonal|vectorized) async indexing", # see https://github.com/pydata/xarray/pull/10327#issuecomment-3175821377 marks=pytest.mark.skipif( has_zarr_v3_async_oindex, reason="current version of zarr has async vectorized indexing", From 199d50aece6710a667299061726b0546c2390bb8 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Mon, 11 Aug 2025 19:02:37 +0100 Subject: [PATCH 091/112] use Variable instead of Dataset to avoid race condition of indexing between different variables --- xarray/tests/test_async.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index 1da28f81027..f2a6d93727a 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -206,7 +206,7 @@ async def test_indexing( ), # tests basic indexing pytest.param( {"dim2": [1, 3]}, - "(orthogonal|vectorized) async indexing", # see https://github.com/pydata/xarray/pull/10327#issuecomment-3175821377 + "orthogonal async indexing", marks=pytest.mark.skipif( has_zarr_v3_async_oindex, reason="current version of zarr has async orthogonal indexing", @@ -217,7 +217,7 @@ async def test_indexing( "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1, 3], dims="points"), }, - "(orthogonal|vectorized) async indexing", # see 
https://github.com/pydata/xarray/pull/10327#issuecomment-3175821377 + "vectorized async indexing", marks=pytest.mark.skipif( has_zarr_v3_async_oindex, reason="current version of zarr has async vectorized indexing", @@ -229,6 +229,7 @@ async def test_raise_on_older_zarr_version(self, store, indexer, expected_err_ms """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" ds = xr.open_zarr(store, consolidated=False, chunks=None) + var = ds["var1"].variable with pytest.raises(NotImplementedError, match=expected_err_msg): - await ds.isel(**indexer).load_async() + await var.isel(**indexer).load_async() From 1cbe9133b23414cb37546a33f8438332df1a7c98 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 10:56:51 +0100 Subject: [PATCH 092/112] fix bad merge in API docs --- doc/api.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index f7bb382e922..fc862c21e4c 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -11,3 +11,30 @@ and examples, refer to the relevant chapters in the main part of the documentation. See also: :ref:`public-api` and :ref:`api-stability`. + +.. 
toctree:: + :maxdepth: 1 + + api/top-level + api/dataset + api/dataarray + api/datatree + api/coordinates + api/indexes + api/ufuncs + api/io + api/encoding + api/plotting + api/groupby + api/rolling + api/coarsen + api/rolling-exp + api/weighted + api/resample + api/accessors + api/tutorial + api/testing + api/backends + api/exceptions + api/advanced + api/deprecated From 46d9414b9415cb9f18838ed07f968d6268ee3ed3 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 11:57:47 +0100 Subject: [PATCH 093/112] add test to test_backends.py --- xarray/tests/test_backends.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ef68a959dfc..d5fe6e0d9f8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2454,6 +2454,44 @@ def test_non_existent_store(self) -> None: ): xr.open_zarr(f"{uuid.uuid4()}") + @pytest.mark.skipif( + not has_zarr_v3, reason="zarr-python <3 did not support async loading" + ) + @pytest.mark.asyncio + async def test_load_async(self) -> None: + """Copied from `test_load` on the base test class, but won't work for netcdf""" + expected = create_test_data() + + @contextlib.contextmanager + def assert_loads(vars=None): + if vars is None: + vars = expected + with self.roundtrip(expected) as actual: + for k, v in actual.variables.items(): + # IndexVariables are eagerly loaded into memory + assert v._in_memory == (k in actual.dims) + yield actual + for k, v in actual.variables.items(): + if k in vars: + assert v._in_memory + assert_identical(expected, actual) + + with pytest.raises(AssertionError): + # make sure the contextmanager works! 
+ with assert_loads() as ds: + pass + + with assert_loads() as ds: + await ds.load_async() + + with assert_loads(["var1", "dim1", "dim2"]) as ds: + await ds["var1"].load_async() + + # verify we can read data even after closing the file + with self.roundtrip(expected) as ds: + actual = await ds.load_async() + assert_identical(expected, actual) + @pytest.mark.skipif(has_zarr_v3, reason="chunk_store not implemented in zarr v3") def test_with_chunkstore(self) -> None: expected = create_test_data() From b4a5a90c995fc63e3e82efb30afa823a351ad0bd Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 11:58:17 +0100 Subject: [PATCH 094/112] fix bug found by new test, causing pandas indexes to be converted to numpy arrays --- xarray/core/variable.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 145836da743..b831e927878 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -2705,6 +2705,10 @@ def load(self): # data is already loaded into memory for IndexVariable return self + async def load_async(self): + # data is already loaded into memory for IndexVariable + return self + # https://github.com/python/mypy/issues/1465 @Variable.data.setter # type: ignore[attr-defined] def data(self, data): From bc1fe4efcfd0d963c4e44d9cec514cc3f79c24f7 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 13:33:57 +0100 Subject: [PATCH 095/112] add test to test_variable.py for lazy async indexing --- xarray/tests/test_variable.py | 37 +++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index e2f4a3154f3..441db94349a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2898,12 +2898,28 @@ def setUp(self): self.cat = PandasExtensionArray(pd.Categorical(["a", "b"] * 5)) def check_orthogonal_indexing(self, v): - assert np.allclose(v.isel(x=[8, 3], y=[2, 1]), 
self.d[[8, 3]][:, [2, 1]]) + result = v.isel(x=[8, 3], y=[2, 1]) + expected = self.d[[8, 3]][:, [2, 1]] + assert np.allclose(result, expected) + + async def check_orthogonal_async_indexing(self, v): + result = await v.isel(x=[8, 3], y=[2, 1]).load_async() + expected = self.d[[8, 3]][:, [2, 1]] + assert np.allclose(result, expected) def check_vectorized_indexing(self, v): ind_x = Variable("z", [0, 2]) ind_y = Variable("z", [2, 1]) - assert np.allclose(v.isel(x=ind_x, y=ind_y), self.d[ind_x, ind_y]) + result = v.isel(x=ind_x, y=ind_y) + expected = self.d[ind_x, ind_y] + assert np.allclose(result, expected) + + async def check_vectorized_async_indexing(self, v): + ind_x = Variable("z", [0, 2]) + ind_y = Variable("z", [2, 1]) + result = await v.isel(x=ind_x, y=ind_y).load_async() + expected = self.d[ind_x, ind_y] + assert np.allclose(result, expected) def test_NumpyIndexingAdapter(self): v = Variable(dims=("x", "y"), data=NumpyIndexingAdapter(self.d)) @@ -2939,6 +2955,23 @@ def test_LazilyIndexedArray(self): ) self.check_orthogonal_indexing(v) + @pytest.mark.asyncio + async def test_lazy_async_indexing(self) -> None: + v = Variable(dims=("x", "y"), data=LazilyIndexedArray(self.d)) + await self.check_orthogonal_async_indexing(v) + await self.check_vectorized_async_indexing(v) + # doubly wrapping + v = Variable( + dims=("x", "y"), + data=LazilyIndexedArray(LazilyIndexedArray(self.d)), + ) + await self.check_orthogonal_async_indexing(v) + # hierarchical wrapping + v = Variable( + dims=("x", "y"), data=LazilyIndexedArray(NumpyIndexingAdapter(self.d)) + ) + await self.check_orthogonal_async_indexing(v) + def test_CopyOnWriteArray(self): v = Variable(dims=("x", "y"), data=CopyOnWriteArray(self.d)) self.check_orthogonal_indexing(v) From 10f7e613c5a6c778f15bc8404e947cbec9d316ee Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 13:54:28 +0100 Subject: [PATCH 096/112] move async load tests from test_async.py to test_backends.py --- xarray/tests/test_async.py | 
164 ------------------------------- xarray/tests/test_backends.py | 178 ++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 164 deletions(-) diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py index f2a6d93727a..efbfce47c37 100644 --- a/xarray/tests/test_async.py +++ b/xarray/tests/test_async.py @@ -1,18 +1,11 @@ -import asyncio -from importlib import import_module -from typing import Any, Literal -from unittest.mock import patch - import pytest import xarray as xr -import xarray.testing as xrt from xarray.tests import ( has_zarr, has_zarr_v3, has_zarr_v3_async_oindex, requires_zarr, - requires_zarr_v3, ) from xarray.tests.test_backends import ZARR_FORMATS from xarray.tests.test_dataset import create_test_data @@ -33,165 +26,8 @@ def store(request) -> "zarr.storage.MemoryStore": return memorystore -def get_xr_obj( - store: "zarr.abc.store.Store", cls_name: Literal["Variable", "DataArray", "Dataset"] -): - ds = xr.open_zarr(store, consolidated=False, chunks=None) - - match cls_name: - case "Variable": - return ds["var1"].variable - case "DataArray": - return ds["var1"] - case "Dataset": - return ds - - -def _resolve_class_from_string(class_path: str) -> type[Any]: - """Resolve a string class path like 'zarr.AsyncArray' to the actual class.""" - module_path, class_name = class_path.rsplit(".", 1) - module = import_module(module_path) - return getattr(module, class_name) - - @pytest.mark.asyncio class TestAsyncLoad: - @requires_zarr_v3 - async def test_concurrent_load_multiple_variables(self, store) -> None: - target_class = zarr.AsyncArray - method_name = "getitem" - original_method = getattr(target_class, method_name) - - # the indexed coordinate variables is not lazy, so the create_test_dataset has 4 lazy variables in total - N_LAZY_VARS = 4 - - with patch.object( - target_class, method_name, wraps=original_method, autospec=True - ) as mocked_meth: - # blocks upon loading the coordinate variables here - ds = xr.open_zarr(store, 
consolidated=False, chunks=None) - - # TODO we're not actually testing that these indexing methods are not blocking... - result_ds = await ds.load_async() - - mocked_meth.assert_called() - assert mocked_meth.call_count >= N_LAZY_VARS - mocked_meth.assert_awaited() - - xrt.assert_identical(result_ds, ds.load()) - - @requires_zarr_v3 - @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) - async def test_concurrent_load_multiple_objects(self, store, cls_name) -> None: - N_OBJECTS = 5 - - target_class = zarr.AsyncArray - method_name = "getitem" - original_method = getattr(target_class, method_name) - - with patch.object( - target_class, method_name, wraps=original_method, autospec=True - ) as mocked_meth: - xr_obj = get_xr_obj(store, cls_name) - - # TODO we're not actually testing that these indexing methods are not blocking... - coros = [xr_obj.load_async() for _ in range(N_OBJECTS)] - results = await asyncio.gather(*coros) - - mocked_meth.assert_called() - assert mocked_meth.call_count >= N_OBJECTS - mocked_meth.assert_awaited() - - for result in results: - xrt.assert_identical(result, xr_obj.load()) - - @requires_zarr_v3 - @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) - @pytest.mark.parametrize( - "indexer, method, target_zarr_class", - [ - ({}, "sel", "zarr.AsyncArray"), - ({}, "isel", "zarr.AsyncArray"), - ({"dim2": 1.0}, "sel", "zarr.AsyncArray"), - ({"dim2": 2}, "isel", "zarr.AsyncArray"), - ({"dim2": slice(1.0, 3.0)}, "sel", "zarr.AsyncArray"), - ({"dim2": slice(1, 3)}, "isel", "zarr.AsyncArray"), - ( - {"dim2": [1.0, 3.0]}, - "sel", - "zarr.core.indexing.AsyncOIndex", - ), - ({"dim2": [1, 3]}, "isel", "zarr.core.indexing.AsyncOIndex"), - ( - { - "dim1": xr.Variable(data=[2, 3], dims="points"), - "dim2": xr.Variable(data=[1.0, 2.0], dims="points"), - }, - "sel", - "zarr.core.indexing.AsyncVIndex", - ), - ( - { - "dim1": xr.Variable(data=[2, 3], dims="points"), - "dim2": xr.Variable(data=[1, 3], dims="points"), 
- }, - "isel", - "zarr.core.indexing.AsyncVIndex", - ), - ], - ids=[ - "no-indexing-sel", - "no-indexing-isel", - "basic-int-sel", - "basic-int-isel", - "basic-slice-sel", - "basic-slice-isel", - "outer-sel", - "outer-isel", - "vectorized-sel", - "vectorized-isel", - ], - ) - async def test_indexing( - self, - store, - cls_name, - method, - indexer, - target_zarr_class, - ) -> None: - if not has_zarr_v3_async_oindex and target_zarr_class in ( - "zarr.core.indexing.AsyncOIndex", - "zarr.core.indexing.AsyncVIndex", - ): - pytest.skip( - "current version of zarr does not support orthogonal or vectorized async indexing" - ) - - if cls_name == "Variable" and method == "sel": - pytest.skip("Variable doesn't have a .sel method") - - # Each type of indexing ends up calling a different zarr indexing method - # They all use a method named .getitem, but on a different internal zarr class - target_class = _resolve_class_from_string(target_zarr_class) - method_name = "getitem" - original_method = getattr(target_class, method_name) - - with patch.object( - target_class, method_name, wraps=original_method, autospec=True - ) as mocked_meth: - xr_obj = get_xr_obj(store, cls_name) - - # TODO we're not actually testing that these indexing methods are not blocking... 
- result = await getattr(xr_obj, method)(**indexer).load_async() - - mocked_meth.assert_called() - mocked_meth.assert_awaited() - assert mocked_meth.call_count > 0 - - expected = getattr(xr_obj, method)(**indexer).load() - xrt.assert_identical(result, expected) - @requires_zarr @pytest.mark.parametrize( ("indexer", "expected_err_msg"), diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 66c9b0b11ad..e2f1f9227f6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import contextlib import gzip import itertools @@ -16,6 +17,7 @@ from collections import ChainMap from collections.abc import Generator, Iterator, Mapping from contextlib import ExitStack +from importlib import import_module from io import BytesIO from pathlib import Path from typing import TYPE_CHECKING, Any, Final, Literal, cast @@ -28,6 +30,7 @@ from pandas.errors import OutOfBoundsDatetime import xarray as xr +import xarray.testing as xrt from xarray import ( DataArray, Dataset, @@ -74,9 +77,11 @@ has_scipy, has_zarr, has_zarr_v3, + has_zarr_v3_async_oindex, has_zarr_v3_dtypes, mock, network, + parametrize_zarr_format, requires_cftime, requires_dask, requires_fsspec, @@ -3867,6 +3872,179 @@ def test_chunk_key_encoding_v2(self) -> None: # Verify chunks are preserved assert actual["var1"].encoding["chunks"] == (2, 2) + @pytest.mark.asyncio + @requires_zarr_v3 + @parametrize_zarr_format + async def test_async_load_multiple_variables(self, zarr_format) -> None: + target_class = zarr.AsyncArray + method_name = "getitem" + original_method = getattr(target_class, method_name) + + # the indexed coordinate variables is not lazy, so the create_test_dataset has 4 lazy variables in total + N_LAZY_VARS = 4 + + original = create_test_data() + with self.create_zarr_target() as store: + original.to_zarr(store, zarr_format=zarr_format, consolidated=False) + + with patch.object( + target_class, 
method_name, wraps=original_method, autospec=True + ) as mocked_meth: + # blocks upon loading the coordinate variables here + ds = xr.open_zarr(store, consolidated=False, chunks=None) + + # TODO we're not actually testing that these indexing methods are not blocking... + result_ds = await ds.load_async() + + mocked_meth.assert_called() + assert mocked_meth.call_count >= N_LAZY_VARS + mocked_meth.assert_awaited() + + xrt.assert_identical(result_ds, ds.load()) + + # TODO parametrize_zarr_format in to_zarr + @pytest.mark.asyncio + @requires_zarr_v3 + @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) + async def test_concurrent_load_multiple_objects(self, cls_name) -> None: + N_OBJECTS = 5 + + target_class = zarr.AsyncArray + method_name = "getitem" + original_method = getattr(target_class, method_name) + + original = create_test_data() + with self.create_zarr_target() as store: + original.to_zarr(store, consolidated=False) + + with patch.object( + target_class, method_name, wraps=original_method, autospec=True + ) as mocked_meth: + xr_obj = get_xr_obj(store, cls_name) + + # TODO we're not actually testing that these indexing methods are not blocking... + coros = [xr_obj.load_async() for _ in range(N_OBJECTS)] + results = await asyncio.gather(*coros) + + mocked_meth.assert_called() + assert mocked_meth.call_count >= N_OBJECTS + mocked_meth.assert_awaited() + + for result in results: + xrt.assert_identical(result, xr_obj.load()) + + # TODO parametrize zarr_format? 
+ @pytest.mark.asyncio + @requires_zarr_v3 + @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) + @pytest.mark.parametrize( + "indexer, method, target_zarr_class", + [ + ({}, "sel", "zarr.AsyncArray"), + ({}, "isel", "zarr.AsyncArray"), + ({"dim2": 1.0}, "sel", "zarr.AsyncArray"), + ({"dim2": 2}, "isel", "zarr.AsyncArray"), + ({"dim2": slice(1.0, 3.0)}, "sel", "zarr.AsyncArray"), + ({"dim2": slice(1, 3)}, "isel", "zarr.AsyncArray"), + ( + {"dim2": [1.0, 3.0]}, + "sel", + "zarr.core.indexing.AsyncOIndex", + ), + ({"dim2": [1, 3]}, "isel", "zarr.core.indexing.AsyncOIndex"), + ( + { + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1.0, 2.0], dims="points"), + }, + "sel", + "zarr.core.indexing.AsyncVIndex", + ), + ( + { + "dim1": xr.Variable(data=[2, 3], dims="points"), + "dim2": xr.Variable(data=[1, 3], dims="points"), + }, + "isel", + "zarr.core.indexing.AsyncVIndex", + ), + ], + ids=[ + "no-indexing-sel", + "no-indexing-isel", + "basic-int-sel", + "basic-int-isel", + "basic-slice-sel", + "basic-slice-isel", + "outer-sel", + "outer-isel", + "vectorized-sel", + "vectorized-isel", + ], + ) + async def test_indexing( + self, + cls_name, + method, + indexer, + target_zarr_class, + ) -> None: + if not has_zarr_v3_async_oindex and target_zarr_class in ( + "zarr.core.indexing.AsyncOIndex", + "zarr.core.indexing.AsyncVIndex", + ): + pytest.skip( + "current version of zarr does not support orthogonal or vectorized async indexing" + ) + + if cls_name == "Variable" and method == "sel": + pytest.skip("Variable doesn't have a .sel method") + + # Each type of indexing ends up calling a different zarr indexing method + # They all use a method named .getitem, but on a different internal zarr class + def _resolve_class_from_string(class_path: str) -> type[Any]: + """Resolve a string class path like 'zarr.AsyncArray' to the actual class.""" + module_path, class_name = class_path.rsplit(".", 1) + module = import_module(module_path) 
+ return getattr(module, class_name) + + target_class = _resolve_class_from_string(target_zarr_class) + method_name = "getitem" + original_method = getattr(target_class, method_name) + + original = create_test_data() + with self.create_zarr_target() as store: + original.to_zarr(store, consolidated=False) + + with patch.object( + target_class, method_name, wraps=original_method, autospec=True + ) as mocked_meth: + xr_obj = get_xr_obj(store, cls_name) + + # TODO we're not actually testing that these indexing methods are not blocking... + result = await getattr(xr_obj, method)(**indexer).load_async() + + mocked_meth.assert_called() + mocked_meth.assert_awaited() + assert mocked_meth.call_count > 0 + + expected = getattr(xr_obj, method)(**indexer).load() + xrt.assert_identical(result, expected) + + +def get_xr_obj( + store: zarr.abc.store.Store, cls_name: Literal["Variable", "DataArray", "Dataset"] +): + ds = xr.open_zarr(store, consolidated=False, chunks=None) + + match cls_name: + case "Variable": + return ds["var1"].variable + case "DataArray": + return ds["var1"] + case "Dataset": + return ds + class NoConsolidatedMetadataSupportStore(WrapperStore): """ From 8416f00416df547369124aaa8cea3a97257bfaa4 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 14:12:03 +0100 Subject: [PATCH 097/112] parametrize all tests over zarr_format --- xarray/tests/test_backends.py | 57 ++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e2f1f9227f6..8e076ca82d2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3902,11 +3902,11 @@ async def test_async_load_multiple_variables(self, zarr_format) -> None: xrt.assert_identical(result_ds, ds.load()) - # TODO parametrize_zarr_format in to_zarr @pytest.mark.asyncio + @parametrize_zarr_format @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) - 
async def test_concurrent_load_multiple_objects(self, cls_name) -> None: + async def test_concurrent_load_multiple_objects(self, cls_name, zarr_format) -> None: N_OBJECTS = 5 target_class = zarr.AsyncArray @@ -3915,7 +3915,7 @@ async def test_concurrent_load_multiple_objects(self, cls_name) -> None: original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, consolidated=False) + original.to_zarr(store, consolidated=False, zarr_format=zarr_format) with patch.object( target_class, method_name, wraps=original_method, autospec=True @@ -3982,12 +3982,14 @@ async def test_concurrent_load_multiple_objects(self, cls_name) -> None: "vectorized-isel", ], ) + @parametrize_zarr_format async def test_indexing( self, cls_name, method, indexer, target_zarr_class, + zarr_format, ) -> None: if not has_zarr_v3_async_oindex and target_zarr_class in ( "zarr.core.indexing.AsyncOIndex", @@ -4014,7 +4016,7 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, consolidated=False) + original.to_zarr(store, consolidated=False, zarr_format=zarr_format) with patch.object( target_class, method_name, wraps=original_method, autospec=True @@ -4031,6 +4033,53 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: expected = getattr(xr_obj, method)(**indexer).load() xrt.assert_identical(result, expected) + @pytest.mark.asyncio + @pytest.mark.parametrize( + ("indexer", "expected_err_msg"), + [ + pytest.param( + {"dim2": 2}, + "basic async indexing", + marks=pytest.mark.skipif( + has_zarr_v3, + reason="current version of zarr has basic async indexing", + ), + ), # tests basic indexing + pytest.param( + {"dim2": [1, 3]}, + "orthogonal async indexing", + marks=pytest.mark.skipif( + has_zarr_v3_async_oindex, + reason="current version of zarr has async orthogonal indexing", + ), + ), # tests oindexing + pytest.param( + { + "dim1": xr.Variable(data=[2, 
3], dims="points"), + "dim2": xr.Variable(data=[1, 3], dims="points"), + }, + "vectorized async indexing", + marks=pytest.mark.skipif( + has_zarr_v3_async_oindex, + reason="current version of zarr has async vectorized indexing", + ), + ), # tests vindexing + ], + ) + @parametrize_zarr_format + async def test_raise_on_older_zarr_version(self, indexer, expected_err_msg, zarr_format): + """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" + + original = create_test_data() + with self.create_zarr_target() as store: + original.to_zarr(store, consolidated=False, zarr_format=zarr_format) + + ds = xr.open_zarr(store, consolidated=False, chunks=None) + var = ds["var1"].variable + + with pytest.raises(NotImplementedError, match=expected_err_msg): + await var.isel(**indexer).load_async() + def get_xr_obj( store: zarr.abc.store.Store, cls_name: Literal["Variable", "DataArray", "Dataset"] From 4edd503a8f0f3ad526d295d195083499e950c403 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 14:16:01 +0100 Subject: [PATCH 098/112] remove test_async.py file entirely --- xarray/tests/test_async.py | 71 -------------------------------------- 1 file changed, 71 deletions(-) delete mode 100644 xarray/tests/test_async.py diff --git a/xarray/tests/test_async.py b/xarray/tests/test_async.py deleted file mode 100644 index efbfce47c37..00000000000 --- a/xarray/tests/test_async.py +++ /dev/null @@ -1,71 +0,0 @@ -import pytest - -import xarray as xr -from xarray.tests import ( - has_zarr, - has_zarr_v3, - has_zarr_v3_async_oindex, - requires_zarr, -) -from xarray.tests.test_backends import ZARR_FORMATS -from xarray.tests.test_dataset import create_test_data - -if has_zarr: - import zarr -else: - zarr = None # type: ignore[assignment] - - -@pytest.fixture(params=ZARR_FORMATS) -def store(request) -> "zarr.storage.MemoryStore": - memorystore = zarr.storage.MemoryStore({}) - - ds = create_test_data() - ds.to_zarr(memorystore, 
zarr_format=request.param, consolidated=False) # type: ignore[call-overload] - - return memorystore - - -@pytest.mark.asyncio -class TestAsyncLoad: - @requires_zarr - @pytest.mark.parametrize( - ("indexer", "expected_err_msg"), - [ - pytest.param( - {"dim2": 2}, - "basic async indexing", - marks=pytest.mark.skipif( - has_zarr_v3, - reason="current version of zarr has basic async indexing", - ), - ), # tests basic indexing - pytest.param( - {"dim2": [1, 3]}, - "orthogonal async indexing", - marks=pytest.mark.skipif( - has_zarr_v3_async_oindex, - reason="current version of zarr has async orthogonal indexing", - ), - ), # tests oindexing - pytest.param( - { - "dim1": xr.Variable(data=[2, 3], dims="points"), - "dim2": xr.Variable(data=[1, 3], dims="points"), - }, - "vectorized async indexing", - marks=pytest.mark.skipif( - has_zarr_v3_async_oindex, - reason="current version of zarr has async vectorized indexing", - ), - ), # tests vindexing - ], - ) - async def test_raise_on_older_zarr_version(self, store, indexer, expected_err_msg): - """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" - - ds = xr.open_zarr(store, consolidated=False, chunks=None) - var = ds["var1"].variable - - with pytest.raises(NotImplementedError, match=expected_err_msg): - await var.isel(**indexer).load_async() From fcb2c115eebf67d8e57537df753e3feb6f83fd51 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 12 Aug 2025 14:16:36 +0100 Subject: [PATCH 099/112] lint --- xarray/tests/test_backends.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8e076ca82d2..effd708c918 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3906,7 +3906,9 @@ async def test_async_load_multiple_variables(self, zarr_format) -> None: @parametrize_zarr_format @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", 
"Dataset"]) - async def test_concurrent_load_multiple_objects(self, cls_name, zarr_format) -> None: + async def test_concurrent_load_multiple_objects( + self, cls_name, zarr_format + ) -> None: N_OBJECTS = 5 target_class = zarr.AsyncArray @@ -3933,7 +3935,6 @@ async def test_concurrent_load_multiple_objects(self, cls_name, zarr_format) -> for result in results: xrt.assert_identical(result, xr_obj.load()) - # TODO parametrize zarr_format? @pytest.mark.asyncio @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) @@ -4067,7 +4068,9 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: ], ) @parametrize_zarr_format - async def test_raise_on_older_zarr_version(self, indexer, expected_err_msg, zarr_format): + async def test_raise_on_older_zarr_version( + self, indexer, expected_err_msg, zarr_format + ): """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" original = create_test_data() From 476057459b8e1a0f3a9d57b24a002f374f24ba5f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 02:15:44 -0700 Subject: [PATCH 100/112] Stricter assertions Co-authored-by: Deepak Cherian --- xarray/tests/test_backends.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index effd708c918..bb388f4f2fe 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3897,7 +3897,7 @@ async def test_async_load_multiple_variables(self, zarr_format) -> None: result_ds = await ds.load_async() mocked_meth.assert_called() - assert mocked_meth.call_count >= N_LAZY_VARS + assert mocked_meth.call_count == N_LAZY_VARS mocked_meth.assert_awaited() xrt.assert_identical(result_ds, ds.load()) @@ -3929,7 +3929,7 @@ async def test_concurrent_load_multiple_objects( results = await asyncio.gather(*coros) mocked_meth.assert_called() - assert mocked_meth.call_count >= N_OBJECTS + assert 
mocked_meth.call_count == N_OBJECTS mocked_meth.assert_awaited() for result in results: From 8bce3bbb4eb048abfd99dec38cdc2d4e92174aa1 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 02:17:06 -0700 Subject: [PATCH 101/112] Update doc/whats-new.rst Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9818c8cc494..24c7c9040ab 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,7 +13,7 @@ v2025.07.2 (unreleased) New Features ~~~~~~~~~~~~ -- Added new asynchronous loading methods :py:meth:`~xarray.Dataset.load_async`, :py:meth:`~xarray.DataArray.load_async`, :py:meth:`~xarray.Variable.load_async`. +- Added new asynchronous loading methods :py:meth:`Dataset.load_async`, :py:meth:`DataArray.load_async`, :py:meth:`Variable.load_async`. (:issue:`10326`, :pull:`10327`) By `Tom Nicholas `_. - :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return bytes if called without a filepath. (:issue:`10570`) By `Matthew Willson `_. From 37be891fdeef98bbe43f3ab1efab679943650538 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 10:48:36 +0100 Subject: [PATCH 102/112] add docstring for Variable.load_async --- xarray/core/variable.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index b831e927878..ceaa2b8bfa0 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -976,7 +976,7 @@ def _replace( encoding = copy.copy(self._encoding) return type(self)(dims, data, attrs, encoding, fastpath=True) - def load(self, **kwargs): + def load(self, **kwargs) -> Self: """Manually trigger loading of this variable's data from disk or a remote source into memory and return this variable. 
@@ -991,16 +991,34 @@ def load(self, **kwargs): See Also -------- + Variable.compute + Variable.load_async dask.array.compute """ self._data = to_duck_array(self._data, **kwargs) return self - async def load_async(self, **kwargs): + async def load_async(self, **kwargs) -> Self: + """Manually trigger and await asynchronous loading of this variable's data from disk or a + remote source into memory and return this variable. + + Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. + Otherwise will raise a NotImplementedError. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments passed on to ``dask.array.compute``. + + See Also + -------- + Variable.load + dask.array.compute + """ self._data = await async_to_duck_array(self._data, **kwargs) return self - def compute(self, **kwargs): + def compute(self, **kwargs) -> Self: """Manually trigger loading of this variable's data from disk or a remote source into memory and return a new variable. The original is left unaltered. 
@@ -1017,6 +1035,7 @@ def compute(self, **kwargs): See Also -------- dask.array.compute + Variable.load """ new = self.copy(deep=False) return new.load(**kwargs) From 3bd7b8aa7cb1c8245ed1390f21a70771f8a86c0a Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 11:18:05 +0100 Subject: [PATCH 103/112] make all load-related docstrings consistent --- xarray/core/dataarray.py | 54 +++++++++++++++++++++++++++++++++----- xarray/core/dataset.py | 56 +++++++++++++++++++++++++++++++++++----- xarray/core/variable.py | 49 ++++++++++++++++++++++++++++------- 3 files changed, 138 insertions(+), 21 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2167f159d2f..77f407a3b11 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1135,10 +1135,11 @@ def _dask_finalize(cls, results, name, func, *args, **kwargs) -> Self: return cls(variable, coords, name=name, indexes=indexes, fastpath=True) def load(self, **kwargs) -> Self: - """Manually trigger loading of this array's data from disk or a - remote source into memory and return this array. + """Trigger loading data into memory and return this dataarray. - Unlike compute, the original dataset is modified and returned. + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.compute``, the original dataarray is modified and returned. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -1150,9 +1151,18 @@ def load(self, **kwargs) -> Self: **kwargs : dict Additional keyword arguments passed on to ``dask.compute``. + Returns + ------- + object : DataArray + Same object but with lazy data and coordinates as in-memory arrays. 
+ See Also -------- dask.compute + DataArray.load_async + DataArray.compute + Dataset.load + Variable.load """ ds = self._to_temp_dataset().load(**kwargs) new = self._from_temp_dataset(ds) @@ -1161,6 +1171,33 @@ def load(self, **kwargs) -> Self: return self async def load_async(self, **kwargs) -> Self: + """Trigger and await asynchronous loading of data into memory and return this dataarray. + + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.compute``, the original dataarray is modified and returned. + + Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. + Otherwise will raise a NotImplementedError. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments passed on to ``dask.compute``. + + Returns + ------- + object : DataArray + Same object but with lazy data and coordinates as in-memory arrays. + + See Also + -------- + dask.compute + DataArray.compute + DataArray.load + Dataset.load_async + Variable.load_async + """ temp_ds = self._to_temp_dataset() ds = await temp_ds.load_async(**kwargs) new = self._from_temp_dataset(ds) @@ -1169,10 +1206,33 @@ async def load_async(self, **kwargs) -> Self: return self def compute(self, **kwargs) -> Self: - """Manually trigger loading of this array's data from disk or a - remote source into memory and return a new array. + """Trigger loading data into memory and return a new dataarray. + + Data will be computed and/or loaded from disk or a remote source. - Unlike load, the original is left unaltered. + Unlike ``.load``, the original dataarray is left unaltered. 
Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -1192,6 +1230,10 @@ def compute(self, **kwargs) -> Self: See Also -------- dask.compute + DataArray.load + DataArray.load_async + Dataset.compute + Variable.compute """ new = self.copy(deep=False) return new.load(**kwargs) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6b4a749f5dc..ccdf76c2bc4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -521,9 +521,11 @@ def dtypes(self) -> Frozen[Hashable, np.dtype]: ) def load(self, **kwargs) -> Self: - """Manually trigger loading and/or computation of this dataset's data - from disk or a remote source into memory and return this dataset. - Unlike compute, the original dataset is modified and returned. + """Trigger loading data into memory and return this dataset. + + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.compute``, the original dataset is modified and returned. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -535,9 +537,18 @@ def load(self, **kwargs) -> Self: **kwargs : dict Additional keyword arguments passed on to ``dask.compute``. + Returns + ------- + object : Dataset + Same object but with lazy data variables and coordinates as in-memory arrays. + See Also -------- dask.compute + Dataset.compute + Dataset.load_async + DataArray.load + Variable.load """ # access .data to coerce everything to numpy or dask arrays chunked_data = { @@ -560,6 +571,33 @@ def load(self, **kwargs) -> Self: return self async def load_async(self, **kwargs) -> Self: + """Trigger and await asynchronous loading of data into memory and return this dataset. + + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.compute``, the original dataset is modified and returned. 
+ + Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. + Otherwise will raise a NotImplementedError. + + Parameters + ---------- + **kwargs : dict + Additional keyword arguments passed on to ``dask.compute``. + + Returns + ------- + object : Dataset + Same object but with lazy data variables and coordinates as in-memory arrays. + + See Also + -------- + dask.compute + Dataset.compute + Dataset.load + DataArray.load_async + Variable.load_async + """ # TODO refactor this to pull out the common chunked_data codepath # this blocks on chunked arrays but not on lazily indexed arrays @@ -722,9 +760,11 @@ def _dask_postpersist( ) def compute(self, **kwargs) -> Self: - """Manually trigger loading and/or computation of this dataset's data - from disk or a remote source into memory and return a new dataset. - Unlike load, the original dataset is left unaltered. + """Trigger loading data into memory and return a new dataset. + + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.load``, the original dataset is left unaltered. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -744,6 +784,10 @@ def compute(self, **kwargs) -> Self: See Also -------- dask.compute + Dataset.load + Dataset.load_async + DataArray.compute + Variable.compute """ new = self.copy(deep=False) return new.load(**kwargs) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index ceaa2b8bfa0..5b84924177f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -977,8 +977,11 @@ def _replace( return type(self)(dims, data, attrs, encoding, fastpath=True) def load(self, **kwargs) -> Self: - """Manually trigger loading of this variable's data from disk or a - remote source into memory and return this variable. + """Trigger loading data into memory and return this variable. 
+ + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.compute``, the original variable is modified and returned. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -989,18 +992,28 @@ def load(self, **kwargs) -> Self: **kwargs : dict Additional keyword arguments passed on to ``dask.array.compute``. + Returns + ------- + object : Variable + Same object but with lazy data as an in-memory array. + See Also -------- + dask.array.compute Variable.compute Variable.load_async - dask.array.compute + DataArray.load + Dataset.load """ self._data = to_duck_array(self._data, **kwargs) return self async def load_async(self, **kwargs) -> Self: - """Manually trigger and await asynchronous loading of this variable's data from disk or a - remote source into memory and return this variable. + """Trigger and await asynchronous loading of data into memory and return this variable. + + Data will be computed and/or loaded from disk or a remote source. + + Unlike ``.compute``, the original variable is modified and returned. Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. Otherwise will raise a NotImplementedError. @@ -1010,18 +1023,28 @@ async def load_async(self, **kwargs) -> Self: **kwargs : dict Additional keyword arguments passed on to ``dask.array.compute``. + Returns + ------- + object : Variable + Same object but with lazy data as an in-memory array. + See Also -------- - Variable.load dask.array.compute + Variable.load + Variable.compute + DataArray.load_async + Dataset.load_async """ self._data = await async_to_duck_array(self._data, **kwargs) return self def compute(self, **kwargs) -> Self: - """Manually trigger loading of this variable's data from disk or a - remote source into memory and return a new variable. The original is - left unaltered. 
+ """Trigger loading data into memory and return a new variable. + + Data will be computed and/or loaded from disk or a remote source. + + The original variable is left unaltered. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -1032,10 +1055,18 @@ def compute(self, **kwargs) -> Self: **kwargs : dict Additional keyword arguments passed on to ``dask.array.compute``. + Returns + ------- + object : Variable + New object with the data as an in-memory array. + See Also -------- dask.array.compute Variable.load + Variable.load_async + DataArray.compute + Dataset.compute """ new = self.copy(deep=False) return new.load(**kwargs) From dfaac7ee0ebce43a5fd071c243a406ec9e443eab Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 11:20:42 +0100 Subject: [PATCH 104/112] note about users being responsible for limiting concurrency --- doc/whats-new.rst | 1 + xarray/core/dataarray.py | 2 ++ xarray/core/dataset.py | 2 ++ xarray/core/variable.py | 2 ++ 4 files changed, 7 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 24c7c9040ab..3d8fe642723 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -14,6 +14,7 @@ New Features ~~~~~~~~~~~~ - Added new asynchronous loading methods :py:meth:`Dataset.load_async`, :py:meth:`DataArray.load_async`, :py:meth:`Variable.load_async`. + Note that users are expected to limit concurrency themselves - xarray does not internally limit concurrency in any way. (:issue:`10326`, :pull:`10327`) By `Tom Nicholas `_. - :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return bytes if called without a filepath. (:issue:`10570`) By `Matthew Willson `_. 
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 77f407a3b11..b1833d3266f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1180,6 +1180,8 @@ async def load_async(self, **kwargs) -> Self: Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. Otherwise will raise a NotImplementedError. + Note users are expected to limit concurrency themselves - xarray does not internally limit concurrency in any way. + Parameters ---------- **kwargs : dict diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ccdf76c2bc4..f51853fc8fd 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -580,6 +580,8 @@ async def load_async(self, **kwargs) -> Self: Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. Otherwise will raise a NotImplementedError. + Note users are expected to limit concurrency themselves - xarray does not internally limit concurrency in any way. + Parameters ---------- **kwargs : dict diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 5b84924177f..cc502e17d2e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1018,6 +1018,8 @@ async def load_async(self, **kwargs) -> Self: Only works when opening data lazily from IO storage backends which support lazy asynchronous loading. Otherwise will raise a NotImplementedError. + Note users are expected to limit concurrency themselves - xarray does not internally limit concurrency in any way. 
+ Parameters ---------- **kwargs : dict From d0a129abd68a6fbb57049900bd23462df355a5bf Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 11:31:19 +0100 Subject: [PATCH 105/112] remove parametrization over zarr_format --- xarray/tests/test_backends.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bb388f4f2fe..f465001b600 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -81,7 +81,6 @@ has_zarr_v3_dtypes, mock, network, - parametrize_zarr_format, requires_cftime, requires_dask, requires_fsspec, @@ -3874,8 +3873,7 @@ def test_chunk_key_encoding_v2(self) -> None: @pytest.mark.asyncio @requires_zarr_v3 - @parametrize_zarr_format - async def test_async_load_multiple_variables(self, zarr_format) -> None: + async def test_async_load_multiple_variables(self) -> None: target_class = zarr.AsyncArray method_name = "getitem" original_method = getattr(target_class, method_name) @@ -3885,7 +3883,7 @@ async def test_async_load_multiple_variables(self, zarr_format) -> None: original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, zarr_format=zarr_format, consolidated=False) + original.to_zarr(store, zarr_format=3, consolidated=False) with patch.object( target_class, method_name, wraps=original_method, autospec=True @@ -3903,11 +3901,11 @@ async def test_async_load_multiple_variables(self, zarr_format) -> None: xrt.assert_identical(result_ds, ds.load()) @pytest.mark.asyncio - @parametrize_zarr_format @requires_zarr_v3 @pytest.mark.parametrize("cls_name", ["Variable", "DataArray", "Dataset"]) async def test_concurrent_load_multiple_objects( - self, cls_name, zarr_format + self, + cls_name, ) -> None: N_OBJECTS = 5 @@ -3917,7 +3915,7 @@ async def test_concurrent_load_multiple_objects( original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, 
consolidated=False, zarr_format=zarr_format) + original.to_zarr(store, consolidated=False, zarr_format=3) with patch.object( target_class, method_name, wraps=original_method, autospec=True @@ -3983,14 +3981,12 @@ async def test_concurrent_load_multiple_objects( "vectorized-isel", ], ) - @parametrize_zarr_format async def test_indexing( self, cls_name, method, indexer, target_zarr_class, - zarr_format, ) -> None: if not has_zarr_v3_async_oindex and target_zarr_class in ( "zarr.core.indexing.AsyncOIndex", @@ -4017,7 +4013,7 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, consolidated=False, zarr_format=zarr_format) + original.to_zarr(store, consolidated=False, zarr_format=3) with patch.object( target_class, method_name, wraps=original_method, autospec=True @@ -4067,15 +4063,16 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: ), # tests vindexing ], ) - @parametrize_zarr_format async def test_raise_on_older_zarr_version( - self, indexer, expected_err_msg, zarr_format + self, + indexer, + expected_err_msg, ): """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, consolidated=False, zarr_format=zarr_format) + original.to_zarr(store, consolidated=False, zarr_format=3) ds = xr.open_zarr(store, consolidated=False, chunks=None) var = ds["var1"].variable From f30a3a0c51889d25d50342a24653e26ee6ac7c21 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 11:38:02 +0100 Subject: [PATCH 106/112] account for Dataset having multiple lazy vars --- xarray/tests/test_backends.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f465001b600..4cc25be4bb4 100644 --- a/xarray/tests/test_backends.py +++ 
b/xarray/tests/test_backends.py @@ -3908,6 +3908,11 @@ async def test_concurrent_load_multiple_objects( cls_name, ) -> None: N_OBJECTS = 5 + N_LAZY_VARS = { + "Variable": 1, + "DataArray": 1, + "Dataset": 4, + } # specific to the create_test_data() used target_class = zarr.AsyncArray method_name = "getitem" @@ -3927,7 +3932,7 @@ async def test_concurrent_load_multiple_objects( results = await asyncio.gather(*coros) mocked_meth.assert_called() - assert mocked_meth.call_count == N_OBJECTS + assert mocked_meth.call_count == N_OBJECTS * N_LAZY_VARS[cls_name] mocked_meth.assert_awaited() for result in results: From 432bbd5a9ebd1541adac180f6223dcc44f6230dc Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 11:42:31 +0100 Subject: [PATCH 107/112] refactor test parametrization to use pytest.param(..., id=...) syntax --- xarray/tests/test_backends.py | 47 +++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4cc25be4bb4..d7819d257f3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3944,47 +3944,50 @@ async def test_concurrent_load_multiple_objects( @pytest.mark.parametrize( "indexer, method, target_zarr_class", [ - ({}, "sel", "zarr.AsyncArray"), - ({}, "isel", "zarr.AsyncArray"), - ({"dim2": 1.0}, "sel", "zarr.AsyncArray"), - ({"dim2": 2}, "isel", "zarr.AsyncArray"), - ({"dim2": slice(1.0, 3.0)}, "sel", "zarr.AsyncArray"), - ({"dim2": slice(1, 3)}, "isel", "zarr.AsyncArray"), - ( + pytest.param({}, "sel", "zarr.AsyncArray", id="no-indexing-sel"), + pytest.param({}, "isel", "zarr.AsyncArray", id="no-indexing-isel"), + pytest.param({"dim2": 1.0}, "sel", "zarr.AsyncArray", id="basic-int-sel"), + pytest.param({"dim2": 2}, "isel", "zarr.AsyncArray", id="basic-int-isel"), + pytest.param( + {"dim2": slice(1.0, 3.0)}, + "sel", + "zarr.AsyncArray", + id="basic-slice-sel", + ), + pytest.param( + {"dim2": slice(1, 3)}, 
"isel", "zarr.AsyncArray", id="basic-slice-isel" + ), + pytest.param( {"dim2": [1.0, 3.0]}, "sel", "zarr.core.indexing.AsyncOIndex", + id="outer-sel", ), - ({"dim2": [1, 3]}, "isel", "zarr.core.indexing.AsyncOIndex"), - ( + pytest.param( + {"dim2": [1, 3]}, + "isel", + "zarr.core.indexing.AsyncOIndex", + id="outer-isel", + ), + pytest.param( { "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1.0, 2.0], dims="points"), }, "sel", "zarr.core.indexing.AsyncVIndex", + id="vectorized-sel", ), - ( + pytest.param( { "dim1": xr.Variable(data=[2, 3], dims="points"), "dim2": xr.Variable(data=[1, 3], dims="points"), }, "isel", "zarr.core.indexing.AsyncVIndex", + id="vectorized-isel", ), ], - ids=[ - "no-indexing-sel", - "no-indexing-isel", - "basic-int-sel", - "basic-int-isel", - "basic-slice-sel", - "basic-slice-isel", - "outer-sel", - "outer-isel", - "vectorized-sel", - "vectorized-isel", - ], ) async def test_indexing( self, From 4f40792a27849508131465a9e341d359934b0ded Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 11:52:01 +0100 Subject: [PATCH 108/112] refactor TestBackendIndexing to combine sync and async checks in one function --- xarray/tests/test_variable.py | 82 ++++++++++++++--------------------- 1 file changed, 33 insertions(+), 49 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 441db94349a..18d7d640608 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2897,34 +2897,31 @@ def setUp(self): self.d = np.random.random((10, 3)).astype(np.float64) self.cat = PandasExtensionArray(pd.Categorical(["a", "b"] * 5)) - def check_orthogonal_indexing(self, v): - result = v.isel(x=[8, 3], y=[2, 1]) + async def check_orthogonal_indexing(self, v): expected = self.d[[8, 3]][:, [2, 1]] + + result = v.isel(x=[8, 3], y=[2, 1]) assert np.allclose(result, expected) - async def check_orthogonal_async_indexing(self, v): result = await v.isel(x=[8, 3], y=[2, 
1]).load_async() - expected = self.d[[8, 3]][:, [2, 1]] assert np.allclose(result, expected) - def check_vectorized_indexing(self, v): + async def check_vectorized_indexing(self, v): ind_x = Variable("z", [0, 2]) ind_y = Variable("z", [2, 1]) - result = v.isel(x=ind_x, y=ind_y) expected = self.d[ind_x, ind_y] + + result = v.isel(x=ind_x, y=ind_y).load() assert np.allclose(result, expected) - async def check_vectorized_async_indexing(self, v): - ind_x = Variable("z", [0, 2]) - ind_y = Variable("z", [2, 1]) result = await v.isel(x=ind_x, y=ind_y).load_async() - expected = self.d[ind_x, ind_y] assert np.allclose(result, expected) - def test_NumpyIndexingAdapter(self): + @pytest.mark.asyncio + async def test_NumpyIndexingAdapter(self): v = Variable(dims=("x", "y"), data=NumpyIndexingAdapter(self.d)) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) # could not doubly wrapping with pytest.raises(TypeError, match=r"NumpyIndexingAdapter only wraps "): v = Variable( @@ -2939,71 +2936,58 @@ def test_extension_array_duck_indexed(self): lazy = Variable(dims=("x"), data=LazilyIndexedArray(self.cat)) assert (lazy[[0, 1, 5]] == ["a", "b", "b"]).all() - def test_LazilyIndexedArray(self): - v = Variable(dims=("x", "y"), data=LazilyIndexedArray(self.d)) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) - # doubly wrapping - v = Variable( - dims=("x", "y"), - data=LazilyIndexedArray(LazilyIndexedArray(self.d)), - ) - self.check_orthogonal_indexing(v) - # hierarchical wrapping - v = Variable( - dims=("x", "y"), data=LazilyIndexedArray(NumpyIndexingAdapter(self.d)) - ) - self.check_orthogonal_indexing(v) - @pytest.mark.asyncio - async def test_lazy_async_indexing(self) -> None: + async def test_LazilyIndexedArray(self): v = Variable(dims=("x", "y"), data=LazilyIndexedArray(self.d)) - await self.check_orthogonal_async_indexing(v) - await 
self.check_vectorized_async_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) # doubly wrapping v = Variable( dims=("x", "y"), data=LazilyIndexedArray(LazilyIndexedArray(self.d)), ) - await self.check_orthogonal_async_indexing(v) + await self.check_orthogonal_indexing(v) # hierarchical wrapping v = Variable( dims=("x", "y"), data=LazilyIndexedArray(NumpyIndexingAdapter(self.d)) ) - await self.check_orthogonal_async_indexing(v) + await self.check_orthogonal_indexing(v) - def test_CopyOnWriteArray(self): + @pytest.mark.asyncio + async def test_CopyOnWriteArray(self): v = Variable(dims=("x", "y"), data=CopyOnWriteArray(self.d)) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) # doubly wrapping v = Variable(dims=("x", "y"), data=CopyOnWriteArray(LazilyIndexedArray(self.d))) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) - def test_MemoryCachedArray(self): + @pytest.mark.asyncio + async def test_MemoryCachedArray(self): v = Variable(dims=("x", "y"), data=MemoryCachedArray(self.d)) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) # doubly wrapping v = Variable(dims=("x", "y"), data=CopyOnWriteArray(MemoryCachedArray(self.d))) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) @requires_dask - def test_DaskIndexingAdapter(self): + @pytest.mark.asyncio + async def test_DaskIndexingAdapter(self): import dask.array as da dask_array = da.asarray(self.d) v = Variable(dims=("x", "y"), data=DaskIndexingAdapter(dask_array)) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await 
self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) # doubly wrapping v = Variable( dims=("x", "y"), data=CopyOnWriteArray(DaskIndexingAdapter(dask_array)) ) - self.check_orthogonal_indexing(v) - self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v) + await self.check_vectorized_indexing(v) def test_clip(var): From cf1d127bef99a33dda4fdbe011eb228bdcb8577a Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 12:29:29 +0100 Subject: [PATCH 109/112] move test_load_async onto test base class --- xarray/tests/test_backends.py | 111 ++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 44 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d7819d257f3..7fc772dc7ec 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -352,6 +352,11 @@ def __getitem__(self, key): class NetCDF3Only: netcdf3_formats: tuple[T_NetcdfTypes, ...] = ("NETCDF3_CLASSIC", "NETCDF3_64BIT") + @pytest.mark.asyncio + @pytest.mark.skip(reason="NetCDF backends don't support async loading") + async def test_load_async(self) -> None: + await super().test_load_async() + @requires_scipy def test_dtype_coercion_error(self) -> None: """Failing dtype coercion should lead to an error""" @@ -462,6 +467,7 @@ def test_roundtrip_test_data(self) -> None: assert_identical(expected, actual) def test_load(self) -> None: + # Note: please keep this in sync with test_load_async below as much as possible! expected = create_test_data() @contextlib.contextmanager @@ -494,6 +500,43 @@ def assert_loads(vars=None): actual = ds.load() assert_identical(expected, actual) + @pytest.mark.asyncio + async def test_load_async(self) -> None: + # Note: please keep this in sync with test_load above as much as possible! 
+ + # Copied from `test_load` on the base test class, but won't work for netcdf + expected = create_test_data() + + @contextlib.contextmanager + def assert_loads(vars=None): + if vars is None: + vars = expected + with self.roundtrip(expected) as actual: + for k, v in actual.variables.items(): + # IndexVariables are eagerly loaded into memory + assert v._in_memory == (k in actual.dims) + yield actual + for k, v in actual.variables.items(): + if k in vars: + assert v._in_memory + assert_identical(expected, actual) + + with pytest.raises(AssertionError): + # make sure the contextmanager works! + with assert_loads() as ds: + pass + + with assert_loads() as ds: + await ds.load_async() + + with assert_loads(["var1", "dim1", "dim2"]) as ds: + await ds["var1"].load_async() + + # verify we can read data even after closing the file + with self.roundtrip(expected) as ds: + actual = await ds.load_async() + assert_identical(expected, actual) + def test_dataset_compute(self) -> None: expected = create_test_data() @@ -1525,6 +1568,11 @@ def test_indexing_roundtrip(self, indexer) -> None: class NetCDFBase(CFEncodedBase): """Tests for all netCDF3 and netCDF4 backends.""" + @pytest.mark.asyncio + @pytest.mark.skip(reason="NetCDF backends don't support async loading") + async def test_load_async(self) -> None: + await super().test_load_async() + @pytest.mark.skipif( ON_WINDOWS, reason="Windows does not allow modifying open files" ) @@ -2454,6 +2502,14 @@ def roundtrip( with self.open(store_target, **open_kwargs) as ds: yield ds + @pytest.mark.asyncio + @pytest.mark.skipif( + not has_zarr_v3, + reason="zarr-python <3 did not support async loading", + ) + async def test_load_async(self) -> None: + await super().test_load_async() + def test_roundtrip_bytes_with_fill_value(self): pytest.xfail("Broken by Zarr 3.0.7") @@ -2489,44 +2545,6 @@ def test_non_existent_store(self) -> None: ): xr.open_zarr(f"{uuid.uuid4()}") - @pytest.mark.skipif( - not has_zarr_v3, reason="zarr-python <3 did not 
support async loading" - ) - @pytest.mark.asyncio - async def test_load_async(self) -> None: - """Copied from `test_load` on the base test class, but won't work for netcdf""" - expected = create_test_data() - - @contextlib.contextmanager - def assert_loads(vars=None): - if vars is None: - vars = expected - with self.roundtrip(expected) as actual: - for k, v in actual.variables.items(): - # IndexVariables are eagerly loaded into memory - assert v._in_memory == (k in actual.dims) - yield actual - for k, v in actual.variables.items(): - if k in vars: - assert v._in_memory - assert_identical(expected, actual) - - with pytest.raises(AssertionError): - # make sure the contextmanager works! - with assert_loads() as ds: - pass - - with assert_loads() as ds: - await ds.load_async() - - with assert_loads(["var1", "dim1", "dim2"]) as ds: - await ds["var1"].load_async() - - # verify we can read data even after closing the file - with self.roundtrip(expected) as ds: - actual = await ds.load_async() - assert_identical(expected, actual) - @pytest.mark.skipif(has_zarr_v3, reason="chunk_store not implemented in zarr v3") def test_with_chunkstore(self) -> None: expected = create_test_data() @@ -4352,7 +4370,7 @@ def test_zarr_version_deprecated() -> None: @requires_scipy -class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only): +class TestScipyInMemoryData(NetCDF3Only, CFEncodedBase): engine: T_NetcdfEngine = "scipy" @contextlib.contextmanager @@ -4360,6 +4378,11 @@ def create_store(self): fobj = BytesIO() yield backends.ScipyDataStore(fobj, "w") + @pytest.mark.asyncio + @pytest.mark.skip(reason="NetCDF backends don't support async loading") + async def test_load_async(self) -> None: + await super().test_load_async() + def test_to_netcdf_explicit_engine(self) -> None: with pytest.warns( FutureWarning, @@ -4390,7 +4413,7 @@ def test_bytes_pickle(self) -> None: @requires_scipy -class TestScipyFileObject(CFEncodedBase, NetCDF3Only): +class TestScipyFileObject(NetCDF3Only, 
CFEncodedBase): # TODO: Consider consolidating some of these cases (e.g., # test_file_remains_open) with TestH5NetCDFFileObject engine: T_NetcdfEngine = "scipy" @@ -4459,7 +4482,7 @@ def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: @requires_scipy -class TestScipyFilePath(CFEncodedBase, NetCDF3Only): +class TestScipyFilePath(NetCDF3Only, CFEncodedBase): engine: T_NetcdfEngine = "scipy" @contextlib.contextmanager @@ -4496,7 +4519,7 @@ def test_nc4_scipy(self) -> None: @requires_netCDF4 -class TestNetCDF3ViaNetCDF4Data(CFEncodedBase, NetCDF3Only): +class TestNetCDF3ViaNetCDF4Data(NetCDF3Only, CFEncodedBase): engine: T_NetcdfEngine = "netcdf4" file_format: T_NetcdfTypes = "NETCDF3_CLASSIC" @@ -4517,7 +4540,7 @@ def test_encoding_kwarg_vlen_string(self) -> None: @requires_netCDF4 -class TestNetCDF4ClassicViaNetCDF4Data(CFEncodedBase, NetCDF3Only): +class TestNetCDF4ClassicViaNetCDF4Data(NetCDF3Only, CFEncodedBase): engine: T_NetcdfEngine = "netcdf4" file_format: T_NetcdfTypes = "NETCDF4_CLASSIC" @@ -4531,7 +4554,7 @@ def create_store(self): @requires_scipy_or_netCDF4 -class TestGenericNetCDFData(CFEncodedBase, NetCDF3Only): +class TestGenericNetCDFData(NetCDF3Only, CFEncodedBase): # verify that we can read and write netCDF3 files as long as we have scipy # or netCDF4-python installed file_format: T_NetcdfTypes = "NETCDF3_64BIT" From dda58bf6652b5d74af212420498fdbf00d204c0f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 12:35:58 +0100 Subject: [PATCH 110/112] should fix mypy error --- xarray/tests/test_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7fc772dc7ec..f5221e0f08f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -355,7 +355,7 @@ class NetCDF3Only: @pytest.mark.asyncio @pytest.mark.skip(reason="NetCDF backends don't support async loading") async def test_load_async(self) -> None: - 
await super().test_load_async() + pass @requires_scipy def test_dtype_coercion_error(self) -> None: From 02d661d4f60dbe04529983abc5b2e96c4aed8e75 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 13:17:57 +0100 Subject: [PATCH 111/112] add back in the parametrize_zarr_format to avoid trying to write v3 data using zarr-python v2 --- xarray/tests/test_backends.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f5221e0f08f..c6883dcb5a5 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -81,6 +81,7 @@ has_zarr_v3_dtypes, mock, network, + parametrize_zarr_format, requires_cftime, requires_dask, requires_fsspec, @@ -4089,16 +4090,18 @@ def _resolve_class_from_string(class_path: str) -> type[Any]: ), # tests vindexing ], ) + @parametrize_zarr_format async def test_raise_on_older_zarr_version( self, indexer, expected_err_msg, + zarr_format, ): """Test that trying to use async load with insufficiently new version of zarr raises a clear error""" original = create_test_data() with self.create_zarr_target() as store: - original.to_zarr(store, consolidated=False, zarr_format=3) + original.to_zarr(store, consolidated=False, zarr_format=zarr_format) ds = xr.open_zarr(store, consolidated=False, chunks=None) var = ds["var1"].variable From a074a25e2cd74c6e5069519ab01016f563d8b11d Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Wed, 13 Aug 2025 21:17:22 +0100 Subject: [PATCH 112/112] parametrize test over async --- xarray/tests/test_variable.py | 69 +++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 18d7d640608..de77ec00c40 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2897,31 +2897,34 @@ def setUp(self): self.d = np.random.random((10, 3)).astype(np.float64) self.cat = 
PandasExtensionArray(pd.Categorical(["a", "b"] * 5)) - async def check_orthogonal_indexing(self, v): + async def check_orthogonal_indexing(self, v, load_async): expected = self.d[[8, 3]][:, [2, 1]] - result = v.isel(x=[8, 3], y=[2, 1]) - assert np.allclose(result, expected) + if load_async: + result = await v.isel(x=[8, 3], y=[2, 1]).load_async() + else: + result = v.isel(x=[8, 3], y=[2, 1]) - result = await v.isel(x=[8, 3], y=[2, 1]).load_async() assert np.allclose(result, expected) - async def check_vectorized_indexing(self, v): + async def check_vectorized_indexing(self, v, load_async): ind_x = Variable("z", [0, 2]) ind_y = Variable("z", [2, 1]) expected = self.d[ind_x, ind_y] - result = v.isel(x=ind_x, y=ind_y).load() - assert np.allclose(result, expected) + if load_async: + result = await v.isel(x=ind_x, y=ind_y).load_async() + else: + result = v.isel(x=ind_x, y=ind_y).load() - result = await v.isel(x=ind_x, y=ind_y).load_async() assert np.allclose(result, expected) @pytest.mark.asyncio - async def test_NumpyIndexingAdapter(self): + @pytest.mark.parametrize("load_async", [True, False]) + async def test_NumpyIndexingAdapter(self, load_async): v = Variable(dims=("x", "y"), data=NumpyIndexingAdapter(self.d)) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) # could not doubly wrapping with pytest.raises(TypeError, match=r"NumpyIndexingAdapter only wraps "): v = Variable( @@ -2937,57 +2940,61 @@ def test_extension_array_duck_indexed(self): assert (lazy[[0, 1, 5]] == ["a", "b", "b"]).all() @pytest.mark.asyncio - async def test_LazilyIndexedArray(self): + @pytest.mark.parametrize("load_async", [True, False]) + async def test_LazilyIndexedArray(self, load_async): v = Variable(dims=("x", "y"), data=LazilyIndexedArray(self.d)) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await 
self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) # doubly wrapping v = Variable( dims=("x", "y"), data=LazilyIndexedArray(LazilyIndexedArray(self.d)), ) - await self.check_orthogonal_indexing(v) + await self.check_orthogonal_indexing(v, load_async) # hierarchical wrapping v = Variable( dims=("x", "y"), data=LazilyIndexedArray(NumpyIndexingAdapter(self.d)) ) - await self.check_orthogonal_indexing(v) + await self.check_orthogonal_indexing(v, load_async) @pytest.mark.asyncio - async def test_CopyOnWriteArray(self): + @pytest.mark.parametrize("load_async", [True, False]) + async def test_CopyOnWriteArray(self, load_async): v = Variable(dims=("x", "y"), data=CopyOnWriteArray(self.d)) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) # doubly wrapping v = Variable(dims=("x", "y"), data=CopyOnWriteArray(LazilyIndexedArray(self.d))) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) @pytest.mark.asyncio - async def test_MemoryCachedArray(self): + @pytest.mark.parametrize("load_async", [True, False]) + async def test_MemoryCachedArray(self, load_async): v = Variable(dims=("x", "y"), data=MemoryCachedArray(self.d)) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) # doubly wrapping v = Variable(dims=("x", "y"), data=CopyOnWriteArray(MemoryCachedArray(self.d))) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) @requires_dask @pytest.mark.asyncio - async def 
test_DaskIndexingAdapter(self): + @pytest.mark.parametrize("load_async", [True, False]) + async def test_DaskIndexingAdapter(self, load_async): import dask.array as da dask_array = da.asarray(self.d) v = Variable(dims=("x", "y"), data=DaskIndexingAdapter(dask_array)) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) # doubly wrapping v = Variable( dims=("x", "y"), data=CopyOnWriteArray(DaskIndexingAdapter(dask_array)) ) - await self.check_orthogonal_indexing(v) - await self.check_vectorized_indexing(v) + await self.check_orthogonal_indexing(v, load_async) + await self.check_vectorized_indexing(v, load_async) def test_clip(var):