From 7ba5f852aa46a78c2a78fc6390a59da397eacea7 Mon Sep 17 00:00:00 2001
From: Matthew Willson
Date: Fri, 25 Jul 2025 22:32:45 +0100
Subject: [PATCH 01/15] Improve reading and writing of NetCDF files to/from
 bytes or file-like objects.

* Allows use of the h5netcdf engine when writing to file-like objects
  (such as BytesIO), and stops forcing use of the scipy backend in this
  case (which is incompatible with groups and DataTree). Makes h5netcdf
  the default engine for DataTree.to_netcdf rather than leaving the
  choice of default up to Dataset.to_netcdf.
* Allows use of the h5netcdf engine to read from a bytes object.
* Allows DataTree.to_netcdf to return bytes when the filepath argument
  is omitted (similar to Dataset.to_netcdf).
---
 doc/whats-new.rst                      |  2 ++
 xarray/backends/api.py                 | 16 ++++++---
 xarray/backends/h5netcdf_.py           |  8 ++---
 xarray/core/datatree.py                | 49 +++++++++++++++++++++++---
 xarray/core/datatree_io.py             | 20 +++++++----
 xarray/tests/test_backends.py          |  3 --
 xarray/tests/test_backends_datatree.py | 22 ++++++++++++
 7 files changed, 96 insertions(+), 24 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index bace038bb17..d6d1dabbd1b 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -13,6 +13,8 @@ v2025.07.2 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
 
+- :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return bytes if called without a filepath. (:issue:`10570`)
+  By `Matthew Willson `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index cfd3ff7fc0f..8826e6f3555 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -43,7 +43,7 @@
 from xarray.core.indexes import Index
 from xarray.core.treenode import group_subtrees
 from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
-from xarray.core.utils import is_remote_uri
+from xarray.core.utils import emit_user_level_warning, is_remote_uri
 from xarray.namedarray.daskmanager import DaskManager
 from xarray.namedarray.parallelcompat import guess_chunkmanager
 from xarray.structure.chunks import _get_chunk, _maybe_chunk
@@ -1911,11 +1911,11 @@ def to_netcdf(
     if path_or_file is None:
         if engine is None:
             engine = "scipy"
-        elif engine != "scipy":
+        elif engine not in ("scipy", "h5netcdf"):
             raise ValueError(
                 "invalid engine for creating bytes with "
-                f"to_netcdf: {engine!r}. Only the default engine "
-                "or engine='scipy' is supported"
+                f"to_netcdf: {engine!r}. Only the default engine, "
+                "engine='scipy' or engine='h5netcdf' is supported."
             )
         if not compute:
             raise NotImplementedError(
@@ -1927,7 +1927,13 @@ def to_netcdf(
             engine = _get_default_engine(path_or_file)
         path_or_file = _normalize_path(path_or_file)
     else:  # file-like object
-        engine = "scipy"
+        if engine not in ("scipy", "h5netcdf"):
+            emit_user_level_warning(
+                f"Requested {engine=} is not compatible with writing to a file-like object. 
" + "This will raise an error in the future, for now defaulting to engine='scipy'.", + FutureWarning, + ) + engine = "scipy" # validate Dataset keys, DataArray names, and attr keys/values _validate_dataset_names(dataset) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f3e434c6e5e..55203cffbdc 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -159,11 +159,9 @@ def open( ) if isinstance(filename, bytes): - raise ValueError( - "can't open netCDF4/HDF5 as bytes " - "try passing a path or file-like object" - ) - elif isinstance(filename, io.IOBase): + filename = io.BytesIO(filename) + + if isinstance(filename, io.IOBase) and mode == "r": magic_number = read_magic_number_from_file(filename) if not magic_number.startswith(b"\211HDF\r\n\032\n"): raise ValueError( diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index afef2f20094..4aa73e4ade1 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1,7 +1,9 @@ from __future__ import annotations import functools +import io import itertools +from os import PathLike import textwrap from collections import ChainMap from collections.abc import ( @@ -1659,9 +1661,11 @@ def _inplace_binary_op(self, other, f) -> Self: def __eq__(self, other: DtCompatible) -> Self: # type: ignore[override] return super().__eq__(other) + # filepath=None writes to bytes + @overload def to_netcdf( self, - filepath, + filepath: None = None, mode: NetcdfWriteModes = "w", encoding=None, unlimited_dims=None, @@ -1671,14 +1675,45 @@ def to_netcdf( write_inherited_coords: bool = False, compute: bool = True, **kwargs, - ): + ) -> bytes: ... + + @overload + def to_netcdf( + self, + filepath: str | PathLike, + mode: NetcdfWriteModes = "w", + encoding=None, + unlimited_dims=None, + format: T_DataTreeNetcdfTypes | None = None, + engine: T_DataTreeNetcdfEngine | None = None, + group: str | None = None, + write_inherited_coords: bool = False, + compute: bool = True, + **kwargs, + ) -> None: ... + + def to_netcdf( + self, + filepath: str | PathLike | io.IOBase | None = None, + mode: NetcdfWriteModes = "w", + encoding=None, + unlimited_dims=None, + format: T_DataTreeNetcdfTypes | None = None, + engine: T_DataTreeNetcdfEngine | None = None, + group: str | None = None, + write_inherited_coords: bool = False, + compute: bool = True, + **kwargs, + ) -> None | bytes: """ Write datatree contents to a netCDF file. Parameters ---------- - filepath : str or Path - Path to which to save this datatree. + filepath : str or PathLike or file-like object or None + Path to which to save this datatree, or a file-like object to write + it to (which must support read and write and be seekable) or None + to return in-memory bytes. mode : {"w", "a"}, default: "w" Write ('w') or append ('a') mode. If mode='w', any existing file at this location will be overwritten. If mode='a', existing variables @@ -1717,6 +1752,10 @@ def to_netcdf( kwargs : Additional keyword arguments to be passed to ``xarray.Dataset.to_netcdf`` + Returns + ------- + A bytes object with the byte content of the netCDF file, if filepath was None. 
+ Note ---- Due to file format specifications the on-disk root group name @@ -1724,7 +1763,7 @@ def to_netcdf( """ from xarray.core.datatree_io import _datatree_to_netcdf - _datatree_to_netcdf( + return _datatree_to_netcdf( self, filepath, mode=mode, diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index cf3626dbb12..f615b661a32 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -1,8 +1,9 @@ from __future__ import annotations from collections.abc import Mapping +import io from os import PathLike -from typing import TYPE_CHECKING, Any, Literal, get_args +from typing import TYPE_CHECKING, Any, Literal, get_args, overload from xarray.core.datatree import DataTree from xarray.core.types import NetcdfWriteModes, ZarrWriteModes @@ -13,10 +14,9 @@ if TYPE_CHECKING: from xarray.core.types import ZarrStoreLike - def _datatree_to_netcdf( dt: DataTree, - filepath: str | PathLike, + filepath: str | PathLike | io.IOBase | None = None, mode: NetcdfWriteModes = "w", encoding: Mapping[str, Any] | None = None, unlimited_dims: Mapping | None = None, @@ -26,7 +26,7 @@ def _datatree_to_netcdf( write_inherited_coords: bool = False, compute: bool = True, **kwargs, -) -> None: +) -> None | bytes: """This function creates an appropriate datastore for writing a datatree to disk as a netCDF file. @@ -34,10 +34,13 @@ def _datatree_to_netcdf( """ if format not in [None, *get_args(T_DataTreeNetcdfTypes)]: - raise ValueError("to_netcdf only supports the NETCDF4 format") + raise ValueError("DataTree.to_netcdf only supports the NETCDF4 format") if engine not in [None, *get_args(T_DataTreeNetcdfEngine)]: - raise ValueError("to_netcdf only supports the netcdf4 and h5netcdf engines") + raise ValueError("DataTree.to_netcdf only supports the netcdf4 and h5netcdf engines") + + if engine is None: + engine = "h5netcdf" if group is not None: raise NotImplementedError( @@ -58,6 +61,9 @@ def _datatree_to_netcdf( f"unexpected encoding group name(s) provided: {set(encoding) - set(dt.groups)}" ) + if filepath is None: + filepath = io.BytesIO() + if unlimited_dims is None: unlimited_dims = {} @@ -78,6 +84,8 @@ def _datatree_to_netcdf( ) mode = "a" + return filepath.getvalue() if isinstance(filepath, io.BytesIO) else None + def _datatree_to_zarr( dt: DataTree, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 93329b2297d..1bb7f8fea5a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4543,9 +4543,6 @@ class TestH5NetCDFFileObject(TestH5NetCDFData): engine: T_NetcdfEngine = "h5netcdf" def test_open_badbytes(self) -> None: - with pytest.raises(ValueError, match=r"HDF5 as bytes"): - with open_dataset(b"\211HDF\r\n\032\n", engine="h5netcdf"): # type: ignore[arg-type] - pass with pytest.raises( ValueError, match=r"match in any of xarray's currently installed IO" ): diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index bf6b49c1ad9..f5fbc0a1157 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -539,6 +539,28 @@ def test_phony_dims_warning(self, tmpdir) -> None: "phony_dim_3": 25, } + def test_roundtrip_via_bytes(self, simple_datatree): + original_dt = simple_datatree + roundtrip_dt = open_datatree(original_dt.to_netcdf()) + assert_equal(original_dt, roundtrip_dt) + + def test_roundtrip_via_bytes_engine_specified(self, simple_datatree): + original_dt = simple_datatree + roundtrip_dt = open_datatree(original_dt.to_netcdf(engine=self.engine)) + 
assert_equal(original_dt, roundtrip_dt) + + def test_roundtrip_using_filelike_object(self, tmpdir, simple_datatree): + original_dt = simple_datatree + filepath = tmpdir + '/test.nc' + # h5py requires both read and write access when writing, it will + # work with file-like objects provided they support both, and are + # seekable. + with open(filepath, 'wb+') as file: + original_dt.to_netcdf(file, engine=self.engine) + with open(filepath, 'rb') as file: + roundtrip_dt = open_datatree(file, engine=self.engine) + assert_equal(original_dt, roundtrip_dt) + @requires_zarr @parametrize_zarr_format From 7b2d8aa58bb3de7c8785c3a7c8e25ba29b8cd63d Mon Sep 17 00:00:00 2001 From: Matthew Willson Date: Mon, 28 Jul 2025 23:12:28 +0100 Subject: [PATCH 02/15] Add a test for Dataset.to_netcdf(engine='h5netcdf') and fix a bug where bytes were being returned before the h5py.File had been closed, which it appears is needed for it to finish writing a valid file. This required a further workaround to prevent the BytesIO being closed by the scipy backend when it is used in a similar way. --- xarray/backends/api.py | 21 +++++++++++++++++---- xarray/tests/test_backends.py | 9 +++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 8826e6f3555..80e8c2c7be4 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1958,7 +1958,18 @@ def to_netcdf( f"is not currently supported with dask's {scheduler} scheduler" ) - target = path_or_file if path_or_file is not None else BytesIO() + if path_or_file is None: + target = BytesIO() + # We can't get the BytesIO's value *before* closing the store, since + # the h5netcdf backend won't finish writing until its close is called. + # However if we try to get the BytesIO's value *after* closing the store, + # the scipy backend will close the BytesIO, preventing its value from + # being read. 
The solution is to prevent the BytesIO from being closed: + close_bytesio = target.close + target.close = lambda: None # type: ignore[method-assign] + else: + target = path_or_file # type: ignore[assignment] + kwargs = dict(autoclose=True) if autoclose else {} if invalid_netcdf: if engine == "h5netcdf": @@ -1998,13 +2009,15 @@ def to_netcdf( writes = writer.sync(compute=compute) - if isinstance(target, BytesIO): - store.sync() - return target.getvalue() finally: if not multifile and compute: # type: ignore[redundant-expr] store.close() + if path_or_file is None: + value = target.getvalue() + close_bytesio() + return value + if not compute: import dask diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1bb7f8fea5a..aea8d8b88f8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4601,6 +4601,15 @@ def test_open_fileobj(self) -> None: pass +@requires_h5netcdf +class TestH5NetCDFInMemoryData: + def test_roundtrip_via_bytes(self) -> None: + original = create_test_data() + netcdf_bytes = original.to_netcdf(engine="h5netcdf") + roundtrip = open_dataset(netcdf_bytes, engine="h5netcdf") # type: ignore[arg-type] + assert_identical(roundtrip, original) + + @requires_h5netcdf @requires_dask @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") From 01765587aea280c39aa4307561188fc1d57d0d3b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 28 Jul 2025 22:48:01 +0000 Subject: [PATCH 03/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/backends/api.py | 15 +++++++-------- xarray/core/datatree.py | 2 +- xarray/core/datatree_io.py | 9 ++++++--- xarray/tests/test_backends_datatree.py | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 80e8c2c7be4..d370faa010b 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1926,14 +1926,13 @@ def to_netcdf( if engine is None: engine = _get_default_engine(path_or_file) path_or_file = _normalize_path(path_or_file) - else: # file-like object - if engine not in ("scipy", "h5netcdf"): - emit_user_level_warning( - f"Requested {engine=} is not compatible with writing to a file-like object. " - "This will raise an error in the future, for now defaulting to engine='scipy'.", - FutureWarning, - ) - engine = "scipy" + elif engine not in ("scipy", "h5netcdf"): + emit_user_level_warning( + f"Requested {engine=} is not compatible with writing to a file-like object. 
" + "This will raise an error in the future, for now defaulting to engine='scipy'.", + FutureWarning, + ) + engine = "scipy" # validate Dataset keys, DataArray names, and attr keys/values _validate_dataset_names(dataset) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 4aa73e4ade1..f4d03e3ade3 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -3,7 +3,6 @@ import functools import io import itertools -from os import PathLike import textwrap from collections import ChainMap from collections.abc import ( @@ -14,6 +13,7 @@ Mapping, ) from html import escape +from os import PathLike from typing import ( TYPE_CHECKING, Any, diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index f615b661a32..00ce0eadc68 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -1,9 +1,9 @@ from __future__ import annotations -from collections.abc import Mapping import io +from collections.abc import Mapping from os import PathLike -from typing import TYPE_CHECKING, Any, Literal, get_args, overload +from typing import TYPE_CHECKING, Any, Literal, get_args from xarray.core.datatree import DataTree from xarray.core.types import NetcdfWriteModes, ZarrWriteModes @@ -14,6 +14,7 @@ if TYPE_CHECKING: from xarray.core.types import ZarrStoreLike + def _datatree_to_netcdf( dt: DataTree, filepath: str | PathLike | io.IOBase | None = None, @@ -37,7 +38,9 @@ def _datatree_to_netcdf( raise ValueError("DataTree.to_netcdf only supports the NETCDF4 format") if engine not in [None, *get_args(T_DataTreeNetcdfEngine)]: - raise ValueError("DataTree.to_netcdf only supports the netcdf4 and h5netcdf engines") + raise ValueError( + "DataTree.to_netcdf only supports the netcdf4 and h5netcdf engines" + ) if engine is None: engine = "h5netcdf" diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index f5fbc0a1157..2c36fb73399 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -551,13 +551,13 @@ def test_roundtrip_via_bytes_engine_specified(self, simple_datatree): def test_roundtrip_using_filelike_object(self, tmpdir, simple_datatree): original_dt = simple_datatree - filepath = tmpdir + '/test.nc' + filepath = tmpdir + "/test.nc" # h5py requires both read and write access when writing, it will # work with file-like objects provided they support both, and are # seekable. - with open(filepath, 'wb+') as file: + with open(filepath, "wb+") as file: original_dt.to_netcdf(file, engine=self.engine) - with open(filepath, 'rb') as file: + with open(filepath, "rb") as file: roundtrip_dt = open_datatree(file, engine=self.engine) assert_equal(original_dt, roundtrip_dt) From c4e2b9ac9609140b3e6c58daff30b404d48c7b17 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 29 Jul 2025 12:18:36 -0700 Subject: [PATCH 04/15] Move close() fixes into scipy backends I also updated the h5netcdf backend to silence warnings from not closing files that were already open (which are issued from CachingFileManager). 
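For illustration, this is roughly the behavior that the new
test_file_remains_open cases below pin down (a sketch, assuming a build of
xarray with this patch series plus h5netcdf and h5py installed; the example
dataset is arbitrary, not taken from the diff):

    import io

    import xarray as xr

    ds = xr.Dataset({"foo": ("x", [1, 2, 3])})
    f = io.BytesIO()
    ds.to_netcdf(f, engine="h5netcdf")
    assert not f.closed  # the store no longer closes the caller's buffer
    restored = xr.open_dataset(f, engine="h5netcdf")
    assert not f.closed  # still open after reading the data back

For the scipy backend, close() on the wrapped netcdf_file is replaced by
flush() when the file is not mmapped, because scipy otherwise closes file
objects that were passed in by the caller (see
https://github.com/scipy/scipy/issues/13905, cited in the diff).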
--- xarray/backends/api.py | 10 ++-------- xarray/backends/file_manager.py | 7 +++++-- xarray/backends/h5netcdf_.py | 9 ++++++++- xarray/backends/scipy_.py | 26 +++++++++++++++++++++++-- xarray/tests/test_backends.py | 27 ++++++++++++++++++++++++++ xarray/tests/test_backends_datatree.py | 4 ++-- 6 files changed, 68 insertions(+), 15 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d370faa010b..034ca0dbeca 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1959,13 +1959,6 @@ def to_netcdf( if path_or_file is None: target = BytesIO() - # We can't get the BytesIO's value *before* closing the store, since - # the h5netcdf backend won't finish writing until its close is called. - # However if we try to get the BytesIO's value *after* closing the store, - # the scipy backend will close the BytesIO, preventing its value from - # being read. The solution is to prevent the BytesIO from being closed: - close_bytesio = target.close - target.close = lambda: None # type: ignore[method-assign] else: target = path_or_file # type: ignore[assignment] @@ -2013,8 +2006,9 @@ def to_netcdf( store.close() if path_or_file is None: + assert isinstance(target, BytesIO) # created in this function value = target.getvalue() - close_bytesio() + target.close() return value if not compute: diff --git a/xarray/backends/file_manager.py b/xarray/backends/file_manager.py index 77c6859650f..2a6f3691faf 100644 --- a/xarray/backends/file_manager.py +++ b/xarray/backends/file_manager.py @@ -339,8 +339,11 @@ def __hash__(self): class DummyFileManager(FileManager): """FileManager that simply wraps an open file in the FileManager interface.""" - def __init__(self, value): + def __init__(self, value, *, close=None): + if close is None: + close = value.close self._value = value + self._close = close def acquire(self, needs_lock=True): del needs_lock # ignored @@ -353,4 +356,4 @@ def acquire_context(self, needs_lock=True): def close(self, needs_lock=True): del needs_lock # ignored - self._value.close() + self._close() diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 55203cffbdc..8818232a4ba 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -152,6 +152,8 @@ def open( ): import h5netcdf + cacheable = isinstance(filename, str) + if isinstance(filename, str) and is_remote_uri(filename) and driver is None: mode_ = "rb" if mode == "r" else mode filename = _open_remote_file( @@ -187,7 +189,12 @@ def open( else: lock = combine_locks([HDF5_LOCK, get_write_lock(filename)]) - manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs) + if cacheable: + manager = CachingFileManager( + h5netcdf.File, filename, mode=mode, kwargs=kwargs + ) + else: + manager = DummyFileManager(h5netcdf.File(filename, mode=mode, **kwargs)) return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index b98d226cac6..c9cdfd9943a 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -167,7 +167,7 @@ def __init__( self.lock = ensure_lock(lock) - if isinstance(filename_or_obj, str): + if isinstance(filename_or_obj, str): # path manager = CachingFileManager( _open_scipy_netcdf, filename_or_obj, @@ -175,11 +175,33 @@ def __init__( lock=lock, kwargs=dict(mmap=mmap, version=version), ) - else: + elif isinstance(filename_or_obj, bytes): # file contents scipy_dataset = _open_scipy_netcdf( filename_or_obj, 
mode=mode, mmap=mmap, version=version ) manager = DummyFileManager(scipy_dataset) + elif hasattr(filename_or_obj, "seek"): # file object + # Note: checking for .seek matches the check for file objects + # in scipy.io.netcdf_file + scipy_dataset = _open_scipy_netcdf( + filename_or_obj, mode=mode, mmap=mmap, version=version + ) + # scipy.io.netcdf_file.close() incorrectly closes file objects that + # were passed in as constructor arguments: + # https://github.com/scipy/scipy/issues/13905 + # Instead of closing such files, only call flush(), which is + # equivalent as long as the netcdf_file object is not mmapped. + # This suffices to keep BytesIO objects open long enough to read + # their contents from to_netcdf(), but underlying files still get + # closed when the netcdf_file is garbage collected (via __del__), + # and will need to be fixed upstream in scipy. + if scipy_dataset.use_mmap: + close = scipy_dataset.close + else: + close = scipy_dataset.flush + manager = DummyFileManager(scipy_dataset, close=close) + else: + raise ValueError(f"cannot open {filename_or_obj=}") self._manager = manager diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index aea8d8b88f8..f8449d5e0f1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4069,6 +4069,8 @@ def test_bytes_pickle(self) -> None: @requires_scipy class TestScipyFileObject(CFEncodedBase, NetCDF3Only): + # TODO: Consider consolidating some of these cases (e.g., + # test_file_remains_open) with TestH5NetCDFFileObject engine: T_NetcdfEngine = "scipy" @contextlib.contextmanager @@ -4091,6 +4093,20 @@ def roundtrip( with self.open(f, **open_kwargs) as ds: yield ds + @pytest.mark.xfail( + reason="scipy.io.netcdf_file closes files upon garbage collection" + ) + def test_file_remains_open(self) -> None: + data = Dataset({"foo": ("x", [1, 2, 3])}) + f = BytesIO() + data.to_netcdf(f, engine="h5netcdf") + assert not f.closed + restored = open_dataset(f, engine="h5netcdf") + assert not f.closed + assert_identical(restored, data) + restored.close() + assert not f.closed + @pytest.mark.skip(reason="cannot pickle file objects") def test_pickle(self) -> None: pass @@ -4600,6 +4616,17 @@ def test_open_fileobj(self) -> None: with open_dataset(f): # ensure file gets closed pass + def test_file_remains_open(self) -> None: + data = Dataset({"foo": ("x", [1, 2, 3])}) + f = BytesIO() + data.to_netcdf(f, engine="h5netcdf") + assert not f.closed + restored = open_dataset(f, engine="h5netcdf") + assert not f.closed + assert_identical(restored, data) + restored.close() + assert not f.closed + @requires_h5netcdf class TestH5NetCDFInMemoryData: diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 2c36fb73399..ec57993c4b2 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -558,8 +558,8 @@ def test_roundtrip_using_filelike_object(self, tmpdir, simple_datatree): with open(filepath, "wb+") as file: original_dt.to_netcdf(file, engine=self.engine) with open(filepath, "rb") as file: - roundtrip_dt = open_datatree(file, engine=self.engine) - assert_equal(original_dt, roundtrip_dt) + with open_datatree(file, engine=self.engine) as roundtrip_dt: + assert_equal(original_dt, roundtrip_dt) @requires_zarr From 21b16181117b664bfdbde4bd12f15bccf75c0d65 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 30 Jul 2025 09:28:34 -0700 Subject: [PATCH 05/15] Fix type annotations --- xarray/backends/api.py | 8 ++++---- 
xarray/backends/h5netcdf_.py | 23 +++++++++++++++++++---- xarray/core/dataset.py | 5 +++-- xarray/core/datatree.py | 2 +- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 034ca0dbeca..20c0ce2b1b4 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,7 @@ Sequence, ) from functools import partial -from io import BytesIO +from io import BytesIO, IOBase from itertools import starmap from numbers import Number from typing import ( @@ -1811,7 +1811,7 @@ def to_netcdf( @overload def to_netcdf( dataset: Dataset, - path_or_file: str | os.PathLike, + path_or_file: str | os.PathLike | IOBase, mode: NetcdfWriteModes = "w", format: T_NetcdfTypes | None = None, group: str | None = None, @@ -1867,7 +1867,7 @@ def to_netcdf( @overload def to_netcdf( dataset: Dataset, - path_or_file: str | os.PathLike | None, + path_or_file: str | os.PathLike | IOBase | None, mode: NetcdfWriteModes = "w", format: T_NetcdfTypes | None = None, group: str | None = None, @@ -1883,7 +1883,7 @@ def to_netcdf( def to_netcdf( dataset: Dataset, - path_or_file: str | os.PathLike | None = None, + path_or_file: str | os.PathLike | IOBase | None = None, mode: NetcdfWriteModes = "w", format: T_NetcdfTypes | None = None, group: str | None = None, diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 8818232a4ba..f445f4c1bc2 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -17,7 +17,11 @@ datatree_from_dict_with_io_cleanup, find_root_and_group, ) -from xarray.backends.file_manager import CachingFileManager, DummyFileManager +from xarray.backends.file_manager import ( + CachingFileManager, + DummyFileManager, + FileManager, +) from xarray.backends.locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock from xarray.backends.netCDF4_ import ( BaseNetCDF4Array, @@ -40,6 +44,8 @@ from xarray.core.variable import Variable if TYPE_CHECKING: + import h5netcdf + from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree @@ -109,7 +115,14 @@ class H5NetCDFStore(WritableCFDataStore): "lock", ) - def __init__(self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=False): + def __init__( + self, + manager: FileManager | h5netcdf.File | h5netcdf.Group, + group=None, + mode=None, + lock=HDF5_LOCK, + autoclose=False, + ): import h5netcdf if isinstance(manager, h5netcdf.File | h5netcdf.Group): @@ -190,11 +203,13 @@ def open( lock = combine_locks([HDF5_LOCK, get_write_lock(filename)]) if cacheable: - manager = CachingFileManager( + manager: FileManager = CachingFileManager( h5netcdf.File, filename, mode=mode, kwargs=kwargs ) else: - manager = DummyFileManager(h5netcdf.File(filename, mode=mode, **kwargs)) + manager: FileManager = DummyFileManager( # type: ignore[no-redef] + h5netcdf.File(filename, mode=mode, **kwargs) + ) return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f79df3da7c2..faa78e42220 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2,6 +2,7 @@ import copy import datetime +import io import math import sys import warnings @@ -1901,7 +1902,7 @@ def to_netcdf( @overload def to_netcdf( self, - path: str | PathLike, + path: str | PathLike | io.IOBase, mode: NetcdfWriteModes = "w", format: T_NetcdfTypes | None = None, group: str | None = None, @@ -1932,7 +1933,7 
@@ def to_netcdf( def to_netcdf( self, - path: str | PathLike | None = None, + path: str | PathLike | io.IOBase | None = None, mode: NetcdfWriteModes = "w", format: T_NetcdfTypes | None = None, group: str | None = None, diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index f4d03e3ade3..75425add6a6 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1680,7 +1680,7 @@ def to_netcdf( @overload def to_netcdf( self, - filepath: str | PathLike, + filepath: str | PathLike | io.IOBase, mode: NetcdfWriteModes = "w", encoding=None, unlimited_dims=None, From d778fdf81cb9cdd3ecc012b805f6b40fa33bdf81 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 30 Jul 2025 16:17:50 -0700 Subject: [PATCH 06/15] Fix error from arViz --- xarray/backends/h5netcdf_.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f445f4c1bc2..84dfb5d9561 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -165,8 +165,6 @@ def open( ): import h5netcdf - cacheable = isinstance(filename, str) - if isinstance(filename, str) and is_remote_uri(filename) and driver is None: mode_ = "rb" if mode == "r" else mode filename = _open_remote_file( @@ -202,14 +200,10 @@ def open( else: lock = combine_locks([HDF5_LOCK, get_write_lock(filename)]) - if cacheable: - manager: FileManager = CachingFileManager( - h5netcdf.File, filename, mode=mode, kwargs=kwargs - ) - else: - manager: FileManager = DummyFileManager( # type: ignore[no-redef] - h5netcdf.File(filename, mode=mode, **kwargs) - ) + # TODO: Replace this with DummyFileManager in the case where filename is + # actually a file object. For mysterious reasons, this triggers test + # failures in arViz. + manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs) return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): From 6317631f80168a4f3be9605e0b143b804ddbbdb3 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 30 Jul 2025 23:08:43 -0700 Subject: [PATCH 07/15] better typing and different fixes --- xarray/backends/api.py | 33 ++++++++++++++++++++++----------- xarray/backends/h5netcdf_.py | 12 +++++------- xarray/backends/scipy_.py | 16 ++++------------ xarray/tests/test_backends.py | 12 +++++++++--- 4 files changed, 40 insertions(+), 33 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 20c0ce2b1b4..f6b716b0138 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -497,7 +497,7 @@ def _datatree_from_backend_datatree( def open_dataset( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str | os.PathLike[Any] | ReadBuffer | bytes | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -527,12 +527,12 @@ def open_dataset( Parameters ---------- - filename_or_obj : str, Path, file-like or DataStore + filename_or_obj : str, Path, file-like, bytes or DataStore Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). + objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf (netCDF4). 
engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\ , installed backend \ or subclass of xarray.backends.BackendEntrypoint, optional @@ -686,6 +686,9 @@ def open_dataset( open_mfdataset """ + if isinstance(filename_or_obj, bytes): + filename_or_obj = BytesIO(filename_or_obj) + if cache is None: cache = chunks is None @@ -737,7 +740,7 @@ def open_dataset( def open_dataarray( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str | os.PathLike[Any] | ReadBuffer | bytes | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -768,12 +771,12 @@ def open_dataarray( Parameters ---------- - filename_or_obj : str, Path, file-like or DataStore + filename_or_obj : str, Path, file-like, bytes or DataStore Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). + objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf (netCDF4). engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\ , installed backend \ or subclass of xarray.backends.BackendEntrypoint, optional @@ -964,7 +967,7 @@ def open_dataarray( def open_datatree( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str | os.PathLike[Any] | ReadBuffer | bytes | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -995,8 +998,9 @@ def open_datatree( Parameters ---------- - filename_or_obj : str, Path, file-like, or DataStore - Strings and Path objects are interpreted as a path to a netCDF file or Zarr store. + filename_or_obj : str, Path, file-like, bytes or DataStore + Strings and Path objects are interpreted as a path to a netCDF file or + Zarr store. Bytes are interpreted as file contents. engine : {"netcdf4", "h5netcdf", "zarr", None}, \ installed backend or xarray.backends.BackendEntrypoint, optional Engine to use when reading files. If not provided, the default engine @@ -1149,6 +1153,9 @@ def open_datatree( xarray.open_groups xarray.open_dataset """ + if isinstance(filename_or_obj, bytes): + filename_or_obj = BytesIO(filename_or_obj) + if cache is None: cache = chunks is None @@ -1237,8 +1244,9 @@ def open_groups( Parameters ---------- - filename_or_obj : str, Path, file-like, or DataStore - Strings and Path objects are interpreted as a path to a netCDF file or Zarr store. + filename_or_obj : str, Path, file-like, butes, or DataStore + Strings and Path objects are interpreted as a path to a netCDF file or + Zarr store. Bytes are interpreted as file contents. engine : {"netcdf4", "h5netcdf", "zarr", None}, \ installed backend or xarray.backends.BackendEntrypoint, optional Engine to use when reading files. 
If not provided, the default engine @@ -1390,6 +1398,9 @@ def open_groups( xarray.open_dataset xarray.DataTree.from_dict """ + if isinstance(filename_or_obj, bytes): + filename_or_obj = BytesIO(filename_or_obj) + if cache is None: cache = chunks is None diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 84dfb5d9561..4060274b9b8 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -171,9 +171,6 @@ def open( filename, mode=mode_, storage_options=storage_options ) - if isinstance(filename, bytes): - filename = io.BytesIO(filename) - if isinstance(filename, io.IOBase) and mode == "r": magic_number = read_magic_number_from_file(filename) if not magic_number.startswith(b"\211HDF\r\n\032\n"): @@ -200,10 +197,11 @@ def open( else: lock = combine_locks([HDF5_LOCK, get_write_lock(filename)]) - # TODO: Replace this with DummyFileManager in the case where filename is - # actually a file object. For mysterious reasons, this triggers test - # failures in arViz. - manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs) + manager = ( + CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs) + if isinstance(filename, str) + else h5netcdf.File(filename, mode=mode, **kwargs) + ) return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index c9cdfd9943a..295d3390f24 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -1,7 +1,6 @@ from __future__ import annotations import gzip -import io import os from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -119,10 +118,6 @@ def _open_scipy_netcdf(filename, mode, mmap, version): else: raise - if isinstance(filename, bytes) and filename.startswith(b"CDF"): - # it's a NetCDF3 bytestring - filename = io.BytesIO(filename) - try: return scipy.io.netcdf_file(filename, mode=mode, mmap=mmap, version=version) except TypeError as e: # netcdf3 message is obscure in this case @@ -141,7 +136,7 @@ def _open_scipy_netcdf(filename, mode, mmap, version): class ScipyDataStore(WritableCFDataStore): - """Store for reading and writing data via scipy.io.netcdf. + """Store for reading and writing data via scipy.io.netcdf_file. This store has the advantage of being able to be initialized with a StringIO object, allow for serialization without writing to disk. 
@@ -175,11 +170,6 @@ def __init__( lock=lock, kwargs=dict(mmap=mmap, version=version), ) - elif isinstance(filename_or_obj, bytes): # file contents - scipy_dataset = _open_scipy_netcdf( - filename_or_obj, mode=mode, mmap=mmap, version=version - ) - manager = DummyFileManager(scipy_dataset) elif hasattr(filename_or_obj, "seek"): # file object # Note: checking for .seek matches the check for file objects # in scipy.io.netcdf_file @@ -201,7 +191,9 @@ def __init__( close = scipy_dataset.flush manager = DummyFileManager(scipy_dataset, close=close) else: - raise ValueError(f"cannot open {filename_or_obj=}") + raise ValueError( + f"cannot open {filename_or_obj=} with scipy.io.netcdf_file" + ) self._manager = manager diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index f8449d5e0f1..a0477263bae 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4562,10 +4562,10 @@ def test_open_badbytes(self) -> None: with pytest.raises( ValueError, match=r"match in any of xarray's currently installed IO" ): - with open_dataset(b"garbage"): # type: ignore[arg-type] + with open_dataset(b"garbage"): pass with pytest.raises(ValueError, match=r"can only read bytes"): - with open_dataset(b"garbage", engine="netcdf4"): # type: ignore[arg-type] + with open_dataset(b"garbage", engine="netcdf4"): pass with pytest.raises( ValueError, match=r"not the signature of a valid netCDF4 file" @@ -4633,7 +4633,13 @@ class TestH5NetCDFInMemoryData: def test_roundtrip_via_bytes(self) -> None: original = create_test_data() netcdf_bytes = original.to_netcdf(engine="h5netcdf") - roundtrip = open_dataset(netcdf_bytes, engine="h5netcdf") # type: ignore[arg-type] + roundtrip = open_dataset(netcdf_bytes, engine="h5netcdf") + assert_identical(roundtrip, original) + + def test_roundtrip_group_via_bytes(self) -> None: + original = create_test_data() + netcdf_bytes = original.to_netcdf(group="sub", engine="h5netcdf") + roundtrip = open_dataset(netcdf_bytes, group="sub", engine="h5netcdf") assert_identical(roundtrip, original) From 8fe46c07551a5c3356065af516379ffc469f5629 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 6 Aug 2025 15:54:05 -0700 Subject: [PATCH 08/15] Fixes per review, also use memoryview for return value --- xarray/backends/api.py | 98 ++++++++++++++++++---------- xarray/backends/common.py | 31 ++++++++- xarray/backends/h5netcdf_.py | 54 +++++++++++++-- xarray/backends/scipy_.py | 61 ++++++++++++++--- xarray/core/dataarray.py | 7 +- xarray/core/dataset.py | 8 +-- xarray/core/datatree.py | 11 ++-- xarray/core/datatree_io.py | 28 ++++---- xarray/core/utils.py | 13 ++-- xarray/tests/test_backends.py | 61 ++++++++++++++--- xarray/tests/test_backends_common.py | 4 +- 11 files changed, 278 insertions(+), 98 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f6b716b0138..019179cba28 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -30,6 +30,7 @@ from xarray.backends.common import ( AbstractDataStore, ArrayWriter, + BytesIOProxy, _find_absolute_paths, _normalize_path, ) @@ -497,7 +498,12 @@ def _datatree_from_backend_datatree( def open_dataset( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | bytes | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -527,12 +533,13 @@ def open_dataset( Parameters ---------- - filename_or_obj : str, Path, file-like, bytes or DataStore + filename_or_obj : 
str, Path, file-like, bytes, memoryview or DataStore Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename ends with .gz, in which case the file is gunzipped and opened with - scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf (netCDF4). + scipy.io.netcdf (only netCDF3 supported). Bytes, memoryview and + file-like objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf + (netCDF4). engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\ , installed backend \ or subclass of xarray.backends.BackendEntrypoint, optional @@ -686,7 +693,7 @@ def open_dataset( open_mfdataset """ - if isinstance(filename_or_obj, bytes): + if isinstance(filename_or_obj, bytes | memoryview): filename_or_obj = BytesIO(filename_or_obj) if cache is None: @@ -740,7 +747,12 @@ def open_dataset( def open_dataarray( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | bytes | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -771,12 +783,13 @@ def open_dataarray( Parameters ---------- - filename_or_obj : str, Path, file-like, bytes or DataStore + filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename ends with .gz, in which case the file is gunzipped and opened with - scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf (netCDF4). + scipy.io.netcdf (only netCDF3 supported). Bytes, memoryview and + file-like objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf + (netCDF4). engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\ , installed backend \ or subclass of xarray.backends.BackendEntrypoint, optional @@ -967,7 +980,12 @@ def open_dataarray( def open_datatree( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | bytes | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -1000,7 +1018,8 @@ def open_datatree( ---------- filename_or_obj : str, Path, file-like, bytes or DataStore Strings and Path objects are interpreted as a path to a netCDF file or - Zarr store. Bytes are interpreted as file contents. + Zarr store. Bytes and memoryview objects are interpreted as file + contents. engine : {"netcdf4", "h5netcdf", "zarr", None}, \ installed backend or xarray.backends.BackendEntrypoint, optional Engine to use when reading files. 
If not provided, the default engine @@ -1153,7 +1172,7 @@ def open_datatree( xarray.open_groups xarray.open_dataset """ - if isinstance(filename_or_obj, bytes): + if isinstance(filename_or_obj, bytes | memoryview): filename_or_obj = BytesIO(filename_or_obj) if cache is None: @@ -1209,7 +1228,12 @@ def open_datatree( def open_groups( - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -1244,9 +1268,10 @@ def open_groups( Parameters ---------- - filename_or_obj : str, Path, file-like, butes, or DataStore + filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore Strings and Path objects are interpreted as a path to a netCDF file or - Zarr store. Bytes are interpreted as file contents. + Zarr store. Bytes and memoryview objects are interpreted as file + contents. engine : {"netcdf4", "h5netcdf", "zarr", None}, \ installed backend or xarray.backends.BackendEntrypoint, optional Engine to use when reading files. If not provided, the default engine @@ -1398,7 +1423,7 @@ def open_groups( xarray.open_dataset xarray.DataTree.from_dict """ - if isinstance(filename_or_obj, bytes): + if isinstance(filename_or_obj, bytes | memoryview): filename_or_obj = BytesIO(filename_or_obj) if cache is None: @@ -1781,7 +1806,7 @@ def to_netcdf( ) -> tuple[ArrayWriter, AbstractDataStore]: ... -# path=None writes to bytes +# path=None writes to bytes or memoryview, depending on store @overload def to_netcdf( dataset: Dataset, @@ -1796,7 +1821,7 @@ def to_netcdf( multifile: Literal[False] = False, invalid_netcdf: bool = False, auto_complex: bool | None = None, -) -> bytes: ... +) -> bytes | memoryview: ... # compute=False returns dask.Delayed @@ -1889,7 +1914,7 @@ def to_netcdf( multifile: bool = False, invalid_netcdf: bool = False, auto_complex: bool | None = None, -) -> tuple[ArrayWriter, AbstractDataStore] | bytes | Delayed | None: ... +) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None: ... def to_netcdf( @@ -1905,7 +1930,7 @@ def to_netcdf( multifile: bool = False, invalid_netcdf: bool = False, auto_complex: bool | None = None, -) -> tuple[ArrayWriter, AbstractDataStore] | bytes | Delayed | None: +) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None: """This function creates an appropriate datastore for writing a dataset to disk as a netCDF file @@ -1921,12 +1946,13 @@ def to_netcdf( if path_or_file is None: if engine is None: + # TODO: only use 'scipy' if format is None or a netCDF3 format engine = "scipy" elif engine not in ("scipy", "h5netcdf"): raise ValueError( - "invalid engine for creating bytes with " - f"to_netcdf: {engine!r}. Only the default engine, " - "engine='scipy' or engine='h5netcdf' is supported." + "invalid engine for creating bytes/memoryview with " + f"to_netcdf: {engine!r}. Only engine=None, engine='scipy' and " + "engine='h5netcdf' is supported." ) if not compute: raise NotImplementedError( @@ -1937,13 +1963,16 @@ def to_netcdf( if engine is None: engine = _get_default_engine(path_or_file) path_or_file = _normalize_path(path_or_file) - elif engine not in ("scipy", "h5netcdf"): - emit_user_level_warning( - f"Requested {engine=} is not compatible with writing to a file-like object. 
" - "This will raise an error in the future, for now defaulting to engine='scipy'.", - FutureWarning, - ) - engine = "scipy" + else: + # filelike object + if engine is not None and engine not in ("scipy", "h5netcdf"): + emit_user_level_warning( + f"Requested {engine=} is not compatible with writing to a file-like object. " + "This will raise an error in the future, for now defaulting to engine='scipy'.", + FutureWarning, + ) + if engine != "h5netcdf": + engine = "scipy" # validate Dataset keys, DataArray names, and attr keys/values _validate_dataset_names(dataset) @@ -1969,7 +1998,7 @@ def to_netcdf( ) if path_or_file is None: - target = BytesIO() + target = BytesIOProxy() else: target = path_or_file # type: ignore[assignment] @@ -2017,15 +2046,14 @@ def to_netcdf( store.close() if path_or_file is None: - assert isinstance(target, BytesIO) # created in this function - value = target.getvalue() - target.close() - return value + assert isinstance(target, BytesIOProxy) # created in this function + return target.getvalue_or_getbuffer() if not compute: import dask return dask.delayed(_finalize_store)(writes, store) + return None diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 8b56c8a2bf9..7072c9438d5 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -4,9 +4,18 @@ import os import time import traceback -from collections.abc import Hashable, Iterable, Mapping, Sequence +from collections.abc import Callable, Hashable, Iterable, Mapping, Sequence +from dataclasses import dataclass from glob import glob -from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Generic, + TypeVar, + Union, + overload, +) import numpy as np import pandas as pd @@ -188,6 +197,24 @@ def _normalize_path_list( return _normalize_path_list(paths) +BytesOrMemory = TypeVar("BytesOrMemory", bytes, memoryview) + + +@dataclass +class BytesIOProxy(Generic[BytesOrMemory]): + """Proxy object for a write that either bytes or a memoryview.""" + + # TODO: remove this in favor of BytesIO when Dataset.to_netcdf() stops + # return bytes from the scipy engine + getvalue: Callable[[], BytesOrMemory] | None = None + + def getvalue_or_getbuffer(self) -> BytesOrMemory: + """Get the value of this write as bytes or memory.""" + if self.getvalue is None: + raise ValueError("must set getvalue before fetching value") + return self.getvalue() + + def _open_remote_file(file, mode, storage_options=None): import fsspec diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4060274b9b8..7984d6ef0c1 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -11,6 +11,7 @@ from xarray.backends.common import ( BACKEND_ENTRYPOINTS, BackendEntrypoint, + BytesIOProxy, WritableCFDataStore, _normalize_path, _open_remote_file, @@ -171,6 +172,11 @@ def open( filename, mode=mode_, storage_options=storage_options ) + if isinstance(filename, BytesIOProxy): + source = filename + filename = io.BytesIO() + source.getvalue = filename.getbuffer + if isinstance(filename, io.IOBase) and mode == "r": magic_number = read_magic_number_from_file(filename) if not magic_number.startswith(b"\211HDF\r\n\032\n"): @@ -202,6 +208,7 @@ def open( if isinstance(filename, str) else h5netcdf.File(filename, mode=mode, **kwargs) ) + print(f"{manager=}") return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): @@ -400,6 +407,20 @@ def _emit_phony_dims_warning(): ) 
+def _normalize_filename_or_obj( + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, +) -> str | ReadBuffer | AbstractDataStore: + if isinstance(filename_or_obj, bytes | memoryview): + return io.BytesIO(filename_or_obj) + else: + return _normalize_path(filename_or_obj) # type: ignore[return-value] + + class H5netcdfBackendEntrypoint(BackendEntrypoint): """ Backend for netCDF files based on the h5netcdf package. @@ -429,8 +450,14 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: return magic_number.startswith(b"\211HDF\r\n\032\n") @@ -443,7 +470,12 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -466,7 +498,7 @@ def open_dataset( # remove and set phony_dims="access" above emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims) - filename_or_obj = _normalize_path(filename_or_obj) + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) store = H5NetCDFStore.open( filename_or_obj, format=format, @@ -503,7 +535,12 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -546,7 +583,12 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -573,7 +615,7 @@ def open_groups_as_dict( # remove and set phony_dims="access" above emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims) - filename_or_obj = _normalize_path(filename_or_obj) + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) store = H5NetCDFStore.open( filename_or_obj, format=format, diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index 295d3390f24..e3da1a6091a 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -1,6 +1,7 @@ from __future__ import annotations import gzip +import io import os from collections.abc import Iterable from typing import TYPE_CHECKING, Any @@ -11,6 +12,7 @@ BACKEND_ENTRYPOINTS, BackendArray, BackendEntrypoint, + BytesIOProxy, WritableCFDataStore, _normalize_path, ) @@ -27,12 +29,15 @@ Frozen, FrozenDict, close_on_error, + emit_user_level_warning, module_available, try_read_magic_number_from_file_or_path, ) from xarray.core.variable import Variable if TYPE_CHECKING: + import scipy.io + from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.types import ReadBuffer @@ -162,6 +167,22 @@ def __init__( self.lock = ensure_lock(lock) + if isinstance(filename_or_obj, BytesIOProxy): + emit_user_level_warning( + "return value of to_netcdf() without a target for " + "engine='scipy' is 
currently bytes, but will switch to " + "memoryview in a future version of Xarray. To silence this " + "warning, use the following pattern or switch to " + "to_netcdf(engine='h5netcdf'):\n" + " target = io.BytesIO()\n" + " dataset.to_netcdf(target)\n" + " result = target.getbuffer()", + FutureWarning, + ) + source = filename_or_obj + filename_or_obj = io.BytesIO() + source.getvalue = filename_or_obj.getvalue + if isinstance(filename_or_obj, str): # path manager = CachingFileManager( _open_scipy_netcdf, @@ -185,11 +206,8 @@ def __init__( # their contents from to_netcdf(), but underlying files still get # closed when the netcdf_file is garbage collected (via __del__), # and will need to be fixed upstream in scipy. - if scipy_dataset.use_mmap: - close = scipy_dataset.close - else: - close = scipy_dataset.flush - manager = DummyFileManager(scipy_dataset, close=close) + assert not scipy_dataset.use_mmap # no mmap for file objects + manager = DummyFileManager(scipy_dataset, close=scipy_dataset.flush) else: raise ValueError( f"cannot open {filename_or_obj=} with scipy.io.netcdf_file" @@ -198,7 +216,7 @@ def __init__( self._manager = manager @property - def ds(self): + def ds(self) -> scipy.io.netcdf_file: return self._manager.acquire() def open_store_variable(self, name, var): @@ -279,6 +297,20 @@ def close(self): self._manager.close() +def _normalize_filename_or_obj( + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, +) -> str | ReadBuffer | AbstractDataStore: + if isinstance(filename_or_obj, bytes | memoryview): + return io.BytesIO(filename_or_obj) + else: + return _normalize_path(filename_or_obj) # type: ignore[return-value] + + class ScipyBackendEntrypoint(BackendEntrypoint): """ Backend for netCDF files based on the scipy package. @@ -305,8 +337,14 @@ class ScipyBackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None and magic_number.startswith(b"\x1f\x8b"): with gzip.open(filename_or_obj) as f: # type: ignore[arg-type] @@ -322,7 +360,12 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -337,7 +380,7 @@ def open_dataset( mmap=None, lock=None, ) -> Dataset: - filename_or_obj = _normalize_path(filename_or_obj) + filename_or_obj = _normalize_filename_or_obj(filename_or_obj) store = ScipyDataStore( filename_or_obj, mode=mode, format=format, group=group, mmap=mmap, lock=lock ) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 73b0eb19a64..98979ce05d7 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4015,7 +4015,7 @@ def to_netcdf( compute: bool = True, invalid_netcdf: bool = False, auto_complex: bool | None = None, - ) -> bytes: ... + ) -> bytes | memoryview: ... 
# compute=False returns dask.Delayed @overload @@ -4079,7 +4079,7 @@ def to_netcdf( compute: bool = True, invalid_netcdf: bool = False, auto_complex: bool | None = None, - ) -> bytes | Delayed | None: + ) -> bytes | memoryview | Delayed | None: """Write DataArray contents to a netCDF file. Parameters @@ -4149,8 +4149,7 @@ def to_netcdf( Returns ------- - store: bytes or Delayed or None - * ``bytes`` if path is None + * ``bytes`` or ``memoryview`` if path is None * ``dask.delayed.Delayed`` if compute is False * None otherwise diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index faa78e42220..8d971119e3d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1879,7 +1879,7 @@ def to_netcdf( compute: bool = True, invalid_netcdf: bool = False, auto_complex: bool | None = None, - ) -> bytes: ... + ) -> bytes | memoryview: ... # compute=False returns dask.Delayed @overload @@ -1943,7 +1943,7 @@ def to_netcdf( compute: bool = True, invalid_netcdf: bool = False, auto_complex: bool | None = None, - ) -> bytes | Delayed | None: + ) -> bytes | memoryview | Delayed | None: """Write dataset contents to a netCDF file. Parameters @@ -2015,9 +2015,9 @@ def to_netcdf( Returns ------- - * ``bytes`` if path is None + * ``bytes`` or ``memoryview`` if path is None * ``dask.delayed.Delayed`` if compute is False - * None otherwise + * ``None`` otherwise See Also -------- diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 75425add6a6..bf82baccb31 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1661,7 +1661,7 @@ def _inplace_binary_op(self, other, f) -> Self: def __eq__(self, other: DtCompatible) -> Self: # type: ignore[override] return super().__eq__(other) - # filepath=None writes to bytes + # filepath=None writes to a memoryview @overload def to_netcdf( self, @@ -1675,7 +1675,7 @@ def to_netcdf( write_inherited_coords: bool = False, compute: bool = True, **kwargs, - ) -> bytes: ... + ) -> memoryview: ... @overload def to_netcdf( @@ -1704,7 +1704,7 @@ def to_netcdf( write_inherited_coords: bool = False, compute: bool = True, **kwargs, - ) -> None | bytes: + ) -> None | memoryview: """ Write datatree contents to a netCDF file. @@ -1713,7 +1713,7 @@ def to_netcdf( filepath : str or PathLike or file-like object or None Path to which to save this datatree, or a file-like object to write it to (which must support read and write and be seekable) or None - to return in-memory bytes. + to return in-memory bytes as a memoryview. mode : {"w", "a"}, default: "w" Write ('w') or append ('a') mode. If mode='w', any existing file at this location will be overwritten. If mode='a', existing variables @@ -1754,7 +1754,8 @@ def to_netcdf( Returns ------- - A bytes object with the byte content of the netCDF file, if filepath was None. + * ``memoryview`` if path is None + * ``None`` otherwise Note ---- diff --git a/xarray/core/datatree_io.py b/xarray/core/datatree_io.py index 00ce0eadc68..c586caaba89 100644 --- a/xarray/core/datatree_io.py +++ b/xarray/core/datatree_io.py @@ -27,12 +27,8 @@ def _datatree_to_netcdf( write_inherited_coords: bool = False, compute: bool = True, **kwargs, -) -> None | bytes: - """This function creates an appropriate datastore for writing a datatree to - disk as a netCDF file. - - See `DataTree.to_netcdf` for full API docs. 
- """ +) -> None | memoryview: + """Implementation of `DataTree.to_netcdf`.""" if format not in [None, *get_args(T_DataTreeNetcdfTypes)]: raise ValueError("DataTree.to_netcdf only supports the NETCDF4 format") @@ -65,7 +61,11 @@ def _datatree_to_netcdf( ) if filepath is None: - filepath = io.BytesIO() + # No need to use BytesIOProxy here because the legacy scipy backend + # cannot write netCDF files with groups + target = io.BytesIO() + else: + target = filepath # type: ignore[assignment] if unlimited_dims is None: unlimited_dims = {} @@ -75,7 +75,7 @@ def _datatree_to_netcdf( ds = node.to_dataset(inherit=write_inherited_coords or at_root) group_path = None if at_root else "/" + node.relative_to(dt) ds.to_netcdf( - filepath, + target, group=group_path, mode=mode, encoding=encoding.get(node.path), @@ -87,7 +87,11 @@ def _datatree_to_netcdf( ) mode = "a" - return filepath.getvalue() if isinstance(filepath, io.BytesIO) else None + if filepath is None: + assert isinstance(target, io.BytesIO) + return target.getbuffer() + + return None def _datatree_to_zarr( @@ -101,11 +105,7 @@ def _datatree_to_zarr( compute: bool = True, **kwargs, ): - """This function creates an appropriate datastore for writing a datatree - to a zarr store. - - See `DataTree.to_zarr` for full API docs. - """ + """Implementation of `DataTree.to_zarr`.""" from zarr import consolidate_metadata diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 562706a1ac0..9c85f2ccc03 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -680,15 +680,12 @@ def is_remote_uri(path: str) -> bool: def read_magic_number_from_file(filename_or_obj, count=8) -> bytes: # check byte header to determine file type - if isinstance(filename_or_obj, bytes): - magic_number = filename_or_obj[:count] - elif isinstance(filename_or_obj, io.IOBase): - if filename_or_obj.tell() != 0: - filename_or_obj.seek(0) - magic_number = filename_or_obj.read(count) - filename_or_obj.seek(0) - else: + if not isinstance(filename_or_obj, io.IOBase): raise TypeError(f"cannot read the magic number from {type(filename_or_obj)}") + if filename_or_obj.tell() != 0: + filename_or_obj.seek(0) + magic_number = filename_or_obj.read(count) + filename_or_obj.seek(0) return magic_number diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a0477263bae..a80bda01a66 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4056,12 +4056,19 @@ def create_store(self): yield backends.ScipyDataStore(fobj, "w") def test_to_netcdf_explicit_engine(self) -> None: - # regression test for GH1321 - Dataset({"foo": 42}).to_netcdf(engine="scipy") + with pytest.warns( + FutureWarning, + match=re.escape("return value of to_netcdf() without a target"), + ): + Dataset({"foo": 42}).to_netcdf(engine="scipy") def test_bytes_pickle(self) -> None: data = Dataset({"foo": ("x", [1, 2, 3])}) - fobj = data.to_netcdf() + with pytest.warns( + FutureWarning, + match=re.escape("return value of to_netcdf() without a target"), + ): + fobj = data.to_netcdf() with self.open(fobj) as ds: unpickled = pickle.loads(pickle.dumps(ds)) assert_identical(unpickled, data) @@ -4099,9 +4106,9 @@ def roundtrip( def test_file_remains_open(self) -> None: data = Dataset({"foo": ("x", [1, 2, 3])}) f = BytesIO() - data.to_netcdf(f, engine="h5netcdf") + data.to_netcdf(f, engine="scipy") assert not f.closed - restored = open_dataset(f, engine="h5netcdf") + restored = open_dataset(f, engine="scipy") assert not f.closed assert_identical(restored, data) restored.close() @@ 
-4231,9 +4238,10 @@ def test_engine(self) -> None: with pytest.raises(ValueError, match=r"unrecognized engine"): open_dataset(tmp_file, engine="foobar") - netcdf_bytes = data.to_netcdf() + bytes_io = BytesIO() + data.to_netcdf(bytes_io, engine="scipy") with pytest.raises(ValueError, match=r"unrecognized engine"): - open_dataset(BytesIO(netcdf_bytes), engine="foobar") + open_dataset(bytes_io, engine="foobar") def test_cross_engine_read_write_netcdf3(self) -> None: data = create_test_data() @@ -4280,6 +4288,32 @@ def test_encoding_unlimited_dims(self) -> None: assert actual.encoding["unlimited_dims"] == set("y") assert_equal(ds, actual) + @requires_scipy + def test_roundtrip_via_bytes(self) -> None: + original = create_test_data() + with pytest.warns( + FutureWarning, + match=re.escape("return value of to_netcdf() without a target"), + ): + netcdf_bytes = original.to_netcdf() + roundtrip = open_dataset(netcdf_bytes) + assert_identical(roundtrip, original) + + @pytest.mark.xfail( + reason="scipy.io.netcdf_file closes files upon garbage collection" + ) + @requires_scipy + def test_roundtrip_via_file_object(self) -> None: + original = create_test_data() + f = BytesIO() + original.to_netcdf(f) + assert not f.closed + restored = open_dataset(f) + assert not f.closed + assert_identical(restored, original) + restored.close() + assert not f.closed + @requires_h5netcdf @requires_netCDF4 @@ -5863,7 +5897,11 @@ def test_open_dataarray_options(self) -> None: def test_dataarray_to_netcdf_return_bytes(self) -> None: # regression test for GH1410 data = xr.DataArray([1, 2, 3]) - output = data.to_netcdf() + with pytest.warns( + FutureWarning, + match=re.escape("return value of to_netcdf() without a target"), + ): + output = data.to_netcdf(engine="scipy") assert isinstance(output, bytes) def test_dataarray_to_netcdf_no_name_pathlib(self) -> None: @@ -6400,7 +6438,10 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: with open(path, "rb") as f: _check_guess_can_open_and_open(entrypoint, f, engine="scipy", expected=ds) - contents = ds.to_netcdf(engine="scipy") + with pytest.warns( + FutureWarning, match=re.escape("return value of to_netcdf() without a target") + ): + contents = ds.to_netcdf(engine="scipy") _check_guess_can_open_and_open(entrypoint, contents, engine="scipy", expected=ds) _check_guess_can_open_and_open( entrypoint, BytesIO(contents), engine="scipy", expected=ds @@ -6415,7 +6456,7 @@ def test_scipy_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.nc") assert entrypoint.guess_can_open("something-local.nc.gz") assert not entrypoint.guess_can_open("not-found-and-no-extension") - assert not entrypoint.guess_can_open(b"not-a-netcdf-file") # type: ignore[arg-type] + assert not entrypoint.guess_can_open(b"not-a-netcdf-file") @requires_h5netcdf diff --git a/xarray/tests/test_backends_common.py b/xarray/tests/test_backends_common.py index 33da027ac97..a42381882ed 100644 --- a/xarray/tests/test_backends_common.py +++ b/xarray/tests/test_backends_common.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io import re import numpy as np @@ -53,10 +54,11 @@ def test_infer_dtype_error_on_mixed_types(data): def test_encoding_failure_note(): # Create an arbitrary value that cannot be encoded in netCDF3 ds = xr.Dataset({"invalid": np.array([2**63 - 1], dtype=np.int64)}) + f = io.BytesIO() with pytest.raises( ValueError, match=re.escape( "Raised while encoding variable 'invalid' with value Date: Wed, 6 Aug 2025 16:01:42 -0700 Subject: [PATCH 09/15] one more test --- 
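
The guess_can_open assertions exercised above come down to file signatures rather than file names. A self-contained sketch of that check (`looks_like_netcdf` is a hypothetical helper, not xarray API; the magic numbers are the standard netCDF3, HDF5, and gzip signatures):

    import gzip

    NETCDF3_MAGIC = b"CDF"             # classic netCDF, the scipy backend's format
    HDF5_MAGIC = b"\x89HDF\r\n\x1a\n"  # netCDF4/HDF5, handled by h5netcdf
    GZIP_MAGIC = b"\x1f\x8b"           # scipy also reads gzipped netCDF3

    def looks_like_netcdf(data: bytes) -> bool:
        head = data[:8]
        if head.startswith(GZIP_MAGIC):
            head = gzip.decompress(data)[:8]
        return head.startswith(NETCDF3_MAGIC) or head.startswith(HDF5_MAGIC)

    assert not looks_like_netcdf(b"not-a-netcdf-file")
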
xarray/tests/test_backends.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a80bda01a66..79f187ce483 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4062,6 +4062,16 @@ def test_to_netcdf_explicit_engine(self) -> None: ): Dataset({"foo": 42}).to_netcdf(engine="scipy") + def test_roundtrip_via_bytes(self) -> None: + original = create_test_data() + with pytest.warns( + FutureWarning, + match=re.escape("return value of to_netcdf() without a target"), + ): + netcdf_bytes = original.to_netcdf(engine="scipy") + roundtrip = open_dataset(netcdf_bytes, engine="scipy") + assert_identical(roundtrip, original) + def test_bytes_pickle(self) -> None: data = Dataset({"foo": ("x", [1, 2, 3])}) with pytest.warns( From bf2c750b68d757d089dac0017bf7060c00888ce7 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 6 Aug 2025 16:45:15 -0700 Subject: [PATCH 10/15] remove unnecessary use of BytesIO --- xarray/backends/api.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 019179cba28..c0fa9f95d75 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -10,7 +10,7 @@ Sequence, ) from functools import partial -from io import BytesIO, IOBase +from io import IOBase from itertools import starmap from numbers import Number from typing import ( @@ -693,9 +693,6 @@ def open_dataset( open_mfdataset """ - if isinstance(filename_or_obj, bytes | memoryview): - filename_or_obj = BytesIO(filename_or_obj) - if cache is None: cache = chunks is None @@ -1172,9 +1169,6 @@ def open_datatree( xarray.open_groups xarray.open_dataset """ - if isinstance(filename_or_obj, bytes | memoryview): - filename_or_obj = BytesIO(filename_or_obj) - if cache is None: cache = chunks is None @@ -1423,9 +1417,6 @@ def open_groups( xarray.open_dataset xarray.DataTree.from_dict """ - if isinstance(filename_or_obj, bytes | memoryview): - filename_or_obj = BytesIO(filename_or_obj) - if cache is None: cache = chunks is None From 23d51474c55d1b8cc261bd6bce6191597c1fdd99 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 6 Aug 2025 18:52:29 -0700 Subject: [PATCH 11/15] remove inadvertent print() --- xarray/backends/h5netcdf_.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 7984d6ef0c1..292088397f7 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -208,7 +208,6 @@ def open( if isinstance(filename, str) else h5netcdf.File(filename, mode=mode, **kwargs) ) - print(f"{manager=}") return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose) def _acquire(self, needs_lock=True): From 54427fee0e5c56da6aa2598634c3927b5706abe0 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 7 Aug 2025 11:20:55 -0700 Subject: [PATCH 12/15] Fix typing --- xarray/backends/common.py | 28 ++++++++++++++++++++++++---- xarray/backends/netCDF4_.py | 28 ++++++++++++++++++++++++---- xarray/backends/plugins.py | 7 ++++++- xarray/backends/pydap_.py | 28 ++++++++++++++++++++++++---- xarray/backends/store.py | 14 ++++++++++++-- xarray/backends/zarr.py | 28 ++++++++++++++++++++++++---- 6 files changed, 114 insertions(+), 19 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 7072c9438d5..b8d1913ddad 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -732,7 +732,12 @@ def __repr__(self) -> str: def 
open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, drop_variables: str | Iterable[str] | None = None, ) -> Dataset: @@ -744,7 +749,12 @@ def open_dataset( def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: """ Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`. @@ -754,7 +764,12 @@ def guess_can_open( def open_datatree( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, drop_variables: str | Iterable[str] | None = None, ) -> DataTree: @@ -766,7 +781,12 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, drop_variables: str | Iterable[str] | None = None, ) -> dict[str, Dataset]: diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 8c3a01eba66..f3b6192b921 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -631,7 +631,12 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): return True @@ -648,7 +653,12 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -697,7 +707,12 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -739,7 +754,12 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/plugins.py b/xarray/backends/plugins.py index 555538c2562..76df963621e 100644 --- a/xarray/backends/plugins.py +++ b/xarray/backends/plugins.py @@ -138,7 +138,12 @@ def refresh_engines() -> None: def guess_engine( - store_spec: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + store_spec: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> str | type[BackendEntrypoint]: engines = list_engines() diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 73b719f8260..5d8929e4ce7 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -209,13 +209,23 @@ class PydapBackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | 
os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) def open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -258,7 +268,12 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -295,7 +310,12 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/store.py b/xarray/backends/store.py index de52aa193ed..f908e13a339 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -24,13 +24,23 @@ class StoreBackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: return isinstance(filename_or_obj, AbstractDataStore) def open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 8b26a6b40ec..f624dc60792 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1550,7 +1550,12 @@ class ZarrBackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, ) -> bool: if isinstance(filename_or_obj, str | os.PathLike): _, ext = os.path.splitext(filename_or_obj) @@ -1560,7 +1565,12 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -1615,7 +1625,12 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, @@ -1657,7 +1672,12 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore, + filename_or_obj: str + | os.PathLike[Any] + | ReadBuffer + | bytes + | memoryview + | AbstractDataStore, *, mask_and_scale=True, decode_times=True, From 07a3708eaa491d27754bde9200d42266b6f58184 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 7 Aug 2025 14:00:54 -0700 Subject: [PATCH 13/15] Don't silently override engine in to_netcdf --- xarray/backends/api.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/xarray/backends/api.py 
b/xarray/backends/api.py index 6a731aeb7a0..ee7aa3e8ff7 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -45,7 +45,7 @@ from xarray.core.indexes import Index from xarray.core.treenode import group_subtrees from xarray.core.types import NetcdfWriteModes, ZarrWriteModes -from xarray.core.utils import emit_user_level_warning, is_remote_uri +from xarray.core.utils import is_remote_uri from xarray.namedarray.daskmanager import DaskManager from xarray.namedarray.parallelcompat import guess_chunkmanager from xarray.structure.chunks import _get_chunk, _maybe_chunk @@ -1945,35 +1945,27 @@ def to_netcdf( if encoding is None: encoding = {} - if path_or_file is None: + if isinstance(path_or_file, str): + if engine is None: + engine = _get_default_engine(path_or_file) + path_or_file = _normalize_path(path_or_file) + else: + # writing to bytes/memoryview or a file-like object if engine is None: # TODO: only use 'scipy' if format is None or a netCDF3 format engine = "scipy" elif engine not in ("scipy", "h5netcdf"): raise ValueError( - "invalid engine for creating bytes/memoryview with " - f"to_netcdf: {engine!r}. Only engine=None, engine='scipy' and " - "engine='h5netcdf' is supported." + "invalid engine for creating bytes/memoryview or writing to a " + f"file-like object with to_netcdf: {engine!r}. Only " + "engine=None, engine='scipy' and engine='h5netcdf' is " + "supported." ) if not compute: raise NotImplementedError( "to_netcdf() with compute=False is not yet implemented when " "returning bytes" ) - elif isinstance(path_or_file, str): - if engine is None: - engine = _get_default_engine(path_or_file) - path_or_file = _normalize_path(path_or_file) - else: - # filelike object - if engine is not None and engine not in ("scipy", "h5netcdf"): - emit_user_level_warning( - f"Requested {engine=} is not compatible with writing to a file-like object. 
" - "This will raise an error in the future, for now defaulting to engine='scipy'.", - FutureWarning, - ) - if engine != "h5netcdf": - engine = "scipy" # validate Dataset keys, DataArray names, and attr keys/values _validate_dataset_names(dataset) From 53739ff93a3c658d04020f78794f3b238b811907 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 8 Aug 2025 13:22:59 -0700 Subject: [PATCH 14/15] Use type alias instead of refining filename_or_obj type everywhere --- xarray/backends/api.py | 29 +++++-------------------- xarray/backends/common.py | 5 +++++ xarray/backends/h5netcdf_.py | 41 ++++++------------------------------ xarray/backends/netCDF4_.py | 34 +++++------------------------- xarray/backends/pydap_.py | 34 ++++++------------------------ xarray/backends/scipy_.py | 15 +++---------- xarray/backends/store.py | 24 +++++---------------- xarray/backends/zarr.py | 35 ++++++------------------------ 8 files changed, 43 insertions(+), 174 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ee7aa3e8ff7..2a6476ea828 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -32,6 +32,7 @@ AbstractDataStore, ArrayWriter, BytesIOProxy, + T_PathFileOrDataStore, _find_absolute_paths, _normalize_path, ) @@ -504,12 +505,7 @@ def _datatree_from_backend_datatree( def open_dataset( - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -750,12 +746,7 @@ def open_dataset( def open_dataarray( - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -983,12 +974,7 @@ def open_dataarray( def open_datatree( - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, @@ -1228,12 +1214,7 @@ def open_datatree( def open_groups( - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, engine: T_Engine = None, chunks: T_Chunks = None, diff --git a/xarray/backends/common.py b/xarray/backends/common.py index b8d1913ddad..0fd03f5f4d2 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -351,6 +351,11 @@ def __exit__(self, exception_type, exception_value, traceback): self.close() +T_PathFileOrDataStore = ( + str | os.PathLike[Any] | ReadBuffer | bytes | memoryview | AbstractDataStore +) + + class ArrayWriter: __slots__ = ("lock", "regions", "sources", "targets") diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 292088397f7..24a3324bf62 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -12,6 +12,7 @@ BACKEND_ENTRYPOINTS, BackendEntrypoint, BytesIOProxy, + T_PathFileOrDataStore, WritableCFDataStore, _normalize_path, _open_remote_file, @@ -407,17 +408,12 @@ def _emit_phony_dims_warning(): def _normalize_filename_or_obj( - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, ) -> str | ReadBuffer | AbstractDataStore: if isinstance(filename_or_obj, bytes | memoryview): return io.BytesIO(filename_or_obj) else: - return _normalize_path(filename_or_obj) # type: ignore[return-value] + return 
_normalize_path(filename_or_obj) class H5netcdfBackendEntrypoint(BackendEntrypoint): @@ -447,15 +443,7 @@ class H5netcdfBackendEntrypoint(BackendEntrypoint): ) url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.H5netcdfBackendEntrypoint.html" - def guess_can_open( - self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, - ) -> bool: + def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) if magic_number is not None: @@ -469,12 +457,7 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, @@ -534,12 +517,7 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, @@ -582,12 +560,7 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index f3b6192b921..ab1841461f4 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -13,6 +13,7 @@ BACKEND_ENTRYPOINTS, BackendArray, BackendEntrypoint, + T_PathFileOrDataStore, WritableCFDataStore, _normalize_path, datatree_from_dict_with_io_cleanup, @@ -49,10 +50,8 @@ from h5netcdf.core import EnumType as h5EnumType from netCDF4 import EnumType as ncEnumType - from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree - from xarray.core.types import ReadBuffer # This lookup table maps from dtype.byteorder to a readable endian # string used by netCDF4. 
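
The alias being threaded through these entrypoints is compact enough to show in isolation. A reduced sketch with stand-in classes (the real union is defined once in xarray/backends/common.py):

    import os
    from typing import Any

    class AbstractDataStore: ...  # stand-in for xarray's store base class
    class ReadBuffer: ...         # stand-in for xarray's file-like protocol

    # Spell the union out once; `X | Y` unions are valid at runtime on
    # Python 3.10+, so the alias can live at ordinary module scope.
    T_PathFileOrDataStore = (
        str | os.PathLike[Any] | ReadBuffer | bytes | memoryview | AbstractDataStore
    )

    def guess_can_open(filename_or_obj: T_PathFileOrDataStore) -> bool:
        # Each entrypoint now names one alias instead of a six-way union.
        return isinstance(filename_or_obj, bytes | memoryview)
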
@@ -629,15 +628,7 @@ class NetCDF4BackendEntrypoint(BackendEntrypoint): ) url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.NetCDF4BackendEntrypoint.html" - def guess_can_open( - self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, - ) -> bool: + def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: if isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj): return True magic_number = try_read_magic_number_from_path(filename_or_obj) @@ -653,12 +644,7 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, @@ -707,12 +693,7 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, @@ -754,12 +735,7 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index 5d8929e4ce7..4fbfe8ee210 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -10,6 +10,7 @@ AbstractDataStore, BackendArray, BackendEntrypoint, + T_PathFileOrDataStore, _normalize_path, datatree_from_dict_with_io_cleanup, robust_getitem, @@ -207,25 +208,14 @@ class PydapBackendEntrypoint(BackendEntrypoint): description = "Open remote datasets via OPeNDAP using pydap in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.PydapBackendEntrypoint.html" - def guess_can_open( - self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, - ) -> bool: + def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj) def open_dataset( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: ( + str | os.PathLike[Any] | ReadBuffer | bytes | memoryview | AbstractDataStore + ), *, mask_and_scale=True, decode_times=True, @@ -268,12 +258,7 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, @@ -310,12 +295,7 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index e3da1a6091a..a93c6465d49 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -13,6 +13,7 @@ BackendArray, BackendEntrypoint, BytesIOProxy, + T_PathFileOrDataStore, WritableCFDataStore, _normalize_path, ) @@ -337,12 +338,7 @@ class ScipyBackendEntrypoint(BackendEntrypoint): def guess_can_open( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, ) -> bool: 
filename_or_obj = _normalize_filename_or_obj(filename_or_obj) magic_number = try_read_magic_number_from_file_or_path(filename_or_obj) @@ -360,12 +356,7 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/store.py b/xarray/backends/store.py index f908e13a339..2c3cd42ae92 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -1,46 +1,32 @@ from __future__ import annotations from collections.abc import Iterable -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from xarray import conventions from xarray.backends.common import ( BACKEND_ENTRYPOINTS, AbstractDataStore, BackendEntrypoint, + T_PathFileOrDataStore, ) from xarray.core.coordinates import Coordinates from xarray.core.dataset import Dataset if TYPE_CHECKING: - import os - - from xarray.core.types import ReadBuffer + pass class StoreBackendEntrypoint(BackendEntrypoint): description = "Open AbstractDataStore instances in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.StoreBackendEntrypoint.html" - def guess_can_open( - self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, - ) -> bool: + def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: return isinstance(filename_or_obj, AbstractDataStore) def open_dataset( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f624dc60792..1b62a87d10c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -17,6 +17,7 @@ AbstractWritableDataStore, BackendArray, BackendEntrypoint, + T_PathFileOrDataStore, _encode_variable_name, _normalize_path, datatree_from_dict_with_io_cleanup, @@ -39,10 +40,9 @@ from xarray.namedarray.utils import module_available if TYPE_CHECKING: - from xarray.backends.common import AbstractDataStore from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree - from xarray.core.types import ReadBuffer, ZarrArray, ZarrGroup + from xarray.core.types import ZarrArray, ZarrGroup def _get_mappers(*, storage_options, store, chunk_store): @@ -1548,15 +1548,7 @@ class ZarrBackendEntrypoint(BackendEntrypoint): description = "Open zarr files (.zarr) using zarr in Xarray" url = "https://docs.xarray.dev/en/stable/generated/xarray.backends.ZarrBackendEntrypoint.html" - def guess_can_open( - self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, - ) -> bool: + def guess_can_open(self, filename_or_obj: T_PathFileOrDataStore) -> bool: if isinstance(filename_or_obj, str | os.PathLike): _, ext = os.path.splitext(filename_or_obj) return ext == ".zarr" @@ -1565,12 +1557,7 @@ def guess_can_open( def open_dataset( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, @@ -1625,12 +1612,7 @@ def open_dataset( def open_datatree( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, 
decode_times=True, @@ -1672,12 +1654,7 @@ def open_datatree( def open_groups_as_dict( self, - filename_or_obj: str - | os.PathLike[Any] - | ReadBuffer - | bytes - | memoryview - | AbstractDataStore, + filename_or_obj: T_PathFileOrDataStore, *, mask_and_scale=True, decode_times=True, From 07bbe1865deb9ea4851ae5d34561f326c4923972 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 8 Aug 2025 13:23:23 -0700 Subject: [PATCH 15/15] Fix grammar --- xarray/backends/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 0fd03f5f4d2..542ca4c897b 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -202,10 +202,10 @@ def _normalize_path_list( @dataclass class BytesIOProxy(Generic[BytesOrMemory]): - """Proxy object for a write that either bytes or a memoryview.""" + """Proxy object for a write that returns either bytes or a memoryview.""" # TODO: remove this in favor of BytesIO when Dataset.to_netcdf() stops - # return bytes from the scipy engine + # returning bytes from the scipy engine getvalue: Callable[[], BytesOrMemory] | None = None def getvalue_or_getbuffer(self) -> BytesOrMemory:
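
The getvalue_or_getbuffer hook whose docstring is corrected above is the pivot of the bytes-to-memoryview migration. A reduced model of the same idea (`BytesIOProxySketch` is illustrative only; the real class lives in xarray/backends/common.py):

    import io
    from collections.abc import Callable
    from dataclasses import dataclass
    from typing import Generic, TypeVar

    BytesOrMemory = TypeVar("BytesOrMemory", bytes, memoryview)

    @dataclass
    class BytesIOProxySketch(Generic[BytesOrMemory]):
        # Whichever backend performs the write installs a zero-argument
        # getter that returns its natural in-memory type.
        getvalue: Callable[[], BytesOrMemory] | None = None

        def getvalue_or_getbuffer(self) -> BytesOrMemory:
            if self.getvalue is None:
                raise ValueError("no backend has written to this proxy yet")
            return self.getvalue()

    # h5netcdf-style usage: write into a BytesIO, hand out a memoryview.
    target = io.BytesIO()
    target.write(b"stand-in for serialized netCDF content")
    proxy: BytesIOProxySketch[memoryview] = BytesIOProxySketch(target.getbuffer)
    payload = proxy.getvalue_or_getbuffer()  # zero-copy view of the buffer

Parameterizing the getter this way lets the legacy scipy path keep returning bytes while the h5netcdf path returns a memoryview, until the deprecation completes and plain io.BytesIO can take over.
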