
Support for DataTree.to_netcdf to write to a file-like object or bytes #10571

Merged · 17 commits · Aug 8, 2025
2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -13,6 +13,8 @@ v2025.07.2 (unreleased)
New Features
~~~~~~~~~~~~

- :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return bytes if called without a filepath. (:issue:`10570`)
By `Matthew Willson <https://github.com/mjwillson>`_.

Breaking changes
~~~~~~~~~~~~~~~~
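To make the new entry concrete, here is a minimal sketch assuming the h5netcdf engine is installed; the tree structure and variable names are invented for illustration, and the exact return type (bytes vs. memoryview) depends on the engine used:

```python
import io

import xarray as xr

# Hypothetical two-group tree, used only for illustration.
tree = xr.DataTree.from_dict(
    {
        "/": xr.Dataset({"temperature": ("x", [1.0, 2.0, 3.0])}),
        "/child": xr.Dataset({"pressure": ("x", [10.0, 20.0, 30.0])}),
    }
)

# With no filepath, to_netcdf() returns the serialized file contents.
# A tree with child groups needs the netCDF4 format, hence h5netcdf here.
data = tree.to_netcdf(engine="h5netcdf")

# Alternatively, write into an existing file-like object.
buffer = io.BytesIO()
tree.to_netcdf(buffer, engine="h5netcdf")

# Either result can be read back with open_datatree.
roundtripped = xr.open_datatree(io.BytesIO(bytes(data)), engine="h5netcdf")
```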
89 changes: 52 additions & 37 deletions xarray/backends/api.py
@@ -11,7 +11,7 @@
Sequence,
)
from functools import partial
from io import BytesIO
from io import IOBase
from itertools import starmap
from numbers import Number
from typing import (
@@ -31,6 +31,8 @@
from xarray.backends.common import (
AbstractDataStore,
ArrayWriter,
BytesIOProxy,
T_PathFileOrDataStore,
_find_absolute_paths,
_normalize_path,
)
@@ -503,7 +505,7 @@ def _datatree_from_backend_datatree(


def open_dataset(
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: T_PathFileOrDataStore,
*,
engine: T_Engine = None,
chunks: T_Chunks = None,
@@ -533,12 +535,13 @@ def open_dataset(

Parameters
----------
filename_or_obj : str, Path, file-like or DataStore
filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore
Strings and Path objects are interpreted as a path to a netCDF file
or an OpenDAP URL and opened with python-netCDF4, unless the filename
ends with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
scipy.io.netcdf (only netCDF3 supported). Bytes, memoryview and
file-like objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf
(netCDF4).
Comment on lines +538 to +544

Contributor: I'm wondering if the explicit mention of a netCDF file here (and in the other open_* functions) is still valid in light of all the other engines, which handle files of any provenance. A change to this might be better done in another PR; I just stumbled over this and wanted to keep a log of it.

Member: Yes, this is a good consideration for updating later.

engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\
, installed backend \
or subclass of xarray.backends.BackendEntrypoint, optional
@@ -743,7 +746,7 @@ def open_dataset(


def open_dataarray(
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: T_PathFileOrDataStore,
*,
engine: T_Engine = None,
chunks: T_Chunks = None,
@@ -774,12 +777,13 @@ def open_dataarray(

Parameters
----------
filename_or_obj : str, Path, file-like or DataStore
filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore
Strings and Path objects are interpreted as a path to a netCDF file
or an OpenDAP URL and opened with python-netCDF4, unless the filename
ends with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
scipy.io.netcdf (only netCDF3 supported). Bytes, memoryview and
file-like objects are opened by scipy.io.netcdf (netCDF3) or h5netcdf
(netCDF4).
engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "zarr", None}\
, installed backend \
or subclass of xarray.backends.BackendEntrypoint, optional
@@ -970,7 +974,7 @@ def open_dataarray(


def open_datatree(
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: T_PathFileOrDataStore,
*,
engine: T_Engine = None,
chunks: T_Chunks = None,
Expand Down Expand Up @@ -1001,8 +1005,10 @@ def open_datatree(

Parameters
----------
filename_or_obj : str, Path, file-like, or DataStore
Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
filename_or_obj : str, Path, file-like, bytes or DataStore
Strings and Path objects are interpreted as a path to a netCDF file or
Zarr store. Bytes and memoryview objects are interpreted as file
contents.
engine : {"netcdf4", "h5netcdf", "zarr", None}, \
installed backend or xarray.backends.BackendEntrypoint, optional
Engine to use when reading files. If not provided, the default engine
@@ -1208,7 +1214,7 @@ def open_datatree(


def open_groups(
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: T_PathFileOrDataStore,
*,
engine: T_Engine = None,
chunks: T_Chunks = None,
Expand Down Expand Up @@ -1243,8 +1249,10 @@ def open_groups(

Parameters
----------
filename_or_obj : str, Path, file-like, or DataStore
Strings and Path objects are interpreted as a path to a netCDF file or Zarr store.
filename_or_obj : str, Path, file-like, bytes, memoryview or DataStore
Strings and Path objects are interpreted as a path to a netCDF file or
Zarr store. Bytes and memoryview objects are interpreted as file
contents.
engine : {"netcdf4", "h5netcdf", "zarr", None}, \
installed backend or xarray.backends.BackendEntrypoint, optional
Engine to use when reading files. If not provided, the default engine
@@ -1780,7 +1788,7 @@ def to_netcdf(
) -> tuple[ArrayWriter, AbstractDataStore]: ...


# path=None writes to bytes
# path=None writes to bytes or memoryview, depending on store
@overload
def to_netcdf(
dataset: Dataset,
@@ -1795,7 +1803,7 @@ def to_netcdf(
multifile: Literal[False] = False,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> bytes: ...
) -> bytes | memoryview: ...


# compute=False returns dask.Delayed
@@ -1821,7 +1829,7 @@ def to_netcdf(
@overload
def to_netcdf(
dataset: Dataset,
path_or_file: str | os.PathLike,
path_or_file: str | os.PathLike | IOBase,
mode: NetcdfWriteModes = "w",
format: T_NetcdfTypes | None = None,
group: str | None = None,
@@ -1877,7 +1885,7 @@ def to_netcdf(
@overload
def to_netcdf(
dataset: Dataset,
path_or_file: str | os.PathLike | None,
path_or_file: str | os.PathLike | IOBase | None,
mode: NetcdfWriteModes = "w",
format: T_NetcdfTypes | None = None,
group: str | None = None,
@@ -1888,12 +1896,12 @@ def to_netcdf(
multifile: bool = False,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | Delayed | None: ...
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None: ...


def to_netcdf(
dataset: Dataset,
path_or_file: str | os.PathLike | None = None,
path_or_file: str | os.PathLike | IOBase | None = None,
mode: NetcdfWriteModes = "w",
format: T_NetcdfTypes | None = None,
group: str | None = None,
@@ -1904,7 +1912,7 @@ def to_netcdf(
multifile: bool = False,
invalid_netcdf: bool = False,
auto_complex: bool | None = None,
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | Delayed | None:
) -> tuple[ArrayWriter, AbstractDataStore] | bytes | memoryview | Delayed | None:
"""This function creates an appropriate datastore for writing a dataset to
disk as a netCDF file

@@ -1918,26 +1926,27 @@ def to_netcdf(
if encoding is None:
encoding = {}

if path_or_file is None:
if isinstance(path_or_file, str):
if engine is None:
engine = _get_default_engine(path_or_file)
path_or_file = _normalize_path(path_or_file)
else:
# writing to bytes/memoryview or a file-like object
if engine is None:
# TODO: only use 'scipy' if format is None or a netCDF3 format
engine = "scipy"
elif engine != "scipy":
elif engine not in ("scipy", "h5netcdf"):
raise ValueError(
"invalid engine for creating bytes with "
f"to_netcdf: {engine!r}. Only the default engine "
"or engine='scipy' is supported"
"invalid engine for creating bytes/memoryview or writing to a "
f"file-like object with to_netcdf: {engine!r}. Only "
"engine=None, engine='scipy' and engine='h5netcdf' is "
"supported."
)
if not compute:
raise NotImplementedError(
"to_netcdf() with compute=False is not yet implemented when "
"returning bytes"
)
elif isinstance(path_or_file, str):
if engine is None:
engine = _get_default_engine(path_or_file)
path_or_file = _normalize_path(path_or_file)
else: # file-like object
engine = "scipy"

# validate Dataset keys, DataArray names, and attr keys/values
_validate_dataset_names(dataset)
@@ -1962,7 +1971,11 @@ def to_netcdf(
f"is not currently supported with dask's {scheduler} scheduler"
)

target = path_or_file if path_or_file is not None else BytesIO()
if path_or_file is None:
target = BytesIOProxy()
else:
target = path_or_file # type: ignore[assignment]

kwargs = dict(autoclose=True) if autoclose else {}
if invalid_netcdf:
if engine == "h5netcdf":
@@ -2002,17 +2015,19 @@ def to_netcdf(

writes = writer.sync(compute=compute)

if isinstance(target, BytesIO):
store.sync()
return target.getvalue()
finally:
if not multifile and compute: # type: ignore[redundant-expr]
store.close()

if path_or_file is None:
assert isinstance(target, BytesIOProxy) # created in this function
return target.getvalue_or_getbuffer()

if not compute:
import dask

return dask.delayed(_finalize_store)(writes, store)

return None


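A short sketch of what the widened overloads and the engine check above mean for callers, assuming scipy and h5netcdf are installed; the result is bytes or a memoryview, matching the annotations rather than promising one or the other:

```python
import io

import numpy as np
import xarray as xr

ds = xr.Dataset({"a": ("x", np.arange(3.0))})

# With no target, the default engine remains scipy (netCDF3); with this
# change engine="h5netcdf" (netCDF4) is also accepted. Other engines still
# raise the ValueError shown above.
netcdf3 = ds.to_netcdf()
netcdf4 = ds.to_netcdf(engine="h5netcdf")

# A file-like target now goes through the same branch, so an open buffer
# can be passed directly.
buffer = io.BytesIO()
ds.to_netcdf(buffer, engine="h5netcdf")

# In-memory contents can be read back; bytes() normalizes a memoryview.
roundtripped = xr.open_dataset(bytes(netcdf4), engine="h5netcdf")
```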
64 changes: 58 additions & 6 deletions xarray/backends/common.py
@@ -4,9 +4,18 @@
import os
import time
import traceback
from collections.abc import Hashable, Iterable, Mapping, Sequence
from collections.abc import Callable, Hashable, Iterable, Mapping, Sequence
from dataclasses import dataclass
from glob import glob
from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload
from typing import (
TYPE_CHECKING,
Any,
ClassVar,
Generic,
TypeVar,
Union,
overload,
)

import numpy as np
import pandas as pd
@@ -188,6 +197,24 @@ def _normalize_path_list(
return _normalize_path_list(paths)


BytesOrMemory = TypeVar("BytesOrMemory", bytes, memoryview)


@dataclass
class BytesIOProxy(Generic[BytesOrMemory]):
"""Proxy object for a write that returns either bytes or a memoryview."""

# TODO: remove this in favor of BytesIO when Dataset.to_netcdf() stops
# returning bytes from the scipy engine
getvalue: Callable[[], BytesOrMemory] | None = None

def getvalue_or_getbuffer(self) -> BytesOrMemory:
"""Get the value of this write as bytes or memory."""
if self.getvalue is None:
raise ValueError("must set getvalue before fetching value")
return self.getvalue()


def _open_remote_file(file, mode, storage_options=None):
import fsspec

@@ -324,6 +351,11 @@ def __exit__(self, exception_type, exception_value, traceback):
self.close()


T_PathFileOrDataStore = (
str | os.PathLike[Any] | ReadBuffer | bytes | memoryview | AbstractDataStore
)


class ArrayWriter:
__slots__ = ("lock", "regions", "sources", "targets")

@@ -705,7 +737,12 @@ def __repr__(self) -> str:

def open_dataset(
self,
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: str
| os.PathLike[Any]
| ReadBuffer
| bytes
| memoryview
| AbstractDataStore,
*,
drop_variables: str | Iterable[str] | None = None,
) -> Dataset:
@@ -717,7 +754,12 @@ def open_dataset(

def guess_can_open(
self,
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: str
| os.PathLike[Any]
| ReadBuffer
| bytes
| memoryview
| AbstractDataStore,
) -> bool:
"""
Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`.
@@ -727,7 +769,12 @@ def guess_can_open(

def open_datatree(
self,
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: str
| os.PathLike[Any]
| ReadBuffer
| bytes
| memoryview
| AbstractDataStore,
*,
drop_variables: str | Iterable[str] | None = None,
) -> DataTree:
@@ -739,7 +786,12 @@ def open_datatree(

def open_groups_as_dict(
self,
filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
filename_or_obj: str
| os.PathLike[Any]
| ReadBuffer
| bytes
| memoryview
| AbstractDataStore,
*,
drop_variables: str | Iterable[str] | None = None,
) -> dict[str, Dataset]:
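A simplified sketch of the hand-off BytesIOProxy enables; the real backends set getvalue internally, and the BytesIO here only stands in for a finished file:

```python
from io import BytesIO

from xarray.backends.common import BytesIOProxy

target = BytesIOProxy()

# A store writing to `target` is expected to set `getvalue` to a callable
# producing the finished file contents: a scipy-backed store can expose
# bytes, while an h5netcdf-backed store can expose a memoryview.
buffer = BytesIO()
buffer.write(b"stand-in for finished netCDF contents")
target.getvalue = buffer.getvalue

# Once writing is complete, to_netcdf() fetches the result exactly once;
# calling getvalue_or_getbuffer() before getvalue is set raises ValueError.
result = target.getvalue_or_getbuffer()
```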
7 changes: 5 additions & 2 deletions xarray/backends/file_manager.py
@@ -339,8 +339,11 @@ def __hash__(self):
class DummyFileManager(FileManager):
"""FileManager that simply wraps an open file in the FileManager interface."""

def __init__(self, value):
def __init__(self, value, *, close=None):
if close is None:
close = value.close
self._value = value
self._close = close

def acquire(self, needs_lock=True):
del needs_lock # ignored
@@ -353,4 +356,4 @@ def acquire_context(self, needs_lock=True):

def close(self, needs_lock=True):
del needs_lock # ignored
self._value.close()
self._close()
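A small sketch of the new close keyword on DummyFileManager, assuming an already-open in-memory buffer; the no-op close callable is illustrative only:

```python
from io import BytesIO

from xarray.backends.file_manager import DummyFileManager

buffer = BytesIO(b"already-open, in-memory file contents")

# Default behaviour is unchanged: close() delegates to the wrapped
# object's own close() method.
manager = DummyFileManager(buffer)
assert manager.acquire() is buffer

# The new keyword lets a caller supply a custom close callable, for example
# to keep a buffer alive after the store that wraps it is closed.
keep_open = DummyFileManager(buffer, close=lambda: None)
keep_open.close()
assert not buffer.closed
```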