### Description

`xarray.open_dataset()` and `xarray.open_datatree()` create default indexes for each coordinate sequentially via the `_maybe_create_default_indexes()` function. When reading from a cloud-based Zarr store, this results in one request per coordinate, which compounds into significant total latency.
This issue becomes especially noticeable when:

- There are many coordinate variables
- You are working with `DataTree`s containing many groups
- The backing store has even modest per-request latency (e.g., 100 ms)
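As a rough model, eager index creation adds about `n_coords × per_request_latency` to every open call: with 100 ms of latency and 50 coordinate variables, that is already ~5 s spent waiting before any actual data is read.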
### Where this happens

The issue originates in the `_maybe_create_default_indexes()` function:

source: xarray/backends/api.py#L383-L389
Each coordinate is processed in a Python loop, and accessing `coord.variable` triggers `.data` access on the Zarr-backed array, so a separate `.get()` request is issued for every coordinate. On high-latency stores, the result is a slowdown that grows linearly with the number of coordinates.
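For context, the function is only a few lines; a paraphrased sketch of its shape (not the exact source, which may have drifted from this) looks like:

```python
# Paraphrased sketch of _maybe_create_default_indexes() in
# xarray/backends/api.py -- see the linked lines for the real code.
from xarray import Coordinates


def _maybe_create_default_indexes(ds):
    # Collect every 1-D coordinate that does not already have an index.
    to_index = {
        name: coord.variable
        for name, coord in ds.coords.items()
        if coord.dims == (name,) and name not in ds.xindexes
    }
    # Building the default (pandas) indexes loads each coordinate's data
    # one after another, i.e. one blocking store request per coordinate.
    return ds.assign_coords(Coordinates(to_index))
```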
### Reproducible Test Case

To reproduce this issue, I created:

- A single `Dataset` with multiple coordinates
- A `DataTree` with multiple groups, each containing several coordinates
- A `LatencyStore` wrapper around `MemoryStore` that introduces 1 s of artificial latency per `.get()` call
<details>
<summary>Click to expand Python test code</summary>

```python
import asyncio
import time
from collections.abc import Iterable
from typing import TypeVar

import numpy as np
from zarr.storage import MemoryStore

import xarray as xr
from xarray.tests import has_zarr_v3

if has_zarr_v3:
    from zarr.abc.store import ByteRequest, Store
    from zarr.core.buffer import Buffer, BufferPrototype
    from zarr.storage._wrapper import WrapperStore

    T_Store = TypeVar("T_Store", bound=Store)

    class LatencyStore(WrapperStore[T_Store]):
        """Works the same way as the zarr LoggingStore"""

        latency: float
        __read_only: bool

        def __init__(
            self,
            store: T_Store,
            read_only: bool,
            latency: float = 0.0,
        ) -> None:
            """
            Store wrapper that adds artificial latency to each get call.

            Parameters
            ----------
            store : Store
                Store to wrap
            latency : float
                Amount of artificial latency to add to each get call,
                in seconds.
            """
            super().__init__(store)
            self.__read_only = read_only
            self.latency = latency

        def __str__(self) -> str:
            return f"latency-{self._store}"

        def __repr__(self) -> str:
            return (
                f"LatencyStore({self._store.__class__.__name__}, "
                f"'{self._store}', latency={self.latency})"
            )

        def with_read_only(self, read_only: bool = False) -> "LatencyStore":
            return LatencyStore(self._store, read_only=read_only, latency=self.latency)

        async def get(
            self,
            key: str,
            prototype: BufferPrototype,
            byte_range: ByteRequest | None = None,
        ) -> Buffer | None:
            # Simulate per-request network latency before delegating.
            await asyncio.sleep(self.latency)
            return await self._store.get(
                key=key, prototype=prototype, byte_range=byte_range
            )

        async def get_partial_values(
            self,
            prototype: BufferPrototype,
            key_ranges: Iterable[tuple[str, ByteRequest | None]],
        ) -> list[Buffer | None]:
            await asyncio.sleep(self.latency)
            return await self._store.get_partial_values(
                prototype=prototype, key_ranges=key_ranges
            )


def create_dataset_with_many_coordinates(n_coords=20):
    """Create a dataset with many tiny coordinate variables."""
    coords = {}
    for i in range(n_coords):
        coords[f"coord_{i}"] = (f"coord_{i}", np.arange(2))
    return xr.Dataset(coords=coords)


def create_datatree_with_many_groups(n_groups=30, coords_per_group=5):
    """Create a DataTree with many groups, each having coordinate variables."""
    print(f"Creating DataTree with {n_groups} groups, {coords_per_group} coords each...")
    root_ds = create_dataset_with_many_coordinates(5)
    tree_dict = {"/": root_ds}
    for i in range(n_groups):
        tree_dict[f"/group_{i:03d}"] = create_dataset_with_many_coordinates(
            n_coords=coords_per_group
        )
    return xr.DataTree.from_dict(tree_dict)


if __name__ == "__main__":
    print("testing dataset")
    ds = create_dataset_with_many_coordinates(10)

    store = MemoryStore()
    latency_store = LatencyStore(store, read_only=False, latency=1)

    print("storing dataset")
    ds.to_zarr(latency_store, mode="w", consolidated=False, zarr_format=3)

    print("opening dataset")
    start_time = time.time()
    ds_index = xr.open_zarr(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=True,
    )
    with_index_time = time.time() - start_time
    print(f"open dataset with create_default_indexes=True: {with_index_time}")

    start_time = time.time()
    ds_no_index = xr.open_zarr(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=False,
    )
    no_index = time.time() - start_time
    print(f"open dataset with create_default_indexes=False: {no_index}")
    print(f"creating indexes on a high-latency store can be {with_index_time / no_index:.1f}x slower")

    print("===" * 20)
    print("Now testing datatrees")
    dtree = create_datatree_with_many_groups(n_groups=5, coords_per_group=10)
    dtree.to_zarr(store, mode="w", consolidated=False, zarr_format=3)

    start_time = time.time()
    dtree_index = xr.open_datatree(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=True,
        engine="zarr",
    )
    with_index = time.time() - start_time
    print(f"open dtree with create_default_indexes=True: {with_index}")

    start_time = time.time()
    dtree_no_index = xr.open_datatree(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=False,
        engine="zarr",
    )
    no_index = time.time() - start_time
    print(f"open dtree with create_default_indexes=False: {no_index}")
    print(f"opening without index creation is {with_index / no_index:.1f}x faster")
```

</details>
### Results

| Case | With Indexes | Without Indexes | Slowdown |
| --- | --- | --- | --- |
| Dataset (10 coordinates) | 12.6 s | 2.0 s | 6.3× |
| DataTree (5 groups × 10 coords) | 48.1 s | 18.0 s | 2.7× |
This confirms that most of the time spent opening the dataset goes into building the default indexes sequentially.
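In the meantime, a practical mitigation (visible in the timings above) is to open with `create_default_indexes=False` and build indexes only for the coordinates you actually select on, e.g. via `Dataset.set_xindex`. A minimal sketch, reusing `latency_store` and the coordinate names from the test script above:

```python
# Skip the eager per-coordinate requests at open time...
ds = xr.open_zarr(
    latency_store,
    consolidated=False,
    zarr_format=3,
    create_default_indexes=False,
)
# ...then pay for a single request only where an index is really needed.
ds = ds.set_xindex("coord_0")
ds.sel(coord_0=1)
```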
### Additional Results (Larger Scale)

A dataset with 50 coordinates took nearly 57 seconds to open, which is expected: with 1 second of added latency per GET request, the sequential per-coordinate index loads alone account for roughly 50 s.