open_dataset creates default indexes sequentially, causing significant latency in cloud high-latency stores #10579

@aladinor

Description

xarray.open_dataset() and xarray.open_datatree() create default indexes for each coordinate sequentially via the _maybe_create_default_indexes() function. When reading from a cloud-based Zarr store, this results in one request per coordinate, which compounds into significant total latency.

This issue becomes especially noticeable when:

  • There are many coordinate variables
  • You are working with DataTrees containing many groups
  • The backing store has even modest per-request latency (e.g., 100ms)

Where this happens

The issue originates in the _maybe_create_default_indexes() function:
source: xarray/backends/api.py#L383-L389

Each coordinate is processed in a Python loop, and accessing coord.variable triggers a .data read on the Zarr-backed array, so every coordinate issues its own .get() request.

With a high-latency store, the total open time therefore grows linearly with the number of coordinates.
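To see why the per-request cost compounds, here is a small standalone illustration (not xarray code; the 100 ms latency and the coordinate count are assumptions) comparing sequential round trips, which is effectively what open_dataset does today, against issuing the same requests concurrently:

import asyncio
import time

LATENCY = 0.1  # assumed per-request store latency, in seconds

async def fetch_coord(name: str) -> str:
    await asyncio.sleep(LATENCY)  # stands in for one blocking store .get()
    return name

async def sequential(names: list[str]) -> list[str]:
    # One awaited request per coordinate: total ~ len(names) * LATENCY
    return [await fetch_coord(n) for n in names]

async def concurrent(names: list[str]) -> list[str]:
    # All requests in flight at once: total ~ LATENCY
    return list(await asyncio.gather(*(fetch_coord(n) for n in names)))

names = [f"coord_{i}" for i in range(20)]
for fn in (sequential, concurrent):
    start = time.perf_counter()
    asyncio.run(fn(names))
    print(f"{fn.__name__}: {time.perf_counter() - start:.2f}s")

With 20 coordinates this prints roughly 2.0 s for the sequential version and 0.1 s for the concurrent one, which is the shape of the slowdown measured below.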

Reproducible Test Case

To reproduce this issue, I created:

  • A single Dataset with multiple coordinates
  • A DataTree with multiple groups, each containing several coordinates
  • A LatencyStore wrapper around MemoryStore that introduces 1s artificial latency per .get() call
Python test code:
import asyncio
import time
from collections.abc import Iterable
from typing import TypeVar

import numpy as np

import xarray as xr
from xarray.tests import has_zarr_v3

# The reproduction requires zarr v3 for the Store/WrapperStore APIs.
if not has_zarr_v3:
    raise ImportError("This test case requires zarr v3")

from zarr.abc.store import ByteRequest, Store
from zarr.core.buffer import Buffer, BufferPrototype
from zarr.storage import MemoryStore
from zarr.storage._wrapper import WrapperStore

T_Store = TypeVar("T_Store", bound=Store)


class LatencyStore(WrapperStore[T_Store]):
    """Works the same way as the zarr LoggingStore"""
    latency: float

    __read_only: bool

    def __init__(
        self,
        store: T_Store,
        read_only: bool,
        latency: float = 0.0,
    ) -> None:
        """
        Store wrapper that adds artificial latency to each get call.

        Parameters
        ----------
        store : Store
            Store to wrap
        read_only : bool
            Whether the wrapped store should be read-only.
        latency : float
            Amount of artificial latency to add to each get call, in seconds.
        """
        super().__init__(store)
        self.__read_only = read_only
        self.latency = latency

    def __str__(self) -> str:
        return f"latency-{self._store}"

    def __repr__(self) -> str:
        return f"LatencyStore({self._store.__class__.__name__}, '{self._store}', latency={self.latency})"

    def with_read_only(self, read_only: bool = False) -> "LatencyStore":
        # Propagate the requested flag instead of hard-coding read_only=True.
        return LatencyStore(self._store, read_only=read_only, latency=self.latency)

    async def get(
        self,
        key: str,
        prototype: BufferPrototype,
        byte_range: ByteRequest | None = None,
    ) -> Buffer | None:
        await asyncio.sleep(self.latency)
        return await self._store.get(
            key=key, prototype=prototype, byte_range=byte_range
        )

    async def get_partial_values(
        self,
        prototype: BufferPrototype,
        key_ranges: Iterable[tuple[str, ByteRequest | None]],
    ) -> list[Buffer | None]:
        await asyncio.sleep(self.latency)
        return await self._store.get_partial_values(
            prototype=prototype, key_ranges=key_ranges
        )



def create_dataset_with_many_coordinates(n_coords=20):
    """Create a dataset with many tiny coordinate variables"""
    coords = {}
    for i in range(n_coords):
        coords[f'coord_{i}'] = (f'coord_{i}', np.arange(2))
    ds = xr.Dataset(
        coords=coords
    )
    return ds


def create_datatree_with_many_groups(n_groups=30, coords_per_group=5):
    """Create a DataTree with many groups, each having coordinate variables"""
    print(f"Creating DataTree with {n_groups} groups, {coords_per_group} coords each...")

    root_ds = create_dataset_with_many_coordinates(5)

    tree_dict = {'/': root_ds}

    for i in range(n_groups):
        group_name = f'/group_{i:03d}'
        group_ds = create_dataset_with_many_coordinates(n_coords=coords_per_group)
        tree_dict[group_name] = group_ds

    tree = xr.DataTree.from_dict(tree_dict)
    return tree


if __name__ == '__main__':
    print("testing dataset")
    ds = create_dataset_with_many_coordinates(10)

    store = MemoryStore()
    latency_store = LatencyStore(store, latency=1, read_only=False)
    print("storing dataset")
    ds.to_zarr(latency_store, mode="w", consolidated=False, zarr_format=3)

    print("opening dataset")
    start_time = time.time()
    ds_index = xr.open_zarr(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=True,
    )
    with_index_time = time.time() - start_time
    print(f"open dataset with create_default_indexes=True:  {with_index_time}")

    start_time = time.time()
    ds_no_index = xr.open_zarr(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=False
    )
    no_index = time.time() - start_time

    print(f"open dataset with create_default_indexes=False:  {no_index}")
    print(f"creating indexes from cloud environment might take {with_index_time / no_index:.1f}x slower")

    print("==="* 20)

    print("Now testing datatrees")

    dtree = create_datatree_with_many_groups(n_groups=5, coords_per_group=10)

    # Write directly to the underlying store; the reads below go through the latency wrapper.
    dtree.to_zarr(store, mode="w", consolidated=False, zarr_format=3)

    start_time = time.time()
    dtree_index = xr.open_datatree(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=True,
        engine="zarr",
    )
    with_index = time.time() - start_time

    print(f"open dtree with create_default_indexes=True:  {with_index}")

    start_time = time.time()
    dtree_no_index = xr.open_datatree(
        latency_store,
        consolidated=False,
        zarr_format=3,
        create_default_indexes=False,
        engine="zarr",
    )
    no_index = time.time() - start_time

    print(f"open dtree with create_default_indexes=False:  {no_index}")
    print(f"without Index creation is {with_index / no_index:.1f}x faster")

Results

Case                               With Indexes   Without Indexes   Slowdown
Dataset (10 coordinates)           12.6 s         2.0 s             6.3×
DataTree (5 groups × 10 coords)    48.1 s         18.0 s            2.7×

This confirms that most of the time spent opening the dataset goes to building default indexes sequentially, one store request at a time.
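One possible direction (my assumption about a fix, not anything xarray currently does) would be to prefetch all coordinate data concurrently before the default indexes are built, so the total cost approaches one round trip instead of one per coordinate. A minimal user-level sketch of the same idea, with a hypothetical helper that uses a thread pool to overlap the blocking .get() calls:

from concurrent.futures import ThreadPoolExecutor

import xarray as xr

def open_with_prefetched_coords(store, **kwargs):
    # Open lazily, then force-load every coordinate variable in parallel
    # so the per-coordinate store round trips overlap instead of queueing.
    ds = xr.open_zarr(store, create_default_indexes=False, **kwargs)
    with ThreadPoolExecutor() as pool:
        list(pool.map(lambda name: ds.variables[name].load(), ds.coords))
    return ds

Once the coordinate data is in memory, building the default indexes afterwards is cheap because no further store requests are needed.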


Additional Results (Larger Scale)

The 50-coordinate dataset took nearly 57 seconds to open, as expected: with 1 second of added latency per GET request, the per-coordinate requests alone account for roughly 50 seconds.
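In the meantime, a user-side workaround (a sketch assuming you only need to select along a known subset of coordinates) is to skip default index creation at open time and build indexes explicitly with Dataset.set_xindex:

ds = xr.open_zarr(
    latency_store,
    consolidated=False,
    zarr_format=3,
    create_default_indexes=False,
)
# One request for the coordinate actually queried, not one per coordinate.
ds = ds.set_xindex("coord_0")
ds.sel(coord_0=1)

Coordinates you never select along then never trigger a store request for index creation.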

CC: @dcherian @TomNicholas
