Skip to content

Commit 27d689c

Browse files
authored
fix/shard iteration redux (#3422)
* iterate over shards instead of chunks in second branch * add test * parametrize over array type * changelog * appease mypy
1 parent ee9c182 commit 27d689c

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

changes/3422.bugfix.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix a potential race condition when using :func:`zarr.create_array` with the ``data`` parameter
2+
set to a NumPy array. Previously Zarr was iterating over the newly created array with a granularity
3+
that was too low. Now Zarr chooses a granularity that matches the size of the stored objects for
4+
that array.

src/zarr/core/array.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4421,7 +4421,7 @@ async def _copy_arraylike_region(chunk_coords: slice, _data: NDArrayLike) -> Non
44214421

44224422
# Stream data from the source array to the new array
44234423
await concurrent_map(
4424-
[(region, data) for region in result._iter_chunk_regions()],
4424+
[(region, data) for region in result._iter_shard_regions()],
44254425
_copy_arraylike_region,
44264426
zarr.core.config.config.get("async.concurrency"),
44274427
)

tests/test_array.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
ZarrUserWarning,
7575
)
7676
from zarr.storage import LocalStore, MemoryStore, StorePath
77+
from zarr.storage._logging import LoggingStore
7778

7879
from .test_dtype.conftest import zdtype_examples
7980

@@ -2119,3 +2120,28 @@ def test_iter_chunk_regions(
21192120
assert observed == expected
21202121
assert observed == tuple(arr._iter_chunk_regions())
21212122
assert observed == tuple(arr._async_array._iter_chunk_regions())
2123+
2124+
2125+
@pytest.mark.parametrize("num_shards", [1, 3])
2126+
@pytest.mark.parametrize("array_type", ["numpy", "zarr"])
2127+
def test_create_array_with_data_num_gets(
2128+
num_shards: int, array_type: Literal["numpy", "zarr"]
2129+
) -> None:
2130+
"""
2131+
Test that creating an array with data only invokes a single get request per stored object
2132+
"""
2133+
store = LoggingStore(store=MemoryStore())
2134+
2135+
chunk_shape = (1,)
2136+
shard_shape = (100,)
2137+
shape = (shard_shape[0] * num_shards,)
2138+
data: Array | npt.NDArray[np.int64]
2139+
if array_type == "numpy":
2140+
data = np.zeros(shape[0], dtype="int64")
2141+
else:
2142+
data = zarr.zeros(shape, dtype="int64")
2143+
2144+
zarr.create_array(store, data=data, chunks=chunk_shape, shards=shard_shape, fill_value=-1) # type: ignore[arg-type]
2145+
# one get for the metadata and one per shard.
2146+
# Note: we don't actually need one get per shard, but this is the current behavior
2147+
assert store.counter["get"] == 1 + num_shards

0 commit comments

Comments
 (0)