From 6abd5b5f8c0b0c1a23f55aa3d9749fbe232e7245 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 1 Sep 2025 18:21:31 +0200 Subject: [PATCH 1/5] iterate over shards instead of chunks in second branch --- src/zarr/core/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ce19f99ba0..f31b0cc0a4 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -4421,7 +4421,7 @@ async def _copy_arraylike_region(chunk_coords: slice, _data: NDArrayLike) -> Non # Stream data from the source array to the new array await concurrent_map( - [(region, data) for region in result._iter_chunk_regions()], + [(region, data) for region in result._iter_shard_regions()], _copy_arraylike_region, zarr.core.config.config.get("async.concurrency"), ) From 85d14d19e5fa95e84a6ef3e21db5921c3597fc78 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 1 Sep 2025 20:41:06 +0200 Subject: [PATCH 2/5] add test --- tests/test_array.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_array.py b/tests/test_array.py index 97aef9319b..93c762aa5f 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -74,6 +74,7 @@ ZarrUserWarning, ) from zarr.storage import LocalStore, MemoryStore, StorePath +from zarr.storage._logging import LoggingStore from .test_dtype.conftest import zdtype_examples @@ -2119,3 +2120,21 @@ def test_iter_chunk_regions( assert observed == expected assert observed == tuple(arr._iter_chunk_regions()) assert observed == tuple(arr._async_array._iter_chunk_regions()) + + +@pytest.mark.parametrize("num_shards", [1, 3]) +def test_create_array_with_data_num_gets(num_shards: int) -> None: + """ + Test that creating an array with data only invokes a single get request per stored object + """ + store = LoggingStore(store=MemoryStore()) + + chunk_shape = (1,) + shard_shape = (100,) + shape = (shard_shape[0] * num_shards,) + data = np.arange(shape[0]) + + zarr.create_array(store, data=data, chunks=chunk_shape, shards=shard_shape, fill_value=-1) + # one get for the metadata and one per shard. + # Note: we don't actually need one get per shard, but this is the current behavior + assert store.counter["get"] == 1 + num_shards From 39fb5e0f7a55fbe279cfe6678587959e895a1005 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 1 Sep 2025 20:47:55 +0200 Subject: [PATCH 3/5] parametrize over array type --- tests/test_array.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index 93c762aa5f..b3369cf6e8 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2123,7 +2123,10 @@ def test_iter_chunk_regions( @pytest.mark.parametrize("num_shards", [1, 3]) -def test_create_array_with_data_num_gets(num_shards: int) -> None: +@pytest.mark.parametrize("array_type", ["numpy", "zarr"]) +def test_create_array_with_data_num_gets( + num_shards: int, array_type: Literal["numpy", "zarr"] +) -> None: """ Test that creating an array with data only invokes a single get request per stored object """ @@ -2132,7 +2135,10 @@ def test_create_array_with_data_num_gets(num_shards: int) -> None: chunk_shape = (1,) shard_shape = (100,) shape = (shard_shape[0] * num_shards,) - data = np.arange(shape[0]) + if array_type == "numpy": + data = np.arange(shape[0]) + else: + data = zarr.zeros(shape, dtype="int64") zarr.create_array(store, data=data, chunks=chunk_shape, shards=shard_shape, fill_value=-1) # one get for the metadata and one per shard. From 9924f909e3bfa5ef40c835cb996909824e692143 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 1 Sep 2025 20:51:07 +0200 Subject: [PATCH 4/5] changelog --- changes/3422.bugfix.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changes/3422.bugfix.rst diff --git a/changes/3422.bugfix.rst b/changes/3422.bugfix.rst new file mode 100644 index 0000000000..ed4b8c266d --- /dev/null +++ b/changes/3422.bugfix.rst @@ -0,0 +1,4 @@ +Fix a potential race condition when using :func:`zarr.create_array` with the ``data`` parameter +set to a NumPy array. Previously Zarr was iterating over the newly created array with a granularity +that was too low. Now Zarr chooses a granularity that matches the size of the stored objects for +that array. \ No newline at end of file From edb149252ba606c6fd79f62b9bee67f7c6e0e223 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Mon, 1 Sep 2025 21:06:40 +0200 Subject: [PATCH 5/5] appease mypy --- tests/test_array.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index b3369cf6e8..cf201ce0c7 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2135,12 +2135,13 @@ def test_create_array_with_data_num_gets( chunk_shape = (1,) shard_shape = (100,) shape = (shard_shape[0] * num_shards,) + data: Array | npt.NDArray[np.int64] if array_type == "numpy": - data = np.arange(shape[0]) + data = np.zeros(shape[0], dtype="int64") else: data = zarr.zeros(shape, dtype="int64") - zarr.create_array(store, data=data, chunks=chunk_shape, shards=shard_shape, fill_value=-1) + zarr.create_array(store, data=data, chunks=chunk_shape, shards=shard_shape, fill_value=-1) # type: ignore[arg-type] # one get for the metadata and one per shard. # Note: we don't actually need one get per shard, but this is the current behavior assert store.counter["get"] == 1 + num_shards