diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 133265f348..cea2504316 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -30,6 +30,8 @@ jobs:
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # grab all branches and tags
       # - name: cuda-toolkit
       #   uses: Jimver/cuda-toolkit@v0.2.16
       #   id: cuda-toolkit
diff --git a/changes/1798.feature.rst b/changes/1798.feature.rst
new file mode 100644
index 0000000000..64d4efdf08
--- /dev/null
+++ b/changes/1798.feature.rst
@@ -0,0 +1,2 @@
+Add a command-line interface to migrate v2 Zarr metadata to v3. Corresponding functions are also
+provided under ``zarr.metadata``.
diff --git a/docs/user-guide/cli.rst b/docs/user-guide/cli.rst
new file mode 100644
index 0000000000..822b60d389
--- /dev/null
+++ b/docs/user-guide/cli.rst
@@ -0,0 +1,127 @@
+.. _user-guide-cli:
+
+Command-line interface
+======================
+
+Zarr-Python provides a command-line interface that enables:
+
+- migration of Zarr v2 metadata to v3
+- removal of v2 or v3 metadata
+
+To see available commands, run the following in a terminal:
+
+.. code-block:: bash
+
+    $ zarr --help
+
+or to get help on individual commands:
+
+.. code-block:: bash
+
+    $ zarr migrate --help
+
+    $ zarr remove-metadata --help
+
+
+Migrate metadata from v2 to v3
+------------------------------
+
+Migrate to a separate location
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To migrate a Zarr array/group's metadata from v2 to v3, run:
+
+.. code-block:: bash
+
+    $ zarr migrate v3 path/to/input.zarr path/to/output.zarr
+
+This will write new ``zarr.json`` files to ``output.zarr``, leaving ``input.zarr`` untouched.
+Note - this will migrate the entire Zarr hierarchy, so if ``input.zarr`` contains multiple groups/arrays,
+new ``zarr.json`` files will be created for all of them.
+
+Migrate in-place
+~~~~~~~~~~~~~~~~
+
+If you'd prefer to migrate the metadata in-place, run:
+
+.. code-block:: bash
+
+    $ zarr migrate v3 path/to/input.zarr
+
+This will write new ``zarr.json`` files to ``input.zarr``, leaving the existing v2 metadata untouched.
+
+To open the array/group using the new metadata use:
+
+.. code-block:: python
+
+    >>> import zarr
+    >>> zarr_with_v3_metadata = zarr.open('path/to/input.zarr', zarr_format=3)
+
+Once you are happy with the conversion, you can run the following to remove the old v2 metadata:
+
+.. code-block:: bash
+
+    $ zarr remove-metadata v2 path/to/input.zarr
+
+Note there is also a shortcut to migrate and remove v2 metadata in one step:
+
+.. code-block:: bash
+
+    $ zarr migrate v3 path/to/input.zarr --remove-v2-metadata
+
+
+Remove metadata
+---------------
+
+Remove v2 metadata using:
+
+.. code-block:: bash
+
+    $ zarr remove-metadata v2 path/to/input.zarr
+
+or v3 with:
+
+.. code-block:: bash
+
+    $ zarr remove-metadata v3 path/to/input.zarr
+
+By default, this will only allow removal of metadata if a valid alternative exists. For example, you can't
+remove v2 metadata unless v3 metadata exists at that location.
+
+To override this behaviour use ``--force``:
+
+.. code-block:: bash
+
+    $ zarr remove-metadata v3 path/to/input.zarr --force
+
+
+Dry run
+-------
+
+All commands provide a ``--dry-run`` option that will log changes that would be made on a real run, without creating
+or modifying any files.
+
+.. code-block:: bash
+
+    $ zarr migrate v3 path/to/input.zarr --dry-run
+
+    Dry run enabled - no new files will be created or changed. Log of files that would be created on a real run:
+    Saving metadata to path/to/input.zarr/zarr.json
+
+
+Verbose
+-------
+
+You can also add ``--verbose`` **before** any command to see a full log of its actions:
+
+.. code-block:: bash
+
+    $ zarr --verbose migrate v3 path/to/input.zarr
+
+    $ zarr --verbose remove-metadata v2 path/to/input.zarr
+
+
+Equivalent functions
+--------------------
+
+All features of the command-line interface are also available via functions under
+:mod:`zarr.metadata`.
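+
+For example, a rough equivalent of the in-place migration and v2 cleanup shown above
+(a minimal sketch - the store path is a placeholder):
+
+.. code-block:: python
+
+    >>> from zarr.core.sync import sync
+    >>> from zarr.metadata.migrate_v3 import migrate_v2_to_v3, remove_metadata
+    >>> from zarr.storage import LocalStore
+
+    >>> store = LocalStore('path/to/input.zarr')
+    >>> migrate_v2_to_v3(input_store=store)  # write zarr.json files in-place
+    >>> sync(remove_metadata(store, zarr_format=2))  # drop the old v2 metadata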
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index f92c576f32..a83a30172b 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -13,6 +13,7 @@ User guide
     storage
     config
     v3_migration
+    cli
 
 Advanced Topics
 ---------------
diff --git a/pyproject.toml b/pyproject.toml
index 95528c4558..1e494ce094 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,6 +68,7 @@ remote = [
 gpu = [
     "cupy-cuda12x",
 ]
+cli = ["typer"]
 # Development extras
 test = [
     "coverage>=7.10",
@@ -113,6 +114,9 @@ docs = [
     'pytest'
 ]
 
+[project.scripts]
+zarr = "zarr._cli.cli:app"
+
 [project.urls]
 "Bug Tracker" = "https://github.com/zarr-developers/zarr-python/issues"
 
@@ -163,7 +167,7 @@ deps = ["minimal", "optional"]
 
 [tool.hatch.envs.test.overrides]
 matrix.deps.dependencies = [
-    {value = "zarr[remote, remote_tests, test, optional]", if = ["optional"]}
+    {value = "zarr[remote, remote_tests, test, optional, cli]", if = ["optional"]}
 ]
 
 [tool.hatch.envs.test.scripts]
diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py
index 0d58ecf8e8..3c6195c28f 100644
--- a/src/zarr/__init__.py
+++ b/src/zarr/__init__.py
@@ -1,3 +1,7 @@
+import functools
+import logging
+from typing import Literal
+
 from zarr._version import version as __version__
 from zarr.api.synchronous import (
     array,
@@ -37,6 +41,8 @@
 # in case setuptools scm screw up and find version to be 0.0.0
 assert not __version__.startswith("0.0.0")
 
+_logger = logging.getLogger(__name__)
+
 
 def print_debug_info() -> None:
     """
@@ -85,6 +91,58 @@ def print_packages(packages: list[str]) -> None:
     print_packages(optional)
 
 
+# The decorator ensures this always returns the same handler (and it is only
+# attached once).
+@functools.cache
+def _ensure_handler() -> logging.Handler:
+    """
+    The first time this function is called, attach a `StreamHandler` using the
+    same format as `logging.basicConfig` to the Zarr-Python root logger.
+
+    Return this handler every time this function is called.
+    """
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
+    _logger.addHandler(handler)
+    return handler
+
+
+def set_log_level(
+    level: Literal["NOTSET", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+) -> None:
+    """Set the logging level for Zarr-Python.
+
+    Zarr-Python uses the standard library `logging` framework under the root
+    logger 'zarr'. This is a helper function to:
+
+    - set Zarr-Python's root logger level
+    - set the root logger handler's level, creating the handler
+      if it does not exist yet
+
+    Parameters
+    ----------
+    level : str
+        The logging level to set.
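+
+    Examples
+    --------
+    A minimal sketch of intended use - subsequent Zarr-Python log messages go to
+    stderr at the chosen level:
+
+    >>> import zarr
+    >>> zarr.set_log_level("DEBUG")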
+    """
+    _logger.setLevel(level)
+    _ensure_handler().setLevel(level)
+
+
+def set_format(log_format: str) -> None:
+    """Set the format of logging messages from Zarr-Python.
+
+    Zarr-Python uses the standard library `logging` framework under the root
+    logger 'zarr'. This sets the format of log messages from the root logger's
+    StreamHandler.
+
+    Parameters
+    ----------
+    log_format : str
+        A string determining the log format (as defined in the standard library's
+        `logging` module for `logging.Formatter`).
+    """
+    _ensure_handler().setFormatter(logging.Formatter(fmt=log_format))
+
+
 __all__ = [
     "Array",
     "AsyncArray",
diff --git a/src/zarr/_cli/__init__.py b/src/zarr/_cli/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/zarr/_cli/cli.py b/src/zarr/_cli/cli.py
new file mode 100644
index 0000000000..785efe505b
--- /dev/null
+++ b/src/zarr/_cli/cli.py
@@ -0,0 +1,186 @@
+import logging
+from enum import Enum
+from typing import Annotated, Literal, cast
+
+import typer
+
+import zarr
+import zarr.metadata.migrate_v3 as migrate_metadata
+from zarr.core.sync import sync
+from zarr.storage._common import make_store
+
+app = typer.Typer()
+
+logger = logging.getLogger(__name__)
+
+
+def _set_logging_level(*, verbose: bool) -> None:
+    if verbose:
+        lvl = "INFO"
+    else:
+        lvl = "WARNING"
+    zarr.set_log_level(cast(Literal["INFO", "WARNING"], lvl))
+    zarr.set_format("%(message)s")
+
+
+class ZarrFormat(str, Enum):
+    v2 = "v2"
+    v3 = "v3"
+
+
+class ZarrFormatV3(str, Enum):
+    """Limit CLI choice to only v3"""
+
+    v3 = "v3"
+
+
+@app.command()  # type: ignore[misc]
+def migrate(
+    zarr_format: Annotated[
+        ZarrFormatV3,
+        typer.Argument(
+            help="Zarr format to migrate to. Currently only 'v3' is supported.",
+        ),
+    ],
+    input_store: Annotated[
+        str,
+        typer.Argument(
+            help=(
+                "Input Zarr to migrate - should be a store, path to directory in file system or name of zip file "
+                "e.g. 'data/example-1.zarr', 's3://example-bucket/example'..."
+            )
+        ),
+    ],
+    output_store: Annotated[
+        str | None,
+        typer.Argument(
+            help=(
+                "Output location to write generated metadata (no array data will be copied). If not provided, "
+                "metadata will be written to input_store. Should be a store, path to directory in file system "
+                "or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..."
+            )
+        ),
+    ] = None,
+    dry_run: Annotated[
+        bool,
+        typer.Option(
+            help="Enable a dry-run: files that would be converted are logged, but no new files are created or changed."
+        ),
+    ] = False,
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            help="Remove any existing v3 metadata at the output location, before migration starts."
+        ),
+    ] = False,
+    force: Annotated[
+        bool,
+        typer.Option(
+            help=(
+                "Only used when --overwrite is given. Allows v3 metadata to be removed when no valid "
+                "v2 metadata exists at the output location."
+            )
+        ),
+    ] = False,
+    remove_v2_metadata: Annotated[
+        bool,
+        typer.Option(
+            help="Remove v2 metadata (if any) from the output location, after migration is complete."
+        ),
+    ] = False,
+) -> None:
+    """Migrate all v2 metadata in a Zarr hierarchy to v3. This will create a zarr.json file for each level
+    (every group / array). v2 files (.zarray, .zattrs etc.) will be left as-is.
+    """
+    if dry_run:
+        _set_logging_level(verbose=True)
+        logger.info(
+            "Dry run enabled - no new files will be created or changed. "
+            "Log of files that would be created on a real run:"
+        )
+
+    input_zarr_store = sync(make_store(input_store, mode="r+"))
+
+    if output_store is not None:
+        output_zarr_store = sync(make_store(output_store, mode="w-"))
+        write_store = output_zarr_store
+    else:
+        output_zarr_store = None
+        write_store = input_zarr_store
+
+    if overwrite:
+        sync(migrate_metadata.remove_metadata(write_store, 3, force=force, dry_run=dry_run))
+
+    migrate_metadata.migrate_v2_to_v3(
+        input_store=input_zarr_store, output_store=output_zarr_store, dry_run=dry_run
+    )
+
+    if remove_v2_metadata:
+        # There should always be valid v3 metadata at the output location after migration, so force=False
+        sync(migrate_metadata.remove_metadata(write_store, 2, force=False, dry_run=dry_run))
+
+
+@app.command()  # type: ignore[misc]
+def remove_metadata(
+    zarr_format: Annotated[
+        ZarrFormat,
+        typer.Argument(help="Which format's metadata to remove - v2 or v3."),
+    ],
+    store: Annotated[
+        str,
+        typer.Argument(
+            help="Store or path to directory in file system or name of zip file e.g. 'data/example-1.zarr', 's3://example-bucket/example'..."
+        ),
+    ],
+    force: Annotated[
+        bool,
+        typer.Option(
+            help=(
+                "Allow metadata to be deleted when no valid alternative exists e.g. allow deletion of v2 metadata, "
+                "when no v3 metadata is present."
+            )
+        ),
+    ] = False,
+    dry_run: Annotated[
+        bool,
+        typer.Option(
+            help="Enable a dry-run: files that would be deleted are logged, but no files are removed or changed."
+        ),
+    ] = False,
+) -> None:
+    """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr.
+    Note - this will remove metadata files at all levels of the hierarchy (every group and array).
+    """
+    if dry_run:
+        _set_logging_level(verbose=True)
+        logger.info(
+            "Dry run enabled - no files will be deleted or changed. Log of files that would be deleted on a real run:"
+        )
+    input_zarr_store = sync(make_store(store, mode="r+"))
+
+    sync(
+        migrate_metadata.remove_metadata(
+            store=input_zarr_store,
+            zarr_format=cast(Literal[2, 3], int(zarr_format[1:])),
+            force=force,
+            dry_run=dry_run,
+        )
+    )
+
+
+@app.callback()  # type: ignore[misc]
+def main(
+    verbose: Annotated[
+        bool,
+        typer.Option(
+            help="Enable verbose logging - will print info about metadata files being deleted / saved."
+        ),
+    ] = False,
+) -> None:
+    """
+    See available commands below - access help for individual commands with zarr COMMAND --help.
+    """
+    _set_logging_level(verbose=verbose)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/src/zarr/metadata/__init__.py b/src/zarr/metadata/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/zarr/metadata/migrate_v3.py b/src/zarr/metadata/migrate_v3.py
new file mode 100644
index 0000000000..c2a2f4494a
--- /dev/null
+++ b/src/zarr/metadata/migrate_v3.py
@@ -0,0 +1,294 @@
+import asyncio
+import logging
+from typing import Literal, cast
+
+import numcodecs.abc
+
+import zarr
+from zarr import Array, Group
+from zarr.abc.codec import ArrayArrayCodec, BytesBytesCodec, Codec
+from zarr.abc.store import Store
+from zarr.codecs.blosc import BloscCodec, BloscShuffle
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.gzip import GzipCodec
+from zarr.codecs.transpose import TransposeCodec
+from zarr.codecs.zstd import ZstdCodec
+from zarr.core.buffer.core import default_buffer_prototype
+from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
+from zarr.core.common import (
+    ZARR_JSON,
+    ZARRAY_JSON,
+    ZATTRS_JSON,
+    ZGROUP_JSON,
+    ZMETADATA_V2_JSON,
+    ZarrFormat,
+)
+from zarr.core.dtype.common import HasEndianness
+from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
+from zarr.core.group import GroupMetadata
+from zarr.core.metadata.v2 import ArrayV2Metadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
+from zarr.core.sync import sync
+from zarr.registry import get_codec_class
+from zarr.storage import StorePath
+
+_logger = logging.getLogger(__name__)
+
+
+def migrate_v2_to_v3(
+    *,
+    input_store: Store,
+    output_store: Store | None = None,
+    dry_run: bool = False,
+) -> None:
+    """Migrate all v2 metadata in a Zarr store to v3.
+
+    This will create a zarr.json file at each level of a Zarr hierarchy (for every group / array).
+    v2 files (.zarray, .zattrs etc.) will be left as-is.
+
+    Parameters
+    ----------
+    input_store : Store
+        Input Zarr to migrate.
+    output_store : Store, optional
+        Output location to write v3 metadata (no array data will be copied). If not provided, v3 metadata will be
+        written to input_store.
+    dry_run : bool, optional
+        Enable a 'dry run' - files that would be created are logged, but no files are created or changed.
+    """
+
+    zarr_v2 = zarr.open(store=input_store, mode="r+")
+
+    if output_store is not None:
+        # w- access to not allow overwrite of existing data
+        output_path = sync(StorePath.open(output_store, path="", mode="w-"))
+    else:
+        output_path = zarr_v2.store_path
+
+    migrate_to_v3(zarr_v2, output_path, dry_run=dry_run)
+
+
+def migrate_to_v3(zarr_v2: Array | Group, output_path: StorePath, dry_run: bool = False) -> None:
+    """Migrate all v2 metadata in a Zarr array/group to v3.
+
+    Note - if a group is provided, then all arrays / groups within this group will also be converted.
+    A zarr.json file will be created for each level and written to output_path, with any v2 files
+    (.zarray, .zattrs etc.) left as-is.
+
+    Parameters
+    ----------
+    zarr_v2 : Array | Group
+        An array or group with zarr_format = 2
+    output_path : StorePath
+        The store path to write generated v3 metadata to.
+    dry_run : bool, optional
+        Enable a 'dry run' - files that would be created are logged, but no files are created or changed.
+    """
+    if zarr_v2.metadata.zarr_format != 2:
+        raise TypeError("Only arrays / groups with zarr v2 metadata can be converted")
+
+    if isinstance(zarr_v2.metadata, GroupMetadata):
+        _convert_group(zarr_v2, output_path, dry_run)
+    else:
+        _convert_array(zarr_v2, output_path, dry_run)
+
+
+async def remove_metadata(
+    store: Store,
+    zarr_format: ZarrFormat,
+    force: bool = False,
+    dry_run: bool = False,
+) -> None:
+    """Remove all v2 (.zarray, .zattrs, .zgroup, .zmetadata) or v3 (zarr.json) metadata files from the given Zarr.
+
+    Note - this will remove metadata files at all levels of the hierarchy (every group and array).
+
+    Parameters
+    ----------
+    store : Store
+        Zarr to remove metadata from.
+    zarr_format : ZarrFormat
+        Which format's metadata to remove - 2 or 3.
+    force : bool, optional
+        When False, metadata can only be removed if a valid alternative exists e.g. deletion of v2 metadata will
+        only be allowed when v3 metadata is also present. When True, metadata can be removed when there is no
+        alternative.
+    dry_run : bool, optional
+        Enable a 'dry run' - files that would be deleted are logged, but no files are removed or changed.
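+
+    Examples
+    --------
+    A minimal sketch (the store path is a placeholder; v3 metadata must already
+    exist at that location, since ``force`` defaults to False):
+
+    >>> from zarr.core.sync import sync
+    >>> from zarr.storage import LocalStore
+    >>> sync(remove_metadata(LocalStore('data/example.zarr'), zarr_format=2))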
+    """
+
+    if not store.supports_deletes:
+        raise ValueError("Store must support deletes to remove metadata")
+    store_path = await StorePath.open(store, path="", mode="r+")
+
+    metadata_files_all = {
+        2: [ZARRAY_JSON, ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON],
+        3: [ZARR_JSON],
+    }
+
+    if zarr_format == 2:
+        alternative_metadata = 3
+    else:
+        alternative_metadata = 2
+
+    awaitables = []
+    async for file_path in store.list():
+        parent_path, _, file_name = file_path.rpartition("/")
+
+        if file_name not in metadata_files_all[zarr_format]:
+            continue
+
+        if force or await _metadata_exists(
+            cast(Literal[2, 3], alternative_metadata), store_path / parent_path
+        ):
+            _logger.info("Deleting metadata at %s", store_path / file_path)
+            if not dry_run:
+                awaitables.append((store_path / file_path).delete())
+        else:
+            raise ValueError(
+                f"Cannot remove v{zarr_format} metadata at {store_path / file_path} - no v{alternative_metadata} "
+                "metadata exists. To delete anyway, use the 'force' option."
+            )
+
+    await asyncio.gather(*awaitables)
+
+
+def _convert_group(zarr_v2: Group, output_path: StorePath, dry_run: bool) -> None:
+    if zarr_v2.metadata.consolidated_metadata is not None:
+        raise NotImplementedError("Migration of consolidated metadata isn't supported.")
+
+    # process members of the group
+    for key in zarr_v2:
+        migrate_to_v3(zarr_v2[key], output_path=output_path / key, dry_run=dry_run)
+
+    # write group's converted metadata
+    group_metadata_v3 = GroupMetadata(
+        attributes=zarr_v2.metadata.attributes, zarr_format=3, consolidated_metadata=None
+    )
+    sync(_save_v3_metadata(group_metadata_v3, output_path, dry_run=dry_run))
+
+
+def _convert_array(zarr_v2: Array, output_path: StorePath, dry_run: bool) -> None:
+    array_metadata_v3 = _convert_array_metadata(cast(ArrayV2Metadata, zarr_v2.metadata))
+    sync(_save_v3_metadata(array_metadata_v3, output_path, dry_run=dry_run))
+
+
+async def _metadata_exists(zarr_format: ZarrFormat, store_path: StorePath) -> bool:
+    metadata_files_required = {2: [ZARRAY_JSON, ZGROUP_JSON], 3: [ZARR_JSON]}
+
+    for metadata_file in metadata_files_required[zarr_format]:
+        if await (store_path / metadata_file).exists():
+            return True
+
+    return False
+
+
+def _convert_array_metadata(metadata_v2: ArrayV2Metadata) -> ArrayV3Metadata:
+    chunk_key_encoding = V2ChunkKeyEncoding(separator=metadata_v2.dimension_separator)
+
+    codecs: list[Codec] = []
+
+    # array-array codecs
+    if metadata_v2.order == "F":
+        # F is equivalent to order: n-1, ... 1, 0
+        codecs.append(TransposeCodec(order=list(range(len(metadata_v2.shape) - 1, -1, -1))))
+
+    if metadata_v2.filters is not None:
+        codecs.extend(_convert_filters(metadata_v2.filters))
+
+    # array-bytes codecs
+    if not isinstance(metadata_v2.dtype, HasEndianness):
+        codecs.append(BytesCodec(endian=None))
+    else:
+        codecs.append(BytesCodec(endian=metadata_v2.dtype.endianness))
+
+    # bytes-bytes codecs
+    if metadata_v2.compressor is not None:
+        bytes_bytes_codec = _convert_compressor(metadata_v2.compressor, metadata_v2.dtype)
+        codecs.append(bytes_bytes_codec)
+
+    return ArrayV3Metadata(
+        shape=metadata_v2.shape,
+        data_type=metadata_v2.dtype,
+        chunk_grid=metadata_v2.chunk_grid,
+        chunk_key_encoding=chunk_key_encoding,
+        fill_value=metadata_v2.fill_value,
+        codecs=codecs,
+        attributes=metadata_v2.attributes,
+        dimension_names=None,
+        storage_transformers=None,
+    )
+
+
+def _convert_filters(filters: tuple[numcodecs.abc.Codec, ...]) -> list[ArrayArrayCodec]:
+    filters_codecs = [_find_numcodecs_zarr3(filter) for filter in filters]
+    for codec in filters_codecs:
+        if not isinstance(codec, ArrayArrayCodec):
+            raise TypeError(f"Filter {type(codec)} is not an ArrayArrayCodec")
+
+    return cast(list[ArrayArrayCodec], filters_codecs)
+
+
+def _convert_compressor(
+    compressor: numcodecs.abc.Codec, dtype: ZDType[TBaseDType, TBaseScalar]
+) -> BytesBytesCodec:
+    match compressor.codec_id:
+        case "blosc":
+            return BloscCodec(
+                typesize=dtype.to_native_dtype().itemsize,
+                cname=compressor.cname,
+                clevel=compressor.clevel,
+                shuffle=BloscShuffle.from_int(compressor.shuffle),
+                blocksize=compressor.blocksize,
+            )
+
+        case "zstd":
+            return ZstdCodec(
+                level=compressor.level,
+                checksum=compressor.checksum,
+            )
+
+        case "gzip":
+            return GzipCodec(level=compressor.level)
+
+        case _:
+            # If possible, find matching numcodecs.zarr3 codec
+            compressor_codec = _find_numcodecs_zarr3(compressor)
+
+            if not isinstance(compressor_codec, BytesBytesCodec):
+                raise TypeError(f"Compressor {type(compressor_codec)} is not a BytesBytesCodec")
+
+            return compressor_codec
+
+
+def _find_numcodecs_zarr3(numcodecs_codec: numcodecs.abc.Codec) -> Codec:
+    """Find matching numcodecs.zarr3 codec (if it exists)"""
+
+    numcodec_name = f"numcodecs.{numcodecs_codec.codec_id}"
+    numcodec_dict = {
+        "name": numcodec_name,
+        "configuration": numcodecs_codec.get_config(),
+    }
+
+    try:
+        codec_v3 = get_codec_class(numcodec_name)
+    except KeyError as exc:
+        raise ValueError(
+            f"Couldn't find corresponding numcodecs.zarr3 codec for {numcodecs_codec.codec_id}"
+        ) from exc
+
+    return codec_v3.from_dict(numcodec_dict)
+
+
+async def _save_v3_metadata(
+    metadata_v3: ArrayV3Metadata | GroupMetadata, output_path: StorePath, dry_run: bool = False
+) -> None:
+    zarr_json_path = output_path / ZARR_JSON
+    if await zarr_json_path.exists():
+        raise ValueError(f"{ZARR_JSON} already exists at {zarr_json_path}")
+
+    _logger.info("Saving metadata to %s", zarr_json_path)
+    to_save = metadata_v3.to_buffer_dict(default_buffer_prototype())
+
+    if not dry_run:
+        await zarr_json_path.set_if_not_exists(to_save[ZARR_JSON])
diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py
index 3a63b30e9b..d3c0ade804 100644
--- a/src/zarr/storage/_common.py
+++ b/src/zarr/storage/_common.py
@@ -267,45 +267,30 @@ def __eq__(self, other: object) -> bool:
 StoreLike: TypeAlias = Store | StorePath | FSMap | Path | str | dict[str, Buffer]
 
 
-async def make_store_path(
+async def make_store(
     store_like: StoreLike | None,
     *,
-    path: str | None = "",
     mode: AccessModeLiteral | None = None,
     storage_options: dict[str, Any] | None = None,
-) -> StorePath:
+) -> Store:
     """
-    Convert a `StoreLike` object into a StorePath object.
-
-    This function takes a `StoreLike` object and returns a `StorePath` object. The
-    `StoreLike` object can be a `Store`, `StorePath`, `Path`, `str`, or `dict[str, Buffer]`.
-    If the `StoreLike` object is a Store or `StorePath`, it is converted to a
-    `StorePath` object. If the `StoreLike` object is a Path or str, it is converted
-    to a LocalStore object and then to a `StorePath` object. If the `StoreLike`
-    object is a dict[str, Buffer], it is converted to a `MemoryStore` object and
-    then to a `StorePath` object.
+    Convert a `StoreLike` object into a Store object.
 
-    If the `StoreLike` object is None, a `MemoryStore` object is created and
-    converted to a `StorePath` object.
+    `StoreLike` objects are converted to `Store` as follows:
 
-    If the `StoreLike` object is a str and starts with a protocol, it is
-    converted to a RemoteStore object and then to a `StorePath` object.
-
-    If the `StoreLike` object is a dict[str, Buffer] and the mode is not None,
-    the `MemoryStore` object is created with the given mode.
-
-    If the `StoreLike` object is a str and starts with a protocol, the
-    RemoteStore object is created with the given mode and storage options.
+    - `Store` or `StorePath` = `Store` object.
+    - `Path` or `str` = `LocalStore` object.
+    - `str` that starts with a protocol = `FsspecStore` object.
+    - `dict[str, Buffer]` = `MemoryStore` object.
+    - `None` = `MemoryStore` object.
+    - `FSMap` = `FsspecStore` object.
 
     Parameters
     ----------
     store_like : StoreLike | None
-        The object to convert to a `StorePath` object.
-    path : str | None, optional
-        The path to use when creating the `StorePath` object. If None, the
-        default path is the empty string.
+        The object to convert to a `Store` object.
     mode : StoreAccessMode | None, optional
-        The mode to use when creating the `StorePath` object. If None, the
+        The mode to use when creating the `Store` object. If None, the
         default mode is 'r'.
     storage_options : dict[str, Any] | None, optional
         The storage options to use when creating the `RemoteStore` object. If
@@ -313,18 +298,16 @@ async def make_store_path(
         None, the default storage options are used.
 
     Returns
     -------
-    StorePath
-        The converted StorePath object.
+    Store
+        The converted Store object.
 
     Raises
     ------
     TypeError
-        If the StoreLike object is not one of the supported types.
+        If the StoreLike object is not one of the supported types, or if storage_options is provided but not used.
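+
+    Examples
+    --------
+    A sketch of the expected conversions (paths are placeholders)::
+
+        from zarr.core.sync import sync
+
+        local_store = sync(make_store("data/example.zarr"))  # -> LocalStore
+        memory_store = sync(make_store(None))  # -> MemoryStore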
     """
     from zarr.storage._fsspec import FsspecStore  # circular import
 
-    path_normalized = normalize_path(path)
-
     if (
         not (isinstance(store_like, str) and _is_fsspec_uri(store_like))
        and storage_options is not None
@@ -338,50 +321,107 @@ async def make_store_path(
     _read_only = mode == "r"
 
     if isinstance(store_like, StorePath):
-        # Already a StorePath
-        return store_like / path_normalized
+        # Get underlying store
+        return store_like.store
     elif isinstance(store_like, Store):
         # Already a Store
-        store = store_like
+        return store_like
     elif isinstance(store_like, dict):
         # Already a dictionary that can be a MemoryStore
         #
         # We deliberate only consider dict[str, Buffer] here, and not arbitrary mutable mappings.
         # By only allowing dictionaries, which are in-memory, we know that MemoryStore appropriate.
-        store = await MemoryStore.open(store_dict=store_like, read_only=_read_only)
+        return await MemoryStore.open(store_dict=store_like, read_only=_read_only)
     elif store_like is None:
         # Create a new in-memory store
-        return await make_store_path({}, path=path, mode=mode, storage_options=storage_options)
+        return await make_store({}, mode=mode, storage_options=storage_options)
     elif isinstance(store_like, Path):
         # Create a new LocalStore
-        store = await LocalStore.open(root=store_like, read_only=_read_only)
+        return await LocalStore.open(root=store_like, read_only=_read_only)
     elif isinstance(store_like, str):
         # Either a FSSpec URI or a local filesystem path
         if _is_fsspec_uri(store_like):
-            store = FsspecStore.from_url(
+            return FsspecStore.from_url(
                 store_like, storage_options=storage_options, read_only=_read_only
             )
         else:
             # Assume a filesystem path
-            return await make_store_path(
-                Path(store_like), path=path, mode=mode, storage_options=storage_options
-            )
+            return await make_store(Path(store_like), mode=mode, storage_options=storage_options)
     elif _has_fsspec and isinstance(store_like, FSMap):
-        if path:
-            raise ValueError(
-                "'path' was provided but is not used for FSMap store_like objects. Specify the path when creating the FSMap instance instead."
-            )
-        store = FsspecStore.from_mapper(store_like, read_only=_read_only)
+        return FsspecStore.from_mapper(store_like, read_only=_read_only)
+
     else:
         raise TypeError(f"Unsupported type for store_like: '{type(store_like).__name__}'")
 
-    return await StorePath.open(store, path=path_normalized, mode=mode)
+
+async def make_store_path(
+    store_like: StoreLike | None,
+    *,
+    path: str | None = "",
+    mode: AccessModeLiteral | None = None,
+    storage_options: dict[str, Any] | None = None,
+) -> StorePath:
+    """
+    Convert a `StoreLike` object into a StorePath object.
+
+    This function takes a `StoreLike` object and returns a `StorePath` object. See `make_store` for details
+    of which `Store` is used for each type of `store_like` object.
+
+    Parameters
+    ----------
+    store_like : StoreLike | None
+        The object to convert to a `StorePath` object.
+    path : str | None, optional
+        The path to use when creating the `StorePath` object. If None, the
+        default path is the empty string.
+    mode : StoreAccessMode | None, optional
+        The mode to use when creating the `StorePath` object. If None, the
+        default mode is 'r'.
+    storage_options : dict[str, Any] | None, optional
+        The storage options to use when creating the `RemoteStore` object. If
+        None, the default storage options are used.
+
+    Returns
+    -------
+    StorePath
+        The converted StorePath object.
+
+    Raises
+    ------
+    TypeError
+        If the StoreLike object is not one of the supported types, or if storage_options is provided but not used.
+    ValueError
+        If path is provided for a store that does not support it.
+
+    See Also
+    --------
+    make_store
+    """
+    path_normalized = normalize_path(path)
+
+    if isinstance(store_like, StorePath):
+        # Already a StorePath
+        if storage_options:
+            raise TypeError(
+                "'storage_options' was provided but unused. "
+                "'storage_options' is only used when the store is passed as a FSSpec URI string.",
+            )
+        return store_like / path_normalized
+
+    elif _has_fsspec and isinstance(store_like, FSMap) and path:
+        raise ValueError(
+            "'path' was provided but is not used for FSMap store_like objects. Specify the path when creating the FSMap instance instead."
+        )
+
+    else:
+        store = await make_store(store_like, mode=mode, storage_options=storage_options)
+        return await StorePath.open(store, path=path_normalized, mode=mode)
 
 
 def _is_fsspec_uri(uri: str) -> bool:
diff --git a/tests/conftest.py b/tests/conftest.py
index a1bf423c06..7128ed51ea 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,6 +3,7 @@
 import math
 import os
 import pathlib
+import sys
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING
@@ -12,6 +13,7 @@
 import pytest
 from hypothesis import HealthCheck, Verbosity, settings
 
+import zarr.registry
 from zarr import AsyncGroup, config
 from zarr.abc.store import Store
 from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
@@ -177,6 +179,27 @@ def zarr_format(request: pytest.FixtureRequest) -> ZarrFormat:
         raise ValueError(msg)
 
 
+def _clear_registries() -> None:
+    registries = zarr.registry._collect_entrypoints()
+    for registry in registries:
+        registry.lazy_load_list.clear()
+
+
+@pytest.fixture
+def set_path() -> Generator[None, None, None]:
+    tests_dir = str(pathlib.Path(__file__).parent.absolute())
+    sys.path.append(tests_dir)
+    _clear_registries()
+    zarr.registry._collect_entrypoints()
+
+    yield
+
+    sys.path.remove(tests_dir)
+    _clear_registries()
+    zarr.registry._collect_entrypoints()
+    config.reset()
+
+
 def pytest_addoption(parser: Any) -> None:
     parser.addoption(
         "--run-slow-hypothesis",
diff --git a/tests/test_cli/conftest.py b/tests/test_cli/conftest.py
new file mode 100644
index 0000000000..4f95f47b5e
--- /dev/null
+++ b/tests/test_cli/conftest.py
@@ -0,0 +1,146 @@
+from pathlib import Path
+from typing import Any, Literal
+
+import pytest
+
+import zarr
+from zarr.abc.store import Store
+from zarr.core.common import ZarrFormat
+
+
+def create_nested_zarr(
+    store: Store,
+    attributes: dict[str, Any] | None = None,
+    separator: Literal[".", "/"] = ".",
+    zarr_format: ZarrFormat = 2,
+) -> list[str]:
+    """Create a zarr with nested groups / arrays for testing, returning the paths to all."""
+
+    if attributes is None:
+        attributes = {"baz": 42, "qux": [1, 4, 7, 12]}
+
+    # 3 levels of nested groups
+    group_0 = zarr.create_group(store=store, zarr_format=zarr_format, attributes=attributes)
+    group_1 = group_0.create_group(name="group_1", attributes=attributes)
+    group_2 = group_1.create_group(name="group_2", attributes=attributes)
+    paths = [group_0.path, group_1.path, group_2.path]
+
+    # 1 array per group
+    for i, group in enumerate([group_0, group_1, group_2]):
+        array = group.create_array(
+            name=f"array_{i}",
+            shape=(10, 10),
+            chunks=(5, 5),
+            dtype="uint16",
+            attributes=attributes,
+            chunk_key_encoding={"name": "v2", "separator": separator},
+        )
+        array[:] = 1
+        paths.append(array.path)
+
+    return paths
+
+
+@pytest.fixture
+def expected_paths() -> list[Path]:
+    """Expected paths for create_nested_zarr, with no metadata files or chunks"""
+    return [
+        Path("array_0"),
+        Path("group_1"),
+        Path("group_1/array_1"),
+        Path("group_1/group_2"),
+        Path("group_1/group_2/array_2"),
+    ]
+
+
+@pytest.fixture
+def expected_chunks() -> list[Path]:
+    """Expected chunks for create_nested_zarr"""
+    return [
+        Path("array_0/0.0"),
+        Path("array_0/0.1"),
+        Path("array_0/1.0"),
+        Path("array_0/1.1"),
+        Path("group_1/array_1/0.0"),
+        Path("group_1/array_1/0.1"),
+        Path("group_1/array_1/1.0"),
+        Path("group_1/array_1/1.1"),
+        Path("group_1/group_2/array_2/0.0"),
+        Path("group_1/group_2/array_2/0.1"),
+        Path("group_1/group_2/array_2/1.0"),
+        Path("group_1/group_2/array_2/1.1"),
+    ]
+
+
+@pytest.fixture
+def expected_v3_metadata() -> list[Path]:
+    """Expected v3 metadata for create_nested_zarr"""
+    return sorted(
+        [
+            Path("zarr.json"),
+            Path("array_0/zarr.json"),
+            Path("group_1/zarr.json"),
+            Path("group_1/array_1/zarr.json"),
+            Path("group_1/group_2/zarr.json"),
+            Path("group_1/group_2/array_2/zarr.json"),
+        ]
+    )
+
+
+@pytest.fixture
+def expected_v2_metadata() -> list[Path]:
+    """Expected v2 metadata for create_nested_zarr"""
+    return sorted(
+        [
+            Path(".zgroup"),
+            Path(".zattrs"),
+            Path("array_0/.zarray"),
+            Path("array_0/.zattrs"),
+            Path("group_1/.zgroup"),
+            Path("group_1/.zattrs"),
+            Path("group_1/array_1/.zarray"),
+            Path("group_1/array_1/.zattrs"),
+            Path("group_1/group_2/.zgroup"),
+            Path("group_1/group_2/.zattrs"),
+            Path("group_1/group_2/array_2/.zarray"),
+            Path("group_1/group_2/array_2/.zattrs"),
+        ]
+    )
+
+
+@pytest.fixture
+def expected_paths_no_metadata(
+    expected_paths: list[Path], expected_chunks: list[Path]
+) -> list[Path]:
+    return sorted(expected_paths + expected_chunks)
+
+
+@pytest.fixture
+def expected_paths_v3_metadata(
+    expected_paths: list[Path], expected_chunks: list[Path], expected_v3_metadata: list[Path]
+) -> list[Path]:
+    return sorted(expected_paths + expected_chunks + expected_v3_metadata)
+
+
+@pytest.fixture
+def expected_paths_v3_metadata_no_chunks(
+    expected_paths: list[Path], expected_v3_metadata: list[Path]
+) -> list[Path]:
+    return sorted(expected_paths + expected_v3_metadata)
+
+
+@pytest.fixture
+def expected_paths_v2_metadata(
+    expected_paths: list[Path], expected_chunks: list[Path], expected_v2_metadata: list[Path]
+) -> list[Path]:
+    return sorted(expected_paths + expected_chunks + expected_v2_metadata)
+
+
+@pytest.fixture
+def expected_paths_v2_v3_metadata(
+    expected_paths: list[Path],
+    expected_chunks: list[Path],
+    expected_v2_metadata: list[Path],
+    expected_v3_metadata: list[Path],
+) -> list[Path]:
+    return sorted(expected_paths + expected_chunks + expected_v2_metadata + expected_v3_metadata)
diff --git a/tests/test_cli/test_migrate_v3.py b/tests/test_cli/test_migrate_v3.py
new file mode 100644
index 0000000000..b72562e5a3
--- /dev/null
+++ b/tests/test_cli/test_migrate_v3.py
@@ -0,0 +1,663 @@
+import lzma
+from pathlib import Path
+from typing import Literal, cast
+
+import numcodecs
+import numcodecs.abc
+import numpy as np
+import pytest
+
+import zarr
+from tests.test_cli.conftest import create_nested_zarr
+from zarr.abc.codec import Codec
+from zarr.codecs.blosc import BloscCodec
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.gzip import GzipCodec
+from zarr.codecs.transpose import TransposeCodec
+from zarr.codecs.zstd import ZstdCodec
+from zarr.core.array import Array
+from zarr.core.chunk_grids import RegularChunkGrid
+from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
+from zarr.core.common import JSON, ZarrFormat
+from zarr.core.dtype.npy.int import UInt8, UInt16
+from zarr.core.group import Group, GroupMetadata
+from zarr.core.metadata.v3 import ArrayV3Metadata
+from zarr.storage._local import LocalStore
+
+typer_testing = pytest.importorskip(
+    "typer.testing", reason="optional cli dependencies aren't installed"
+)
+cli = pytest.importorskip("zarr._cli.cli", reason="optional cli dependencies aren't installed")
+
+runner = typer_testing.CliRunner()
+
+NUMCODECS_USER_WARNING = "Numcodecs codecs are not in the Zarr version 3 specification and may not be supported by other zarr implementations."
+
+
+def test_migrate_array(local_store: LocalStore) -> None:
+    shape = (10, 10)
+    chunks = (10, 10)
+    dtype = "uint16"
+    compressors = numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1)
+    fill_value = 2
+    attributes = cast(dict[str, JSON], {"baz": 42, "qux": [1, 4, 7, 12]})
+
+    zarr.create_array(
+        store=local_store,
+        shape=shape,
+        chunks=chunks,
+        dtype=dtype,
+        compressors=compressors,
+        zarr_format=2,
+        fill_value=fill_value,
+        attributes=attributes,
+    )
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open(local_store.root, zarr_format=3)
+
+    expected_metadata = ArrayV3Metadata(
+        shape=shape,
+        data_type=UInt16(endianness="little"),
+        chunk_grid=RegularChunkGrid(chunk_shape=chunks),
+        chunk_key_encoding=V2ChunkKeyEncoding(separator="."),
+        fill_value=fill_value,
+        codecs=(
+            BytesCodec(endian="little"),
+            BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0),
+        ),
+        attributes=attributes,
+        dimension_names=None,
+        storage_transformers=None,
+    )
+    assert zarr_array.metadata == expected_metadata
+
+
+def test_migrate_group(local_store: LocalStore) -> None:
+    attributes = {"baz": 42, "qux": [1, 4, 7, 12]}
+    zarr.create_group(store=local_store, zarr_format=2, attributes=attributes)
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open(local_store.root, zarr_format=3)
+    expected_metadata = GroupMetadata(
+        attributes=attributes, zarr_format=3, consolidated_metadata=None
+    )
+    assert zarr_array.metadata == expected_metadata
+
+
+@pytest.mark.parametrize("separator", [".", "/"])
+def test_migrate_nested_groups_and_arrays_in_place(
+    local_store: LocalStore, separator: str, expected_v3_metadata: list[Path]
+) -> None:
+    """Test that zarr.json are made at the correct points in a hierarchy of groups and arrays
+    (including when there are additional dirs due to using a / separator)"""
+
+    attributes = {"baz": 42, "qux": [1, 4, 7, 12]}
+    paths = create_nested_zarr(local_store, attributes=attributes, separator=separator)
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+
+    zarr_json_paths = sorted(local_store.root.rglob("zarr.json"))
+    expected_zarr_json_paths = [local_store.root / p for p in expected_v3_metadata]
+    assert zarr_json_paths == expected_zarr_json_paths
+
+    # Check converted zarr can be opened + metadata accessed at all levels
+    zarr_array = zarr.open(local_store.root, zarr_format=3)
+    for path in paths:
+        zarr_v3 = cast(Array | Group, zarr_array[path])
+        metadata = zarr_v3.metadata
+        assert metadata.zarr_format == 3
+        assert metadata.attributes == attributes
+
+
+@pytest.mark.parametrize("separator", [".", "/"])
+async def test_migrate_nested_groups_and_arrays_separate_location(
+    tmp_path: Path,
+    separator: str,
+    expected_v2_metadata: list[Path],
+    expected_v3_metadata: list[Path],
+) -> None:
+    """Test that zarr.json are made at the correct paths, when saving to a separate output location."""
+
+    input_zarr_path = tmp_path / "input.zarr"
+    output_zarr_path = tmp_path / "output.zarr"
+
+    local_store = await LocalStore.open(str(input_zarr_path))
+    create_nested_zarr(local_store, separator=separator)
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(input_zarr_path), str(output_zarr_path)])
+    assert result.exit_code == 0
+
+    # Files in input zarr should be unchanged i.e. still v2 only
+    zarr_json_paths = sorted(input_zarr_path.rglob("zarr.json"))
+    assert len(zarr_json_paths) == 0
+
+    paths = [
+        path
+        for path in input_zarr_path.rglob("*")
+        if path.stem in [".zarray", ".zgroup", ".zattrs"]
+    ]
+    expected_paths = [input_zarr_path / p for p in expected_v2_metadata]
+    assert sorted(paths) == expected_paths
+
+    # Files in output zarr should only contain v3 metadata
+    zarr_json_paths = sorted(output_zarr_path.rglob("zarr.json"))
+    expected_zarr_json_paths = [output_zarr_path / p for p in expected_v3_metadata]
+    assert zarr_json_paths == expected_zarr_json_paths
+
+
+def test_remove_v2_metadata_option_in_place(
+    local_store: LocalStore, expected_paths_v3_metadata: list[Path]
+) -> None:
+    create_nested_zarr(local_store)
+
+    # convert v2 metadata to v3, then remove v2 metadata
+    result = runner.invoke(
+        cli.app, ["migrate", "v3", str(local_store.root), "--remove-v2-metadata"]
+    )
+    assert result.exit_code == 0
+
+    paths = sorted(local_store.root.rglob("*"))
+    expected_paths = [local_store.root / p for p in expected_paths_v3_metadata]
+    assert paths == expected_paths
+
+
+async def test_remove_v2_metadata_option_separate_location(
+    tmp_path: Path,
+    expected_paths_v2_metadata: list[Path],
+    expected_paths_v3_metadata_no_chunks: list[Path],
+) -> None:
+    """Check that when using --remove-v2-metadata with a separate output location, no v2 metadata is removed from
+    the input location."""
+
+    input_zarr_path = tmp_path / "input.zarr"
+    output_zarr_path = tmp_path / "output.zarr"
+
+    local_store = await LocalStore.open(str(input_zarr_path))
+    create_nested_zarr(local_store)
+
+    result = runner.invoke(
+        cli.app,
+        ["migrate", "v3", str(input_zarr_path), str(output_zarr_path), "--remove-v2-metadata"],
+    )
+    assert result.exit_code == 0
+
+    # input image should be unchanged
+    paths = sorted(input_zarr_path.rglob("*"))
+    expected_paths = [input_zarr_path / p for p in expected_paths_v2_metadata]
+    assert paths == expected_paths
+
+    # output image should be only v3 metadata
+    paths = sorted(output_zarr_path.rglob("*"))
+    expected_paths = [output_zarr_path / p for p in expected_paths_v3_metadata_no_chunks]
+    assert paths == expected_paths
+
+
+def test_overwrite_option_in_place(
+    local_store: LocalStore, expected_paths_v2_v3_metadata: list[Path]
+) -> None:
+    create_nested_zarr(local_store)
+
+    # add v3 metadata in place
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+
+    # check that v3 metadata can be overwritten with --overwrite
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root), "--overwrite"])
+    assert result.exit_code == 0
+
+    paths = sorted(local_store.root.rglob("*"))
+    expected_paths = [local_store.root / p for p in expected_paths_v2_v3_metadata]
+    assert paths == expected_paths
+
+
+async def test_overwrite_option_separate_location(
+    tmp_path: Path,
+    expected_paths_v2_metadata: list[Path],
+    expected_paths_v3_metadata_no_chunks: list[Path],
+) -> None:
+    input_zarr_path = tmp_path / "input.zarr"
+    output_zarr_path = tmp_path / "output.zarr"
+
+    local_store = await LocalStore.open(str(input_zarr_path))
+    create_nested_zarr(local_store)
+
+    # create v3 metadata at output_zarr_path
+    result = runner.invoke(
+        cli.app,
+        ["migrate", "v3", str(input_zarr_path), str(output_zarr_path)],
+    )
+    assert result.exit_code == 0
+
+    # re-run with --overwrite option
+    result = runner.invoke(
+        cli.app,
+        ["migrate", "v3", str(input_zarr_path), str(output_zarr_path), "--overwrite", "--force"],
+    )
+    assert result.exit_code == 0
+
+    # original image should be unchanged
+    paths = sorted(input_zarr_path.rglob("*"))
+    expected_paths = [input_zarr_path / p for p in expected_paths_v2_metadata]
+    assert paths == expected_paths
+
+    # output image is only v3 metadata
+    paths = sorted(output_zarr_path.rglob("*"))
+    expected_paths = [output_zarr_path / p for p in expected_paths_v3_metadata_no_chunks]
+    assert paths == expected_paths
+
+
+@pytest.mark.parametrize("separator", [".", "/"])
+def test_migrate_sub_group(
+    local_store: LocalStore, separator: str, expected_v3_metadata: list[Path]
+) -> None:
+    """Test that only arrays/groups within group_1 are converted (+ no other files in store)"""
+
+    create_nested_zarr(local_store, separator=separator)
+    group_path = local_store.root / "group_1"
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(group_path)])
+    assert result.exit_code == 0
+
+    zarr_json_paths = sorted(local_store.root.rglob("zarr.json"))
+    expected_zarr_json_paths = [
+        local_store.root / p
+        for p in expected_v3_metadata
+        if group_path in (local_store.root / p).parents
+    ]
+    assert zarr_json_paths == expected_zarr_json_paths
+
+
+@pytest.mark.parametrize(
+    ("compressor_v2", "compressor_v3"),
+    [
+        (
+            numcodecs.Blosc(cname="zstd", clevel=3, shuffle=1),
+            BloscCodec(typesize=2, cname="zstd", clevel=3, shuffle="shuffle", blocksize=0),
+        ),
+        (numcodecs.Zstd(level=3), ZstdCodec(level=3)),
+        (numcodecs.GZip(level=3), GzipCodec(level=3)),
+    ],
+    ids=["blosc", "zstd", "gzip"],
+)
+def test_migrate_compressor(
+    local_store: LocalStore, compressor_v2: numcodecs.abc.Codec, compressor_v3: Codec
+) -> None:
+    zarr_array = zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        compressors=compressor_v2,
+        zarr_format=2,
+        fill_value=0,
+    )
+    zarr_array[:] = 1
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open_array(local_store.root, zarr_format=3)
+    metadata = zarr_array.metadata
+    assert metadata.zarr_format == 3
+    assert metadata.codecs == (
+        BytesCodec(endian="little"),
+        compressor_v3,
+    )
+    assert np.all(zarr_array[:] == 1)
+
+
+@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning")
+def test_migrate_numcodecs_compressor(local_store: LocalStore) -> None:
+    """Test migration of a numcodecs compressor without a zarr.codecs equivalent."""
+
+    lzma_settings = {
+        "format": lzma.FORMAT_RAW,
+        "check": -1,
+        "preset": None,
+        "filters": [
+            {"id": lzma.FILTER_DELTA, "dist": 4},
+            {"id": lzma.FILTER_LZMA2, "preset": 1},
+        ],
+    }
+
+    zarr_array = zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        compressors=numcodecs.LZMA.from_config(lzma_settings),
+        zarr_format=2,
+        fill_value=0,
+    )
+    zarr_array[:] = 1
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open_array(local_store.root, zarr_format=3)
+    metadata = zarr_array.metadata
+    assert metadata.zarr_format == 3
+    assert metadata.codecs == (
+        BytesCodec(endian="little"),
+        numcodecs.zarr3.LZMA(
+            format=lzma_settings["format"],
+            check=lzma_settings["check"],
+            preset=lzma_settings["preset"],
+            filters=lzma_settings["filters"],
+        ),
+    )
+    assert np.all(zarr_array[:] == 1)
+
+
+@pytest.mark.filterwarnings(f"ignore:{NUMCODECS_USER_WARNING}:UserWarning")
+def test_migrate_filter(local_store: LocalStore) -> None:
+    filter_v2 = numcodecs.Delta(dtype="<i2")
+
+    zarr_array = zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        filters=[filter_v2],
+        compressors=None,
+        zarr_format=2,
+        fill_value=0,
+    )
+    zarr_array[:] = 1
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open_array(local_store.root, zarr_format=3)
+    metadata = zarr_array.metadata
+    assert metadata.zarr_format == 3
+    assert metadata.codecs == (
+        numcodecs.zarr3.Delta(dtype="<i2"),
+        BytesCodec(endian="little"),
+    )
+    assert np.all(zarr_array[:] == 1)
+
+
+@pytest.mark.parametrize(
+    ("order", "expected_codecs"),
+    [
+        ("C", (BytesCodec(endian="little"),)),
+        ("F", (TransposeCodec(order=[1, 0]), BytesCodec(endian="little"))),
+    ],
+    ids=["C", "F"],
+)
+def test_migrate_order(
+    local_store: LocalStore, order: Literal["C", "F"], expected_codecs: tuple[Codec, ...]
+) -> None:
+    zarr_array = zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        compressors=None,
+        zarr_format=2,
+        fill_value=0,
+        order=order,
+    )
+    zarr_array[:] = 1
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open_array(local_store.root, zarr_format=3)
+    metadata = zarr_array.metadata
+    assert metadata.zarr_format == 3
+    assert metadata.codecs == expected_codecs
+    assert np.all(zarr_array[:] == 1)
+
+
+@pytest.mark.parametrize(
+    ("dtype", "expected_data_type", "expected_codecs"),
+    [
+        ("uint8", UInt8(), (BytesCodec(endian=None),)),
+        ("uint16", UInt16(), (BytesCodec(endian="little"),)),
+    ],
+    ids=["single_byte", "multi_byte"],
+)
+def test_migrate_endian(
+    local_store: LocalStore,
+    dtype: str,
+    expected_data_type: UInt8 | UInt16,
+    expected_codecs: tuple[Codec],
+) -> None:
+    zarr_array = zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype=dtype,
+        compressors=None,
+        zarr_format=2,
+        fill_value=0,
+    )
+    zarr_array[:] = 1
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    assert (local_store.root / "zarr.json").exists()
+
+    zarr_array = zarr.open_array(local_store.root, zarr_format=3)
+    metadata = zarr_array.metadata
+    assert metadata.zarr_format == 3
+    assert metadata.data_type == expected_data_type
+    assert metadata.codecs == expected_codecs
+    assert np.all(zarr_array[:] == 1)
+
+
+@pytest.mark.parametrize("node_type", ["array", "group"])
+def test_migrate_v3(local_store: LocalStore, node_type: str) -> None:
+    """Attempting to convert a v3 array/group should always fail"""
+
+    if node_type == "array":
+        zarr.create_array(
+            store=local_store, shape=(10, 10), chunks=(10, 10), zarr_format=3, dtype="uint16"
+        )
+    else:
+        zarr.create_group(store=local_store, zarr_format=3)
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 1
+    assert isinstance(result.exception, TypeError)
+    assert str(result.exception) == "Only arrays / groups with zarr v2 metadata can be converted"
+
+
+def test_migrate_consolidated_metadata(local_store: LocalStore) -> None:
+    """Attempting to convert a group with consolidated metadata should always fail"""
+
+    group = zarr.create_group(store=local_store, zarr_format=2)
+    group.create_array(shape=(1,), name="a", dtype="uint8")
+    zarr.consolidate_metadata(local_store)
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 1
+    assert isinstance(result.exception, NotImplementedError)
+    assert str(result.exception) == "Migration of consolidated metadata isn't supported."
+
+
+def test_migrate_unknown_codec(local_store: LocalStore) -> None:
+    """Attempting to convert a codec without a v3 equivalent should always fail"""
+
+    zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        filters=[numcodecs.Categorize(labels=["a", "b"], dtype=object)],
+        zarr_format=2,
+        fill_value=0,
+    )
+
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 1
+    assert isinstance(result.exception, ValueError)
+    assert (
+        str(result.exception) == "Couldn't find corresponding numcodecs.zarr3 codec for categorize"
+    )
+
+
+def test_migrate_incorrect_filter(local_store: LocalStore) -> None:
+    """Attempting to convert a filter (which is the wrong type of codec) should always fail"""
+
+    zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        filters=[numcodecs.Zstd(level=3)],
+        zarr_format=2,
+        fill_value=0,
+    )
+
+    with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING):
+        result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+
+    assert result.exit_code == 1
+    assert isinstance(result.exception, TypeError)
+    assert (
+        str(result.exception) == "Filter <class 'numcodecs.zarr3.Zstd'> is not an ArrayArrayCodec"
+    )
+
+
+def test_migrate_incorrect_compressor(local_store: LocalStore) -> None:
+    """Attempting to convert a compressor (which is the wrong type of codec) should always fail"""
+
+    zarr.create_array(
+        store=local_store,
+        shape=(10, 10),
+        chunks=(10, 10),
+        dtype="uint16",
+        compressors=numcodecs.Delta(dtype="<i8"),
+        zarr_format=2,
+        fill_value=0,
+    )
+
+    with pytest.warns(UserWarning, match=NUMCODECS_USER_WARNING):
+        result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+
+    assert result.exit_code == 1
+    assert isinstance(result.exception, TypeError)
+    assert (
+        str(result.exception)
+        == "Compressor <class 'numcodecs.zarr3.Delta'> is not a BytesBytesCodec"
+    )
+
+
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_remove_metadata_fails_without_force(
+    local_store: LocalStore, zarr_format: ZarrFormat
+) -> None:
+    """Test removing metadata (when no alternate metadata is present) fails without --force."""
+
+    create_nested_zarr(local_store, zarr_format=zarr_format)
+
+    result = runner.invoke(cli.app, ["remove-metadata", f"v{zarr_format}", str(local_store.root)])
+    assert result.exit_code == 1
+    assert isinstance(result.exception, ValueError)
+    assert str(result.exception).startswith(f"Cannot remove v{zarr_format} metadata at file")
+
+
+@pytest.mark.parametrize("zarr_format", [2, 3])
+def test_remove_metadata_succeeds_with_force(
+    local_store: LocalStore, zarr_format: ZarrFormat, expected_paths_no_metadata: list[Path]
+) -> None:
+    """Test removing metadata (when no alternate metadata is present) succeeds with --force."""
+
+    create_nested_zarr(local_store, zarr_format=zarr_format)
+
+    result = runner.invoke(
+        cli.app, ["remove-metadata", f"v{zarr_format}", str(local_store.root), "--force"]
+    )
+    assert result.exit_code == 0
+
+    paths = sorted(local_store.root.rglob("*"))
+    expected_paths = [local_store.root / p for p in expected_paths_no_metadata]
+    assert paths == expected_paths
+
+
+def test_remove_metadata_sub_group(
+    local_store: LocalStore, expected_paths_no_metadata: list[Path]
+) -> None:
+    """Test only v2 metadata within group_1 is removed and the rest remains unchanged."""
+
+    create_nested_zarr(local_store)
+
+    result = runner.invoke(
+        cli.app, ["remove-metadata", "v2", str(local_store.root / "group_1"), "--force"]
+    )
+    assert result.exit_code == 0
+
+    # check all metadata files inside group_1 are removed (.zattrs / .zgroup / .zarray should remain only inside the top
+    # group)
+    paths = sorted(local_store.root.rglob("*"))
+
+    expected_paths = [local_store.root / p for p in expected_paths_no_metadata]
+    expected_paths.append(local_store.root / ".zattrs")
+    expected_paths.append(local_store.root / ".zgroup")
+    expected_paths.append(local_store.root / "array_0" / ".zarray")
+    expected_paths.append(local_store.root / "array_0" / ".zattrs")
+    assert paths == sorted(expected_paths)
+
+
+@pytest.mark.parametrize(
+    ("zarr_format", "expected_output_paths"),
+    [("v2", "expected_paths_v3_metadata"), ("v3", "expected_paths_v2_metadata")],
+)
+def test_remove_metadata_after_conversion(
+    local_store: LocalStore,
+    request: pytest.FixtureRequest,
+    zarr_format: str,
+    expected_output_paths: str,
+) -> None:
+    """Test all v2/v3 metadata can be removed after metadata conversion (all groups / arrays /
+    metadata of other versions should remain as-is)"""
+
+    create_nested_zarr(local_store)
+
+    # convert v2 metadata to v3 (so now both v2 and v3 metadata present!), then remove either the v2 or v3 metadata
+    result = runner.invoke(cli.app, ["migrate", "v3", str(local_store.root)])
+    assert result.exit_code == 0
+    result = runner.invoke(cli.app, ["remove-metadata", zarr_format, str(local_store.root)])
+    assert result.exit_code == 0
+
+    paths = sorted(local_store.root.rglob("*"))
+    expected_paths = request.getfixturevalue(expected_output_paths)
+    expected_paths = [local_store.root / p for p in expected_paths]
+    assert paths == expected_paths
+
+
+@pytest.mark.parametrize("cli_command", ["migrate", "remove-metadata"])
+def test_dry_run(
+    local_store: LocalStore, cli_command: str, expected_paths_v2_metadata: list[Path]
+) -> None:
+    """Test that all files are unchanged after a dry run"""
+
+    create_nested_zarr(local_store)
+
+    if cli_command == "migrate":
+        result = runner.invoke(
+            cli.app, ["migrate", "v3", str(local_store.root), "--overwrite", "--force", "--dry-run"]
+        )
+    else:
+        result = runner.invoke(
+            cli.app, ["remove-metadata", "v2", str(local_store.root), "--force", "--dry-run"]
+        )
+
+    assert result.exit_code == 0
+
+    paths = sorted(local_store.root.rglob("*"))
+    expected_paths = [local_store.root / p for p in expected_paths_v2_metadata]
+    assert paths == expected_paths
diff --git a/tests/test_codec_entrypoints.py b/tests/test_codec_entrypoints.py
index e1ef027dd4..fc7b79fe54 100644
--- a/tests/test_codec_entrypoints.py
+++ b/tests/test_codec_entrypoints.py
@@ -1,26 +1,8 @@
-import os.path
-import sys
-from collections.abc import Generator
-
 import pytest
 
 import zarr.registry
 from zarr import config
 
-here = os.path.abspath(os.path.dirname(__file__))
-
-
-@pytest.fixture
-def set_path() -> Generator[None, None, None]:
-    sys.path.append(here)
-    zarr.registry._collect_entrypoints()
-    yield
-    sys.path.remove(here)
-    registries = zarr.registry._collect_entrypoints()
-    for registry in registries:
-        registry.lazy_load_list.clear()
-    config.reset()
-
 
 @pytest.mark.usefixtures("set_path")
 @pytest.mark.parametrize("codec_name", ["TestEntrypointCodec", "TestEntrypointGroup.Codec"])
diff --git a/tests/test_dtype_registry.py b/tests/test_dtype_registry.py
index aedda5272c..58b14fe07a 100644
--- a/tests/test_dtype_registry.py
+++ b/tests/test_dtype_registry.py
@@ -1,16 +1,12 @@
 from __future__ import annotations
 
 import re
-import sys
-from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, get_args
 
 import numpy as np
 import pytest
 
-import zarr
 from tests.conftest import skip_object_dtype
-from zarr.core.config import config
 from zarr.core.dtype import (
     AnyDType,
     DataTypeRegistry,
@@ -29,8 +25,6 @@
 )
 
 if TYPE_CHECKING:
-    from collections.abc import Generator
-
     from zarr.core.common import ZarrFormat
 
     from .test_dtype.conftest import zdtype_examples
@@ -147,22 +141,6 @@ def test_match_dtype_unique(
         data_type_registry_fixture.match_json(instance_dict, zarr_format=zarr_format)
 
 
-# this is copied from the registry tests -- we should deduplicate
-here = str(Path(__file__).parent.absolute())
-
-
-@pytest.fixture
-def set_path() -> Generator[None, None, None]:
-    sys.path.append(here)
-    zarr.registry._collect_entrypoints()
-    yield
-    sys.path.remove(here)
-    registries = zarr.registry._collect_entrypoints()
-    for registry in registries:
-        registry.lazy_load_list.clear()
-    config.reset()
-
-
 @pytest.mark.usefixtures("set_path")
 def test_entrypoint_dtype(zarr_format: ZarrFormat) -> None:
     from package_with_entrypoint import TestDataType