diff --git a/doc/conf.py b/doc/conf.py index c9735b1f..8de6c071 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -45,3 +45,5 @@ # a list of builtin themes. # html_theme = "sphinx_rtd_theme" + +autodoc_member_order = "bysource" diff --git a/doc/usage.rst b/doc/usage.rst index 9fabd108..739d083c 100644 --- a/doc/usage.rst +++ b/doc/usage.rst @@ -58,6 +58,7 @@ Bitshuffle .. autoclass:: Bitshuffle :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: Blosc @@ -65,6 +66,7 @@ Blosc .. autoclass:: Blosc :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: Blosc2 @@ -72,6 +74,7 @@ Blosc2 .. autoclass:: Blosc2 :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: BZip2 @@ -79,6 +82,7 @@ BZip2 .. autoclass:: BZip2 :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: FciDecomp @@ -86,6 +90,7 @@ FciDecomp .. autoclass:: FciDecomp :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: LZ4 @@ -93,6 +98,7 @@ LZ4 .. autoclass:: LZ4 :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: Sperr @@ -100,6 +106,7 @@ Sperr .. autoclass:: Sperr :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: SZ @@ -107,6 +114,7 @@ SZ .. autoclass:: SZ :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: SZ3 @@ -114,6 +122,7 @@ SZ3 .. autoclass:: SZ3 :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: Zfp @@ -121,6 +130,7 @@ Zfp .. autoclass:: Zfp :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: Zstd @@ -128,6 +138,7 @@ Zstd .. autoclass:: Zstd :members: + :inherited-members: FilterRefBase, Mapping :undoc-members: Get information about hdf5plugin @@ -158,6 +169,33 @@ Registering with this function is required to perform additional initialisation .. 
autofunction:: register +Get dataset compression ++++++++++++++++++++++++ + +For compression filters provided by HDF5 and `h5py`_ (i.e., GZIP, LZF, SZIP), +dataset compression configuration can be retrieved with `h5py.Dataset`_'s +`compression <https://docs.h5py.org/en/stable/high/dataset.html#h5py.Dataset.compression>`_ and +`compression_opts <https://docs.h5py.org/en/stable/high/dataset.html#h5py.Dataset.compression_opts>`_ properties. + +For third-party compression filters such as the ones supported by `hdf5plugin`, +information about dataset compression is stored in the HDF5 +`filter pipeline <https://support.hdfgroup.org/documentation/hdf5/latest/_h5_d__u_g.html>`_ configuration. +This filter pipeline configuration can be retrieved with the `h5py.Dataset`_ "low-level" API. +For a given `h5py.Dataset`_, ``dataset``: + +.. code-block:: python + + create_plist = dataset.id.get_create_plist() + + for index in range(create_plist.get_nfilters()): + filter_id, _, filter_options, _ = create_plist.get_filter(index) + print(filter_id, filter_options) + +For compression filters supported by `hdf5plugin`, +:func:`hdf5plugin.from_filter_options` instantiates the filter configuration from ``filter_id`` and ``filter_options``. + +.. autofunction:: from_filter_options + Use HDF5 filters in other applications ++++++++++++++++++++++++++++++++++++++ @@ -176,3 +214,4 @@ Setting the ``HDF5_PLUGIN_PATH`` environment variable allows already existing pr .. _h5py: https://www.h5py.org .. _h5py.h5z: https://github.com/h5py/h5py/blob/master/h5py/h5z.pyx .. _h5py.Group.create_dataset: https://docs.h5py.org/en/stable/high/group.html#h5py.Group.create_dataset +.. 
def __repr__(self) -> str:
    """Return a constructor-like representation of this filter.

    The result has the form ``ClassName(key=value, ...)``: the key/value
    pairs are those returned by ``get_config()``, in its iteration order,
    and each value is rendered with ``repr``.
    """
    rendered = [
        f"{key}={setting!r}" for key, setting in self.get_config().items()
    ]
    joined = ", ".join(rendered)
    class_name = self.__class__.__name__
    return f"{class_name}({joined})"
_CNameLiteral = TypeVar("_CNameLiteral", bound=str)


def _cname_from_id(
    compression_id: int, compressions: dict[_CNameLiteral, int]
) -> _CNameLiteral:
    """Return the compressor name registered under ``compression_id``.

    :param compression_id: Numeric identifier to look up.
    :param compressions: Mapping from compressor name to numeric identifier.
    :returns: The first name, in mapping order, whose identifier equals
        ``compression_id``.
    :raises ValueError: No entry of ``compressions`` has this identifier.
    """
    # Private sentinel so that any key, whatever its value, can be returned
    not_found = object()
    match = next(
        (
            name
            for name, identifier in compressions.items()
            if compression_id == identifier
        ),
        not_found,
    )
    if match is not_found:
        raise ValueError(f"Unsupported compression id: {compression_id}")
    return match
@classmethod
def _from_filter_options(cls, filter_options: tuple[int, ...]) -> Bitshuffle:
    """Build a :class:`Bitshuffle` instance from HDF5 "cd_values" options.

    :param filter_options: Values as stored in the dataset filter pipeline.
        The first three entries are ignored; then come, when present:

        - index 3: ``nelems``
        - index 4: compression id
        - index 5: compression level (only used with the `zstd` compressor)

        When ``nelems`` or the compression level is missing, the constructor
        default is used; when the compression id is missing, the filter is
        built with ``cname="none"``.
    :raises ValueError: Unsupported filter_options
    """
    count = len(filter_options)
    if count < 4:
        # Not even nelems is available
        return cls(cname="none")

    block_size = filter_options[3]
    if count < 5:
        return cls(block_size, cname="none")

    compressor = _cname_from_id(filter_options[4], cls.__COMPRESSIONS)
    has_level = count >= 6
    if has_level and compressor == "zstd":
        return cls(block_size, compressor, clevel=filter_options[5])
    return cls(block_size, compressor)
@classmethod
def _from_filter_options(cls, filter_options: tuple[int, ...]) -> Blosc:
    """Build a :class:`Blosc` instance from HDF5 "cd_values" options.

    :param filter_options: Expected format:
        (_, _, _, _, clevel*, shuffle*, compression*).
        Entries marked with * may be missing: a missing compression id
        selects `blosclz`, missing ``clevel``/``shuffle`` use the
        constructor defaults.
    :raises ValueError: Unsupported filter_options
    """
    fallback: Blosc._CNameType = "blosclz"

    if len(filter_options) > 6:
        compressor = _cname_from_id(filter_options[6], cls.__COMPRESSIONS)
    else:
        compressor = fallback

    # clevel (index 4) and shuffle (index 5) are forwarded positionally,
    # but only those that are actually stored
    stored = tuple(filter_options[4:6])
    return cls(compressor, *stored)
@classmethod
def _from_filter_options(cls, filter_options: tuple[int, ...]) -> BZip2:
    """Build a :class:`BZip2` instance from HDF5 "cd_values" options.

    :param filter_options: Expected format: (blocksize,).
        When empty, the constructor default is used.
    :raises ValueError: Unsupported filter_options
    """
    has_blocksize = len(filter_options) > 0
    return cls(blocksize=filter_options[0]) if has_blocksize else cls()
def __init__(self, nbytes: int = 0):
    """Configure the LZ4 filter.

    :param nbytes: The number of bytes per block, in the range
        [0, 2113929216] (0x7E000000).
    :raises ValueError: ``nbytes`` is outside the supported range.
    """
    nbytes = int(nbytes)
    if not 0 <= nbytes <= 0x7E000000:
        # The message used to name "clevel", which is not a parameter of LZ4
        raise ValueError("nbytes must be in the range [0, 2113929216]")
    super().__init__(
        filter_options=(nbytes,),
        config={"nbytes": nbytes},
    )
H5Z_ZFP_MODE_PRECISION") elif accuracy is not None: - accuracyHigh, accuracyLow = struct.unpack( - "II", struct.pack("d", float(accuracy)) - ) - self.filter_options = 3, 0, accuracyHigh, accuracyLow, 0, 0 + accuracy = float(accuracy) + accuracyHigh, accuracyLow = struct.unpack("II", struct.pack("d", accuracy)) + filter_options = 3, 0, accuracyHigh, accuracyLow, 0, 0 + config = {"accuracy": accuracy} logger.info("ZFP mode 3 used. H5Z_ZFP_MODE_ACCURACY") elif reversible: - self.filter_options = 5, 0, 0, 0, 0, 0 + filter_options = 5, 0, 0, 0, 0, 0 + config = {"reversible": True} logger.info("ZFP mode 5 used. H5Z_ZFP_MODE_REVERSIBLE") elif minbits is not None: @@ -470,14 +726,99 @@ def __init__( minbits = int(minbits) maxbits = int(maxbits) maxprec = int(maxprec) - minexp = struct.unpack("I", struct.pack("i", int(minexp)))[0] - self.filter_options = 4, 0, minbits, maxbits, maxprec, minexp + minexp = int(minexp) + minexp_converted = struct.unpack("I", struct.pack("i", minexp))[0] + filter_options = 4, 0, minbits, maxbits, maxprec, minexp_converted + config = { + "minbits": minbits, + "maxbits": maxbits, + "maxprec": maxprec, + "minexp": minexp, + } logger.info("ZFP mode 4 used. 
@classmethod
def _from_filter_options(cls, filter_options: tuple[int, ...]) -> Zfp:
    """Build a :class:`Zfp` instance from HDF5 "cd_values" options.

    The byte order used for decoding is detected from the magic word:
    it is the one in which that word spells ``b"zfp"`` followed by the
    codec version byte.

    :param filter_options: Expected format:
        (info, magic, meta, meta&short_mode, long_mode, long_mode).
        The last two values are only needed for the 64 bits mode encoding.
    :raises ValueError: Unsupported filter_options
    :raises NotImplementedError: Support of filter_options version is not implemented
    """
    # ZFP header parsing reference:
    # zfp.c zfp_read_header() and zfp_stream_mode() functions

    if len(filter_options) < 4:
        raise ValueError(f"Expected at least 4 values, got {len(filter_options)}")

    magic = filter_options[1]
    # Try both byte orders: packing in native order and then decoding as
    # big-endian misreads the version byte on little-endian hosts.
    for endianness in ("<", ">"):
        magic_bytes = struct.pack(f"{endianness}I", magic)
        if magic_bytes.startswith(b"zfp"):
            break
    else:
        raise ValueError("Unsupported options: Wrong Zfp magic number")

    # The byte following b"zfp" is the codec version
    codec_version = magic_bytes[-1]
    if codec_version != 5:
        raise NotImplementedError(
            f"Unsupported version of Zfp codec: {codec_version}"
        )

    # The 12 upper bits (of a 32 bits word) contain the "short" config value
    short_mode = struct.unpack(
        "I", struct.pack(f"{endianness}I", filter_options[3] >> 20)
    )[0]
    if short_mode < cls._ZFP_MODE_SHORT_MAX:
        # 12 bits encoding
        if short_mode < 2048:  # Fixed rate
            # Fixed rate is converted to ZFP parameters taking chunk's ndim into account
            # this cannot be reverted here, it returns the corresponding "expert" mode config
            # See zfp.c zfp_stream_set_rate()
            return cls(
                minbits=short_mode,
                maxbits=short_mode,
                maxprec=cls._ZFP_MAX_PREC,
                minexp=cls._ZFP_MIN_EXP,
            )
        elif short_mode < (2048 + 128):  # Fixed precision
            return cls(precision=short_mode + 1 - 2048)
        elif short_mode == (2048 + 128):  # Reversible
            return cls(reversible=True)
        else:  # Fixed accuracy
            minexp = short_mode + cls._ZFP_MIN_EXP - (2048 + 128 + 1)
            return cls(accuracy=2**minexp)

    # 64 bits encoding
    if len(filter_options) < 6:
        raise ValueError(f"Expected at least 6 values, got {len(filter_options)}")

    long_mode = struct.unpack(
        "Q", struct.pack(f"{endianness}II", filter_options[4], filter_options[5])
    )[0]
    # Fields are stored from the least-significant bits upward:
    # minbits (15 bits), maxbits (15 bits), maxprec (7 bits), minexp (15 bits)
    minbits = (long_mode & 0x7FFF) + 1
    long_mode >>= 15
    maxbits = (long_mode & 0x7FFF) + 1
    long_mode >>= 15
    maxprec = (long_mode & 0x007F) + 1
    long_mode >>= 7
    minexp = (long_mode & 0x7FFF) - 16495
    return cls(minbits=minbits, maxbits=maxbits, maxprec=maxprec, minexp=minexp)
@classmethod
def __unpack_options(cls, meta: int, ret: int) -> tuple[int, float, bool, int]:
    """Decode the Sperr compression settings from two "cd_values" entries.

    :param meta: Packed extra info; bits 6-9 hold the missing value mode.
    :param ret: Packed compression config: a fixed-point quality in the low
        bits, then a sign bit, two mode bits and, above those, the swap flag.
    :returns: (mode, quality, swap, missing_value_mode)
    :raises ValueError: Both mode bits are unset.
    """
    # Number of bits below the two mode bits
    width = cls._INTEGER_BITS + cls._FRACTIONAL_BITS

    # See H5Z_SPERR_decode_cd_values
    # Low mode bit only -> 1, high mode bit only -> 2, both -> 3
    mode = (ret >> width) & 0b11
    if mode == 0:
        raise ValueError("Mode must be in [1, 3], got 0")

    swap = bool(ret >> (width + 3))

    # The bit right below the mode bits is the sign of the quality,
    # the remaining low bits are its fixed-point magnitude
    sign_shift = width - 1
    is_negative = bool((ret >> sign_shift) & 1)
    magnitude_bits = ret & ((1 << sign_shift) - 1)
    magnitude = float(magnitude_bits) / float(1 << cls._FRACTIONAL_BITS)
    quality = -magnitude if is_negative else magnitude
    if mode == 3:
        # In mode 3 the stored value is a base-2 exponent
        quality = 2**quality

    # See h5zsperr_unpack_extra_info
    missing_value_mode = (meta >> 6) & 0b1111

    return mode, quality, swap, missing_value_mode
def _sz_unpack_float64(high: int, low: int) -> float:
    """Rebuild a float64 from the two unsigned 32 bits halves of its encoding.

    :param high: Most-significant 32 bits of the big-endian IEEE 754 double.
    :param low: Least-significant 32 bits of the big-endian IEEE 754 double.
    :returns: The decoded value.
    """
    # Concatenate both halves, most-significant first...
    raw = struct.pack(">II", high, low)
    # ...and read the 8 bytes back as a single big-endian double
    (value,) = struct.unpack(">d", raw)
    return float(value)
compression filters "cd_values" options + + :param filter_options: Expected format: + (_, _, _, _, mode, absolute1, absolute2, relative1, relative2, pointwise1, pointwise2) + :raises ValueError: Unsupported filter_options + """ + if len(filter_options) < 13: + raise ValueError(f"Expected 13 values, got {len(filter_options)}") + + sz_mode = filter_options[4] + if sz_mode == 0: + return cls( + absolute=_sz_unpack_float64(filter_options[5], filter_options[6]) + ) + if sz_mode == 1: + return cls( + relative=_sz_unpack_float64(filter_options[7], filter_options[8]) + ) + if sz_mode == 10: + return cls( + pointwise_relative=_sz_unpack_float64( + filter_options[9], filter_options[10] + ) + ) - @staticmethod - def __pack_float64(error: float) -> tuple[int, int]: - # Pack as big-endian IEEE 754 double - packed = struct.pack(">d", error) - # Unpack most-significant bits as unsigned int - high = struct.unpack(">I", packed[0:4])[0] - # Unpack least-significant bits as unsigned int - low = struct.unpack(">I", packed[4:8])[0] - return high, low + raise ValueError(f"Unsupported sz_mode: {sz_mode}") class SZ3(FilterBase): @@ -763,39 +1214,61 @@ def __init__( # Get SZ3 encoding options: range [0, 5] if absolute is not None: sz_mode = 0 + config = {"absolute": absolute} elif relative is not None: sz_mode = 1 + config = {"relative": relative} elif norm2 is not None: sz_mode = 2 + config = {"norm2": norm2} elif peak_signal_to_noise_ratio is not None: sz_mode = 3 + config = {"peak_signal_to_noise_ratio": peak_signal_to_noise_ratio} if sz_mode not in [0, 2]: logger.warning("Only absolute and norm2 modes properly tested") - compression_opts = ( + filter_options = ( sz_mode, - *self.__pack_float64(absolute or 0.0), - *self.__pack_float64(relative or 0.0), - *self.__pack_float64(norm2 or 0.0), - *self.__pack_float64(peak_signal_to_noise_ratio or 0.0), + *_sz_pack_float64(absolute or 0.0), + *_sz_pack_float64(relative or 0.0), + *_sz_pack_float64(norm2 or 0.0), + 
@classmethod
def _from_filter_options(cls, filter_options: tuple[int, ...]) -> SZ3:
    """Build a :class:`SZ3` instance from HDF5 "cd_values" options.

    :param filter_options: Expected format:
        (_, _, _, _, mode, absolute1, absolute2, relative1, relative2,
        norm2_1, norm2_2, psnr1, psnr2).
        Each pair holds the high and low 32 bits of a float64;
        only the pair selected by ``mode`` is decoded.
    :raises ValueError: Unsupported filter_options
    """
    count = len(filter_options)
    if count < 13:
        raise ValueError(f"Expected 13 values, got {count}")

    # mode -> (constructor keyword, index of the high word of its float64)
    layout = {
        0: ("absolute", 5),
        1: ("relative", 7),
        2: ("norm2", 9),
        3: ("peak_signal_to_noise_ratio", 11),
    }

    sz_mode = filter_options[4]
    if sz_mode not in layout:
        raise ValueError(f"Unsupported sz_mode: {sz_mode}")

    keyword, start = layout[sz_mode]
    bound = _sz_unpack_float64(filter_options[start], filter_options[start + 1])
    return cls(**{keyword: bound})
def from_filter_options(
    filter_id: int | str, filter_options: tuple[int, ...]
) -> FilterBase:
    """Returns corresponding compression filter configuration instance.

    .. code-block:: python

        create_plist = dataset.id.get_create_plist()

        compression_filters = []

        for index in range(create_plist.get_nfilters()):
            filter_id, _, filter_options, _ = create_plist.get_filter(index)
            if filter_id in hdf5plugin.FILTERS.values():
                compression_filters.append(hdf5plugin.from_filter_options(filter_id, filter_options))

    :param filter_id: HDF5 compression filter ID, or filter name (a key of ``FILTERS``)
    :param filter_options: Compression filter configuration as stored in HDF5 datasets
    :returns: Instance built by the ``_from_filter_options`` method of the matching filter class
    :raises ValueError: Unsupported or invalid filter_id, filter_options combination
    :raises NotImplementedError: Given filter or version of the filter is not supported
    """
    if isinstance(filter_id, str):
        # Filter given by name: convert it to its numeric ID
        try:
            filter_id = FILTERS[filter_id]
        except KeyError:
            # The KeyError adds nothing to the message: do not chain it
            raise ValueError(f"Unsupported filter id: {filter_id}") from None

    for filter_cls in FILTER_CLASSES:
        if filter_id == filter_cls.filter_id:
            return filter_cls._from_filter_options(filter_options)

    raise ValueError(f"Unsupported filter id: {filter_id}")
def testBlosc(self):
    """Check filter_options of Blosc instances rebuilt by _from_filter_options"""
    for filter_options, expected_options in (
        # (_, _, _, _, clevel, shuffle, compression_id)
        ((), (0, 0, 0, 0, 5, 1, 0)),  # No options: clevel=5, shuffle=1, compression id 0
        ((2, 2, 4, 40000, 3), (0, 0, 0, 0, 3, 1, 0)),  # custom clevel
        (
            (2, 2, 4, 40000, 3, 2),
            (0, 0, 0, 0, 3, 2, 0),
        ),  # custom clevel and shuffle
        ((2, 2, 4, 40000, 8, 2, 1), (0, 0, 0, 0, 8, 2, 1)),  # all custom
    ):
        with self.subTest(filter_options=filter_options):
            compression_filter = hdf5plugin.Blosc._from_filter_options(
                filter_options
            )
            # The leading entries of the input are expected to come back as 0
            self.assertEqual(compression_filter.filter_options, expected_options)
self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.Blosc2._from_filter_options( + filter_options + ) + self.assertEqual(compression_filter.filter_options, expected_options) + + def testBZip2(self): + for filter_options, expected_options in ( + # (blocksize,) + ((), (9,)), + ((5,), (5,)), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.BZip2._from_filter_options( + filter_options + ) + self.assertEqual(compression_filter.filter_options, expected_options) + + def testFciDecomp(self): + compression_filter = hdf5plugin.FciDecomp._from_filter_options((1, 2, 3)) + self.assertEqual(compression_filter.filter_options, ()) + + def testLZ4(self): + for filter_options, expected_options in ( + # (nbytes,) + ((), (0,)), + ((1024,), (1024,)), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.LZ4._from_filter_options(filter_options) + self.assertEqual(compression_filter.filter_options, expected_options) + + def testSperr(self): + for filter_options, expected_filter in ( + ((1043, 269484032, 128, 0, 0), hdf5plugin.Sperr()), + ( + (1107, 2418016256, 256, 0, 0), + hdf5plugin.Sperr(rate=32, swap=True, missing_value_mode=1), + ), + ((1043, 940177214, 256, 0, 0), hdf5plugin.Sperr(absolute=1e-3)), + ( + (1171, 537001984, 256, 0, 0), + hdf5plugin.Sperr(peak_signal_to_noise_ratio=2.0, missing_value_mode=2), + ), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.Sperr._from_filter_options( + filter_options + ) + self.assertEqual( + compression_filter.filter_options, expected_filter.filter_options + ) + + def testSZ(self): + for filter_options, expected_filter in ( + ( + (1, 0, 0, 256, 10, 0, 0, 0, 0, 1055193269, 2296604913, 0, 0), + hdf5plugin.SZ(), + ), + ( + (1, 0, 0, 256, 0, 1062232653, 3539053052, 0, 0, 0, 0, 0, 0), + hdf5plugin.SZ(absolute=1e-3), + ), + ( + (1, 0, 0, 256, 1, 0, 0, 1062232653, 3539053052, 0, 0, 0, 0), + 
hdf5plugin.SZ(relative=1e-3), + ), + ( + (1, 0, 0, 256, 10, 0, 0, 0, 0, 1062232653, 3539053052, 0, 0), + hdf5plugin.SZ(pointwise_relative=1e-3), + ), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.SZ._from_filter_options(filter_options) + self.assertEqual( + compression_filter.filter_options, expected_filter.filter_options + ) + + def testSZ3(self): + for filter_options, expected_filter in ( + ( + (1, 0, 0, 256, 0, 1058682594, 3944497965, 0, 0, 0, 0, 0, 0), + hdf5plugin.SZ3(), + ), + ( + (1, 0, 0, 256, 0, 1051772663, 2696277389, 0, 0, 0, 0, 0, 0), + hdf5plugin.SZ3(absolute=1e-6), + ), + ( + (1, 0, 0, 256, 1, 0, 0, 1062232653, 3539053052, 0, 0, 0, 0), + hdf5plugin.SZ3(relative=1e-3), + ), + ( + (1, 0, 0, 256, 2, 0, 0, 0, 0, 1062232653, 3539053052, 0, 0), + hdf5plugin.SZ3(norm2=1e-3), + ), + ( + (1, 0, 0, 256, 3, 0, 0, 0, 0, 0, 0, 1062232653, 3539053052), + hdf5plugin.SZ3(peak_signal_to_noise_ratio=1e-3), + ), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.SZ3._from_filter_options(filter_options) + self.assertEqual( + compression_filter.filter_options, expected_filter.filter_options + ) + + def testZfp(self): + for filter_options, expected_filter in ( + ( + (269504785, 91252346, 4026532854, 2167406593), + hdf5plugin.Zfp(precision=20), + ), + ( + (269504785, 91252346, 4026532854, 3404726273), + hdf5plugin.Zfp(accuracy=2**-4), + ), + ( + (269504785, 91252346, 4026532854, 2281701377), + hdf5plugin.Zfp(reversible=True), + ), + ( + (269504785, 91252346, 4026532854, 4293918721, 3767009280, 494351), + hdf5plugin.Zfp(minbits=1, maxbits=16657, maxprec=64, minexp=-1047), + ), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.Zfp._from_filter_options(filter_options) + self.assertEqual( + compression_filter.filter_options, expected_filter.filter_options + ) + + def testZstd(self): + for filter_options, expected_options in ( + # (clevel,) + ((), (3,)), 
+ ((10,), (10,)), + ): + with self.subTest(filter_options=filter_options): + compression_filter = hdf5plugin.Zstd._from_filter_options( + filter_options + ) + self.assertEqual(compression_filter.filter_options, expected_options) + + +class TestFromFilterOptions(unittest.TestCase): + """Test from_filter_options function""" + + def test_filter_name(self): + compression_filter = hdf5plugin.from_filter_options("bzip2", (5,)) + self.assertEqual(compression_filter, hdf5plugin.BZip2(blocksize=5)) + + +class TestFromFilterOptionsRoundtrip(unittest.TestCase): + """Test from_filter_options function roundtrip""" + + def _test( + self, compression_filter: _filters.FilterBase, data: numpy.ndarray[Any, Any] + ): + with h5py.File("in_memory", "w", driver="core", backing_store=False) as h5f: + h5f.create_dataset( + "data", + data=data, + chunks=data.shape, + compression=compression_filter, + ) + h5f.flush() + + plist = h5f["data"].id.get_create_plist() + filters = [plist.get_filter(i) for i in range(plist.get_nfilters())] + + self.assertEqual(len(filters), 1) + filter_id, _, filter_options, _ = filters[0] + + retrieved_filter = hdf5plugin.from_filter_options(filter_id, filter_options) + + self.assertEqual( + compression_filter, + retrieved_filter, + msg=f"{(compression_filter.filter_id, compression_filter.filter_options)} != {(retrieved_filter.filter_id, retrieved_filter.filter_options)}", + ) + + @unittest.skipUnless(should_test("bshuf"), "Bitshuffle filter not available") + def testBitshuffle(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.Bitshuffle(), data) + + @unittest.skipUnless(should_test("blosc"), "Blosc filter not available") + def testBlosc(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.Blosc(), data) + + @unittest.skipUnless(should_test("blosc2"), "Blosc2 filter not available") + def testBlosc2(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 
256) + self._test(hdf5plugin.Blosc2(), data) + + @unittest.skipUnless(should_test("bzip2"), "BZip2 filter not available") + def testBZip2(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.BZip2(), data) + + @unittest.skipUnless(should_test("fcidecomp"), "FCIDECOMP filter not available") + def testFciDecomp(self): + data = numpy.arange(256**2, dtype=numpy.uint16).reshape(256, 256) + self._test(hdf5plugin.FciDecomp(), data) + + @unittest.skipUnless(should_test("lz4"), "LZ4 filter not available") + def testLZ4(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.LZ4(), data) + + @unittest.skipUnless(should_test("sperr"), "Sperr filter not available") + def testSperr(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.Sperr(), data) + + @unittest.skipUnless(should_test("sz"), "SZ filter not available") + def testSZ(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.SZ(), data) + + @unittest.skipUnless(should_test("sz3"), "SZ3 filter not available") + def testSZ3(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.SZ3(), data) + + @unittest.skipUnless(should_test("zfp"), "Zfp filter not available") + def testZfp(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + # Roundtrip does not work for all parameters including the default + for mode_name, compression_filter in { + # rate does not roundtrip + "precision": hdf5plugin.Zfp(precision=10), + "accuracy": hdf5plugin.Zfp(accuracy=2**-3), # roundtrip only for 2^n + "reversible": hdf5plugin.Zfp(reversible=True), + "expert": hdf5plugin.Zfp(minbits=2, maxbits=100, maxprec=32, minexp=-10), + }.items(): + with self.subTest(mode_name): + self._test(compression_filter, data) + + @unittest.skipUnless(should_test("zstd"), "Zstd filter not available") + def 
testZstd(self): + data = numpy.arange(256**2, dtype=numpy.float32).reshape(256, 256) + self._test(hdf5plugin.Zstd(), data) + + class TestPackage(unittest.TestCase): """Test general features of the hdf5plugin package""" @@ -742,6 +1035,9 @@ def suite() -> unittest.TestSuite: for cls in ( TestHDF5PluginRW, TestStrings, + TestFromFilterOptionsMethods, + TestFromFilterOptions, + TestFromFilterOptionsRoundtrip, TestPackage, TestRegisterFilter, TestGetFilters,