3030from zarr .abc .store import Store , set_or_delete
3131from zarr .codecs ._v2 import V2Codec
3232from zarr .codecs .bytes import BytesCodec
33+ from zarr .codecs .vlen_utf8 import VLenBytesCodec , VLenUTF8Codec
34+ from zarr .codecs .zstd import ZstdCodec
3335from zarr .core ._info import ArrayInfo
3436from zarr .core .array_spec import ArrayConfig , ArrayConfigLike , parse_array_config
3537from zarr .core .attributes import Attributes
6870from zarr .core .config import categorize_data_type
6971from zarr .core .config import config as zarr_config
7072from zarr .core .dtype import (
73+ VariableLengthBytes ,
74+ VariableLengthUTF8 ,
7175 ZDType ,
7276 ZDTypeLike ,
7377 parse_data_type ,
7478)
75- from zarr .core .dtype .common import HasEndianness , HasItemSize
79+ from zarr .core .dtype .common import HasEndianness , HasItemSize , HasObjectCodec
7680from zarr .core .indexing import (
7781 BasicIndexer ,
7882 BasicSelection ,
109113)
110114from zarr .core .metadata .v2 import (
111115 CompressorLikev2 ,
116+ get_object_codec_id ,
112117 parse_compressor ,
113118 parse_filters ,
114119)
@@ -710,7 +715,10 @@ def _create_metadata_v3(
710715
711716 shape = parse_shapelike (shape )
712717 if codecs is None :
713- filters , serializer , compressors = _get_default_chunk_encoding_v3 (dtype )
718+ filters = default_filters_v3 (dtype )
719+ serializer = default_serializer_v3 (dtype )
720+ compressors = default_compressors_v3 (dtype )
721+
714722 codecs_parsed = (* filters , serializer , * compressors )
715723 else :
716724 codecs_parsed = tuple (codecs )
@@ -850,10 +858,9 @@ async def _create_v2(
850858 else :
851859 await ensure_no_existing_node (store_path , zarr_format = 2 )
852860
853- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
854861 compressor_parsed : CompressorLikev2
855862 if compressor == "auto" :
856- compressor_parsed = default_compressor
863+ compressor_parsed = default_compressor_v2 ( dtype )
857864 elif isinstance (compressor , BytesBytesCodec ):
858865 raise ValueError (
859866 "Cannot use a BytesBytesCodec as a compressor for zarr v2 arrays. "
@@ -863,7 +870,7 @@ async def _create_v2(
863870 compressor_parsed = compressor
864871
865872 if filters is None :
866- filters = default_filters
873+ filters = default_filters_v2 ( dtype )
867874
868875 metadata = cls ._create_metadata_v2 (
869876 shape = shape ,
@@ -4654,19 +4661,80 @@ def _get_default_chunk_encoding_v3(
46544661 )
46554662
46564663
4657- def _get_default_chunk_encoding_v2 (
4658- dtype : ZDType [TBaseDType , TBaseScalar ],
4659- ) -> tuple [tuple [numcodecs .abc .Codec , ...] | None , numcodecs .abc .Codec | None ]:
def default_filters_v3(dtype: ZDType[Any, Any]) -> tuple[ArrayArrayCodec, ...]:
    """
    Return the default Zarr format 3 filters (array-to-array codecs) for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted by this function.

    Returns
    -------
    tuple[ArrayArrayCodec, ...]
        Always the empty tuple: no data type has default filters.
    """
    no_filters: tuple[ArrayArrayCodec, ...] = ()
    return no_filters
4671+
4672+
def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ...]:
    """
    Return the default Zarr format 3 compressors (bytes-to-bytes codecs) for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted by this function.

    Returns
    -------
    tuple[BytesBytesCodec, ...]
        A one-element tuple holding a ``ZstdCodec`` constructed with no arguments.
    """
    zstd = ZstdCodec()
    return (zstd,)
4680+
4681+
4682+ def default_serializer_v3 (dtype : ZDType [Any , Any ]) -> ArrayBytesCodec :
4683+ """
4684+ Given a data type, return the default serializer for that data type.
4685+
4686+ The default serializer for most data types is the ``BytesCodec``, which may or may not be
4687+ parameterized with an endianness, depending on whether the data type has endianness. Variable
4688+ length strings and variable length bytes have hard-coded serializers -- ``VLenUTF8Codec`` and
4689+ ``VLenBytesCodec``, respectively.
4690+
4691+ """
4692+ serializer : ArrayBytesCodec = BytesCodec (endian = None )
4693+
4694+ if isinstance (dtype , HasEndianness ):
4695+ serializer = BytesCodec (endian = "little" )
4696+ elif isinstance (dtype , HasObjectCodec ):
4697+ if dtype .object_codec_id == "vlen-bytes" :
4698+ serializer = VLenBytesCodec ()
4699+ elif dtype .object_codec_id == "vlen-utf8" :
4700+ serializer = VLenUTF8Codec ()
4701+ else :
4702+ msg = f"Data type { dtype } requires an unknown object codec: { dtype .object_codec_id !r} ."
4703+ raise ValueError (msg )
4704+ return serializer
4705+
4706+
4707+ def default_filters_v2 (dtype : ZDType [Any , Any ]) -> tuple [numcodecs .abc .Codec ] | None :
4708+ """
4709+ Given a data type, return the default filters for that data type.
4710+
4711+ For data types that require an object codec, namely variable length data types,
4712+ this is a tuple containing the object codec. Otherwise it's ``None``.
4713+ """
4714+ if isinstance (dtype , HasObjectCodec ):
4715+ if dtype .object_codec_id == "vlen-bytes" :
4716+ from numcodecs import VLenBytes
46684717
4669- return filters , numcodecs .get_codec (compressor )
4718+ return (VLenBytes (),)
4719+ elif dtype .object_codec_id == "vlen-utf8" :
4720+ from numcodecs import VLenUTF8
4721+
4722+ return (VLenUTF8 (),)
4723+ else :
4724+ msg = f"Data type { dtype } requires an unknown object codec: { dtype .object_codec_id !r} ."
4725+ raise ValueError (msg )
4726+ return None
4727+
4728+
def default_compressor_v2(dtype: ZDType[Any, Any]) -> numcodecs.abc.Codec:
    """
    Return the default Zarr format 2 compressor for a data type.

    Parameters
    ----------
    dtype : ZDType
        The data type of the array. It is not consulted by this function.

    Returns
    -------
    numcodecs.abc.Codec
        A numcodecs ``Zstd`` codec created with ``level=0`` and ``checksum=False``.
    """
    from numcodecs import Zstd

    compressor: numcodecs.abc.Codec = Zstd(level=0, checksum=False)
    return compressor
46704738
46714739
46724740def _parse_chunk_encoding_v2 (
@@ -4678,14 +4746,13 @@ def _parse_chunk_encoding_v2(
46784746 """
46794747 Generate chunk encoding classes for Zarr format 2 arrays with optional defaults.
46804748 """
4681- default_filters , default_compressor = _get_default_chunk_encoding_v2 (dtype )
46824749 _filters : tuple [numcodecs .abc .Codec , ...] | None
46834750 _compressor : numcodecs .abc .Codec | None
46844751
46854752 if compressor is None or compressor == ():
46864753 _compressor = None
46874754 elif compressor == "auto" :
4688- _compressor = default_compressor
4755+ _compressor = default_compressor_v2 ( dtype )
46894756 elif isinstance (compressor , tuple | list ) and len (compressor ) == 1 :
46904757 _compressor = parse_compressor (compressor [0 ])
46914758 else :
@@ -4697,7 +4764,7 @@ def _parse_chunk_encoding_v2(
46974764 if filters is None :
46984765 _filters = None
46994766 elif filters == "auto" :
4700- _filters = default_filters
4767+ _filters = default_filters_v2 ( dtype )
47014768 else :
47024769 if isinstance (filters , Iterable ):
47034770 for idx , f in enumerate (filters ):
@@ -4708,7 +4775,33 @@ def _parse_chunk_encoding_v2(
47084775 )
47094776 raise TypeError (msg )
47104777 _filters = parse_filters (filters )
4711-
4778+ if isinstance (dtype , HasObjectCodec ):
4779+ # check the filters and the compressor for the object codec required for this data type
4780+ if _filters is None :
4781+ if _compressor is None :
4782+ object_codec_id = None
4783+ else :
4784+ object_codec_id = get_object_codec_id ((_compressor .get_config (),))
4785+ else :
4786+ object_codec_id = get_object_codec_id (
4787+ (
4788+ * [f .get_config () for f in _filters ],
4789+ _compressor .get_config () if _compressor is not None else None ,
4790+ )
4791+ )
4792+ if object_codec_id is None :
4793+ if isinstance (dtype , VariableLengthUTF8 ): # type: ignore[unreachable]
4794+ codec_name = "the numcodecs.VLenUTF8 codec" # type: ignore[unreachable]
4795+ elif isinstance (dtype , VariableLengthBytes ): # type: ignore[unreachable]
4796+ codec_name = "the numcodecs.VLenBytes codec" # type: ignore[unreachable]
4797+ else :
4798+ codec_name = f"an unknown object codec with id { dtype .object_codec_id !r} "
4799+ msg = (
4800+ f"Data type { dtype } requires { codec_name } , "
4801+ "but no such codec was specified in the filters or compressor parameters for "
4802+ "this array. "
4803+ )
4804+ raise ValueError (msg )
47124805 return _filters , _compressor
47134806
47144807
@@ -4722,14 +4815,11 @@ def _parse_chunk_encoding_v3(
47224815 """
47234816 Generate chunk encoding classes for v3 arrays with optional defaults.
47244817 """
4725- default_array_array , default_array_bytes , default_bytes_bytes = _get_default_chunk_encoding_v3 (
4726- dtype
4727- )
47284818
47294819 if filters is None :
47304820 out_array_array : tuple [ArrayArrayCodec , ...] = ()
47314821 elif filters == "auto" :
4732- out_array_array = default_array_array
4822+ out_array_array = default_filters_v3 ( dtype )
47334823 else :
47344824 maybe_array_array : Iterable [Codec | dict [str , JSON ]]
47354825 if isinstance (filters , dict | Codec ):
@@ -4739,7 +4829,7 @@ def _parse_chunk_encoding_v3(
47394829 out_array_array = tuple (_parse_array_array_codec (c ) for c in maybe_array_array )
47404830
47414831 if serializer == "auto" :
4742- out_array_bytes = default_array_bytes
4832+ out_array_bytes = default_serializer_v3 ( dtype )
47434833 else :
47444834 # TODO: ensure that the serializer is compatible with the ndarray produced by the
47454835 # array-array codecs. For example, if a sequence of array-array codecs produces an
@@ -4749,7 +4839,7 @@ def _parse_chunk_encoding_v3(
47494839 if compressors is None :
47504840 out_bytes_bytes : tuple [BytesBytesCodec , ...] = ()
47514841 elif compressors == "auto" :
4752- out_bytes_bytes = default_bytes_bytes
4842+ out_bytes_bytes = default_compressors_v3 ( dtype )
47534843 else :
47544844 maybe_bytes_bytes : Iterable [Codec | dict [str , JSON ]]
47554845 if isinstance (compressors , dict | Codec ):
@@ -4759,17 +4849,11 @@ def _parse_chunk_encoding_v3(
47594849
47604850 out_bytes_bytes = tuple (_parse_bytes_bytes_codec (c ) for c in maybe_bytes_bytes )
47614851
4762- # specialize codecs as needed given the dtype
4763-
4764- # TODO: refactor so that the config only contains the name of the codec, and we use the dtype
4765- # to create the codec instance, instead of storing a dict representation of a full codec.
4766-
47674852 # TODO: ensure that the serializer is compatible with the ndarray produced by the
47684853 # array-array codecs. For example, if a sequence of array-array codecs produces an
47694854 # array with a single-byte data type, then the serializer should not specify endianness.
4770- if isinstance (out_array_bytes , BytesCodec ) and not isinstance (dtype , HasEndianness ):
4771- # The default endianness in the bytescodec might not be None, so we need to replace it
4772- out_array_bytes = replace (out_array_bytes , endian = None )
4855+
4856+ # TODO: add checks to ensure that the right serializer is used for vlen data types
47734857 return out_array_array , out_array_bytes , out_bytes_bytes
47744858
47754859
0 commit comments