diff --git a/.github/workflows/tox.yaml b/.github/workflows/tox.yaml index 55b8844..495b606 100644 --- a/.github/workflows/tox.yaml +++ b/.github/workflows/tox.yaml @@ -16,20 +16,17 @@ jobs: matrix: python-version: [ '3.12', '3.11', '3.10', '3.9', '3.8', '3.7' ] runs-on: ['ubuntu-latest'] - include: - - runs-on: 'ubuntu-20.04' - python-version: '3.6' runs-on: ${{ matrix.runs-on }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: lfs: true - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: PIP cache - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ~/.cache/pip key: ${{ runner.os }}-pip-python${{ matrix.python-version }}-${{ hashFiles('setup.cfg') }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b54ee7f..fc3c5dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,6 +21,7 @@ repos: hooks: # Run the linter. - id: ruff + args: [--fix] - repo: https://github.com/mgedmin/check-manifest rev: "0.49" diff --git a/setup.cfg b/setup.cfg index 422fc5b..79764ed 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,7 +13,6 @@ classifiers = Intended Audience :: Science/Research License :: OSI Approved :: MIT License Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 @@ -28,7 +27,7 @@ keywords = neuroimaging package_dir = = src packages = find: -python_requires = ~=3.6 +python_requires = ~=3.7 install_requires = nibabel >= 2 numpy >= 1.17 @@ -36,6 +35,7 @@ install_requires = requests >= 2 scikit-image # TODO use pillow instead tqdm ~= 4.29 + numcodecs imagecodecs # required to read LZW compressed tiff files [options.packages.find] diff --git a/src/neuroglancer_scripts/chunk_encoding.py b/src/neuroglancer_scripts/chunk_encoding.py index 164033e..09d78de 100644 --- a/src/neuroglancer_scripts/chunk_encoding.py +++ b/src/neuroglancer_scripts/chunk_encoding.py @@ -9,6 +9,8 @@ :func:`get_encoder` for instantiating a concrete encoder object. 
""" +from typing import Dict, Type + import numpy as np __all__ = [ @@ -48,14 +50,19 @@ def get_encoder(info, scale_info, encoder_options={}): num_channels = info["num_channels"] encoding = scale_info["encoding"] except KeyError as exc: - raise InvalidInfoError("The info dict is missing an essential key " - f"{exc}") from exc + raise InvalidInfoError( + "The info dict is missing an essential key " f"{exc}" + ) from exc if not isinstance(num_channels, int) or not num_channels > 0: - raise InvalidInfoError(f"Invalid value {num_channels} for " - "num_channels (must be a positive integer)") + raise InvalidInfoError( + f"Invalid value {num_channels} for " + "num_channels (must be a positive integer)" + ) if data_type not in NEUROGLANCER_DATA_TYPES: - raise InvalidInfoError(f"Invalid data_type {data_type} (should be one " - f"of {NEUROGLANCER_DATA_TYPES})") + raise InvalidInfoError( + f"Invalid data_type {data_type} (should be one " + f"of {NEUROGLANCER_DATA_TYPES})" + ) try: if encoding == "raw": return RawChunkEncoder(data_type, num_channels) @@ -65,15 +72,20 @@ def get_encoder(info, scale_info, encoder_options={}): except KeyError: raise InvalidInfoError( 'Encoding is set to "compressed_segmentation" but ' - '"compressed_segmentation_block_size" is missing') - return CompressedSegmentationEncoder(data_type, num_channels, - block_size) + '"compressed_segmentation_block_size" is missing' + ) + return CompressedSegmentationEncoder( + data_type, num_channels, block_size + ) elif encoding == "jpeg": jpeg_plane = encoder_options.get("jpeg_plane", "xy") jpeg_quality = encoder_options.get("jpeg_quality", 95) - return JpegChunkEncoder(data_type, num_channels, - jpeg_plane=jpeg_plane, - jpeg_quality=jpeg_quality) + return JpegChunkEncoder( + data_type, + num_channels, + jpeg_plane=jpeg_plane, + jpeg_quality=jpeg_quality, + ) else: raise InvalidInfoError(f"Invalid encoding {encoding}") except IncompatibleEncoderError as exc: @@ -97,37 +109,51 @@ def add_argparse_options(parser, allow_lossy=False): get_encoder(info, scale_info, vars(args)) """ import argparse + if allow_lossy: + def jpeg_quality(arg): q = int(arg) if not 1 <= q <= 100: raise argparse.ArgumentTypeError( - "JPEG quality must be between 1 and 100") + "JPEG quality must be between 1 and 100" + ) return q group = parser.add_argument_group("Options for JPEG compression") - group.add_argument("--jpeg-quality", type=jpeg_quality, - default=95, metavar="Q", - help="JPEG quality factor (1 is worst, 100 is " - "best, values above 95 increase file size but " - "provide hardly any extra quality)") - group.add_argument("--jpeg-plane", choices=("xy", "xz"), default="xy", - help='plane of JPEG compression (default: xy)') + group.add_argument( + "--jpeg-quality", + type=jpeg_quality, + default=95, + metavar="Q", + help="JPEG quality factor (1 is worst, 100 is " + "best, values above 95 increase file size but " + "provide hardly any extra quality)", + ) + group.add_argument( + "--jpeg-plane", + choices=("xy", "xz"), + default="xy", + help="plane of JPEG compression (default: xy)", + ) class IncompatibleEncoderError(Exception): """Raised when an Encoder cannot handle the requested data type.""" + pass # TODO inherit from a new DataError class? 
class InvalidInfoError(Exception): """Raised when an *info* dict is invalid or inconsistent.""" + pass class InvalidFormatError(Exception): """Raised when chunk data cannot be decoded properly.""" + pass @@ -138,6 +164,23 @@ class ChunkEncoder: :param int num_channels: number of image channels """ + _ENCODER_KLASS: Dict[str, Type["ChunkEncoder"]] = {} + + def __init_subclass__(cls, type: str): + assert ( + type not in cls._ENCODER_KLASS + ), f"type {type} already registered." + cls._ENCODER_KLASS[type] = cls + + @classmethod + def get_encoder(cls, type: str, data_type, num_channels, **kwargs): + assert ( + type in cls._ENCODER_KLASS + ), f"type {type} not found in encoder registry!" + + klass = cls._ENCODER_KLASS[type] + return klass(data_type, num_channels, **kwargs) + lossy = False """True if this encoder is lossy.""" @@ -170,12 +213,13 @@ def decode(self, buf, chunk_size): raise NotImplementedError -class RawChunkEncoder(ChunkEncoder): +class RawChunkEncoder(ChunkEncoder, type="raw"): """Codec for to the Neuroglancer raw chunk format. :param str data_type: data type supported by Neuroglancer :param int num_channels: number of image channels """ + lossy = False def encode(self, chunk): @@ -188,14 +232,22 @@ def encode(self, chunk): def decode(self, buf, chunk_size): try: return np.frombuffer(buf, dtype=self.dtype).reshape( - (self.num_channels, - chunk_size[2], chunk_size[1], chunk_size[0])) + ( + self.num_channels, + chunk_size[2], + chunk_size[1], + chunk_size[0], + ) + ) except Exception as exc: - raise InvalidFormatError(f"Cannot decode raw-encoded chunk: {exc}" - ) from exc + raise InvalidFormatError( + f"Cannot decode raw-encoded chunk: {exc}" + ) from exc -class CompressedSegmentationEncoder(ChunkEncoder): +class CompressedSegmentationEncoder( + ChunkEncoder, type="compressed_segmentation" +): """Codec for to the Neuroglancer precomputed chunk format. :param str data_type: data type supported by Neuroglancer @@ -205,18 +257,21 @@ class CompressedSegmentationEncoder(ChunkEncoder): :raises IncompatibleEncoderError: if data_type or num_channels are unsupported """ + lossy = False def __init__(self, data_type, num_channels, block_size): if data_type not in ("uint32", "uint64"): raise IncompatibleEncoderError( "The compressed_segmentation encoding can only handle uint32 " - "or uint64 data_type") + "or uint64 data_type" + ) super().__init__(data_type, num_channels) self.block_size = block_size def encode(self, chunk): from neuroglancer_scripts import _compressed_segmentation + chunk = np.asarray(chunk).astype(self.dtype, casting="safe") assert chunk.ndim == 4 assert chunk.shape[0] == self.num_channels @@ -225,15 +280,16 @@ def encode(self, chunk): def decode(self, buf, chunk_size): from neuroglancer_scripts import _compressed_segmentation + chunk = np.empty( (self.num_channels, chunk_size[2], chunk_size[1], chunk_size[0]), - dtype=self.dtype + dtype=self.dtype, ) _compressed_segmentation.decode_chunk_into(chunk, buf, self.block_size) return chunk -class JpegChunkEncoder(ChunkEncoder): +class JpegChunkEncoder(ChunkEncoder, type="jpeg"): """Codec for to the Neuroglancer raw chunk format. 
:param str data_type: data type supported by Neuroglancer @@ -243,15 +299,18 @@ class JpegChunkEncoder(ChunkEncoder): :raises IncompatibleEncoderError: if data_type or num_channels are unsupported """ + lossy = True mime_type = "image/jpeg" - def __init__(self, data_type, num_channels, - jpeg_quality=95, jpeg_plane="xy"): + def __init__( + self, data_type, num_channels, jpeg_quality=95, jpeg_plane="xy" + ): if data_type != "uint8" or num_channels not in (1, 3): raise IncompatibleEncoderError( "The JPEG encoding can only handle uint8 data_type with 1 or " - "3 channels") + "3 channels" + ) super().__init__(data_type, num_channels) assert 1 <= jpeg_quality <= 100 assert jpeg_plane in ("xy", "xz") @@ -260,6 +319,7 @@ def __init__(self, data_type, num_channels, def encode(self, chunk): from neuroglancer_scripts import _jpeg + assert np.can_cast(chunk.dtype, self.dtype, casting="safe") assert chunk.ndim == 4 assert chunk.shape[0] == self.num_channels @@ -268,4 +328,83 @@ def encode(self, chunk): def decode(self, buf, chunk_size): from neuroglancer_scripts import _jpeg + return _jpeg.decode_chunk(buf, chunk_size, self.num_channels) + + +class BloscEncoder(ChunkEncoder, type="blosc"): + lossy = False + + def __init__( + self, + data_type, + num_channels, + blosc_clevel=9, + blosc_shuffle="shuffle", + blosc_cname="zstd", + ): + super().__init__(data_type, num_channels) + self.blosc_clevel = blosc_clevel + self.blosc_shuffle = blosc_shuffle + self.blosc_cname = blosc_cname + + def encode(self, chunk): + from numcodecs import blosc + + return blosc.compress( + chunk, + clevel=self.blosc_clevel, + shuffle=self.blosc_shuffle, + cname=self.blosc_cname, + ) + + def decode(self, buf, chunk_size): + from numcodecs import blosc + + try: + buf = blosc.decompress(buf) + return np.frombuffer(buf, dtype=self.dtype).reshape( + ( + self.num_channels, + chunk_size[2], + chunk_size[1], + chunk_size[0], + ) + ) + + except Exception as exc: + raise InvalidFormatError( + f"Cannot decode blosc-encoded chunk: {exc}" + ) from exc + + +class GZipEncoder(ChunkEncoder, type="gzip"): + lossy = False + + def __init__(self, data_type, num_channels, gzip_level=-1): + super().__init__(data_type, num_channels) + self.gzip_level = gzip_level + + def encode(self, chunk): + import zlib + + return zlib.compress(chunk, self.gzip_level) + + def decode(self, buf, chunk_size): + import zlib + + try: + buf = zlib.decompress(buf) + return np.frombuffer(buf, dtype=self.dtype).reshape( + ( + self.num_channels, + chunk_size[2], + chunk_size[1], + chunk_size[0], + ) + ) + + except Exception as exc: + raise InvalidFormatError( + f"Cannot decode gzip-encoded chunk: {exc}" + ) from exc diff --git a/src/neuroglancer_scripts/gcs_accessor.py b/src/neuroglancer_scripts/gcs_accessor.py new file mode 100644 index 0000000..23e61dc --- /dev/null +++ b/src/neuroglancer_scripts/gcs_accessor.py @@ -0,0 +1,67 @@ +from abc import ABC +from typing import Dict +from urllib.parse import quote_plus + +import neuroglancer_scripts.http_accessor + + +# TODO move to its own module?
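As a minimal illustration (not part of the patch itself), the subclass registry added to chunk_encoding.py above lets the new encoders be looked up by name and round-tripped; the sketch below assumes the zlib-backed "gzip" encoder registered there and the (channels, z, y, x) chunk layout that ChunkEncoder.decode expects.

import numpy as np
from neuroglancer_scripts.chunk_encoding import ChunkEncoder

# Look up the registered "gzip" encoder for single-channel uint16 data.
encoder = ChunkEncoder.get_encoder("gzip", "uint16", 1)
# Chunks are 4-D arrays in (channels, z, y, x) order.
chunk = np.arange(8, dtype="uint16").reshape(1, 2, 2, 2)
compressed = encoder.encode(chunk)
# decode() takes the chunk size as (x, y, z).
assert np.array_equal(encoder.decode(compressed, (2, 2, 2)), chunk)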
+class UrlParser(ABC): + base_url: str = None + + def __new__(cls, path: str): + for registered_protocol in cls._PROTOCOL_DICT: + if path.startswith(f"{registered_protocol}//"): + path = path.replace(f"{registered_protocol}//", "") + use_cls = cls._PROTOCOL_DICT[registered_protocol] + inst: UrlParser = object.__new__(use_cls) + inst.base_url = path + return inst + return super().__new__(cls) + + protocol: str = None + _PROTOCOL_DICT: Dict[str, "UrlParser"] = {} + + def __post_init__(self): + assert self.base_url is not None, "UrlParser.base_url must be defined!" + + def __init_subclass__(cls, protocol: str): + + if protocol in cls._PROTOCOL_DICT: + print(f"{protocol} already registered. Overriding") + cls._PROTOCOL_DICT[protocol] = cls + cls.protocol = protocol + return super().__init_subclass__() + + def format_url(self, path: str): + raise NotImplementedError("Child class must override format_url") + + +class GoogleBucketParser(UrlParser, protocol="gs:"): + + def format_url(self, path: str): + bucketname, prefix = self.base_url.split("/", maxsplit=1) + final_path = quote_plus(f"{prefix}/{path}") + return f"https://www.googleapis.com/storage/v1/b/{bucketname}/o/{final_path}?alt=media" + + +class GCSAccessor(neuroglancer_scripts.http_accessor.HttpAccessor): + can_read = True + can_write = False + + def __init__(self, base_url): + super().__init__(base_url) + self.parser = GoogleBucketParser(f"gs://{base_url}") + self.parser.base_url = base_url + + def format_path(self, relative_path): + return self.parser.format_url(relative_path) + + def fetch_file(self, relative_path): + import json + + import numpy as np + + if relative_path == "transform.json": + return json.dumps(np.eye(4).tolist()).encode() + return super().fetch_file(relative_path) diff --git a/src/neuroglancer_scripts/http_accessor.py b/src/neuroglancer_scripts/http_accessor.py index da423a4..fc63dd9 100644 --- a/src/neuroglancer_scripts/http_accessor.py +++ b/src/neuroglancer_scripts/http_accessor.py @@ -38,35 +38,45 @@ def __init__(self, base_url): # Fix the base URL to end with a slash, discard query and fragment r = urllib.parse.urlsplit(base_url) - self.base_url = urllib.parse.urlunsplit(( - r.scheme, r.netloc, - r.path if r.path[-1] == "/" else r.path + "/", - "", "")) + self.base_url = urllib.parse.urlunsplit( + ( + r.scheme, + r.netloc, + r.path if r.path[-1] == "/" else r.path + "/", + "", + "", + ) + ) def chunk_relative_url(self, key, chunk_coords): xmin, xmax, ymin, ymax, zmin, zmax = chunk_coords url_suffix = _CHUNK_PATTERN_FLAT.format( - xmin, xmax, ymin, ymax, zmin, zmax, key=key) + xmin, xmax, ymin, ymax, zmin, zmax, key=key + ) return url_suffix def fetch_chunk(self, key, chunk_coords): chunk_url = self.chunk_relative_url(key, chunk_coords) return self.fetch_file(chunk_url) + def format_path(self, relative_path): + return self.base_url + relative_path + def file_exists(self, relative_path): - file_url = self.base_url + relative_path + file_url = self.format_path(relative_path) try: r = self._session.head(file_url) if r.status_code == requests.codes.not_found: return False r.raise_for_status() except requests.exceptions.RequestException as exc: - raise DataAccessError("Error probing the existence of " - f"{file_url}: {exc}") from exc + raise DataAccessError( + "Error probing the existence of " f"{file_url}: {exc}" + ) from exc return True def fetch_file(self, relative_path): - file_url = self.base_url + relative_path + file_url = self.format_path(relative_path) try: r = self._session.get(file_url) 
r.raise_for_status() diff --git a/src/neuroglancer_scripts/precomputed_io.py b/src/neuroglancer_scripts/precomputed_io.py index daf8bf6..5328604 100644 --- a/src/neuroglancer_scripts/precomputed_io.py +++ b/src/neuroglancer_scripts/precomputed_io.py @@ -14,6 +14,7 @@ from neuroglancer_scripts import chunk_encoding from neuroglancer_scripts.chunk_encoding import InvalidInfoError +from neuroglancer_scripts.volume_io.base_io import MultiResIOBase __all__ = [ "get_IO_for_existing_dataset", @@ -41,8 +42,9 @@ def get_IO_for_existing_dataset(accessor, encoder_options={}): return PrecomputedIO(info, accessor, encoder_options=encoder_options) -def get_IO_for_new_dataset(info, accessor, overwrite_info=False, - encoder_options={}): +def get_IO_for_new_dataset( + info, accessor, overwrite_info=False, encoder_options={} +): """Create a new pyramid and store the provided *info*. :param dict info: the *info* of the new pyramid @@ -53,13 +55,16 @@ def get_IO_for_new_dataset(info, accessor, overwrite_info=False, """ info_str = json.dumps(info, separators=(",", ":"), sort_keys=True) info_bytes = info_str.encode("utf-8") - accessor.store_file("info", info_bytes, - mime_type="application/json", - overwrite=overwrite_info) + accessor.store_file( + "info", + info_bytes, + mime_type="application/json", + overwrite=overwrite_info, + ) return PrecomputedIO(info, accessor, encoder_options=encoder_options) -class PrecomputedIO: +class PrecomputedIO(MultiResIOBase): """Object for high-level access to a Neuroglancer precomputed dataset. An object of this class provides access to chunk data in terms of NumPy @@ -79,15 +84,18 @@ class PrecomputedIO: :param dict encoder_options: extrinsic encoder options (see :func:`neuroglancer_scripts.chunk_encoding.get_encoder`). """ + def __init__(self, info, accessor, encoder_options={}): + super().__init__() self._info = info self.accessor = accessor self._scale_info = { scale_info["key"]: scale_info for scale_info in info["scales"] } self._encoders = { - scale_info["key"]: chunk_encoding.get_encoder(info, scale_info, - encoder_options) + scale_info["key"]: chunk_encoding.get_encoder( + info, scale_info, encoder_options + ) for scale_info in info["scales"] } @@ -96,16 +104,6 @@ def info(self): """The precomputed dataset's *info* dictionary.""" return self._info - def scale_info(self, scale_key): - """The *info* for a given scale. - - :param str scale_key: the *key* property of the chosen scale. - :return: ``info["scales"][i]`` where ``info["scales"][i]["key"] - == scale_key`` - :rtype: dict - """ - return self._scale_info[scale_key] - def scale_is_lossy(self, scale_key): """Test if the scale is using a lossy encoding. 
@@ -131,9 +129,14 @@ def validate_chunk_coords(self, scale_key, chunk_coords): raise NotImplementedError("voxel_offset is not supported") for chunk_size in scale_info["chunk_sizes"]: xcs, ycs, zcs = chunk_size - if (xmin % xcs == 0 and (xmax == min(xmin + xcs, xs)) - and ymin % ycs == 0 and (ymax == min(ymin + ycs, ys)) - and zmin % zcs == 0 and (zmax == min(zmin + zcs, zs))): + if ( + xmin % xcs == 0 + and (xmax == min(xmin + xcs, xs)) + and ymin % ycs == 0 + and (ymax == min(ymin + ycs, ys)) + and zmin % zcs == 0 + and (zmax == min(zmin + zcs, zs)) + ): return True return False @@ -179,6 +182,5 @@ def write_chunk(self, chunk, scale_key, chunk_coords): encoder = self._encoders[scale_key] buf = encoder.encode(chunk) self.accessor.store_chunk( - buf, scale_key, chunk_coords, - mime_type=encoder.mime_type + buf, scale_key, chunk_coords, mime_type=encoder.mime_type ) diff --git a/src/neuroglancer_scripts/volume_io/__init__.py b/src/neuroglancer_scripts/volume_io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/neuroglancer_scripts/volume_io/base_io.py b/src/neuroglancer_scripts/volume_io/base_io.py new file mode 100644 index 0000000..72b4b28 --- /dev/null +++ b/src/neuroglancer_scripts/volume_io/base_io.py @@ -0,0 +1,75 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import List, Optional + + +@dataclass +class PrecompInfoScaleSharding: + # @type cannot be easily and cleanly serialized. + # Pop "@type" out of the dict before passing + minishard_bits: int + shard_bits: int + hash: str # use typing.Literal once 3.7 support is dropped + minishard_index_encoding: ( + str # use typing.Literal once 3.7 support is dropped + ) + data_encoding: str # use typing.Literal once 3.7 support is dropped + preshift_bits: int + + +@dataclass +class PrecompInfoScale: + chunk_sizes: List[List[int]] + encoding: str # use typing.Literal once 3.7 support is dropped + key: str + resolution: List[int] + size: List[int] + voxel_offset: List[float] + sharding: Optional[PrecompInfoScaleSharding] = None + + +@dataclass +class PrecompInfo: + type: str # use typing.Literal once 3.7 support is dropped + data_type: str # use typing.Literal once 3.7 support is dropped + num_channels: int + scales: List[PrecompInfoScale] + + +class MultiResIOBase(ABC): + + @property + @abstractmethod + def info(self): + raise NotImplementedError + + @abstractmethod + def read_chunk(self, scale_key, chunk_coords): + raise NotImplementedError + + @abstractmethod + def write_chunk(self, chunk, scale_key, chunk_coords): + raise NotImplementedError + + def iter_scale(self): + for scale in self.info.get("scales"): + yield scale.get("key"), scale + + def scale_info(self, scale_key): + """The *info* for a given scale. + + :param str scale_key: the *key* property of the chosen scale.
+ :return: ``info["scales"][i]`` where ``info["scales"][i]["key"] + == scale_key`` + :rtype: dict + """ + found_scale = [ + scale + for scale in self.info.get("scales", []) + if scale.get("key") == scale_key + ] + if len(found_scale) == 0: + raise IndexError(f"Cannot find {scale_key}") + if len(found_scale) > 1: + raise IndexError(f"Found multiple {scale_key}") + return found_scale[0] diff --git a/src/neuroglancer_scripts/volume_io/n5_io.py b/src/neuroglancer_scripts/volume_io/n5_io.py new file mode 100644 index 0000000..82ab725 --- /dev/null +++ b/src/neuroglancer_scripts/volume_io/n5_io.py @@ -0,0 +1,158 @@ +import json +import struct +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import Dict, List + +from neuroglancer_scripts.accessor import Accessor +from neuroglancer_scripts.chunk_encoding import ( + ChunkEncoder, +) +from neuroglancer_scripts.volume_io.base_io import MultiResIOBase + +# N5 spec: https://github.com/saalfeldlab/n5 +# supporting an (undocumented?) custom group per +# https://github.com/bigdataviewer/bigdataviewer-core/blob/master/BDV%20N5%20format.md +# https://github.com/saalfeldlab/n5-viewer + + +@dataclass +class N5ScaleAttr: + dimensions: List[int] + blockSize: List[int] # noqa: N815 + # once py3.7 is dropped, use Literal instead + # {uint8, uint16, uint32, uint64, int8, int16, int32, int64, float32,} + # {float64} + dataType: str # noqa: N815 + compression: Dict + + +@dataclass +class N5RootAttr: + downsamplingFactors: List[List[int]] # noqa: N815 + # once py3.7 is dropped, use Literal instead + # {uint8, uint16, uint32, uint64, int8, int16, int32, int64, float32,} + # {float64} + dataType: str # noqa: N815 + multiScale: bool # noqa: N815 + resolution: List[int] + unit: List[str] # seems to be... neuroglancer specific? 
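For reference, a short sketch of the two N5 attribute documents these dataclasses are parsed from; the field names mirror the JSON keys, while the concrete values below are illustrative only (the unit tests further down use the same structure).

example_root_attributes = {  # attributes.json at the dataset root
    "downsamplingFactors": [[1, 1, 1], [2, 2, 2]],
    "dataType": "uint16",
    "multiScale": True,
    "resolution": [1, 1, 1],
    "unit": ["um", "um", "um"],
}
example_scale_attributes = {  # s0/attributes.json, one per scale level
    "dimensions": [64, 64, 64],
    "blockSize": [8, 8, 8],
    "dataType": "uint16",
    "compression": {"type": "gzip"},
}
root_attr = N5RootAttr(**example_root_attributes)
scale_attr = N5ScaleAttr(**example_scale_attributes)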
+ + +class N5IO(MultiResIOBase): + + UNIT_TO_NM = {"um": 1e3} + + def __init__(self, accessor: Accessor): + super().__init__() + self.accessor = accessor + assert accessor.can_read, "N5IO must have readable accessor" + + self.attribute_json = N5RootAttr( + **json.loads(self.accessor.fetch_file("attributes.json")) + ) + + # network-bound, use threads + with ThreadPoolExecutor() as ex: + self.scale_attributes = [ + N5ScaleAttr(**json.loads(attr)) + for attr in list( + ex.map( + accessor.fetch_file, + [ + f"s{str(idx)}/attributes.json" + for idx, _ in enumerate( + self.attribute_json.downsamplingFactors + ) + ], + ) + ) + ] + + self._scale_attributes_dict = { + f"s{idx}": scale_attribute + for idx, scale_attribute in enumerate(self.scale_attributes) + } + + self._decoder_dict = {} + + def _get_encoder(self, scale_key: str): + + if scale_key in self._decoder_dict: + return self._decoder_dict[scale_key] + + compression_type = self._scale_attributes_dict[scale_key].compression[ + "type" + ] + datatype = self._scale_attributes_dict[scale_key].dataType + + encoder = ChunkEncoder.get_encoder(compression_type, datatype, 1) + + self._decoder_dict[scale_key] = encoder + + return encoder + + @property + def info(self): + downsample_factors = self.attribute_json.downsamplingFactors + resolution = self.attribute_json.resolution + unit = self.attribute_json.unit + + return { + "type": "image", + "data_type": self.scale_attributes[0].dataType, + "num_channels": 1, + "scales": [ + { + "chunk_sizes": [scale_attribute.blockSize], + "encoding": scale_attribute.compression.get("type"), + "key": f"s{str(scale_idx)}", + "resolution": [ + res + * downsample_factors[scale_idx][order_idx] + * self.UNIT_TO_NM.get(unit[order_idx], 1) + for order_idx, res in enumerate(resolution) + ], + "size": scale_attribute.dimensions, + "voxel_offset": [0, 0, 0], + } + for scale_idx, scale_attribute in enumerate( + self.scale_attributes + ) + ], + } + + def _get_grididx_from_chunkcoord(self, scale_key, chunk_coords): + xmin, xmax, ymin, ymax, zmin, zmax = chunk_coords + block_sizex, block_sizey, block_sizez = self._scale_attributes_dict[ + scale_key + ].blockSize + return xmin // block_sizex, ymin // block_sizey, zmin // block_sizez + + def read_chunk(self, scale_key, chunk_coords): + gridx, gridy, gridz = self._get_grididx_from_chunkcoord( + scale_key, chunk_coords + ) + chunk = self.accessor.fetch_file( + f"{scale_key}/{gridx}/{gridy}/{gridz}" + ) + mode, dim, sizex, sizey, sizez = struct.unpack(">HHIII", chunk[:16]) + assert ( + dim == 3 + ), "N5IO currently only handles single-channel, three-dimensional arrays" + encoder = self._get_encoder(scale_key) + return encoder.decode(chunk[16:], (sizex, sizey, sizez)) + + def write_chunk(self, chunk, scale_key, chunk_coords): + xmin, xmax, ymin, ymax, zmin, zmax = chunk_coords + gridx, gridy, gridz = self._get_grididx_from_chunkcoord( + scale_key, chunk_coords + ) + encoder = self._get_encoder(scale_key) + buf = encoder.encode(chunk) + hdr = struct.pack( + ">HHIII", 0, 3, xmax - xmin, ymax - ymin, zmax - zmin + ) + self.accessor.store_file( + f"{scale_key}/{gridx}/{gridy}/{gridz}", hdr + buf + ) diff --git a/src/neuroglancer_scripts/volume_io/zarr_io.py b/src/neuroglancer_scripts/volume_io/zarr_io.py new file mode 100644 index 0000000..b457718 --- /dev/null +++ b/src/neuroglancer_scripts/volume_io/zarr_io.py @@ -0,0 +1,201 @@ +import json +import logging +from concurrent.futures import ThreadPoolExecutor +from typing import Tuple + +import numpy as np + +from
neuroglancer_scripts.accessor import Accessor +from neuroglancer_scripts.chunk_encoding import ChunkEncoder +from neuroglancer_scripts.volume_io.base_io import MultiResIOBase + +logger = logging.getLogger(__name__) + +# zarr v2 spec at https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html +# specifically, the ngff v0.4 implementation of zarr v2 https://ngff.openmicroscopy.org/0.4/ + + +class ZarrV2IO(MultiResIOBase): + + UNIT_TO_NM = {"micrometer": 1e3} + + def __init__(self, accessor: Accessor): + super().__init__() + self.accessor = accessor + self._zattrs = None + self._zarrays = None + self._zarray_info_dict = None + self._decoder_dict = {} + self._verify() + + def _verify(self): + zgroup = json.loads(self.accessor.fetch_file(".zgroup")) + assert ( + "zarr_format" in zgroup + ), "Expected key 'zarr_format' to be in .zgroup, but was not: " + str( + zgroup + ) + assert ( + len(zgroup) == 1 + ), "Expected zarr_format to be the only key in .zgroup, but was not" + assert zgroup["zarr_format"] == 2, ( + f"Expected zarr v2, but was: {zgroup['zarr_format']}" + ) + assert ( + "multiscales" in self.zattrs + ), "Expected 'multiscales' in zattrs, but was not found" + + @property + def zarray_info_dict(self): + if self._zarray_info_dict is None: + multiscale = self.multiscale + + path_to_datasets = [] + for dataset in multiscale.get("datasets", []): + path = dataset.get("path") + assert ( + path is not None + ), f"Expected path to exist, but it does not. {dataset}" + path_to_datasets.append(path) + + # often network-bound, use threads + with ThreadPoolExecutor() as ex: + zarray_bytes = list( + ex.map( + self.accessor.fetch_file, + [f"{p}/.zarray" for p in path_to_datasets], + ) + ) + self._zarray_info_dict = { + path: json.loads(b) + for path, b in zip(path_to_datasets, zarray_bytes) + } + + return self._zarray_info_dict + + @property + def zattrs(self): + if self._zattrs is None: + self._zattrs = json.loads(self.accessor.fetch_file(".zattrs")) + return self._zattrs + + @property + def multiscale(self): + multiscales = self.zattrs.get("multiscales", []) + assert ( + len(multiscales) == 1 + ), f"Expected one and only one multiscale, but got {len(multiscales)}" + return multiscales[0] + + @property + def zarrays(self): + return list(self.zarray_info_dict.values()) + + @property + def info(self): + data_type_set = { + np.dtype(zarray.get("dtype")) for zarray in self.zarrays + } + assert ( + len(data_type_set) == 1 + ), f"Expected one and only one dtype, but got {data_type_set}" + data_type = list(data_type_set)[0] + + num_channels = 1 + axes = self.multiscale.get("axes", []) + for channel_idx, axis in enumerate(axes): + if axis.get("type") == "channel": + num_channels = len( + { + zarray.get("shape", [])[channel_idx] + for zarray in self.zarrays + } + ) + break + + datasets = self.multiscale.get("datasets", []) + + datasets_path_scale = [] + for dataset in datasets: + path = dataset.get("path") + coord_transforms = dataset.get("coordinateTransformations", []) + assert ( + len(coord_transforms) == 1 + ), "Expected one and only one coord_transform" + coord_transform = coord_transforms[0] + + resolutions = [] + scale = coord_transform.get("scale") + for axis_idx, axis in enumerate(axes): + if axis.get("type") != "space": + continue + unit = axis.get("unit") + assert ( + unit in self.UNIT_TO_NM + ), f"Unit {unit} is not supported" + resolutions.append(scale[axis_idx] * self.UNIT_TO_NM[unit]) + + datasets_path_scale.append((path, resolutions)) + + return { + "type": "image", +
"data_type": data_type.name, + "num_channels": num_channels, + "scales": [ + { + "chunk_sizes": [zarray.get("chunks")], + "encoding": "raw", + "key": datasets_path_scale[zarr_idx][0], + "resolution": datasets_path_scale[zarr_idx][1], + "size": zarray.get("shape"), + "voxel_offset": [0, 0, 0], + } + for zarr_idx, zarray in enumerate(self.zarrays) + ], + } + + def format_path(self, scale_key: str, chunk_coords: Tuple[int]): + xmin, xmax, ymin, ymax, zmin, zmax = chunk_coords + zarray_info = self.zarray_info_dict[scale_key] + sep = zarray_info.get("dimension_separator", ".") + chunks = zarray_info.get("chunks") + assert chunks, "chunks key must be defined" + block_sizex, block_sizey, block_sizez = chunks + + assert xmin >= 0 and ymin >= 0 and zmin >= 0, "{x,y,z}min must be >= 0" + return ( + f"{scale_key}/{xmin // block_sizex}{sep}{ymin // block_sizey}" + + f"{sep}{zmin // block_sizez}" + ) + + def get_encoder(self, scale_key: str): + if scale_key in self._decoder_dict: + return self._decoder_dict[scale_key] + zarray_info = self.zarray_info_dict[scale_key] + compressor = zarray_info.get("compressor") + id = (compressor or {}).get("id") + + dtype = zarray_info.get("dtype") + assert dtype, "dtype must be defined." + data_type = np.dtype(dtype) + + # TODO get num of channel properly + encoder = ChunkEncoder.get_encoder(id, data_type, 1) + self._decoder_dict[scale_key] = encoder + return encoder + + def read_chunk(self, scale_key: str, chunk_coords: Tuple[int]): + path = self.format_path(scale_key, chunk_coords) + zarray_info = self.zarray_info_dict[scale_key] + encoder = self.get_encoder(scale_key) + + b = self.accessor.fetch_file(path) + return encoder.decode(b, zarray_info.get("chunks")) + + def write_chunk( + self, chunk: bytes, scale_key: str, chunk_coords: Tuple[int] + ): + encoder = self.get_encoder(scale_key) + path = self.format_path(scale_key, chunk_coords) + buf = encoder.encode(chunk) + self.accessor.store_file(path, buf) diff --git a/tox.ini b/tox.ini index 9383619..020b289 100644 --- a/tox.ini +++ b/tox.ini @@ -6,12 +6,11 @@ # NOTE: remember to update the classifiers in setup.cfg when Python versions # are added/removed [tox] -envlist = py36, py37, py38, py39, py310, py311, py312, codestyle, docs, cov +envlist = py37, py38, py39, py310, py311, py312, codestyle, docs, cov isolated_build = True [gh-actions] python = - 3.6: py36 3.7: py37 3.8: py38 3.9: py39 diff --git a/unit_tests/test_gcs_accessor.py b/unit_tests/test_gcs_accessor.py new file mode 100644 index 0000000..dcbf0bd --- /dev/null +++ b/unit_tests/test_gcs_accessor.py @@ -0,0 +1,38 @@ +from urllib.parse import quote_plus + +import pytest +from neuroglancer_scripts.gcs_accessor import GoogleBucketParser, UrlParser + + +@pytest.fixture +def gs_parser(): + yield UrlParser("gs://foo/bar") + + +def test_url_parser_init_type(gs_parser): + assert isinstance(gs_parser, GoogleBucketParser) + + +def test_gs_url_parser_base_url(gs_parser): + assert gs_parser.base_url == "foo/bar" + + +url_format_args = [ + ([], None, TypeError), + ( + ["buzz.txt"], + "https://www.googleapis.com/storage/v1/b/{bucketname}/o/{final_path}?alt=media".format( + bucketname="foo", final_path=quote_plus("bar/buzz.txt") + ), + None, + ), +] + + +@pytest.mark.parametrize("paths, expected, err", url_format_args) +def test_gs_url_parser_format_url(paths, expected, err, gs_parser): + if err is not None: + with pytest.raises(err): + gs_parser.format_url(*paths) + return + assert expected == gs_parser.format_url(*paths) diff --git 
a/unit_tests/volume_io/test_base_io.py b/unit_tests/volume_io/test_base_io.py new file mode 100644 index 0000000..91d1e88 --- /dev/null +++ b/unit_tests/volume_io/test_base_io.py @@ -0,0 +1,56 @@ +from unittest.mock import PropertyMock, patch + +import pytest +from neuroglancer_scripts.volume_io.base_io import MultiResIOBase + + +class DummyIO(MultiResIOBase): + @property + def info(self): + pass + + def read_chunk(self, scale_key, chunk_coords): + return super().read_chunk(scale_key, chunk_coords) + + def write_chunk(self, chunk, scale_key, chunk_coords): + return super().write_chunk(chunk, scale_key, chunk_coords) + + +dummy_info = { + "scales": [ + {"key": "scale0", "foo": "bar"}, + {"key": "scale1", "hello": "world"}, + ] +} + + +@pytest.fixture +def patched_dummy_io_info(): + with patch.object(DummyIO, "info", new_callable=PropertyMock) as info_mock: + info_mock.return_value = dummy_info + yield info_mock + + +def test_iter_scale(patched_dummy_io_info): + io = DummyIO() + assert [ + ("scale0", {"key": "scale0", "foo": "bar"}), + ("scale1", {"key": "scale1", "hello": "world"}), + ] == list(io.iter_scale()) + + +@pytest.mark.parametrize( + "key, error, expected_value", + [ + ("scale0", None, {"key": "scale0", "foo": "bar"}), + ("scale1", None, {"key": "scale1", "hello": "world"}), + ("foo", IndexError, None), + ], +) +def test_scale_info(key, error, expected_value, patched_dummy_io_info): + io = DummyIO() + if error: + with pytest.raises(error): + io.scale_info(key) + return + assert expected_value == io.scale_info(key) diff --git a/unit_tests/volume_io/test_n5_io.py b/unit_tests/volume_io/test_n5_io.py new file mode 100644 index 0000000..0f7ea83 --- /dev/null +++ b/unit_tests/volume_io/test_n5_io.py @@ -0,0 +1,114 @@ +import json +from itertools import chain, repeat +from unittest.mock import call, patch + +import numpy as np +import pytest +from neuroglancer_scripts.accessor import Accessor, get_accessor_for_url +from neuroglancer_scripts.volume_io.n5_io import N5IO + + +@pytest.fixture +def mock_accessor_fetch_file(): + with patch.object(Accessor, "fetch_file") as fetch_file_mock: + yield fetch_file_mock + + +no_scales_root_attr = { + "downsamplingFactors": [], + "dataType": "utf-16", + "multiScale": True, + "resolution": [1, 1, 1], + "unit": ["m", "m", "m"], +} + +sample_root_attr = { + "downsamplingFactors": [ + [1, 1, 1], + [2, 2, 2], + [4, 4, 4], + ], + "dataType": "utf-16", + "multiScale": True, + "resolution": [1, 1, 1], + "unit": ["m", "m", "m"], +} + +sample_scale_attr = { + "dataType": "uint16", + "blockSize": [8, 8, 8], + "dimensions": [1, 1, 1], + "compression": {"type": "gzip"}, +} +sample_scale_attr_bytes = json.dumps(sample_scale_attr) + + +@pytest.mark.parametrize( + "fetch_file_sideeffects, error, expected_calls", + [ + (repeat(b"foobar"), json.JSONDecodeError, [call("attributes.json")]), + ( + chain( + [json.dumps(no_scales_root_attr).encode("utf-8")], + repeat(json.dumps(sample_scale_attr).encode("utf-8")), + ), + None, + [call("attributes.json")], + ), + ( + chain( + [json.dumps(sample_root_attr).encode("utf-8")], + repeat(json.dumps(sample_scale_attr).encode("utf-8")), + ), + None, + [ + call("attributes.json"), + call("s0/attributes.json"), + call("s1/attributes.json"), + call("s2/attributes.json"), + ], + ), + ], +) +def test_n5_init( + fetch_file_sideeffects, error, expected_calls, mock_accessor_fetch_file +): + mock_accessor_fetch_file.side_effect = fetch_file_sideeffects + + accessor = Accessor() + accessor.can_read = True + if error is not None: + with 
pytest.raises(error): + N5IO(accessor) + assert mock_accessor_fetch_file.call_args_list == expected_calls + return + N5IO(accessor) + assert mock_accessor_fetch_file.call_args_list == expected_calls + + +def test_n5io_roundtrip(tmpdir): + accessor = get_accessor_for_url(str(tmpdir)) + + root = { + "downsamplingFactors": [[1, 1, 1]], + "dataType": "utf-16", + "multiScale": True, + "resolution": [1, 1, 1], + "unit": ["m", "m", "m"], + } + s0 = { + "dataType": "uint16", + "blockSize": [8, 3, 7], + "dimensions": [1, 1, 1], + "compression": {"type": "gzip"}, + } + + accessor.store_file("attributes.json", json.dumps(root).encode("utf-8")) + accessor.store_file("s0/attributes.json", json.dumps(s0).encode("utf-8")) + + dummy_chunk = np.arange(8 * 3 * 7, dtype="uint16").reshape(1, 7, 3, 8) + io = N5IO(accessor) + io.write_chunk(dummy_chunk, "s0", (0, 8, 0, 3, 0, 7)) + + retrieved_chunk = io.read_chunk("s0", (0, 8, 0, 3, 0, 7)) + assert np.array_equal(dummy_chunk, retrieved_chunk) diff --git a/unit_tests/volume_io/test_zarr_io.py b/unit_tests/volume_io/test_zarr_io.py new file mode 100644 index 0000000..7dc2217 --- /dev/null +++ b/unit_tests/volume_io/test_zarr_io.py @@ -0,0 +1,346 @@ +import json +from unittest.mock import MagicMock, PropertyMock, call, patch + +import numpy as np +import pytest +from neuroglancer_scripts.accessor import Accessor, get_accessor_for_url +from neuroglancer_scripts.volume_io.zarr_io import ChunkEncoder, ZarrV2IO + + +class MockAccessor(Accessor): + pass + + +@pytest.mark.parametrize( + "zgroup_content, zattrs_content, error", + [ + (1, b'{"multiscales": [{}]}', TypeError), + (b"", b'{"multiscales": [{}]}', json.JSONDecodeError), + (b'{"zarr_format": 2}', 1, TypeError), + (b'{"zarr_format": 2}', b"", json.JSONDecodeError), + (b'{"zarr_format": 2, "foo": "bar"}', + b'{"multiscales": [{}]}', + AssertionError), + (b'{"foo": "bar"}', b'{"multiscales": [{}]}', AssertionError), + (b'{"zarr_format": 2}', b'{"foo": "bar"}', AssertionError), + (b'{"zarr_format": 2}', b'{"multiscales": [{}]}', None), + ], +) +def test_zarrv2_init(zgroup_content, zattrs_content, error): + + called_count = 0 + + def mock_fetch(_, relative_path): + nonlocal called_count + called_count += 1 + if relative_path == ".zgroup": + return zgroup_content + if relative_path == ".zattrs": + return zattrs_content + raise NotImplementedError + + with patch.object(MockAccessor, "fetch_file", new=mock_fetch): + acc = MockAccessor() + if error is not None: + with pytest.raises(error): + ZarrV2IO(acc) + return + + ZarrV2IO(acc) + assert called_count == 2 + + +@pytest.fixture +def mock_pass_verify(): + with patch.object(ZarrV2IO, "_verify", return_value=None) as verify_call: + yield verify_call + + +@pytest.fixture +def mocked_multiscale_property(mock_pass_verify): + with patch.object( + ZarrV2IO, "multiscale", new_callable=PropertyMock, + ) as multiscale_prop: + yield multiscale_prop + + +@pytest.mark.parametrize( + "multiscale_prop, error", + [ + (None, AttributeError), + ({"datasets": None}, TypeError), + ({"datasets": [{"path": "1"}, {"foo": "bar"}]}, AssertionError), + ({"datasets": [{"path": "1"}, {"path": "2"}]}, None), + ({"datasets": []}, None), + ({}, None), + ], +) +def test_zarray_info_dict_vary_multiscale( + multiscale_prop, error, mocked_multiscale_property, +): + + def mock_fetch(*args, **kwargs): + return b'{"foo": "bar"}' + + with patch.object(MockAccessor, "fetch_file", new=mock_fetch): + acc = MockAccessor() + + mocked_multiscale_property.return_value = multiscale_prop + if error is not None: + 
with pytest.raises(error): + io = ZarrV2IO(acc) + io.zarray_info_dict + return + io = ZarrV2IO(acc) + assert io.zarray_info_dict is not None + + +class CustomError(Exception): + pass + + +@pytest.mark.parametrize( + "fetch_file_return_values, error", + [ + ([CustomError, b'{"foo": "bar"}'], CustomError), + ([b'{"foo": "bar"}', CustomError], CustomError), + ([b"foobar", b'{"foo": "bar"}'], json.JSONDecodeError), + ([b'{"foo": "bar"}', b"foobar"], json.JSONDecodeError), + ([b'{"foo": "bar"}', b'{"foo": "bar"}'], None), + ], +) +def test_zarray_info_dict_vary_fetch_file( + fetch_file_return_values, error, mocked_multiscale_property, +): + + mocked_multiscale_property.return_value = { + "datasets": [ + {"path": "1"}, + {"path": "2"}, + ], + } + counter = 0 + + def mock_fetch(*args, **kwargs): + nonlocal counter + return_value = fetch_file_return_values[ + counter % len(fetch_file_return_values)] + counter += 1 + if ( + isinstance(return_value, type) + and issubclass(return_value, Exception) + ): + raise return_value + return return_value + + with patch.object(MockAccessor, "fetch_file", new=mock_fetch): + acc = MockAccessor() + if error is not None: + with pytest.raises(error): + io = ZarrV2IO(acc) + io.zarray_info_dict + return + + io = ZarrV2IO(acc) + assert io.zarray_info_dict is not None + + +def test_zarray_info_dict_valid(mocked_multiscale_property): + + mocked_multiscale_property.return_value = { + "datasets": [ + {"path": "1"}, + {"path": "2"}, + ], + } + + mock_accessor = MagicMock() + mock_accessor.fetch_file.side_effect = [ + b'{"foo": "bar0"}', + b'{"foo": "bar1"}'] + + io = ZarrV2IO(mock_accessor) + assert io.zarray_info_dict == { + "1": {"foo": "bar0"}, + "2": {"foo": "bar1"}, + } + + assert mock_accessor.fetch_file.call_args_list == [ + call("1/.zarray"), + call("2/.zarray"), + ] + + +@pytest.fixture +def mock_get_encoder(): + with patch.object(ChunkEncoder, "get_encoder") as mock_get_encoder: + yield mock_get_encoder + + +SCALE_KEY = "foo" + + +# TODO technically, dtype must be