hpc4cmb
diff --git a/src/flacarray/array.py b/src/flacarray/array.py
Lines changed: 43 additions & 9 deletions
diff --git a/src/flacarray/compress.py b/src/flacarray/compress.py
Lines changed: 8 additions & 13 deletions
diff --git a/src/flacarray/decompress.py b/src/flacarray/decompress.py
Lines changed: 26 additions & 22 deletions
@@ -11,7 +11,7 @@
 from .hdf5 import write_compressed as hdf5_write_compressed
 from .hdf5 import read_compressed as hdf5_read_compressed
 from .mpi import global_bytes, global_array_properties
-from .utils import log
+from .utils import log, compressed_dtype
 from .zarr import write_compressed as zarr_write_compressed
 from .zarr import read_compressed as zarr_read_compressed
 
@@ -29,7 +29,7 @@ class FlacArray:
     stream in the overall bytes array.  The shape of the starting array corresponds
     to the shape of the leading, un-compressed dimensions of the original array.
 
-    The input data is converted to 32bit integers.  The "quanta" value is used
+    The input data is converted to 32bit or 64bit integers.  The "quanta" value is used
     for floating point data conversion and represents the floating point increment
     for a single integer value.  If quanta is None, each stream is scaled independently
     based on its data range.  If quanta is a scalar, all streams are scaled with the
@@ -42,20 +42,19 @@ class FlacArray:
     The following rules specify the data conversion that is performed depending on
     the input type:
 
-    * int32:  No conversion.
+    * int32:  No conversion.  Compressed to single channel FLAC bytestream.
 
-    * int64:  Subtract the integer closest to the mean, then truncate to lower
-        32 bits, and check that the higher bits were zero.
+    * int64:  No conversion.  Compressed to 2-channel (stereo) FLAC bytestream.
 
     * float32:  Subtract the mean and scale data based on the quanta value (see
         above).  Then round to nearest 32bit integer.
 
     * float64:  Subtract the mean and scale data based on the quanta value (see
-        above).  Then round to nearest 32bit integer.
+        above).  Then round to nearest 64bit integer.
 
-    After conversion to 32bit integers, each stream's data is separately compressed
-    into a sequence of FLAC bytes, which is appended to the bytestream.  The offset in
-    bytes for each stream is recorded.
+    After conversion to integers, each stream's data is separately compressed into a
+    sequence of FLAC bytes, which is appended to the bytestream.  The offset in bytes
+    for each stream is recorded.
 
     A FlacArray is only constructed directly when making a copy.  Use the class methods
     to create FlacArrays from numpy arrays or on-disk representations.
@@ -125,6 +124,11 @@ def _init_params(self):
         self._global_nstreams = np.prod(self._global_leading_shape)
         # For reference, record the type string of the original data.
         self._typestr = self._dtype_str(self._dtype)
+        # Track whether the streams hold 64bit integers (int64 / float64 input)
+        self._is_int64 = (
+            self._dtype == np.dtype(np.int64) or
+            self._dtype == np.dtype(np.float64)
+        )
 
     @staticmethod
     def _dtype_str(dt):
@@ -245,6 +249,11 @@ def dtype(self):
         """The dtype of the uncompressed array."""
         return self._dtype
 
+    @property
+    def typestr(self):
+        """A string representation of the original data type."""
+        return self._typestr
+
     def _keep_view(self, key):
         if len(key) != len(self._leading_shape):
             raise ValueError("view size does not match leading dimensions")
@@ -341,6 +350,7 @@ def __getitem__(self, raw_key):
                 keep=keep,
                 first_stream_sample=first,
                 last_stream_sample=last,
+                is_int64=self._is_int64,
             )
             return arr.reshape(full_shape)
 
@@ -366,6 +376,9 @@ def __eq__(self, other):
         if self._shape != other._shape:
             log.debug(f"other shape {other._shape} != {self._shape}")
             return False
+        if self._dtype != other._dtype:
+            log.debug(f"other dtype {other._dtype} != {self._dtype}")
+            return False
         if self._global_shape != other._global_shape:
             msg = f"other global_shape {other._global_shape} != {self._global_shape}"
             log.debug(msg)
@@ -463,6 +476,7 @@ def to_array(
             keep=keep,
             first_stream_sample=first_samp,
             last_stream_sample=last_samp,
+            is_int64=self._is_int64,
             use_threads=use_threads,
         )
         if keep is not None and keep_indices:
@@ -514,6 +528,7 @@ def from_array(
             shape=arr.shape,
             global_shape=global_shape,
             compressed=compressed,
+            dtype=arr.dtype,
             stream_starts=starts,
             stream_nbytes=nbytes,
             stream_offsets=offsets,
@@ -542,6 +557,11 @@ def write_hdf5(self, hgrp):
             None
 
         """
+        if self._is_int64:
+            n_channels = 2
+        else:
+            n_channels = 1
+
         hdf5_write_compressed(
             hgrp,
             self._leading_shape,
@@ -553,6 +573,7 @@ def write_hdf5(self, hgrp):
             self._stream_offsets,
             self._stream_gains,
             self._compressed,
+            n_channels,
             self._compressed.nbytes,
             self._global_nbytes,
             self._global_proc_nbytes,
@@ -604,6 +625,7 @@ def read_hdf5(
             local_shape,
             global_shape,
             compressed,
+            n_channels,
             stream_starts,
             stream_nbytes,
             stream_offsets,
@@ -617,11 +639,14 @@ def read_hdf5(
             mpi_dist=mpi_dist,
         )
 
+        dt = compressed_dtype(n_channels, stream_offsets, stream_gains)
+
         return FlacArray(
             None,
             shape=local_shape,
             global_shape=global_shape,
             compressed=compressed,
+            dtype=dt,
             stream_starts=stream_starts,
             stream_nbytes=stream_nbytes,
             stream_offsets=stream_offsets,
@@ -646,6 +671,10 @@ def write_zarr(self, zgrp):
             None
 
         """
+        if self._is_int64:
+            n_channels = 2
+        else:
+            n_channels = 1
         zarr_write_compressed(
             zgrp,
             self._leading_shape,
@@ -657,6 +686,7 @@ def write_zarr(self, zgrp):
             self._stream_offsets,
             self._stream_gains,
             self._compressed,
+            n_channels,
             self._compressed.nbytes,
             self._global_nbytes,
             self._global_proc_nbytes,
@@ -706,6 +736,7 @@ def read_zarr(
             local_shape,
             global_shape,
             compressed,
+            n_channels,
             stream_starts,
             stream_nbytes,
             stream_offsets,
@@ -719,11 +750,14 @@ def read_zarr(
             mpi_dist=mpi_dist,
         )
 
+        dt = compressed_dtype(n_channels, stream_offsets, stream_gains)
+
         return FlacArray(
             None,
             shape=local_shape,
             global_shape=global_shape,
             compressed=compressed,
+            dtype=dt,
             stream_starts=stream_starts,
             stream_nbytes=stream_nbytes,
             stream_offsets=stream_offsets,
 
@@ -5,17 +5,18 @@
 import numpy as np
 
 from .libflacarray import encode_flac
-from .utils import int64_to_int32, float_to_int32, function_timer
+from .utils import float_to_int, function_timer
 
 
 @function_timer
 def array_compress(arr, level=5, quanta=None, precision=None, use_threads=False):
     """Compress a numpy array with optional floating point conversion.
 
     If `arr` is an int32 array, the returned stream offsets and gains will be None.
-    if `arr` is an int64 array, the stream offsets will be the integer value subtracted
-    when converting to int32.  Both float32 and float64 data will have floating point
-    offset and gain arrays returned.
+    If `arr` is an int64 array, the returned stream offsets and gains will be None and
+    the calling code is responsible for tracking that the compressed bytes are
+    associated with a 64bit stream.  Both float32 and float64 data will have floating
+    point offset and gain arrays returned.
 
     Args:
         arr (numpy.ndarray):  The input array data.
@@ -55,17 +56,11 @@ def array_compress(arr, level=5, quanta=None, precision=None, use_threads=False)
     else:
         dquanta = None
 
-    if arr.dtype == np.dtype(np.int32):
+    if arr.dtype == np.dtype(np.int32) or arr.dtype == np.dtype(np.int64):
         (compressed, starts, nbytes) = encode_flac(arr, level, use_threads=use_threads)
         return (compressed, starts, nbytes, None, None)
-    elif arr.dtype == np.dtype(np.int64):
-        idata, ioff = int64_to_int32(arr)
-        (compressed, starts, nbytes) = encode_flac(
-            idata, level, use_threads=use_threads
-        )
-        return (compressed, starts, nbytes, ioff, None)
-    elif arr.dtype == np.dtype(np.float64) or arr.dtype == np.dtype(np.float32):
-        idata, foff, gains = float_to_int32(arr, quanta=dquanta, precision=precision)
+    elif arr.dtype == np.dtype(np.float32) or arr.dtype == np.dtype(np.float64):
+        idata, foff, gains = float_to_int(arr, quanta=dquanta, precision=precision)
         (compressed, starts, nbytes) = encode_flac(
             idata, level, use_threads=use_threads
         )
 
@@ -5,7 +5,7 @@
 import numpy as np
 
 from .libflacarray import decode_flac
-from .utils import int32_to_float, keep_select, function_timer, select_keep_indices
+from .utils import int_to_float, keep_select, function_timer, select_keep_indices
 
 
 @function_timer
@@ -19,14 +19,18 @@ def array_decompress_slice(
     keep=None,
     first_stream_sample=None,
     last_stream_sample=None,
+    is_int64=False,
     use_threads=False,
 ):
     """Decompress a slice of a FLAC encoded array and restore original data type.
 
-    If `stream_gains` is specified, the output data will be float32 and `stream_offsets`
-    must also be provided.  If `stream_gains` is not specified, but `stream_offsets` is,
-    then the returned data will be int64.  If neither offsets or gains are specified,
-    the decompressed int32 array is returned.
+    If both `stream_gains` and `stream_offsets` are specified, the output will be
+    floating point data.  If neither is specified, the output will be integer data.
+    It is an error to specify only one of those options.
+
+    The compressed byte stream might contain either int32 or int64 data, and the calling
+    code is responsible for tracking this.  The `is_int64` parameter should be set to
+    True if the byte stream contains 64bit integers.
 
     To decompress a subset of samples in all streams, specify the `first_stream_sample`
     and `last_stream_sample` values.  None values or negative values disable this
@@ -52,6 +56,7 @@ def array_decompress_slice(
         keep (array):  Bool array of streams to keep in the decompression.
         first_stream_sample (int):  The first sample of every stream to decompress.
         last_stream_sample (int):  The last sample of every stream to decompress.
+        is_int64 (bool):  If True, the compressed stream contains 64bit integers.
         use_threads (bool):  If True, use OpenMP threads to parallelize decoding.
             This is only beneficial for large arrays.
 
@@ -79,27 +84,19 @@ def array_decompress_slice(
                 first_sample=first_stream_sample,
                 last_sample=last_stream_sample,
                 use_threads=use_threads,
+                is_int64=is_int64,
             )
-            arr = int32_to_float(idata, offsets, gains)
+            arr = int_to_float(idata, offsets, gains)
         else:
-            # This is int64 data
-            idata = decode_flac(
-                compressed,
-                starts,
-                nbytes,
-                stream_size,
-                first_sample=first_stream_sample,
-                last_sample=last_stream_sample,
-                use_threads=use_threads,
+            raise RuntimeError(
+                "When specifying offsets, you must also provide the gains"
             )
-            ext_shape = offsets.shape + (1,)
-            arr = idata.astype(np.int64) + offsets.reshape(ext_shape)
     else:
         if stream_gains is not None:
             raise RuntimeError(
                 "When specifying gains, you must also provide the offsets"
             )
-        # This is int32 data
+        # This is integer data
         arr = decode_flac(
             compressed,
             starts,
@@ -108,6 +105,7 @@ def array_decompress_slice(
             first_sample=first_stream_sample,
             last_sample=last_stream_sample,
             use_threads=use_threads,
+            is_int64=is_int64,
         )
     return (arr, indices)
 
@@ -122,14 +120,18 @@ def array_decompress(
     stream_gains=None,
     first_stream_sample=None,
     last_stream_sample=None,
+    is_int64=False,
     use_threads=False,
 ):
     """Decompress a FLAC encoded array and restore original data type.
 
-    If `stream_gains` is specified, the output data will be float32 and `stream_offsets`
-    must also be provided.  If `stream_gains` is not specified, but `stream_offsets` is,
-    then the returned data will be int64.  If neither offsets or gains are specified,
-    the decompressed int32 array is returned.
+    If both `stream_gains` and `stream_offsets` are specified, the output will be
+    floating point data.  If neither is specified, the output will be integer data.
+    It is an error to specify only one of those options.
+
+    The compressed byte stream might contain either int32 or int64 data, and the calling
+    code is responsible for tracking this.  The `is_int64` parameter should be set to
+    True if the byte stream contains 64bit integers.
 
     To decompress a subset of samples in all streams, specify the `first_stream_sample`
     and `last_stream_sample` values.  None values or negative values disable this
@@ -144,6 +146,7 @@ def array_decompress(
         stream_gains (array):  The array of gains, one per stream.
         first_stream_sample (int):  The first sample of every stream to decompress.
         last_stream_sample (int):  The last sample of every stream to decompress.
+        is_int64 (bool):  If True, the compressed stream contains 64bit integers.
         use_threads (bool):  If True, use OpenMP threads to parallelize decoding.
             This is only beneficial for large arrays.
 
@@ -161,6 +164,7 @@ def array_decompress(
         keep=None,
         first_stream_sample=first_stream_sample,
         last_stream_sample=last_stream_sample,
+        is_int64=is_int64,
         use_threads=use_threads,
     )
     return arr