Skip to content

Commit 93e22ec

Browse files
authored
Cloud ingestion optimizations (#738)
* Working tests with fake server
* Fix possible import issues
* Cleanup
* Strip out local gcs fake server and revert some regressions
* Put max workers config back in worker processes as setting globally did not appear to be fully honored
* Reordering to match original code
* pre-commit and cleanup warnings
* Remove unnecessary global `SegyFile`
1 parent e10c50c commit 93e22ec

File tree

2 files changed: +53 additions, −43 deletions

src/mdio/segy/_workers.py

Lines changed: 17 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,15 @@
88
import numpy as np
99
from segy.arrays import HeaderArray
1010

11-
from mdio.api.io import _normalize_storage_options
1211
from mdio.core.config import MDIOSettings
1312
from mdio.segy._raw_trace_wrapper import SegyFileRawTraceWrapper
1413
from mdio.segy.file import SegyFileArguments
1514
from mdio.segy.file import SegyFileWrapper
1615

1716
if TYPE_CHECKING:
18-
from upath import UPath
17+
from segy import SegyFile
1918
from zarr import Array as zarr_Array
2019

21-
from zarr import open_group as zarr_open_group
2220
from zarr.core.config import config as zarr_config
2321

2422
from mdio.builder.schemas.v1.stats import CenteredBinHistogram
@@ -71,26 +69,30 @@ def header_scan_worker(
7169

7270

7371
def trace_worker( # noqa: PLR0913
74-
segy_file_kwargs: SegyFileArguments,
75-
output_path: UPath,
76-
data_variable_name: str,
72+
segy_file: SegyFile,
73+
data_array: zarr_Array,
74+
header_array: zarr_Array | None,
75+
raw_header_array: zarr_Array | None,
7776
region: dict[str, slice],
7877
grid_map: zarr_Array,
7978
) -> SummaryStatistics | None:
8079
"""Writes a subset of traces from a region of the dataset of Zarr file.
8180
8281
Args:
83-
segy_file_kwargs: Arguments to open SegyFile instance.
84-
output_path: Universal Path for the output Zarr dataset
85-
(e.g. local file path or cloud storage URI) the location
86-
also includes storage options for cloud storage.
87-
data_variable_name: Name of the data variable to write.
82+
segy_file: The opened SEG-Y file.
83+
data_array: Zarr array for writing trace data.
84+
header_array: Zarr array for writing trace headers (or None if not needed).
85+
raw_header_array: Zarr array for writing raw headers (or None if not needed).
8886
region: Region of the dataset to write to.
8987
grid_map: Zarr array mapping live traces to their positions in the dataset.
9088
9189
Returns:
9290
SummaryStatistics object containing statistics about the written traces.
9391
"""
92+
# Setting the zarr config to 1 thread to ensure we honor the `MDIO__IMPORT__CPU_COUNT` environment variable.
93+
# The Zarr 3 engine utilizes multiple threads. This can lead to resource contention and unpredictable memory usage.
94+
zarr_config.set({"threading.max_workers": 1})
95+
9496
region_slices = tuple(region.values())
9597
local_grid_map = grid_map[region_slices[:-1]] # minus last (vertical) axis
9698

@@ -100,26 +102,8 @@ def trace_worker( # noqa: PLR0913
100102
if not not_null.any():
101103
return None
102104

103-
# Open the SEG-Y file in this process since the open file handles cannot be shared across processes.
104-
segy_file = SegyFileWrapper(**segy_file_kwargs)
105-
106-
# Setting the zarr config to 1 thread to ensure we honor the `MDIO__IMPORT__MAX_WORKERS` environment variable.
107-
# The Zarr 3 engine utilizes multiple threads. This can lead to resource contention and unpredictable memory usage.
108-
zarr_config.set({"threading.max_workers": 1})
109-
110105
live_trace_indexes = local_grid_map[not_null].tolist()
111106

112-
# Open the zarr group to write directly
113-
storage_options = _normalize_storage_options(output_path)
114-
zarr_group = zarr_open_group(output_path.as_posix(), mode="r+", storage_options=storage_options)
115-
116-
header_key = "headers"
117-
raw_header_key = "raw_headers"
118-
119-
# Check which variables exist in the zarr store
120-
available_arrays = list(zarr_group.array_keys())
121-
122-
# traces = segy_file.trace[live_trace_indexes]
123107
# Raw headers are not intended to remain as a feature of the SEGY ingestion.
124108
# For that reason, we have wrapped the accessors to provide an interface that can be removed
125109
# and not require additional changes to the below code.
@@ -132,24 +116,21 @@ def trace_worker( # noqa: PLR0913
132116
full_shape = tuple(s.stop - s.start for s in region_slices)
133117
header_shape = tuple(s.stop - s.start for s in header_region_slices)
134118

135-
# Write raw headers if they exist
119+
# Write raw headers if array was provided
136120
# Headers only have spatial dimensions (no sample dimension)
137-
if raw_header_key in available_arrays:
138-
raw_header_array = zarr_group[raw_header_key]
121+
if raw_header_array is not None:
139122
tmp_raw_headers = np.full(header_shape, raw_header_array.fill_value)
140123
tmp_raw_headers[not_null] = traces.raw_header
141124
raw_header_array[header_region_slices] = tmp_raw_headers
142125

143-
# Write headers if they exist
126+
# Write headers if array was provided
144127
# Headers only have spatial dimensions (no sample dimension)
145-
if header_key in available_arrays:
146-
header_array = zarr_group[header_key]
128+
if header_array is not None:
147129
tmp_headers = np.full(header_shape, header_array.fill_value)
148130
tmp_headers[not_null] = traces.header
149131
header_array[header_region_slices] = tmp_headers
150132

151133
# Write the data variable
152-
data_array = zarr_group[data_variable_name]
153134
tmp_samples = np.full(full_shape, data_array.fill_value)
154135
tmp_samples[not_null] = traces.sample
155136
data_array[region_slices] = tmp_samples

src/mdio/segy/blocked_io.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import zarr
1313
from dask.array import Array
1414
from dask.array import map_blocks
15+
from segy import SegyFile
1516
from tqdm.auto import tqdm
1617
from zarr import open_group as zarr_open_group
1718

@@ -80,18 +81,48 @@ def to_zarr( # noqa: PLR0913, PLR0915
8081
chunk_iter = ChunkIterator(shape=data.shape, chunks=worker_chunks, dim_names=data.dims)
8182
num_chunks = chunk_iter.num_chunks
8283

84+
zarr_format = zarr.config.get("default_zarr_format")
85+
86+
# Open zarr group once in main process
87+
storage_options = _normalize_storage_options(output_path)
88+
zarr_group = zarr_open_group(
89+
output_path.as_posix(),
90+
mode="r+",
91+
storage_options=storage_options,
92+
use_consolidated=zarr_format == ZarrFormat.V2,
93+
)
94+
95+
# Get array handles from the opened group
96+
data_array = zarr_group[data_variable_name]
97+
header_array = zarr_group.get("headers")
98+
raw_header_array = zarr_group.get("raw_headers")
99+
83100
# For Unix async writes with s3fs/fsspec & multiprocessing, use 'spawn' instead of default
84101
# 'fork' to avoid deadlocks on cloud stores. Slower but necessary. Default on Windows.
85102
num_workers = min(num_chunks, settings.import_cpus)
86103
context = mp.get_context("spawn")
87-
executor = ProcessPoolExecutor(max_workers=num_workers, mp_context=context)
104+
105+
# Use initializer to open segy file once per worker
106+
executor = ProcessPoolExecutor(
107+
max_workers=num_workers,
108+
mp_context=context,
109+
)
110+
111+
segy_file = SegyFile(**segy_file_kwargs)
88112

89113
with executor:
90114
futures = []
91-
common_args = (segy_file_kwargs, output_path, data_variable_name)
92115
for region in chunk_iter:
93-
subset_args = (region, grid_map)
94-
future = executor.submit(trace_worker, *common_args, *subset_args)
116+
# Pass zarr array handles directly to workers
117+
future = executor.submit(
118+
trace_worker,
119+
segy_file,
120+
data_array,
121+
header_array,
122+
raw_header_array,
123+
region,
124+
grid_map,
125+
)
95126
futures.append(future)
96127

97128
iterable = tqdm(
@@ -106,11 +137,9 @@ def to_zarr( # noqa: PLR0913, PLR0915
106137
if result is not None:
107138
_update_stats(final_stats, result)
108139

140+
# Update statistics using the already-open zarr group
109141
# Xarray doesn't directly support incremental attribute updates when appending to an existing Zarr store.
110142
# HACK: We will update the array attribute using zarr's API directly.
111-
# Use the data_variable_name to get the array in the Zarr group and write "statistics" metadata there
112-
storage_options = _normalize_storage_options(output_path)
113-
zarr_group = zarr_open_group(output_path.as_posix(), mode="a", storage_options=storage_options)
114143
attr_json = final_stats.model_dump_json()
115144
zarr_group[data_variable_name].attrs.update({"statsV1": attr_json})
116145

0 commit comments

Comments (0)