2 changes: 1 addition & 1 deletion gpm/__init__.py
@@ -40,7 +40,7 @@
define_configs,
read_configs,
)
from gpm.dataset.dataset import open_dataset, open_datatree # noqa
from gpm.dataset.dataset import open_dataset, open_datatree, open_files # noqa
from gpm.dataset.datatree import open_raw_datatree # noqa
from gpm.dataset.granule import open_granule, open_granule_dataset, open_granule_datatree # noqa
from gpm.dataset.tcprimed import open_granule_tcprimed # noqa
5 changes: 2 additions & 3 deletions gpm/bucket/dataframe.py
@@ -46,12 +46,11 @@ def pl_cut(values, bounds, include_lowest=True, right=True):
left_closed=not right, # left_closed=False equivalent of pandas right=True
include_breaks=False,
)

indices = indices.cast(float)
indices = indices.cast(str).cast(float)  # NaN values are represented as null
# Include values of first bins (include_lowest=True of pd.cut)
if include_lowest:
indices[values == bounds[0]] = 0
# Replace -1 and len(bounds)
# Replace -1 and len(bounds) with null
indices[indices == -1.0] = None
indices[indices == len(bounds) - 1] = None
indices[indices.is_nan()] = None
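
For reference, a minimal sketch of the pandas behaviour that pl_cut reproduces (the array values are invented for illustration):

import numpy as np
import pandas as pd

values = np.array([0.0, 0.5, 1.5, 2.5, np.nan])
bounds = [0, 1, 2]
codes = pd.cut(values, bounds, right=True, include_lowest=True).codes
# codes -> [0, 0, 1, -1, -1]: out-of-range values and NaN get code -1,
# which pl_cut maps to null as done above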
35 changes: 23 additions & 12 deletions gpm/dataset/conventions.py
@@ -124,6 +124,15 @@
return ds


def add_gpm_api_product(ds, product):
"""Add gpm_api_product attribute to Dataset and DataArray variables."""
product = "UNDEFINED" if product is None else product
ds.attrs["gpm_api_product"] = product
for var in ds.data_vars:
ds[var].attrs["gpm_api_product"] = product
return ds


def finalize_dataset(ds, product, decode_cf, scan_mode, start_time=None, end_time=None):
"""Finalize GPM xarray.Dataset object."""
import pyproj
@@ -141,69 +150,71 @@
# - Units --> units
# - Remove DimensionNames
# - Sanitize LongName --> description

# - Add <gpm_api_product> : <product> key : value
ds = standardize_dataarrays_attrs(ds, product)
ds = standardize_dataarrays_attrs(ds)

##------------------------------------------------------------------------.
# Decode dataset
# - With numpy > 2.0, the _FillValue attribute must be a numpy scalar so that CF decoding is applied
# - _FillValue is moved from attrs to encoding !
if decode_cf:
ds = apply_cf_decoding(ds)
if "time_bnds" in ds:
ds["time_bnds"] = ds["time_bnds"].astype("M8[ns]").compute()

##------------------------------------------------------------------------.
# Set relevant coordinates
# - Add range id, radar and pmw frequencies ...
ds = set_coordinates(ds, product, scan_mode)

###-----------------------------------------------------------------------.
## Check swath time coordinate
# --> Ensure validity of the time dimension
# - Infill up to 10 consecutive NaT
# - Do not check for regular time dimension !
ds = ensure_time_validity(ds, limit=10)

##------------------------------------------------------------------------.
# Set relevant coordinates
# - Add range id, radar and pmw frequencies ...
ds = set_coordinates(ds, product=product, scan_mode=scan_mode)

##------------------------------------------------------------------------.
# Add gpm_api product name to Dataset and DataArrays attributes
# - This is required in decode_variables for some products !
ds = add_gpm_api_product(ds, product)

##------------------------------------------------------------------------.
# Decode variables
if config.get("decode_variables"):
ds = decode_variables(ds, product)
if config.get("decode_variables") and product is not None:
ds = decode_variables(ds, product=product)

##------------------------------------------------------------------------.
# Add CF-compliant coordinates attributes and encoding
ds = set_coords_attrs(ds)

##------------------------------------------------------------------------.
# Add time encodings
encoding = {}
encoding["units"] = EPOCH
encoding["calendar"] = "proleptic_gregorian"
ds["time"].encoding = encoding
if "time_bnds" in ds:
ds["time_bnds"].encoding = encoding

##------------------------------------------------------------------------.
# Transpose to have (y, x) dimension order
ds = reshape_dataset(ds)

##------------------------------------------------------------------------.
# Add CF-compliant CRS information
# - See Geolocation toolkit ATBD at
# https://gpm.nasa.gov/sites/default/files/document_files/GPMGeolocationToolkitATBDv2.1-2012-07-31.pdf
# TODO: set_dataset_crs should be migrated to cf_xarray ideally
try:
crs = pyproj.CRS(proj="longlat", ellps="WGS84")
ds = set_dataset_crs(ds, crs=crs, grid_mapping_name="crsWGS84", inplace=False)
except Exception:
msg = "The CRS coordinate is not set because the dataset variables does not have 2D spatial dimensions."
warnings.warn(msg, GPM_Warning, stacklevel=2)

##------------------------------------------------------------------------.
# Add GPM-API global attributes
# Add history into dataset attributes
ds = add_history(ds)

❌ CodeScene Delta Analysis (line 217 in gpm/dataset/conventions.py) — Getting worse: Complex Method. finalize_dataset increases in cyclomatic complexity from 16 to 17 (threshold = 9).
ds.attrs["gpm_api_product"] = product

##------------------------------------------------------------------------.
# Subset dataset for start_time and end_time
26 changes: 21 additions & 5 deletions gpm/dataset/coords.py
@@ -51,29 +51,45 @@ def _get_orbit_scan_time(dt, scan_mode):

def get_orbit_coords(dt, scan_mode):
"""Get coordinates from Orbit objects."""
# Decode FileHeader string
attrs = decode_string(dt.attrs["FileHeader"])
# Retrieve Granule ID
granule_id = attrs["GranuleNumber"]

# Retrieve time and lat/lon coordinates
ds = dt[scan_mode]
time = _get_orbit_scan_time(dt, scan_mode)

lon = ds["Longitude"].data
lat = ds["Latitude"].data
n_along_track, n_cross_track = lon.shape

# Define other coordinates
shape = lon.shape
if len(shape) == 2:
n_along_track, n_cross_track = shape
geolocation_dims = ["along_track", "cross_track"]
else:  # 1D (along-track only, e.g. GMI-1A S3)
n_along_track = shape[0]
n_cross_track = 0
geolocation_dims = ["along_track"]

granule_id = np.repeat(granule_id, n_along_track)
along_track_id = np.arange(n_along_track)
cross_track_id = np.arange(n_cross_track)
gpm_id = [str(g) + "-" + str(z) for g, z in zip(granule_id, along_track_id, strict=False)]

return {
"lon": xr.DataArray(lon, dims=["along_track", "cross_track"]),
"lat": xr.DataArray(lat, dims=["along_track", "cross_track"]),
# Define dictionary with DataArray coordinates
dict_coords = {
"lon": xr.DataArray(lon, dims=geolocation_dims),
"lat": xr.DataArray(lat, dims=geolocation_dims),
"time": xr.DataArray(time, dims="along_track"),
"gpm_id": xr.DataArray(gpm_id, dims="along_track"),
"gpm_granule_id": xr.DataArray(granule_id, dims="along_track"),
"gpm_cross_track_id": xr.DataArray(cross_track_id, dims="cross_track"),
"gpm_along_track_id": xr.DataArray(along_track_id, dims="along_track"),
}
if n_cross_track == 0:
_ = dict_coords.pop("gpm_cross_track_id")
return dict_coords
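
A minimal sketch of how the gpm_id values are assembled (the granule number is hypothetical):

import numpy as np
granule_id = np.repeat(12345, 4)  # GranuleNumber repeated along track
along_track_id = np.arange(4)
gpm_id = [str(g) + "-" + str(z) for g, z in zip(granule_id, along_track_id)]
# -> ['12345-0', '12345-1', '12345-2', '12345-3']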


def get_time_delta_from_time_interval(time_interval):
87 changes: 87 additions & 0 deletions gpm/dataset/dataset.py
@@ -1,4 +1,4 @@
# -----------------------------------------------------------------------------.

❌ CodeScene Delta Analysis (line 1 in gpm/dataset/dataset.py) — New issue: Code Duplication. open_dataset and open_datatree have similar structure; duplicated code lowers code health.
❌ CodeScene Delta Analysis (line 1 in gpm/dataset/dataset.py) — New issue: Overall Code Complexity. Mean cyclomatic complexity of 6.57 across 7 functions (threshold = 4).
❌ CodeScene Delta Analysis (line 1 in gpm/dataset/dataset.py) — New issue: Missing Arguments Abstractions. Average of 8.00 function arguments across 7 functions (threshold = 4.00).
# MIT License

# Copyright (c) 2024 GPM-API developers
@@ -105,6 +105,9 @@
if parallel:
list_info = dask.compute(*list_info)

# Retrieve scan modes list
scan_modes = list(list_info[0][0])

# ----------------------------------------------------.
# Retrieve datatree closers
list_dt_closers = [dt_closer for _, dt_closer in list_info]
@@ -488,3 +491,87 @@

##------------------------------------------------------------------------.
return dt


def _infer_product_name(ds) -> str | None:
"""Infer product name from GPM Dataset attributes."""
from gpm.io.products import get_products_attributes_dict

products_dict = get_products_attributes_dict()
for product, attrs in products_dict.items():
if (
attrs["AlgorithmID"] == ds.attrs["AlgorithmID"]
and attrs["SatelliteName"] == ds.attrs["SatelliteName"]
and attrs["InstrumentName"] == ds.attrs["InstrumentName"]

Check warning on line 505 in gpm/dataset/dataset.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

❌ New issue: Complex Conditional

_infer_product_name has 1 complex conditionals with 2 branches, threshold = 2. A complex conditional is an expression inside a branch (e.g. if, for, while) which consists of multiple, logical operators such as AND/OR. The more logical operators in an expression, the more severe the code smell.
):
return product
return None
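
A hedged illustration of the matching logic (the attribute values below are hypothetical, not taken from a real granule):

ds_attrs = {"AlgorithmID": "2ADPR", "SatelliteName": "GPM", "InstrumentName": "DPR"}
# _infer_product_name returns the first product whose AlgorithmID, SatelliteName
# and InstrumentName attributes all match the dataset attributes, else None.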


def open_files(
filepaths,
parallel=False,
scan_modes=None,
groups=None,
variables=None,
prefix_group=False,
start_time=None,
end_time=None,
chunks=-1,
decode_cf=True,
**kwargs,
):
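"""Open one or multiple GPM granule files into an xarray DataTree with one node per scan mode."""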

##------------------------------------------------------------------------.
# Ensure filepaths is a list
if isinstance(filepaths, str):
filepaths = [filepaths]

##------------------------------------------------------------------------.
dict_scan_modes, list_dt_closers = _get_scan_modes_datasets_and_closers(
filepaths=filepaths,
parallel=parallel,
scan_modes=scan_modes,
decode_cf=False,
# Custom options
variables=variables,
groups=groups,
prefix_group=prefix_group,
chunks=chunks,
**kwargs,
)

# Retrieve scan_modes from dictionary
scan_modes = sorted(dict_scan_modes)

# Infer product from file
product = _infer_product_name(dict_scan_modes[scan_modes[0]])

# Warn if product is unknown
if product is None:
msg = "GPM-API didn't apply specialized variables decoding because product is unknown !"
warnings.warn(msg, GPM_Warning, stacklevel=2)

# Finalize datatree
dict_scan_modes = {
scan_mode: finalize_dataset(
ds=ds,
product=product,
scan_mode=scan_mode,
decode_cf=decode_cf,
start_time=start_time,
end_time=end_time,
)
for scan_mode, ds in dict_scan_modes.items()
}

# Create datatree
dt = xr.DataTree.from_dict(dict_scan_modes)

# Specify scan modes closers
for scan_mode, ds in dict_scan_modes.items():
dt[scan_mode].set_close(ds._close)

# Specify files closers
dt.set_close(partial(_multi_file_closer, list_dt_closers))
return dt
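
A possible usage sketch for the new open_files entry point (the file path and scan-mode names are hypothetical):

import gpm

dt = gpm.open_files("/data/GPM/2A.GPM.DPR.V9.HDF5")
print(list(dt))  # one DataTree node per scan mode, e.g. ['FS', 'HS']
ds = dt["FS"].to_dataset()  # extract a single scan mode as a Dataset
dt.close()  # close the underlying file handles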
38 changes: 20 additions & 18 deletions gpm/dataset/decoding/coordinates.py
@@ -246,23 +246,25 @@
ds = _parse_sun_local_time(ds)
ds = ds.set_coords("sunLocalTime")

#### PMW
# - 1B and 1C products
if product.startswith("1C") or product.startswith("1B"):
ds = _add_pmw_coordinates(ds, product, scan_mode)
# - Deal with incidenceAngleIndex in PMW 1C products
if product.startswith("1C"):
ds = _deal_with_pmw_incidence_angle_index(ds)
#### RADAR
if product in ["2A-DPR", "2A-Ku", "2A-Ka", "2A-PR", "2A-ENV-DPR", "2A-ENV-PR", "2A-ENV-Ka", "2A-ENV-Ku"]:
ds = _add_radar_coordinates(ds, product, scan_mode)

#### CMB
if product in ["2B-GPM-CORRA", "2B-TRMM-CORRA"]:
ds = _add_cmb_coordinates(ds, product, scan_mode)

#### SLH and CSH products
if product in ["2A-GPM-SLH", "2B-GPM-CSH"] and "range" in list(ds.dims):
ds = add_lh_height(ds)
# Add specific coordinates depending on product
if product is not None:
#### PMW
# - 1B and 1C products
if product.startswith("1C") or product.startswith("1B"):
ds = _add_pmw_coordinates(ds, product, scan_mode)
# - Deal with incidenceAngleIndex in PMW 1C products
if product.startswith("1C"):
ds = _deal_with_pmw_incidence_angle_index(ds)
#### RADAR
if product in ["2A-DPR", "2A-Ku", "2A-Ka", "2A-PR", "2A-ENV-DPR", "2A-ENV-PR", "2A-ENV-Ka", "2A-ENV-Ku"]:
ds = _add_radar_coordinates(ds, product, scan_mode)

#### CMB
if product in ["2B-GPM-CORRA", "2B-TRMM-CORRA"]:
ds = _add_cmb_coordinates(ds, product, scan_mode)

#### SLH and CSH products
if product in ["2A-GPM-SLH", "2B-GPM-CSH"] and "range" in list(ds.dims):
ds = add_lh_height(ds)

❌ CodeScene Delta Analysis (line 268 in gpm/dataset/decoding/coordinates.py) — Getting worse: Complex Method. set_coordinates increases in cyclomatic complexity from 11 to 12 (threshold = 9).

return ds
10 changes: 3 additions & 7 deletions gpm/dataset/decoding/dataarray_attrs.py
@@ -94,7 +94,7 @@ def _sanitize_attributes(attrs):
return attrs


def _format_dataarray_attrs(da, product=None):
def _format_dataarray_attrs(da):
attrs = da.attrs

# Ensure fill values are numbers
@@ -116,20 +116,16 @@ def _format_dataarray_attrs(da, product=None):
if "source_dtype" not in attrs and "dtype" in da.encoding:
attrs["source_dtype"] = da.encoding["dtype"]

# Add gpm_api product name
if product is not None:
attrs["gpm_api_product"] = product

# Attach attributes
da.attrs = attrs

return da


def standardize_dataarrays_attrs(ds, product):
def standardize_dataarrays_attrs(ds):
# Sanitize variable attributes
for var, da in ds.items():
ds[var] = _format_dataarray_attrs(da, product)
ds[var] = _format_dataarray_attrs(da)

# Drop attributes from bounds coordinates
# - https://github.com/pydata/xarray/issues/8368
36 changes: 25 additions & 11 deletions gpm/dataset/granule.py
@@ -231,7 +231,7 @@ def _get_scan_mode_dataset(
return ds


def get_scan_modes_datasets(filepath, scan_modes, groups, variables, decode_cf, chunks, prefix_group, **kwargs):
def get_scan_modes_datasets(filepath, groups, variables, decode_cf, chunks, prefix_group, scan_modes=None, **kwargs):
"""Return a dictionary with a dataset for each scan mode."""
from gpm.dataset.datatree import open_raw_datatree
from gpm.dataset.granule import _get_scan_mode_dataset
@@ -240,17 +240,31 @@ def get_scan_modes_datasets(filepath, scan_modes, groups, variables, decode_cf,
dt = open_raw_datatree(filepath=filepath, chunks=chunks, decode_cf=decode_cf, use_api_defaults=True, **kwargs)
dt_closer = dt._close

# List scan modes if not specified
if scan_modes is None:
nodes = list(dt)
invalid_nodes = [
"gmi1aHeader",
"tmi1aHeader",
"DiagGroup",
"AlgorithmRuntimeInfo",
"GprofDHeadr",
]
scan_modes = set(nodes) - set(invalid_nodes)

# Retrieve scan mode dataset (without cf decoding)
dict_scan_modes = {
scan_mode: _get_scan_mode_dataset(
dt=dt,
scan_mode=scan_mode,
groups=groups,
variables=variables,
prefix_group=prefix_group,
)
for scan_mode in scan_modes
}
dict_scan_modes = {}
for scan_mode in scan_modes:
try:
dict_scan_modes[scan_mode] = _get_scan_mode_dataset(
dt=dt,
scan_mode=scan_mode,
groups=groups,
variables=variables,
prefix_group=prefix_group,
)
except Exception as e:
print(f"Skipping scan mode {scan_mode}: {e}")

return dict_scan_modes, dt_closer
