2 changes: 1 addition & 1 deletion gpm/__init__.py
@@ -40,7 +40,7 @@
define_configs,
read_configs,
)
from gpm.dataset.dataset import open_dataset, open_datatree # noqa
from gpm.dataset.dataset import open_dataset, open_datatree, open_files # noqa
from gpm.dataset.datatree import open_raw_datatree # noqa
from gpm.dataset.granule import open_granule, open_granule_dataset, open_granule_datatree # noqa
from gpm.dataset.tcprimed import open_granule_tcprimed # noqa
5 changes: 2 additions & 3 deletions gpm/bucket/dataframe.py
@@ -46,12 +46,11 @@ def pl_cut(values, bounds, include_lowest=True, right=True):
left_closed=not right, # left_closed=False equivalent of pandas right=True
include_breaks=False,
)

indices = indices.cast(float)
indices = indices.cast(str).cast(float)  # NaN values are represented as null
# Include values of first bins (include_lowest=True of pd.cut)
if include_lowest:
indices[values == bounds[0]] = 0
# Replace -1 and len(bounds)
# Replace -1 and len(bounds) with null
indices[indices == -1.0] = None
indices[indices == len(bounds) - 1] = None
indices[indices.is_nan()] = None
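
For reference, a minimal sketch of the pandas behaviour that pl_cut reproduces (the array values are invented for illustration):

import numpy as np
import pandas as pd

values = np.array([0.0, 0.5, 1.5, 2.5, np.nan])
bounds = [0, 1, 2]
codes = pd.cut(values, bounds, right=True, include_lowest=True).codes
# codes -> [0, 0, 1, -1, -1]: out-of-range values and NaN get code -1,
# which pl_cut maps to null as done above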
35 changes: 23 additions & 12 deletions gpm/dataset/conventions.py
@@ -124,6 +124,15 @@
return ds


def add_gpm_api_product(ds, product):
"""Add gpm_api_product attribute to Dataset and DataArray variables."""
product = "UNDEFINED" if product is None else product
ds.attrs["gpm_api_product"] = product
for var in ds.data_vars:
ds[var].attrs["gpm_api_product"] = product
return ds


def finalize_dataset(ds, product, decode_cf, scan_mode, start_time=None, end_time=None):
"""Finalize GPM xarray.Dataset object."""
import pyproj
@@ -141,69 +150,71 @@
# - Units --> units
# - Remove DimensionNames
# - Sanitize LongName --> description

# - Add <gpm_api_product> : <product> key : value
ds = standardize_dataarrays_attrs(ds, product)
ds = standardize_dataarrays_attrs(ds)

##------------------------------------------------------------------------.
# Decode dataset
# - With numpy > 2.0, the _FillValue attribute must be a numpy scalar so that CF decoding is applied
# - _FillValue is moved from attrs to encoding !
if decode_cf:
ds = apply_cf_decoding(ds)
if "time_bnds" in ds:
ds["time_bnds"] = ds["time_bnds"].astype("M8[ns]").compute()

##------------------------------------------------------------------------.
# Set relevant coordinates
# - Add range id, radar and pmw frequencies ...
ds = set_coordinates(ds, product, scan_mode)

###-----------------------------------------------------------------------.
## Check swath time coordinate
# --> Ensure validity of the time dimension
# - Infill up to 10 consecutive NaT
# - Do not check for regular time dimension !
ds = ensure_time_validity(ds, limit=10)

##------------------------------------------------------------------------.
# Set relevant coordinates
# - Add range id, radar and pmw frequencies ...
ds = set_coordinates(ds, product=product, scan_mode=scan_mode)

##------------------------------------------------------------------------.
# Add gpm_api product name to Dataset and DataArrays attributes
# - This is required in decode_variables for some products !
ds = add_gpm_api_product(ds, product)

##------------------------------------------------------------------------.
# Decode variables
if config.get("decode_variables"):
ds = decode_variables(ds, product)
if config.get("decode_variables") and product is not None:
ds = decode_variables(ds, product=product)

##------------------------------------------------------------------------.
# Add CF-compliant coordinates attributes and encoding
ds = set_coords_attrs(ds)

##------------------------------------------------------------------------.
# Add time encodings
encoding = {}
encoding["units"] = EPOCH
encoding["calendar"] = "proleptic_gregorian"
ds["time"].encoding = encoding
if "time_bnds" in ds:
ds["time_bnds"].encoding = encoding

##------------------------------------------------------------------------.
# Transpose to have (y, x) dimension order
ds = reshape_dataset(ds)

##------------------------------------------------------------------------.
# Add CF-compliant CRS information
# - See Geolocation toolkit ATBD at
# https://gpm.nasa.gov/sites/default/files/document_files/GPMGeolocationToolkitATBDv2.1-2012-07-31.pdf
# TODO: set_dataset_crs should be migrated to cf_xarray ideally
try:
crs = pyproj.CRS(proj="longlat", ellps="WGS84")
ds = set_dataset_crs(ds, crs=crs, grid_mapping_name="crsWGS84", inplace=False)
except Exception:
msg = "The CRS coordinate is not set because the dataset variables does not have 2D spatial dimensions."
warnings.warn(msg, GPM_Warning, stacklevel=2)

##------------------------------------------------------------------------.
# Add GPM-API global attributes
# Add history into dataset attributes
ds = add_history(ds)

❌ CodeScene Delta Analysis (line 217 in gpm/dataset/conventions.py) — Getting worse: Complex Method. finalize_dataset increases in cyclomatic complexity from 16 to 17 (threshold = 9).
ds.attrs["gpm_api_product"] = product

##------------------------------------------------------------------------.
# Subset dataset for start_time and end_time
26 changes: 21 additions & 5 deletions gpm/dataset/coords.py
@@ -51,29 +51,45 @@ def _get_orbit_scan_time(dt, scan_mode):

def get_orbit_coords(dt, scan_mode):
"""Get coordinates from Orbit objects."""
# Decode FileHeader string
attrs = decode_string(dt.attrs["FileHeader"])
# Retrieve Granule ID
granule_id = attrs["GranuleNumber"]

# Retrieve time and lat/lon coordinates
ds = dt[scan_mode]
time = _get_orbit_scan_time(dt, scan_mode)

lon = ds["Longitude"].data
lat = ds["Latitude"].data
n_along_track, n_cross_track = lon.shape

# Define other coordinates
shape = lon.shape
if len(shape) == 2:
n_along_track, n_cross_track = shape
geolocation_dims = ["along_track", "cross_track"]
else:  # 1D (along-track only, e.g. GMI-1A S3)
n_along_track = shape[0]
n_cross_track = 0
geolocation_dims = ["along_track"]

granule_id = np.repeat(granule_id, n_along_track)
along_track_id = np.arange(n_along_track)
cross_track_id = np.arange(n_cross_track)
gpm_id = [str(g) + "-" + str(z) for g, z in zip(granule_id, along_track_id, strict=False)]

return {
"lon": xr.DataArray(lon, dims=["along_track", "cross_track"]),
"lat": xr.DataArray(lat, dims=["along_track", "cross_track"]),
# Define dictionary with DataArray coordinates
dict_coords = {
"lon": xr.DataArray(lon, dims=geolocation_dims),
"lat": xr.DataArray(lat, dims=geolocation_dims),
"time": xr.DataArray(time, dims="along_track"),
"gpm_id": xr.DataArray(gpm_id, dims="along_track"),
"gpm_granule_id": xr.DataArray(granule_id, dims="along_track"),
"gpm_cross_track_id": xr.DataArray(cross_track_id, dims="cross_track"),
"gpm_along_track_id": xr.DataArray(along_track_id, dims="along_track"),
}
if n_cross_track == 0:
_ = dict_coords.pop("gpm_cross_track_id")
return dict_coords
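
A minimal sketch of how the gpm_id values are assembled (the granule number is hypothetical):

import numpy as np
granule_id = np.repeat(12345, 4)  # GranuleNumber repeated along track
along_track_id = np.arange(4)
gpm_id = [str(g) + "-" + str(z) for g, z in zip(granule_id, along_track_id)]
# -> ['12345-0', '12345-1', '12345-2', '12345-3']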


def get_time_delta_from_time_interval(time_interval):
87 changes: 87 additions & 0 deletions gpm/dataset/dataset.py
@@ -1,4 +1,4 @@
# -----------------------------------------------------------------------------.

❌ CodeScene Delta Analysis (line 1 in gpm/dataset/dataset.py) — New issue: Code Duplication. open_dataset and open_datatree have similar structure; duplicated code lowers code health.
❌ CodeScene Delta Analysis (line 1 in gpm/dataset/dataset.py) — New issue: Overall Code Complexity. Mean cyclomatic complexity of 6.57 across 7 functions (threshold = 4).
❌ CodeScene Delta Analysis (line 1 in gpm/dataset/dataset.py) — New issue: Missing Arguments Abstractions. Average of 8.00 function arguments across 7 functions (threshold = 4.00).
# MIT License

# Copyright (c) 2024 GPM-API developers
@@ -105,6 +105,9 @@
if parallel:
list_info = dask.compute(*list_info)

# Retrieve scan modes list
scan_modes = list(list_info[0][0])

# ----------------------------------------------------.
# Retrieve datatree closers
list_dt_closers = [dt_closer for _, dt_closer in list_info]
@@ -488,3 +491,87 @@

##------------------------------------------------------------------------.
return dt


def _infer_product_name(ds) -> str | None:
"""Infer product name from GPM Dataset attributes."""
from gpm.io.products import get_products_attributes_dict

products_dict = get_products_attributes_dict()
for product, attrs in products_dict.items():
if (
attrs["AlgorithmID"] == ds.attrs["AlgorithmID"]
and attrs["SatelliteName"] == ds.attrs["SatelliteName"]
and attrs["InstrumentName"] == ds.attrs["InstrumentName"]

Check warning on line 505 in gpm/dataset/dataset.py

View check run for this annotation

CodeScene Delta Analysis / CodeScene Cloud Delta Analysis (main)

❌ New issue: Complex Conditional

_infer_product_name has 1 complex conditionals with 2 branches, threshold = 2. A complex conditional is an expression inside a branch (e.g. if, for, while) which consists of multiple, logical operators such as AND/OR. The more logical operators in an expression, the more severe the code smell.
):
return product
return None
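
A hedged illustration of the matching logic (the attribute values below are hypothetical, not taken from a real granule):

ds_attrs = {"AlgorithmID": "2ADPR", "SatelliteName": "GPM", "InstrumentName": "DPR"}
# _infer_product_name returns the first product whose AlgorithmID, SatelliteName
# and InstrumentName attributes all match the dataset attributes, else None.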


def open_files(
filepaths,
parallel=False,
scan_modes=None,
groups=None,
variables=None,
prefix_group=False,
start_time=None,
end_time=None,
chunks=-1,
decode_cf=True,
**kwargs,
):
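"""Open one or multiple GPM granule files into an xarray DataTree with one node per scan mode."""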

##------------------------------------------------------------------------.
# Ensure filepaths is a list
if isinstance(filepaths, str):
filepaths = [filepaths]

##------------------------------------------------------------------------.
dict_scan_modes, list_dt_closers = _get_scan_modes_datasets_and_closers(
filepaths=filepaths,
parallel=parallel,
scan_modes=scan_modes,
decode_cf=False,
# Custom options
variables=variables,
groups=groups,
prefix_group=prefix_group,
chunks=chunks,
**kwargs,
)

# Retrieve scan_modes from dictionary
scan_modes = sorted(dict_scan_modes)

# Infer product from file
product = _infer_product_name(dict_scan_modes[scan_modes[0]])

# Warn if product is unknown
if product is None:
msg = "GPM-API didn't apply specialized variables decoding because product is unknown !"
warnings.warn(msg, GPM_Warning, stacklevel=2)

# Finalize datatree
dict_scan_modes = {
scan_mode: finalize_dataset(
ds=ds,
product=product,
scan_mode=scan_mode,
decode_cf=decode_cf,
start_time=start_time,
end_time=end_time,
)
for scan_mode, ds in dict_scan_modes.items()
}

# Create datatree
dt = xr.DataTree.from_dict(dict_scan_modes)

# Specify scan modes closers
for scan_mode, ds in dict_scan_modes.items():
dt[scan_mode].set_close(ds._close)

# Specify files closers
dt.set_close(partial(_multi_file_closer, list_dt_closers))
return dt
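
A possible usage sketch for the new open_files entry point (the file path and scan-mode names are hypothetical):

import gpm

dt = gpm.open_files("/data/GPM/2A.GPM.DPR.V9.HDF5")
print(list(dt))  # one DataTree node per scan mode, e.g. ['FS', 'HS']
ds = dt["FS"].to_dataset()  # extract a single scan mode as a Dataset
dt.close()  # close the underlying file handles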
38 changes: 20 additions & 18 deletions gpm/dataset/decoding/coordinates.py
@@ -246,23 +246,25 @@
ds = _parse_sun_local_time(ds)
ds = ds.set_coords("sunLocalTime")

#### PMW
# - 1B and 1C products
if product.startswith("1C") or product.startswith("1B"):
ds = _add_pmw_coordinates(ds, product, scan_mode)
# - Deal with incidenceAngleIndex in PMW 1C products
if product.startswith("1C"):
ds = _deal_with_pmw_incidence_angle_index(ds)
#### RADAR
if product in ["2A-DPR", "2A-Ku", "2A-Ka", "2A-PR", "2A-ENV-DPR", "2A-ENV-PR", "2A-ENV-Ka", "2A-ENV-Ku"]:
ds = _add_radar_coordinates(ds, product, scan_mode)

#### CMB
if product in ["2B-GPM-CORRA", "2B-TRMM-CORRA"]:
ds = _add_cmb_coordinates(ds, product, scan_mode)

#### SLH and CSH products
if product in ["2A-GPM-SLH", "2B-GPM-CSH"] and "range" in list(ds.dims):
ds = add_lh_height(ds)
# Add specific coordinates depending on product
if product is not None:
#### PMW
# - 1B and 1C products
if product.startswith("1C") or product.startswith("1B"):
ds = _add_pmw_coordinates(ds, product, scan_mode)
# - Deal with incidenceAngleIndex in PMW 1C products
if product.startswith("1C"):
ds = _deal_with_pmw_incidence_angle_index(ds)
#### RADAR
if product in ["2A-DPR", "2A-Ku", "2A-Ka", "2A-PR", "2A-ENV-DPR", "2A-ENV-PR", "2A-ENV-Ka", "2A-ENV-Ku"]:
ds = _add_radar_coordinates(ds, product, scan_mode)

#### CMB
if product in ["2B-GPM-CORRA", "2B-TRMM-CORRA"]:
ds = _add_cmb_coordinates(ds, product, scan_mode)

#### SLH and CSH products
if product in ["2A-GPM-SLH", "2B-GPM-CSH"] and "range" in list(ds.dims):
ds = add_lh_height(ds)

❌ CodeScene Delta Analysis (line 268 in gpm/dataset/decoding/coordinates.py) — Getting worse: Complex Method. set_coordinates increases in cyclomatic complexity from 11 to 12 (threshold = 9).

return ds
10 changes: 3 additions & 7 deletions gpm/dataset/decoding/dataarray_attrs.py
@@ -94,7 +94,7 @@ def _sanitize_attributes(attrs):
return attrs


def _format_dataarray_attrs(da, product=None):
def _format_dataarray_attrs(da):
attrs = da.attrs

# Ensure fill values are numbers
@@ -116,20 +116,16 @@ def _format_dataarray_attrs(da, product=None):
if "source_dtype" not in attrs and "dtype" in da.encoding:
attrs["source_dtype"] = da.encoding["dtype"]

# Add gpm_api product name
if product is not None:
attrs["gpm_api_product"] = product

# Attach attributes
da.attrs = attrs

return da


def standardize_dataarrays_attrs(ds, product):
def standardize_dataarrays_attrs(ds):
# Sanitize variable attributes
for var, da in ds.items():
ds[var] = _format_dataarray_attrs(da, product)
ds[var] = _format_dataarray_attrs(da)

# Drop attributes from bounds coordinates
# - https://github.com/pydata/xarray/issues/8368
36 changes: 25 additions & 11 deletions gpm/dataset/granule.py
@@ -231,7 +231,7 @@ def _get_scan_mode_dataset(
return ds


def get_scan_modes_datasets(filepath, scan_modes, groups, variables, decode_cf, chunks, prefix_group, **kwargs):
def get_scan_modes_datasets(filepath, groups, variables, decode_cf, chunks, prefix_group, scan_modes=None, **kwargs):
"""Return a dictionary with a dataset for each scan mode."""
from gpm.dataset.datatree import open_raw_datatree
from gpm.dataset.granule import _get_scan_mode_dataset
@@ -240,17 +240,31 @@ def get_scan_modes_datasets(filepath, scan_modes, groups, variables, decode_cf,
dt = open_raw_datatree(filepath=filepath, chunks=chunks, decode_cf=decode_cf, use_api_defaults=True, **kwargs)
dt_closer = dt._close

# List scan modes if not specified
if scan_modes is None:
nodes = list(dt)
invalid_nodes = [
"gmi1aHeader",
"tmi1aHeader",
"DiagGroup",
"AlgorithmRuntimeInfo",
"GprofDHeadr",
]
scan_modes = set(nodes) - set(invalid_nodes)

# Retrieve scan mode dataset (without cf decoding)
dict_scan_modes = {
scan_mode: _get_scan_mode_dataset(
dt=dt,
scan_mode=scan_mode,
groups=groups,
variables=variables,
prefix_group=prefix_group,
)
for scan_mode in scan_modes
}
dict_scan_modes = {}
for scan_mode in scan_modes:
try:
dict_scan_modes[scan_mode] = _get_scan_mode_dataset(
dt=dt,
scan_mode=scan_mode,
groups=groups,
variables=variables,
prefix_group=prefix_group,
)
except Exception as e:
print(f"Skipping scan mode {scan_mode}: {e}")

return dict_scan_modes, dt_closer
