Merge pull request #44 from trchudley/development

trchudley · web-flow · commit 967126a7003d · 2025-05-07T16:43:36.000+01:00
v1.0.1
diff --git a/docs/appendix/faq.md b/docs/appendix/faq.md
@@ -17,3 +17,7 @@ We would encourage users who are interested in larger-scale analysis to get in t
 Strips can appear empty (i.e. `NaN` values) when photogrammetry fails (due to cloud, water, etc) or it is masked by the PGC bitmask. Data previewed by the `pdt.load.preview()` function is masked according to this bitmask, and data loaded by the `pdt.load.from_search()` function is also masked by default (this can be disabled by using the `bitmask = False` option).
 
 As a result, it is entirely possible that the `search()` function can return a valid datastrip that covers a sufficient proportion of the AOI to meet the `min_aoi_frac` requirements, but it will appear empty (i.e. all-`NaN`) when viewed. The `preview()` function will help you identify these 'empty' scenes, but there may still be (poor-quality) data present if it is downloaded using the `load.from_search()` function with `bitmask = False`.
+
+#### Q: Can I load DEMs as `dask` arrays, and/or enable lazy evaluation for downloads?
+
+Yes! Like the [`rioxarray.open_rasterio()`](https://corteva.github.io/rioxarray/html/rioxarray.html#rioxarray-open-rasterio) function they wrap, `load.from_search()`, `load.from_id()`, `load.mosaic()`, and `load.from_fpath()` accept a `chunks` argument (e.g. `(50, 50)`, `True`, `"auto"`), which triggers loading as a `dask` array (and, as a bonus, results in 'lazy evaluation', where the data is not downloaded or computed until it is needed by a further command).
diff --git a/notebooks/get_icebergs.ipynb b/notebooks/get_icebergs.ipynb
@@ -175,7 +175,6 @@
     "\n",
     "if not os.path.exists(out_fpath):\n",
     "    dem = pdt.load.from_search(gdf.iloc[i], bounds=bounds, bitmask=True)\n",
-    "    dem.compute()  # rioxarray uses lazy evaluation, so we can force the download using the `.compute()` function.\n",
     "    dem.rio.to_raster(out_fpath, compress='ZSTD', predictor=3, zlevel=1)\n",
     "    \n",
     "else:\n",
@@ -492,7 +491,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.10"
   }
  },
  "nbformat": 4,
diff --git a/notebooks/strip_search.ipynb b/notebooks/strip_search.ipynb
@@ -514,9 +514,7 @@
     "    if not os.path.exists(out_fpath):\n",
     "        \n",
     "        dem = pdt.load.from_search(gdf_row, bounds=bounds, bitmask=True)\n",
-    "        \n",
-    "        dem.compute()  # rioxarray uses lazy evaluation, so we can force the download using the `.compute()` function.\n",
-    "        \n",
+    "                \n",
     "        dem.rio.to_raster(out_fpath, compress='ZSTD', predictor=3, zlevel=1)\n",
     "    \n",
     "        return dem\n",
@@ -533,6 +531,9 @@
    "source": [
     "Note, these are 2 m strips that will take a while to download! To save on added time when rerunning this notebook, I've added an additional test to the function: if we've already downloaded the DEM and saved it to the local directory, this function will instead load it from the local file location using the `load.from_fpath()` function.\n",
     "\n",
+    "> ⚠️\n",
+    "> **NOTE**: More advanced geospatial Python users may wish to note that DEMs can be loaded as `dask` arrays, enabling lazy evaluation and only triggering download when required by a further command. This is done by providing a `chunks` parameter to `load.from_search()`, `load.from_id()`, `load.from_fpath()`, or `load.mosaic()` (e.g. `(50, 50)`, `True`, `\"auto\"`), as is the case for [the `rioxarray.open_rasterio()` function that they wrap](https://corteva.github.io/rioxarray/html/rioxarray.html#rioxarray-open-rasterio).\n",
+    "\n",
     "Regardless, we can now use this function to select and download relevant rows from our geodataframe using the standard Pandax indexing method (`DataFrame.iloc[[i]]`, where `i` is the desired row index):"
    ]
   },
@@ -704,7 +705,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.10"
   }
  },
  "nbformat": 4,
diff --git a/src/pdemtools/__init__.py b/src/pdemtools/__init__.py
@@ -13,6 +13,6 @@
 from . import _geomorphometry
 from . import _utils
 
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 
 __all__ = ["search", "DemAccessor"]
diff --git a/src/pdemtools/load.py b/src/pdemtools/load.py
@@ -40,6 +40,7 @@ def from_fpath(
     bounds: Optional[Union[tuple, Polygon]] = None,
     bitmask_fpath: Optional[str] = None,
     pad: Optional[bool] = False,
+    chunks: Optional[Union[int, tuple, dict]] = None,
 ) -> DataArray:
     """Given a filepath (local or an AWS link), loads the desired ArcticDEM/REMA DEM
     strip as an ``xarray`` ``DataArray``. Option to filter to bounds and bitmask, if
@@ -59,6 +60,9 @@ def from_fpath(
     :param pad: If the DEM strip is not the full extent of the given bounds,
         pad with NaNs to match the full bounds. Defaults to False.
     :type pad: bool
+    :param chunks: Chunk size for `rioxarray.open_rasterio`, triggering `dask`
+        parallelisation and lazy evaluation/loading. Defaults to `None`.
+    :type chunks: int | tuple | dict
 
     :returns: xarray DataArray of DEM strip
     :rtype: DataArray
@@ -69,7 +73,7 @@ def from_fpath(
         raise ValueError("pad must be True or False")
 
     # Open dataarray using rioxarray
-    dem = rxr.open_rasterio(dem_fpath)
+    dem = rxr.open_rasterio(dem_fpath, chunks=chunks)
 
     # Convert shapely geometry to bounds
     if isinstance(bounds, Polygon):
@@ -169,6 +173,7 @@ def from_search(
     bounds: Optional[Union[tuple, Polygon]] = None,
     bitmask: Optional[bool] = True,
     pad: Optional[bool] = False,
+    chunks: Optional[Union[int, tuple, dict]] = None,
 ):
     """Given a row from the GeoDataFrame output of ``pdemtools.search()``, loads the 2
     m DEM strip of the desired ArcticDEM/REMA DEM strip as an xarray DataArray.
@@ -190,6 +195,9 @@ def from_search(
     :param pad: If the DEM strip is not the full extent of the given bounds,
         pad with NaNs to match the full bounds. Defaults to False.
     :type pad: bool
+    :param chunks: Chunk size for `rioxarray.open_rasterio`, triggering `dask`
+        parallelisation and lazy evaluation/loading. Defaults to `None`.
+    :type chunks: int | tuple | dict
 
     :returns: xarray DataArray of DEM strip
     :rtype: DataArray
@@ -223,6 +231,7 @@ def from_search(
         bounds,
         bitmask_url,
         pad,
+        chunks,
     )
 
 
@@ -236,6 +245,7 @@ def from_id(
     version: Optional[str] = "s2s041",
     preview: Optional[bool] = False,
     pad: Optional[bool] = False,
+    chunks: Optional[Union[int, tuple, dict]] = None,
 ) -> DataArray:
     """An alternative method of loading the selected ArcticDEM/REMA strip, which
     requires only the geocell and the dem_id (e.g. ``geocell = 'n70w051'``, ``dem_id =
@@ -270,6 +280,9 @@ def from_id(
     :param pad: If the DEM strip is not the full extent of the given bounds,
         pad with NaNs to match the full bounds. Defaults to False.
     :type pad: bool
+    :param chunks: Chunk size for `rioxarray.open_rasterio`, triggering `dask`
+        parallelisation and lazy evaluation/loading. Defaults to `None`.
+    :type chunks: int | tuple | dict
 
     :return: xarray DataArray of DEM strip
     :rtype: DataArray
@@ -309,6 +322,7 @@ def from_id(
         bounds,
         bitmask_fpath,
         pad,
+        chunks,
     )
 
 
@@ -317,6 +331,7 @@ def mosaic(
     resolution: Literal["2m", "10m", "32m"],
     bounds: Union[tuple, Polygon] = None,
     version: Optional[Literal["v2.0", "v3.0", "v4.1"]] = None,
+    chunks: Optional[Union[int, tuple, dict]] = None,
 ):
     """Given a dataset, resolution, and bounding box, downloads the ArcticDEM or REMA
     mosiac from AWS.
@@ -333,6 +348,12 @@ def mosaic(
     :param bounds: Clip to bounds [xmin, ymin, xmax, ymax], in EPSG:3413 (ArcticDEM) or
         EPSG:3031 (REMA). Will accept a shapely geometry to extract bounds from.
     :type bounds: tuple | Polygon, optional
+    :param chunks: Chunk size for `rioxarray.open_rasterio`, triggering `dask`
+        parallelisation and lazy evaluation/loading. Defaults to `None`.
+    :type chunks: int | tuple | dict
+
+    :return: xarray DataArray of DEM mosaic
+    :rtype: DataArray
     """
 
     # sanity check that datset and versioning is correct versioning is valid for selected dataset
@@ -401,13 +422,14 @@ def mosaic(
     # load dem(s)
     dems = []
     for fpath in fpaths:
-        dem = rxr.open_rasterio(fpath).rio.clip_box(*bounds)
+        dem = rxr.open_rasterio(fpath, chunks=chunks).rio.clip_box(*bounds)
         dems.append(dem)
 
     if len(fpaths) == 1:
-        dem = rxr.open_rasterio(fpaths[0]).rio.clip_box(*bounds)
+        dem = rxr.open_rasterio(fpaths[0], chunks=chunks).rio.clip_box(*bounds)
 
-    # If multiple dems, merge them
+    # If multiple dems, merge them. NOTE: it is unverified whether merging
+    # preserves lazy evaluation for chunked (dask-backed) data.
     if len(dems) > 1:
         dem = merge_arrays(dems)
     else: