Merge branch 'develop' into gtc-3084_new_geostore_endpts

dmannarino · web-flow · commit cdf590060c5b · 2025-01-21T12:32:08.000-05:00
diff --git a/app/models/orm/assets.py b/app/models/orm/assets.py
@@ -4,8 +4,8 @@
 class Asset(Base):
     __tablename__ = "assets"
     asset_id = db.Column(db.UUID, primary_key=True)
-    dataset = db.Column(db.String, nullable=False)
-    version = db.Column(db.String, nullable=False)
+    dataset = db.Column(db.String, nullable=False, index=True)
+    version = db.Column(db.String, nullable=False, index=True)
     asset_type = db.Column(db.String, nullable=False)
     asset_uri = db.Column(db.String, nullable=False)
     status = db.Column(db.String, nullable=False, default="pending")
diff --git a/app/models/orm/migrations/versions/3e524ef0525f_.py b/app/models/orm/migrations/versions/3e524ef0525f_.py
@@ -0,0 +1,32 @@
+"""Add indexes on assets.dataset/version and subtitle metadata columns.
+
+Revision ID: 3e524ef0525f
+Revises: 604bf4e66c2b
+Create Date: 2024-12-18 00:43:46.681427
+"""
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "3e524ef0525f"
+down_revision = "604bf4e66c2b"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_index(op.f("ix_assets_dataset"), "assets", ["dataset"], unique=False)
+    op.create_index(op.f("ix_assets_version"), "assets", ["version"], unique=False)
+    op.add_column("dataset_metadata", sa.Column("subtitle", sa.String(), nullable=True))
+    op.add_column("version_metadata", sa.Column("subtitle", sa.String(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("version_metadata", "subtitle")
+    op.drop_column("dataset_metadata", "subtitle")
+    op.drop_index(op.f("ix_assets_version"), table_name="assets")
+    op.drop_index(op.f("ix_assets_dataset"), table_name="assets")
+    # ### end Alembic commands ###
diff --git a/app/models/orm/mixins.py b/app/models/orm/mixins.py
@@ -3,6 +3,7 @@
 
 class MetadataMixin:
     title = db.Column(db.String)
+    subtitle = db.Column(db.String)
     spatial_resolution = db.Column(db.Numeric)
     resolution_description = db.Column(db.String)
     geographic_coverage = db.Column(db.String)
diff --git a/app/models/pydantic/creation_options.py b/app/models/pydantic/creation_options.py
@@ -122,24 +122,69 @@ class RasterTileSetAssetCreationOptions(StrictBaseModel):
             "when input files are in different projections from each other."
         )
     )
-    pixel_meaning: str
+    pixel_meaning: str = Field(
+        ..., description="Description of what the pixel value in the "
+        "raster represents. This is used to clarify the meaning of the raster "
+        "and distinguish multiple raster tile sets based on the same dataset "
+        "version. The pixel_meaning string should be fairly short, use all "
+        "lower-case letters, and use underscores instead of spaces."
+    )
     data_type: DataType
-    nbits: Optional[int]
-    calc: Optional[str]
+    nbits: Optional[int] = Field(
+        None,
+        description="Advanced option that lets GDAL compress the data even "
+        "more based on the number of bits you need."
+    )
+    calc: Optional[str] = Field(
+        None,
+        description="There are two modes for this field, one for rasterizing vector "
+        "sources and one for transforming and/or combining one or more "
+        "sources that are already raster. For rasterizing vector sources, "
+        "this field should be an SQL expression that yields the desired "
+        "raster value based on the fields of your vector dataset.\n\nFor raster "
+        "sources, this should be a raster algebra expression, similar to that "
+        "provided to gdal_calc (see "
+        "https://gdal.org/en/stable/programs/gdal_calc.html), "
+        "that transforms one or more input bands into one or more output "
+        "bands. For use in this expression, each band in "
+        "the sources is assigned an alphabetic variable (A-Z, then AA-AZ, "
+        "etc.) in the order it exists in those sources, with those of the "
+        "first source first, continuing with those of the second, and so on. "
+        "So with two input sources of two bands each, they would be assigned "
+        "to variables A and B (for the first source) and C and D (for the "
+        "second source). The NumPy module is in scope, accessible as np."
+    )
     band_count: int = 1
     union_bands: bool = False
     no_data: Optional[Union[List[NoDataType], NoDataType]]
-    rasterize_method: Optional[RasterizeMethod]
+    rasterize_method: Optional[RasterizeMethod] = Field(
+        RasterizeMethod.value,
+        description="For raster sources or default assets, 'value' (the "
+        "default) means use the value from the last or only band processed, "
+        "and 'count' means count the number of bands with data values."
+    )
     resampling: ResamplingMethod = PIXETL_DEFAULT_RESAMPLING
-    order: Optional[Order]
+    order: Optional[Order] = Field(
+        None,
+        description="For vector default assets, order the features by the "
+        "calculated raster value. For 'asc', the features are ordered by "
+        "ascending calculated value so that the largest calculated value is "
+        "used in the raster when there are overlapping features. For 'desc', "
+        "the ordering is descending, so that the smallest calculated value "
+        "is used when there are overlaps."
+    )
     overwrite: bool = False
     subset: Optional[str]
     grid: Grid
     symbology: Optional[Symbology] = None
     compute_stats: bool = True
     compute_histogram: bool = False
     process_locally: bool = True
-    auxiliary_assets: Optional[List[UUID]] = None
+    auxiliary_assets: Optional[List[UUID]] = Field(
+        None,
+        description="Asset IDs of additional rasters you might want to include "
+        "in your calc expression."
+    )
     photometric: Optional[PhotometricType] = None
     num_processes: Optional[StrictInt] = None
     timeout_sec: Optional[StrictInt] = Field(
@@ -209,7 +254,15 @@ class VectorSourceCreationOptions(StrictBaseModel):
             Index(index_type=IndexType.gist.value, column_names=["geom_wm"]),
             Index(index_type=IndexType.hash.value, column_names=["gfw_geostore_id"]),
         ],
-        description="List of indices to add to table",
+        description="List of indices to add to the database table representing "
+        "the vector dataset.  Each element of the indices field contains an "
+        "index_type field (which is a string) and a column_names field (which "
+        "is a list of field names included in this index). The possibilities "
+        "for the index_type field are hash, btree, or gist. hash is efficient "
+        "for standard exact-value lookups, while btree is efficient for range "
+        "lookups. gist is used for geometry fields and can do "
+        "intersection-type lookups. See "
+        "https://www.postgresql.org/docs/current/indexes-types.html"
     )
     cluster: Optional[Index] = Field(None, description="Index to use for clustering.")
     table_schema: Optional[List[FieldType]] = Field(
@@ -331,7 +384,7 @@ class RasterTileCacheCreationOptions(TileCacheBaseModel):
         "default",
         description="Name space to use for raster tile cache. "
         "This will be part of the URI and will "
-        "allow to create multiple raster tile caches per version,",
+        "allow creation of multiple raster tile caches per version.",
     )
     symbology: Symbology = Field(..., description="Symbology to use for output tiles")
     source_asset_id: str = Field(
diff --git a/app/models/pydantic/metadata.py b/app/models/pydantic/metadata.py
@@ -3,7 +3,7 @@
 from uuid import UUID
 
 from fastapi import HTTPException
-from pydantic import Field, validator, BaseModel
+from pydantic import BaseModel, Field, validator
 from pydantic.utils import GetterDict
 
 from .base import BaseRecord, StrictBaseModel
@@ -34,6 +34,7 @@ class Config:
 
 class DatasetMetadata(CommonMetadata):
     title: Optional[str]
+    subtitle: Optional[str]
     source: Optional[str]
     license: Optional[str]
     data_language: Optional[str]
@@ -51,6 +52,7 @@ class Config:
             "examples": [
                 {
                     "title": "Deforestation alerts (GLAD-S2)",
+                    "subtitle": "Sentinel-2 based deforestation alerts",
                     "source": "Global Land Analysis and Discovery (GLAD), University of Maryland",
                     "license": "[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)",
                     "data_language": "en",
diff --git a/app/models/pydantic/versions.py b/app/models/pydantic/versions.py
@@ -18,8 +18,12 @@ class Version(BaseRecord):
     metadata: Union[VersionMetadataOut, BaseModel]
     status: VersionStatus = VersionStatus.pending
 
-    # Each element of assets is a tuple (asset_type, assert_uri, asset_id)
-    assets: List[Tuple[str, str, str]] = list()
+    assets: List[Tuple[str, str, str]] = Field(
+        list(),
+        description="List of saved (non-pending and non-failed) assets, with "
+        "elements in the form: [asset_type, asset_uri, asset_id]. The list "
+        "of assets is sorted by the creation time of each asset."
+    )
 
 
 class VersionCreateIn(StrictBaseModel):
diff --git a/app/routes/assets/asset.py b/app/routes/assets/asset.py
@@ -1,12 +1,13 @@
-"""Assets are replicas of the original source files.
+"""Assets are usually alternate representations of the base dataset
+version, sometimes combining in extra data from other datasets.
 
 Assets might be served in different formats, attribute values might be
 altered, additional attributes added, and feature resolution might have
 changed. Assets are either managed or unmanaged. Managed assets are
 created by the API and users can rely on data integrity. Unmanaged
 assets are only loosely linked to a dataset version and users must
-cannot rely on full integrity. We can only assume that unmanaged are
-based on the same version and do not know the processing history.
+not rely on full integrity. We can only assume that unmanaged assets
+are based on the same version and do not know the processing history.
 """
 
 from typing import List, Optional, Union
@@ -87,7 +88,9 @@ async def get_asset(
     *,
     asset_id: UUID = Path(...),
 ) -> AssetResponse:
-    """Get a specific asset."""
+    """Get a specific asset.  This provides information on the asset, including
+    the asset id, the asset status, the asset URI, and creation & last update
+    times."""
     try:
         row: ORMAsset = await assets.get_asset(asset_id)
     except RecordNotFoundError as e:
diff --git a/app/routes/datasets/asset.py b/app/routes/datasets/asset.py
@@ -66,8 +66,8 @@ async def get_version_assets(
         description="The number of assets per page. Default is `10`.",
     ),
 ) -> Union[PaginatedAssetsResponse, AssetsResponse]:
-    """Get all assets for a given dataset version. The list of assets
-    is sorted by the creation time of each asset.
+    """Get all assets for a given dataset version (including pending/failed assets).
+    The list of assets is sorted by the creation time of each asset.
 
     Will attempt to paginate if `page[size]` or `page[number]` is
     provided. Otherwise, it will attempt to return the entire list of
diff --git a/app/routes/datasets/dataset.py b/app/routes/datasets/dataset.py
@@ -128,7 +128,11 @@ async def update_dataset(
     request: DatasetUpdateIn,
     user: User = Depends(get_owner),
 ) -> DatasetResponse:
-    """Update metadata, accessibility or ownership of a dataset."""
+    """Update metadata, accessibility or ownership of a dataset.
+
+    Individual fields of the metadata can be modified, without affecting other
+    existing fields.
+    """
     input_data: Dict = request.dict(exclude_none=True, by_alias=True)
 
     if request.owner_id is not None:
diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py
@@ -81,7 +81,8 @@
 async def get_version(
     *, dv: Tuple[str, str] = Depends(dataset_version_dependency)
 ) -> VersionResponse:
-    """Get basic metadata for a given version. The list of assets is sorted by
+    """Get basic metadata for a given version. The list of assets only includes
+    saved (non-pending and non-failed) assets and is sorted by
     the creation time of each asset."""
 
     dataset, version = dv
@@ -106,8 +107,8 @@ async def add_new_version(
     user: User = Depends(get_owner),
     response: Response,
 ):
-    """Create a version for a given dataset by uploading the geospatial/tabular
-    asset.
+    """Create a version for a given dataset by uploading the tabular, vector,
+    or raster asset.
 
     Only the dataset's owner or a user with `ADMIN` user role can do
     this operation.
@@ -373,6 +374,14 @@ async def get_stats(dv: Tuple[str, str] = Depends(dataset_version_dependency)):
     response_model=Union[FieldsMetadataResponse, RasterBandsMetadataResponse],
 )
 async def get_fields(dv: Tuple[str, str] = Depends(dataset_version_dependency)):
+    """Get the fields of a version.  For a version with a vector default asset,
+    these are the fields (attributes) of the features of the base vector dataset.
+
+    For a version with a raster default asset, the fields are all the raster
+    tile sets that use the same grid as the raster default asset.  Also
+    included are some fields with special meaning such as 'area__ha',
+    'latitude', and 'longitude'.
+    """
     dataset, version = dv
     orm_asset: ORMAsset = await assets.get_default_asset(dataset, version)
 
diff --git a/batch/pixetl.dockerfile b/batch/pixetl.dockerfile
@@ -1,4 +1,4 @@
-FROM globalforestwatch/pixetl:v1.7.7_test_parallel
+FROM globalforestwatch/pixetl:v1.7.7
 
 # Copy scripts
 COPY ./batch/scripts/ /opt/scripts/