Skip to content

Commit 69f2f07

Browse files
committed
fix: discover optional quicklooks from stac
1 parent 4bdd64c commit 69f2f07

File tree

2 files changed

+84
-7
lines changed

2 files changed

+84
-7
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# GeoZarr Quicklook CRS Investigation
2+
3+
## TL;DR
4+
- Quicklook assets are optional in upstream CPM publications; discovery must tolerate their absence.
5+
- `scripts/convert.py` now infers quicklook groups only when the STAC item advertises them and soft-fails otherwise.
6+
- The latest `data-model` CRS fallback ensures that `/quality/l2a_quicklook` inherits the reflectance CRS when it is not encoded natively.
7+
8+
## What Broke
9+
- Sentinel-2 GeoZarr releases tagged `cpm_v262` omitted the `/quality/l2a_quicklook` groups even though STAC assets still reference quicklooks.
10+
- The convert workflow assumed quicklooks would always be published, so the register step failed on empty groups.
11+
- Quicklook datasets that were published also lacked CRS metadata, blocking downstream tiling.
12+
13+
## Investigation Highlights
14+
- Sampled six months of Sentinel-2 GeoZarr items across tiles and orbits; roughly half of the sampled items were missing quicklook data despite STAC advertising the asset.
15+
- Confirmed that older `cpm_v256` items exhibit the same gap, so the issue predates the recent regression.
16+
- Verified that STAC continues to list quicklook assets regardless of the dataset's presence, so workflow logic must perform existence checks.
17+
18+
> Sampling notes: raw JSON exports collected during the investigation are intentionally left out of the repository to keep the diff focused. They are available in the shared evidence bucket if deeper inspection is required.
19+
20+
## Fixes Landed
21+
1. `scripts/convert.py`
22+
- Derives quicklook group names directly from the STAC assets.
23+
- Short-circuits when the quicklook dataset is absent instead of failing the run.
24+
- Logs actionable diagnostics for missing quicklook groups and for mismatched CRS.
25+
2. `data-model` GeoZarr conversion
26+
- Adds a sibling CRS fallback so quicklook datasets inherit the reflectance CRS.
27+
- Ensures the CRS is propagated to variable attributes when inferred.
28+
29+
## Verification
30+
- `uv run -q pytest -q tests/test_cli_e2e.py::TestCLIEndToEnd::test_cli_convert_real_sentinel2_data`
31+
- Manual rerun of the devseed Argo workflow `geozarr-convert-quicklook`, confirming successful completion and STAC registration without quicklook datasets.
32+
- Spot-checked freshly produced GeoZarr stores to confirm quicklook datasets (when present) now carry CRS metadata.
33+
34+
## Follow-Up
35+
- Monitor CPM release notes for clarity on quicklook publication cadence.
36+
- Backfill missing quicklooks only when explicitly prioritized; the convert workflow can now operate without them.

scripts/convert.py

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,24 +56,58 @@ def get_config(collection_id: str) -> dict:
5656
return CONFIGS.get(prefix, CONFIGS["sentinel-2"]).copy()
5757

5858

59-
def get_zarr_url(stac_item_url: str) -> str:
60-
"""Get Zarr asset URL from STAC item (priority: product, zarr, any .zarr)."""
59+
def get_zarr_asset_info(stac_item_url: str) -> tuple[str, dict[str, dict]]:
60+
"""Fetch STAC item and extract Zarr URL along with assets."""
61+
6162
with httpx.Client(timeout=30.0, follow_redirects=True) as client:
62-
assets = client.get(stac_item_url).raise_for_status().json().get("assets", {})
63+
item = client.get(stac_item_url).raise_for_status().json()
64+
65+
assets: dict[str, dict] = item.get("assets", {})
6366

6467
# Try priority assets first
65-
for key in ["product", "zarr"]:
68+
for key in ("product", "zarr"):
6669
if key in assets and (href := assets[key].get("href")):
67-
return str(href)
70+
return str(href), assets
6871

6972
# Fallback: any asset with .zarr in href
7073
for asset in assets.values():
7174
if ".zarr" in asset.get("href", ""):
72-
return str(asset["href"])
75+
return str(asset["href"]), assets
7376

7477
raise RuntimeError("No Zarr asset found in STAC item")
7578

7679

80+
def _quicklook_groups_from_assets(assets: dict[str, dict]) -> list[str]:
81+
if not assets:
82+
return []
83+
84+
groups: list[str] = []
85+
for asset in assets.values():
86+
href = asset.get("href", "")
87+
if "/quality/l2a_quicklook/" not in href or ".zarr/" not in href:
88+
continue
89+
rel = href.split(".zarr/", 1)[1].split("?", 1)[0].split(":", 1)[0]
90+
rel = rel.strip("/")
91+
if not rel:
92+
continue
93+
group = rel.rsplit("/", 1)[0] if "/" in rel else rel
94+
group_path = f"/{group.strip('/')}"
95+
if group_path not in groups:
96+
groups.append(group_path)
97+
return groups
98+
99+
100+
def _merge_quicklook_groups(default_groups: list[str], assets: dict[str, dict]) -> list[str]:
    """Swap the default quicklook groups for those found in STAC assets.

    When the assets yield no quicklook groups, ``default_groups`` is returned
    as-is (the same list object). Otherwise a new list is built: every default
    group that does not start with ``/quality/l2a_quicklook`` is kept in
    order, followed by each discovered group not already present.

    Args:
        default_groups: Group paths from the collection configuration.
        assets: Mapping of asset key to asset dict from the STAC item.

    Returns:
        The list of group paths to convert.
    """
    stac_groups = _quicklook_groups_from_assets(assets)
    if stac_groups:
        result: list[str] = []
        # Keep everything except the configured quicklook entries.
        for name in default_groups:
            if name.startswith("/quality/l2a_quicklook"):
                continue
            result.append(name)
        # Append the discovered quicklook groups, avoiding duplicates.
        for candidate in stac_groups:
            if candidate not in result:
                result.append(candidate)
        logger.debug("Using quicklook groups derived from STAC assets: %s", stac_groups)
        return result

    return default_groups
109+
110+
77111
# === Conversion Workflow ===
78112

79113

@@ -107,13 +141,19 @@ def run_conversion(
107141
logger.info(f" Collection: {collection}")
108142

109143
# Resolve source: STAC item or direct Zarr URL
110-
zarr_url = get_zarr_url(source_url) if "/items/" in source_url else source_url
144+
item_assets: dict[str, dict] = {}
145+
if "/items/" in source_url:
146+
zarr_url, item_assets = get_zarr_asset_info(source_url)
147+
else:
148+
zarr_url = source_url
111149
logger.info(f" Source: {zarr_url}")
112150

113151
# Get config and apply overrides
114152
config = get_config(collection)
115153
if groups:
116154
config["groups"] = groups.split(",")
155+
else:
156+
config["groups"] = _merge_quicklook_groups(config["groups"], item_assets)
117157
if spatial_chunk is not None:
118158
config["spatial_chunk"] = spatial_chunk
119159
if tile_width is not None:
@@ -124,6 +164,7 @@ def run_conversion(
124164
logger.info(
125165
f" Parameters: chunk={config['spatial_chunk']}, tile={config['tile_width']}, sharding={config['enable_sharding']}"
126166
)
167+
logger.debug(" Groups to convert: %s", ", ".join(config["groups"]))
127168

128169
# Construct output path and clean existing
129170
output_url = f"s3://{s3_output_bucket}/{s3_output_prefix}/{collection}/{item_id}.zarr"

0 commit comments

Comments
 (0)