Skip to content

Commit f6d4c1b

Browse files
committed
feat: add GeoZarr validation tools
- Add scripts/validate_geozarr.py for compliance validation
- Validate CF conventions, STAC metadata, TileMatrixSet
- CLI interface with verbose and output options
1 parent 8198ca6 commit f6d4c1b

File tree

2 files changed

+678
-0
lines changed

2 files changed

+678
-0
lines changed

scripts/validate_geozarr.py

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
#!/usr/bin/env python3
2+
"""Validate GeoZarr compliance and generate quality metrics.
3+
4+
Validates:
5+
- GeoZarr spec 0.4 compliance (via eopf-geozarr CLI)
6+
- STAC item spec compliance (via pystac)
7+
- TileMatrixSet OGC compliance (via morecantile)
8+
- CF-conventions compliance (via cf-xarray)
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import argparse
14+
import json
15+
import logging
16+
import subprocess
17+
import sys
18+
from datetime import UTC, datetime
19+
from pathlib import Path
20+
21+
# Configure root logging when the module is loaded: INFO level, each message
# prefixed with its timestamp.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
22+
# Module-level logger used by the validation functions and main() below.
logger = logging.getLogger(__name__)
23+
24+
25+
def validate_geozarr(dataset_path: str, verbose: bool = False, timeout: float = 300) -> dict:
    """Run ``eopf-geozarr validate`` on a dataset and summarise the outcome.

    Args:
        dataset_path: Dataset location, passed to the CLI unchanged.
        verbose: When true, ``--verbose`` is appended to the command.
        timeout: Seconds to wait for the CLI before giving up (default 300,
            i.e. 5 minutes).

    Returns:
        When the CLI ran to completion: a dict with ``valid`` (True when the
        exit code is 0), ``exit_code``, ``stdout`` and ``stderr``.
        When the CLI is missing, timed out, or could not be run: a dict with
        ``valid=False``, ``exit_code=-1`` and an ``error`` message.
    """
    logger.info(f"Validating: {dataset_path}")

    cmd = ["eopf-geozarr", "validate", dataset_path]
    if verbose:
        cmd.append("--verbose")

    try:
        # check=False: a non-zero exit code is a validation verdict that is
        # reported in the result, not an exception.
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired:
        logger.error(f"❌ Validation timeout (>{timeout:g} seconds)")
        return {
            "valid": False,
            "exit_code": -1,
            "error": "Validation timeout",
        }
    except FileNotFoundError as e:
        # The command is an argument list with no cwd override, so this means
        # the eopf-geozarr executable itself could not be found.
        logger.error(f"❌ Validation error: eopf-geozarr executable not found ({e})")
        return {
            "valid": False,
            "exit_code": -1,
            "error": f"eopf-geozarr executable not found: {e}",
        }
    except Exception as e:
        # Boundary handler: any other launch failure becomes a failed check
        # instead of aborting the remaining validations.
        logger.error(f"❌ Validation error: {e}")
        return {
            "valid": False,
            "exit_code": -1,
            "error": str(e),
        }

    if result.returncode == 0:
        logger.info("✅ Validation passed")
    else:
        logger.error(f"❌ Validation failed (exit code {result.returncode})")
        if result.stderr:
            logger.error(f"Errors:\n{result.stderr}")

    return {
        "valid": result.returncode == 0,
        "exit_code": result.returncode,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }
75+
76+
77+
def validate_stac_item(item_path: str | Path) -> dict:
    """Check a STAC item JSON file with pystac.

    Args:
        item_path: Location of the STAC item JSON document.

    Returns:
        ``{"valid": True, "item_id": ..., "collection": ...}`` when the item
        loads and validates, otherwise ``{"valid": False, "error": <message>}``.
    """
    try:
        # Imported inside the try so that a missing pystac installation is
        # reported as a failed check rather than raising at import time.
        import pystac

        logger.info(f"Validating STAC item: {item_path}")
        stac_item = pystac.Item.from_file(str(item_path))
        stac_item.validate()

        logger.info("✅ STAC item valid")
        outcome = {
            "valid": True,
            "item_id": stac_item.id,
            "collection": stac_item.collection_id,
        }
        return outcome

    except Exception as exc:
        failure = {"valid": False, "error": str(exc)}
        logger.error(f"❌ STAC validation failed: {exc}")
        return failure
99+
100+
101+
def validate_tile_matrix_set(zarr_path: str) -> dict:
    """Check the ``tile_matrix_set`` attribute of a GeoZarr store with morecantile.

    Args:
        zarr_path: Location of the GeoZarr dataset to open read-only.

    Returns:
        ``{"valid": True, "tms_id": ..., "crs": ..., "num_levels": ...}`` when
        the attribute can be loaded into a morecantile ``TileMatrixSet``;
        ``{"valid": False, "error": <message>}`` when the attribute is absent
        or anything raises along the way.
    """
    try:
        import zarr
        from morecantile import TileMatrixSet

        logger.info("Validating TileMatrixSet...")
        root_attrs = zarr.open(zarr_path, mode="r").attrs.asdict()

        if "tile_matrix_set" in root_attrs:
            # Building the model is the whole check: the code relies on
            # morecantile validating its input on instantiation.
            matrix_set = TileMatrixSet(**root_attrs["tile_matrix_set"])

            logger.info("✅ TileMatrixSet valid")
            summary = {
                "valid": True,
                "tms_id": matrix_set.id,
                "crs": str(matrix_set.crs),
                "num_levels": len(matrix_set.tileMatrices),
            }
            return summary

        logger.warning("⚠️  No tile_matrix_set found in attributes")
        return {"valid": False, "error": "Missing tile_matrix_set attribute"}

    except Exception as exc:
        failure = {"valid": False, "error": str(exc)}
        logger.error(f"❌ TMS validation failed: {exc}")
        return failure
137+
138+
139+
def validate_cf_conventions(zarr_path: str) -> dict:
    """Check a GeoZarr dataset for CF-conventions compliance.

    Opens the store with xarray, runs the cf-xarray decode step, then looks
    for data variables that carry neither ``standard_name`` nor ``long_name``.
    Missing names are reported as warnings only; they do not fail the check.

    Args:
        zarr_path: Location of the GeoZarr dataset.

    Returns:
        ``{"valid": True}`` when nothing was flagged,
        ``{"valid": True, "warnings": [...]}`` when variables lack a name
        attribute, or ``{"valid": False, "error": <message>}`` when opening or
        decoding the dataset raises.
    """
    max_logged = 5  # cap on per-variable warnings echoed to the log

    try:
        import cf_xarray  # noqa: F401  (imported for the ``.cf`` accessor used below)
        import xarray as xr

        logger.info("Validating CF-conventions...")

        # The context manager closes the dataset even when a check below raises;
        # previously the opened store was never released.
        with xr.open_zarr(zarr_path, consolidated=False) as ds:
            # Attempt CF decoding (expected to raise if non-compliant).
            # NOTE(review): confirm that the installed cf-xarray exposes
            # ``.cf.decode()``; if it does not, the resulting AttributeError is
            # caught below and every dataset is reported as non-compliant.
            ds.cf.decode()

            # Every data variable should describe itself with at least one of
            # standard_name / long_name.
            issues = [
                f"Variable {name} missing standard_name/long_name"
                for name, var in ds.data_vars.items()
                if "standard_name" not in var.attrs and "long_name" not in var.attrs
            ]

        if issues:
            logger.warning(f"⚠️  CF compliance warnings: {len(issues)}")
            for issue in issues[:max_logged]:
                logger.warning(f"  - {issue}")
            if len(issues) > max_logged:
                # Make the truncation visible; the full list is in the result.
                logger.warning(f"  ... and {len(issues) - max_logged} more")
            return {"valid": True, "warnings": issues}

        logger.info("✅ CF-conventions valid")
        return {"valid": True}

    except Exception as e:
        # Boundary handler: missing libraries, unreadable stores and decode
        # errors all become a failed check instead of aborting the script.
        logger.error(f"❌ CF validation failed: {e}")
        return {"valid": False, "error": str(e)}
177+
178+
179+
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the validation script."""
    parser = argparse.ArgumentParser(description="Validate GeoZarr compliance")
    parser.add_argument("dataset_path", help="Path to GeoZarr dataset (S3 or local)")
    parser.add_argument("--item-id", help="STAC item ID for tracking")
    parser.add_argument("--stac-item", help="Path to STAC item JSON for validation")
    parser.add_argument("--output", help="Output JSON file path")
    parser.add_argument("--skip-cf", action="store_true", help="Skip CF-conventions check")
    parser.add_argument("--skip-tms", action="store_true", help="Skip TileMatrixSet check")
    parser.add_argument("--verbose", action="store_true", help="Verbose validation output")
    return parser


def _log_summary(args: argparse.Namespace, validations: dict, all_valid: bool) -> None:
    """Log a human-readable pass/fail line for every check that was run."""
    logger.info("\n" + "=" * 60)
    logger.info(f"Dataset: {args.dataset_path}")
    logger.info(f"Overall Valid: {all_valid}")
    for check_name, check_result in validations.items():
        status = "✅" if check_result.get("valid") else "❌"
        logger.info(f"  {status} {check_name}: {check_result.get('valid')}")
    if args.item_id:
        logger.info(f"Item ID: {args.item_id}")
    logger.info("=" * 60 + "\n")


def main(argv: list[str] | None = None) -> None:
    """Run the requested validations, report the result and exit.

    The combined result is optionally written to ``--output``, summarised in
    the log, and printed to stdout as JSON.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv``.

    Raises:
        SystemExit: Always; exit code 0 when every executed check reports
            ``valid``, otherwise 1.
    """
    args = _build_parser().parse_args(argv)

    # Checks run in a fixed order; the optional ones are skipped on request.
    validations: dict[str, dict] = {}

    # 1. GeoZarr spec compliance (via eopf-geozarr CLI)
    validations["geozarr"] = validate_geozarr(args.dataset_path, args.verbose)

    # 2. STAC item validation (only if a file was provided)
    if args.stac_item:
        validations["stac_item"] = validate_stac_item(args.stac_item)

    # 3. TileMatrixSet validation
    if not args.skip_tms:
        validations["tile_matrix_set"] = validate_tile_matrix_set(args.dataset_path)

    # 4. CF-conventions validation
    if not args.skip_cf:
        validations["cf_conventions"] = validate_cf_conventions(args.dataset_path)

    # A check without a "valid" key counts as failed.
    all_valid = all(v.get("valid", False) for v in validations.values())

    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "dataset_path": args.dataset_path,
        "item_id": args.item_id,
        "valid": all_valid,
        "validations": validations,
    }
    # Serialise once; the same text goes to the file and to stdout.
    payload = json.dumps(result, indent=2)

    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the file independent of the platform default.
        output_path.write_text(payload, encoding="utf-8")
        logger.info(f"Results written to: {output_path}")

    _log_summary(args, validations, all_valid)

    # Output JSON for workflow
    print(payload)

    # Exit with validation status
    sys.exit(0 if all_valid else 1)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)