|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Automated GeoZarr vs EOPF performance comparison. |
| 3 | +
|
| 4 | +Measures dataset open time and reports dataset size, comparing the original EOPF Zarr format |
| 5 | +against the optimized GeoZarr format. |
| 6 | +
|
| 7 | +Usage: |
| 8 | + benchmark_geozarr.py --eopf-url s3://... --geozarr-url s3://... --output results.json |
| 9 | +""" |
| 10 | + |
| 11 | +import argparse |
| 12 | +import json |
| 13 | +import logging |
| 14 | +import sys |
| 15 | +import time |
| 16 | +from dataclasses import asdict, dataclass |
| 17 | +from pathlib import Path |
| 18 | + |
| 19 | +import xarray as xr |
| 20 | + |
# Configure the root logger at import time so INFO-level messages from this
# script are emitted; main() raises the level to DEBUG when --verbose is given.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
| 23 | + |
| 24 | + |
| 25 | +@dataclass |
| 26 | +class BenchmarkResult: |
| 27 | + """Performance measurement result.""" |
| 28 | + |
| 29 | + format_type: str # "eopf" or "geozarr" |
| 30 | + dataset_url: str |
| 31 | + load_time_seconds: float |
| 32 | + dataset_size_mb: float |
| 33 | + num_variables: int |
| 34 | + chunk_sizes: dict[str, tuple[int, ...]] |
| 35 | + |
| 36 | + |
def benchmark_load_time(dataset_url: str, format_type: str) -> BenchmarkResult:
    """Open a Zarr store and record how long the open took plus basic metrics.

    Only the ``xr.open_zarr`` call is timed.
    NOTE(review): ``open_zarr`` is documented as lazy, so this presumably
    measures metadata access rather than reading array data -- confirm that
    this is the intended benchmark.

    Args:
        dataset_url: Location of the Zarr store, passed to ``xr.open_zarr``.
        format_type: Label stored in the result ("eopf" or "geozarr").

    Returns:
        A ``BenchmarkResult`` holding the open time rounded to milliseconds,
        the summed size of all data variables in MiB, the number of data
        variables, and the chunk layout of up to three of them.
    """
    logger.info(f"Benchmarking {format_type}: {dataset_url}")

    start = time.perf_counter()
    ds = xr.open_zarr(dataset_url, consolidated=True)
    load_time = time.perf_counter() - start

    # Close the dataset even if collecting the metrics below raises, so a
    # failed run does not leak the underlying store handle.
    try:
        # Sample only the first three variables to keep the report small.
        chunks = {var: ds[var].chunks for var in list(ds.data_vars)[:3]}
        size_mb = sum(var.nbytes for var in ds.data_vars.values()) / 1024 / 1024

        result = BenchmarkResult(
            format_type=format_type,
            dataset_url=dataset_url,
            load_time_seconds=round(load_time, 3),
            dataset_size_mb=round(size_mb, 2),
            num_variables=len(ds.data_vars),
            chunk_sizes=chunks,
        )
    finally:
        ds.close()

    logger.info(f"✓ {format_type} load time: {load_time:.3f}s")
    return result
| 61 | + |
| 62 | + |
def compare_results(eopf: BenchmarkResult, geozarr: BenchmarkResult) -> dict:
    """Build a JSON-ready summary comparing the two benchmark results.

    Args:
        eopf: Result measured for the EOPF dataset.
        geozarr: Result measured for the GeoZarr dataset.

    Returns:
        A dict with the two results as plain dicts under "eopf" and
        "geozarr", and a "comparison" dict containing:

        * "speedup_factor": EOPF time divided by GeoZarr time, rounded to two
          decimals; 0 when the GeoZarr time is 0 and the ratio is undefined.
        * "time_saved_seconds": EOPF time minus GeoZarr time, rounded to ms.
        * "faster_format": "geozarr" when GeoZarr loaded strictly faster,
          otherwise "eopf" (ties are reported as "eopf").
    """
    eopf_seconds = eopf.load_time_seconds
    geozarr_seconds = geozarr.load_time_seconds

    # 0 is a sentinel for "ratio undefined", not a real measurement.
    speedup = eopf_seconds / geozarr_seconds if geozarr_seconds > 0 else 0

    # Decide the winner from the raw times: deriving it from the speedup would
    # report "eopf" whenever the sentinel 0 is in play, even though a GeoZarr
    # load time of 0 means GeoZarr was the faster one.
    faster_format = "geozarr" if geozarr_seconds < eopf_seconds else "eopf"

    return {
        "eopf": asdict(eopf),
        "geozarr": asdict(geozarr),
        "comparison": {
            "speedup_factor": round(speedup, 2),
            "time_saved_seconds": round(eopf_seconds - geozarr_seconds, 3),
            "faster_format": faster_format,
        },
    }
| 78 | + |
| 79 | + |
| 80 | +def main(argv: list[str] | None = None) -> int: |
| 81 | + parser = argparse.ArgumentParser(description="Benchmark GeoZarr vs EOPF performance") |
| 82 | + parser.add_argument("--eopf-url", required=True, help="URL to EOPF Zarr dataset") |
| 83 | + parser.add_argument("--geozarr-url", required=True, help="URL to GeoZarr dataset") |
| 84 | + parser.add_argument("--output", type=Path, help="Output JSON file path") |
| 85 | + parser.add_argument("--verbose", action="store_true") |
| 86 | + |
| 87 | + args = parser.parse_args(argv) |
| 88 | + |
| 89 | + if args.verbose: |
| 90 | + logging.getLogger().setLevel(logging.DEBUG) |
| 91 | + |
| 92 | + try: |
| 93 | + # Run benchmarks |
| 94 | + eopf_result = benchmark_load_time(args.eopf_url, "eopf") |
| 95 | + geozarr_result = benchmark_load_time(args.geozarr_url, "geozarr") |
| 96 | + |
| 97 | + # Generate comparison |
| 98 | + results = compare_results(eopf_result, geozarr_result) |
| 99 | + |
| 100 | + # Write output |
| 101 | + if args.output: |
| 102 | + args.output.parent.mkdir(parents=True, exist_ok=True) |
| 103 | + args.output.write_text(json.dumps(results, indent=2)) |
| 104 | + logger.info(f"Results written to: {args.output}") |
| 105 | + |
| 106 | + # Print summary |
| 107 | + print(json.dumps(results, indent=2)) |
| 108 | + |
| 109 | + speedup = results["comparison"]["speedup_factor"] |
| 110 | + if speedup > 1: |
| 111 | + logger.info(f"✅ GeoZarr is {speedup}x faster than EOPF") |
| 112 | + else: |
| 113 | + logger.warning(f"⚠️ EOPF is {1 / speedup:.2f}x faster than GeoZarr") |
| 114 | + |
| 115 | + return 0 |
| 116 | + |
| 117 | + except Exception as e: |
| 118 | + logger.error(f"Benchmark failed: {e}") |
| 119 | + return 1 |
| 120 | + |
| 121 | + |
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments