1
- """S3 utilities for GeoZarr conversion."""
1
+ """S3 utilities for GeoZarr conversion.
2
+
3
+ Note: Optional dependencies may lack type stubs; suppress their missing-import
4
+ noise locally to keep global mypy strictness intact.
5
+ """
6
+
7
+ # mypy: disable-error-code=import-not-found
2
8
3
9
import json
4
10
import os
5
- from typing import Any , Dict , Optional
11
+ from typing import Any , Dict , Literal , Optional
6
12
from urllib .parse import urlparse
7
13
8
14
import s3fs
@@ -108,9 +114,7 @@ def get_s3_storage_options(s3_path: str, **s3_kwargs: Any) -> Dict[str, Any]:
108
114
default_s3_kwargs = {
109
115
"anon" : False , # Use credentials
110
116
"use_ssl" : True ,
111
- "client_kwargs" : {
112
- "region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )
113
- },
117
+ "client_kwargs" : {"region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )},
114
118
}
115
119
116
120
# Add custom endpoint support (e.g., for OVH Cloud)
@@ -147,6 +151,36 @@ def get_storage_options(path: str, **kwargs: Any) -> Optional[Dict[str, Any]]:
147
151
"""
148
152
if is_s3_path (path ):
149
153
return get_s3_storage_options (path , ** kwargs )
154
+ # For HTTP(S) paths, ensure servers don't apply content-encoding (e.g., gzip)
155
+ # to chunk responses which would corrupt codec bytes (e.g., Blosc) and
156
+ # trigger decompression errors. Force identity encoding and set a sane
157
+ # default block size (0: stream raw responses instead of block-buffered range reads).
158
+ if path .startswith (("http://" , "https://" )):
159
+ headers = {"Accept-Encoding" : "identity" }
160
+ # Merge user headers if provided
161
+ user_headers = kwargs .get ("headers" )
162
+ if isinstance (user_headers , dict ):
163
+ headers .update (user_headers )
164
+ http_opts : Dict [str , Any ] = {
165
+ "headers" : headers ,
166
+ "block_size" : kwargs .get ("block_size" , 0 ),
167
+ "simple_links" : kwargs .get ("simple_links" , True ),
168
+ }
169
+ # Add conservative aiohttp client settings to mitigate disconnects
170
+ try :
171
+ import aiohttp
172
+
173
+ timeout = kwargs .get ("timeout" ) or aiohttp .ClientTimeout (total = 120 )
174
+ connector = kwargs .get ("connector" ) or aiohttp .TCPConnector (limit = 8 )
175
+ client_kwargs = kwargs .get ("client_kwargs" , {}) or {}
176
+ if not isinstance (client_kwargs , dict ):
177
+ client_kwargs = {}
178
+ client_kwargs .setdefault ("timeout" , timeout )
179
+ client_kwargs .setdefault ("connector" , connector )
180
+ http_opts ["client_kwargs" ] = client_kwargs
181
+ except Exception :
182
+ pass
183
+ return http_opts
150
184
# For local paths, return None (no storage options needed)
151
185
# Future protocols (gcs://, azure://, etc.) can be added here
152
186
return None
@@ -201,9 +235,7 @@ def create_s3_store(s3_path: str, **s3_kwargs: Any) -> str:
201
235
return s3_path
202
236
203
237
204
- def write_s3_json_metadata (
205
- s3_path : str , metadata : Dict [str , Any ], ** s3_kwargs : Any
206
- ) -> None :
238
+ def write_s3_json_metadata (s3_path : str , metadata : Dict [str , Any ], ** s3_kwargs : Any ) -> None :
207
239
"""
208
240
Write JSON metadata directly to S3.
209
241
@@ -224,9 +256,7 @@ def write_s3_json_metadata(
224
256
"anon" : False ,
225
257
"use_ssl" : True ,
226
258
"asynchronous" : False , # Force synchronous mode
227
- "client_kwargs" : {
228
- "region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )
229
- },
259
+ "client_kwargs" : {"region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )},
230
260
}
231
261
232
262
# Add custom endpoint support (e.g., for OVH Cloud)
@@ -266,9 +296,7 @@ def read_s3_json_metadata(s3_path: str, **s3_kwargs: Any) -> Dict[str, Any]:
266
296
"anon" : False ,
267
297
"use_ssl" : True ,
268
298
"asynchronous" : False , # Force synchronous mode
269
- "client_kwargs" : {
270
- "region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )
271
- },
299
+ "client_kwargs" : {"region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )},
272
300
}
273
301
274
302
# Add custom endpoint support (e.g., for OVH Cloud)
@@ -308,9 +336,7 @@ def s3_path_exists(s3_path: str, **s3_kwargs: Any) -> bool:
308
336
"anon" : False ,
309
337
"use_ssl" : True ,
310
338
"asynchronous" : False , # Force synchronous mode
311
- "client_kwargs" : {
312
- "region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )
313
- },
339
+ "client_kwargs" : {"region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )},
314
340
}
315
341
316
342
# Add custom endpoint support (e.g., for OVH Cloud)
@@ -327,7 +353,9 @@ def s3_path_exists(s3_path: str, **s3_kwargs: Any) -> bool:
327
353
return result
328
354
329
355
330
- def open_s3_zarr_group (s3_path : str , mode : str = "r" , ** s3_kwargs : Any ) -> zarr .Group :
356
+ def open_s3_zarr_group (
357
+ s3_path : str , mode : Literal ["r" , "r+" , "w" , "a" , "w-" ] = "r" , ** s3_kwargs : Any
358
+ ) -> zarr .Group :
331
359
"""
332
360
Open a Zarr group from S3 using storage_options.
333
361
@@ -346,9 +374,7 @@ def open_s3_zarr_group(s3_path: str, mode: str = "r", **s3_kwargs: Any) -> zarr.
346
374
Zarr group
347
375
"""
348
376
storage_options = get_s3_storage_options (s3_path , ** s3_kwargs )
349
- return zarr .open_group (
350
- s3_path , mode = mode , zarr_format = 3 , storage_options = storage_options
351
- )
377
+ return zarr .open_group (s3_path , mode = mode , zarr_format = 3 , storage_options = storage_options )
352
378
353
379
354
380
def get_s3_credentials_info () -> Dict [str , Optional [str ]]:
@@ -362,9 +388,7 @@ def get_s3_credentials_info() -> Dict[str, Optional[str]]:
362
388
"""
363
389
return {
364
390
"aws_access_key_id" : os .environ .get ("AWS_ACCESS_KEY_ID" ),
365
- "aws_secret_access_key" : "***"
366
- if os .environ .get ("AWS_SECRET_ACCESS_KEY" )
367
- else None ,
391
+ "aws_secret_access_key" : "***" if os .environ .get ("AWS_SECRET_ACCESS_KEY" ) else None ,
368
392
"aws_session_token" : "***" if os .environ .get ("AWS_SESSION_TOKEN" ) else None ,
369
393
"aws_default_region" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" ),
370
394
"aws_profile" : os .environ .get ("AWS_PROFILE" ),
@@ -395,9 +419,7 @@ def validate_s3_access(s3_path: str, **s3_kwargs: Any) -> tuple[bool, Optional[s
395
419
"anon" : False ,
396
420
"use_ssl" : True ,
397
421
"asynchronous" : False , # Force synchronous mode
398
- "client_kwargs" : {
399
- "region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )
400
- },
422
+ "client_kwargs" : {"region_name" : os .environ .get ("AWS_DEFAULT_REGION" , "us-east-1" )},
401
423
}
402
424
403
425
# Add custom endpoint support (e.g., for OVH Cloud)
@@ -441,9 +463,34 @@ def get_filesystem(path: str, **kwargs: Any) -> Any:
441
463
# Get S3 storage options and use them for fsspec
442
464
storage_options = get_s3_storage_options (path , ** kwargs )
443
465
return fsspec .filesystem ("s3" , ** storage_options )
444
- else :
445
- # For local paths, use the local filesystem
446
- return fsspec .filesystem ("file" )
466
+ if path .startswith (("http://" , "https://" )):
467
+ # Ensure identity encoding for raw chunk bytes over HTTP(S)
468
+ headers = {"Accept-Encoding" : "identity" }
469
+ user_headers = kwargs .get ("headers" )
470
+ if isinstance (user_headers , dict ):
471
+ headers .update (user_headers )
472
+ http_opts : Dict [str , Any ] = {
473
+ "headers" : headers ,
474
+ "block_size" : kwargs .get ("block_size" , 0 ),
475
+ "simple_links" : kwargs .get ("simple_links" , True ),
476
+ }
477
+ # Add conservative aiohttp client settings to mitigate disconnects
478
+ try :
479
+ import aiohttp
480
+
481
+ timeout = kwargs .get ("timeout" ) or aiohttp .ClientTimeout (total = 120 )
482
+ connector = kwargs .get ("connector" ) or aiohttp .TCPConnector (limit = 8 )
483
+ client_kwargs = kwargs .get ("client_kwargs" , {}) or {}
484
+ if not isinstance (client_kwargs , dict ):
485
+ client_kwargs = {}
486
+ client_kwargs .setdefault ("timeout" , timeout )
487
+ client_kwargs .setdefault ("connector" , connector )
488
+ http_opts ["client_kwargs" ] = client_kwargs
489
+ except Exception :
490
+ pass
491
+ return fsspec .filesystem ("http" , ** http_opts )
492
+ # For local paths, use the local filesystem
493
+ return fsspec .filesystem ("file" )
447
494
448
495
449
496
def write_json_metadata (path : str , metadata : Dict [str , Any ], ** kwargs : Any ) -> None :
@@ -519,7 +566,9 @@ def path_exists(path: str, **kwargs: Any) -> bool:
519
566
return result
520
567
521
568
522
- def open_zarr_group (path : str , mode : str = "r" , ** kwargs : Any ) -> zarr .Group :
569
+ def open_zarr_group (
570
+ path : str , mode : Literal ["r" , "r+" , "w" , "a" , "w-" ] = "r" , ** kwargs : Any
571
+ ) -> zarr .Group :
523
572
"""
524
573
Open a Zarr group from any path type using unified storage options.
525
574
@@ -538,6 +587,4 @@ def open_zarr_group(path: str, mode: str = "r", **kwargs: Any) -> zarr.Group:
538
587
Zarr group
539
588
"""
540
589
storage_options = get_storage_options (path , ** kwargs )
541
- return zarr .open_group (
542
- path , mode = mode , zarr_format = 3 , storage_options = storage_options
543
- )
590
+ return zarr .open_group (path , mode = mode , zarr_format = 3 , storage_options = storage_options )
0 commit comments