 
 class _DataLoader:
     """Internal S3 data loader with local caching."""
-
+
     DEFAULT_BUCKET = "cache-datasets"
     DEFAULT_CACHE_DIR = Path(os.environ.get("LCS_HUB_CACHE", Path.home() / ".cache/libcachesim/hub"))
 
     # Characters that are problematic on various filesystems
     INVALID_CHARS = set('<>:"|?*\x00')
     # Reserved names on Windows
     RESERVED_NAMES = {
-        'CON', 'PRN', 'AUX', 'NUL',
-        'COM1', 'COM2', 'COM3', 'COM4', 'COM5', 'COM6', 'COM7', 'COM8', 'COM9',
-        'LPT1', 'LPT2', 'LPT3', 'LPT4', 'LPT5', 'LPT6', 'LPT7', 'LPT8', 'LPT9'
+        "CON",
+        "PRN",
+        "AUX",
+        "NUL",
+        "COM1",
+        "COM2",
+        "COM3",
+        "COM4",
+        "COM5",
+        "COM6",
+        "COM7",
+        "COM8",
+        "COM9",
+        "LPT1",
+        "LPT2",
+        "LPT3",
+        "LPT4",
+        "LPT5",
+        "LPT6",
+        "LPT7",
+        "LPT8",
+        "LPT9",
     }
 
     def __init__(
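
The constants above drive the per-component filename checks applied later to S3 keys. As a minimal standalone sketch of that behavior (the helper name is hypothetical, not part of the module):

INVALID_CHARS = set('<>:"|?*\x00')
RESERVED_NAMES = {"CON", "PRN", "AUX", "NUL"} | {f"COM{i}" for i in range(1, 10)} | {f"LPT{i}" for i in range(1, 10)}

def is_safe_component(part: str) -> bool:  # hypothetical helper, for illustration only
    # Reject Windows device names (case-insensitive) and characters
    # that some filesystem refuses in a path component.
    return part.upper() not in RESERVED_NAMES and not any(c in INVALID_CHARS for c in part)

assert is_safe_component("trace.csv")
assert not is_safe_component("com1")   # reserved device name on Windows
assert not is_safe_component("a<b")    # '<' is invalid on Windows filesystems
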
@@ -42,65 +61,65 @@ def _validate_bucket_name(self, bucket_name: str) -> str:
         """Validate S3 bucket name according to AWS rules."""
         if not bucket_name:
             raise ValueError("Bucket name cannot be empty")
-
+
         if len(bucket_name) < 3 or len(bucket_name) > 63:
             raise ValueError("Bucket name must be between 3 and 63 characters")
-
-        if not re.match(r'^[a-z0-9.-]+$', bucket_name):
+
+        if not re.match(r"^[a-z0-9.-]+$", bucket_name):
             raise ValueError("Bucket name can only contain lowercase letters, numbers, periods, and hyphens")
-
-        if bucket_name.startswith('.') or bucket_name.endswith('.'):
+
+        if bucket_name.startswith(".") or bucket_name.endswith("."):
             raise ValueError("Bucket name cannot start or end with a period")
-
-        if bucket_name.startswith('-') or bucket_name.endswith('-'):
+
+        if bucket_name.startswith("-") or bucket_name.endswith("-"):
             raise ValueError("Bucket name cannot start or end with a hyphen")
-
-        if '..' in bucket_name:
+
+        if ".." in bucket_name:
             raise ValueError("Bucket name cannot contain consecutive periods")
-
+
         return bucket_name
 
     def _validate_and_sanitize_key(self, key: str) -> str:
         """Validate and sanitize S3 key for safe local filesystem usage."""
         if not key:
             raise ValueError("S3 key cannot be empty")
-
+
         if len(key) > 1024:  # S3 limit is 1024 bytes
             raise ValueError("S3 key is too long (max 1024 characters)")
-
+
         # Check for path traversal attempts
-        if '..' in key:
+        if ".." in key:
             raise ValueError("S3 key cannot contain '..' (path traversal not allowed)")
-
-        if key.startswith('/'):
+
+        if key.startswith("/"):
             raise ValueError("S3 key cannot start with '/'")
-
+
         # Split key into parts and validate each part
-        parts = key.split('/')
+        parts = key.split("/")
         sanitized_parts = []
-
+
         for part in parts:
             if not part:  # Empty part (double slash)
                 continue
-
+
             # Check for reserved names (case insensitive)
             if part.upper() in self.RESERVED_NAMES:
                 raise ValueError(f"S3 key contains reserved name: {part}")
-
+
             # Check for invalid characters
             if any(c in self.INVALID_CHARS for c in part):
                 raise ValueError(f"S3 key contains invalid characters in part: {part}")
-
+
             # Check if part is too long for filesystem
             if len(part) > 255:  # Most filesystems have 255 char limit per component
                 raise ValueError(f"S3 key component too long: {part}")
-
+
             sanitized_parts.append(part)
-
+
         if not sanitized_parts:
             raise ValueError("S3 key resulted in empty path after sanitization")
-
-        return '/'.join(sanitized_parts)
+
+        return "/".join(sanitized_parts)
 
     def _ensure_cache_dir(self) -> None:
         (self.cache_dir / self.bucket_name).mkdir(parents=True, exist_ok=True)
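
Condensing the sanitization logic above into a self-contained sketch (sanitize_key is a hypothetical name; the reserved-name, character, and length checks are omitted here for brevity):

def sanitize_key(key: str) -> str:  # hypothetical condensed version
    if ".." in key or key.startswith("/"):
        raise ValueError("unsafe key")
    parts = [p for p in key.split("/") if p]  # drop empty parts from "//"
    if not parts:
        raise ValueError("empty key after sanitization")
    return "/".join(parts)

assert sanitize_key("a//b/c.txt") == "a/b/c.txt"  # double slash collapsed
try:
    sanitize_key("../etc/passwd")                 # traversal rejected
except ValueError:
    pass
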
@@ -114,10 +133,11 @@ def _get_available_disk_space(self, path: Path) -> int:
         # Fallback for Windows or other systems
         try:
             import shutil
+
             return shutil.disk_usage(path).free
         except Exception:
             logger.warning("Could not determine available disk space")
-            return float('inf')  # Assume unlimited space if we can't check
+            return float("inf")  # Assume unlimited space if we can't check
 
     @property
     def s3_client(self):
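
The fallback relies on shutil.disk_usage, which works on both POSIX and Windows; a self-contained probe looks like this (the path is illustrative):

import shutil
from pathlib import Path

# disk_usage returns a named tuple (total, used, free), all in bytes.
free_bytes = shutil.disk_usage(Path.home()).free
print(f"{free_bytes / 1024**3:.1f} GiB free")

One nit worth noting: the method is annotated -> int, while the last-resort branch returns float("inf"); size comparisons still behave since ints and floats compare cleanly in Python.
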
@@ -138,20 +158,20 @@ def _cache_path(self, key: str) -> Path:
         """Create cache path that mirrors S3 structure after validation."""
         sanitized_key = self._validate_and_sanitize_key(key)
         cache_path = self.cache_dir / self.bucket_name / sanitized_key
-
+
         # Double-check that the resolved path is still within cache directory
         try:
             cache_path.resolve().relative_to(self.cache_dir.resolve())
         except ValueError:
             raise ValueError(f"S3 key resolves outside cache directory: {key}")
-
+
         return cache_path
 
     def _get_object_size(self, key: str) -> int:
         """Get the size of an S3 object without downloading it."""
         try:
             response = self.s3_client.head_object(Bucket=self.bucket_name, Key=key)
-            return response['ContentLength']
+            return response["ContentLength"]
         except Exception as e:
             logger.warning(f"Could not determine object size for s3://{self.bucket_name}/{key}: {e}")
             return 0
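
The resolve()/relative_to() pair used in _cache_path is the standard pathlib idiom for proving one path stays under another; a minimal standalone version (the helper name is hypothetical):

from pathlib import Path

def is_within(base: Path, candidate: Path) -> bool:  # hypothetical helper
    # relative_to() raises ValueError when candidate falls outside base
    # once symlinks and ".." segments have been resolved.
    try:
        candidate.resolve().relative_to(base.resolve())
        return True
    except ValueError:
        return False

cache = Path("/tmp/cache")
assert is_within(cache, cache / "bucket/trace.csv")
assert not is_within(cache, cache / "../etc/passwd")
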
@@ -254,10 +274,10 @@ def list_s3_objects(self, prefix: str = "", delimiter: str = "/") -> dict:
 def set_cache_dir(cache_dir: Union[str, Path]) -> None:
     """
     Set the global cache directory for S3 downloads.
-
+
     Args:
         cache_dir: Path to the cache directory
-
+
     Example:
         >>> import libcachesim as lcs
         >>> lcs.set_cache_dir("/tmp/my_cache")
@@ -269,10 +289,10 @@ def set_cache_dir(cache_dir: Union[str, Path]) -> None:
 def get_cache_dir() -> Path:
     """
     Get the current cache directory.
-
+
     Returns:
         Path to the current cache directory
-
+
     Example:
         >>> import libcachesim as lcs
         >>> print(lcs.get_cache_dir())
@@ -284,10 +304,10 @@ def get_cache_dir() -> Path:
 def clear_cache(s3_path: Optional[str] = None) -> None:
     """
     Clear cached files.
-
+
     Args:
         s3_path: Specific S3 path to clear, or None to clear all cache
-
+
     Example:
         >>> import libcachesim as lcs
         >>> # Clear specific file
@@ -298,7 +318,7 @@ def clear_cache(s3_path: Optional[str] = None) -> None:
     if s3_path and s3_path.startswith("s3://"):
         parsed = urlparse(s3_path)
         bucket = parsed.netloc
-        key = parsed.path.lstrip('/')
+        key = parsed.path.lstrip("/")
         if bucket == _data_loader.bucket_name:
             _data_loader.clear_cache(key)
         else:
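
For reference, urlparse splits an s3:// URI exactly as this branch expects (the URI here is illustrative):

from urllib.parse import urlparse

parsed = urlparse("s3://cache-datasets/traces/example.bin")
assert parsed.netloc == "cache-datasets"                 # bucket
assert parsed.path.lstrip("/") == "traces/example.bin"   # key
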
@@ -310,10 +330,10 @@ def clear_cache(s3_path: Optional[str] = None) -> None:
 def get_cache_size() -> int:
     """
     Get total size of cached files in bytes.
-
+
     Returns:
         Total cache size in bytes
-
+
     Example:
         >>> import libcachesim as lcs
         >>> size_mb = lcs.get_cache_size() / (1024**2)
@@ -325,10 +345,10 @@ def get_cache_size() -> int:
 def list_cached_files() -> list[str]:
     """
     List all cached files.
-
+
     Returns:
         List of cached file paths
-
+
     Example:
         >>> import libcachesim as lcs
         >>> files = lcs.list_cached_files()
@@ -344,4 +364,4 @@ def get_data_loader(bucket_name: str = None) -> _DataLoader:
     if bucket_name is None or bucket_name == _data_loader.bucket_name:
         return _data_loader
     else:
-        return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent)
\ No newline at end of file
+        return _DataLoader(bucket_name=bucket_name, cache_dir=_data_loader.cache_dir.parent)