@@ -56,8 +56,13 @@ class BaseCache:
5656
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
    """Record the cache configuration and start with zeroed statistics.

    Parameters
    ----------
    blocksize: int
        Stored unchanged as ``self.blocksize``.
    fetcher: Fetcher
        Callable stored as ``self.fetcher``; it is invoked as
        ``fetcher(start, stop)`` elsewhere in this class.
    size: int
        Stored unchanged as ``self.size``.
    """
    # configuration handed in by the caller; the block count starts at zero
    self.blocksize, self.nblocks = blocksize, 0
    self.fetcher, self.size = fetcher, size
    # usage counters; total_requested_bytes tracks the bytes that we
    # actually requested
    self.hit_count = self.miss_count = self.total_requested_bytes = 0
6166
6267 def _fetch (self , start : int | None , stop : int | None ) -> bytes :
6368 if start is None :
@@ -68,6 +73,36 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
6873 return b""
6974 return self .fetcher (start , stop )
7075
def _reset_stats(self) -> None:
    """Zero the hit, miss and requested-byte counters.

    Resetting between uses allows a more granular report, e.g. by file.
    """
    self.hit_count = self.miss_count = self.total_requested_bytes = 0
81+
def _log_stats(self) -> str:
    """Return the cache statistics formatted as a log-message fragment.

    Returns
    -------
    str
        An empty string when both the hit and the miss counter are zero,
        otherwise the cache name followed by the hit, miss and
        requested-byte counters.
    """
    idle = self.hit_count == 0 and self.miss_count == 0
    if not idle:
        template = " , %s: %d hits, %d misses, %d total requested bytes"
        fields = (
            self.name,
            self.hit_count,
            self.miss_count,
            self.total_requested_bytes,
        )
        return template % fields
    # a cache that does nothing, this is for logs only
    return ""
93+
94+ def __repr__ (self ) -> str :
# Multi-line, human-readable summary built from a single f-string: the
# concrete class name followed by blocksize, nblocks, size, hit_count,
# miss_count and total_requested_bytes, one labelled value per line.
# The text between the triple quotes is runtime output, so its layout
# must not be altered when editing this method.
95+ # TODO: use rich for better formatting
96+ return f"""
97+ <{ self .__class__ .__name__ } :
98+ block size : { self .blocksize }
99+ block count : { self .nblocks }
100+ file size : { self .size }
101+ cache hits : { self .hit_count }
102+ cache misses: { self .miss_count }
103+ total requested bytes: { self .total_requested_bytes } >
104+ """
105+
71106
72107class MMapCache (BaseCache ):
73108 """memory-mapped sparse file cache
@@ -126,13 +161,18 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
126161 start_block = start // self .blocksize
127162 end_block = end // self .blocksize
128163 need = [i for i in range (start_block , end_block + 1 ) if i not in self .blocks ]
164+ hits = [i for i in range (start_block , end_block + 1 ) if i in self .blocks ]
165+ self .miss_count += len (need )
166+ self .hit_count += len (hits )
129167 while need :
130168 # TODO: not a for loop so we can consolidate blocks later to
131169 # make fewer fetch calls; this could be parallel
132170 i = need .pop (0 )
171+
133172 sstart = i * self .blocksize
134173 send = min (sstart + self .blocksize , self .size )
135- logger .debug (f"MMap get block #{ i } ({ sstart } -{ send } " )
174+ self .total_requested_bytes += send - sstart
175+ logger .debug (f"MMap get block #{ i } ({ sstart } -{ send } )" )
136176 self .cache [sstart :send ] = self .fetcher (sstart , send )
137177 self .blocks .add (i )
138178
@@ -176,16 +216,20 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
176216 l = end - start
177217 if start >= self .start and end <= self .end :
178218 # cache hit
219+ self .hit_count += 1
179220 return self .cache [start - self .start : end - self .start ]
180221 elif self .start <= start < self .end :
181222 # partial hit
223+ self .miss_count += 1
182224 part = self .cache [start - self .start :]
183225 l -= len (part )
184226 start = self .end
185227 else :
186228 # miss
229+ self .miss_count += 1
187230 part = b""
188231 end = min (self .size , end + self .blocksize )
232+ self .total_requested_bytes += end - start
189233 self .cache = self .fetcher (start , end ) # new block replaces old
190234 self .start = start
191235 self .end = self .start + len (self .cache )
@@ -202,24 +246,39 @@ class FirstChunkCache(BaseCache):
202246 name = "first"
203247
def __init__(self, blocksize: int, fetcher: Fetcher, size: int) -> None:
    """Set up the cache with a block size no larger than the file size.

    Parameters
    ----------
    blocksize: int
        Requested block size; replaced by ``size`` when it exceeds it.
    fetcher: Fetcher
        Passed through to the parent initialiser.
    size: int
        Passed through to the parent initialiser.
    """
    # a block larger than the file will buffer the whole thing, so cap it
    effective_blocksize = size if blocksize > size else blocksize
    super().__init__(effective_blocksize, fetcher, size)
    # nothing is cached until the first fetch populates it
    self.cache: bytes | None = None
207254
def _fetch(self, start: int | None, end: int | None) -> bytes:
    """Return the bytes in ``start:end``, caching only the first block.

    Parameters
    ----------
    start: int | None
        First byte wanted; ``None`` (or 0) means the beginning.
    end: int | None
        One past the last byte wanted; ``None`` means "up to the file
        size".  Values larger than ``self.size`` are clipped to it.

    Returns
    -------
    bytes
        The requested data, or ``b""`` when ``start`` lies beyond the file
        size.
    """
    start = start or 0
    if start > self.size:
        logger.debug("FirstChunkCache: requested start > file size")
        return b""

    # ``min(None, int)`` raises TypeError, so an open-ended request has to
    # be resolved to the file size before clipping.
    end = self.size if end is None else min(end, self.size)

    if start < self.blocksize:
        if self.cache is None:
            self.miss_count += 1
            if end > self.blocksize:
                # one request covers both the first block and the tail
                self.total_requested_bytes += end
                data = self.fetcher(0, end)
                self.cache = data[: self.blocksize]
                return data[start:]
            self.cache = self.fetcher(0, self.blocksize)
            self.total_requested_bytes += self.blocksize
        part = self.cache[start:end]
        if end > self.blocksize:
            # the request runs past the cached block: fetch the remainder
            self.total_requested_bytes += end - self.blocksize
            part += self.fetcher(self.blocksize, end)
        # NOTE(review): a request that has just populated the cache above
        # is counted as a miss and also as a hit here; accounting is left
        # as it was - confirm whether that is intended.
        self.hit_count += 1
        return part
    else:
        # entirely outside the first block: never cached
        self.miss_count += 1
        self.total_requested_bytes += end - start
        return self.fetcher(start, end)
224283
225284
@@ -256,12 +315,6 @@ def __init__(
256315 self .maxblocks = maxblocks
257316 self ._fetch_block_cached = functools .lru_cache (maxblocks )(self ._fetch_block )
258317
259- def __repr__ (self ) -> str :
260- return (
261- f"<BlockCache blocksize={ self .blocksize } , "
262- f"size={ self .size } , nblocks={ self .nblocks } >"
263- )
264-
265318 def cache_info (self ):
266319 """
267320 The statistics on the block cache.
@@ -319,6 +372,8 @@ def _fetch_block(self, block_number: int) -> bytes:
319372
320373 start = block_number * self .blocksize
321374 end = start + self .blocksize
375+ self .total_requested_bytes += end - start
376+ self .miss_count += 1
322377 logger .info ("BlockCache fetching block %d" , block_number )
323378 block_contents = super ()._fetch (start , end )
324379 return block_contents
@@ -339,6 +394,7 @@ def _read_cache(
339394 start_pos = start % self .blocksize
340395 end_pos = end % self .blocksize
341396
397+ self .hit_count += 1
342398 if start_block_number == end_block_number :
343399 block : bytes = self ._fetch_block_cached (start_block_number )
344400 return block [start_pos :end_pos ]
@@ -404,6 +460,7 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
404460 ):
405461 # cache hit: we have all the required data
406462 offset = start - self .start
463+ self .hit_count += 1
407464 return self .cache [offset : offset + end - start ]
408465
409466 if self .blocksize :
@@ -418,27 +475,34 @@ def _fetch(self, start: int | None, end: int | None) -> bytes:
418475 self .end is None or end > self .end
419476 ):
420477 # First read, or extending both before and after
478+ self .total_requested_bytes += bend - start
479+ self .miss_count += 1
421480 self .cache = self .fetcher (start , bend )
422481 self .start = start
423482 else :
424483 assert self .start is not None
425484 assert self .end is not None
485+ self .miss_count += 1
426486
427487 if start < self .start :
428488 if self .end is None or self .end - end > self .blocksize :
489+ self .total_requested_bytes += bend - start
429490 self .cache = self .fetcher (start , bend )
430491 self .start = start
431492 else :
493+ self .total_requested_bytes += self .start - start
432494 new = self .fetcher (start , self .start )
433495 self .start = start
434496 self .cache = new + self .cache
435497 elif self .end is not None and bend > self .end :
436498 if self .end > self .size :
437499 pass
438500 elif end - self .end > self .blocksize :
501+ self .total_requested_bytes += bend - start
439502 self .cache = self .fetcher (start , bend )
440503 self .start = start
441504 else :
505+ self .total_requested_bytes += bend - self .end
442506 new = self .fetcher (self .end , bend )
443507 self .cache = self .cache + new
444508
@@ -470,10 +534,13 @@ def __init__(
470534 ) -> None :
471535 super ().__init__ (blocksize , fetcher , size ) # type: ignore[arg-type]
472536 if data is None :
537+ self .miss_count += 1
538+ self .total_requested_bytes += self .size
473539 data = self .fetcher (0 , self .size )
474540 self .data = data
475541
def _fetch(self, start: int | None, stop: int | None) -> bytes:
    """Serve ``start:stop`` from the in-memory ``self.data`` buffer.

    Every call is counted as a cache hit.
    """
    self.hit_count = self.hit_count + 1
    window = slice(start, stop)
    return self.data[window]
478545
479546
@@ -551,6 +618,7 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
551618 # are allowed to pad reads beyond the
552619 # buffer with zero
553620 out += b"\x00 " * (stop - start - len (out ))
621+ self .hit_count += 1
554622 return out
555623 else :
556624 # The request ends outside a known range,
@@ -572,6 +640,8 @@ def _fetch(self, start: int | None, stop: int | None) -> bytes:
572640 f"IO/caching performance may be poor!"
573641 )
574642 logger .debug (f"KnownPartsOfAFile cache fetching { start } -{ stop } " )
643+ self .total_requested_bytes += stop - start
644+ self .miss_count += 1
575645 return out + super ()._fetch (start , stop )
576646
577647
@@ -676,12 +746,6 @@ def __init__(
676746 self ._fetch_future : Future [bytes ] | None = None
677747 self ._fetch_future_lock = threading .Lock ()
678748
679- def __repr__ (self ) -> str :
680- return (
681- f"<BackgroundBlockCache blocksize={ self .blocksize } , "
682- f"size={ self .size } , nblocks={ self .nblocks } >"
683- )
684-
685749 def cache_info (self ) -> UpdatableLRU .CacheInfo :
686750 """
687751 The statistics on the block cache.
@@ -799,6 +863,8 @@ def _fetch_block(self, block_number: int, log_info: str = "sync") -> bytes:
799863 start = block_number * self .blocksize
800864 end = start + self .blocksize
801865 logger .info ("BlockCache fetching block (%s) %d" , log_info , block_number )
866+ self .total_requested_bytes += end - start
867+ self .miss_count += 1
802868 block_contents = super ()._fetch (start , end )
803869 return block_contents
804870
@@ -818,6 +884,9 @@ def _read_cache(
818884 start_pos = start % self .blocksize
819885 end_pos = end % self .blocksize
820886
887+ # kind of pointless to count this as a hit, but it is
888+ self .hit_count += 1
889+
821890 if start_block_number == end_block_number :
822891 block = self ._fetch_block_cached (start_block_number )
823892 return block [start_pos :end_pos ]
0 commit comments