@@ -78,8 +78,20 @@ def _to_numpy_dates(d):
78
78
79
79
80
80
class BaseRecordsDataset :
81
+ """This is the base class for all datasets based on records.
82
+ Records datasets are datasets that can be indexed by time (int) or by group (str).
83
+ A record dataset is designed for observations, where multiple arrays of different shapes need to be stored for each date.
84
+ They have the same concept of start_date, end_date, frequency as fields datasets, but each date corresponds to a window.
85
+ All windows have the same size (the window span can be different from the dataset frequency)
81
86
82
- def __getitem__ (self , i ):
87
+ Variables in a records dataset are identified by a group and a name.
88
+ """
89
+
90
+ # Depending on the context, a variable is identified by "group.name",
91
+ # or using a dict with keys as groups and values as list of names.
92
+ # most of the code should be agnostic and transform one format to the other when needed.
93
+
94
+ def __getitem__ (self , i : int | str ):
83
95
if isinstance (i , str ):
84
96
return self ._getgroup (i )
85
97
@@ -90,15 +102,31 @@ def __getitem__(self, i):
90
102
91
103
@cached_property
92
104
def window (self ):
105
+ """Returns a string representation of the relative window of the dataset, such as '(-3h, 3h]'."""
93
106
return str (self ._window )
94
107
95
- def _getgroup (self , i ):
96
- return Tabular (self , i )
108
+ def _getgroup (self , group : str ):
109
+ """Returns a Tabular object for the group. This behaves like a partial function: the group argument is given but i is not."""
110
+ return Tabular (self , group )
97
111
98
- def _getrecord (self , i ):
112
+ def _getrecord (self , i : int ):
113
+ """Returns a Record object for the time step i. This behaves like a partial function: the argument i is given but the group is not."""
99
114
return Record (self , i )
100
115
101
- def _load_data (self , i ):
116
+ def _load_data (self , i : int ) -> dict :
117
+ """
118
+ Load the data for a specific time step or window (i).
119
+ It is expected to return a dict containing keys of the form:
120
+
121
+ - "data:group1" : numpy array
122
+ - "latitudes:group1" : numpy array
123
+ - "longitudes:group1" : numpy array
124
+ - "metadata:group1" :
125
+ - ...
126
+ - "data:group2" : numpy array
127
+ - "latitudes:group2" : numpy array
128
+ - ...
129
+ """
102
130
raise NotImplementedError ("Must be implemented in subclass" )
103
131
104
132
@property
@@ -221,6 +249,13 @@ class FieldsRecords(RecordsForward):
221
249
"""A wrapper around a FieldsDataset to provide a consistent interface for records datasets."""
222
250
223
251
def __init__ (self , fields_dataset , name ):
252
+ """Wrapper around a fields dataset to provide a consistent interface for records datasets.
253
+ A FieldsRecords appears as a RecordsDataset with a single group.
254
+ This allows merging fields datasets with other records datasets.
255
+ Parameters:
256
+ fields_dataset: must be a regular fields dataset
257
+ name: the name of the group
258
+ """
224
259
self .forward = fields_dataset
225
260
from anemoi .datasets .data .dataset import Dataset
226
261
@@ -293,7 +328,9 @@ def __len__(self):
293
328
return len (self .forward .dates )
294
329
295
330
296
- class GenericRename (RecordsForward ):
331
+ class BaseRename (RecordsForward ):
332
+ """Renames variables in a records dataset."""
333
+
297
334
def __init__ (self , dataset , rename ):
298
335
self .forward = dataset
299
336
assert isinstance (rename , dict )
@@ -320,16 +357,16 @@ def groups(self):
320
357
return [self .rename .get (k , k ) for k in self .forward .groups ]
321
358
322
359
323
- class Rename (GenericRename ):
360
+ class Rename (BaseRename ):
324
361
pass
325
362
326
363
327
- class SetGroup (GenericRename ):
364
+ class SetGroup (BaseRename ):
328
365
def __init__ (self , dataset , set_group ):
329
366
if len (dataset .groups ) != 1 :
330
367
raise ValueError (f"{ self .__class__ .__name__ } can only be used with datasets containing a single group." )
331
368
332
- super .__init__ (dataset , {dataset .groups [0 ]: set_group })
369
+ super () .__init__ (dataset , {dataset .groups [0 ]: set_group })
333
370
334
371
def _load_data (self , i ):
335
372
return self .dataset ._load_data (i )
@@ -411,6 +448,7 @@ def _to_timedelta(t):
411
448
412
449
413
450
class AbsoluteWindow :
451
+ # Not used, but expected to be useful when building datasets. It is also used in tests.
414
452
def __init__ (self , start , end , include_start = True , include_end = True ):
415
453
assert isinstance (start , datetime .datetime ), f"start must be a datetime.datetime, got { type (start )} "
416
454
assert isinstance (end , datetime .datetime ), f"end must be a datetime.datetime, got { type (end )} "
@@ -428,6 +466,14 @@ def __repr__(self):
428
466
429
467
430
468
class WindowsSpec :
469
+ # A window specified by relative timedeltas, such as (-6h, 0h]
470
+ #
471
+ # the term "WindowsSpec" is used here to avoid confusion between
472
+ # - a relative window, such as (-6h, 0h] which this class represents (WindowsSpec)
473
+ # - an actual time interval, such as [2023-01-01 00:00, 2023-01-01 06:00] which is an (AbsoluteWindow)
474
+ #
475
+ # but it is more confusing; it should be renamed to Window.
476
+
431
477
def __init__ (self , * , start , end , include_start = False , include_end = True ):
432
478
assert isinstance (start , (str , datetime .timedelta )), f"start must be a str or timedelta, got { type (start )} "
433
479
assert isinstance (end , (str , datetime .timedelta )), f"end must be a str or timedelta, got { type (end )} "
@@ -447,6 +493,7 @@ def __init__(self, *, start, end, include_start=False, include_end=True):
447
493
448
494
def to_absolute_window (self , date ):
449
495
"""Convert the window to an absolute window based on a date."""
496
+ # Not used, but expected to be useful when building datasets. It is also used in tests.
450
497
assert isinstance (date , datetime .datetime ), f"date must be a datetime.datetime, got { type (date )} "
451
498
start = date + self .start
452
499
end = date + self .end
@@ -466,6 +513,8 @@ def _frequency_to_string(t):
466
513
return f"{ first } { _frequency_to_string (self .start )} ,{ _frequency_to_string (self .end )} { last } "
467
514
468
515
def compute_mask (self , timedeltas ):
516
+ """Returns a boolean numpy array of the same shape as timedeltas."""
517
+
469
518
assert timedeltas .dtype == "timedelta64[s]" , f"expecting np.timedelta64[s], got { timedeltas .dtype } "
470
519
if self .include_start :
471
520
lower_mask = timedeltas >= self ._start_np
@@ -480,6 +529,9 @@ def compute_mask(self, timedeltas):
480
529
return lower_mask & upper_mask
481
530
482
531
def starts_before (self , my_dates , other_dates , other_window ):
532
+ # apply this window to my_dates[0] and the other_window to other_dates[0]
533
+ # return True if this window starts before the other window
534
+
483
535
assert my_dates .dtype == "datetime64[s]" , f"expecting np.datetime64[s], got { my_dates .dtype } "
484
536
assert other_dates .dtype == "datetime64[s]" , f"expecting np.datetime64[s], got { other_dates .dtype } "
485
537
assert isinstance (other_window , WindowsSpec ), f"other_window must be a WindowsSpec, got { type (other_window )} "
@@ -492,6 +544,7 @@ def starts_before(self, my_dates, other_dates, other_window):
492
544
return my_start <= other_start
493
545
494
546
def ends_after (self , my_dates , other_dates , other_window ):
547
+ # same as starts_before
495
548
assert my_dates .dtype == "datetime64[s]" , f"expecting np.datetime64[s], got { my_dates .dtype } "
496
549
assert other_dates .dtype == "datetime64[s]" , f"expecting np.datetime64[s], got { other_dates .dtype } "
497
550
assert isinstance (other_window , WindowsSpec ), f"other_window must be a WindowsSpec, got { type (other_window )} "
@@ -507,13 +560,15 @@ def ends_after(self, my_dates, other_dates, other_window):
507
560
508
561
509
562
class Rewindowed (RecordsForward ):
563
+ # change the window of a records dataset
564
+ # similar to changing the frequency of a dataset
565
+
510
566
def __init__ (self , dataset , window ):
511
567
super ().__init__ (dataset )
512
568
self .dataset = dataset
513
569
514
570
# in this class anything with 1 refers to the original window/dataset
515
571
# and anything with 2 refers to the new window/dataset
516
- # and we use _Δ for timedeltas
517
572
518
573
self ._window1 = self .forward ._window
519
574
self ._window2 = window_from_str (window )
@@ -602,6 +657,13 @@ def _load_data(self, i):
602
657
603
658
604
659
class Select (RecordsForward ):
660
+ # Select a subset of variables from a records dataset
661
+ # select can be a list of strings with dots (or a dict with keys as groups and values as list of strings)
662
+ #
663
+ # the selection is a filter, not a reordering, which is different from fields datasets and should be documented/fixed
664
+ #
665
+ # Drop should be implemented
666
+
605
667
def __init__ (self , dataset , select ):
606
668
super ().__init__ (dataset )
607
669
@@ -693,6 +755,8 @@ def statistics(self):
693
755
694
756
695
757
class RecordsSubset (RecordsForward ):
758
+ """Subset of a records dataset based on a list of integer indices."""
759
+
696
760
def __init__ (self , dataset , indices , reason ):
697
761
super ().__init__ (dataset )
698
762
self .dataset = dataset
@@ -711,6 +775,7 @@ def __len__(self):
711
775
712
776
713
777
class RecordsDataset (BaseRecordsDataset ):
778
+ """This is the base class for all datasets based on records stored on disk."""
714
779
715
780
def __init__ (self , path , backend = None , ** kwargs ):
716
781
if kwargs :
@@ -806,7 +871,13 @@ def tree(self):
806
871
807
872
808
873
class Record :
809
- def __init__ (self , dataset , n ):
874
+ """A record corresponds to a single time step in a record dataset."""
875
+
876
+ def __init__ (self , dataset : RecordsDataset , n : int ):
877
+ """A record corresponds to a single time step in a record dataset.
878
+ n : int, the index of the time step in the dataset.
879
+ dataset : RecordsDataset, the dataset this record belongs to.
880
+ """
810
881
self .dataset = dataset
811
882
self .n = n
812
883
@@ -867,6 +938,8 @@ def as_dict(self):
867
938
868
939
869
940
class Tabular :
941
+ """A RecordsDataset for a single group, similar to a fields dataset, but allowing different shapes for each date."""
942
+
870
943
def __init__ (self , dataset , name ):
871
944
self .dataset = dataset
872
945
self .name = name
0 commit comments