@@ -154,9 +154,13 @@ def setup_processed(self):
154
154
print ("Load data from file" , filename )
155
155
data = self ._load_data_from_file (filename )
156
156
print ("Create splits" )
157
- train , test = train_test_split (data , train_size = self .train_split )
157
+ train , test = train_test_split (
158
+ data , train_size = 1 - (self .validation_split + self .test_split )
159
+ )
158
160
del data
159
- test , val = train_test_split (test , train_size = self .train_split )
161
+ test , val = train_test_split (
162
+ test , train_size = self .test_split / (self .validation_split + self .test_split )
163
+ )
160
164
torch .save (train , os .path .join (self .processed_dir , "train.pt" ))
161
165
torch .save (test , os .path .join (self .processed_dir , "test.pt" ))
162
166
torch .save (val , os .path .join (self .processed_dir , "validation.pt" ))
@@ -179,6 +183,21 @@ def processed_file_names(self) -> List[str]:
179
183
"""
180
184
return ["test.pt" , "train.pt" , "validation.pt" ]
181
185
186
def _set_processed_data_props(self):
    """
    Zero out the label-count and feature-size metadata.

    Self-supervised learning with PubChem does not use this metadata,
    so both values are fixed at zero instead of being derived from the
    processed data.

    Sets:
        - self._num_of_labels: 0
        - self._feature_vector_size: 0
    """
    # Chained assignment binds left to right, so _num_of_labels is still
    # written before _feature_vector_size.
    self._num_of_labels = self._feature_vector_size = 0

    # Report both values on stdout.
    print(f"Number of labels for loaded data: {self._num_of_labels} ")
    print(f"Feature vector size: {self._feature_vector_size} ")
200
+
182
201
def _perform_data_preparation (self , * args , ** kwargs ):
183
202
"""
184
203
Checks for raw data and downloads if necessary.
0 commit comments