Skip to content

Commit 016b5ea

Browse files
authored
Merge pull request #119 from ChEB-AI/fix/pubchem-splits
Fix data splits for pubchem
2 parents fe01f5a + 4043b23 commit 016b5ea

File tree

1 file changed

+21
-2
lines changed

1 file changed

+21
-2
lines changed

chebai/preprocessing/datasets/pubchem.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,13 @@ def setup_processed(self):
154154
print("Load data from file", filename)
155155
data = self._load_data_from_file(filename)
156156
print("Create splits")
157-
train, test = train_test_split(data, train_size=self.train_split)
157+
train, test = train_test_split(
158+
data, train_size=1 - (self.validation_split + self.test_split)
159+
)
158160
del data
159-
test, val = train_test_split(test, train_size=self.train_split)
161+
test, val = train_test_split(
162+
test, train_size=self.test_split / (self.validation_split + self.test_split)
163+
)
160164
torch.save(train, os.path.join(self.processed_dir, "train.pt"))
161165
torch.save(test, os.path.join(self.processed_dir, "test.pt"))
162166
torch.save(val, os.path.join(self.processed_dir, "validation.pt"))
@@ -179,6 +183,21 @@ def processed_file_names(self) -> List[str]:
179183
"""
180184
return ["test.pt", "train.pt", "validation.pt"]
181185

186+
def _set_processed_data_props(self):
187+
"""
188+
Self-supervised learning with PubChem does not use this metadata, so both properties are set to zero.
189+
190+
Sets:
191+
- self._num_of_labels: 0
192+
- self._feature_vector_size: 0
193+
"""
194+
195+
self._num_of_labels = 0
196+
self._feature_vector_size = 0
197+
198+
print(f"Number of labels for loaded data: {self._num_of_labels}")
199+
print(f"Feature vector size: {self._feature_vector_size}")
200+
182201
def _perform_data_preparation(self, *args, **kwargs):
183202
"""
184203
Checks for raw data and downloads if necessary.

0 commit comments

Comments
 (0)