Skip to content
Merged
Show file tree
Hide file tree
Changes from 46 commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
cc5bc08
move previous tests to integration dir
aditya0by0 Aug 29, 2024
5af0351
unit dir + test for ChemDataReader
aditya0by0 Aug 29, 2024
a0810a2
Test for DataReader
aditya0by0 Aug 29, 2024
1b3836d
tests for DeepChemReader
aditya0by0 Aug 29, 2024
aa467c6
Test for SelfiesReader
aditya0by0 Aug 29, 2024
b6f5e51
test for ProteinDataReader
aditya0by0 Aug 30, 2024
73f05c0
test for DefaultCollator
aditya0by0 Aug 30, 2024
8007f37
test for RaggedCollator
aditya0by0 Aug 31, 2024
248eaa7
modify tests to use `setUpClass` class method instead of `setUp` inst…
aditya0by0 Aug 31, 2024
3e57d78
bool labels instead of numeric, for realistic data
aditya0by0 Sep 1, 2024
f9ca653
test for XYBaseDataModule
aditya0by0 Sep 1, 2024
d8016aa
test for DynamicDataset
aditya0by0 Sep 1, 2024
0c7c5b8
add relevant msg to each assert statement
aditya0by0 Sep 1, 2024
c0aaeea
test data class for chebi ontology
aditya0by0 Sep 4, 2024
764216e
test for term callback + mock data changes
aditya0by0 Sep 4, 2024
1dd8428
test for chebidataextractor + changes in mock data
aditya0by0 Sep 5, 2024
f3519b5
mock reader for all + test_setup_pruned_test_set changes
aditya0by0 Sep 5, 2024
fc0fd47
fix for misalignment between x and y in RaggedCollator
aditya0by0 Sep 5, 2024
f7f1631
test for ChebiOverX
aditya0by0 Sep 6, 2024
bf45bb5
test for ChebiXOverPartial
aditya0by0 Sep 6, 2024
17bf584
Mock data for GOUniProt
aditya0by0 Sep 9, 2024
c6c5a59
test for GOUniProtDataExtractor
aditya0by0 Sep 9, 2024
78f5289
Merge branch 'protein_prediction' into additional_unit_tests
aditya0by0 Sep 9, 2024
427bc60
update test to new method name _extract_class_hierarchy
aditya0by0 Sep 9, 2024
c01ecde
test for GOUniProtOverX
aditya0by0 Sep 9, 2024
dfd084e
test for _load_data_from_file for Tox21MolNet
aditya0by0 Sep 10, 2024
77956d4
_load_data_from_file test case Tox21Challenge
aditya0by0 Sep 16, 2024
a3670b0
test for Tox21Chal
aditya0by0 Sep 17, 2024
ac3ac19
patch `os.makedirs` in tests to avoid creating directories
aditya0by0 Sep 17, 2024
44a1dfd
add test case for invalid token/input to read_data
aditya0by0 Sep 22, 2024
aab0fea
test case for `Tox21MolNet.setup_processed` simple split
aditya0by0 Sep 25, 2024
fc8182e
test case for `Tox21MolNet.setup_processed` group split
aditya0by0 Sep 25, 2024
e4caae8
add group key + convert generator to list
aditya0by0 Sep 25, 2024
43c2408
Merge branch 'refactor_tox21MolNet' into additional_unit_tests
aditya0by0 Sep 25, 2024
05f8f0c
Merge branch 'refactor_term_callback' into additional_unit_tests
aditya0by0 Sep 25, 2024
1d3ecbe
update chebi test as per modified term_callback
aditya0by0 Sep 25, 2024
d6726cc
Merge branch 'refactor_term_callback' into additional_unit_tests
aditya0by0 Sep 25, 2024
35a621c
group key not needed for Tox21Chal._load_dict
aditya0by0 Sep 25, 2024
c2e6897
Merge branch 'dev' into additional_unit_tests
aditya0by0 Oct 1, 2024
016134f
Obsolete terms being the parent of valid terms
aditya0by0 Oct 1, 2024
d873bd7
Merge branch 'refactor_term_callback' into additional_unit_tests
aditya0by0 Oct 1, 2024
553f083
Merge branch 'protein_prediction' into additional_unit_tests
aditya0by0 Oct 1, 2024
b479d5a
remove obsolete path for mocked open func
aditya0by0 Oct 5, 2024
adedc09
test single label split scenario implemented in #54
aditya0by0 Oct 5, 2024
65c2d9b
test output format for Tox21MolNet._load_data_from_file
aditya0by0 Oct 5, 2024
72dd50f
Merge branch 'dev' into additional_unit_tests
aditya0by0 Oct 5, 2024
a63c010
DynamicDataset: check split stratification
aditya0by0 Oct 5, 2024
309daed
Merge branch 'dev' into additional_unit_tests
aditya0by0 Oct 11, 2024
e38d1ab
Merge branch 'protein_prediction' into additional_unit_tests
aditya0by0 Oct 12, 2024
e3c4b6e
fix testcase for GO
aditya0by0 Oct 12, 2024
1470e93
Merge branch 'protein_prediction' into additional_unit_tests
aditya0by0 Oct 19, 2024
c1ddd17
update testcase as per transitive go ids
aditya0by0 Oct 20, 2024
bf6bc4a
remove test for tox21mol net
aditya0by0 Oct 20, 2024
b915b0d
Revert "add group key + convert generator to list"
aditya0by0 Oct 22, 2024
282bc09
Merge branch 'dev' into additional_unit_tests
aditya0by0 Nov 1, 2024
a71b199
update swiss data for pretraining test
aditya0by0 Nov 2, 2024
8abd14d
add test for protein pretraining class
aditya0by0 Nov 2, 2024
aae57d3
test : reformat with precommit
aditya0by0 Nov 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8,000 changes: 8,000 additions & 0 deletions chebai/preprocessing/bin/protein_token_3_gram/tokens.txt

Large diffs are not rendered by default.

32 changes: 19 additions & 13 deletions chebai/preprocessing/datasets/go_uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ def __init__(self, **kwargs):
self.max_sequence_length >= 1
), "Max sequence length should be greater than or equal to 1."

if self.reader.n_gram is not None:
assert self.max_sequence_length >= self.reader.n_gram, (
f"max_sequence_length ({self.max_sequence_length}) must be greater than "
f"or equal to n_gram ({self.reader.n_gram})."
)

@classmethod
def _get_go_branch(cls, **kwargs) -> str:
"""
Expand Down Expand Up @@ -536,7 +542,8 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:

This method overrides the dataloader method from the superclass. After fetching the dataset from the
superclass, it truncates the 'features' of each data instance to a maximum length specified by
`self.max_sequence_length`.
`self.max_sequence_length`. The truncation is adjusted based on the value of `n_gram` to ensure that
the correct number of amino acids is preserved in the truncated sequences.

Args:
kind (str): The kind of data to load (e.g., 'train', 'val', 'test').
Expand All @@ -547,9 +554,18 @@ def dataloader(self, kind: str, **kwargs) -> DataLoader:
"""
dataloader = super().dataloader(kind, **kwargs)

# Truncate the 'features' to max_sequence_length for each instance
if self.reader.n_gram is None:
# Truncate the 'features' to max_sequence_length for each instance
truncate_index = self.max_sequence_length
else:
# If n_gram is given, adjust truncation to ensure maximum sequence length refers to the maximum number of
# amino acids in sequence rather than number of n-grams. Eg, Sequence "ABCDEFGHIJ" can form 8 trigrams,
# if max length is 5, then only first 3 trigrams should be considered as they are formed by first 5 letters.
truncate_index = self.max_sequence_length - (self.reader.n_gram - 1)

for instance in dataloader.dataset:
instance["features"] = instance["features"][: self.max_sequence_length]
instance["features"] = instance["features"][:truncate_index]

return dataloader

# ------------------------------ Phase: Raw Properties -----------------------------------
Expand All @@ -563,16 +579,6 @@ def base_dir(self) -> str:
"""
return os.path.join("data", f"GO_UniProt")

@property
def identifier(self) -> tuple:
"""Identifier for the dataset."""
# overriding identifier instead of reader.name to keep same tokens.txt file, but different processed_dir folder
if not isinstance(self.reader, dr.ProteinDataReader):
raise ValueError("Need Protein DataReader for identifier")
if self.reader.n_gram is not None:
return (f"{self.reader.name()}_{self.reader.n_gram}_gram",)
return (self.reader.name(),)

@property
def raw_file_names_dict(self) -> dict:
"""
Expand Down
7 changes: 5 additions & 2 deletions chebai/preprocessing/datasets/tox21.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def download(self) -> None:
def setup_processed(self) -> None:
"""Processes and splits the dataset."""
print("Create splits")
data = self._load_data_from_file(os.path.join(self.raw_dir, f"tox21.csv"))
data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"tox21.csv")))
groups = np.array([d["group"] for d in data])
if not all(g is None for g in groups):
split_size = int(len(set(groups)) * self.train_split)
Expand Down Expand Up @@ -145,7 +145,10 @@ def _load_data_from_file(self, input_file_path: str) -> List[Dict]:
labels = [
bool(int(l)) if l else None for l in (row[k] for k in self.HEADERS)
]
yield dict(features=smiles, labels=labels, ident=row["mol_id"])
group = row.get("group", None)
yield dict(
features=smiles, labels=labels, ident=row["mol_id"], group=group
)


class Tox21Challenge(XYBaseDataModule):
Expand Down
6 changes: 4 additions & 2 deletions chebai/preprocessing/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,14 +372,16 @@ class ProteinDataReader(DataReader):
"V",
]

@classmethod
def name(cls) -> str:
def name(self) -> str:
"""
Returns the name of the data reader. This method identifies the specific type of data reader.

Returns:
str: The name of the data reader, which is "protein_token".
"""
if self.n_gram is not None:
return f"protein_token_{self.n_gram}_gram"

return "protein_token"

def __init__(self, *args, n_gram: Optional[int] = None, **kwargs):
Expand Down
3 changes: 3 additions & 0 deletions tests/integration/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
This directory contains integration tests that cover the overall behavior of the data preprocessing tool.
"""
File renamed without changes.
File renamed without changes.
File renamed without changes.
4 changes: 4 additions & 0 deletions tests/unit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
This directory contains unit tests, which focus on individual functions and methods, ensuring they work as
expected in isolation.
"""
Empty file.
65 changes: 65 additions & 0 deletions tests/unit/collators/testDefaultCollator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import unittest
from typing import Dict, List

from chebai.preprocessing.collate import DefaultCollator
from chebai.preprocessing.structures import XYData


class TestDefaultCollator(unittest.TestCase):
    """Unit tests covering the collation behaviour of DefaultCollator."""

    @classmethod
    def setUpClass(cls) -> None:
        """Create one DefaultCollator instance shared across all test methods."""
        cls.collator = DefaultCollator()

    def test_call_with_valid_data(self) -> None:
        """Collating two well-formed samples should produce XYData whose
        x/y hold the features and labels in batch order."""
        batch: List[Dict] = [
            {"features": [1.0, 2.0], "labels": [True, False, True]},
            {"features": [3.0, 4.0], "labels": [False, False, True]},
        ]

        collated: XYData = self.collator(batch)

        self.assertIsInstance(
            collated, XYData, "The result should be an instance of XYData."
        )
        self.assertEqual(
            collated.x,
            ([1.0, 2.0], [3.0, 4.0]),
            "The feature data 'x' does not match the expected output.",
        )
        self.assertEqual(
            collated.y,
            ([True, False, True], [False, False, True]),
            "The label data 'y' does not match the expected output.",
        )

    def test_call_with_empty_data(self) -> None:
        """An empty batch has nothing to unpack, so the collator is expected
        to raise ValueError with the standard unpacking message."""
        empty_batch: List[Dict] = []

        with self.assertRaises(ValueError) as context:
            self.collator(empty_batch)

        self.assertEqual(
            str(context.exception),
            "not enough values to unpack (expected 2, got 0)",
            "The exception message for empty data is not as expected.",
        )


if __name__ == "__main__":
    unittest.main()
204 changes: 204 additions & 0 deletions tests/unit/collators/testRaggedCollator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import unittest
from typing import Dict, List, Tuple

import torch

from chebai.preprocessing.collate import RaggedCollator
from chebai.preprocessing.structures import XYData


class TestRaggedCollator(unittest.TestCase):
    """
    Unit tests for the RaggedCollator class, which pads ragged (variable-length)
    feature and label sequences into dense tensors with masks and lengths.
    """

    @classmethod
    def setUpClass(cls) -> None:
        """
        Set up the test environment by initializing a RaggedCollator instance.
        """
        cls.collator = RaggedCollator()

    def test_call_with_valid_data(self) -> None:
        """
        Test the __call__ method with valid ragged data to ensure features, labels, and masks are correctly handled.
        """
        data: List[Dict] = [
            {"features": [1, 2], "labels": [True, False], "ident": "sample1"},
            {"features": [3, 4, 5], "labels": [False, True, True], "ident": "sample2"},
            {"features": [6], "labels": [True], "ident": "sample3"},
        ]

        result: XYData = self.collator(data)

        # Features are right-padded with 0 to the longest sequence (length 3).
        expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
        # Labels are right-padded with False to the longest label row.
        expected_y = torch.tensor(
            [[True, False, False], [False, True, True], [True, False, False]]
        )
        # Mask marks real (non-padding) feature positions.
        expected_mask_for_x = torch.tensor(
            [[True, True, False], [True, True, True], [True, False, False]]
        )
        expected_lens_for_x = torch.tensor([2, 3, 1])

        self.assertTrue(
            torch.equal(result.x, expected_x),
            "The feature tensor 'x' does not match the expected output.",
        )
        self.assertTrue(
            torch.equal(result.y, expected_y),
            "The label tensor 'y' does not match the expected output.",
        )
        self.assertTrue(
            torch.equal(
                result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
            ),
            "The mask tensor does not match the expected output.",
        )
        self.assertTrue(
            torch.equal(
                result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
            ),
            "The lens tensor does not match the expected output.",
        )
        self.assertEqual(
            result.additional_fields["idents"],
            ("sample1", "sample2", "sample3"),
            "The identifiers do not match the expected output.",
        )

    def test_call_with_missing_entire_labels(self) -> None:
        """
        Test the __call__ method with data where some samples are missing labels.

        Samples with `labels=None` keep their features (and masks/lens) but are
        dropped from the label tensor; their indices are reported via
        `loss_kwargs["non_null_labels"]`.
        """
        data: List[Dict] = [
            {"features": [1, 2], "labels": [True, False], "ident": "sample1"},
            {"features": [3, 4, 5], "labels": None, "ident": "sample2"},
            {"features": [6], "labels": [True], "ident": "sample3"},
        ]

        result: XYData = self.collator(data)

        # https://github.com/ChEB-AI/python-chebai/pull/48#issuecomment-2324393829
        expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
        expected_y = torch.tensor(
            [[True, False], [True, False]]
        )  # True -> 1, False -> 0
        expected_mask_for_x = torch.tensor(
            [[True, True, False], [True, True, True], [True, False, False]]
        )
        expected_lens_for_x = torch.tensor([2, 3, 1])

        self.assertTrue(
            torch.equal(result.x, expected_x),
            "The feature tensor 'x' does not match the expected output when labels are missing.",
        )
        self.assertTrue(
            torch.equal(result.y, expected_y),
            "The label tensor 'y' does not match the expected output when labels are missing.",
        )
        self.assertTrue(
            torch.equal(
                result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
            ),
            "The mask tensor does not match the expected output when labels are missing.",
        )
        self.assertTrue(
            torch.equal(
                result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
            ),
            "The lens tensor does not match the expected output when labels are missing.",
        )
        self.assertEqual(
            result.additional_fields["loss_kwargs"]["non_null_labels"],
            [0, 2],
            "The non-null labels list does not match the expected output.",
        )
        # Each labelled sample contributes one ROW to y, so the count of
        # non-null samples must equal the batch (row) dimension y.shape[0].
        # (Previously compared against y.shape[1], the padded label width,
        # which was only coincidentally equal to 2 for this fixture.)
        self.assertEqual(
            len(result.additional_fields["loss_kwargs"]["non_null_labels"]),
            result.y.shape[0],
            "The length of the non-null labels list must match the number of label rows in y",
        )
        self.assertEqual(
            result.additional_fields["idents"],
            ("sample1", "sample2", "sample3"),
            "The identifiers do not match the expected output when labels are missing.",
        )

    def test_call_with_none_in_labels(self) -> None:
        """
        Test the __call__ method with data where one of the elements in the labels is None.

        A None element inside a label row is coerced to False; the sample itself
        is kept in the label tensor.
        """
        data: List[Dict] = [
            {"features": [1, 2], "labels": [None, True], "ident": "sample1"},
            {"features": [3, 4, 5], "labels": [True, False], "ident": "sample2"},
            {"features": [6], "labels": [True], "ident": "sample3"},
        ]

        result: XYData = self.collator(data)

        expected_x = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
        expected_y = torch.tensor(
            [[False, True], [True, False], [True, False]]
        )  # None -> False
        expected_mask_for_x = torch.tensor(
            [[True, True, False], [True, True, True], [True, False, False]]
        )
        expected_lens_for_x = torch.tensor([2, 3, 1])

        self.assertTrue(
            torch.equal(result.x, expected_x),
            "The feature tensor 'x' does not match the expected output when labels contain None.",
        )
        self.assertTrue(
            torch.equal(result.y, expected_y),
            "The label tensor 'y' does not match the expected output when labels contain None.",
        )
        self.assertTrue(
            torch.equal(
                result.additional_fields["model_kwargs"]["mask"], expected_mask_for_x
            ),
            "The mask tensor does not match the expected output when labels contain None.",
        )
        self.assertTrue(
            torch.equal(
                result.additional_fields["model_kwargs"]["lens"], expected_lens_for_x
            ),
            "The lens tensor does not match the expected output when labels contain None.",
        )
        self.assertEqual(
            result.additional_fields["idents"],
            ("sample1", "sample2", "sample3"),
            "The identifiers do not match the expected output when labels contain None.",
        )

    def test_call_with_empty_data(self) -> None:
        """
        Test the __call__ method with an empty list to ensure it raises an error.
        """
        data: List[Dict] = []

        with self.assertRaises(
            Exception, msg="Expected an Error when no data is provided"
        ):
            self.collator(data)

    def test_process_label_rows(self) -> None:
        """
        Test the process_label_rows method to ensure it pads label sequences correctly.
        """
        labels: Tuple = ([True, False], [False, True, True], [True])

        result: torch.Tensor = self.collator.process_label_rows(labels)

        expected_output = torch.tensor(
            [[True, False, False], [False, True, True], [True, False, False]]
        )

        self.assertTrue(
            torch.equal(result, expected_output),
            "The processed label rows tensor does not match the expected output.",
        )


if __name__ == "__main__":
    unittest.main()
Empty file.
Loading
Loading