Merge branch 'dev' into fix/model-inference-dependencies

aditya0by0 · aditya0by0 · commit 795e67a5efaa · 2025-08-09T23:53:39.000+02:00
diff --git a/chebai/preprocessing/bin/smiles_token/tokens.txt b/chebai/preprocessing/bin/smiles_token/tokens.txt
@@ -819,3 +819,168 @@ p
 [16N]
 [17N]
 [14N]
+[Pb+2]
+[AlH4-]
+[BH4-]
+[Pt-2]
+[Cl+2]
+[I+3]
+[Br+2]
+[Cl+3]
+[Os-2]
+[Cr-2]
+[Hg-2]
+[PH]
+[Br+3]
+[I+2]
+[AsH2]
+[SH]
+[W-2]
+[Cd-2]
+[Ir-2]
+[Ru-2]
+[Rh-2]
+[Ag-2]
+[Be-2]
+[TeH2+]
+[13c]
+[13cH]
+[PH4]
+[AsH4]
+[As-2]
+[SbH3+]
+[SbH4]
+[BiH3]
+[BH3-]
+[GeH3]
+[GeH2]
+[SiH2-]
+[SiH2+]
+[SnH2]
+[SnH3]
+[SnH]
+[PbH]
+[PbH3]
+[Al-2]
+[B+2]
+[N+2]
+[SbH]
+[SbH2]
+[InH2]
+[GaH2]
+[TlH2]
+[Au+2]
+[sH+]
+[Hg+2]
+[Si-2]
+[Sn-2]
+[Pb-2]
+[AsH3]
+[Cr+2]
+[Ag+2]
+[V-2]
+[Ce-2]
+[13C@]
+[*+2]
+[He+2]
+[4He+2]
+[3He+2]
+[Eu+2]
+[Ge+2]
+[Os+2]
+[Y+2]
+[Gd+2]
+[La+2]
+[Se+2]
+[NH-2]
+[TeH2-]
+[AlH3-]
+[SbH3-]
+[AsH3-]
+[BiH3-]
+[PH3-]
+[CH2-2]
+[AsH4+]
+[AlH3+]
+[BiH3+]
+[FH+]
+[CH3+]
+[Te-2]
+[OH]
+[CH3]
+[18OH2]
+[OH3+]
+[OH4+2]
+[SH3]
+[SH3+]
+[SH3-]
+[SH4]
+[SeH2]
+[SeH-]
+[SeH3+]
+[SeH3-]
+[SeH3]
+[SeH+]
+[TeH2]
+[TeH-]
+[TeH3-]
+[TeH3+]
+[TeH+]
+[TeH3]
+[TeH4]
+[PoH2]
+[NH2]
+[NH+2]
+[PH5]
+[PH4+]
+[PH-2]
+[PH4-]
+[PH+2]
+[AsH2+]
+[AsH2-]
+[AsH+2]
+[AsH-2]
+[AsH5]
+[SbH3]
+[SbH4+]
+[SbH5]
+[BiH4+]
+[BiH5]
+[BiH4-]
+[BH2]
+[BH2+]
+[BH2-]
+[BH-2]
+[BH+2]
+[GeH4]
+[GeH3+]
+[GeH3-]
+[SiH3-]
+[SiH3+]
+[SiH+]
+[SiH4]
+[HeH+2]
+[HeH+]
+[AlH]
+[AlH+]
+[SnH4]
+[SnH3-]
+[SnH3+]
+[PbH4]
+[PbH3-]
+[PbH3+]
+[BeH4-2]
+[BeH]
+[BeH+]
+[BeH-]
+[BeH2]
+[AtH]
+[InH3]
+[GaH3]
+[TlH3]
+[IH3]
+[FeH6-4]
+[FH2+]
+[ClH2+]
+[BrH2+]
+[IH2+]
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -309,9 +309,7 @@ def _graph_to_raw_dataset(self, g: "nx.DiGraph") -> pd.DataFrame:
         data = pd.DataFrame(data)
         data = data[~data["SMILES"].isnull()]
         data = data[[name not in CHEBI_BLACKLIST for name, _ in data.iterrows()]]
-        # This filters the DataFrame to include only the rows where at least one value in the row from 4th column
-        # onwards is True/non-zero.
-        data = data[data.iloc[:, self._LABELS_START_IDX :].any(axis=1)]
+
         return data
 
     # ------------------------------ Phase: Setup data -----------------------------------
@@ -712,18 +710,24 @@ class ChEBIOverXPartial(ChEBIOverX):
         top_class_id (int): The ID of the top class from which to extract subclasses.
     """
 
-    def __init__(self, top_class_id: int, **kwargs):
+    def __init__(self, top_class_id: int, external_data_ratio: float, **kwargs):
         """
         Initializes the ChEBIOverXPartial dataset.
 
         Args:
             top_class_id (int): The ID of the top class from which to extract subclasses.
+            external_data_ratio (float): Share of external data (i.e., samples for which
+                top_class_id is not a positive label) to include in the dataset. Must lie
+                in [0, 1]: 0 means no external data, 1 means the maximum amount (i.e., the
+                complete ChEBI dataset).
             **kwargs: Additional keyword arguments passed to the superclass initializer.
+
+        Raises:
+            ValueError: If external_data_ratio is outside the interval [0, 1].
         """
+        # Ratios outside [0, 1] match neither the "0 < ratio < 1" nor the "ratio == 0"
+        # case during hierarchy extraction and would silently behave like 1, so they
+        # are rejected up front.
+        if not 0 <= external_data_ratio <= 1:
+            raise ValueError(
+                f"external_data_ratio must be in [0, 1], got {external_data_ratio}"
+            )
+
         if "top_class_id" not in kwargs:
             kwargs["top_class_id"] = top_class_id
+        if "external_data_ratio" not in kwargs:
+            kwargs["external_data_ratio"] = external_data_ratio
 
         self.top_class_id: int = top_class_id
+        self.external_data_ratio: float = external_data_ratio
         super().__init__(**kwargs)
 
     @property
@@ -737,7 +741,7 @@ def processed_dir_main(self) -> str:
         return os.path.join(
             self.base_dir,
             self._name,
-            f"partial_{self.top_class_id}",
+            f"partial_{self.top_class_id}_ext_ratio_{self.external_data_ratio:.2f}",
             "processed",
         )
 
@@ -756,9 +760,53 @@ def _extract_class_hierarchy(self, chebi_path: str) -> "nx.DiGraph":
             descendants of the top class ID.
         """
         g = super()._extract_class_hierarchy(chebi_path)
-        g = g.subgraph(list(g.successors(self.top_class_id)) + [self.top_class_id])
+        top_class_successors = list(g.successors(self.top_class_id)) + [
+            self.top_class_id
+        ]
+        external_nodes = list(set(n for n in g.nodes if n not in top_class_successors))
+        if 0 < self.external_data_ratio < 1:
+            n_external_nodes = int(
+                len(top_class_successors)
+                * self.external_data_ratio
+                / (1 - self.external_data_ratio)
+            )
+            print(
+                f"Extracting {n_external_nodes} external nodes from the ChEBI dataset (ratio: {self.external_data_ratio:.2f})"
+            )
+            external_nodes = external_nodes[: int(n_external_nodes)]
+        elif self.external_data_ratio == 0:
+            external_nodes = []
+
+        g = g.subgraph(top_class_successors + external_nodes)
+        print(
+            f"Subgraph contains {len(g.nodes)} nodes, of which {len(top_class_successors)} are subclasses of the top class ID {self.top_class_id}."
+        )
         return g
 
+    def select_classes(self, g: "nx.DiGraph", *args, **kwargs) -> List:
+        """
+        Selects the label classes of the partial dataset and writes them to `classes.txt`.
+
+        A node is selected if at least `self.THRESHOLD` of its successors have a non-None
+        `smiles` attribute AND it is either `self.top_class_id` itself or has
+        `self.top_class_id` among its predecessors in `g`.
+
+        Args:
+            g (nx.DiGraph): The class hierarchy graph. The `smiles` attribute is looked up
+                for every successor of every node.
+            *args: Unused.
+            **kwargs: Unused.
+
+        Returns:
+            List: The selected node IDs in ascending order.
+        """
+        # Imported locally and annotated as a string: the other signatures in this module
+        # quote "nx.DiGraph", so this method does not rely on `nx` being bound at module
+        # level. NOTE(review): confirm against the module's import block.
+        import networkx as nx
+
+        smiles = nx.get_node_attributes(g, "smiles")
+
+        def has_enough_smiles(node) -> bool:
+            # Counts the successors that actually carry a SMILES string.
+            return (
+                sum(smiles[s] is not None for s in g.successors(node))
+                >= self.THRESHOLD
+            )
+
+        def is_in_top_class(node) -> bool:
+            # An edge top_class_id -> node is the same condition as top_class_id being
+            # one of the node's predecessors, without scanning the predecessor iterator.
+            return node == self.top_class_id or g.has_edge(self.top_class_id, node)
+
+        nodes = sorted(
+            node
+            for node in g.nodes
+            if has_enough_smiles(node) and is_in_top_class(node)
+        )
+        filename = "classes.txt"
+        with open(
+            os.path.join(self.processed_dir_main, filename), "wt", encoding="utf-8"
+        ) as fout:
+            fout.writelines(f"{node}\n" for node in nodes)
+        return nodes
+
 
 class ChEBIOver50Partial(ChEBIOverXPartial, ChEBIOver50):
     """
@@ -854,7 +902,7 @@ def term_callback(doc: "fastobo.term.TermFrame") -> Union[Dict, bool]:
 
 
 atom_index = (
-    "\*",
+    r"\*",
     "H",
     "He",
     "Li",
@@ -1485,3 +1533,15 @@ def term_callback(doc: "fastobo.term.TermFrame") -> Union[Dict, bool]:
 ]
 
 JCI_500_COLUMNS_INT = [int(n.split(":")[-1]) for n in JCI_500_COLUMNS]
+
+if __name__ == "__main__":
+    # Script entry point: builds a ChEBIOver50Partial data module for top class 22712
+    # with an external data ratio of 0.5 (ChEBI version 241, splits read from the CSV
+    # path below), then runs data preparation and setup.
+    data_module_05 = ChEBIOver50Partial(
+        chebi_version=241,
+        splits_file_path=os.path.join(
+            "data", "chebi_v241", "ChEBI50", "splits_80_10_10.csv"
+        ),
+        top_class_id=22712,
+        external_data_ratio=0.5,
+    )
+    data_module_05.prepare_data()
+    data_module_05.setup()
diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
@@ -154,9 +154,13 @@ def setup_processed(self):
         print("Load data from file", filename)
         data = self._load_data_from_file(filename)
         print("Create splits")
-        train, test = train_test_split(data, train_size=self.train_split)
+        train, test = train_test_split(
+            data, train_size=1 - (self.validation_split + self.test_split)
+        )
         del data
-        test, val = train_test_split(test, train_size=self.train_split)
+        test, val = train_test_split(
+            test, train_size=self.test_split / (self.validation_split + self.test_split)
+        )
         torch.save(train, os.path.join(self.processed_dir, "train.pt"))
         torch.save(test, os.path.join(self.processed_dir, "test.pt"))
         torch.save(val, os.path.join(self.processed_dir, "validation.pt"))
@@ -179,6 +183,21 @@ def processed_file_names(self) -> List[str]:
         """
         return ["test.pt", "train.pt", "validation.pt"]
 
+    def _set_processed_data_props(self):
+        """
+        Zeroes the processed-data metadata, which self-supervised learning with PubChem
+        does not use.
+
+        Sets:
+            - self._num_of_labels: 0
+            - self._feature_vector_size: 0
+        """
+        self._num_of_labels = self._feature_vector_size = 0
+
+        for message in (
+            f"Number of labels for loaded data: {self._num_of_labels}",
+            f"Feature vector size: {self._feature_vector_size}",
+        ):
+            print(message)
+
     def _perform_data_preparation(self, *args, **kwargs):
         """
         Checks for raw data and downloads if necessary.
diff --git a/chebai/preprocessing/reader.py b/chebai/preprocessing/reader.py
@@ -6,6 +6,7 @@
 from typing import Any, Dict, List, Optional
 
 from pysmiles.read_smiles import _tokenize
+from rdkit import Chem
 
 from chebai.preprocessing.collate import DefaultCollator, RaggedCollator
 
@@ -173,21 +174,35 @@ class ChemDataReader(TokenIndexerReader):
 
     COLLATOR = RaggedCollator
 
+    def __init__(self, canonicalize_smiles: bool = True, *args, **kwargs) -> None:
+        """
+        Initializes the reader and stores whether SMILES strings are canonicalized.
+
+        Args:
+            canonicalize_smiles (bool): If True, `_read_data` canonicalizes each SMILES
+                string with RDKit before tokenizing it. Defaults to True.
+            *args: Additional positional arguments passed to the superclass initializer.
+            **kwargs: Additional keyword arguments passed to the superclass initializer.
+        """
+        # NOTE(review): canonicalize_smiles precedes *args, so the first positional
+        # argument binds to it instead of being forwarded to the superclass.
+        super().__init__(*args, **kwargs)
+        self.canonicalize_smiles = canonicalize_smiles
+        print(f"Using SMILES canonicalization: {self.canonicalize_smiles}")
+
     @classmethod
     def name(cls) -> str:
         """Returns the name of the data reader."""
         return "smiles_token"
 
     def _read_data(self, raw_data: str) -> List[int]:
         """
-        Reads and tokenizes raw SMILES data into a list of token indices.
+        Reads and tokenizes raw SMILES data into a list of token indices. If
+        `self.canonicalize_smiles` is set, the SMILES string is first canonicalized using
+        RDKit; if RDKit returns no molecule or raises, the input string is tokenized as given.
 
         Args:
             raw_data (str): The raw SMILES string to be tokenized.
 
         Returns:
             List[int]: A list of integers representing the indices of the SMILES tokens.
         """
+        if self.canonicalize_smiles:
+            try:
+                mol = Chem.MolFromSmiles(raw_data.strip())
+                # A None result leaves raw_data untouched.
+                if mol is not None:
+                    raw_data = Chem.MolToSmiles(mol, canonical=True)
+            except Exception as e:
+                # Best effort: report the failure, then tokenize the unchanged input below.
+                print(f"RDKit failed to process {raw_data}")
+                print(f"\t{e}")
+
         return [self._get_token_index(v[1]) for v in _tokenize(raw_data)]
 
 
diff --git a/chebai/result/generate_class_properties.py b/chebai/result/generate_class_properties.py
diff --git a/tests/unit/dataset_classes/testChebiOverXPartial.py b/tests/unit/dataset_classes/testChebiOverXPartial.py