
Avoid using iterrows, use vectorization wherever possible #120


Draft: wants to merge 3 commits into dev
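For context on why the change matters: iterrows materializes a full pandas Series for every row, itertuples yields lightweight namedtuples, and column-level operations stay entirely in compiled code. A minimal illustrative benchmark sketch (toy data, not from this repo):

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.random.rand(100_000), "y": np.random.rand(100_000)})

# Slowest: iterrows builds a Series object per row
total = sum(row["x"] * row["y"] for _, row in df.iterrows())

# Faster: itertuples yields plain namedtuples
total = sum(row.x * row.y for row in df.itertuples(index=False))

# Fastest: fully vectorized, the multiply-and-sum runs in C
total = float((df["x"] * df["y"]).sum())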
87 changes: 44 additions & 43 deletions chebai/preprocessing/datasets/chebi.py
@@ -17,6 +17,7 @@

 import fastobo
 import networkx as nx
+import numpy as np
 import pandas as pd
 import requests
 import torch
@@ -300,7 +301,7 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:

         data = pd.DataFrame(data)
         data = data[~data["SMILES"].isnull()]
-        data = data[[name not in CHEBI_BLACKLIST for name, _ in data.iterrows()]]
+        data = data[~data["name"].isin(CHEBI_BLACKLIST)]

         return data

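A standalone sketch of the pattern this hunk applies, replacing a per-row membership test with a vectorized isin mask; the toy frame and blacklist below are illustrative, not the project's data:

import pandas as pd

BLACKLIST = {"bad_a", "bad_b"}  # hypothetical stand-in for CHEBI_BLACKLIST

data = pd.DataFrame({"name": ["water", "bad_a", "glucose"], "SMILES": ["O", "C", "OCC"]})

# Per-row loop (old style): one Python iteration per row
mask_loop = [name not in BLACKLIST for name in data["name"]]

# Vectorized (new style): the membership test runs column-wise
mask_vec = ~data["name"].isin(BLACKLIST)

assert mask_loop == list(mask_vec)

One thing worth double-checking in review: the deleted line tested the iterrows index (the first element of each pair), while the replacement tests the name column, so the two only agree if the index holds the names.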
@@ -358,18 +359,18 @@ def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
         """
         with open(input_file_path, "rb") as input_file:
             df = pd.read_pickle(input_file)
-            if self.single_class is not None:
-                single_cls_index = list(df.columns).index(int(self.single_class))
-            for row in df.values:
-                if self.single_class is None:
-                    labels = row[self._LABELS_START_IDX :].astype(bool)
-                else:
-                    labels = [bool(row[single_cls_index])]
-                yield dict(
-                    features=row[self._DATA_REPRESENTATION_IDX],
-                    labels=labels,
-                    ident=row[self._ID_IDX],
-                )
+
+            if self.single_class is None:
+                all_labels = df.iloc[:, self._LABELS_START_IDX :].to_numpy(dtype=bool)
+            else:
+                single_cls_index = df.columns.get_loc(int(self.single_class))
+                all_labels = df.iloc[:, [single_cls_index]].to_numpy(dtype=bool)
+
+            features = df.iloc[:, self._DATA_REPRESENTATION_IDX].to_numpy()
+            idents = df.iloc[:, self._ID_IDX].to_numpy()
+
+            for feat, labels, ident in zip(features, all_labels, idents):
+                yield dict(features=feat, labels=labels, ident=ident)

     # ------------------------------ Phase: Dynamic Splits -----------------------------------
     def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
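A self-contained sketch of the _load_dict change: the label columns are converted to one boolean matrix up front, so the per-row work inside the generator shrinks to a zip. The column layout below is invented for illustration:

import pandas as pd

# Toy layout: column 0 = ident, column 1 = features, columns 2+ = labels
df = pd.DataFrame({"ident": [10, 11], "features": ["CO", "CCO"], "cls_a": [1, 0], "cls_b": [0, 1]})
LABELS_START_IDX = 2

all_labels = df.iloc[:, LABELS_START_IDX:].to_numpy(dtype=bool)  # one bulk conversion
features = df["features"].to_numpy()
idents = df["ident"].to_numpy()

for feat, labels, ident in zip(features, all_labels, idents):
    print(dict(features=feat, labels=labels, ident=ident))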
@@ -465,43 +466,43 @@ def _setup_pruned_test_set(
         Returns:
             pd.DataFrame: The pruned test dataset.
         """
-        # TODO: find a more efficient way to do this
-        filename_old = "classes.txt"
-        # filename_new = f"classes_v{self.chebi_version_train}.txt"
-        # dataset = torch.load(os.path.join(self.processed_dir, "test.pt"))
-
-        # Load original classes (from the current ChEBI version - chebi_version)
-        with open(os.path.join(self.processed_dir_main, filename_old), "r") as file:
-            orig_classes = file.readlines()
+        classes_file_name = "classes.txt"

-        # Load new classes (from the training ChEBI version - chebi_version_train)
+        # Load original and new classes
+        with open(os.path.join(self.processed_dir_main, classes_file_name), "r") as f:
+            orig_classes = f.readlines()
         with open(
             os.path.join(
-                self._chebi_version_train_obj.processed_dir_main, filename_old
+                self._chebi_version_train_obj.processed_dir_main, classes_file_name
             ),
             "r",
-        ) as file:
-            new_classes = file.readlines()
-
-        # Create a mapping which give index of a class from chebi_version, if the corresponding
-        # class exists in chebi_version_train, Size = Number of classes in chebi_version
-        mapping = [
-            None if or_class not in new_classes else new_classes.index(or_class)
-            for or_class in orig_classes
-        ]
+        ) as f:
+            new_classes = f.readlines()
+
+        # Mapping array (-1 means no match in new classes)
+        mapping_array = np.array(
+            [
+                -1 if oc not in new_classes else new_classes.index(oc)
+                for oc in orig_classes
+            ],
+            dtype=int,
+        )
+
+        # Convert labels column to 2D NumPy array
+        labels_matrix = np.array(df_test_chebi_version["labels"].tolist(), dtype=bool)
+
+        # Allocate new labels matrix
+        num_new_classes = len(new_classes)
+        new_labels_matrix = np.zeros(
+            (labels_matrix.shape[0], num_new_classes), dtype=bool
+        )

-        # Iterate over each data instance in the test set which is derived from chebi_version
-        for _, row in df_test_chebi_version.iterrows():
-            # Size = Number of classes in chebi_version_train
-            new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row["labels"]):
-                # If the chebi_version class exists in the chebi_version_train and has a True label,
-                # set the corresponding label in new_labels to True
-                if mapping[ind] is not None and label:
-                    new_labels[mapping[ind]] = label
-            # Update the labels from test instance from chebi_version to the new labels, which are compatible to both versions
-            row["labels"] = new_labels
+        # Copy only valid columns
+        valid_mask = mapping_array != -1
+        new_labels_matrix[:, mapping_array[valid_mask]] = labels_matrix[:, valid_mask]
+
+        # Assign back
+        df_test_chebi_version["labels"] = new_labels_matrix.tolist()
         return df_test_chebi_version

     # ------------------------------ Phase: Raw Properties -----------------------------------
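The core of the rewrite above is the fancy-indexed column copy: every old class column with a match moves to its new position in a single NumPy assignment, replacing the nested per-row, per-class Python loops. A small worked sketch with made-up shapes:

import numpy as np

# Illustrative sizes: 3 test samples, 4 old classes, 3 new classes
labels_matrix = np.array([[1, 0, 1, 0],
                          [0, 1, 0, 0],
                          [1, 1, 0, 1]], dtype=bool)
mapping_array = np.array([2, -1, 0, 1])  # old class i -> new column; -1 = dropped

new_labels_matrix = np.zeros((labels_matrix.shape[0], 3), dtype=bool)
valid_mask = mapping_array != -1
# One vectorized copy instead of looping over rows and labels
new_labels_matrix[:, mapping_array[valid_mask]] = labels_matrix[:, valid_mask]

A further micro-optimization a reviewer might suggest: new_classes.index(oc) is still an O(n) scan per class, so building a dict from class name to index first would make the mapping construction linear overall.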
12 changes: 6 additions & 6 deletions chebai/preprocessing/datasets/pubchem.py
@@ -628,8 +628,8 @@ def download(self):
             if not os.path.exists(os.path.join(self.raw_dir, f"{name}.txt")):
                 open(os.path.join(self.raw_dir, f"{name}.txt"), "x").close()
             with open(os.path.join(self.raw_dir, f"{name}.txt"), "w") as f:
-                for id, row in splits[i].iterrows():
-                    f.writelines(f"{id}\t{row['smiles']}\n")
+                for row in splits[i].itertuples(index=True):
+                    f.write(f"{row.Index}\t{row.smiles}\n")


 class PubChemDissimilarSMILES(PubChemDissimilar):
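Note that itertuples yields one namedtuple per row, with the index exposed as the Index field when index=True, so it cannot be unpacked into an (id, row) pair the way iterrows can; the loop above therefore iterates the tuple directly. A toy demonstration with invented data:

import pandas as pd

split = pd.DataFrame({"smiles": ["O", "CCO"]}, index=[101, 202])

for row in split.itertuples(index=True):
    print(f"{row.Index}\t{row.smiles}")  # Index carries the DataFrame index label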
@@ -809,12 +809,12 @@ def download(self):
         csv_path = os.path.join(self.raw_dir, "pubchem_hazardous_compound_list.csv")
         compounds = pd.read_csv(csv_path)
         smiles_list = []
-        for id, compound in compounds.iterrows():
+        for compound in compounds.itertuples(index=False):
             if (
-                not isinstance(compound["cmpdsynonym"], str)
-                or "CHEBI" not in compound["cmpdsynonym"]
+                not isinstance(compound.cmpdsynonym, str)
+                or "CHEBI" not in compound.cmpdsynonym
             ):
-                smiles_list.append(f"{compound['cid']}\t{compound['isosmiles']}")
+                smiles_list.append(f"{compound.cid}\t{compound.isosmiles}")
         with open(os.path.join(self.raw_dir, "smiles.txt"), "w") as f:
             f.write("\n".join(smiles_list))

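A side note on this hunk: read_csv loads empty synonym cells as float NaN, which is why the isinstance check must precede the substring test, and attribute access on the namedtuple behaves exactly like the old dict-style access here. A toy illustration (the column names mirror the real CSV):

import pandas as pd

compounds = pd.DataFrame(
    {"cid": [1, 2], "cmpdsynonym": ["CHEBI:15377", float("nan")], "isosmiles": ["O", "CCO"]}
)

smiles_list = [
    f"{c.cid}\t{c.isosmiles}"
    for c in compounds.itertuples(index=False)
    if not isinstance(c.cmpdsynonym, str) or "CHEBI" not in c.cmpdsynonym
]
assert smiles_list == ["2\tCCO"]  # the CHEBI-annotated compound is skipped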
18 changes: 9 additions & 9 deletions chebai/train.py
@@ -246,11 +246,11 @@ def prepare_data(infile: pickle.Pickler) -> pd.DataFrame:
         data_frame[col] = data_frame[col].astype(int)

     train_data = []
-    for index, row in data_frame.iterrows():
+    for row in data_frame.itertuples(index=False):
         train_data.append(
             [
-                data_frame.iloc[index].values[1],
-                data_frame.iloc[index].values[2:502].tolist(),
+                row.SMILES,
+                row.LABELS,
             ]
         )

Member Author, on lines -249 to +253: Not sure about these lines of code, whether this was the actual change intended.
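If the doubt above is justified and the original positional slices (column 1, columns 2 through 501) were the intent rather than columns literally named SMILES and LABELS, the loop can still drop iterrows while keeping positional semantics. A hypothetical sketch on a toy frame shaped like the real one:

import pandas as pd

# Toy stand-in: column 0 = id, column 1 = SMILES, columns 2..501 = labels
data_frame = pd.DataFrame([[0, "CCO"] + [0, 1] * 250])

smiles = data_frame.iloc[:, 1].tolist()                # old: .values[1] per row
labels = data_frame.iloc[:, 2:502].to_numpy().tolist() # old: .values[2:502].tolist()
train_data = [[s, l] for s, l in zip(smiles, labels)]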
@@ -309,28 +309,28 @@ def load_data() -> (
     train_dataset = []
     train_actual_labels = []

-    for index, row in prepare_data(train_infile).iterrows():
+    for row in prepare_data(train_infile).itertuples(index=False):
         try:
-            mol = Molecule(row["SMILES"], True)
+            mol = Molecule(row.SMILES, True)

             # DAGs_meta_info = mol.dag_to_node
             train_dataset.append(mol)
-            train_actual_labels.append(torch.tensor(row["LABELS"]).float())
+            train_actual_labels.append(torch.tensor(row.LABELS).float())
         except Exception:
             pass

     print("prepare validation data!")
     validation_dataset = []
     validation_actual_labels = []

-    for index, row in prepare_data(validation_infile).iterrows():
+    for row in prepare_data(validation_infile).itertuples(index=False):
         try:
-            mol = Molecule(row["SMILES"], True)
+            mol = Molecule(row.SMILES, True)

             # DAGs_meta_info = mol.dag_to_node

             validation_dataset.append(mol)
-            validation_actual_labels.append(torch.tensor(row["LABELS"]).float())
+            validation_actual_labels.append(torch.tensor(row.LABELS).float())
         except Exception:
             pass