Skip to content
Binary file added .DS_Store
Binary file not shown.
Binary file added pyaptamer/.DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions pyaptamer/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pyaptamer.datasets._loaders._aptacom_loader import (
load_aptacom_full,
load_aptacom_xy,
load_aptacom_x_y,
)
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
Expand All @@ -12,7 +12,7 @@

__all__ = [
"load_aptacom_full",
"load_aptacom_xy",
"load_aptacom_x_y",
"load_csv_dataset",
"load_hf_dataset",
"load_pfoa",
Expand Down
4 changes: 2 additions & 2 deletions pyaptamer/datasets/_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from pyaptamer.datasets._loaders._aptacom_loader import (
load_aptacom_full,
load_aptacom_xy,
load_aptacom_x_y,
)
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
Expand All @@ -13,7 +13,7 @@
"load_pfoa_structure",
"load_1gnh_structure",
"load_aptacom_full",
"load_aptacom_xy",
"load_aptacom_x_y",
"load_csv_dataset",
"load_hf_dataset",
"load_pfoa_structure",
Expand Down
164 changes: 93 additions & 71 deletions pyaptamer/datasets/_loaders/_aptacom_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# file: _aptacom_loader.py

__author__ = "rpgv"
__all__ = ["load_aptacom_full", "load_aptacom_xy"]
__all__ = ["load_aptacom_full", "load_aptacom_x_y"]

from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset

Expand Down Expand Up @@ -40,97 +42,117 @@
}


def _filter_columns(df, columns=None):
    """
    Select a subset of columns from a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame to filter.
    columns : list[str] or None, optional
        Column names to keep. If None, the input DataFrame is returned
        unchanged (the very same object, not a copy).

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing only the requested columns, or the original
        DataFrame if `columns` is None.
    """
    # Only None means "no filtering"; any list is passed straight to the
    # DataFrame indexer.
    if columns is None:
        return df
    return df[columns]


def prepare_x_y(df):
    """
    Prepare dataset by selecting required columns and dropping incomplete rows.

    This function:
    - Drops rows with missing values in the columns
      "aptamer_sequence", "target_sequence", and "new_affinity".
    - Keeps only those three columns.

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame containing at least the columns
        "aptamer_sequence", "target_sequence", and "new_affinity".

    Returns
    -------
    pandas.DataFrame
        A DataFrame with exactly the columns
        ["aptamer_sequence", "target_sequence", "new_affinity"],
        and with rows containing no missing values in those columns.
        The surviving rows keep their original index labels.
    """
    required = ["aptamer_sequence", "target_sequence", "new_affinity"]
    # dropna without inplace=True returns a new frame, so the caller's
    # DataFrame is left untouched (no hidden side effect on the argument).
    return df.dropna(subset=required)[required]


def load_aptacom_full(select_columns=None):
    """
    Load the AptaCom dataset from Hugging Face, with optional column selection.

    Parameters
    ----------
    select_columns : list[str] or None, optional
        List of column names to retain. If None, returns the full dataset.

        Available columns include (subject to upstream changes):
        [
            'reference',
            'aptamer_chemistry',
            'aptamer_name',
            'target_name',
            'aptamer_sequence',
            'origin',
            'target_chemistry',
            'external_id',
            'target_sequence',
            'new_affinity'
        ]

    Returns
    -------
    pandas.DataFrame
        The loaded dataset, optionally filtered to the selected columns.
    """
    # NOTE(review): store=False presumably skips writing a local copy of the
    # dataset -- confirm against load_hf_dataset.
    aptacom = load_hf_dataset("AptaCom", store=False)
    return _filter_columns(aptacom, columns=select_columns)


def load_aptacom_xy(return_X_y=False):
"""Loads Aptacom dataset for training
def load_aptacom_x_y(return_X_y=False):
"""
Load the AptaCom dataset prepared for model training.

Depending on `return_X_y`, returns either a single DataFrame containing
the features and target, or a tuple of (X, y) DataFrames.

Parameters:
Parameters
----------
return_X_y: bool, optional, default = False
If true returns X (aptamer and target sequence)
and y (new_affinity) otherwise returns a
pandas dataframe containing the three columns

Returns:
--------
Either a pandas dataframe with three columns
or two pandas dataframe objects with two and one
columns respectively.
return_X_y : bool, optional
If True, return a tuple `(X, y)` where:
- `X` has columns ["aptamer_sequence", "target_sequence"]
- `y` has column ["new_affinity"]
If False (default), return a single DataFrame with all three columns.

Returns
-------
pandas.DataFrame or tuple[pandas.DataFrame, pandas.DataFrame]
- If `return_X_y` is False: a DataFrame with columns
["aptamer_sequence", "target_sequence", "new_affinity"].
- If `return_X_y` is True: a tuple `(X, y)` where `X` contains the two
feature columns and `y` contains the target column.
"""
aptacom = load_hf_dataset("AptaCom", store=False)
dataset = prepare_xy(aptacom)
dataset = prepare_x_y(aptacom)
if return_X_y:
X = dataset[["aptamer_sequence", "target_sequence"]]
y = dataset[["new_affinity"]]
Expand Down
Loading