diff --git a/examples/aptanet_tutorial.ipynb b/examples/aptanet_tutorial.ipynb index d89d41ec..a9f66b27 100644 --- a/examples/aptanet_tutorial.ipynb +++ b/examples/aptanet_tutorial.ipynb @@ -554,7 +554,7 @@ " ]\n", ")\n", "\n", - "pipeline = AptaNetPipeline(classifier=model)" + "pipeline = AptaNetPipeline(estimator=model)" ] }, { diff --git a/pyaptamer/aptanet/_pipeline.py b/pyaptamer/aptanet/_pipeline.py index d9d78777..3c882fb6 100644 --- a/pyaptamer/aptanet/_pipeline.py +++ b/pyaptamer/aptanet/_pipeline.py @@ -2,6 +2,7 @@ __all__ = ["AptaNetPipeline"] __required__ = ["python>=3.9,<3.13"] +from skbase.base import BaseObject from sklearn.base import clone from sklearn.pipeline import Pipeline from sklearn.preprocessing import FunctionTransformer @@ -11,7 +12,7 @@ from pyaptamer.utils._aptanet_utils import pairs_to_features -class AptaNetPipeline: +class AptaNetPipeline(BaseObject): """ AptaNet algorithm for aptamer–protein interaction prediction [1]_ @@ -22,14 +23,14 @@ class AptaNetPipeline: The pipeline starts from string pairs, converts them into numeric features (aptamer k-mer frequencies + protein PSeAAC), applies tree-based feature - selection, and feeds the result into the classifier. + selection, and feeds the result into the estimator. Parameters ---------- k : int, optional, default=4 The k-mer size used to generate aptamer k-mer vectors. - classifier : sklearn-compatible estimator or None, default=None + estimator : sklearn-compatible estimator or None, default=None Estimator applied after feature selection. If None, uses `AptaNetClassifier`. 
Attributes @@ -62,18 +63,18 @@ class AptaNetPipeline: >>> proba = pipe.predict_proba(X_test_pairs) """ - def __init__(self, k=None, classifier=None): + def __init__(self, k=4, estimator=None): self.k = k - self.classifier = classifier + self.estimator = estimator def _build_pipeline(self): transformer = FunctionTransformer( func=pairs_to_features, - kw_args=self.k, + kw_args={"k": self.k}, validate=False, ) - self._classifier = self.classifier or AptaNetClassifier() - return Pipeline([("features", transformer), ("clf", clone(self._classifier))]) + self._estimator = self.estimator or AptaNetClassifier() + return Pipeline([("features", transformer), ("clf", clone(self._estimator))]) def fit(self, X, y): self.pipeline_ = self._build_pipeline() diff --git a/pyaptamer/aptanet/tests/test_aptanet.py b/pyaptamer/aptanet/tests/test_aptanet.py index 5f5e3cf5..7b0dcf19 100644 --- a/pyaptamer/aptanet/tests/test_aptanet.py +++ b/pyaptamer/aptanet/tests/test_aptanet.py @@ -26,7 +26,7 @@ def test_pipeline_fit_and_predict_classification(aptamer_seq, protein_seq): Test if Pipeline predictions are valid class labels and shape matches input for classification. """ - pipe = AptaNetPipeline() + pipe = AptaNetPipeline(k=4) X_raw = [(aptamer_seq, protein_seq) for _ in range(40)] y = np.array([0] * 20 + [1] * 20, dtype=np.float32) @@ -66,7 +66,7 @@ def test_pipeline_fit_and_predict_regression(aptamer_seq, protein_seq): Test if Pipeline predictions are valid floats and shape matches input for regression. 
class Benchmarking:
    """
    Benchmark estimators using cross-validation.

    Each estimator is handed to `sklearn.model_selection.cross_validate`
    together with `X`, `y` and `cv`; the per-fold train and test scores of
    every metric are then averaged.

    You can:

    - pass `X, y` (feature matrix and labels/targets) along with `cv`
      to use any cross-validation strategy;
    - if you want a fixed train/test split, pass a `PredefinedSplit`
      object as `cv`.

    Parameters
    ----------
    estimators : list[estimator] | tuple[estimator] | estimator
        Sklearn-like estimators implementing `fit` and `predict`. A single
        estimator is wrapped into a one-element list.
    metrics : list[callable] | tuple[callable] | callable
        Callables with signature `(y_true, y_pred) -> float`. A single
        callable is wrapped into a one-element list.
    X : array-like
        Feature matrix.
    y : array-like
        Target vector.
    cv : int, CV splitter, or None, default=None
        Cross-validation strategy, forwarded unchanged to `cross_validate`.
        If `None`, defaults to 5-fold CV.
        If you want to use an explicit train/test split, pass a
        `PredefinedSplit` object.

    Attributes
    ----------
    estimators : list
        The estimators to benchmark, in the order given.
    metrics : list
        The metric callables, in the order given.
    scorers_ : dict
        Mapping of metric label to scorer, set by :meth:`run`.
    results : pd.DataFrame or None
        `None` until :meth:`run` has been called, then the DataFrame
        produced by :meth:`run`.

        - Index: pandas.MultiIndex with two levels (names shown in parentheses)
            - level 0 "estimator": estimator label (class name)
            - level 1 "metric": metric label (callable name)
        - Columns: ["train", "test"] (both floats)
        - Cell values: mean scores (float) computed across CV folds:
            - "train" = mean of cross_validate(...)[f"train_{metric}"]
            - "test" = mean of cross_validate(...)[f"test_{metric}"]

    Notes
    -----
    Labels are made unique: when several estimators share a class name (or
    several metrics share a name), the first keeps the plain name and later
    ones receive the suffixes ``_2``, ``_3``, ... so that no row of
    `results` is silently overwritten.

    Example
    -------
    >>> import numpy as np
    >>> from sklearn.metrics import accuracy_score
    >>> from sklearn.model_selection import PredefinedSplit
    >>> from pyaptamer.benchmarking._base import Benchmarking
    >>> from pyaptamer.aptanet import AptaNetPipeline
    >>> aptamer_seq = "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC"
    >>> protein_seq = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY"
    >>> # dataset: 20 aptamer–protein pairs
    >>> X = [(aptamer_seq, protein_seq) for _ in range(20)]
    >>> y = np.array([0] * 10 + [1] * 10, dtype=np.float32)
    >>> clf = AptaNetPipeline(k=4)
    >>> # define a fixed train/test split
    >>> test_fold = np.ones(len(y)) * -1
    >>> test_fold[-2:] = 0
    >>> cv = PredefinedSplit(test_fold)
    >>> bench = Benchmarking(
    ...     estimators=[clf],
    ...     metrics=[accuracy_score],
    ...     X=X,
    ...     y=y,
    ...     cv=cv,
    ... )
    >>> summary = bench.run()  # doctest: +SKIP
    """

    def __init__(self, estimators, metrics, X, y, cv=None):
        # Lists and tuples are collections of items; anything else is treated
        # as one single estimator / metric. Copying avoids aliasing the
        # caller's container.
        if isinstance(estimators, (list, tuple)):
            self.estimators = list(estimators)
        else:
            self.estimators = [estimators]
        if isinstance(metrics, (list, tuple)):
            self.metrics = list(metrics)
        else:
            self.metrics = [metrics]
        self.X = X
        self.y = y
        self.cv = cv
        self.results = None

    @staticmethod
    def _unique_labels(names):
        """
        Make a sequence of names unique while preserving order.

        Parameters
        ----------
        names : iterable of str
            Candidate labels, possibly containing repeats.

        Returns
        -------
        labels : list[str]
            Same length and order as `names`. The first occurrence of a name
            is kept unchanged; later clashes become ``name_2``, ``name_3``, ...
        """
        used = set()
        labels = []
        for name in names:
            label = name
            counter = 1
            # Keep counting up until the candidate clashes with nothing that
            # has already been handed out (including literal "name_2" inputs).
            while label in used:
                counter += 1
                label = f"{name}_{counter}"
            used.add(label)
            labels.append(label)
        return labels

    def _to_scorers(self, metrics):
        """
        Convert metric callables to a dict of scorers.

        Parameters
        ----------
        metrics : list[callable]
            Metric callables; each is wrapped with `make_scorer` using its
            default settings.

        Returns
        -------
        scorers : dict
            Maps a unique metric label (the callable's ``__name__``, or its
            class name when it has none) to the scorer.

        Raises
        ------
        ValueError
            If any entry of `metrics` is not callable.
        """
        # Validate everything up front so no scorer is built for a bad list.
        for metric in metrics:
            if not callable(metric):
                raise ValueError("Each metric should be a callable.")

        names = [
            getattr(metric, "__name__", metric.__class__.__name__)
            for metric in metrics
        ]
        labels = self._unique_labels(names)
        return {label: make_scorer(metric) for label, metric in zip(labels, metrics)}

    def _to_df(self, results):
        """
        Convert nested results to a unified DataFrame.

        Parameters
        ----------
        results : dict
            ``{estimator_label: {metric_label: {"train": float, "test": float}}}``.

        Returns
        -------
        pd.DataFrame
            One row per (estimator, metric) pair, indexed by a MultiIndex with
            level names "estimator" and "metric", columns "train" and "test".
        """
        rows = []
        keys = []

        for est_name, est_scores in results.items():
            for metric_name, scores in est_scores.items():
                rows.append(scores)
                keys.append((est_name, metric_name))

        index = pd.MultiIndex.from_tuples(keys, names=["estimator", "metric"])
        return pd.DataFrame(rows, index=index, columns=["train", "test"])

    def run(self):
        """
        Train each estimator and evaluate with cross-validation.

        Sets `scorers_` and `results` on the instance.

        Returns
        -------
        results : pd.DataFrame

            - Index: pandas.MultiIndex with two levels (names shown in parentheses)
                - level 0 "estimator": estimator label (class name, suffixed
                  ``_2``, ``_3``, ... for repeated classes)
                - level 1 "metric": metric label
            - Columns: ["train", "test"] (both floats)
            - Cell values: mean scores (float) computed across CV folds:
                - "train" = mean of cross_validate(...)[f"train_{metric}"]
                - "test" = mean of cross_validate(...)[f"test_{metric}"]

        """
        self.scorers_ = self._to_scorers(self.metrics)

        # Two estimators of the same class must not share a results key,
        # otherwise the later one would overwrite the earlier one.
        est_labels = self._unique_labels(
            [estimator.__class__.__name__ for estimator in self.estimators]
        )

        results = {}
        for est_label, estimator in zip(est_labels, self.estimators):
            cv_results = cross_validate(
                estimator,
                self.X,
                self.y,
                cv=self.cv,
                scoring=self.scorers_,
                return_train_score=True,
            )

            # average across folds
            results[est_label] = {
                metric: {
                    "train": float(np.mean(cv_results[f"train_{metric}"])),
                    "test": float(np.mean(cv_results[f"test_{metric}"])),
                }
                for metric in self.scorers_
            }

        self.results = self._to_df(results)
        return self.results
sys.version_info >= (3, 13), reason="skorch does not support Python 3.13" +) +@pytest.mark.parametrize("aptamer_seq, protein_seq", params) +def test_benchmarking_with_predefined_split_classification(aptamer_seq, protein_seq): + """ + Test Benchmarking on a classification task using PredefinedSplit. + """ + X_raw = [(aptamer_seq, protein_seq) for _ in range(40)] + y = np.array([0] * 20 + [1] * 20, dtype=np.float32) + + clf = AptaNetPipeline() + + test_fold = np.ones(len(y), dtype=int) * -1 + test_fold[-2:] = 0 + cv = PredefinedSplit(test_fold) + + bench = Benchmarking( + estimators=[clf], + metrics=[accuracy_score], + X=X_raw, + y=y, + cv=cv, + ) + summary = bench.run() + + assert "train" in summary.columns + assert "test" in summary.columns + assert (clf.__class__.__name__, "accuracy_score") in summary.index + + +@pytest.mark.skipif( + sys.version_info >= (3, 13), reason="skorch does not support Python 3.13" +) +@pytest.mark.parametrize("aptamer_seq, protein_seq", params) +def test_benchmarking_with_predefined_split_regression(aptamer_seq, protein_seq): + """ + Test Benchmarking on a regression task using PredefinedSplit. 
+ """ + X_raw = [(aptamer_seq, protein_seq) for _ in range(40)] + y = np.linspace(0, 1, 40).astype(np.float32) + + reg = AptaNetPipeline(estimator=AptaNetRegressor()) + + test_fold = np.ones(len(y), dtype=int) * -1 + test_fold[-3:] = 0 + cv = PredefinedSplit(test_fold) + + bench = Benchmarking( + estimators=[reg], + metrics=[mean_squared_error], + X=X_raw, + y=y, + cv=cv, + ) + summary = bench.run() + + assert "train" in summary.columns + assert "test" in summary.columns + assert (reg.__class__.__name__, "mean_squared_error") in summary.index diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index 1388254f..ff8c494f 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -12,4 +12,5 @@ "load_pfoa_structure", "load_1gnh_structure", "load_from_rcsb", + "load_csv_dataset", ] diff --git a/pyaptamer/datasets/tests/test_pfoa.py b/pyaptamer/datasets/tests/test_pfoa.py deleted file mode 100644 index 14eec9d0..00000000 --- a/pyaptamer/datasets/tests/test_pfoa.py +++ /dev/null @@ -1,22 +0,0 @@ -__author__ = "satvshr" - -from Bio.PDB.Structure import Structure - -from pyaptamer.datasets._loaders import load_pfoa_structure - - -def test_pfoa_loader(): - """ - Test that the load_pfoa_structure function runs without error and returns a valid - Structure object. - - Asserts - ------- - The datasets loads and the return value must be an instance of Biopython's - Structure class. - """ - structure = load_pfoa_structure() - - assert isinstance(structure, Structure), ( - "Returned object is not a Biopython Structure" - )