diff --git a/examples/aptanet_tutorial.ipynb b/examples/aptanet_tutorial.ipynb
index d89d41ec..a9f66b27 100644
--- a/examples/aptanet_tutorial.ipynb
+++ b/examples/aptanet_tutorial.ipynb
@@ -554,7 +554,7 @@
     "    ]\n",
     ")\n",
     "\n",
-    "pipeline = AptaNetPipeline(classifier=model)"
+    "pipeline = AptaNetPipeline(estimator=model)"
    ]
   },
   {
diff --git a/pyaptamer/aptanet/_pipeline.py b/pyaptamer/aptanet/_pipeline.py
index d9d78777..3c882fb6 100644
--- a/pyaptamer/aptanet/_pipeline.py
+++ b/pyaptamer/aptanet/_pipeline.py
@@ -2,6 +2,7 @@
 __all__ = ["AptaNetPipeline"]
 __required__ = ["python>=3.9,<3.13"]
 
+from skbase.base import BaseObject
 from sklearn.base import clone
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import FunctionTransformer
@@ -11,7 +12,7 @@
 from pyaptamer.utils._aptanet_utils import pairs_to_features
 
 
-class AptaNetPipeline:
+class AptaNetPipeline(BaseObject):
     """
     AptaNet algorithm for aptamer–protein interaction prediction [1]_
 
@@ -22,14 +23,14 @@ class AptaNetPipeline:
 
     The pipeline starts from string pairs, converts them into numeric features
     (aptamer k-mer frequencies + protein PSeAAC), applies tree-based feature
-    selection, and feeds the result into the classifier.
+    selection, and feeds the result into the estimator.
 
     Parameters
     ----------
     k : int, optional, default=4
         The k-mer size used to generate aptamer k-mer vectors.
 
-    classifier : sklearn-compatible estimator or None, default=None
+    estimator : sklearn-compatible estimator or None, default=None
         Estimator applied after feature selection. If None, uses `AptaNetClassifier`.
 
     Attributes
@@ -62,18 +63,26 @@ class AptaNetPipeline:
     >>> proba = pipe.predict_proba(X_test_pairs)
     """
 
-    def __init__(self, k=None, classifier=None):
+    def __init__(self, k=4, estimator=None):
         self.k = k
-        self.classifier = classifier
+        self.estimator = estimator
+        # BaseObject initialises its tag/config state in its own __init__.
+        super().__init__()
 
     def _build_pipeline(self):
+        """Build the `features` -> `clf` sklearn Pipeline from the current params."""
         transformer = FunctionTransformer(
             func=pairs_to_features,
-            kw_args=self.k,
+            kw_args={"k": self.k},
             validate=False,
         )
-        self._classifier = self.classifier or AptaNetClassifier()
-        return Pipeline([("features", transformer), ("clf", clone(self._classifier))])
+        # Compare with None explicitly: truth-testing an estimator can invoke its
+        # `__len__`, which raises on unfitted sklearn ensembles.
+        if self.estimator is None:
+            self._estimator = AptaNetClassifier()
+        else:
+            self._estimator = self.estimator
+        return Pipeline([("features", transformer), ("clf", clone(self._estimator))])
 
     def fit(self, X, y):
         self.pipeline_ = self._build_pipeline()
diff --git a/pyaptamer/aptanet/tests/test_aptanet.py b/pyaptamer/aptanet/tests/test_aptanet.py
index 5f5e3cf5..7b0dcf19 100644
--- a/pyaptamer/aptanet/tests/test_aptanet.py
+++ b/pyaptamer/aptanet/tests/test_aptanet.py
@@ -26,7 +26,7 @@ def test_pipeline_fit_and_predict_classification(aptamer_seq, protein_seq):
     Test if Pipeline predictions are valid class labels and shape matches input
     for classification.
     """
-    pipe = AptaNetPipeline()
+    pipe = AptaNetPipeline(k=4)
 
     X_raw = [(aptamer_seq, protein_seq) for _ in range(40)]
     y = np.array([0] * 20 + [1] * 20, dtype=np.float32)
@@ -66,7 +66,7 @@ def test_pipeline_fit_and_predict_regression(aptamer_seq, protein_seq):
     Test if Pipeline predictions are valid floats and shape matches input
     for regression.
     """
-    pipe = AptaNetPipeline(classifier=AptaNetRegressor())
+    pipe = AptaNetPipeline(estimator=AptaNetRegressor())
 
     X_raw = [(aptamer_seq, protein_seq) for _ in range(40)]
     y = np.linspace(0, 1, 40).astype(np.float32)
diff --git a/pyaptamer/benchmarking/__init__.py b/pyaptamer/benchmarking/__init__.py
new file mode 100644
index 00000000..0066312f
--- /dev/null
+++ b/pyaptamer/benchmarking/__init__.py
@@ -0,0 +1,5 @@
+"""Benchmarking module."""
+
+from pyaptamer.benchmarking._base import Benchmarking
+
+__all__ = ["Benchmarking"]
diff --git a/pyaptamer/benchmarking/_base.py b/pyaptamer/benchmarking/_base.py
new file mode 100644
index 00000000..7fc1bdca
--- /dev/null
+++ b/pyaptamer/benchmarking/_base.py
@@ -0,0 +1,168 @@
+__author__ = "satvshr"
+__all__ = ["Benchmarking"]
+
+import numpy as np
+import pandas as pd
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import cross_validate
+
+
+class Benchmarking:
+    """
+    Benchmark estimators using cross-validation.
+
+    You can:
+
+    - pass `X, y` (feature matrix and labels/targets) along with `cv`
+      to use any cross-validation strategy;
+    - if you want a fixed train/test split, pass a `PredefinedSplit`
+      object as `cv`.
+
+    Parameters
+    ----------
+    estimators : list[estimator] | tuple[estimator] | estimator
+        sklearn-like estimators implementing `fit` and `predict`. A single
+        estimator is wrapped into a one-element list.
+    metrics : list[callable] | tuple[callable] | callable
+        Callables with signature `(y_true, y_pred) -> float`. A single
+        callable is wrapped into a one-element list.
+    X : array-like
+        Feature matrix.
+    y : array-like
+        Target vector.
+    cv : int, CV splitter, or None, default=None
+        Cross-validation strategy. If `None`, defaults to 5-fold CV.
+        If you want to use an explicit train/test split, pass a
+        `PredefinedSplit` object.
+
+    Attributes
+    ----------
+    results : pd.DataFrame or None
+        DataFrame produced by :meth:`run`; `None` until `run` has been called.
+
+        - Index: pandas.MultiIndex with two levels (names shown in parentheses)
+            - level 0 "estimator": estimator class name
+            - level 1 "metric": metric name
+        - Columns: ["train", "test"] (both floats)
+        - Cell values: mean scores (float) computed across CV folds:
+            - "train" = mean of cross_validate(...)[f"train_{metric}"]
+            - "test"  = mean of cross_validate(...)[f"test_{metric}"]
+
+        Estimators (or metrics) sharing a name are disambiguated with a numeric
+        suffix (`"Name"`, `"Name_2"`, ...) so that no row overwrites another.
+    scorers_ : dict[str, scorer]
+        Metric name -> `make_scorer(metric)`; set by :meth:`run`.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import accuracy_score
+    >>> from sklearn.model_selection import PredefinedSplit
+    >>> from pyaptamer.benchmarking import Benchmarking
+    >>> from pyaptamer.aptanet import AptaNetPipeline
+    >>> aptamer_seq = "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC"
+    >>> protein_seq = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY"
+    >>> # dataset: 20 aptamer–protein pairs
+    >>> X = [(aptamer_seq, protein_seq) for _ in range(20)]
+    >>> y = np.array([0] * 10 + [1] * 10, dtype=np.float32)
+    >>> clf = AptaNetPipeline(k=4)
+    >>> # define a fixed train/test split
+    >>> test_fold = np.full(len(y), -1)
+    >>> test_fold[-2:] = 0
+    >>> cv = PredefinedSplit(test_fold)
+    >>> bench = Benchmarking(
+    ...     estimators=[clf],
+    ...     metrics=[accuracy_score],
+    ...     X=X,
+    ...     y=y,
+    ...     cv=cv,
+    ... )
+    >>> summary = bench.run()  # doctest: +SKIP
+    """
+
+    def __init__(self, estimators, metrics, X, y, cv=None):
+        self.estimators = self._as_list(estimators)
+        self.metrics = self._as_list(metrics)
+        self.X = X
+        self.y = y
+        self.cv = cv
+        self.results = None
+
+    @staticmethod
+    def _as_list(obj):
+        """Return a list as-is, a tuple as a list, anything else wrapped in a list."""
+        if isinstance(obj, list):
+            return obj
+        return list(obj) if isinstance(obj, tuple) else [obj]
+
+    @staticmethod
+    def _unique_name(base, taken):
+        """Return `base`, suffixed with `_2`, `_3`, ... if already in `taken`."""
+        name, suffix = base, 1
+        while name in taken:
+            suffix += 1
+            name = f"{base}_{suffix}"
+        return name
+
+    def _to_scorers(self, metrics):
+        """Convert metric callables to a dict mapping a unique name to a scorer."""
+        scorers = {}
+        for metric in metrics:
+            if not callable(metric):
+                raise ValueError("Each metric should be a callable.")
+            # no `__name__` (e.g. `functools.partial`): fall back to the class name
+            base = getattr(metric, "__name__", metric.__class__.__name__)
+            scorers[self._unique_name(base, scorers)] = make_scorer(metric)
+        return scorers
+
+    def _to_df(self, results):
+        """Convert nested `{estimator: {metric: scores}}` results to a DataFrame."""
+        records = []
+        index = []
+
+        for est_name, est_scores in results.items():
+            for metric_name, scores in est_scores.items():
+                records.append(scores)
+                index.append((est_name, metric_name))
+
+        index = pd.MultiIndex.from_tuples(index, names=["estimator", "metric"])
+        return pd.DataFrame(records, index=index, columns=["train", "test"])
+
+    def run(self):
+        """
+        Cross-validate every estimator and collect the mean score per metric.
+
+        Returns
+        -------
+        results : pd.DataFrame
+            Mean train/test scores indexed by ("estimator", "metric"), also stored
+            on `self.results`; see the `results` attribute above for the layout.
+        """
+        self.scorers_ = self._to_scorers(self.metrics)
+        results = {}
+
+        for estimator in self.estimators:
+            # Estimators of the same class would otherwise share one key and the
+            # later one would silently overwrite the earlier one's scores.
+            est_name = self._unique_name(estimator.__class__.__name__, results)
+
+            cv_results = cross_validate(
+                estimator,
+                self.X,
+                self.y,
+                cv=self.cv,
+                scoring=self.scorers_,
+                return_train_score=True,
+            )
+
+            # average across folds
+            results[est_name] = {
+                metric: {
+                    "train": float(np.mean(cv_results[f"train_{metric}"])),
+                    "test": float(np.mean(cv_results[f"test_{metric}"])),
+                }
+                for metric in self.scorers_
+            }
+
+        self.results = self._to_df(results)
+        return self.results
diff --git a/pyaptamer/benchmarking/tests/__init__.py b/pyaptamer/benchmarking/tests/__init__.py
new file mode 100644
index 00000000..cf40889b
--- /dev/null
+++ b/pyaptamer/benchmarking/tests/__init__.py
@@ -0,0 +1 @@
+"""Test suite for the benchmarking module."""
diff --git a/pyaptamer/benchmarking/tests/test_benchmarking.py b/pyaptamer/benchmarking/tests/test_benchmarking.py
new file mode 100644
index 00000000..4a9508e2
--- /dev/null
+++ b/pyaptamer/benchmarking/tests/test_benchmarking.py
@@ -0,0 +1,78 @@
+import sys
+
+import numpy as np
+import pytest
+from sklearn.metrics import accuracy_score, mean_squared_error
+from sklearn.model_selection import PredefinedSplit
+
+from pyaptamer.aptanet import AptaNetPipeline, AptaNetRegressor
+from pyaptamer.benchmarking._base import Benchmarking
+
+params = [
+    (
+        "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC",
+        "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY",
+    )
+]
+
+
+@pytest.mark.skipif(
+    sys.version_info >= (3, 13), reason="skorch does not support Python 3.13"
+)
+@pytest.mark.parametrize("aptamer_seq, protein_seq", params)
+def test_benchmarking_with_predefined_split_classification(aptamer_seq, protein_seq):
+    """
+    Check that `Benchmarking.run` reports accuracy under a `PredefinedSplit`.
+    """
+    # 40 identical pairs, half labelled 0 and half labelled 1
+    pairs = [(aptamer_seq, protein_seq)] * 40
+    labels = np.repeat(np.array([0, 1], dtype=np.float32), 20)
+
+    # -1 keeps a sample in the training set; the last two form test fold 0
+    fold_ids = np.full(len(labels), -1, dtype=int)
+    fold_ids[-2:] = 0
+
+    pipeline = AptaNetPipeline()
+    bench = Benchmarking(
+        estimators=[pipeline],
+        metrics=[accuracy_score],
+        X=pairs,
+        y=labels,
+        cv=PredefinedSplit(fold_ids),
+    )
+    summary = bench.run()
+
+    expected_row = (type(pipeline).__name__, "accuracy_score")
+    assert {"train", "test"} <= set(summary.columns)
+    assert expected_row in summary.index
+
+
+@pytest.mark.skipif(
+    sys.version_info >= (3, 13), reason="skorch does not support Python 3.13"
+)
+@pytest.mark.parametrize("aptamer_seq, protein_seq", params)
+def test_benchmarking_with_predefined_split_regression(aptamer_seq, protein_seq):
+    """
+    Check that `Benchmarking.run` reports MSE under a `PredefinedSplit`.
+    """
+    # 40 identical pairs with targets evenly spaced over [0, 1]
+    pairs = [(aptamer_seq, protein_seq)] * 40
+    targets = np.linspace(0, 1, 40).astype(np.float32)
+
+    # -1 keeps a sample in the training set; the last three form test fold 0
+    fold_ids = np.full(len(targets), -1, dtype=int)
+    fold_ids[-3:] = 0
+
+    pipeline = AptaNetPipeline(estimator=AptaNetRegressor())
+    bench = Benchmarking(
+        estimators=[pipeline],
+        metrics=[mean_squared_error],
+        X=pairs,
+        y=targets,
+        cv=PredefinedSplit(fold_ids),
+    )
+    summary = bench.run()
+
+    expected_row = (type(pipeline).__name__, "mean_squared_error")
+    assert {"train", "test"} <= set(summary.columns)
+    assert expected_row in summary.index
diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py
index 1388254f..ff8c494f 100644
--- a/pyaptamer/datasets/__init__.py
+++ b/pyaptamer/datasets/__init__.py
@@ -12,4 +12,5 @@
     "load_pfoa_structure",
     "load_1gnh_structure",
     "load_from_rcsb",
+    "load_csv_dataset",
 ]
diff --git a/pyaptamer/datasets/tests/test_pfoa.py b/pyaptamer/datasets/tests/test_pfoa.py
deleted file mode 100644
index 14eec9d0..00000000
--- a/pyaptamer/datasets/tests/test_pfoa.py
+++ /dev/null
@@ -1,22 +0,0 @@
-__author__ = "satvshr"
-
-from Bio.PDB.Structure import Structure
-
-from pyaptamer.datasets._loaders import load_pfoa_structure
-
-
-def test_pfoa_loader():
-    """
-    Test that the load_pfoa_structure function runs without error and returns a valid
-    Structure object.
-
-    Asserts
-    -------
-        The datasets loads and the return value must be an instance of Biopython's
-        Structure class.
-    """
-    structure = load_pfoa_structure()
-
-    assert isinstance(structure, Structure), (
-        "Returned object is not a Biopython Structure"
-    )