diff --git a/docs/source/whats_new.rst b/docs/source/whats_new.rst
index 6d7e43058..070bd9feb 100644
--- a/docs/source/whats_new.rst
+++ b/docs/source/whats_new.rst
@@ -42,6 +42,7 @@ Enhancements
 - Adding :func:`moabb.analysis.plotting.dataset_bubble_plot` plus the corresponding tutorial (:gh:`753` by `Pierre Guetschel`_)
 - Adding :func:`moabb.datasets.utils.plot_all_datasets` and update the tutorial (:gh:`758` by `Pierre Guetschel`_)
 - Improve the dataset model cards in each API page (:gh:`765` by `Pierre Guetschel`_)
+- Refactor :class:`moabb.evaluations.CrossSessionEvaluation`, :class:`moabb.evaluations.CrossSubjectEvaluation` and :class:`moabb.evaluations.WithinSessionEvaluation` to use the new splitter classes (:gh:`769` by `Bruno Aristimunha`_)
 - Adding tutorial on using mne-features (:gh:`762` by `Alexander de Ranitz`_, `Luuk Neervens`_, `Charlynn van Osch`_ and `Bruno Aristimunha`_)
 - Creating tutorial to expose the pre-processing steps (:gh:`771` by `Bruno Aristimunha`_)
 - Add function to auto-generate tables for the paper results documentation page (:gh:`785` by `Lucas Heck`_)
diff --git a/examples/advanced_examples/plot_grid_search_withinsession.py b/examples/advanced_examples/plot_grid_search_withinsession.py
index a2f1aefb7..f63bb82be 100644
--- a/examples/advanced_examples/plot_grid_search_withinsession.py
+++ b/examples/advanced_examples/plot_grid_search_withinsession.py
@@ -9,7 +9,6 @@
 """

 import os
-from pickle import load

 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -132,44 +131,3 @@
 )
 sns.pointplot(data=result, y="score", x="pipeline", ax=axes, palette="Set1")
 axes.set_ylabel("ROC AUC")
-
-##########################################################
-# Load Best Model Parameter
-# -------------------------
-# The best model are automatically saved in a pickle file, in the
-# results directory. It is possible to load those model for each
-# dataset, subject and session. Here, we could see that the grid
-# search found a l1_ratio that is different from the baseline
-# value.
-
-with open(
-    "./Results/Models_WithinSession/BNCI2014-001/1/1test/GridSearchEN/fitted_model_best.pkl",
-    "rb",
-) as pickle_file:
-    GridSearchEN_Session_E = load(pickle_file)
-
-print(
-    "Best Parameter l1_ratio Session_E GridSearchEN ",
-    GridSearchEN_Session_E.best_params_["LogistReg__l1_ratio"],
-)
-
-print(
-    "Best Parameter l1_ratio Session_E VanillaEN: ",
-    pipelines["VanillaEN"].steps[2][1].l1_ratio,
-)
-
-with open(
-    "./Results/Models_WithinSession/BNCI2014-001/1/0train/GridSearchEN/fitted_model_best.pkl",
-    "rb",
-) as pickle_file:
-    GridSearchEN_Session_T = load(pickle_file)
-
-print(
-    "Best Parameter l1_ratio Session_T GridSearchEN ",
-    GridSearchEN_Session_T.best_params_["LogistReg__l1_ratio"],
-)
-
-print(
-    "Best Parameter l1_ratio Session_T VanillaEN: ",
-    pipelines["VanillaEN"].steps[2][1].l1_ratio,
-)
diff --git a/moabb/evaluations/__init__.py b/moabb/evaluations/__init__.py
index 9f8eceff5..4a5695f48 100644
--- a/moabb/evaluations/__init__.py
+++ b/moabb/evaluations/__init__.py
@@ -10,4 +10,4 @@
     WithinSessionEvaluation,
 )
 from .splitters import CrossSessionSplitter, CrossSubjectSplitter, WithinSessionSplitter
-from .utils import create_save_path, save_model_cv, save_model_list
+from .utils import _create_save_path, _save_model_cv
diff --git a/moabb/evaluations/base.py b/moabb/evaluations/base.py
index c15c2699d..7f8d70158 100644
--- a/moabb/evaluations/base.py
+++ b/moabb/evaluations/base.py
@@ -3,29 +3,23 @@
 from warnings import warn

 import pandas as pd
+from joblib import Parallel, delayed
 from sklearn.base import BaseEstimator
-from sklearn.model_selection import GridSearchCV

 from moabb.analysis import Results
 from moabb.datasets.base import BaseDataset
-from moabb.evaluations.utils import _convert_sklearn_params_to_optuna
+from moabb.evaluations.utils import (
+    _convert_sklearn_params_to_optuna,
+    check_search_available,
+)
 from moabb.paradigms.base import BaseParadigm


+search_methods, optuna_available = check_search_available()
+
 log = logging.getLogger(__name__)

 # Making the optuna soft dependency
-try:
-    from optuna.integration import OptunaSearchCV
-
-    optuna_available = True
-except ImportError:
-    optuna_available = False
-
-if optuna_available:
-    search_methods = {"grid": GridSearchCV, "optuna": OptunaSearchCV}
-else:
-    search_methods = {"grid": GridSearchCV}


 class BaseEvaluation(ABC):
@@ -83,6 +77,8 @@ class BaseEvaluation(ABC):
         optuna, time_out parameters.
     """

+    search = False
+
     def __init__(
         self,
         paradigm,
@@ -201,7 +197,6 @@ def process(self, pipelines, param_grid=None, postprocess_pipeline=None):
             This pipeline must be "fixed" because it will not be trained,
             i.e. no call to ``fit`` will be made.

-
         Returns
         -------
         results: pd.DataFrame
@@ -216,26 +211,44 @@ def process(self, pipelines, param_grid=None, postprocess_pipeline=None):
             if not (isinstance(pipeline, BaseEstimator)):
                 raise (ValueError("pipelines must only contains Pipelines " "instance"))
-        res_per_db = []
-        for dataset in self.datasets:
-            log.info("Processing dataset: {}".format(dataset.code))
-            process_pipeline = self.paradigm.make_process_pipelines(
+        # Prepare dataset processing parameters
+        processing_params = [
+            (
                 dataset,
-                return_epochs=self.return_epochs,
-                return_raws=self.return_raws,
-                postprocess_pipeline=postprocess_pipeline,
-            )[0]
-            # (we only keep the pipeline for the first frequency band, better ideas?)
-
-            results = self.evaluate(
-                dataset,
-                pipelines,
-                param_grid=param_grid,
-                process_pipeline=process_pipeline,
-                postprocess_pipeline=postprocess_pipeline,
+                self.paradigm.make_process_pipelines(
+                    dataset,
+                    return_epochs=self.return_epochs,
+                    return_raws=self.return_raws,
+                    postprocess_pipeline=postprocess_pipeline,
+                )[0],
             )
+            for dataset in self.datasets
+        ]
+
+        # Parallel processing...
+        parallel_results = Parallel(n_jobs=self.n_jobs)(
+            delayed(
+                lambda d, p: list(
+                    self.evaluate(
+                        d,
+                        pipelines,
+                        param_grid=param_grid,
+                        process_pipeline=p,
+                        postprocess_pipeline=postprocess_pipeline,
+                    )
+                )
+            )(dataset, process_pipeline)
+            for dataset, process_pipeline in processing_params
+        )
+
+        res_per_db = []
+        # Process results in order
+        for (dataset, process_pipeline), results in zip(
+            processing_params, parallel_results
+        ):
             for res in results:
                 self.push_result(res, pipelines, process_pipeline)
+
             res_per_db.append(
                 self.results.to_dataframe(
                     pipelines=pipelines, process_pipeline=process_pipeline
                 )
@@ -316,9 +329,12 @@ def _grid_search(self, param_grid, name, grid_clf, inner_cv):
                     return_train_score=True,
                     **extra_params,
                 )
+                self.search = True
                 return search
             else:
+                self.search = True
                 return grid_clf

         else:
+            self.search = False
             return grid_clf
diff --git a/moabb/evaluations/evaluations.py b/moabb/evaluations/evaluations.py
index a2e39800d..93d4d55f6 100644
--- a/moabb/evaluations/evaluations.py
+++ b/moabb/evaluations/evaluations.py
@@ -12,14 +12,21 @@
     LeaveOneGroupOut,
     StratifiedKFold,
     StratifiedShuffleSplit,
-    cross_validate,
 )
-from sklearn.model_selection._validation import _fit_and_score, _score
+from sklearn.model_selection._validation import _score
 from sklearn.preprocessing import LabelEncoder
 from tqdm import tqdm

 from moabb.evaluations.base import BaseEvaluation
-from moabb.evaluations.utils import create_save_path, save_model_cv, save_model_list
+from moabb.evaluations.splitters import (
+    CrossSessionSplitter,
+    CrossSubjectSplitter,
+    WithinSessionSplitter,
+)
+from moabb.evaluations.utils import (
+    _create_save_path,
+    _save_model_cv,
+)


 try:
@@ -29,6 +36,7 @@
 except ImportError:
     _carbonfootprint = False

+
 log = logging.getLogger(__name__)

 # Numpy ArrayLike is only available starting from Numpy 1.20 and Python 3.8
@@ -134,7 +142,6 @@ def __init__(
         super().__init__(**kwargs)

     # flake8: noqa: C901
-
     def _evaluate(
         self,
         dataset,
@@ -172,8 +179,13 @@
                         # Initialize CodeCarbon
                        tracker = EmissionsTracker(save_to_file=False, log_level="error")
                         tracker.start()
+
                     t_start = time()
-                    cv = StratifiedKFold(5, shuffle=True, random_state=self.random_state)
+                    self.cv = WithinSessionSplitter(
+                        n_folds=5,
+                        shuffle=True,
+                        random_state=self.random_state,
+                    )
                     inner_cv = StratifiedKFold(
                         3, shuffle=True, random_state=self.random_state
                     )
@@ -185,17 +197,6 @@

                     grid_clf = clone(clf)

-                    # Create folder for grid search results
-                    create_save_path(
-                        self.hdf5_path,
-                        dataset.code,
-                        subject,
-                        session,
-                        name,
-                        grid=True,
-                        eval_type="WithinSession",
-                    )
-
                     # Implement Grid Search
                     grid_clf = self._grid_search(
                         param_grid=param_grid,
@@ -203,64 +204,51 @@
                         grid_clf=grid_clf,
                         inner_cv=inner_cv,
                     )
+
                     if self.hdf5_path is not None and self.save_model:
-                        model_save_path = create_save_path(
+                        model_save_path = _create_save_path(
                             self.hdf5_path,
                             dataset.code,
                             subject,
                             session,
                             name,
-                            grid=False,
+                            grid=self.search,
                             eval_type="WithinSession",
                         )

-                    if isinstance(X, BaseEpochs):
-                        scorer = get_scorer(self.paradigm.scoring)
-                        acc = list()
-                        X_ = X[ix]
-                        y_ = y[ix] if self.mne_labels else y_cv
-                        for cv_ind, (train, test) in enumerate(cv.split(X_, y_)):
-                            cvclf = clone(grid_clf)
-                            cvclf.fit(X_[train], y_[train])
-                            acc.append(scorer(cvclf, X_[test], y_[test]))
-
-                            if self.hdf5_path is not None and self.save_model:
-                                save_model_cv(
-                                    model=cvclf,
-                                    save_path=model_save_path,
-                                    cv_index=cv_ind,
-                                )
+                    scorer = get_scorer(self.paradigm.scoring)
+                    acc = list()
+                    X_ = X[ix]
+                    y_ = y[ix] if self.mne_labels else y_cv
+                    meta_ = metadata[ix].reset_index(drop=True)
+
+                    for cv_ind, (train, test) in enumerate(self.cv.split(y_, meta_)):
+                        cvclf = clone(grid_clf)
+
+                        cvclf.fit(X_[train], y_[train])
+
+                        score = scorer(cvclf, X_[test], y_[test])
+
+                        acc.append(score)
-                        acc = np.array(acc)
-                        score = acc.mean()
-                    else:
-                        results = cross_validate(
-                            grid_clf,
-                            X[ix],
-                            y_cv,
-                            cv=cv,
-                            scoring=self.paradigm.scoring,
-                            n_jobs=self.n_jobs,
-                            error_score=self.error_score,
-                            return_estimator=True,
-                        )
-                        score = results["test_score"].mean()

                         if self.hdf5_path is not None and self.save_model:
-                            save_model_list(
-                                results["estimator"],
-                                score_list=results["test_score"],
+                            _save_model_cv(
+                                model=cvclf,
                                 save_path=model_save_path,
+                                cv_index=cv_ind,
                             )

+                    acc = np.array(acc)
+                    score = acc.mean()
+
                     if _carbonfootprint:
                         emissions = tracker.stop()
                         if emissions is None:
                             emissions = np.nan
                     duration = time() - t_start
-
                    nchan = X.info["nchan"] if isinstance(X, BaseEpochs) else X.shape[1]
                     res = {
-                        "time": duration / 5.0,  # 5 fold CV
+                        "time": duration / self.cv.n_folds,  # 5 fold CV
                         "dataset": dataset,
                         "subject": subject,
                         "session": session,
@@ -519,7 +507,8 @@ def evaluate(
                     tracker.start()

                 # we want to store a results per session
-                cv = LeaveOneGroupOut()
+                self.cv = CrossSessionSplitter(random_state=self.random_state)
+
                 inner_cv = StratifiedKFold(
                     3, shuffle=True, random_state=self.random_state
                 )
@@ -532,62 +521,42 @@
                 )

                 if self.hdf5_path is not None and self.save_model:
-                    model_save_path = create_save_path(
+                    model_save_path = _create_save_path(
                         hdf5_path=self.hdf5_path,
                         code=dataset.code,
                         subject=subject,
                         session="",
                         name=name,
-                        grid=False,
+                        grid=self.search,
                         eval_type="CrossSession",
                     )

-                for cv_ind, (train, test) in enumerate(cv.split(X, y, groups)):
+                for cv_ind, (train, test) in enumerate(self.cv.split(y, metadata)):
                     model_list = []
                     if _carbonfootprint:
                         tracker.start()
                     t_start = time()
-                    if isinstance(X, BaseEpochs):
-                        cvclf = clone(grid_clf)
-                        cvclf.fit(X[train], y[train])
-                        model_list.append(cvclf)
-                        score = scorer(cvclf, X[test], y[test])
-                        if self.hdf5_path is not None and self.save_model:
-                            save_model_cv(
-                                model=cvclf,
-                                save_path=model_save_path,
-                                cv_index=str(cv_ind),
-                            )
-                    else:
-                        result = _fit_and_score(
-                            estimator=clone(grid_clf),
-                            X=X,
-                            y=y,
-                            scorer=scorer,
-                            train=train,
-                            test=test,
-                            verbose=False,
-                            parameters=None,
-                            fit_params=None,
-                            error_score=self.error_score,
-                            return_estimator=True,
-                            score_params={},
+                    cvclf = clone(grid_clf)
+
+                    cvclf.fit(X[train], y[train])
+
+                    model_list.append(cvclf)
+                    score = scorer(cvclf, X[test], y[test])
+
+                    if self.hdf5_path is not None and self.save_model:
+                        _save_model_cv(
+                            model=cvclf,
+                            save_path=model_save_path,
+                            cv_index=str(cv_ind),
                         )
-                        score = result["test_scores"]
-                        model_list = result["estimator"]
+
                     if _carbonfootprint:
                         emissions = tracker.stop()
                         if emissions is None:
                             emissions = 0
                     duration = time() - t_start
-                    if self.hdf5_path is not None and self.save_model:
-                        save_model_list(
-                            model_list=model_list,
-                            score_list=score,
-                            save_path=model_save_path,
-                        )

                    nchan = X.info["nchan"] if isinstance(X, BaseEpochs) else X.shape[1]
                     res = {
@@ -677,7 +646,6 @@ def evaluate(
         if len(run_pipes) == 0:
             return

-        # get the data
         X, y, metadata = self.paradigm.get_data(
             dataset=dataset,
             return_epochs=self.return_epochs,
@@ -686,8 +654,6 @@ def evaluate(
             postprocess_pipeline=postprocess_pipeline,
             process_pipelines=[process_pipeline],
         )
-
-        # encode labels
         le = LabelEncoder()
         y = y if self.mne_labels else le.fit_transform(y)

@@ -700,15 +666,20 @@ def evaluate(

         # perform leave one subject out CV
         if self.n_splits is None:
-            cv = LeaveOneGroupOut()
+            cv_class = LeaveOneGroupOut
+            cv_kwargs = {}
         else:
-            cv = GroupKFold(n_splits=self.n_splits)
+            cv_class = GroupKFold
+            cv_kwargs = {"n_splits": self.n_splits}
             n_subjects = self.n_splits

+        self.cv = CrossSubjectSplitter(
+            cv_class=cv_class, random_state=self.random_state, **cv_kwargs
+        )
+
         inner_cv = StratifiedKFold(3, shuffle=True, random_state=self.random_state)

         # Implement Grid Search
-
         if _carbonfootprint:
             # Initialise CodeCarbon
             tracker = EmissionsTracker(save_to_file=False, log_level="error")
@@ -716,7 +687,7 @@ def evaluate(
         # Progressbar at subject level
         for cv_ind, (train, test) in enumerate(
             tqdm(
-                cv.split(X, y, groups),
+                self.cv.split(y, metadata),
                 total=n_subjects,
                 desc=f"{dataset.code}-CrossSubject",
             )
@@ -734,7 +705,26 @@ def evaluate(
                 clf = self._grid_search(
                     param_grid=param_grid, name=name, grid_clf=clf, inner_cv=inner_cv
                 )
+
+                if self.hdf5_path is not None and self.save_model:
+                    # Save the best model from grid search
+                    model_save_path = _create_save_path(
+                        hdf5_path=self.hdf5_path,
+                        code=dataset.code,
+                        subject=subject,
+                        session="",
+                        name=name,
+                        grid=self.search,
+                        eval_type="CrossSubject",
+                    )
+                    _save_model_cv(
+                        model=clf,
+                        save_path=model_save_path,
+                        cv_index=str(cv_ind),
+                    )
+
                 model = deepcopy(clf).fit(X[train], y[train])
+
                 if _carbonfootprint:
                     emissions = tracker.stop()
                     if emissions is None:
@@ -742,17 +732,18 @@ def evaluate(
                 duration = time() - t_start

                 if self.hdf5_path is not None and self.save_model:
-                    model_save_path = create_save_path(
+
+                    model_save_path = _create_save_path(
                         hdf5_path=self.hdf5_path,
                         code=dataset.code,
                         subject=subject,
                         session="",
                         name=name,
-                        grid=False,
+                        grid=self.search,
                         eval_type="CrossSubject",
                     )
-                    save_model_cv(
+                    _save_model_cv(
                         model=model, save_path=model_save_path, cv_index=str(cv_ind)
                     )

                 # we eval on each session
diff --git a/moabb/evaluations/utils.py b/moabb/evaluations/utils.py
index cc6fbc197..f642e1191 100644
--- a/moabb/evaluations/utils.py
+++ b/moabb/evaluations/utils.py
@@ -7,6 +7,7 @@
 from mne.utils.config import _open_lock
 from numpy import argmax
+from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import Pipeline


@@ -53,7 +54,7 @@ def _check_if_is_pytorch_steps(model):
     return skorch_valid


-def save_model_cv(model: object, save_path: str | Path, cv_index: str | int):
+def _save_model_cv(model: object, save_path: str | Path, cv_index: str | int):
     """Save a model fitted to a given fold from cross-validation.

     Parameters
@@ -95,7 +96,7 @@ def save_model_cv(model: object, save_path: str | Path, cv_index: str | int):
     dump(model, file, protocol=HIGHEST_PROTOCOL)


-def save_model_list(model_list: list | Pipeline, score_list: Sequence, save_path: str):
+def _save_model_list(model_list: list | Pipeline, score_list: Sequence, save_path: str):
     """Save a list of models fitted to a folder.

     Parameters
@@ -119,14 +120,14 @@ def save_model_list(model_list: list | Pipeline, score_list: Sequence, save_path

         model_list = [model_list]

     for cv_index, model in enumerate(model_list):
-        save_model_cv(model, save_path, str(cv_index))
+        _save_model_cv(model, save_path, str(cv_index))

     best_model = model_list[argmax(score_list)]
-    save_model_cv(best_model, save_path, "best")
+    _save_model_cv(best_model, save_path, "best")


-def create_save_path(
+def _create_save_path(
     hdf5_path,
     code: str,
     subject: int | str,
@@ -166,7 +167,7 @@
     if grid:
         path_save = (
             Path(hdf5_path)
-            / f"GridSearch_{eval_type}"
+            / f"Search_{eval_type}"
             / code
             / f"{str(subject)}"
             / str(session)
@@ -218,3 +219,20 @@ def _convert_sklearn_params_to_optuna(param_grid: dict) -> dict:
         except Exception as e:
             raise ValueError(f"Conversion failed for parameter {key}: {e}")
     return optuna_params
+
+
+def check_search_available():
+    """Check if optuna is available"""
+    try:
+        from optuna.integration import OptunaSearchCV
+
+        optuna_available = True
+    except ImportError:
+        optuna_available = False
+
+    if optuna_available:
+        search_methods = {"grid": GridSearchCV, "optuna": OptunaSearchCV}
+    else:
+        search_methods = {"grid": GridSearchCV}
+
+    return search_methods, optuna_available
diff --git a/moabb/tests/acceptance_tests/reference_results_dataset_BNCI2014_001.csv b/moabb/tests/acceptance_tests/reference_results_dataset_BNCI2014_001.csv
new file mode 100644
index 000000000..b029c526f
--- /dev/null
+++ b/moabb/tests/acceptance_tests/reference_results_dataset_BNCI2014_001.csv
@@ -0,0 +1,19 @@
+,score,time,samples,subject,session,channels,n_sessions,dataset,pipeline
+0,0.7430556,0.28345227,288.0,1,0train,22,2,BNCI2014-001,mdm
+1,0.6944444,0.2819698,288.0,1,1test,22,2,BNCI2014-001,mdm
+2,0.5486111,0.28295708,288.0,2,0train,22,2,BNCI2014-001,mdm
+3,0.5555556,0.28221202,288.0,2,1test,22,2,BNCI2014-001,mdm
+4,0.6527778,0.27323103,288.0,3,0train,22,2,BNCI2014-001,mdm
+5,0.6319444,0.28558397,288.0,3,1test,22,2,BNCI2014-001,mdm
+6,0.4652778,0.28424382,288.0,4,0train,22,2,BNCI2014-001,mdm
+7,0.6076389,0.28512216,288.0,4,1test,22,2,BNCI2014-001,mdm
+8,0.4340278,0.26603198,288.0,5,0train,22,2,BNCI2014-001,mdm
+9,0.47569445,0.2672441,288.0,5,1test,22,2,BNCI2014-001,mdm
+10,0.38194445,0.28032613,288.0,6,0train,22,2,BNCI2014-001,mdm
+11,0.4652778,0.29096103,288.0,6,1test,22,2,BNCI2014-001,mdm
+12,0.5625,0.26360798,288.0,7,0train,22,2,BNCI2014-001,mdm
+13,0.46875,0.26497293,288.0,7,1test,22,2,BNCI2014-001,mdm
+14,0.6041667,0.27954388,288.0,8,0train,22,2,BNCI2014-001,mdm
+15,0.6111111,0.29071403,288.0,8,1test,22,2,BNCI2014-001,mdm
+16,0.5451389,0.27546215,288.0,9,0train,22,2,BNCI2014-001,mdm
+17,0.7326389,0.2862649,288.0,9,1test,22,2,BNCI2014-001,mdm
diff --git a/moabb/tests/acceptance_tests/reference_results_dataset_BNCI2015_001.csv b/moabb/tests/acceptance_tests/reference_results_dataset_BNCI2015_001.csv
new file mode 100644
index 000000000..97d2c3265
--- /dev/null
+++ b/moabb/tests/acceptance_tests/reference_results_dataset_BNCI2015_001.csv
@@ -0,0 +1,29 @@
+,score,time,samples,subject,session,channels,n_sessions,dataset,pipeline
+0,0.9898,0.104274035,200.0,1,0A,13,2,BNCI2015-001,mdm
+1,0.996,0.109023094,200.0,1,1B,13,2,BNCI2015-001,mdm
+2,0.9822,0.11902189,200.0,2,0A,13,2,BNCI2015-001,mdm
+3,0.9817,0.10449815,200.0,2,1B,13,2,BNCI2015-001,mdm
+4,0.9411,0.10515785,200.0,3,0A,13,2,BNCI2015-001,mdm
+5,0.9713,0.10190797,200.0,3,1B,13,2,BNCI2015-001,mdm
+6,0.8777,0.107106924,200.0,4,0A,13,2,BNCI2015-001,mdm
+7,0.9653,0.10397911,200.0,4,1B,13,2,BNCI2015-001,mdm
+8,0.8416,0.105483055,200.0,5,0A,13,2,BNCI2015-001,mdm
+9,0.8118,0.10831189,200.0,5,1B,13,2,BNCI2015-001,mdm
+10,0.6624,0.12765789,200.0,6,0A,13,2,BNCI2015-001,mdm
+11,0.6314,0.10389686,200.0,6,1B,13,2,BNCI2015-001,mdm
+12,0.8948,0.10865617,200.0,7,0A,13,2,BNCI2015-001,mdm
+13,0.8931,0.09851694,200.0,7,1B,13,2,BNCI2015-001,mdm
+14,0.6032,0.18366313,400.0,8,0A,13,2,BNCI2015-001,mdm
+15,0.7523,0.19959378,400.0,8,1B,13,2,BNCI2015-001,mdm
+16,0.8488,0.18477702,400.0,8,2C,13,2,BNCI2015-001,mdm
+17,0.7601,0.1761918,400.0,9,0A,13,2,BNCI2015-001,mdm
+18,0.8687,0.17262912,400.0,9,1B,13,2,BNCI2015-001,mdm
+19,0.9154,0.17855692,400.0,9,2C,13,2,BNCI2015-001,mdm
+20,0.6787,0.21773195,400.0,10,0A,13,2,BNCI2015-001,mdm
+21,0.6402,0.20742917,400.0,10,1B,13,2,BNCI2015-001,mdm
+22,0.6116,0.19268918,400.0,10,2C,13,2,BNCI2015-001,mdm
+23,0.7974,0.20285797,400.0,11,0A,13,2,BNCI2015-001,mdm
+24,0.7403,0.20020509,400.0,11,1B,13,2,BNCI2015-001,mdm
+25,0.7949,0.18860793,400.0,11,2C,13,2,BNCI2015-001,mdm
+26,0.6574,0.10171008,200.0,12,0A,13,2,BNCI2015-001,mdm
+27,0.6693,0.10934806,200.0,12,1B,13,2,BNCI2015-001,mdm
diff --git a/moabb/tests/acceptance_tests/test_accurary.py b/moabb/tests/acceptance_tests/test_accurary.py
new file mode 100644
index 000000000..5b99ee760
--- /dev/null
+++ b/moabb/tests/acceptance_tests/test_accurary.py
@@ -0,0 +1,46 @@
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+from pyriemann.classification import MDM
+from pyriemann.estimation import XdawnCovariances
+from sklearn.pipeline import make_pipeline
+from sklearn.utils import check_random_state
+
+from moabb.datasets import BNCI2014_001, BNCI2015_001
+from moabb.evaluations import CrossSessionEvaluation
+from moabb.paradigms import MotorImagery
+
+
+@pytest.mark.parametrize("dataset_class", [BNCI2014_001, BNCI2015_001])
+def test_decoding_performance_stable(dataset_class):
+    dataset_name = dataset_class.__name__
+    random_state = check_random_state(42)
+
+    dataset_cls = dataset_class
+    dataset = dataset_cls()
+    paradigm = MotorImagery()
+
+    # Simple pipeline
+    pipeline = make_pipeline(XdawnCovariances(nfilter=4), MDM(n_jobs=4))
+
+    # Evaluate
+    evaluation = CrossSessionEvaluation(
+        paradigm=paradigm, datasets=[dataset], overwrite=True, random_state=random_state
+    )
+    results = evaluation.process({"mdm": pipeline})
+    results.drop(columns=["time"], inplace=True)
+    results["score"] = results["score"].astype(np.float32)
+    results["samples"] = results["samples"].astype(int)
+    results["subject"] = results["subject"].astype(int)
+
+    folder_path = Path(__file__).parent / "reference_results_dataset_{}.csv".format(
+        dataset_name
+    )
+    reference_performance = pd.read_csv(folder_path)
+    reference_performance.drop(columns=["time", "Unnamed: 0"], inplace=True)
+    reference_performance["score"] = reference_performance["score"].astype(np.float32)
+    reference_performance["samples"] = reference_performance["samples"].astype(int)
+
+    pd.testing.assert_frame_equal(results, reference_performance)
diff --git a/moabb/tests/test_evaluations.py b/moabb/tests/test_evaluations.py
index 0dc4f98c0..6beb73edb 100644
--- a/moabb/tests/test_evaluations.py
+++ b/moabb/tests/test_evaluations.py
@@ -18,7 +18,8 @@
 from moabb.datasets.fake import FakeDataset
 from moabb.evaluations import evaluations as ev
 from moabb.evaluations.base import optuna_available
-from moabb.evaluations.utils import create_save_path, save_model_cv, save_model_list
+from moabb.evaluations.utils import _create_save_path as create_save_path
+from moabb.evaluations.utils import _save_model_cv as save_model_cv
 from moabb.paradigms.motor_imagery import FakeImageryParadigm


@@ -393,17 +394,6 @@ def test_save_model_cv(self):
         # Assert that the saved model file exists
         assert os.path.isfile(os.path.join(save_path, "fitted_model_0.pkl"))

-    def test_save_model_list(self):
-        step = Dummy()
-        model = Pipeline([("step", step)])
-        model_list = [model]
-        score_list = [0.8]
-        save_path = "test_save_path"
-        save_model_list(model_list, score_list, save_path)
-
-        # Assert that the saved model file for best model exists
-        assert os.path.isfile(os.path.join(save_path, "fitted_model_best.pkl"))
-
     def test_create_save_path(self):
         hdf5_path = "base_path"
         code = "evaluation_code"
@@ -454,21 +444,6 @@ def test_save_model_cv_with_pytorch_model(self):
         assert os.path.isfile(os.path.join(save_path, "step_fitted_0_history.json"))
         assert os.path.isfile(os.path.join(save_path, "step_fitted_0_criterion.pkl"))

-    def test_save_model_list_with_multiple_models(self):
-        model1 = Dummy()
-        model2 = Dummy()
-        model_list = [model1, model2]
-        score_list = [0.8, 0.9]
-        save_path = "test_save_path"
-        save_model_list(model_list, score_list, save_path)
-
-        # Assert that the saved model files for each model exist
-        assert os.path.isfile(os.path.join(save_path, "fitted_model_0.pkl"))
-        assert os.path.isfile(os.path.join(save_path, "fitted_model_1.pkl"))
-
-        # Assert that the saved model file for the best model exists
-        assert os.path.isfile(os.path.join(save_path, "fitted_model_best.pkl"))
-
     def test_create_save_path_with_cross_session_evaluation(self):
         hdf5_path = "base_path"
         code = "evaluation_code"
@@ -516,19 +491,6 @@ def test_save_model_cv_without_hdf5_path(self):
         with pytest.raises(IOError):
             save_model_cv(model, save_path, cv_index)

-    def test_save_model_list_with_single_model(self):
-        model = Dummy()
-        model_list = model
-        score_list = [0.8]
-        save_path = "test_save_path"
-        save_model_list(model_list, score_list, save_path)
-
-        # Assert that the saved model file for the single model exists
-        assert os.path.isfile(os.path.join(save_path, "fitted_model_0.pkl"))
-
-        # Assert that the saved model file for the best model exists
-        assert os.path.isfile(os.path.join(save_path, "fitted_model_best.pkl"))
-
     def test_create_save_path_with_cross_subject_evaluation(self):
         hdf5_path = "base_path"
         code = "evaluation_code"
diff --git a/pyproject.toml b/pyproject.toml
index 5bbb054b5..a65bf9a8b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ license = "BSD-3-Clause"

 [tool.poetry.dependencies]
 python = ">=3.10"
-numpy = "^2.0"
+numpy = ">=2.0"
 scipy = "^1.9.3"
 mne = "^1.10.0"
 pandas = ">=1.5.2"
@@ -30,7 +30,7 @@ memory-profiler = "^0.61.0"
 edflib-python = "^1.0.6"
 edfio = "^0.4.2"
 pytest = "^8.3.5"
-mne-bids = ">=0.14"
+mne-bids = ">=0.16"
 scikit-learn = "<1.6"

 # Optional dependencies for carbon emission
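
Illustrative usage sketch (not part of the patch): the snippet below mirrors the new acceptance test and shows how the refactored evaluation entry point is driven end to end; it only assumes names that already appear in this diff (moabb's CrossSessionEvaluation, MotorImagery, BNCI2014_001 and pyriemann's XdawnCovariances/MDM).

from pyriemann.classification import MDM
from pyriemann.estimation import XdawnCovariances
from sklearn.pipeline import make_pipeline
from sklearn.utils import check_random_state

from moabb.datasets import BNCI2014_001
from moabb.evaluations import CrossSessionEvaluation
from moabb.paradigms import MotorImagery

# Same setup as the acceptance test: one dataset and one simple covariance + MDM pipeline.
paradigm = MotorImagery()
pipeline = make_pipeline(XdawnCovariances(nfilter=4), MDM(n_jobs=4))

# After this refactor, CrossSessionEvaluation builds its folds through
# CrossSessionSplitter internally, so the caller-facing API is unchanged.
evaluation = CrossSessionEvaluation(
    paradigm=paradigm,
    datasets=[BNCI2014_001()],
    overwrite=True,
    random_state=check_random_state(42),
)
results = evaluation.process({"mdm": pipeline})  # pandas DataFrame, one row per subject/session
print(results[["dataset", "subject", "session", "score"]])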