From 5e3503467caf65ac7a1fe50505475cb36f98dbc9 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 13 Nov 2024 09:48:09 +0000 Subject: [PATCH 1/6] release --- MANIFEST.in | 3 --- README.md | 5 +++-- pyproject.toml | 4 ++-- tsml/__init__.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index b459cfe..28ac35a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,3 @@ -recursive-include docs * recursive-include tsml *.py recursive-include tsml/datasets *.ts include .coveragerc @@ -13,5 +12,3 @@ exclude .codecov.yml exclude .gitattributes exclude .gitignore exclude .pre-commit-config.yaml -exclude .readthedocs.yml -exclude sweep.yaml diff --git a/README.md b/README.md index fbd08e9..7609c25 100644 --- a/README.md +++ b/README.md @@ -8,14 +8,15 @@ # tsml-py -A toolkit for in-development time series machine learning algorithms. +A repository for in-development time series machine learning algorithms and other odd +bits by Matthew Middlehurst. Please see [`tsml_eval`](https://github.com/time-series-machine-learning/tsml-eval) and [`aeon`](https://github.com/aeon-toolkit/aeon) for more developed and stable packages. This package is more of a sandbox for testing out new ideas and algorithms. It may contain some algorithms and implementations that are not available in the other toolkits. -The current release of `tsml` is v0.4.0. +The current release of `tsml` is v0.5.0. ## Installation diff --git a/pyproject.toml b/pyproject.toml index 7679260..3cb3db7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta" [project] name = "tsml" -version = "0.4.0" -description = "A toolkit for time series machine learning algorithms." +version = "0.5.0" +description = "A development sandbox for time series machine learning algorithms." 
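[Editor's note: this release patch bumps the version string in two places, `pyproject.toml` and `tsml/__init__.py`, which must stay in sync. A minimal sketch of a consistency check, assuming it is run from the repository root on Python 3.11+ (where `tomllib` is standard library):]

```python
# A sketch of a release sanity check: the version declared in pyproject.toml
# should match the __version__ attribute set in tsml/__init__.py.
import tomllib

import tsml

with open("pyproject.toml", "rb") as f:
    pyproject_version = tomllib.load(f)["project"]["version"]

assert tsml.__version__ == pyproject_version, (
    f"version mismatch: {tsml.__version__} != {pyproject_version}"
)
```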
authors = [ {name = "Matthew Middlehurst", email = "m.b.middlehurst@soton.ac.uk"}, ] diff --git a/tsml/__init__.py b/tsml/__init__.py index da155f0..90b5545 100644 --- a/tsml/__init__.py +++ b/tsml/__init__.py @@ -1,3 +1,3 @@ """tsml.""" -__version__ = "0.4.0" +__version__ = "0.5.0" From 697ab0e9a6953c6b6d82c04b471c81a5b73accc9 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 13 Nov 2024 10:48:41 +0000 Subject: [PATCH 2/6] seql --- .github/workflows/release.yml | 4 +- pyproject.toml | 3 +- tsml/dictionary_based/__init__.py | 9 + tsml/dictionary_based/_mrseql.py | 185 ++++++++++++++++++ .../_mrsqm.py | 11 -- tsml/shapelet_based/__init__.py | 2 - 6 files changed, 198 insertions(+), 16 deletions(-) create mode 100644 tsml/dictionary_based/__init__.py create mode 100644 tsml/dictionary_based/_mrseql.py rename tsml/{shapelet_based => dictionary_based}/_mrsqm.py (94%) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 53dbf44..01621ad 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -79,14 +79,14 @@ jobs: with: timeout_minutes: 30 max_attempts: 3 - command: python -m pip install "${env:WHEELNAME}[dev,all_extras,unstable_extras]" + command: python -m pip install "${env:WHEELNAME}[dev,all_extras]" - if: matrix.os != 'windows-2022' name: Unix install uses: nick-fields/retry@v3 with: timeout_minutes: 30 max_attempts: 3 - command: python -m pip install "${{ env.WHEELNAME }}[dev,all_extras,unstable_extras]" + command: python -m pip install "${{ env.WHEELNAME }}[dev,all_extras]" - name: Tests run: python -m pytest -n logical diff --git a/pyproject.toml b/pyproject.toml index 3cb3db7..6a52ed1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,8 @@ all_extras = [ unstable_extras = [ "pycatch22", "pyfftw>=0.12.0; python_version < '3.12'", # requires fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) - "mrsqm>=0.0.7; platform_system != 'Windows' and python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) + "mrsqm>=0.0.7; platform_system == 'Linux' and python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) + "mrseql>=0.0.4,<0.1.0; platform_system == 'Linux' and python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) ] dev = [ "pre-commit", diff --git a/tsml/dictionary_based/__init__.py b/tsml/dictionary_based/__init__.py new file mode 100644 index 0000000..0c2ceaf --- /dev/null +++ b/tsml/dictionary_based/__init__.py @@ -0,0 +1,9 @@ +"""Dictionary-based estimators.""" + +__all__ = [ + "MrSEQLClassifier", + "MrSQMClassifier", +] + +from tsml.dictionary_based._mrseql import MrSEQLClassifier +from tsml.dictionary_based._mrsqm import MrSQMClassifier diff --git a/tsml/dictionary_based/_mrseql.py b/tsml/dictionary_based/_mrseql.py new file mode 100644 index 0000000..9197b23 --- /dev/null +++ b/tsml/dictionary_based/_mrseql.py @@ -0,0 +1,185 @@ +"""Multiple Representations Sequence Learning (MrSEQL) Classifier.""" + +from typing import List, Union + +import numpy as np +import pandas as pd +from sklearn.base import ClassifierMixin +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted + +from tsml.base import BaseTimeSeriesEstimator +from tsml.utils.validation import _check_optional_dependency + + 
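[Editor's note: the new `_mrseql.py` module soft-imports its backing package through `_check_optional_dependency`, called in the constructor below. The real helper lives in `tsml.utils.validation` and its exact behaviour is not shown in this patch; the following is only a hypothetical sketch of the general guard pattern:]

```python
# Hypothetical stand-in for tsml's _check_optional_dependency: attempt the
# import and fail with an actionable message naming the estimator and package.
from importlib import import_module


def check_optional_dependency(module_name: str, package_name: str, caller) -> None:
    """Raise an informative error if an optional dependency is missing."""
    try:
        import_module(module_name)
    except ImportError as e:
        raise ModuleNotFoundError(
            f"{type(caller).__name__} requires the optional dependency "
            f"'{package_name}'. Install it with: pip install {package_name}"
        ) from e
```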
+class MrSEQLClassifier(ClassifierMixin, BaseTimeSeriesEstimator): + """ + Multiple Representations Sequence Learning (MrSEQL) Classifier. + + This is a wrapper for the MrSEQLClassifier algorithm from the `mrseql` package. + MrSEQL is not included in ``all_extras`` as it requires gcc and fftw + (http://www.fftw.org/index.html) to be installed on Windows and some Linux distributions. + + Overview: MrSEQL extends the symbolic sequence classifier (SEQL) to work with + multiple symbolic representations of time series, using features extracted from the + SAX and SFA transformations. + + Parameters + ---------- + seql_mode : "clf" or "fs", default="fs" + If "fs", trains a logistic regression model with features extracted by SEQL. + If "clf", builds an ensemble of SEQL models. + symrep : "sax" or "sfa", or ["sax", "sfa"], default="sax" + The symbolic features to extract from the time series. + custom_config : dict, default=None + Additional configuration for the symbolic transformations. See the original + package for details. If used, ``symrep`` is ignored. + + References + ---------- + .. [1] Le Nguyen, Thach, et al. "Interpretable time series classification using + linear models and multi-resolution multi-domain symbolic representations." + Data Mining and Knowledge Discovery 33 (2019): 1183-1222. + """ + + def __init__(self, seql_mode="fs", symrep="sax", custom_config=None) -> None: + self.seql_mode = seql_mode + self.symrep = symrep + self.custom_config = custom_config + + _check_optional_dependency("mrseql", "mrseql", self) + + super().__init__() + + def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: + """Fit the estimator to training data. + + Parameters + ---------- + X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) + The training data. + y : 1D np.ndarray of shape (n_instances) + The class labels for fitting, indices correspond to instance indices in X. + + Returns + ------- + self : + Reference to self. + """ + X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) + X = self._convert_X(X) + + check_classification_targets(y) + + self.n_instances_, self.n_dims_, self.series_length_ = ( + X.shape if X.ndim == 3 else (X.shape[0], 1, X.shape[1]) + ) + self.classes_ = np.unique(y) + self.n_classes_ = self.classes_.shape[0] + self.class_dictionary_ = {} + for index, class_val in enumerate(self.classes_): + self.class_dictionary_[class_val] = index + + if self.n_classes_ == 1: + return self + + from mrseql import MrSEQLClassifier + + _X = _convert_data(X) + + self.clf_ = MrSEQLClassifier( + seql_mode=self.seql_mode, + symrep=self.symrep, + custom_config=self.custom_config, + ) + self.clf_.fit(_X, y) + + return self + + def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """Predicts labels for sequences in X. + + Parameters + ---------- + X : 3D np.array of shape (n_instances, n_channels, n_timepoints) + The testing data. + + Returns + ------- + y : array-like of shape (n_instances) + Predicted class labels. + """ + check_is_fitted(self) + + # treat case of single class seen in fit + if self.n_classes_ == 1: + return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) + + X = self._validate_data(X=X, reset=False) + X = self._convert_X(X) + + return self.clf_.predict(_convert_data(X)) + + def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: + """Predicts label probabilities for sequences in X.
+ + Parameters + ---------- + X : 3D np.array of shape (n_instances, n_channels, n_timepoints) + The testing data. + + Returns + ------- + y : array-like of shape (n_instances, n_classes_) + Predicted probabilities using the ordering in classes_. + """ + check_is_fitted(self) + + # treat case of single class seen in fit + if self.n_classes_ == 1: + return np.repeat([[1]], X.shape[0], axis=0) + + X = self._validate_data(X=X, reset=False) + X = self._convert_X(X) + + return self.clf_.predict_proba(_convert_data(X)) + + def _more_tags(self) -> dict: + return { + "non_deterministic": True, + "_xfail_checks": {"check_estimators_pickle": "External failure to pickle."}, + "optional_dependency": True, + } + + @classmethod + def get_test_params( + cls, parameter_set: Union[str, None] = None + ) -> Union[dict, List[dict]]: + """Return unit test parameter settings for the estimator. + + Parameters + ---------- + parameter_set : None or str, default=None + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict + Parameters to create testing instances of the class. + """ + return {} + + +def _convert_data(X): + column_list = [] + for i in range(X.shape[1]): + nested_column = ( + pd.DataFrame(X[:, i, :]) + .apply(lambda x: [pd.Series(x, dtype=X.dtype)], axis=1) + .str[0] + .rename(str(i)) + ) + column_list.append(nested_column) + df = pd.concat(column_list, axis=1) + return df diff --git a/tsml/shapelet_based/_mrsqm.py b/tsml/dictionary_based/_mrsqm.py similarity index 94% rename from tsml/shapelet_based/_mrsqm.py rename to tsml/dictionary_based/_mrsqm.py index b1367ba..87fe4e4 100644 --- a/tsml/shapelet_based/_mrsqm.py +++ b/tsml/dictionary_based/_mrsqm.py @@ -59,17 +59,6 @@ class MrSQMClassifier(ClassifierMixin, BaseTimeSeriesEstimator): .. [2] Nguyen, Thach Le, and Georgiana Ifrim. "MrSQM: Fast time series classification with symbolic representations." arXiv preprint arXiv:2109.01036 (2021). - - Examples - -------- - >>> from tsml.shapelet_based import MrSQMClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = MrSQMClassifier(random_state=0) # doctest: +SKIP - >>> clf.fit(X, y) # doctest: +SKIP - MrSQMClassifier(...) 
- >>> clf.predict(X) # doctest: +SKIP - array([0, 1, 1, 0, 0, 1, 0, 1]) """ def __init__( diff --git a/tsml/shapelet_based/__init__.py b/tsml/shapelet_based/__init__.py index dba6f01..b36f609 100644 --- a/tsml/shapelet_based/__init__.py +++ b/tsml/shapelet_based/__init__.py @@ -1,12 +1,10 @@ """Shapelet-based estimators.""" __all__ = [ - "MrSQMClassifier", "RandomShapeletForestClassifier", "RandomShapeletForestRegressor", ] -from tsml.shapelet_based._mrsqm import MrSQMClassifier from tsml.shapelet_based._rsf import ( RandomShapeletForestClassifier, RandomShapeletForestRegressor, From 9b7bd1bde8bf3d6659f2a80f0205d1ac0fcd13c9 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Sat, 11 Jan 2025 15:31:10 +0000 Subject: [PATCH 3/6] remove stuff in aeon and eval --- pyproject.toml | 16 +- tsml/distance_based/__init__.py | 2 - tsml/distance_based/_mpdist.py | 168 -- tsml/distances/__init__.py | 7 - tsml/distances/_manhattan.py | 66 - tsml/interval_based/__init__.py | 21 - tsml/interval_based/_base.py | 1086 ------------ tsml/interval_based/_interval_forest.py | 454 ----- tsml/interval_based/_interval_pipelines.py | 820 --------- tsml/interval_based/tests/__init__.py | 1 - .../tests/test_interval_forest.py | 206 --- .../tests/test_interval_pipelines.py | 38 - tsml/transformations/__init__.py | 16 - tsml/transformations/_acf.py | 140 -- tsml/transformations/_ar_coefficient.py | 119 -- tsml/transformations/_catch22.py | 335 ---- tsml/transformations/_interval_extraction.py | 1572 ----------------- tsml/transformations/_periodogram.py | 121 -- tsml/transformations/_quantile.py | 57 - tsml/vector/__init__.py | 13 - tsml/vector/_cit.py | 487 ----- tsml/vector/_rotation_forest.py | 792 --------- tsml/vector/tests/__init__.py | 1 - tsml/vector/tests/test_rotation_forest.py | 29 - 24 files changed, 6 insertions(+), 6561 deletions(-) delete mode 100644 tsml/distance_based/_mpdist.py delete mode 100644 tsml/distances/__init__.py delete mode 100644 tsml/distances/_manhattan.py delete mode 100644 tsml/interval_based/__init__.py delete mode 100644 tsml/interval_based/_base.py delete mode 100644 tsml/interval_based/_interval_forest.py delete mode 100644 tsml/interval_based/_interval_pipelines.py delete mode 100644 tsml/interval_based/tests/__init__.py delete mode 100644 tsml/interval_based/tests/test_interval_forest.py delete mode 100644 tsml/interval_based/tests/test_interval_pipelines.py delete mode 100644 tsml/transformations/_acf.py delete mode 100644 tsml/transformations/_ar_coefficient.py delete mode 100644 tsml/transformations/_catch22.py delete mode 100644 tsml/transformations/_interval_extraction.py delete mode 100644 tsml/transformations/_periodogram.py delete mode 100644 tsml/transformations/_quantile.py delete mode 100644 tsml/vector/__init__.py delete mode 100644 tsml/vector/_cit.py delete mode 100644 tsml/vector/_rotation_forest.py delete mode 100644 tsml/vector/tests/__init__.py delete mode 100644 tsml/vector/tests/test_rotation_forest.py diff --git a/pyproject.toml b/pyproject.toml index 6a52ed1..9ade4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "tsml" -version = "0.5.0" +version = "0.6.0" description = "A development sandbox for time series machine learning algorithms." 
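[Editor's note: patch 2 adds the MrSEQL wrapper but, unlike the MrSQM doctest it removes above, ships no usage example. A hedged sketch of how the new classifier would be used, assuming the optional `mrseql` package is installed (Linux, Python < 3.12 per the environment markers above); outputs are illustrative, not verified:]

```python
# Usage sketch for the MrSEQLClassifier wrapper added in patch 2, mirroring
# the doctest removed from MrSQMClassifier. Requires the optional mrseql
# package, so this is not run as a doctest.
from tsml.dictionary_based import MrSEQLClassifier
from tsml.utils.testing import generate_3d_test_data

X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0)

clf = MrSEQLClassifier(seql_mode="fs", symrep="sax")
clf.fit(X, y)
labels = clf.predict(X)        # class labels, shape (8,)
probs = clf.predict_proba(X)   # probabilities, shape (8, n_classes)
```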
authors = [ {name = "Matthew Middlehurst", email = "m.b.middlehurst@soton.ac.uk"}, @@ -38,11 +38,11 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "numba>=0.55.0,<0.61.0", - "numpy>=1.21.0,<2.2.0", - "scipy>=1.9.0,<1.14.0", - "pandas>=1.5.3,<2.3.0", - "scikit-learn>=1.0.0,<1.4.0", + "numba>=0.55.0", + "numpy>=1.21.0", + "scipy>=1.9.0", + "pandas>=1.5.3", + "scikit-learn>=1.0.0", "packaging>=20.0", ] @@ -50,13 +50,9 @@ dependencies = [ all_extras = [ "grailts", "scikit-fda>=0.7.0", - "statsmodels>=0.12.1", - "stumpy>=1.6.0", "wildboar", ] unstable_extras = [ - "pycatch22", - "pyfftw>=0.12.0; python_version < '3.12'", # requires fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) "mrsqm>=0.0.7; platform_system == 'Linux' and python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) "mrseql>=0.0.4,<0.1.0; platform_system == 'Linux' and python_version < '3.12'", # requires gcc and fftw to be installed for Windows and some other OS (see http://www.fftw.org/index.html) ] diff --git a/tsml/distance_based/__init__.py b/tsml/distance_based/__init__.py index be9fde8..1e8b6f7 100644 --- a/tsml/distance_based/__init__.py +++ b/tsml/distance_based/__init__.py @@ -2,8 +2,6 @@ __all__ = [ "GRAILClassifier", - "MPDistClassifier", ] from tsml.distance_based._grail import GRAILClassifier -from tsml.distance_based._mpdist import MPDistClassifier diff --git a/tsml/distance_based/_mpdist.py b/tsml/distance_based/_mpdist.py deleted file mode 100644 index b931414..0000000 --- a/tsml/distance_based/_mpdist.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Matrix Profile Distance 1-NN Classifier.""" - -__author__ = ["TonyBagnall", "patrickzib", "MatthewMiddlehurst"] -__all__ = ["MPDistClassifier"] - -from typing import List, Union - -import numpy as np -import stumpy -from sklearn.base import ClassifierMixin -from sklearn.metrics import pairwise -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator -from tsml.utils.validation import check_n_jobs - - -class MPDistClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """Matrix Profile Distance 1-NN Classifier. - - Calculates the matrix profile distance to the training data for each case and - returns the label of the nearest neighbour. - - Parameters - ---------- - window : int or float, default=10 - Window size for the matrix profile. If float, will use a proportion of the - series length. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - - References - ---------- - .. [1] Gharghabi, Shaghayegh, et al. "Matrix profile xii: Mpdist: a novel time - series distance measure to allow data mining in more challenging scenarios." - 2018 IEEE International Conference on Data Mining (ICDM). IEEE, 2018. 
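[Editor's note: the `MPDistClassifier` deleted above is a 1-NN classifier over a custom pairwise distance, built on `sklearn.metrics.pairwise_distances` with a callable metric. A minimal sketch of that pattern, using a simple sum-of-absolute-differences metric as a stand-in for the removed `stumpy.mpdist` dependency:]

```python
# 1-NN with a custom callable metric, the core pattern of the removed
# MPDistClassifier. Any callable taking two 1D arrays and returning a float
# can be plugged in as the metric.
import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
X_train = rng.random((8, 10))  # (n_instances, n_timepoints)
y_train = np.array([0, 1, 1, 0, 0, 1, 0, 1])
X_test = rng.random((4, 10))

dist = pairwise_distances(
    X_test, X_train, metric=lambda a, b: np.sum(np.abs(a - b))
)
y_pred = y_train[np.argmin(dist, axis=1)]  # label of each nearest neighbour
```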
- - Examples - -------- - >>> from tsml.distance_based import MPDistClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = MPDistClassifier() - >>> clf.fit(X, y) - MPDistClassifier(...) - >>> clf.predict(X) - array([0, 1, 1, 0, 0, 1, 0, 1]) - """ - - def __init__(self, window=10, n_jobs=1): - self.window = window - self.n_jobs = n_jobs - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 2D np.ndarray of shape (n_instances, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - check_classification_targets(y) - - self.n_instances_, self.n_timepoints_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._X_train = X.astype(np.float64) - self._y_train = y - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 2D np.array of shape (n_instances, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - window = ( - self.window if self.window >= 1 else int(self.window * self.n_timepoints_) - ) - - distance_matrix = pairwise.pairwise_distances( - X.astype(np.float64), - self._X_train, - metric=(lambda x, y: stumpy.mpdist(x, y, window)), - n_jobs=self._n_jobs, - ) - - return self._y_train[np.argmin(distance_matrix, axis=1)] - - def _more_tags(self) -> dict: - return { - "X_types": ["2darray"], - "optional_dependency": True, - } - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. 
- """ - return { - "window": 0.8, - } diff --git a/tsml/distances/__init__.py b/tsml/distances/__init__.py deleted file mode 100644 index 7aeafe0..0000000 --- a/tsml/distances/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -"""Distance functions.""" - -__all__ = [ - "manhattan_distance", -] - -from tsml.distances._manhattan import manhattan_distance diff --git a/tsml/distances/_manhattan.py b/tsml/distances/_manhattan.py deleted file mode 100644 index 125ae8e..0000000 --- a/tsml/distances/_manhattan.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Manhattan distance.""" - -__author__ = ["chrisholder", "TonyBagnall", "baraline"] - -import numpy as np -from numba import njit - - -@njit(cache=True, fastmath=True) -def manhattan_distance(x: np.ndarray, y: np.ndarray) -> float: - r"""Compute the manhattan distance between two time series. - - The manhattan distance between two time series is defined as: - .. math:: - manhattan(x, y) = \sum_{i=1}^{n} |x_i - y_i| - - Parameters - ---------- - x: np.ndarray, of shape (n_channels, n_timepoints) or (n_timepoints) - First time series. - y: np.ndarray, of shape (m_channels, m_timepoints) or (m_timepoints) - Second time series. - - Returns - ------- - float : - manhattan distance between x and y. - - Raises - ------ - ValueError - If x and y are not 1D or 2D arrays. - - Examples - -------- - >>> import numpy as np - >>> from tsml.distances import manhattan_distance - >>> x = np.array([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) - >>> y = np.array([[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]]) - >>> manhattan_distance(x, y) - 100.0 - """ - if x.ndim == 1 and y.ndim == 1: - return _univariate_manhattan_distance(x, y) - if x.ndim == 2 and y.ndim == 2: - return _manhattan_distance(x, y) - raise ValueError("x and y must be 1D or 2D") - - -@njit(cache=True, fastmath=True) -def _manhattan_distance(x: np.ndarray, y: np.ndarray) -> float: - distance = 0.0 - min_val = min(x.shape[0], y.shape[0]) - for i in range(min_val): - distance += _univariate_manhattan_distance(x[i], y[i]) - return distance - - -@njit(cache=True, fastmath=True) -def _univariate_manhattan_distance(x: np.ndarray, y: np.ndarray) -> float: - distance = 0.0 - min_length = min(x.shape[0], y.shape[0]) - for i in range(min_length): - difference = x[i] - y[i] - distance += abs(difference) - return distance diff --git a/tsml/interval_based/__init__.py b/tsml/interval_based/__init__.py deleted file mode 100644 index 3633a1f..0000000 --- a/tsml/interval_based/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -"""Interval-based estimators.""" - -__all__ = [ - "BaseIntervalForest", - "IntervalForestClassifier", - "IntervalForestRegressor", - "RandomIntervalClassifier", - "RandomIntervalRegressor", - "SupervisedIntervalClassifier", -] - -from tsml.interval_based._base import BaseIntervalForest -from tsml.interval_based._interval_forest import ( - IntervalForestClassifier, - IntervalForestRegressor, -) -from tsml.interval_based._interval_pipelines import ( - RandomIntervalClassifier, - RandomIntervalRegressor, - SupervisedIntervalClassifier, -) diff --git a/tsml/interval_based/_base.py b/tsml/interval_based/_base.py deleted file mode 100644 index 1af0f90..0000000 --- a/tsml/interval_based/_base.py +++ /dev/null @@ -1,1086 +0,0 @@ -"""A base class for interval extracting forest estimators.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["BaseIntervalForest"] - -import inspect -import time -import warnings -from abc import ABCMeta, abstractmethod -from typing import List, Union - -import numpy as np -from joblib import Parallel -from 
sklearn.base import BaseEstimator, is_classifier, is_regressor -from sklearn.tree import BaseDecisionTree, DecisionTreeClassifier, DecisionTreeRegressor -from sklearn.utils import check_random_state -from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.parallel import delayed -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.transformations._interval_extraction import ( - RandomIntervalTransformer, - SupervisedIntervalTransformer, -) -from tsml.utils.numba_functions.stats import row_mean, row_slope, row_std -from tsml.utils.validation import check_n_jobs, is_transformer -from tsml.vector import CITClassifier - - -class BaseIntervalForest(BaseTimeSeriesEstimator, metaclass=ABCMeta): - """A base class for interval extracting forest estimators. - - Allows the implementation of classifiers and regressors along the lines of [1][2][3] - which extract intervals and create an ensemble from the subsequent features. - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - interval_selection_method : "random", "supervised" or "random-supervised", - default="random" - The interval selection transformer to use. - - "random" uses a RandomIntervalTransformer. - - "supervised" uses a SupervisedIntervalTransformer. - - "random-supervised" uses a SupervisedIntervalTransformer with - randomised elements. - - Supervised methods can only be used for classification tasks, and require - function inputs for interval_features rather than transformers. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - - While random interval extraction will extract the n_intervals intervals total - (removing duplicates), supervised intervals will run the supervised extraction - process n_intervals times, returning more intervals than specified. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. 
float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - - Ignored for supervised interval_selection_method inputs. - interval_features : TransformerMixin, callable, list, tuple, or None, default=None - The features to extract from the intervals using transformers or callable - functions. If None, use the mean, standard deviation, and slope of the series. - - Both transformers and functions should be able to take a 2D np.ndarray input. - Functions should output a 1d array (the feature for each series), and - transformers should output a 2d array where rows are the features for each - series. A list or tuple of transformers and/or functions will extract all - features and concatenate the output. - - Different features for each series_transformers series can be specified using a - nested list or tuple. Any list or tuple input containing another list or tuple - must be the same length as the number of series_transformers. - series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals. If None, - use the series as is. - - A list or tuple of transformers will extract intervals from - all transformations concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - replace_nan : "nan", int, float or None, default=None - The value to replace NaNs and infinite values with before fitting the base - estimator. int or float input will replace with the specified value, while - "nan" will replace infinite values with NaNs. If None, do not replace NaNs. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding n_estimators. - Default of 0 means n_estimators are used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases. 
- n_channels_ : int - The number of channels per case. - n_timepoints_ : int - The length of each series. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - References - ---------- - .. [1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for - classification and feature extraction", Information Sciences, 239, 2013 - .. [2] Matthew Middlehurst and James Large and Anthony Bagnall. "The Canonical - Interval Forest (CIF) Classifier for Time Series Classification." - IEEE International Conference on Big Data 2020 - .. [3] Cabello, Nestor, et al. "Fast and Accurate Time Series Classification - Through Supervised Interval Search." IEEE ICDM 2020 - """ - - @abstractmethod - def __init__( - self, - base_estimator=None, - n_estimators=200, - interval_selection_method="random", - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - interval_features=None, - series_transformers=None, - att_subsample_size=None, - replace_nan=None, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.base_estimator = base_estimator - self.n_estimators = n_estimators - self.interval_selection_method = interval_selection_method - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.max_interval_length = max_interval_length - self.interval_features = interval_features - self.series_transformers = series_transformers - self.att_subsample_size = att_subsample_size - self.replace_nan = replace_nan - self.time_limit_in_minutes = time_limit_in_minutes - self.contract_max_n_estimators = contract_max_n_estimators - self.save_transformed_data = save_transformed_data - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - # if subsampling attributes, an interval_features transformer must contain a - # parameter name from transformer_feature_selection and an attribute name - # (or property) from transformer_feature_names to allow features to be subsampled - transformer_feature_selection = ["features"] - transformer_feature_names = [ - "features_arguments_", - "_features_arguments", - "get_features_arguments", - "_get_features_arguments", - ] - # an interval_features transformer must contain one of these attribute names to - # be able to skip transforming features in predict - transformer_feature_skip = ["transform_features_", "_transform_features"] - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The target labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2) - X = self._convert_X(X) - - rng = check_random_state(self.random_state) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - if is_classifier(self): - check_classification_targets(y) - - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._base_estimator = self.base_estimator - if self.base_estimator is None: - if is_classifier(self): - self._base_estimator = DecisionTreeClassifier(criterion="entropy") - elif is_regressor(self): - self._base_estimator = DecisionTreeRegressor(criterion="absolute_error") - else: - raise ValueError( - f"{self} must be a scikit-learn compatible classifier or " - "regressor." - ) - # base_estimator must be an sklearn estimator - elif not isinstance(self.base_estimator, BaseEstimator): - raise ValueError( - "base_estimator must be a scikit-learn BaseEstimator or None. " - f"Found: {self.base_estimator}" - ) - - # use the base series if series_transformers is None - if self.series_transformers is None or self.series_transformers == []: - Xt = [X] - self._series_transformers = [None] - # clone series_transformers if it is a transformer and transform the input data - elif is_transformer(self.series_transformers): - t = _clone_estimator(self.series_transformers, random_state=rng) - Xt = [t.fit_transform(X, y)] - self._series_transformers = [t] - # clone each series_transformers transformer and include the base series if None - # is in the list - elif isinstance(self.series_transformers, (list, tuple)): - Xt = [] - self._series_transformers = [] - - for transformer in self.series_transformers: - if transformer is None: - Xt.append(X) - self._series_transformers.append(None) - elif is_transformer(transformer): - t = _clone_estimator(transformer, random_state=rng) - Xt.append(t.fit_transform(X, y)) - self._series_transformers.append(t) - else: - raise ValueError( - f"Invalid series_transformers list input. Found {transformer}" - ) - # other inputs are invalid - else: - raise ValueError( - f"Invalid series_transformers input. Found {self.series_transformers}" - ) - - # if only a single n_intervals value is passed it must be an int or str - if isinstance(self.n_intervals, (int, str)): - n_intervals = [[self.n_intervals]] * len(Xt) - elif isinstance(self.n_intervals, (list, tuple)): - # if input is a list and only contains ints or strs, use the list for all - # series in Xt - if all(isinstance(item, (int, str)) for item in self.n_intervals): - n_intervals = [self.n_intervals] * len(Xt) - # other lists must be the same length as Xt - elif len(self.n_intervals) != len(Xt): - raise ValueError( - "n_intervals as a list or tuple containing other lists or tuples " - "must be the same length as series_transformers." - ) - # list items can be a list of items or a single item for each - # series_transformer, but each individual item must be an int or str - else: - n_intervals = [] - for items in self.n_intervals: - if isinstance(items, (list, tuple)): - if not all(isinstance(item, (int, str)) for item in items): - raise ValueError( - "Individual items in a n_intervals list or tuple must " - f"be an int or str. 
Input {items} does not contain " - "only ints or strs" - ) - n_intervals.append(items) - elif isinstance(items, (int, str)): - n_intervals.append([items]) - else: - raise ValueError( - "Individual items in a n_intervals list or tuple must be " - f"an int or str. Found: {items}" - ) - # other inputs are invalid - else: - raise ValueError(f"Invalid n_intervals input. Found {self.n_intervals}") - - # add together the number of intervals for each series_transformer - # str input must be one of a set valid options - self._n_intervals = [0] * len(Xt) - for i, series in enumerate(Xt): - for method in n_intervals[i]: - if isinstance(method, int): - self._n_intervals[i] += method - elif isinstance(method, str): - # sqrt of series length - if method.lower() == "sqrt": - self._n_intervals[i] += int( - np.sqrt(series.shape[2]) * np.sqrt(series.shape[1]) - ) - # sqrt of series length divided by the number of series_transformers - elif method.lower() == "sqrt-div": - self._n_intervals[i] += int( - (np.sqrt(series.shape[2]) * np.sqrt(series.shape[1])) - / len(Xt) - ) - else: - raise ValueError( - "Invalid str input for n_intervals. Must be " - f'("sqrt","sqrt-div"). Found {method}' - ) - - # each series_transformer must have at least 1 interval extracted - for i, n in enumerate(self._n_intervals): - if n <= 0: - self._n_intervals[i] = 1 - - self.total_intervals_ = sum(self._n_intervals) - - # minimum interval length - if isinstance(self.min_interval_length, int): - self._min_interval_length = [self.min_interval_length] * len(Xt) - # min_interval_length must be at less than one if it is a float (proportion of - # of the series length) - elif ( - isinstance(self.min_interval_length, float) - and self.min_interval_length <= 1 - ): - self._min_interval_length = [ - int(self.min_interval_length * t.shape[2]) for t in Xt - ] - # if the input is a list, it must be the same length as the number of - # series_transformers - # list values must be ints or floats. The same checks as above are performed - elif isinstance(self.min_interval_length, (list, tuple)): - if len(self.min_interval_length) != len(Xt): - raise ValueError( - "min_interval_length as a list or tuple must be the same length " - "as series_transformers." - ) - - self._min_interval_length = [] - for i, length in enumerate(self.min_interval_length): - if isinstance(length, float) and length <= 1: - self._min_interval_length.append(int(length * Xt[i].shape[2])) - elif isinstance(length, int): - self._min_interval_length.append(length) - else: - raise ValueError( - "min_interval_length list items must be int or floats. " - f"Found {length}" - ) - # other inputs are invalid - else: - raise ValueError( - f"Invalid min_interval_length input. 
Found {self.min_interval_length}" - ) - - # min_interval_length cannot be less than 3 or greater than the series length - for i, n in enumerate(self._min_interval_length): - if n > Xt[i].shape[2]: - self._min_interval_length[i] = Xt[i].shape[2] - elif n < 3: - self._min_interval_length[i] = 3 - - # maximum interval length - if ( - isinstance(self.max_interval_length, int) - or self.max_interval_length == np.inf - ): - self._max_interval_length = [self.max_interval_length] * len(Xt) - # max_interval_length must be at less than one if it is a float (proportion of - # of the series length) - elif ( - isinstance(self.max_interval_length, float) - and self.max_interval_length <= 1 - ): - self._max_interval_length = [ - int(self.max_interval_length * t.shape[2]) for t in Xt - ] - # if the input is a list, it must be the same length as the number of - # series_transformers - # list values must be ints or floats. The same checks as above are performed - elif isinstance(self.max_interval_length, (list, tuple)): - if len(self.max_interval_length) != len(Xt): - raise ValueError( - "max_interval_length as a list or tuple must be the same length " - "as series_transformers." - ) - - self._max_interval_length = [] - for i, length in enumerate(self.max_interval_length): - if isinstance(length, float) and length <= 1: - self._max_interval_length.append(int(length * Xt[i].shape[2])) - elif isinstance(length, int): - self._max_interval_length.append(length) - else: - raise ValueError( - "max_interval_length list items must be int or floats. " - f"Found {length}" - ) - # other inputs are invalid - else: - raise ValueError( - f"Invalid max_interval_length input. Found {self.max_interval_length}" - ) - - # max_interval_length cannot be less than min_interval_length or greater than - # the series length - for i, n in enumerate(self._max_interval_length): - if n < self._min_interval_length[i]: - self._max_interval_length[i] = self._min_interval_length[i] - elif n > Xt[i].shape[2]: - self._max_interval_length[i] = Xt[i].shape[2] - - # we store whether each series_transformer contains a transformer and/or - # function in its interval_features - self._interval_transformer = [False] * len(Xt) - self._interval_function = [False] * len(Xt) - # single transformer or function for all series_transformers - if is_transformer(self.interval_features): - self._interval_transformer = [True] * len(Xt) - transformer = _clone_estimator(self.interval_features, random_state=rng) - self._interval_features = [[transformer]] * len(Xt) - elif callable(self.interval_features): - self._interval_function = [True] * len(Xt) - self._interval_features = [[self.interval_features]] * len(Xt) - elif isinstance(self.interval_features, (list, tuple)): - # if input is a list and only contains transformers or functions, use the - # list for all series in Xt - if all( - is_transformer(item) or callable(item) - for item in self.interval_features - ): - for feature in self.interval_features: - if is_transformer(feature): - self._interval_transformer[0] = True - elif callable(feature): - self._interval_function[0] = True - self._interval_features = [self.interval_features] * len(Xt) - # other lists must be the same length as Xt - elif len(self.interval_features) != len(Xt): - raise ValueError( - "interval_features as a list or tuple containing other lists or " - "tuples must be the same length as series_transformers." 
- ) - # list items can be a list of items or a single item for each - # series_transformer, but each individual item must be a transformer - # or function - else: - self._interval_features = [] - for i, feature in enumerate(self.interval_features): - if isinstance(feature, (list, tuple)): - for method in feature: - if is_transformer(method): - self._interval_transformer[i] = True - feature = _clone_estimator(feature, random_state=rng) - elif callable(method): - self._interval_function[i] = True - else: - raise ValueError( - "Individual items in a interval_features list or " - "tuple must be a transformer or function. Input " - f"{feature} does not contain only transformers and " - f"functions." - ) - self._interval_features.append(feature) - elif is_transformer(feature): - self._interval_transformer[i] = True - feature = _clone_estimator(feature, random_state=rng) - self._interval_features.append([feature]) - elif callable(feature): - self._interval_function[i] = True - self._interval_features.append([feature]) - else: - raise ValueError( - "Individual items in a interval_features list or tuple " - f"must be a transformer or function. Found {feature}" - ) - # use basic summary stats by default if None - elif self.interval_features is None: - self._interval_function = [True] * len(Xt) - self._interval_features = [[row_mean, row_std, row_slope]] * len(Xt) - # other inputs are invalid - else: - raise ValueError( - f"Invalid interval_features input. Found {self.interval_features}" - ) - - # att_subsample_size must be at least one if it is an int - if isinstance(self.att_subsample_size, int): - if self.att_subsample_size < 1: - raise ValueError( - "att_subsample_size must be at least one if it is an int." - ) - - self._att_subsample_size = [self.att_subsample_size] * len(Xt) - # att_subsample_size must be at less than one if it is a float (proportion of - # total attributed to subsample) - elif isinstance(self.att_subsample_size, float): - if self.att_subsample_size > 1 or self.att_subsample_size <= 0: - raise ValueError( - "att_subsample_size must be between 0 and 1 if it is a float." - ) - - self._att_subsample_size = [self.att_subsample_size] * len(Xt) - # default is no attribute subsampling with None - elif self.att_subsample_size is None: - self._att_subsample_size = [self.att_subsample_size] * len(Xt) - # if the input is a list, it must be the same length as the number of - # series_transformers - # list values must be ints, floats or None. The same checks as above are - # performed - elif isinstance(self.att_subsample_size, (list, tuple)): - if len(self.att_subsample_size) != len(Xt): - raise ValueError( - "att_subsample_size as a list or tuple must be the same length as " - "series_transformers." - ) - - self._att_subsample_size = [] - for ssize in self.att_subsample_size: - if isinstance(ssize, int): - if ssize < 1: - raise ValueError( - "att_subsample_size in list must be at least one if it is " - "an int." - ) - - self._att_subsample_size.append(ssize) - elif isinstance(ssize, float): - if ssize > 1: - raise ValueError( - "att_subsample_size in list must be between 0 and 1 if it " - "is a " - "float." - ) - - self._att_subsample_size.append(ssize) - elif ssize is None: - self._att_subsample_size.append(ssize) - else: - raise ValueError( - "Invalid interval_features input in list. Found " - f"{self.att_subsample_size}" - ) - # other inputs are invalid - else: - raise ValueError( - f"Invalid interval_features input. 
Found {self.att_subsample_size}" - ) - - # if we are subsampling attributes for a series_transformer and it uses a - # BaseTransformer, we must ensure it has the required parameters and - # attributes to do so - self._transformer_feature_selection = [[]] * len(Xt) - self._transformer_feature_names = [[]] * len(Xt) - for r, att_subsample in enumerate(self._att_subsample_size): - if att_subsample is not None: - for transformer in self._interval_features[r]: - if is_transformer(transformer): - params = inspect.signature(transformer.__init__).parameters - - # the transformer must have a parameter with one of the - # names listed in transformer_feature_selection as a way to - # select which features the transformer should transform - has_params = False - for n in self.transformer_feature_selection: - if params.get(n, None) is not None: - has_params = True - self._transformer_feature_selection[r].append(n) - break - - if not has_params: - raise ValueError( - "All transformers in interval_features must have a " - "parameter named in transformer_feature_selection to " - "be used in attribute subsampling." - ) - - # the transformer must have an attribute with one of the - # names listed in transformer_feature_names as a list or tuple - # of valid options for the previous parameter - has_feature_names = False - for n in self.transformer_feature_names: - if hasattr(transformer, n) and isinstance( - getattr(transformer, n), (list, tuple) - ): - has_feature_names = True - self._transformer_feature_names[r].append(n) - break - - if not has_feature_names: - raise ValueError( - "All transformers in interval_features must have an " - "attribute or propertynamed in " - "transformer_feature_names to be used in attribute " - "subsampling." - ) - - # verify the interval_selection_method is a valid string - if isinstance(self.interval_selection_method, str): - # SupervisedIntervals cannot currently handle transformers or regression - if ( - self.interval_selection_method.lower() == "supervised" - or self.interval_selection_method.lower() == "random-supervised" - ): - if any(self._interval_transformer): - raise ValueError( - "Supervised interval_selection_method must only have function " - "inputs for interval_features." - ) - - if is_regressor(self): - raise ValueError( - "Supervised interval_selection_method cannot be used for " - "regression." - ) - # RandomIntervals - elif not self.interval_selection_method.lower() == "random": - raise ValueError( - 'Unknown interval_selection_method, must be one of ("random",' - '"supervised","random-supervised"). ' - f"Found: {self.interval_selection_method}" - ) - # other inputs are invalid - else: - raise ValueError( - 'Unknown interval_selection_method, must be one of ("random",' - '"supervised","random-supervised"). ' - f"Found: {self.interval_selection_method}" - ) - - # verify replace_nan is a valid string, number or None - if ( - (not isinstance(self.replace_nan, str) or self.replace_nan.lower() != "nan") - and not isinstance(self.replace_nan, (int, float)) - and self.replace_nan is not None - ): - raise ValueError(f"Invalid replace_nan input. 
Found {self.replace_nan}") - - self._n_jobs = check_n_jobs(self.n_jobs) - - if self.time_limit_in_minutes is not None and self.time_limit_in_minutes > 0: - time_limit = self.time_limit_in_minutes * 60 - start_time = time.time() - train_time = 0 - - self._n_estimators = 0 - self.estimators_ = [] - self.intervals_ = [] - self.transformed_data_ = [] - - while ( - train_time < time_limit - and self._n_estimators < self.contract_max_n_estimators - ): - fit = Parallel( - n_jobs=self._n_jobs, - backend=self.parallel_backend, - prefer="threads", - )( - delayed(self._fit_estimator)( - Xt, - y, - rng.randint(np.iinfo(np.int32).max), - ) - for _ in range(self._n_jobs) - ) - - ( - estimators, - intervals, - transformed_data, - ) = zip(*fit) - - self.estimators_ += estimators - self.intervals_ += intervals - self.transformed_data_ += transformed_data - - self._n_estimators += self._n_jobs - train_time = time.time() - start_time - else: - self._n_estimators = self.n_estimators - - fit = Parallel( - n_jobs=self._n_jobs, - backend=self.parallel_backend, - prefer="threads", - )( - delayed(self._fit_estimator)( - Xt, - y, - rng.randint(np.iinfo(np.int32).max), - ) - for _ in range(self._n_estimators) - ) - - ( - self.estimators_, - self.intervals_, - self.transformed_data_, - ) = zip(*fit) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted target labels. - """ - if is_regressor(self): - check_is_fitted(self) - - Xt = self._predict_setup(X) - - y_preds = Parallel( - n_jobs=self._n_jobs, - backend=self.parallel_backend, - prefer="threads", - )( - delayed(self._predict_for_estimator)( - Xt, - self.estimators_[i], - self.intervals_[i], - predict_proba=False, - ) - for i in range(self._n_estimators) - ) - - return np.mean(y_preds, axis=0) - else: - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat( - list(self.class_dictionary_.keys()), X.shape[0], axis=0 - ) - - return np.array( - [self.classes_[int(np.argmax(prob))] for prob in self._predict_proba(X)] - ) - - def _predict_proba(self, X): - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - Xt = self._predict_setup(X) - - y_probas = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._predict_for_estimator)( - Xt, - self.estimators_[i], - self.intervals_[i], - predict_proba=True, - ) - for i in range(self._n_estimators) - ) - - output = np.sum(y_probas, axis=0) / ( - np.ones(self.n_classes_) * self._n_estimators - ) - return output - - def _fit_estimator(self, Xt, y, seed): - # random state for this estimator - rng = check_random_state(seed) - - intervals = [] - transform_data_lengths = [] - interval_features = np.empty((self.n_instances_, 0)) - - # for each transformed series - for r in range(len(Xt)): - # subsample attributes if enabled - if self._att_subsample_size[r] is not None: - # separate transformers and functions in separate lists - # add the feature names of transformers to a list to subsample from - # and calculate the total number of features - all_transformers = [] - all_transformer_features = [] - all_function_features = [] - for feature in self._interval_features[r]: - if 
is_transformer(feature): - all_transformer_features += getattr( - feature, - self._transformer_feature_names[r][len(all_transformers)], - ) - all_transformers.append(feature) - else: - all_function_features.append(feature) - - # handle float subsample size - num_features = len(all_transformer_features) + len( - all_function_features - ) - att_subsample_size = self._att_subsample_size[r] - if isinstance(self._att_subsample_size[r], float): - att_subsample_size = int(att_subsample_size * num_features) - - # if the att_subsample_size is greater than the number of features - # give a warning and add all features - features = [] - if att_subsample_size < num_features: - # subsample the transformer and function features by index - atts = rng.choice( - num_features, - att_subsample_size, - replace=False, - ) - atts.sort() - - # subsample the feature transformers using the - # transformer_feature_names and transformer_feature_selection - # attributes. - # the presence of valid attributes is verified in fit. - count = 0 - length = 0 - for n, transformer in enumerate(all_transformers): - this_len = len( - getattr(transformer, self._transformer_feature_names[r][n]) - ) - length += this_len - - # subsample feature names from this transformer - t_features = [] - while count < len(atts) and atts[count] < length: - t_features.append( - getattr( - transformer, - self._transformer_feature_names[r][n], - )[atts[count] + this_len - length] - ) - count += 1 - - # tell this transformer to only transform the selected features - if len(t_features) > 0: - new_transformer = _clone_estimator(transformer, seed) - setattr( - new_transformer, - self._transformer_feature_selection[r][n], - t_features, - ) - features.append(new_transformer) - - # subsample the remaining function features - for i in range(att_subsample_size - count): - features.append(all_function_features[atts[count + i] - length]) - else: - warnings.warn( - f"Attribute subsample size {att_subsample_size} is larger than " - f"or equal to the number of attributes {num_features} for " - f"series {self._series_transformers[r]}", - stacklevel=2, - ) - for feature in self._interval_features[r]: - if is_transformer(feature): - features.append(_clone_estimator(feature, seed)) - else: - features.append(feature) - # add all features while cloning estimators if not subsampling - else: - features = [] - for feature in self._interval_features[r]: - if is_transformer(feature): - features.append(_clone_estimator(feature, seed)) - else: - features.append(feature) - - # create the selected interval selector and set its parameters - if self.interval_selection_method == "random": - selector = RandomIntervalTransformer( - n_intervals=self._n_intervals[r], - min_interval_length=self._min_interval_length[r], - max_interval_length=self._max_interval_length[r], - features=features, - random_state=seed, - ) - elif self.interval_selection_method == "supervised": - selector = SupervisedIntervalTransformer( - n_intervals=self._n_intervals[r], - min_interval_length=self._min_interval_length[r], - features=features, - randomised_split_point=False, - random_state=seed, - ) - elif self.interval_selection_method == "random-supervised": - selector = SupervisedIntervalTransformer( - n_intervals=self._n_intervals[r], - min_interval_length=self._min_interval_length[r], - features=features, - randomised_split_point=True, - random_state=seed, - ) - - # fit the interval selector, transform the current series using it and save - # the transformer - intervals.append(selector) - f = 
intervals[r].fit_transform(Xt[r], y) - - # concatenate the data and save this transforms number of attributes - transform_data_lengths.append(f.shape[1]) - interval_features = np.hstack((interval_features, f)) - - if isinstance(self.replace_nan, str) and self.replace_nan.lower() == "nan": - interval_features = np.nan_to_num( - interval_features, False, np.nan, np.nan, np.nan - ) - elif isinstance(self.replace_nan, (int, float)): - interval_features = np.nan_to_num( - interval_features, - False, - self.replace_nan, - self.replace_nan, - self.replace_nan, - ) - - # clone and fit the base estimator using the transformed data - tree = _clone_estimator(self._base_estimator, random_state=seed) - tree.fit(interval_features, y) - - # find the features used in the tree and inform the interval selectors to not - # transform these features if possible - self._efficient_predictions = True - relevant_features = None - if isinstance(tree, BaseDecisionTree): - relevant_features = np.unique(tree.tree_.feature[tree.tree_.feature >= 0]) - elif isinstance(tree, CITClassifier): - relevant_features, _ = tree.tree_node_splits_and_gain() - - if relevant_features is not None: - features_to_transform = [False] * interval_features.shape[1] - for i in relevant_features: - features_to_transform[i] = True - - count = 0 - for r in range(len(Xt)): - intervals[r].transformer_feature_skip = self.transformer_feature_skip - - # if the transformers don't have valid attributes to skip False is - # returned - completed = intervals[r].set_features_to_transform( - features_to_transform[count : count + transform_data_lengths[r]], - raise_error=False, - ) - count += transform_data_lengths[r] - - if not completed: - self._efficient_predictions = False - else: - self._efficient_predictions = False - - return [ - tree, - intervals, - interval_features if self.save_transformed_data else None, - ] - - def _predict_setup(self, X): - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - n_instances, n_channels, n_timepoints = X.shape - - if n_channels != self.n_channels_: - raise ValueError( - "The number of channels in the train data does not match the number " - "of channels in the test data" - ) - if n_timepoints != self.n_timepoints_: - raise ValueError( - "The series length of the train data does not match the series length " - "of the test data" - ) - - Xt = [] - for transformer in self._series_transformers: - if transformer is None: - Xt.append(X) - elif is_transformer(transformer): - Xt.append(transformer.transform(X)) - - return Xt - - def _predict_for_estimator(self, Xt, estimator, intervals, predict_proba=False): - interval_features = np.empty((Xt[0].shape[0], 0)) - - for r in range(len(Xt)): - f = intervals[r].transform(Xt[r]) - interval_features = np.hstack((interval_features, f)) - - if isinstance(self.replace_nan, str) and self.replace_nan.lower() == "nan": - interval_features = np.nan_to_num( - interval_features, False, np.nan, np.nan, np.nan - ) - elif isinstance(self.replace_nan, (int, float)): - interval_features = np.nan_to_num( - interval_features, - False, - self.replace_nan, - self.replace_nan, - self.replace_nan, - ) - - if predict_proba: - return estimator.predict_proba(interval_features) - else: - return estimator.predict(interval_features) diff --git a/tsml/interval_based/_interval_forest.py b/tsml/interval_based/_interval_forest.py deleted file mode 100644 index 32d3c7f..0000000 --- a/tsml/interval_based/_interval_forest.py +++ /dev/null @@ -1,454 +0,0 @@ -"""Configurable interval forest 
estimators.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["IntervalForestClassifier", "IntervalForestRegressor"] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin - -from tsml.interval_based._base import BaseIntervalForest - - -class IntervalForestClassifier(ClassifierMixin, BaseIntervalForest): - """Configurable interval extracting forest classifier. - - Extracts multiple phase-dependent intervals from time series data and builds a - base classifier on summary statistic extracted from each interval. Forms and - ensemble of these classifiers. - - Allows the implementation of classifiers along the lines of [1][2][3] - which extract intervals and create an ensemble from the subsequent features. - - By default, uses a configuration similar to TimeSeriesFroest [1]. - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - interval_selection_method : "random", "supervised" or "random-supervised", - default="random" - The interval selection transformer to use. - - "random" uses a RandomIntervalTransformer. - - "supervised" uses a SupervisedIntervalTransformer. - - "random-supervised" uses a SupervisedIntervalTransformer with - randomised elements. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different number of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - - While random interval extraction will extract the n_intervals intervals total - (removing duplicates), supervised intervals will run the supervised extraction - process n_intervals times, returning more intervals than specified. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - - Ignored for supervised interval_selection_method inputs. 
- interval_features : TransformerMixin, callable, list, tuple, or None, default=None - The features to extract from the intervals using transformers or callable - functions. If None, use the mean, standard deviation, and slope of the series. - - Both transformers and functions should be able to take a 2D np.ndarray input. - Functions should output a 1d array (the feature for each series), and - transformers should output a 2d array where rows are the features for each - series. A list or tuple of transformers and/or functions will extract all - features and concatenate the output. - - Different features for each series_transformers series can be specified using a - nested list or tuple. Any list or tuple input containing another list or tuple - must be the same length as the number of series_transformers. - series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals. If None, - use the series as is. - - A list or tuple of transformers will extract intervals from - all transformations and concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all. - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - replace_nan : "nan", int, float or None, default=None - The value to replace NaNs and infinite values with before fitting the base - estimator. int or float input will replace with the specified value, while - "nan" will replace infinite values with NaNs. If None, do not replace NaNs. - time_limit_in_minutes : int or None, default=None - Time contract to limit build time in minutes, overriding n_estimators. - Default of None means n_estimators is used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class.
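The contracting parameters documented above trade a fixed ensemble size for a build-time budget; a hedged usage sketch (dataset shapes are illustrative only):

from tsml.interval_based import IntervalForestClassifier
from tsml.utils.testing import generate_3d_test_data

X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0)

# Build for roughly one minute rather than a fixed n_estimators, stopping
# early if contract_max_n_estimators members have been added.
clf = IntervalForestClassifier(
    time_limit_in_minutes=1,
    contract_max_n_estimators=100,
    random_state=0,
).fit(X, y)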
- class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - References - ---------- - .. [1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for - classification and feature extraction", Information Sciences, 239, 2013 - .. [2] Matthew Middlehurst and James Large and Anthony Bagnall. "The Canonical - Interval Forest (CIF) Classifier for Time Series Classification." - IEEE International Conference on Big Data 2020 - .. [3] Cabello, Nestor, et al. "Fast and Accurate Time Series Classification - Through Supervised Interval Search." IEEE ICDM 2020 - - Examples - -------- - >>> from tsml.interval_based import IntervalForestClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> clf = IntervalForestClassifier(n_estimators=10, random_state=0) - >>> clf.fit(X, y) - IntervalForestClassifier(...) - >>> clf.predict(X) - array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - interval_selection_method="random", - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - interval_features=None, - series_transformers=None, - att_subsample_size=None, - replace_nan=None, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method=interval_selection_method, - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=series_transformers, - att_subsample_size=att_subsample_size, - replace_nan=replace_nan, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - def predict_proba(self, X): - return self._predict_proba(X) - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - } - - -class IntervalForestRegressor(RegressorMixin, BaseIntervalForest): - """Configurable interval extracting forest regressor. 
- - Extracts multiple phase-dependent intervals from time series data and builds a - base regressor on summary statistics extracted from each interval. Forms an - ensemble of these regressors. - - Allows the implementation of regressors along the lines of [1][2][3] - which extract intervals and create an ensemble from the subsequent features. - - By default, uses a configuration similar to TimeSeriesForest [1]. - - Parameters - ---------- - base_estimator : BaseEstimator or None, default=None - scikit-learn BaseEstimator used to build the interval ensemble. If None, use a - simple decision tree. - n_estimators : int, default=200 - Number of estimators to build for the ensemble. - interval_selection_method : "random", default="random" - The interval selection transformer to use. - - "random" uses a RandomIntervalTransformer. - n_intervals : int, str, list or tuple, default="sqrt" - Number of intervals to extract per tree for each series_transformers series. - - An int input will extract that number of intervals from the series, while a str - input will return a function of the series length (may differ per - series_transformers output) to extract that number of intervals. - Valid str inputs are: - - "sqrt": square root of the series length. - - "sqrt-div": sqrt of series length divided by the number - of series_transformers. - - A list or tuple of ints and/or strs will extract the number of intervals using - the above rules and sum the results for the final n_intervals, e.g. [4, "sqrt"] - will extract sqrt(n_timepoints) + 4 intervals. - - Different numbers of intervals for each series_transformers series can be - specified using a nested list or tuple. Any list or tuple input containing - another list or tuple must be the same length as the number of - series_transformers. - min_interval_length : int, float, list, or tuple, default=3 - Minimum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the minimum interval length. - - Different minimum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - max_interval_length : int, float, list, or tuple, default=np.inf - Maximum length of intervals to extract from series. float inputs take a - proportion of the series length to use as the maximum interval length. - - Different maximum interval lengths for each series_transformers series can be - specified using a list or tuple. Any list or tuple input must be the same length - as the number of series_transformers. - interval_features : TransformerMixin, callable, list, tuple, or None, default=None - The features to extract from the intervals using transformers or callable - functions. If None, use the mean, standard deviation, and slope of the series. - - Both transformers and functions should be able to take a 2D np.ndarray input. - Functions should output a 1d array (the feature for each series), and - transformers should output a 2d array where rows are the features for each - series. A list or tuple of transformers and/or functions will extract all - features and concatenate the output. - - Different features for each series_transformers series can be specified using a - nested list or tuple. Any list or tuple input containing another list or tuple - must be the same length as the number of series_transformers.
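As a concrete instance of the interval_features contract just described, a minimal sketch of a custom feature function (the function below is illustrative, not part of the removed module):

import numpy as np

def interval_range(X):
    # X is a 2D array of shape (n_instances, interval_length); a 1d array
    # with one feature value per series is returned, as documented above.
    return np.max(X, axis=1) - np.min(X, axis=1)

# e.g. IntervalForestRegressor(interval_features=[interval_range])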
- series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals. If None, - use the series as is. - - A list or tuple of transformers will extract intervals from - all transformations and concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. - att_subsample_size : int, float, list, tuple or None, default=None - The number of attributes to subsample for each estimator. If None, use all. - - If int, use that number of attributes for all estimators. If float, use that - proportion of attributes for all estimators. - - Different subsample sizes for each series_transformers series can be specified - using a list or tuple. Any list or tuple input must be the same length as the - number of series_transformers. - replace_nan : "nan", int, float or None, default=None - The value to replace NaNs and infinite values with before fitting the base - estimator. int or float input will replace with the specified value, while - "nan" will replace infinite values with NaNs. If None, do not replace NaNs. - time_limit_in_minutes : int or None, default=None - Time contract to limit build time in minutes, overriding n_estimators. - Default of None means n_estimators is used. - contract_max_n_estimators : int, default=500 - Max number of estimators when time_limit_in_minutes is set. - save_transformed_data : bool, default=False - Save the data transformed in fit. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `predict`. - ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - total_intervals_ : int - Total number of intervals per tree from all representations. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - intervals_ : list of shape (n_estimators) of TransformerMixin - Stores the interval extraction transformer for all estimators. - transformed_data_ : list of shape (n_estimators) of ndarray with shape - (n_instances_ ,total_intervals * att_subsample_size) - The transformed dataset for all estimators. Only saved when - save_transformed_data is true. - - References - ---------- - .. [1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for - classification and feature extraction", Information Sciences, 239, 2013 - .. [2] Matthew Middlehurst and James Large and Anthony Bagnall. "The Canonical - Interval Forest (CIF) Classifier for Time Series Classification." - IEEE International Conference on Big Data 2020 - .. [3] Cabello, Nestor, et al. "Fast and Accurate Time Series Classification - Through Supervised Interval Search."
IEEE ICDM 2020 - - Examples - -------- - >>> from tsml.interval_based import IntervalForestRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, - ... regression_target=True, random_state=0) - >>> reg = IntervalForestRegressor(n_estimators=10, random_state=0) - >>> reg.fit(X, y) - IntervalForestRegressor(...) - >>> reg.predict(X) - array([0.7252543 , 1.50132442, 0.95608366, 1.64399016, 0.42385504, - 0.60639322, 1.01919317, 1.30157483, 1.66017354, 0.2900776 ]) - """ - - def __init__( - self, - base_estimator=None, - n_estimators=200, - interval_selection_method="random", - n_intervals="sqrt", - min_interval_length=3, - max_interval_length=np.inf, - interval_features=None, - series_transformers=None, - att_subsample_size=None, - replace_nan=None, - time_limit_in_minutes=None, - contract_max_n_estimators=500, - save_transformed_data=False, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - super().__init__( - base_estimator=base_estimator, - n_estimators=n_estimators, - interval_selection_method=interval_selection_method, - n_intervals=n_intervals, - min_interval_length=min_interval_length, - max_interval_length=max_interval_length, - interval_features=interval_features, - series_transformers=series_transformers, - att_subsample_size=att_subsample_size, - replace_nan=replace_nan, - time_limit_in_minutes=time_limit_in_minutes, - contract_max_n_estimators=contract_max_n_estimators, - save_transformed_data=save_transformed_data, - random_state=random_state, - n_jobs=n_jobs, - parallel_backend=parallel_backend, - ) - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "n_estimators": 2, - "n_intervals": 2, - } diff --git a/tsml/interval_based/_interval_pipelines.py b/tsml/interval_based/_interval_pipelines.py deleted file mode 100644 index 3606758..0000000 --- a/tsml/interval_based/_interval_pipelines.py +++ /dev/null @@ -1,820 +0,0 @@ -"""Interval Extraction Pipeline Estimators. - -Pipeline estimators using summary statistics extracted from random or supervised - intervals and an estimator. -""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = [ - "RandomIntervalClassifier", - "RandomIntervalRegressor", - "SupervisedIntervalClassifier", -] - -from typing import List, Union - -import numpy as np -from sklearn.base import ClassifierMixin, RegressorMixin -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.ensemble._base import _set_random_states -from sklearn.utils.validation import check_is_fitted, check_random_state - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.transformations._interval_extraction import ( - RandomIntervalTransformer, - SupervisedIntervalTransformer, -) -from tsml.utils.validation import check_n_jobs - - -class RandomIntervalClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """Random Interval Classifier. - - Extracts multiple intervals with random length, position and dimension from series - and concatenates them into a feature vector.
Builds an estimator on the - transformed data. - - Parameters - ---------- - n_intervals : int or callable, default=100 - The number of intervals of random length, position and dimension to be - extracted. Input should be an int or a function that takes a 3D np.ndarray - input and returns an int. - min_interval_length : int, default=3 - The minimum length of extracted intervals. Minimum value of 3. - max_interval_length : int, default=np.inf - The maximum length of extracted intervals. Minimum value of min_interval_length. - features : TransformerMixin, a function taking a 2d numpy array parameter, or list - of said transformers and functions, default=None - Transformers and functions used to extract features from selected intervals. - If None, defaults to [mean, median, min, max, std, 25% quantile, 75% quantile]. - series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals. If None, - use the series as is. - - A list or tuple of transformers will extract intervals from - all transformations and concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. - dilation : int, list or None, default=None - Add dilation to extracted intervals. No dilation is added if None or 1. If a - list of ints, a random dilation value is selected from the list for each - interval. - - The dilation value is selected after the interval start and end points. If the - number of values in the dilated interval is less than the min_interval_length, - the amount of dilation applied is reduced. - estimator : sklearn classifier, optional, default=None - An sklearn estimator to be built using the transformed data. - Defaults to sklearn RandomForestClassifier(n_estimators=200) - random_state : None, int or instance of RandomState, default=None - Seed or RandomState object used for random number generation. - If random_state is None, use the RandomState singleton used by np.random. - If random_state is an int, use a new RandomState instance seeded with seed. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `transform` functions. - `-1` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - - See Also - -------- - RandomIntervalTransformer - RandomIntervalRegressor - SupervisedIntervalClassifier - - Examples - -------- - >>> from tsml.interval_based import RandomIntervalClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = RandomIntervalClassifier(random_state=0) - >>> clf.fit(X, y) - RandomIntervalClassifier(...)
- >>> clf.predict(X) - array([0, 1, 1, 0, 0, 1, 0, 1]) - """ - - def __init__( - self, - n_intervals=100, - min_interval_length=3, - max_interval_length=np.inf, - features=None, - series_transformers=None, - dilation=None, - estimator=None, - n_jobs=1, - random_state=None, - parallel_backend=None, - ): - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.max_interval_length = max_interval_length - self.features = features - self.series_transformers = series_transformers - self.dilation = dilation - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=3 - ) - X = self._convert_X(X) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - rng = check_random_state(self.random_state) - - if isinstance(self.series_transformers, (list, tuple)): - self._series_transformers = [ - None if st is None else _clone_estimator(st, random_state=rng) - for st in self.series_transformers - ] - else: - self._series_transformers = [ - ( - None - if self.series_transformers is None - else _clone_estimator(self.series_transformers, random_state=rng) - ) - ] - - X_t = np.empty((X.shape[0], 0)) - self._transformers = [] - for st in self._series_transformers: - if st is not None: - s = st.fit_transform(X, y) - else: - s = X - - ct = RandomIntervalTransformer( - n_intervals=self.n_intervals, - min_interval_length=self.min_interval_length, - max_interval_length=self.max_interval_length, - features=self.features, - dilation=self.dilation, - n_jobs=self._n_jobs, - parallel_backend=self.parallel_backend, - ) - _set_random_states(ct, rng) - self._transformers.append(ct) - t = ct.fit_transform(s, y) - - X_t = np.hstack((X_t, t)) - - self._estimator = _clone_estimator( - ( - RandomForestClassifier(n_estimators=200) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. 
- """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - X = self._convert_X(X) - - X_t = np.empty((X.shape[0], 0)) - for i, st in enumerate(self._series_transformers): - if st is not None: - s = st.transform(X) - else: - s = X - - t = self._transformers[i].transform(s) - X_t = np.hstack((X_t, t)) - - return self._estimator.predict(X_t) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - X = self._convert_X(X) - - X_t = np.empty((X.shape[0], 0)) - for i, st in enumerate(self._series_transformers): - if st is not None: - s = st.transform(X) - else: - s = X - - t = self._transformers[i].transform(s) - X_t = np.hstack((X_t, t)) - - m = getattr(self._estimator, "predict_proba", None) - if callable(m): - return self._estimator.predict_proba(X_t) - else: - dists = np.zeros((X.shape[0], self.n_classes_)) - preds = self._estimator.predict(X_t) - for i in range(0, X.shape[0]): - dists[i, self.class_dictionary_[preds[i]]] = 1 - return dists - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - from tsml.utils.numba_functions.stats import row_mean, row_numba_min - - return { - "n_intervals": 2, - "estimator": RandomForestClassifier(n_estimators=2), - "features": [row_mean, row_numba_min], - } - - -class RandomIntervalRegressor(RegressorMixin, BaseTimeSeriesEstimator): - """Random Interval Regressor. - - Extracts multiple intervals with random length, position and dimension from series - and concatenates them into a feature vector. Builds an estimator on the - transformed data. - - Parameters - ---------- - n_intervals : int or callable, default=100, - The number of intervals of random length, position and dimension to be - extracted. Input should be an int or a function that takes a 3D np.ndarray - input and returns an int. - min_interval_length : int, default=3 - The minimum length of extracted intervals. Minimum value of 3. - max_interval_length : int, default=3 - The maximum length of extracted intervals. Minimum value of min_interval_length. - features : TransformerMixin, a function taking a 2d numpy array parameter, or list - of said transformers and functions, default=None - Transformers and functions used to extract features from selected intervals. 
- If None, defaults to [mean, median, min, max, std, 25% quantile, 75% quantile]. - series_transformers : TransformerMixin, list, tuple, or None, default=None - The transformers to apply to the series before extracting intervals. If None, - use the series as is. - - A list or tuple of transformers will extract intervals from - all transformations and concatenate the output. Including None in the list or tuple - will use the series as is for interval extraction. - dilation : int, list or None, default=None - Add dilation to extracted intervals. No dilation is added if None or 1. If a - list of ints, a random dilation value is selected from the list for each - interval. - - The dilation value is selected after the interval start and end points. If the - number of values in the dilated interval is less than the min_interval_length, - the amount of dilation applied is reduced. - estimator : sklearn regressor, optional, default=None - An sklearn estimator to be built using the transformed data. - Defaults to sklearn RandomForestRegressor(n_estimators=200) - random_state : None, int or instance of RandomState, default=None - Seed or RandomState object used for random number generation. - If random_state is None, use the RandomState singleton used by np.random. - If random_state is an int, use a new RandomState instance seeded with seed. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `transform` functions. - `-1` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - - See Also - -------- - RandomIntervalTransformer - RandomIntervalClassifier - - Examples - -------- - >>> from tsml.interval_based import RandomIntervalRegressor - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, - ... regression_target=True, random_state=0) - >>> reg = RandomIntervalRegressor(random_state=0) - >>> reg.fit(X, y) - RandomIntervalRegressor(...) - >>> reg.predict(X) - array([0.44924979, 1.31424037, 1.11951504, 0.63780969, 0.58123516, - 1.17135463, 0.56450198, 1.10128837]) - """ - - def __init__( - self, - n_intervals=100, - min_interval_length=3, - max_interval_length=np.inf, - features=None, - series_transformers=None, - dilation=None, - estimator=None, - n_jobs=1, - random_state=None, - parallel_backend=None, - ): - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.max_interval_length = max_interval_length - self.features = features - self.series_transformers = series_transformers - self.dilation = dilation - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data.
- y : 1D np.ndarray of shape (n_instances) - The target labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=3 - ) - X = self._convert_X(X) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - - self._n_jobs = check_n_jobs(self.n_jobs) - rng = check_random_state(self.random_state) - - if isinstance(self.series_transformers, (list, tuple)): - self._series_transformers = [ - None if st is None else _clone_estimator(st, random_state=rng) - for st in self.series_transformers - ] - else: - self._series_transformers = [ - ( - None - if self.series_transformers is None - else _clone_estimator(self.series_transformers, random_state=rng) - ) - ] - - X_t = np.empty((X.shape[0], 0)) - self._transformers = [] - for st in self._series_transformers: - if st is not None: - s = st.fit_transform(X, y) - else: - s = X - - ct = RandomIntervalTransformer( - n_intervals=self.n_intervals, - min_interval_length=self.min_interval_length, - max_interval_length=self.max_interval_length, - features=self.features, - dilation=self.dilation, - n_jobs=self._n_jobs, - parallel_backend=self.parallel_backend, - ) - _set_random_states(ct, rng) - self._transformers.append(ct) - t = ct.fit_transform(s, y) - - X_t = np.hstack((X_t, t)) - - self._estimator = _clone_estimator( - ( - RandomForestRegressor(n_estimators=200) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted target labels. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - X = self._convert_X(X) - - X_t = np.empty((X.shape[0], 0)) - for i, st in enumerate(self._series_transformers): - if st is not None: - s = st.transform(X) - else: - s = X - - t = self._transformers[i].transform(s) - X_t = np.hstack((X_t, t)) - - return self._estimator.predict(X_t) - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - from tsml.utils.numba_functions.stats import row_mean, row_numba_min - - return { - "n_intervals": 3, - "estimator": RandomForestRegressor(n_estimators=2), - "features": [row_mean, row_numba_min], - } - - -class SupervisedIntervalClassifier(ClassifierMixin, BaseTimeSeriesEstimator): - """Supervised Interval Classifier. - - Extracts multiple intervals from series using a supervised process - and concatenates them into a feature vector. Builds an estimator on the - transformed data.
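The pipeline pattern shared by these classes, flattening interval summary statistics into one feature matrix and fitting a vector estimator on it, can be sketched as follows (a simplification using fixed intervals; the real transformers select intervals randomly or via the supervised search described below):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def toy_interval_features(X, intervals):
    # X: (n_instances, n_channels, n_timepoints). For each (channel, start, end)
    # triple, compute the mean and std of the slice and concatenate the
    # results into one feature vector per instance.
    feats = []
    for c, s, e in intervals:
        segment = X[:, c, s:e]
        feats.append(segment.mean(axis=1))
        feats.append(segment.std(axis=1))
    return np.stack(feats, axis=1)

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 1, 30))
y = rng.integers(0, 2, size=20)

X_t = toy_interval_features(X, [(0, 0, 10), (0, 10, 25)])
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X_t, y)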
- - Parameters - ---------- - n_intervals : int, default=50 - The number of times the supervised interval selection process is run. This - process will extract more than one interval per run. - Each supervised extraction will output a varying number of features based on - series length, number of dimensions and the number of features. - min_interval_length : int, default=3 - The minimum length of extracted intervals. Minimum value of 3. - features : callable, list of callables, default=None - Functions used to extract features from selected intervals. Must take a 2d - array of shape (n_instances, interval_length) and return a 1d array of shape - (n_instances) containing the features. - If None, defaults to the following statistics used in [2]: - [mean, median, std, slope, min, max, iqr, count_mean_crossing, - count_above_mean]. - metric : ["fisher"] or callable, default="fisher" - The metric used to evaluate the usefulness of a feature extracted on an - interval. If "fisher", the Fisher score is used. If a callable, it must take - a 1d array of shape (n_instances) and return a 1d array of scores of shape - (n_instances). - randomised_split_point : bool, default=True - If True, the split point for interval extraction is randomised as is done in [2] - rather than split in half. - normalise_for_search : bool, default=True - If True, the data is normalised for the supervised interval search process. - Features extracted for the transform output will not use normalised data. - estimator : sklearn classifier, optional, default=None - An sklearn estimator to be built using the transformed data. - Defaults to sklearn RandomForestClassifier(n_estimators=200) - random_state : None, int or instance of RandomState, default=None - Seed or RandomState object used for random number generation. - If random_state is None, use the RandomState singleton used by np.random. - If random_state is an int, use a new RandomState instance seeded with seed. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `transform` functions. - `-1` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_channels_ : int - The number of dimensions per case in the training set. - n_timepoints_ : int - The length of each series in the training set. - n_classes_ : int - Number of classes. Extracted from the data. - classes_ : ndarray of shape (n_classes_) - Holds the label for each class. - class_dictionary_ : dict - A dictionary mapping class labels to class indices in classes_. - - See Also - -------- - SupervisedIntervalTransformer - RandomIntervalClassifier - - Examples - -------- - >>> from tsml.interval_based import SupervisedIntervalClassifier - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) - >>> clf = SupervisedIntervalClassifier(random_state=0) - >>> clf.fit(X, y) - SupervisedIntervalClassifier(...)
- >>> clf.predict(X) - array([0, 1, 1, 0, 0, 1, 0, 1]) - """ - - def __init__( - self, - n_intervals=50, - min_interval_length=3, - features=None, - metric="fisher", - randomised_split_point=True, - normalise_for_search=True, - estimator=None, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.features = features - self.metric = metric - self.randomised_split_point = randomised_split_point - self.normalise_for_search = normalise_for_search - self.estimator = estimator - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=7 - ) - X = self._convert_X(X) - - self.n_instances_, self.n_channels_, self.n_timepoints_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - if self.n_classes_ == 1: - return self - - self._n_jobs = check_n_jobs(self.n_jobs) - - self._transformer = SupervisedIntervalTransformer( - n_intervals=self.n_intervals, - min_interval_length=self.min_interval_length, - features=self.features, - metric=self.metric, - randomised_split_point=self.randomised_split_point, - normalise_for_search=self.normalise_for_search, - random_state=self.random_state, - n_jobs=self.n_jobs, - parallel_backend=self.parallel_backend, - ) - - self._estimator = _clone_estimator( - ( - RandomForestClassifier(n_estimators=200) - if self.estimator is None - else self.estimator - ), - self.random_state, - ) - - m = getattr(self._estimator, "n_jobs", None) - if m is not None: - self._estimator.n_jobs = self._n_jobs - - X_t = self._transformer.fit_transform(X, y) - self._estimator.fit(X_t, y) - - return self - - def predict(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat(list(self.class_dictionary_.keys()), X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=7) - X = self._convert_X(X) - - return self._estimator.predict(self._transformer.transform(X)) - - def predict_proba(self, X: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray: - """Predicts label probabilities for sequences in X. - - Parameters - ---------- - X : 3D np.array of shape (n_instances, n_channels, n_timepoints) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_.
- """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=7) - X = self._convert_X(X) - - m = getattr(self._estimator, "predict_proba", None) - if callable(m): - return self._estimator.predict_proba(self._transformer.transform(X)) - else: - dists = np.zeros((X.shape[0], self.n_classes_)) - preds = self._estimator.predict(self._transformer.transform(X)) - for i in range(0, X.shape[0]): - dists[i, self.class_dictionary_[preds[i]]] = 1 - return dists - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - from tsml.utils.numba_functions.stats import row_mean, row_numba_min - - return { - "n_intervals": 1, - "estimator": RandomForestClassifier(n_estimators=2), - "features": [row_mean, row_numba_min], - } diff --git a/tsml/interval_based/tests/__init__.py b/tsml/interval_based/tests/__init__.py deleted file mode 100644 index e472d7d..0000000 --- a/tsml/interval_based/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Testing for interval-based base classes.""" diff --git a/tsml/interval_based/tests/test_interval_forest.py b/tsml/interval_based/tests/test_interval_forest.py deleted file mode 100644 index 3a92417..0000000 --- a/tsml/interval_based/tests/test_interval_forest.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Tests for the BaseIntervalForest class.""" - -import numpy as np -import pytest -from sklearn.pipeline import make_pipeline -from sklearn.tree import DecisionTreeClassifier - -from tsml.base import _clone_estimator -from tsml.interval_based import IntervalForestClassifier -from tsml.transformations import ( - AutocorrelationFunctionTransformer, - Catch22Transformer, - FunctionTransformer, - SevenNumberSummaryTransformer, -) -from tsml.utils.numba_functions.stats import row_mean, row_numba_min -from tsml.utils.testing import generate_3d_test_data -from tsml.utils.validation import _check_optional_dependency -from tsml.vector import CITClassifier - - -@pytest.mark.parametrize( - "base_estimator", - [DecisionTreeClassifier(), CITClassifier()], -) -def test_interval_forest_feature_skipping(base_estimator): - """Test BaseIntervalForest feature skipping with different base estimators.""" - X, y = generate_3d_test_data() - rs = np.random.randint(np.iinfo(np.int32).max) - - est = IntervalForestClassifier( - base_estimator=base_estimator, - n_estimators=2, - n_intervals=2, - random_state=rs, - ) - est.fit(X, y) - preds = est.predict(X) - - assert est._efficient_predictions is True - - est = IntervalForestClassifier( - base_estimator=make_pipeline(base_estimator), - n_estimators=2, - n_intervals=2, - random_state=rs, - ) - est.fit(X, y) - - assert est._efficient_predictions is False - assert (preds == est.predict(X)).all() - - -def test_interval_forest_invalid_feature_skipping(): - """Test BaseIntervalForest with an invalid transformer for feature skipping.""" - X, y = generate_3d_test_data() - - est = IntervalForestClassifier( - n_estimators=2, - n_intervals=2, - 
interval_features=SevenNumberSummaryTransformer(), - ) - est.fit(X, y) - - assert est._efficient_predictions is False - - -@pytest.mark.parametrize( - "interval_selection_method", - ["random", "supervised", "random-supervised"], -) -def test_interval_forest_selection_methods(interval_selection_method): - """Test BaseIntervalForest with different interval selection methods.""" - X, y = generate_3d_test_data() - - est = IntervalForestClassifier( - n_estimators=2, - n_intervals=2, - interval_selection_method=interval_selection_method, - ) - est.fit(X, y) - - assert est.predict_proba(X).shape == (10, 2) - - -@pytest.mark.parametrize( - "n_intervals,n_intervals_len", - [ - ("sqrt", 24), - ("sqrt-div", 12), - (["sqrt-div", 2], 24), - ([[1, 2], "sqrt-div"], 15), - ], -) -def test_interval_forest_n_intervals(n_intervals, n_intervals_len): - """Test BaseIntervalForest n_interval options.""" - X, y = generate_3d_test_data(series_length=20) - - est = IntervalForestClassifier( - n_estimators=2, - n_intervals=n_intervals, - series_transformers=[None, FunctionTransformer(np.log1p)], - save_transformed_data=True, - random_state=0, - ) - est.fit(X, y) - est.predict_proba(X) - - data = est.transformed_data_ - assert data[0].shape[1] == n_intervals_len - - -if _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False): - att_subsample_c22 = Catch22Transformer( - features=[ - "DN_HistogramMode_5", - "DN_HistogramMode_10", - "SB_BinaryStats_diff_longstretch0", - ] - ) -else: - att_subsample_c22 = SevenNumberSummaryTransformer() - - -@pytest.mark.skipif( - not _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False), - reason="pycatch22 not installed", -) -@pytest.mark.parametrize( - "features,output_len", - [ - (None, 3), - (_clone_estimator(att_subsample_c22), 3), - ([_clone_estimator(att_subsample_c22), _clone_estimator(att_subsample_c22)], 6), - ( - [ - row_mean, - _clone_estimator(att_subsample_c22), - row_numba_min, - ], - 4, - ), - ], -) -def test_interval_forest_attribute_subsample(features, output_len): - """Test BaseIntervalForest subsampling with different interval features.""" - X, y = generate_3d_test_data() - - est = IntervalForestClassifier( - n_estimators=2, - n_intervals=2, - att_subsample_size=0.5, - interval_features=features, - replace_nan=0, - save_transformed_data=True, - random_state=0, - ) - est.fit(X, y) - est.predict_proba(X) - - data = est.transformed_data_ - assert data[0].shape[1] == int(output_len * 0.5) * 2 - - -def test_interval_forest_invalid_attribute_subsample(): - """Test BaseIntervalForest with an invalid transformer for subsampling.""" - X, y = generate_3d_test_data() - - est = IntervalForestClassifier( - n_estimators=2, - n_intervals=2, - att_subsample_size=2, - interval_features=SevenNumberSummaryTransformer(), - ) - - with pytest.raises(ValueError): - est.fit(X, y) - - -@pytest.mark.parametrize( - "series_transformer", - [ - FunctionTransformer(np.log1p), - [None, FunctionTransformer(np.log1p)], - [FunctionTransformer(np.log1p), AutocorrelationFunctionTransformer(n_lags=6)], - ], -) -def test_interval_forest_series_transformer(series_transformer): - """Test BaseIntervalForest with different series transformers.""" - X, y = generate_3d_test_data() - - est = IntervalForestClassifier( - n_estimators=2, - n_intervals=2, - series_transformers=series_transformer, - save_transformed_data=True, - random_state=0, - ) - est.fit(X, y) - est.predict_proba(X) - - data = est.transformed_data_ - expected = ( - len(series_transformer) * 6 if 
isinstance(series_transformer, list) else 6 - ) - assert data[0].shape[1] == expected diff --git a/tsml/interval_based/tests/test_interval_pipelines.py b/tsml/interval_based/tests/test_interval_pipelines.py deleted file mode 100644 index e9bdb9c..0000000 --- a/tsml/interval_based/tests/test_interval_pipelines.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Tests for the interval pipeline classes.""" - -from tsml.interval_based import RandomIntervalClassifier -from tsml.transformations import FunctionTransformer -from tsml.utils.numba_functions.general import first_order_differences_3d -from tsml.utils.testing import generate_3d_test_data - - -def test_random_interval_callable(): - """Test RandomIntervalClassifier with a callable n_intervals.""" - X, y = generate_3d_test_data() - - def interval_func(X): - return int(X.shape[2] / 5) - - est = RandomIntervalClassifier( - n_intervals=interval_func, - ) - est.fit(X, y) - - assert est._transformers[0]._n_intervals == 2 - - -def test_random_interval_series_transform_callable(): - """Test RandomIntervalClassifier with a series transformer.""" - X, y = generate_3d_test_data() - - est = RandomIntervalClassifier( - n_intervals=2, - series_transformers=[ - None, - FunctionTransformer(func=first_order_differences_3d, validate=False), - ], - ) - est.fit(X, y) - est.predict_proba(X) - - assert len(est._transformers) == 2 diff --git a/tsml/transformations/__init__.py b/tsml/transformations/__init__.py index 5186259..0944e8a 100644 --- a/tsml/transformations/__init__.py +++ b/tsml/transformations/__init__.py @@ -1,29 +1,13 @@ """tsml transformations.""" __all__ = [ - "AutocorrelationFunctionTransformer", - "ARCoefficientTransformer", - "Catch22Transformer", "FPCATransformer", "FunctionTransformer", - "RandomIntervalTransformer", - "SupervisedIntervalTransformer", - # "FixedIntervalTransformer", - "PeriodogramTransformer", - # "QuantileTransformer", "SevenNumberSummaryTransformer", "TransformerConcatenator", ] -from tsml.transformations._acf import AutocorrelationFunctionTransformer -from tsml.transformations._ar_coefficient import ARCoefficientTransformer -from tsml.transformations._catch22 import Catch22Transformer from tsml.transformations._fpca import FPCATransformer from tsml.transformations._function_transformer import FunctionTransformer -from tsml.transformations._interval_extraction import ( - RandomIntervalTransformer, - SupervisedIntervalTransformer, -) -from tsml.transformations._periodogram import PeriodogramTransformer from tsml.transformations._summary_features import SevenNumberSummaryTransformer from tsml.transformations._transform_concatenator import TransformerConcatenator diff --git a/tsml/transformations/_acf.py b/tsml/transformations/_acf.py deleted file mode 100644 index a553ac0..0000000 --- a/tsml/transformations/_acf.py +++ /dev/null @@ -1,140 +0,0 @@ -"""Autocorrelation function transformer.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["AutocorrelationFunctionTransformer"] - -from typing import List, Union - -import numpy as np -from numba import njit -from sklearn.base import TransformerMixin - -from tsml.base import BaseTimeSeriesEstimator - - -class AutocorrelationFunctionTransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Autocorrelation function transformer. - - The autocorrelation function measures how correlated a timeseries is - with itself at different lags. The AutocorrelationFunctionTransformer returns - these values as a series for each lag up to the `n_lags` specified. 
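Per lag, the definition above amounts to the Pearson correlation between the series and a copy of itself shifted by that lag; a minimal numpy sketch (illustrative only, the removed implementation uses the Numba kernel shown below):

import numpy as np

def acf_1d(x, n_lags):
    # correlate x[:-lag] with x[lag:] for each lag in 1..n_lags
    return np.array(
        [np.corrcoef(x[:-lag], x[lag:])[0, 1] for lag in range(1, n_lags + 1)]
    )

x = np.sin(np.linspace(0, 6 * np.pi, 50))
acf_1d(x, 5)  # high positive values at small lags for a smooth sine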
- - Efficient implementation for collections using numba - - Parameters - ---------- - n_lags : int or callable, default=100 - The maximum number of autocorrelation terms to use. If callable, the - function should take a 3D numpy array of shape (n_instances, n_channels, - n_timepoints) and return an integer. - min_values : int, default=0 - Never use fewer than this number of terms to find a correlation unless the - series length is too short. This will reduce n_lags if needed. - - Examples - -------- - >>> from tsml.transformations import AutocorrelationFunctionTransformer - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, _ = generate_3d_test_data(n_samples=4, n_channels=2, series_length=20, - ... random_state=0) - >>> tnf = AutocorrelationFunctionTransformer(n_lags=10) - >>> tnf.fit(X) - AutocorrelationFunctionTransformer(...) - >>> print(tnf.transform(X)[0]) - [[ 0.10642255 -0.04497476 -0.27607675 -0.24169331 0.04717655 0.07221666 - -0.36798515 -0.53768553 0.07550288 0.08557519] - [-0.21166957 0.24992846 -0.38036068 0.10243325 -0.18565336 0.05619381 - -0.19569665 0.28835692 -0.42359509 0.21378191]] - """ - - def __init__( - self, - n_lags=100, - min_values=0, - ): - self.n_lags = n_lags - self.min_values = min_values - - super().__init__() - - def fit(self, X, y=None): - self._validate_data(X=X) - return self - - def transform(self, X, y=None): - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - n_instances, n_channels, n_timepoints = X.shape - - lags = self.n_lags(X) if callable(self.n_lags) else self.n_lags - if lags > n_timepoints - self.min_values: - lags = n_timepoints - self.min_values - if lags < 0: - lags = 1 - - if lags > n_timepoints - 1: - raise ValueError( - f"lags ({lags}) must be smaller than n_timepoints - 1 " - f"({n_timepoints - 1})." - ) - - Xt = np.zeros((n_instances, n_channels, lags)) - for n in range(n_channels): - Xt[:, n, :] = self._acf_2d(X[:, n, :], lags) - - return Xt - - def _more_tags(self) -> dict: - return {"requires_fit": False} - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. 
- """ - return { - "n_lags": 4, - } - - @staticmethod - @njit(cache=True, fastmath=True) - def _acf_2d(X, max_lag): - n_instances, length = X.shape - - X_t = np.zeros((n_instances, max_lag)) - for i, x in enumerate(X): - for lag in range(1, max_lag + 1): - lag_length = length - lag - x1, x2 = x[:-lag], x[lag:] - s1 = np.sum(x1) - s2 = np.sum(x2) - m1 = s1 / lag_length - m2 = s2 / lag_length - ss1 = np.sum(x1 * x1) - ss2 = np.sum(x2 * x2) - v1 = ss1 - s1 * m1 - v2 = ss2 - s2 * m2 - v1_is_zero, v2_is_zero = v1 <= 1e-9, v2 <= 1e-9 - if v1_is_zero and v2_is_zero: # Both zero variance, - # so must be 100% correlated - X_t[i][lag - 1] = 1 - elif v1_is_zero or v2_is_zero: # One zero variance - # the other not - X_t[i][lag - 1] = 0 - else: - X_t[i][lag - 1] = np.sum((x1 - m1) * (x2 - m2)) / np.sqrt(v1 * v2) - - return X_t diff --git a/tsml/transformations/_ar_coefficient.py b/tsml/transformations/_ar_coefficient.py deleted file mode 100644 index b798d3a..0000000 --- a/tsml/transformations/_ar_coefficient.py +++ /dev/null @@ -1,119 +0,0 @@ -"""AR coefficient feature transformer.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["ARCoefficientTransformer"] - -from typing import List, Union - -import numpy as np -from sklearn.base import TransformerMixin - -from tsml.base import BaseTimeSeriesEstimator -from tsml.utils.validation import _check_optional_dependency - - -class ARCoefficientTransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Autoreggression coefficient feature transformer. - - Coefficients of an autoregressive model using Burg's method. The Burg method - fits a forward-backward autoregressive model to the data using least squares - regression. - - Parameters - ---------- - order : int or callable, default=100 - The order of the autoregression. If callable, the function should take a 3D - numpy array of shape (n_instances, n_channels, n_timepoints) and return an - integer. - min_values : int, default=0 - Always transform at least this many values unless the series length is too - short. This will reduce order if needed. - replace_nan : bool, default=False - If True, replace NaNs in output with 0s. - - Examples - -------- - >>> from tsml.transformations import ARCoefficientTransformer - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, _ = generate_3d_test_data(n_samples=4, n_channels=2, series_length=20, - ... random_state=0) - >>> tnf = ARCoefficientTransformer(order=5) - >>> tnf.fit(X) - ARCoefficientTransformer(...) - >>> print(tnf.transform(X)[0]) - [[ 0.05445952 -0.02106654 -0.24989205 -0.19153596 0.08833235] - [-0.13034384 0.16255828 -0.27993791 -0.06842601 -0.01382752]] - """ - - def __init__( - self, - order=100, - min_values=0, - replace_nan=False, - ): - self.order = order - self.min_values = min_values - self.replace_nan = replace_nan - - _check_optional_dependency("statsmodels", "statsmodels", self) - - super().__init__() - - def fit(self, X, y=None): - self._validate_data(X=X) - return self - - def transform(self, X, y=None): - from statsmodels.regression.linear_model import burg - - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - n_instances, n_channels, n_timepoints = X.shape - - order = self.order(X) if callable(self.order) else self.order - if order > n_timepoints - self.min_values: - order = n_timepoints - self.min_values - if order < 0: - order = 1 - - if order > n_timepoints - 1: - raise ValueError( - f"order ({order}) must be smaller than n_timepoints - 1 " - f"({n_timepoints - 1})." 
- ) - - Xt = np.zeros((n_instances, n_channels, order)) - for i in range(n_instances): - for n in range(n_channels): - coefs, _ = burg(X[i, n], order=order) - Xt[i, n] = coefs - - if self.replace_nan: - Xt[np.isnan(Xt)] = 0 - - return Xt - - def _more_tags(self) -> dict: - return {"requires_fit": False, "optional_dependency": True} - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return { - "order": 4, - } diff --git a/tsml/transformations/_catch22.py b/tsml/transformations/_catch22.py deleted file mode 100644 index 932f4a8..0000000 --- a/tsml/transformations/_catch22.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Catch22 features. - -A transformer for the Catch22 features. -""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["Catch22Transformer"] - - -import numpy as np -from joblib import Parallel -from sklearn.base import TransformerMixin -from sklearn.utils.parallel import delayed - -from tsml.base import BaseTimeSeriesEstimator -from tsml.utils.numba_functions.general import z_normalise_series -from tsml.utils.validation import _check_optional_dependency, check_n_jobs - -feature_names = [ - "DN_HistogramMode_5", - "DN_HistogramMode_10", - "SB_BinaryStats_diff_longstretch0", - "DN_OutlierInclude_p_001_mdrmd", - "DN_OutlierInclude_n_001_mdrmd", - "CO_f1ecac", - "CO_FirstMin_ac", - "SP_Summaries_welch_rect_area_5_1", - "SP_Summaries_welch_rect_centroid", - "FC_LocalSimple_mean3_stderr", - "CO_trev_1_num", - "CO_HistogramAMI_even_2_5", - "IN_AutoMutualInfoStats_40_gaussian_fmmi", - "MD_hrv_classic_pnn40", - "SB_BinaryStats_mean_longstretch1", - "SB_MotifThree_quantile_hh", - "FC_LocalSimple_mean1_tauresrat", - "CO_Embed2_Dist_tau_d_expfit_meandiff", - "SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1", - "SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1", - "SB_TransitionMatrix_3ac_sumdiagcov", - "PD_PeriodicityWang_th0_01", -] - - -class Catch22Transformer(TransformerMixin, BaseTimeSeriesEstimator): - """Canonical Time-series Characteristics (Catch22). - - Overview: Input n series with d dimensions of length m - Transforms series into the 22 Catch22 [1]_ features extracted from the hctsa [2]_ - toolbox. - - Parameters - ---------- - features : int/str or List of int/str, optional, default="all" - The Catch22 features to extract by feature index, feature name as a str or as a - list of names or indices for multiple features. If "all", all features are - extracted. 
- Valid features are as follows: - ["DN_HistogramMode_5", "DN_HistogramMode_10", - "SB_BinaryStats_diff_longstretch0", "DN_OutlierInclude_p_001_mdrmd", - "DN_OutlierInclude_n_001_mdrmd", "CO_f1ecac", "CO_FirstMin_ac", - "SP_Summaries_welch_rect_area_5_1", "SP_Summaries_welch_rect_centroid", - "FC_LocalSimple_mean3_stderr", "CO_trev_1_num", "CO_HistogramAMI_even_2_5", - "IN_AutoMutualInfoStats_40_gaussian_fmmi", "MD_hrv_classic_pnn40", - "SB_BinaryStats_mean_longstretch1", "SB_MotifThree_quantile_hh", - "FC_LocalSimple_mean1_tauresrat", "CO_Embed2_Dist_tau_d_expfit_meandiff", - "SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1", - "SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1", - "SB_TransitionMatrix_3ac_sumdiagcov", "PD_PeriodicityWang_th0_01"] - catch24 : bool, optional, default=False - Extract the mean and standard deviation as well as the 22 Catch22 features if - true. If a List of specific features to extract is provided, "Mean" and/or - "StandardDeviation" must be added to the List to extract these features. - outlier_norm : bool, optional, default=False - Normalise each series during the two outlier Catch22 features, which can take a - while to process for large values. - replace_nans : bool, optional, default=False - Replace NaN or inf values from the Catch22 transform with 0. - use_pycatch22 : bool, optional, default=True - Wraps the C based pycatch22 implementation for tsml. - (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the - ``pycatch22`` package to be installed if True. - n_jobs : int, optional, default=1 - The number of jobs to run in parallel for `transform`. Requires multiple input - cases. ``-1`` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - See Also - -------- - Catch22Classifier - - Notes - ----- - Original Catch22 package implementations: - https://github.com/DynamicsAndNeuralSystems/Catch22 - - For the Java version, see - https://github.com/uea-machine-learning/tsml/blob/master/src/main/java - /tsml/transformers/Catch22.java - - References - ---------- - .. [1] Lubba, C. H., Sethi, S. S., Knaute, P., Schultz, S. R., Fulcher, B. D., & - Jones, N. S. (2019). catch22: Canonical time-series characteristics. Data Mining - and Knowledge Discovery, 33(6), 1821-1852. - .. [2] Fulcher, B. D., Little, M. A., & Jones, N. S. (2013). Highly comparative - time-series analysis: the empirical structure of time series and their methods. - Journal of the Royal Society Interface, 10(83), 20130048. - """ - - def __init__( - self, - features="all", - catch24=False, - outlier_norm=False, - replace_nans=False, - n_jobs=1, - parallel_backend=None, - ): - self.features = features - self.catch24 = catch24 - self.outlier_norm = outlier_norm - self.replace_nans = replace_nans - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - _check_optional_dependency("pycatch22", "pycatch22", self) - - super().__init__() - - def fit(self, X, y=None): - """Unused. Validates X.""" - self._validate_data(X=X) - return self - - def transform(self, X, y=None): - """Transform X into the catch22 features. 
- - Parameters - ---------- - X : 3D np.array (any number of channels, equal length series) - of shape (n_instances, n_channels, n_timepoints) - or list of numpy arrays (any number of channels, unequal length series) - of shape [n_instances], 2D np.array (n_channels, n_timepoints_i), where - n_timepoints_i is length of series i - - Returns - ------- - Xt : array-like, shape = [n_instances, num_features*n_channels] - The catch22 features for each dimension. - """ - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - n_instances = len(X) - - f_idx = _verify_features(self.features, self.catch24) - - threads_to_use = check_n_jobs(self.n_jobs) - - import pycatch22 - - features = [ - pycatch22.DN_HistogramMode_5, - pycatch22.DN_HistogramMode_10, - pycatch22.SB_BinaryStats_diff_longstretch0, - pycatch22.DN_OutlierInclude_p_001_mdrmd, - pycatch22.DN_OutlierInclude_n_001_mdrmd, - pycatch22.CO_f1ecac, - pycatch22.CO_FirstMin_ac, - pycatch22.SP_Summaries_welch_rect_area_5_1, - pycatch22.SP_Summaries_welch_rect_centroid, - pycatch22.FC_LocalSimple_mean3_stderr, - pycatch22.CO_trev_1_num, - pycatch22.CO_HistogramAMI_even_2_5, - pycatch22.IN_AutoMutualInfoStats_40_gaussian_fmmi, - pycatch22.MD_hrv_classic_pnn40, - pycatch22.SB_BinaryStats_mean_longstretch1, - pycatch22.SB_MotifThree_quantile_hh, - pycatch22.FC_LocalSimple_mean1_tauresrat, - pycatch22.CO_Embed2_Dist_tau_d_expfit_meandiff, - pycatch22.SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1, - pycatch22.SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1, - pycatch22.SB_TransitionMatrix_3ac_sumdiagcov, - pycatch22.PD_PeriodicityWang_th0_01, - ] - - c22_list = Parallel( - n_jobs=threads_to_use, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._transform_case_pycatch22)( - X[i], - f_idx, - features, - ) - for i in range(n_instances) - ) - - if self.replace_nans: - c22_list = np.nan_to_num(c22_list, False, 0, 0, 0) - - return np.array(c22_list) - - def _transform_case_pycatch22(self, X, f_idx, features): - c22 = np.zeros(len(f_idx) * len(X)) - - if hasattr(self, "_transform_features") and len( - self._transform_features - ) == len(c22): - transform_feature = self._transform_features - else: - transform_feature = [True] * len(c22) - - f_count = -1 - for i in range(len(X)): - dim = i * len(f_idx) - series = list(X[i]) - - if self.outlier_norm and (3 in f_idx or 4 in f_idx): - outlier_series = list(z_normalise_series(X[i])) - - for n, feature in enumerate(f_idx): - f_count += 1 - if not transform_feature[f_count]: - continue - - if self.outlier_norm and feature in [3, 4]: - c22[dim + n] = features[feature](outlier_series) - if feature == 22: - c22[dim + n] = np.mean(series) - elif feature == 23: - c22[dim + n] = np.std(series) - else: - c22[dim + n] = features[feature](series) - - return c22 - - @property - def get_features_arguments(self): - """Return feature names for the estimators features argument.""" - return ( - self.features - if self.features != "all" - else ( - feature_names + ["Mean", "StandardDeviation"] - if self.catch24 - else feature_names - ) - ) - - def _more_tags(self) -> dict: - return { - "X_types": ["np_list", "3darray"], - "requires_fit": False, - "optional_dependency": True, - } - - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. 
If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict, default = {} - Parameters to create testing instances of the class - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. - `create_test_instance` uses the first (or only) dictionary in `params` - """ - return {} - - -def _verify_features(features, catch24): - if isinstance(features, str): - if features == "all": - f_idx = [i for i in range(22)] - if catch24: - f_idx += [22, 23] - elif features in feature_names: - f_idx = [feature_names.index(features)] - elif catch24 and features == "Mean": - f_idx = [22] - elif catch24 and features == "StandardDeviation": - f_idx = [23] - else: - raise ValueError("Invalid feature selection.") - elif isinstance(features, int): - if features >= 0 and features < 22: - f_idx = [features] - elif catch24 and features == 22: - f_idx = [22] - elif catch24 and features == 23: - f_idx = [23] - else: - raise ValueError("Invalid feature selection.") - elif isinstance(features, (list, tuple)): - if len(features) > 0: - f_idx = [] - for f in features: - if isinstance(f, str): - if f in feature_names: - f_idx.append(feature_names.index(f)) - elif catch24 and f == "Mean": - f_idx.append(22) - elif catch24 and f == "StandardDeviation": - f_idx.append(23) - else: - raise ValueError("Invalid feature selection.") - elif isinstance(f, int): - if f >= 0 and f < 22: - f_idx.append(f) - elif catch24 and f == 22: - f_idx.append(22) - elif catch24 and f == 23: - f_idx.append(23) - else: - raise ValueError("Invalid feature selection.") - else: - raise ValueError("Invalid feature selection.") - else: - raise ValueError("Invalid feature selection.") - else: - raise ValueError("Invalid feature selection.") - - return f_idx diff --git a/tsml/transformations/_interval_extraction.py b/tsml/transformations/_interval_extraction.py deleted file mode 100644 index eb15711..0000000 --- a/tsml/transformations/_interval_extraction.py +++ /dev/null @@ -1,1572 +0,0 @@ -"""Interval extraction transformers.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = [ - "RandomIntervalTransformer", - "SupervisedIntervalTransformer", - # "FixedIntervalTransformer", -] - -import inspect -from typing import List, Union - -import numpy as np -from joblib import Parallel -from sklearn import preprocessing -from sklearn.base import TransformerMixin -from sklearn.utils import check_random_state -from sklearn.utils.parallel import delayed -from sklearn.utils.validation import check_is_fitted - -from tsml.base import BaseTimeSeriesEstimator, _clone_estimator -from tsml.utils._tags import _safe_tags -from tsml.utils.numba_functions.general import z_normalise_series_3d -from tsml.utils.numba_functions.stats import ( - fisher_score, - row_count_above_mean, - row_count_mean_crossing, - row_iqr, - row_mean, - row_median, - row_numba_max, - row_numba_min, - row_quantile25, - row_quantile75, - row_slope, - row_std, -) -from tsml.utils.validation import check_n_jobs, is_transformer - - -class RandomIntervalTransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Random interval feature transformer. - - Extracts intervals with random length, position and dimension from series in fit. - Transforms each interval sub-series using the given transformer(s)/features and - concatenates them into a feature vector in transform. 
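-
-    As a rough sketch of the idea (``rng``, the interval bounds and the feature
-    list here are illustrative stand-ins, not the exact attributes used by this
-    class):
-
-    >>> import numpy as np  # doctest: +SKIP
-    >>> rng = np.random.RandomState(0)
-    >>> X = rng.random((4, 1, 12))  # (n_instances, n_channels, n_timepoints)
-    >>> start = rng.randint(0, 12 - 3)  # random interval position
-    >>> end = start + rng.randint(3, 12 - start + 1)  # random length >= 3
-    >>> feats = [np.mean, np.std]  # per-interval summary features
-    >>> Xt = np.hstack(
-    ...     [f(X[:, 0, start:end], axis=1).reshape(-1, 1) for f in feats]
-    ... )
-    >>> Xt.shape
-    (4, 2)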
-
-    Identical intervals are pruned at the end of fit, so the number of features may
-    be less than expected from n_intervals.
-
-    Parameters
-    ----------
-    n_intervals : int or callable, default=100,
-        The number of intervals of random length, position and dimension to be
-        extracted. Input should be an int or a function that takes a 3D np.ndarray
-        input and returns an int.
-    min_interval_length : int, default=3
-        The minimum length of extracted intervals. Minimum value of 3.
-    max_interval_length : int, default=np.inf
-        The maximum length of extracted intervals. Minimum value of min_interval_length.
-    features : TransformerMixin, a function taking a 2d numpy array parameter, or list
-    of said transformers and functions, default=None
-        Transformers and functions used to extract features from selected intervals.
-        If None, defaults to [mean, median, min, max, std, 25% quantile, 75% quantile]
-    dilation : int, list or None, default=None
-        Add dilation to extracted intervals. No dilation is added if None or 1. If a
-        list of ints, a random dilation value is selected from the list for each
-        interval.
-
-        The dilation value is selected after the interval start and end points. If the
-        number of values in the dilated interval is less than min_interval_length,
-        the amount of dilation applied is reduced.
-    random_state : None, int or instance of RandomState, default=None
-        Seed or RandomState object used for random number generation.
-        If random_state is None, use the RandomState singleton used by np.random.
-        If random_state is an int, use a new RandomState instance seeded with seed.
-    n_jobs : int, default=1
-        The number of jobs to run in parallel for both `fit` and `transform` functions.
-        `-1` means using all processors.
-    parallel_backend : str, ParallelBackendBase instance or None, default=None
-        Specify the parallelisation backend implementation in joblib. If None, a
-        'prefer' value of "threads" is used by default.
-        Valid options are "loky", "multiprocessing", "threading" or a custom backend.
-        See the joblib Parallel documentation for more details.
-
-    Attributes
-    ----------
-    n_instances_ : int
-        The number of train cases.
-    n_dims_ : int
-        The number of dimensions per case.
-    series_length_ : int
-        The length of each series.
-    n_intervals_ : int
-        The number of intervals extracted after pruning identical intervals.
-    intervals_ : list of tuples
-        Contains information for each feature extracted in fit. Each tuple contains the
-        interval start, interval end, interval dimension, the feature(s) extracted and
-        the dilation.
-        Length will be n_intervals*len(features).
-
-    See Also
-    --------
-    SupervisedIntervalTransformer
-    FixedIntervalTransformer
-
-    Examples
-    --------
-    >>> from tsml.transformations import RandomIntervalTransformer
-    >>> from tsml.utils.testing import generate_3d_test_data
-    >>> X, _ = generate_3d_test_data(n_samples=4, series_length=12, random_state=0)
-    >>> tnf = RandomIntervalTransformer(n_intervals=2, random_state=0)
-    >>> tnf.fit(X)
-    RandomIntervalTransformer(...)
- >>> print(tnf.transform(X)[0]) - [1.04753424 0.14925939 0.8473096 1.20552675 1.08976637 0.96853798 - 1.14764656 1.07628806 0.18170775 0.8473096 1.29178823 1.08976637 - 0.96853798 1.1907773 ] - """ - - def __init__( - self, - n_intervals=100, - min_interval_length=3, - max_interval_length=np.inf, - features=None, - dilation=None, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.max_interval_length = max_interval_length - self.features = features - self.dilation = dilation - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - transformer_feature_skip = ["transform_features_", "_transform_features"] - - def fit_transform( - self, X: Union[np.ndarray, List[np.ndarray]], y: Union[np.ndarray, None] = None - ) -> np.ndarray: - """Fit the transformer to training data and return transformed data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - X_t : 2D np.ndarray of shape (n_instances, n_features) - Transformed data. - """ - X, rng = self._fit_setup(X) - - fit = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._generate_interval)( - X, - y, - rng.randint(np.iinfo(np.int32).max), - True, - ) - for _ in range(self._n_intervals) - ) - - ( - intervals, - transformed_intervals, - ) = zip(*fit) - - current = [] - removed_idx = [] - self.n_intervals_ = 0 - for i, interval in enumerate(intervals): - new_interval = ( - interval[0][0], - interval[0][1], - interval[0][2], - interval[0][4], - ) - if new_interval not in current: - current.append(new_interval) - self.intervals_.extend(interval) - self.n_intervals_ += 1 - else: - removed_idx.append(i) - - Xt = transformed_intervals[0] - for i in range(1, self._n_intervals): - if i not in removed_idx: - Xt = np.hstack((Xt, transformed_intervals[i])) - - return Xt - - def fit( - self, X: Union[np.ndarray, List[np.ndarray]], y: Union[np.ndarray, None] = None - ) -> object: - """Fit the transformer to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, rng = self._fit_setup(X) - - fit = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._generate_interval)( - X, - y, - rng.randint(np.iinfo(np.int32).max), - False, - ) - for _ in range(self.n_intervals) - ) - - ( - intervals, - _, - ) = zip(*fit) - - current = [] - self.n_intervals_ = 0 - for i in intervals: - interval = (i[0][0], i[0][1], i[0][2], i[0][4]) - if interval not in current: - current.append(interval) - self.intervals_.extend(i) - self.n_intervals_ += 1 - - return self - - def transform( - self, X: Union[np.ndarray, List[np.ndarray]], y: Union[np.ndarray, None] = None - ) -> np.ndarray: - """Transform input cases in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. 
- y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - X_t : 2D np.ndarray of shape (n_instances, n_features) - Transformed data. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=3) - - if self._transform_features is None: - transform_features = [None] * len(self.intervals_) - else: - count = 0 - transform_features = [] - for _ in range(self.n_intervals_): - for feature in self._features: - if is_transformer(feature): - nf = feature.n_transformed_features - transform_features.append( - self._transform_features[count : count + nf] - ) - count += nf - else: - transform_features.append(self._transform_features[count]) - count += 1 - - transform = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._transform_interval)( - X, - i, - transform_features[i], - ) - for i in range(len(self.intervals_)) - ) - - Xt = transform[0] - for i in range(1, len(self.intervals_)): - Xt = np.hstack((Xt, transform[i])) - - return Xt - - def _fit_setup(self, X): - X = self._validate_data(X=X, ensure_min_series_length=3) - X = self._convert_X(X) - - self.intervals_ = [] - self._transform_features = None - - self.n_instances_, self.n_dims_, self.series_length_ = X.shape - - if callable(self.n_intervals): - self._n_intervals = self.n_intervals(X) - else: - self._n_intervals = self.n_intervals - - self._min_interval_length = self.min_interval_length - if self.min_interval_length < 3: - self._min_interval_length = 3 - - self._max_interval_length = self.max_interval_length - if self.max_interval_length < self._min_interval_length: - self._max_interval_length = self._min_interval_length - elif self.max_interval_length > self.series_length_: - self._max_interval_length = self.series_length_ - - self._features = self.features - if self.features is None: - self._features = [ - row_mean, - row_std, - row_numba_min, - row_numba_max, - row_median, - row_quantile25, - row_quantile75, - ] - elif not isinstance(self.features, list): - self._features = [self.features] - - li = [] - for feature in self._features: - if is_transformer(feature): - li.append( - _clone_estimator( - feature, - self.random_state, - ) - ) - elif callable(feature): - li.append(feature) - else: - raise ValueError( - "Input features must be a list of callables or aeon transformers." 
- ) - self._features = li - - if self.dilation is None: - self._dilation = [1] - elif isinstance(self.dilation, list): - self._dilation = self.dilation - else: - self._dilation = [self.dilation] - - self._n_jobs = check_n_jobs(self.n_jobs) - - rng = check_random_state(self.random_state) - - return X, rng - - def _generate_interval(self, X, y, seed, transform): - rng = check_random_state(seed) - - dim = rng.randint(self.n_dims_) - - if rng.random() < 0.5: - interval_start = ( - rng.randint(0, self.series_length_ - self._min_interval_length) - if self.series_length_ > self._min_interval_length - else 0 - ) - len_range = min( - self.series_length_ - interval_start, - self._max_interval_length, - ) - length = ( - rng.randint(0, len_range - self._min_interval_length) - + self._min_interval_length - if len_range > self._min_interval_length - else self._min_interval_length - ) - interval_end = interval_start + length - else: - interval_end = ( - rng.randint(0, self.series_length_ - self._min_interval_length) - + self._min_interval_length - if self.series_length_ > self._min_interval_length - else self._min_interval_length - ) - len_range = min(interval_end, self._max_interval_length) - length = ( - rng.randint(0, len_range - self._min_interval_length) - + self._min_interval_length - if len_range > self._min_interval_length - else self._min_interval_length - ) - interval_start = interval_end - length - - interval_length = interval_end - interval_start - dilation = rng.choice(self._dilation) - while interval_length / dilation < self._min_interval_length: - dilation -= 1 - - Xt = np.empty((self.n_instances_, 0)) if transform else None - intervals = [] - - for feature in self._features: - if is_transformer(feature): - if transform: - feature = _clone_estimator( - feature, - seed, - ) - - t = feature.fit_transform( - np.expand_dims( - X[:, dim, interval_start:interval_end:dilation], axis=1 - ), - y, - ) - - if t.ndim == 3 and t.shape[1] == 1: - t = t.reshape((t.shape[0], t.shape[2])) - - Xt = np.hstack((Xt, t)) - else: - feature.fit( - np.expand_dims( - X[:, dim, interval_start:interval_end:dilation], axis=1 - ), - y, - ) - elif transform: - t = [ - [f] - for f in feature(X[:, dim, interval_start:interval_end:dilation]) - ] - Xt = np.hstack((Xt, t)) - - intervals.append((interval_start, interval_end, dim, feature, dilation)) - - return intervals, Xt - - def _transform_interval(self, X, idx, keep_transform): - interval_start, interval_end, dim, feature, dilation = self.intervals_[idx] - - if keep_transform is not None: - if is_transformer(feature): - for n in self.transformer_feature_skip: - if hasattr(feature, n): - setattr(feature, n, keep_transform) - break - elif not keep_transform: - return [[0] for _ in range(X.shape[0])] - - if is_transformer(feature): - Xt = feature.transform( - np.expand_dims(X[:, dim, interval_start:interval_end:dilation], axis=1) - ) - - if Xt.ndim == 3: - Xt = Xt.reshape((Xt.shape[0], Xt.shape[2])) - else: - Xt = [[f] for f in feature(X[:, dim, interval_start:interval_end:dilation])] - - return Xt - - def set_features_to_transform(self, arr, raise_error=True): - """Set transform_features to the given array. - - Each index in the list corresponds to the index of an interval, True intervals - are included in the transform, False intervals skipped and are set to 0. - - If any transformers are in features, they must also have a "transform_features" - or "_transform_features" attribute as well as a "n_transformed_features" - attribute. 
The input array should contain an item for each of the transformers
-        "n_transformed_features" output features.
-
-        Parameters
-        ----------
-        arr : list of bools
-            A list of intervals to skip.
-        raise_error : bool, default=True
-            Whether to raise an error or return None if input or transformers are
-            invalid.
-
-        Returns
-        -------
-        completed: bool
-            Whether the operation was successful.
-        """
-        length = 0
-        for feature in self._features:
-            if is_transformer(feature):
-                if not any(
-                    hasattr(feature, n) for n in self.transformer_feature_skip
-                ) or not hasattr(feature, "n_transformed_features"):
-                    if raise_error:
-                        raise ValueError(
-                            "Transformer must have one of "
-                            f"{self.transformer_feature_skip} as an attribute and "
-                            "a n_transformed_features attribute."
-                        )
-                    else:
-                        return False
-
-                length += feature.n_transformed_features
-            else:
-                length += 1
-
-        if len(arr) != length * self.n_intervals_ or not all(
-            isinstance(b, bool) for b in arr
-        ):
-            if raise_error:
-                raise ValueError(
-                    "Input must be a list of bools, matching the length of the "
-                    "transform output."
-                )
-            else:
-                return False
-
-        self._transform_features = arr
-
-        return True
-
-    @classmethod
-    def get_test_params(cls, parameter_set="default"):
-        """Return testing parameter settings for the estimator.
-
-        Parameters
-        ----------
-        parameter_set : str, default="default"
-            Name of the set of test parameters to return, for use in tests. If no
-            special parameters are defined for a value, will return `"default"` set.
-
-        Returns
-        -------
-        params : dict or list of dict, default = {}
-            Parameters to create testing instances of the class.
-            Each dict contains parameters to construct an "interesting" test instance,
-            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test
-            instance.
-            `create_test_instance` uses the first (or only) dictionary in `params`.
-        """
-        return {"n_intervals": 2}
-
-
-class SupervisedIntervalTransformer(TransformerMixin, BaseTimeSeriesEstimator):
-    """Supervised interval feature transformer.
-
-    Extracts intervals in fit using the supervised process described in [1].
-    Interval subseries are extracted for each input feature, and the usefulness of that
-    feature extracted on an interval is evaluated using the Fisher score metric.
-    Intervals are continually split in half, with the better scoring half retained as a
-    feature for the transform.
-
-    Multivariate capability is added by running the supervised interval extraction
-    process on each dimension of the input data.
-
-    As the interval features are already extracted for the supervised
-    evaluation in fit, the fit_transform method is recommended if the transformed fit
-    data is required.
-
-    Parameters
-    ----------
-    n_intervals : int, default=50
-        The number of times the supervised interval selection process is run.
-        Each supervised extraction will output a varying number of features based on
-        series length, number of dimensions and the number of features.
-    min_interval_length : int, default=3
-        The minimum length of extracted intervals. Minimum value of 3.
-    features : callable, list of callables, default=None
-        Functions used to extract features from selected intervals. Must take a 2d
-        array of shape (n_instances, interval_length) and return a 1d array of shape
-        (n_instances) containing the features.
-        If None, defaults to the following statistics used in [2]:
-        [mean, median, std, slope, min, max, iqr, count_mean_crossing,
-        count_above_mean].
- metric : ["fisher"] or callable, default="fisher" - The metric used to evaluate the usefulness of a feature extracted on an - interval. If "fisher", the Fisher score is used. If a callable, it must take - a 1d array of shape (n_instances) and return a 1d array of scores of shape - (n_instances). - randomised_split_point : bool, default=True - If True, the split point for interval extraction is randomised as is done in [2] - rather than split in half. - normalise_for_search : bool, default=True - If True, the data is normalised for the supervised interval search process. - Features extracted for the transform output will not use normalised data. - random_state : None, int or instance of RandomState, default=None - Seed or RandomState object used for random number generation. - If random_state is None, use the RandomState singleton used by np.random. - If random_state is an int, use a new RandomState instance seeded with seed. - n_jobs : int, default=1 - The number of jobs to run in parallel for both `fit` and `transform` functions. - `-1` means using all processors. - parallel_backend : str, ParallelBackendBase instance or None, default=None - Specify the parallelisation backend implementation in joblib, if None a 'prefer' - value of "threads" is used by default. - Valid options are "loky", "multiprocessing", "threading" or a custom backend. - See the joblib Parallel documentation for more details. - - Attributes - ---------- - n_instances_ : int - The number of train cases. - n_dims_ : int - The number of dimensions per case. - series_length_ : int - The length of each series. - intervals_ : list of tuples - Contains information for each feature extracted in fit. Each tuple contains the - interval start, interval end, interval dimension and the feature extracted. - Length will be the same as the amount of transformed features. - - See Also - -------- - RandomIntervalTransformer - FixedIntervalTransformer - - Notes - ----- - Based on the authors (stevcabello) code: https://github.com/stevcabello/r-STSF/ - - References - ---------- - .. [1] Cabello, N., Naghizade, E., Qi, J. and Kulik, L., 2020, November. Fast and - accurate time series classification through supervised interval search. In 2020 - IEEE International Conference on Data Mining (ICDM) (pp. 948-953). IEEE. - .. [2] Cabello, N., Naghizade, E., Qi, J. and Kulik, L., 2021. Fast, accurate and - interpretable time series classification through randomization. arXiv preprint - arXiv:2105.14876. - - Examples - -------- - >>> from tsml.transformations import SupervisedIntervalTransformer - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, y = generate_3d_test_data(n_samples=10, series_length=12, random_state=0) - >>> tnf = SupervisedIntervalTransformer(n_intervals=1, random_state=0) - >>> tnf.fit(X, y) - SupervisedIntervalTransformer(...) - >>> print(tnf.transform(X)[0]) - [1.4237989 1.20552675 0.45060352 0.13125638 0.10101093 0.76688304 - 1.92732552 0.54651945 3. 2. 
] - """ - - def __init__( - self, - n_intervals=50, - min_interval_length=3, - features=None, - metric="fisher", - randomised_split_point=True, - normalise_for_search=True, - random_state=None, - n_jobs=1, - parallel_backend=None, - ): - self.n_intervals = n_intervals - self.min_interval_length = min_interval_length - self.features = features - self.metric = metric - self.randomised_split_point = randomised_split_point - self.normalise_for_search = normalise_for_search - self.random_state = random_state - self.n_jobs = n_jobs - self.parallel_backend = parallel_backend - - super().__init__() - - # if features contains a transformer, it must contain a parameter name from - # transformer_feature_selection and an attribute name (or property) from - # transformer_feature_names to allow a single feature to be transformed at a time. - transformer_feature_selection = ["features"] - transformer_feature_names = [ - "features_arguments_", - "_features_arguments", - "get_features_arguments", - "_get_features_arguments", - ] - - def fit_transform( - self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray - ) -> np.ndarray: - """Fit the transformer to training data and return transformed data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - X_t : 2D np.ndarray of shape (n_instances, n_features) - Transformed data. - """ - X, y, rng = self._fit_setup(X, y) - - X_norm = z_normalise_series_3d(X) if self.normalise_for_search else X - - fit = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._generate_intervals)( - X, - X_norm, - y, - rng.randint(np.iinfo(np.int32).max), - True, - ) - for _ in range(self.n_intervals) - ) - - ( - intervals, - transformed_intervals, - ) = zip(*fit) - - for i in intervals: - self.intervals_.extend(i) - - self._transform_features = [True] * len(self.intervals_) - - Xt = transformed_intervals[0] - for i in range(1, self.n_intervals): - Xt = np.hstack((Xt, transformed_intervals[i])) - - return Xt - - def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: - """Fit the transformer to training data. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - X, y, rng = self._fit_setup(X, y) - - X_norm = z_normalise_series_3d(X) if self.normalise_for_search else X - - fit = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._generate_intervals)( - X, - X_norm, - y, - rng.randint(np.iinfo(np.int32).max), - False, - ) - for _ in range(self.n_intervals) - ) - - ( - intervals, - _, - ) = zip(*fit) - - for i in intervals: - self.intervals_.extend(i) - - self._transform_features = [True] * len(self.intervals_) - - return self - - def transform( - self, X: Union[np.ndarray, List[np.ndarray]], y: Union[np.ndarray, None] = None - ) -> np.ndarray: - """Transform input cases in X. - - Parameters - ---------- - X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) - The training data. 
- y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - X_t : 2D np.ndarray of shape (n_instances, n_features) - Transformed data. - """ - check_is_fitted(self) - - X = self._validate_data(X=X, reset=False, ensure_min_series_length=7) - - transform = Parallel( - n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" - )( - delayed(self._transform_intervals)( - X, - i, - ) - for i in range(len(self.intervals_)) - ) - - Xt = np.zeros((X.shape[0], len(transform))) - for i, t in enumerate(transform): - Xt[:, i] = t - - return Xt - - def _fit_setup(self, X, y): - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, ensure_min_series_length=5 - ) - X = self._convert_X(X) - - self.intervals_ = [] - - self.n_instances_, self.n_dims_, self.series_length_ = X.shape - - if self.n_instances_ <= 1: - raise ValueError( - "Supervised intervals requires more than 1 training time series." - ) - - self._min_interval_length = self.min_interval_length - if self.min_interval_length < 3: - self._min_interval_length = 3 - - if self._min_interval_length * 2 + 1 > self.series_length_: - raise ValueError( - "Minimum interval length must be less than half the series length." - ) - - self._features = self.features - if self.features is None: - self._features = [ - row_mean, - row_median, - row_std, - row_slope, - row_numba_min, - row_numba_max, - row_iqr, - row_count_mean_crossing, - row_count_above_mean, - ] - - if not isinstance(self._features, list): - self._features = [self._features] - - rng = check_random_state(self.random_state) - - msg = ( - "Transformers must have a parameter from 'transformer_feature_names' to " - "allow selecting single features, and a list of feature names in " - "'transformer_feature_names'. Transformers which require 'fit' are " - "currently unsupported." 
- ) - - li = [] - for f in self._features: - if callable(f): - li.append(f) - elif is_transformer(f): - if _safe_tags(f, key="requires_fit") is True: - raise ValueError(msg) - - params = inspect.signature(f.__init__).parameters - - att_name = None - for n in self.transformer_feature_selection: - if params.get(n, None) is not None: - att_name = n - break - - if att_name is None: - raise ValueError(msg) - - t_features = None - for n in self.transformer_feature_names: - if hasattr(f, n) and isinstance(getattr(f, n), (list, tuple)): - t_features = getattr(f, n) - break - - if t_features is None: - raise ValueError(msg) - - for t_f in t_features: - new_transformer = _clone_estimator(f, rng) - setattr( - new_transformer, - att_name, - t_f, - ) - li.append(new_transformer) - else: - raise ValueError() - self._features = li - - if callable(self.metric): - self._metric = self.metric - elif self.metric == "fisher": - self._metric = fisher_score - else: - raise ValueError("metric must be callable or 'fisher'") - - self._n_jobs = check_n_jobs(self.n_jobs) - - le = preprocessing.LabelEncoder() - return X, le.fit_transform(y), rng - - def _generate_intervals(self, X, X_norm, y, seed, keep_transform): - rng = check_random_state(seed) - - Xt = np.empty((self.n_instances_, 0)) if keep_transform else None - intervals = [] - - for i in range(self.n_dims_): - for feature in self._features: - random_cut_point = int(rng.randint(1, self.series_length_ - 1)) - while ( - self.series_length_ - random_cut_point - < self._min_interval_length * 2 - and self.series_length_ - (self.series_length_ - random_cut_point) - < self._min_interval_length * 2 - ): - random_cut_point = int(rng.randint(1, self.series_length_ - 1)) - - intervals_L, Xt_L = self._supervised_search( - X_norm[:, i, :random_cut_point], - y, - 0, - feature, - i, - X[:, i, :], - rng, - keep_transform, - is_transformer(feature), - ) - intervals.extend(intervals_L) - - if keep_transform: - Xt = np.hstack((Xt, Xt_L)) - - intervals_R, Xt_R = self._supervised_search( - X_norm[:, i, random_cut_point:], - y, - random_cut_point, - feature, - i, - X[:, i, :], - rng, - keep_transform, - is_transformer(feature), - ) - intervals.extend(intervals_R) - - if keep_transform: - Xt = np.hstack((Xt, Xt_R)) - - return intervals, Xt - - def _transform_intervals(self, X, idx): - if not self._transform_features[idx]: - return np.zeros(X.shape[0]) - - start, end, dim, feature = self.intervals_[idx] - - if is_transformer(feature): - return feature.transform(X[:, dim, start:end]).flatten() - else: - return feature(X[:, dim, start:end]) - - def _supervised_search( - self, - X, - y, - ini_idx, - feature, - dim, - X_ori, - rng, - keep_transform, - feature_is_transformer, - ): - intervals = [] - Xt = np.empty((X.shape[0], 0)) if keep_transform else None - - while X.shape[1] >= self._min_interval_length * 2: - if ( - self.randomised_split_point - and X.shape[1] != self._min_interval_length * 2 - ): - div_point = rng.randint( - self._min_interval_length, X.shape[1] - self._min_interval_length - ) - else: - div_point = int(X.shape[1] / 2) - - sub_interval_0 = X[:, :div_point] - sub_interval_1 = X[:, div_point:] - - if feature_is_transformer: - interval_feature_0 = feature.transform(sub_interval_0).flatten() - interval_feature_1 = feature.transform(sub_interval_1).flatten() - else: - interval_feature_0 = feature(sub_interval_0) - interval_feature_1 = feature(sub_interval_1) - - score_0 = self._metric(interval_feature_0, y) - score_1 = self._metric(interval_feature_1, y) - - if score_0 >= 
score_1 and score_0 != 0:
-                end = ini_idx + len(sub_interval_0[0])
-
-                intervals.append((ini_idx, end, dim, feature))
-                X = sub_interval_0
-
-                if keep_transform:
-                    if self.normalise_for_search:
-                        if feature_is_transformer:
-                            interval_feature_to_use = feature.transform(
-                                X_ori[:, ini_idx:end]
-                            ).flatten()
-                        else:
-                            interval_feature_to_use = feature(X_ori[:, ini_idx:end])
-                    else:
-                        interval_feature_to_use = interval_feature_0
-
-                    Xt = np.hstack(
-                        (
-                            Xt,
-                            np.reshape(
-                                interval_feature_to_use,
-                                (interval_feature_to_use.shape[0], 1),
-                            ),
-                        )
-                    )
-            elif score_1 > score_0:
-                ini_idx = ini_idx + div_point
-                end = ini_idx + len(sub_interval_1[0])
-
-                intervals.append((ini_idx, end, dim, feature))
-                X = sub_interval_1
-
-                if keep_transform:
-                    if self.normalise_for_search:
-                        if feature_is_transformer:
-                            interval_feature_to_use = feature.transform(
-                                X_ori[:, ini_idx:end]
-                            ).flatten()
-                        else:
-                            interval_feature_to_use = feature(X_ori[:, ini_idx:end])
-                    else:
-                        interval_feature_to_use = interval_feature_1
-
-                    Xt = np.hstack(
-                        (
-                            Xt,
-                            np.reshape(
-                                interval_feature_to_use,
-                                (interval_feature_to_use.shape[0], 1),
-                            ),
-                        )
-                    )
-            else:
-                break
-
-        return intervals, Xt
-
-    def set_features_to_transform(self, arr, raise_error=True):
-        """Set transform_features to the given array.
-
-        Each index in the list corresponds to the index of an interval, True intervals
-        are included in the transform, False intervals are skipped and set to 0.
-
-        Parameters
-        ----------
-        arr : list of bools
-            A list of intervals to skip.
-        raise_error : bool, default=True
-            Whether to raise an error or return None if input is invalid.
-
-        Returns
-        -------
-        completed: bool
-            Whether the operation was successful.
-        """
-        if len(arr) != len(self.intervals_) or not all(
-            isinstance(b, bool) for b in arr
-        ):
-            if raise_error:
-                raise ValueError(
-                    "Input must be a list of bools of length len(intervals_)."
-                )
-            else:
-                return False
-
-        self._transform_features = arr
-
-        return True
-
-    def _more_tags(self) -> dict:
-        return {"requires_y": True}
-
-    @classmethod
-    def get_test_params(cls, parameter_set="default"):
-        """Return testing parameter settings for the estimator.
-
-        Parameters
-        ----------
-        parameter_set : str, default="default"
-            Name of the set of test parameters to return, for use in tests. If no
-            special parameters are defined for a value, will return `"default"` set.
-
-        Returns
-        -------
-        params : dict or list of dict, default = {}
-            Parameters to create testing instances of the class.
-            Each dict contains parameters to construct an "interesting" test instance,
-            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test
-            instance.
-            `create_test_instance` uses the first (or only) dictionary in `params`.
-        """
-        return {
-            "n_intervals": 1,
-            "randomised_split_point": False,
-        }
-
-
-# class FixedIntervalTransformer(TransformerMixin, BaseTimeSeriesEstimator):
-#     """Fixed interval feature transformer.
-#
-#     Extracts features using a fixed set of intervals, continually halving the interval
-#     length until the given depth is reached.
-#     Transforms each interval sub-series using the given transformer(s)/features and
-#     concatenates them into a feature vector in transform.
-#
-#     Parameters
-#     ----------
-#     n_intervals : int or callable, default=4,
-#         The depth to extract intervals from, with the total number of intervals
-#         extracted increasing exponentially with depth. i.e.
if n_intervals=3, 1 interval -# will be extracted from the whole series, 2 from both halves and 4 from the -# four quartiles of the series for 7 total intervals. As the number of intervals -# extracted doubles per layer, the length of each interval extracted halves. -# shifted_intervals : bool, default=True -# Whether to include additional intervals per layer by shifting the layer -# intervals to the right by 1/2 the interval length for each depth past 1. -# This effectively doubles the number of intervals extracted per layer (minus 1) -# by including overlapping intervals. -# min_interval_length : int, default=2 -# The minimum length of extracted intervals. Minimum value of 2. -# features : TransformerMixin, a function taking a 2d numpy array parameter, or list -# of said transformers and functions, default=None -# Transformers and functions used to extract features from selected intervals. -# If None, defaults to [QuantileTransformer, -# QuantileTransformer(subtract_mean=True)]. -# random_state : None, int or instance of RandomState, default=None -# Seed or RandomState object used for random number generation. -# If random_state is None, use the RandomState singleton used by np.random. -# If random_state is an int, use a new RandomState instance seeded with seed. -# n_jobs : int, default=1 -# The number of jobs to run in parallel for both `fit` and `transform` functions. -# `-1` means using all processors. -# parallel_backend : str, ParallelBackendBase instance or None, default=None -# Specify the parallelisation backend implementation in joblib, if None a 'prefer' -# value of "threads" is used by default. -# Valid options are "loky", "multiprocessing", "threading" or a custom backend. -# See the joblib Parallel documentation for more details. -# -# Attributes -# ---------- -# n_instances_ : int -# The number of train cases. -# n_dims_ : int -# The number of dimensions per case. -# series_length_ : int -# The length of each series. -# n_intervals_ : int -# The number of intervals extracted after pruning identical intervals. -# intervals_ : list of tuples -# Contains information for each feature extracted in fit. Each tuple contains the -# interval start, interval end, interval dimension, the feature(s) extracted and -# the dilation. -# Length will be n_intervals*len(features). -# -# See Also -# -------- -# RandomIntervalTransformer -# SupervisedIntervalTransformer -# -# Examples -# -------- -# >>> from tsml.transformations import FixedIntervalTransformer -# >>> from tsml.utils.testing import generate_3d_test_data -# >>> X, _ = generate_3d_test_data(n_samples=4, series_length=12, random_state=0) -# >>> tnf = FixedIntervalTransformer(n_intervals=2, random_state=0) -# >>> tnf.fit(X) -# FixedIntervalTransformer(...) 
-# >>> print(tnf.transform(X)[0]) -# [1.04753424 0.14925939 0.8473096 1.20552675 1.08976637 0.96853798 -# 1.14764656 1.07628806 0.18170775 0.8473096 1.29178823 1.08976637 -# 0.96853798 1.1907773 ] -# """ -# -# def __init__( -# self, -# n_intervals=4, -# shifted_intervals=True, -# min_interval_length=2, -# features=None, -# random_state=None, -# n_jobs=1, -# parallel_backend=None, -# ): -# self.n_intervals = n_intervals -# self.shifted_intervals = shifted_intervals -# self.min_interval_length = min_interval_length -# self.features = features -# self.random_state = random_state -# self.n_jobs = n_jobs -# self.parallel_backend = parallel_backend -# -# super(FixedIntervalTransformer, self).__init__() -# -# transformer_feature_skip = ["transform_features_", "_transform_features"] -# -# def fit_transform( -# self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray -# ) -> np.ndarray: -# """Fit the transformer to training data and return transformed data. -# -# Parameters -# ---------- -# X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) -# The training data. -# y : 1D np.ndarray of shape (n_instances) -# The class labels for fitting, indices correspond to instance indices in X -# -# Returns -# ------- -# X_t : 2D np.ndarray of shape (n_instances, n_features) -# Transformed data. -# """ -# X = self._fit_setup(X) -# -# fit = Parallel( -# n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" -# )( -# delayed(self._generate_intervals)( -# X, -# y, -# i, -# True, -# ) -# for i in range(self._n_intervals) -# ) -# -# ( -# self.intervals_, -# Xt, -# ) = zip(*fit) -# -# self.n_intervals_ = len(self.intervals_) -# -# return Xt -# -# def fit(self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray) -> object: -# """Fit the transformer to training data. -# -# Parameters -# ---------- -# X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) -# The training data. -# y : 1D np.ndarray of shape (n_instances) -# The class labels for fitting, indices correspond to instance indices in X -# -# Returns -# ------- -# self : -# Reference to self. -# """ -# X = self._fit_setup(X) -# -# fit = Parallel( -# n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" -# )( -# delayed(self._generate_intervals)( -# X, -# y, -# i, -# False, -# ) -# for i in range(self.n_intervals) -# ) -# -# ( -# self.intervals_, -# _, -# ) = zip(*fit) -# -# self.n_intervals_ = len(self.intervals_) -# -# return self -# -# def transform( -# self, X: Union[np.ndarray, List[np.ndarray]], y: np.ndarray -# ) -> np.ndarray: -# """Transform input cases in X. -# -# Parameters -# ---------- -# X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) -# The training data. -# y : 1D np.ndarray of shape (n_instances) -# The class labels for fitting, indices correspond to instance indices in X -# -# Returns -# ------- -# X_t : 2D np.ndarray of shape (n_instances, n_features) -# Transformed data. 
-# """ -# check_is_fitted(self) -# -# X = self._validate_data(X=X, reset=False, ensure_min_series_length=2) -# -# if self._transform_features is None: -# transform_features = [None] * self.n_intervals_ -# else: -# count = 0 -# transform_features = [] -# for _ in range(self.n_intervals_): -# for feature in self._features: -# if is_transformer(feature): -# nf = feature.n_transformed_features -# transform_features.append( -# self._transform_features[count : count + nf] -# ) -# count += nf -# else: -# transform_features.append(self._transform_features[count]) -# count += 1 -# -# transform = Parallel( -# n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads" -# )( -# delayed(self._transform_interval)( -# X, -# i, -# transform_features[i], -# ) -# for i in range(self.n_intervals_) -# ) -# -# Xt = transform[0] -# for i in range(1, self.n_intervals_): -# Xt = np.hstack((Xt, transform[i])) -# -# return Xt -# -# def _fit_setup(self, X): -# X = self._validate_data(X=X, ensure_min_series_length=2) -# X = self._convert_X(X) -# -# self.intervals_ = [] -# self._transform_features = None -# -# self.n_instances_, self.n_dims_, self.series_length_ = X.shape -# -# if callable(self.n_intervals): -# self._n_intervals = self.n_intervals(X) -# else: -# self._n_intervals = self.n_intervals -# -# self._min_interval_length = self.min_interval_length -# if self.min_interval_length < 2: -# self._min_interval_length = 2 -# -# self._features = self.features -# if self.features is None: -# self._features = [ -# QuantileTransformer(), -# QuantileTransformer(subtract_mean=True), -# ] -# elif not isinstance(self.features, list): -# self._features = [self.features] -# -# li = [] -# for feature in self._features: -# if is_transformer(feature): -# li.append( -# _clone_estimator( -# feature, -# self.random_state, -# ) -# ) -# elif callable(feature): -# li.append(feature) -# else: -# raise ValueError( -# "Input features must be a list of callables or aeon transformers." 
-#                 )
-#         self._features = li
-#
-#         self._n_jobs = check_n_jobs(self.n_jobs)
-#
-#         return X
-#
-#     def _generate_intervals(self, X, y, depth, transform):
-#         Xt = np.empty((self.n_instances_, 0)) if transform else None
-#         intervals = []
-#
-#         # interval points are not yet populated in this draft
-#         fixed_points = []
-#
-#         for dim in range(self.n_dims_):
-#             for points in fixed_points:
-#                 interval_start, interval_end = points
-#
-#                 for feature in self._features:
-#                     if is_transformer(feature):
-#                         if transform:
-#                             feature = _clone_estimator(
-#                                 feature,
-#                                 self.random_state
-#                             )
-#
-#                             t = feature.fit_transform(
-#                                 np.expand_dims(
-#                                     X[:, dim, interval_start:interval_end], axis=1
-#                                 ),
-#                                 y,
-#                             )
-#
-#                             if t.ndim == 3 and t.shape[1] == 1:
-#                                 t = t.reshape((t.shape[0], t.shape[2]))
-#
-#                             Xt = np.hstack((Xt, t))
-#                         else:
-#                             feature.fit(
-#                                 np.expand_dims(
-#                                     X[:, dim, interval_start:interval_end], axis=1
-#                                 ),
-#                                 y,
-#                             )
-#                     elif transform:
-#                         t = [
-#                             [f]
-#                             for f in feature(X[:, dim, interval_start:interval_end])
-#                         ]
-#                         Xt = np.hstack((Xt, t))
-#
-#                     intervals.append((interval_start, interval_end, dim, feature))
-#
-#         return intervals, Xt
-#
-#     def _transform_interval(self, X, idx, keep_transform):
-#         interval_start, interval_end, dim, feature = self.intervals_[idx]
-#
-#         if keep_transform is not None:
-#             if is_transformer(feature):
-#                 for n in self.transformer_feature_skip:
-#                     if hasattr(feature, n):
-#                         setattr(feature, n, keep_transform)
-#                         break
-#             elif not keep_transform:
-#                 return [[0] for _ in range(X.shape[0])]
-#
-#         if is_transformer(feature):
-#             Xt = feature.transform(
-#                 np.expand_dims(X[:, dim, interval_start:interval_end], axis=1)
-#             )
-#
-#             if Xt.ndim == 3:
-#                 Xt = Xt.reshape((Xt.shape[0], Xt.shape[2]))
-#         else:
-#             Xt = [[f] for f in feature(X[:, dim, interval_start:interval_end])]
-#
-#         return Xt
-#
-#     def set_features_to_transform(self, arr, raise_error=True):
-#         """Set transform_features to the given array.
-#
-#         Each index in the list corresponds to the index of an interval, True intervals
-#         are included in the transform, False intervals skipped and are set to 0.
-#
-#         If any transformers are in features, they must also have a "transform_features"
-#         or "_transform_features" attribute as well as a "n_transformed_features"
-#         attribute. The input array should contain an item for each of the transformers
-#         "n_transformed_features" output features.
-#
-#         Parameters
-#         ----------
-#         arr : list of bools
-#             A list of bools indicating which intervals to include in the transform.
-#         raise_error : bool, default=True
-#             Whether to raise an error or return False if the input or transformers
-#             are invalid.
-#
-#         Returns
-#         -------
-#         completed: bool
-#             Whether the operation was successful.
-#         """
-#         length = 0
-#         for feature in self._features:
-#             if is_transformer(feature):
-#                 if not any(
-#                     hasattr(feature, n) for n in self.transformer_feature_skip
-#                 ) or not hasattr(feature, "n_transformed_features"):
-#                     if raise_error:
-#                         raise ValueError(
-#                             "Transformer must have one of "
-#                             f"{self.transformer_feature_skip} as an attribute and "
-#                             "a n_transformed_features attribute."
-#                         )
-#                     else:
-#                         return False
-#
-#                 length += feature.n_transformed_features
-#             else:
-#                 length += 1
-#
-#         if len(arr) != length * self.n_intervals_ or not all(
-#             isinstance(b, bool) for b in arr
-#         ):
-#             if raise_error:
-#                 raise ValueError(
-#                     "Input must be a list of bools, matching the length of the "
-#                     "transform output."
-# ) -# else: -# return False -# -# self._transform_features = arr -# -# return True -# -# @classmethod -# def get_test_params(cls, parameter_set="default"): -# """Return testing parameter settings for the estimator. -# -# Parameters -# ---------- -# parameter_set : str, default="default" -# Name of the set of test parameters to return, for use in tests. If no -# special parameters are defined for a value, will return `"default"` set. -# -# Returns -# ------- -# params : dict or list of dict, default = {} -# Parameters to create testing instances of the class -# Each dict are parameters to construct an "interesting" test instance, i.e., -# `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. -# `create_test_instance` uses the first (or only) dictionary in `params` -# """ -# return {"n_intervals": 2} diff --git a/tsml/transformations/_periodogram.py b/tsml/transformations/_periodogram.py deleted file mode 100644 index c230180..0000000 --- a/tsml/transformations/_periodogram.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Periodogram transformer.""" - -__author__ = ["MatthewMiddlehurst"] -__all__ = ["PeriodogramTransformer"] - -import math - -import numpy as np -from sklearn.base import TransformerMixin - -from tsml.base import BaseTimeSeriesEstimator -from tsml.utils.validation import _check_optional_dependency, check_n_jobs - - -class PeriodogramTransformer(TransformerMixin, BaseTimeSeriesEstimator): - """Periodogram transformer. - - This transformer converts a time series into its periodogram representation. - - Parameters - ---------- - pad_series : bool, default=True - Whether to pad the series to the next power of 2. If False, the series - will be used as is. - pad_with : str, default="constant" - The type of padding to use. see the numpy.pad documentation mode parameter for - options. By default, the series will be padded with zeros. - constant_value : int, default=0 - The value to use when padding with a constant value. - use_pyfftw : bool, default=False - Whether to use the pyfftw library for FFT calculations. Requires the pyfftw - package to be installed. - n_jobs : int, default=1 - The number of threads to use for FFT calculations. Only used if use_pyfftw is - True. - - Examples - -------- - >>> from tsml.transformations import PeriodogramTransformer - >>> from tsml.utils.testing import generate_3d_test_data - >>> X, _ = generate_3d_test_data(n_samples=4, n_channels=2, series_length=20, - ... random_state=0) - >>> tnf = PeriodogramTransformer() # doctest: +SKIP - >>> tnf.fit(X) # doctest: +SKIP - PeriodogramTransformer(...) 
- >>> print(tnf.transform(X)[0]) # doctest: +SKIP - [[22.16456597 11.08122685 3.69018936 2.17665255 5.27387039 3.10598557 - 6.311107 1.70468284 1.8269671 0.88838033 1.56747869 3.42037058 - 1.67988661 1.71142437 3.49821716 1.25120108] - [22.71382067 8.64933688 6.36412194 0.9298486 5.70358068 2.70669743 - 4.33906385 0.36544821 2.28769936 3.67702091 1.45018642 1.26838712 - 3.36395549 2.69146494 2.27041859 3.9023142 ]] - """ - - def __init__( - self, - pad_series=True, - pad_with="constant", - constant_value=0, - use_pyfftw=False, - n_jobs=1, - ): - self.use_pyfftw = use_pyfftw - self.pad_series = pad_series - self.pad_with = pad_with - self.constant_value = constant_value - self.n_jobs = n_jobs - - if use_pyfftw: - _check_optional_dependency("pyfftw", "pyfftw", self) - - super().__init__() - - def fit(self, X, y=None): - self._validate_data(X=X) - return self - - def transform(self, X, y=None): - X = self._validate_data(X=X, reset=False) - X = self._convert_X(X) - - threads_to_use = check_n_jobs(self.n_jobs) - - if self.pad_series: - kwargs = {"mode": self.pad_with} - if self.pad_with == "constant": - kwargs["constant_values"] = self.constant_value - - X = np.pad( - X, - ( - (0, 0), - (0, 0), - ( - 0, - int( - math.pow(2, math.ceil(math.log(X.shape[2], 2))) - X.shape[2] - ), - ), - ), - **kwargs, - ) - - if self.use_pyfftw: - import pyfftw - - old_threads = pyfftw.config.NUM_THREADS - pyfftw.config.NUM_THREADS = threads_to_use - - fft_object = pyfftw.builders.fft(X[:, :, :]) - Xt = np.abs(fft_object()) - Xt = Xt[:, :, : int(X.shape[2] / 2)] - - pyfftw.config.NUM_THREADS = old_threads - else: - Xt = np.abs(np.fft.fft(X)[:, :, : int(X.shape[2] / 2)]) - - return Xt - - def _more_tags(self) -> dict: - return {"requires_fit": False, "optional_dependency": True} diff --git a/tsml/transformations/_quantile.py b/tsml/transformations/_quantile.py deleted file mode 100644 index d718176..0000000 --- a/tsml/transformations/_quantile.py +++ /dev/null @@ -1,57 +0,0 @@ -# from typing import List, Union -# -# import numpy as np -# from sklearn.base import TransformerMixin -# -# from tsml.base import BaseTimeSeriesEstimator -# -# -# class QuantileTransformer(TransformerMixin, BaseTimeSeriesEstimator): -# """QuantileTransformer""" -# -# def __init__( -# self, -# divisor=4, -# subtract_mean=False, -# ): -# self.divisor = divisor -# self.subtract_mean = subtract_mean -# -# super(QuantileTransformer).__init__() -# -# def fit( -# self, X: Union[np.ndarray, List[np.ndarray]], y: Union[np.ndarray, None] = None -# ) -> object: -# """Unused. Validates X.""" -# self._validate_data(X=X) -# return self -# -# def transform( -# self, X: Union[np.ndarray, List[np.ndarray]], y: Union[np.ndarray, None] = None -# ) -> np.ndarray: -# """Transform input cases in X. -# -# Parameters -# ---------- -# X : 3D np.ndarray of shape (n_instances, n_channels, n_timepoints) -# The training data. -# y : 1D np.ndarray of shape (n_instances) -# The class labels for fitting, indices correspond to instance indices in X -# -# Returns -# ------- -# X_t : 2D np.ndarray of shape (n_instances, n_features) -# Transformed data. 
-#         """
-#         X = self._validate_data(X=X, reset=False)
-#         X = self._convert_X(X)
-#
-#         # evenly spaced quantiles over the time axis are used as features
-#         num_quantiles = 1 + (X.shape[2] - 1) // self.divisor
-#         if num_quantiles == 1:
-#             quantiles = np.quantile(X, [0.5], axis=-1)
-#         else:
-#             quantiles = np.quantile(X, np.linspace(0, 1, num_quantiles), axis=-1)
-#
-#         # (n_quantiles, n_instances, n_channels) ->
-#         # (n_instances, n_channels, n_quantiles)
-#         quantiles = np.moveaxis(quantiles, 0, -1)
-#
-#         if self.subtract_mean:
-#             # subtract the series mean from every second quantile
-#             quantiles[..., 1::2] = quantiles[..., 1::2] - X.mean(-1, keepdims=True)
-#
-#         return quantiles
diff --git a/tsml/vector/__init__.py b/tsml/vector/__init__.py
deleted file mode 100644
index f85cb6a..0000000
--- a/tsml/vector/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""sklearn-like vector estimators."""
-
-__all__ = [
-    "RotationForestClassifier",
-    "RotationForestRegressor",
-    "CITClassifier",
-]
-
-from tsml.vector._cit import CITClassifier
-from tsml.vector._rotation_forest import (
-    RotationForestClassifier,
-    RotationForestRegressor,
-)
diff --git a/tsml/vector/_cit.py b/tsml/vector/_cit.py
deleted file mode 100644
index 2ca1995..0000000
--- a/tsml/vector/_cit.py
+++ /dev/null
@@ -1,487 +0,0 @@
-"""Continuous interval tree (CIT) vector classifier (aka Time Series Tree).
-
-Continuous Interval Tree aka Time Series Tree, base classifier originally used
-in the time series forest interval-based classification algorithm. Fits sklearn
-conventions.
-"""
-
-__author__ = ["MatthewMiddlehurst"]
-__all__ = ["CITClassifier"]
-
-import math
-import sys
-from typing import Union
-
-import numpy as np
-import pandas as pd
-from numba import njit
-from sklearn import preprocessing
-from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.utils import check_random_state
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.utils.validation import check_is_fitted
-
-
-class CITClassifier(ClassifierMixin, BaseEstimator):
-    """Continuous interval tree (CIT) vector classifier (aka Time Series Tree).
-
-    The `Time Series Tree` described in the Time Series Forest (TSF) paper Deng et al
-    (2013) [1]. A simple information gain based tree for continuous attributes using a
-    bespoke margin gain metric for tie breaking.
-
-    Implemented as a base classifier for interval based time series classifiers such as
-    `CanonicalIntervalForest` and `DrCIF`.
-
-    Parameters
-    ----------
-    max_depth : int, default=sys.maxsize
-        Maximum depth for the tree.
-    thresholds : int, default=20
-        Number of thresholds to split continuous attributes on at tree nodes.
-    random_state : int, RandomState instance or None, default=None
-        If `int`, random_state is the seed used by the random number generator;
-        If `RandomState` instance, random_state is the random number generator;
-        If `None`, the random number generator is the `RandomState` instance used
-        by `np.random`.
-
-    Attributes
-    ----------
-    n_instances_ : int
-        The number of train cases in the training set.
-    n_atts_ : int
-        The number of attributes in the training set.
-    n_classes_ : int
-        Number of classes. Extracted from the data.
-    classes_ : ndarray of shape (n_classes_)
-        Holds the label for each class.
-    class_dictionary_ : dict
-        A dictionary mapping class labels to class indices in classes_.
-
-    Notes
-    -----
-    For the Java version, see
-    `tsml `_.
-
-    References
-    ----------
-    .. 
[1] H.Deng, G.Runger, E.Tuv and M.Vladimir, "A time series forest for - classification and feature extraction", Information Sciences, 239, 2013 - - Examples - -------- - >>> from tsml.vector import CITClassifier - >>> from tsml.utils.testing import generate_2d_test_data - >>> X, y = generate_2d_test_data(n_samples=8, random_state=0) - >>> clf = CITClassifier(random_state=0) - >>> clf.fit(X, y) - CITClassifier(...) - >>> clf.predict(X) - array([0, 1, 0, 0, 0, 0, 0, 1]) - """ - - def __init__( - self, - max_depth=sys.maxsize, - thresholds=20, - random_state=None, - ): - self.max_depth = max_depth - self.thresholds = thresholds - self.random_state = random_state - - super().__init__() - - def fit(self, X: Union[np.ndarray, pd.DataFrame], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. - """ - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, force_all_finite="allow-nan" - ) - - check_classification_targets(y) - - self.n_instances_, self.n_atts_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - # escape if only one class seen - if self.n_classes_ == 1: - self._is_fitted = True - return self - - le = preprocessing.LabelEncoder() - y = le.fit_transform(y) - - rng = check_random_state(self.random_state) - self._root = _TreeNode(random_state=rng) - - thresholds = np.linspace(np.min(X, axis=0), np.max(X, axis=0), self.thresholds) - - distribution = np.zeros(self.n_classes_) - for i in range(len(y)): - distribution[y[i]] += 1 - - entropy = _entropy(distribution, distribution.sum()) - - self._root.build_tree( - X, - y, - thresholds, - entropy, - distribution, - 0, - self.max_depth, - self.n_classes_, - False, - ) - - self._is_fitted = True - return self - - def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - return np.array( - [self.classes_[int(np.argmax(prob))] for prob in self.predict_proba(X)] - ) - - def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. 
- """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - - X = self._validate_data(X=X, reset=False, force_all_finite="allow-nan") - - dists = np.zeros((X.shape[0], self.n_classes_)) - for i in range(X.shape[0]): - dists[i] = self._root.predict_proba(X[i], self.n_classes_) - return dists - - def tree_node_splits_and_gain(self): - """Recursively find the split and information gain for each tree node.""" - splits = [] - gains = [] - - if self._root.best_split > -1: - self._find_splits_gain(self._root, splits, gains) - - return splits, gains - - def _find_splits_gain(self, node, splits, gains): - """Recursively find the split and information gain for each tree node.""" - splits.append(node.best_split) - gains.append(node.best_gain) - - for next_node in node.children: - if next_node.best_split > -1: - self._find_splits_gain(next_node, splits, gains) - - def _more_tags(self) -> dict: - return {"allow_nan": True} - - -class _TreeNode: - """ContinuousIntervalTree tree node.""" - - def __init__( - self, - random_state=None, - ): - self.random_state = random_state - - self.best_split = -1 - self.best_threshold = 0 - self.best_gain = 0.000001 - self.best_margin = -1 - self.children = [] - self.leaf_distribution = [] - self.depth = -1 - - def build_tree( - self, - X, - y, - thresholds, - entropy, - distribution, - depth, - max_depth, - n_classes, - leaf, - ): - self.depth = depth - best_distributions = [] - best_entropies = [] - - if leaf is False and self.remaining_classes(distribution) and depth < max_depth: - for (_, att), threshold in np.ndenumerate(thresholds): - ( - info_gain, - distributions, - entropies, - ) = self.information_gain(X, y, att, threshold, entropy, n_classes) - - if info_gain > self.best_gain: - self.best_split = att - self.best_threshold = threshold - self.best_gain = info_gain - self.best_margin = -1 - best_distributions = distributions - best_entropies = entropies - elif info_gain == self.best_gain and info_gain > 0.000001: - margin = self.margin_gain(X, att, threshold) - if self.best_margin == -1: - self.best_margin = self.margin_gain( - X, self.best_split, self.best_threshold - ) - - if margin > self.best_margin or ( - margin == self.best_margin - and self.random_state.choice([True, False]) - ): - self.best_split = att - self.best_threshold = threshold - self.best_margin = margin - best_distributions = distributions - best_entropies = entropies - - if self.best_split > -1: - self.children = [None, None, None] - - left_idx, right_idx, missing_idx = self.split_data( - X, self.best_split, self.best_threshold - ) - - if len(left_idx) > 0: - self.children[0] = _TreeNode(random_state=self.random_state) - self.children[0].build_tree( - X[left_idx], - y[left_idx], - thresholds, - best_entropies[0], - best_distributions[0], - depth + 1, - max_depth, - n_classes, - False, - ) - else: - self.children[0] = _TreeNode(random_state=self.random_state) - self.children[0].build_tree( - X, - y, - thresholds, - entropy, - distribution, - depth + 1, - max_depth, - n_classes, - True, - ) - - if len(right_idx) > 0: - self.children[1] = _TreeNode(random_state=self.random_state) - self.children[1].build_tree( - X[right_idx], - y[right_idx], - thresholds, - best_entropies[1], - best_distributions[1], - depth + 1, - max_depth, - n_classes, - False, - ) - else: - self.children[1] = 
_TreeNode(random_state=self.random_state) - self.children[1].build_tree( - X, - y, - thresholds, - entropy, - distribution, - depth + 1, - max_depth, - n_classes, - True, - ) - - if len(missing_idx) > 0: - self.children[2] = _TreeNode(random_state=self.random_state) - self.children[2].build_tree( - X[missing_idx], - y[missing_idx], - thresholds, - best_entropies[2], - best_distributions[2], - depth + 1, - max_depth, - n_classes, - False, - ) - else: - self.children[2] = _TreeNode(random_state=self.random_state) - self.children[2].build_tree( - X, - y, - thresholds, - entropy, - distribution, - depth + 1, - max_depth, - n_classes, - True, - ) - else: - self.leaf_distribution = distribution / np.sum(distribution) - - return self - - def predict_proba(self, X, n_classes): - if self.best_split > -1: - if X[self.best_split] <= self.best_threshold: - return self.children[0].predict_proba(X, n_classes) - elif X[self.best_split] > self.best_threshold: - return self.children[1].predict_proba(X, n_classes) - else: - return self.children[2].predict_proba(X, n_classes) - else: - return self.leaf_distribution - - @staticmethod - @njit(fastmath=True, cache=True) - def information_gain(X, y, attribute, threshold, parent_entropy, n_classes): - dist_left = np.zeros(n_classes) - dist_right = np.zeros(n_classes) - dist_missing = np.zeros(n_classes) - for i, case in enumerate(X): - if case[attribute] <= threshold: - dist_left[y[i]] += 1 - elif case[attribute] > threshold: - dist_right[y[i]] += 1 - else: - dist_missing[y[i]] += 1 - - sum_missing = 0 - for v in dist_missing: - sum_missing += v - sum_left = 0 - for v in dist_left: - sum_left += v - sum_right = 0 - for v in dist_right: - sum_right += v - - entropy_left = _entropy(dist_left, sum_left) - entropy_right = _entropy(dist_right, sum_right) - entropy_missing = _entropy(dist_missing, sum_missing) - - num_cases = X.shape[0] - info_gain = ( - parent_entropy - - sum_left / num_cases * entropy_left - - sum_right / num_cases * entropy_right - - sum_missing / num_cases * entropy_missing - ) - - return ( - info_gain, - [dist_left, dist_right, dist_missing], - [entropy_left, entropy_right, entropy_missing], - ) - - @staticmethod - @njit(fastmath=True, cache=True) - def margin_gain(X, attribute, threshold): - margins = np.abs(X[:, attribute] - threshold) - return np.min(margins) - - @staticmethod - @njit(fastmath=True, cache=True) - def split_data(X, best_split, best_threshold): - left_idx = np.zeros(len(X), dtype=np.int_) - left_count = 0 - right_idx = np.zeros(len(X), dtype=np.int_) - right_count = 0 - missing_idx = np.zeros(len(X), dtype=np.int_) - missing_count = 0 - for i, case in enumerate(X): - if case[best_split] <= best_threshold: - left_idx[left_count] = i - left_count += 1 - elif case[best_split] > best_threshold: - right_idx[right_count] = i - right_count += 1 - else: - missing_idx[missing_count] = i - missing_count += 1 - - return ( - left_idx[:left_count], - right_idx[:right_count], - missing_idx[:missing_count], - ) - - @staticmethod - @njit(fastmath=True, cache=True) - def remaining_classes(distribution): - remaining_classes = 0 - for d in distribution: - if d > 0: - remaining_classes += 1 - return remaining_classes > 1 - - -@njit(fastmath=True, cache=True) -def _entropy(x, s): - e = 0 - for i in x: - p = i / s if s > 0 else 0 - e += -(p * math.log(p) / 0.6931471805599453) if p > 0 else 0 - return e diff --git a/tsml/vector/_rotation_forest.py b/tsml/vector/_rotation_forest.py deleted file mode 100644 index 639083e..0000000 --- 
a/tsml/vector/_rotation_forest.py
+++ /dev/null
@@ -1,792 +0,0 @@
-"""A rotation forest (RotF) vector classifier.
-
-A Rotation Forest tsml implementation for continuous values only. Fits sklearn
-conventions.
-"""
-
-__author__ = ["MatthewMiddlehurst"]
-__all__ = ["RotationForestClassifier", "RotationForestRegressor"]
-
-import time
-from typing import List, Union
-
-import numpy as np
-import pandas as pd
-from joblib import Parallel
-from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
-from sklearn.decomposition import PCA
-from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
-from sklearn.utils import check_random_state
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.utils.parallel import delayed
-from sklearn.utils.validation import check_is_fitted
-
-from tsml.base import _clone_estimator
-from tsml.utils.validation import check_n_jobs
-
-
-class RotationForestClassifier(ClassifierMixin, BaseEstimator):
-    """A Rotation Forest (RotF) classifier.
-
-    Implementation of the Rotation Forest classifier described in Rodriguez et al
-    (2006) [1]. Builds a forest of trees built on random portions of the data
-    transformed using PCA.
-
-    Intended as a benchmark for time series data and a base classifier for
-    transformation-based approaches such as ShapeletTransformClassifier, this tsml
-    implementation only works with continuous attributes.
-
-    Parameters
-    ----------
-    n_estimators : int, default=200
-        Number of estimators to build for the ensemble.
-    min_group : int, default=3
-        The minimum size of an attribute subsample group.
-    max_group : int, default=3
-        The maximum size of an attribute subsample group.
-    remove_proportion : float, default=0.5
-        The proportion of cases to be removed per group.
-    base_estimator : BaseEstimator or None, default=None
-        Base estimator for the ensemble. By default, uses the sklearn
-        `DecisionTreeClassifier` using entropy as a splitting measure.
-    time_limit_in_minutes : int, default=0
-        Time contract to limit build time in minutes, overriding ``n_estimators``.
-        Default of `0` means ``n_estimators`` is used.
-    contract_max_n_estimators : int, default=500
-        Max number of estimators to build when ``time_limit_in_minutes`` is set.
-    save_transformed_data : bool, default=False
-        Save the data transformed in fit in ``transformed_data_``.
-    n_jobs : int, default=1
-        The number of jobs to run in parallel for both ``fit`` and ``predict``.
-        `-1` means using all processors.
-    random_state : int, RandomState instance or None, default=None
-        If `int`, random_state is the seed used by the random number generator;
-        If `RandomState` instance, random_state is the random number generator;
-        If `None`, the random number generator is the `RandomState` instance used
-        by `np.random`.
-
-    Attributes
-    ----------
-    n_instances_ : int
-        The number of train cases in the training set.
-    n_atts_ : int
-        The number of attributes in the training set.
-    n_classes_ : int
-        Number of classes. Extracted from the data.
-    classes_ : ndarray of shape (n_classes_)
-        Holds the label for each class.
-    class_dictionary_ : dict
-        A dictionary mapping class labels to class indices in classes_.
-    transformed_data_ : list of shape (n_estimators) of ndarray
-        The transformed training dataset for all classifiers. Only saved when
-        ``save_transformed_data`` is `True`.
-    estimators_ : list of shape (n_estimators) of BaseEstimator
-        The collections of estimators trained in fit.
- - Notes - ----- - For the Java version, see - `tsml `_. - - References - ---------- - .. [1] Rodriguez, Juan José, Ludmila I. Kuncheva, and Carlos J. Alonso. "Rotation - forest: A new classifier ensemble method." IEEE transactions on pattern analysis - and machine intelligence 28.10 (2006). - - .. [2] Bagnall, A., et al. "Is rotation forest the best classifier for problems - with continuous features?." arXiv preprint arXiv:1809.06705 (2018). - - Examples - -------- - >>> from tsml.vector import RotationForestClassifier - >>> from tsml.utils.testing import generate_2d_test_data - >>> X, y = generate_2d_test_data(n_samples=8, random_state=0) - >>> clf = RotationForestClassifier(random_state=0) - >>> clf.fit(X, y) - RotationForestClassifier(...) - >>> clf.predict(X) - array([0, 1, 0, 0, 0, 0, 0, 1]) - """ - - def __init__( - self, - n_estimators=200, - min_group=3, - max_group=3, - remove_proportion=0.5, - base_estimator=None, - time_limit_in_minutes=0.0, - contract_max_n_estimators=500, - save_transformed_data=False, - n_jobs=1, - random_state=None, - ): - self.n_estimators = n_estimators - self.min_group = min_group - self.max_group = max_group - self.remove_proportion = remove_proportion - self.base_estimator = base_estimator - self.time_limit_in_minutes = time_limit_in_minutes - self.contract_max_n_estimators = contract_max_n_estimators - self.save_transformed_data = save_transformed_data - self.n_jobs = n_jobs - self.random_state = random_state - - super().__init__() - - def fit(self, X: Union[np.ndarray, pd.DataFrame], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The training data. - y : 1D np.ndarray of shape (n_instances) - The class labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - - X, y = self._validate_data(X=X, y=y, ensure_min_samples=2, dtype=np.float32) - - check_classification_targets(y) - - self._n_jobs = check_n_jobs(self.n_jobs) - - self.n_instances_, self.n_atts_ = X.shape - self.classes_ = np.unique(y) - self.n_classes_ = self.classes_.shape[0] - self.class_dictionary_ = {} - for index, class_val in enumerate(self.classes_): - self.class_dictionary_[class_val] = index - - # escape if only one class seen - if self.n_classes_ == 1: - self._is_fitted = True - return self - - time_limit = self.time_limit_in_minutes * 60 - start_time = time.time() - train_time = 0 - - if self.base_estimator is None: - self._base_estimator = DecisionTreeClassifier(criterion="entropy") - - # remove useless attributes - self._useful_atts = ~np.all(X[1:] == X[:-1], axis=0) - X = X[:, self._useful_atts] - - self._n_atts = X.shape[1] - - # normalise attributes - self._min = X.min(axis=0) - self._ptp = X.max(axis=0) - self._min - X = (X - self._min) / self._ptp - - X_cls_split = [X[np.where(y == i)] for i in self.classes_] - - if time_limit > 0: - self._n_estimators = 0 - self.estimators_ = [] - self._pcas = [] - self._groups = [] - self.transformed_data_ = [] - - while ( - train_time < time_limit - and self._n_estimators < self.contract_max_n_estimators - ): - fit = Parallel(n_jobs=self._n_jobs)( - delayed(self._fit_estimator)( - X, - X_cls_split, - y, - i, - ) - for i in range(self._n_jobs) - ) - - estimators, pcas, groups, transformed_data = zip(*fit) - - self.estimators_ += estimators - self._pcas += pcas - self._groups += groups - self.transformed_data_ += transformed_data - - self._n_estimators += self._n_jobs - train_time = time.time() - start_time - else: - self._n_estimators = self.n_estimators - - fit = Parallel(n_jobs=self._n_jobs)( - delayed(self._fit_estimator)( - X, - X_cls_split, - y, - i, - ) - for i in range(self._n_estimators) - ) - - self.estimators_, self._pcas, self._groups, self.transformed_data_ = zip( - *fit - ) - - return self - - def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted class labels. - """ - return np.array( - [self.classes_[int(np.argmax(prob))] for prob in self.predict_proba(X)] - ) - - def predict_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: - """Predicts labels probabilities for sequences in X. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances, n_classes_) - Predicted probabilities using the ordering in classes_. - """ - check_is_fitted(self) - - # treat case of single class seen in fit - if self.n_classes_ == 1: - return np.repeat([[1]], X.shape[0], axis=0) - - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - - X = self._validate_data(X=X, reset=False, dtype=np.float32) - - # replace missing values with 0 and remove useless attributes - X = X[:, self._useful_atts] - - # normalise the data. 
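-        # (min-max scaling with the per-attribute minimum and range learned in fit)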
- X = (X - self._min) / self._ptp - - y_probas = Parallel(n_jobs=self._n_jobs)( - delayed(self._predict_proba_for_estimator)( - X, - self.estimators_[i], - self._pcas[i], - self._groups[i], - ) - for i in range(self._n_estimators) - ) - - output = np.sum(y_probas, axis=0) / ( - np.ones(self.n_classes_) * self._n_estimators - ) - return output - - def _fit_estimator(self, X, X_cls_split, y, idx): - rs = 255 if self.random_state == 0 else self.random_state - rs = ( - None - if self.random_state is None - else (rs * 37 * (idx + 1)) % np.iinfo(np.int32).max - ) - rng = check_random_state(rs) - - groups = _generate_groups(rng, self._n_atts, self.min_group, self.max_group) - pcas = [] - - # construct the slices to fit the PCAs too. - for group in groups: - classes = rng.choice( - range(self.n_classes_), - size=rng.randint(1, self.n_classes_ + 1), - replace=False, - ) - - # randomly add the classes with the randomly selected attributes. - X_t = np.zeros((0, len(group))) - for cls_idx in classes: - c = X_cls_split[cls_idx] - X_t = np.concatenate((X_t, c[:, group]), axis=0) - - sample_ind = rng.choice( - X_t.shape[0], - max(1, int(X_t.shape[0] * self.remove_proportion)), - replace=False, - ) - X_t = X_t[sample_ind] - - # try to fit the PCA if it fails, remake it, and add 10 random data - # instances. - while True: - # ignore err state on PCA because we account if it fails. - with np.errstate(divide="ignore", invalid="ignore"): - # differences between os occasionally. seems to happen when there - # are low amounts of cases in the fit - pca = PCA(random_state=rs).fit(X_t) - - if not np.isnan(pca.explained_variance_ratio_).all(): - break - X_t = np.concatenate( - (X_t, rng.random_sample((10, X_t.shape[1]))), axis=0 - ) - - pcas.append(pca) - - # merge all the pca_transformed data into one instance and build a classifier - # on it. 
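-        # each PCA was fit on a subsample of a single attribute group; applying
-        # them to the full data and stacking the outputs gives the rotated space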
-        X_t = np.concatenate(
-            [pcas[i].transform(X[:, group]) for i, group in enumerate(groups)], axis=1
-        )
-        X_t = X_t.astype(np.float32)
-        X_t = np.nan_to_num(
-            X_t, False, 0, np.finfo(np.float32).max, np.finfo(np.float32).min
-        )
-
-        tree = _clone_estimator(self._base_estimator, random_state=rs)
-        tree.fit(X_t, y)
-
-        return tree, pcas, groups, X_t if self.save_transformed_data else None
-
-    def _predict_proba_for_estimator(self, X, clf, pcas, groups):
-        X_t = np.concatenate(
-            [pcas[i].transform(X[:, group]) for i, group in enumerate(groups)], axis=1
-        )
-        X_t = X_t.astype(np.float32)
-        X_t = np.nan_to_num(
-            X_t, False, 0, np.finfo(np.float32).max, np.finfo(np.float32).min
-        )
-
-        probas = clf.predict_proba(X_t)
-
-        if probas.shape[1] != self.n_classes_:
-            new_probas = np.zeros((probas.shape[0], self.n_classes_))
-            for i, cls in enumerate(clf.classes_):
-                cls_idx = self.class_dictionary_[cls]
-                new_probas[:, cls_idx] = probas[:, i]
-            probas = new_probas
-
-        return probas
-
-    def _train_probas_for_estimator(self, y, idx):
-        rs = 255 if self.random_state == 0 else self.random_state
-        rs = (
-            None
-            if self.random_state is None
-            else (rs * 37 * (idx + 1)) % np.iinfo(np.int32).max
-        )
-        rng = check_random_state(rs)
-
-        indices = range(self.n_instances_)
-        subsample = rng.choice(self.n_instances_, size=self.n_instances_)
-        oob = [n for n in indices if n not in subsample]
-
-        results = np.zeros((self.n_instances_, self.n_classes_))
-        if len(oob) == 0:
-            return [results, oob]
-
-        clf = _clone_estimator(self._base_estimator, rs)
-        clf.fit(self.transformed_data_[idx][subsample], y[subsample])
-        probas = clf.predict_proba(self.transformed_data_[idx][oob])
-
-        if probas.shape[1] != self.n_classes_:
-            new_probas = np.zeros((probas.shape[0], self.n_classes_))
-            for i, cls in enumerate(clf.classes_):
-                cls_idx = self.class_dictionary_[cls]
-                new_probas[:, cls_idx] = probas[:, i]
-            probas = new_probas
-
-        for n, proba in enumerate(probas):
-            results[oob[n]] += proba
-
-        return [results, oob]
-
-    @classmethod
-    def get_test_params(
-        cls, parameter_set: Union[str, None] = None
-    ) -> Union[dict, List[dict]]:
-        """Return unit test parameter settings for the estimator.
-
-        Parameters
-        ----------
-        parameter_set : None or str, default=None
-            Name of the set of test parameters to return, for use in tests. If no
-            special parameters are defined for a value, will return `"default"` set.
-
-        Returns
-        -------
-        params : dict or list of dict
-            Parameters to create testing instances of the class.
-        """
-        return {"n_estimators": 2}
-
-
-class RotationForestRegressor(RegressorMixin, BaseEstimator):
-    """A Rotation Forest (RotF) regressor.
-
-    Implementation of the Rotation Forest regressor based on the classifier described
-    in Rodriguez et al (2006) [1]. Builds a forest of trees built on random portions
-    of the data transformed using PCA.
-
-    Intended as a benchmark for time series data and a base regressor for
-    transformation-based approaches, this tsml implementation only works with
-    continuous attributes. Compared to the classification version, the only
-    alterations are the base tree used and the removal of class subsampling.
-
-    Parameters
-    ----------
-    n_estimators : int, default=200
-        Number of estimators to build for the ensemble.
-    min_group : int, default=3
-        The minimum size of an attribute subsample group.
-    max_group : int, default=3
-        The maximum size of an attribute subsample group.
-    remove_proportion : float, default=0.5
-        The proportion of cases to be removed per group.
- base_estimator : BaseEstimator or None, default="None" - Base estimator for the ensemble. By default, uses the sklearn - `DecisionTreeRegressor` using squared error as a splitting measure. - time_limit_in_minutes : int, default=0 - Time contract to limit build time in minutes, overriding ``n_estimators``. - Default of `0` means ``n_estimators`` is used. - contract_max_n_estimators : int, default=500 - Max number of estimators to build when ``time_limit_in_minutes`` is set. - save_transformed_data : bool, default=False - Save the data transformed in fit in ``transformed_data_``. - n_jobs : int, default=1 - The number of jobs to run in parallel for both ``fit`` and ``predict``. - `-1` means using all processors. - random_state : int, RandomState instance or None, default=None - If `int`, random_state is the seed used by the random number generator; - If `RandomState` instance, random_state is the random number generator; - If `None`, the random number generator is the `RandomState` instance used - by `np.random`. - - Attributes - ---------- - n_instances_ : int - The number of train cases in the training set. - n_atts_ : int - The number of attributes in the training set. - transformed_data_ : list of shape (n_estimators) of ndarray - The transformed training dataset for all classifiers. Only saved when - ``save_transformed_data`` is `True`. - estimators_ : list of shape (n_estimators) of BaseEstimator - The collections of estimators trained in fit. - - References - ---------- - .. [1] Rodriguez, Juan José, Ludmila I. Kuncheva, and Carlos J. Alonso. "Rotation - forest: A new classifier ensemble method." IEEE transactions on pattern analysis - and machine intelligence 28.10 (2006). - - .. [2] Bagnall, A., et al. "Is rotation forest the best classifier for problems - with continuous features?." arXiv preprint arXiv:1809.06705 (2018). - - Examples - -------- - >>> from tsml.vector import RotationForestRegressor - >>> from tsml.utils.testing import generate_2d_test_data - >>> X, y = generate_2d_test_data(n_samples=8, regression_target=True, - ... random_state=0) - >>> reg = RotationForestRegressor(random_state=0) - >>> reg.fit(X, y) - RotationForestRegressor(...) - >>> reg.predict(X) - array([0.19658236, 1.36872518, 0.82099324, 0.09710128, 0.83794492, - 0.09609841, 0.97645944, 1.46865118]) - """ - - def __init__( - self, - n_estimators=200, - min_group=3, - max_group=3, - remove_proportion=0.5, - base_estimator=None, - time_limit_in_minutes=0.0, - contract_max_n_estimators=500, - save_transformed_data=False, - n_jobs=1, - random_state=None, - ): - self.n_estimators = n_estimators - self.min_group = min_group - self.max_group = max_group - self.remove_proportion = remove_proportion - self.base_estimator = base_estimator - self.time_limit_in_minutes = time_limit_in_minutes - self.contract_max_n_estimators = contract_max_n_estimators - self.save_transformed_data = save_transformed_data - self.n_jobs = n_jobs - self.random_state = random_state - - super().__init__() - - def fit(self, X: Union[np.ndarray, pd.DataFrame], y: np.ndarray) -> object: - """Fit the estimator to training data. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The training data. - y : 1D np.ndarray of shape (n_instances) - The target labels for fitting, indices correspond to instance indices in X - - Returns - ------- - self : - Reference to self. 
- """ - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - - X, y = self._validate_data( - X=X, y=y, ensure_min_samples=2, dtype=np.float32, y_numeric=True - ) - - self._n_jobs = check_n_jobs(self.n_jobs) - - self.n_instances_, self.n_atts_ = X.shape - - time_limit = self.time_limit_in_minutes * 60 - start_time = time.time() - train_time = 0 - - if self.base_estimator is None: - self._base_estimator = DecisionTreeRegressor(criterion="squared_error") - - # remove useless attributes - self._useful_atts = ~np.all(X[1:] == X[:-1], axis=0) - X = X[:, self._useful_atts] - - self._n_atts = X.shape[1] - - # normalise attributes - self._min = X.min(axis=0) - self._ptp = X.max(axis=0) - self._min - X = (X - self._min) / self._ptp - - if time_limit > 0: - self._n_estimators = 0 - self.estimators_ = [] - self._pcas = [] - self._groups = [] - self.transformed_data_ = [] - - while ( - train_time < time_limit - and self._n_estimators < self.contract_max_n_estimators - ): - fit = Parallel(n_jobs=self._n_jobs)( - delayed(self._fit_estimator)( - X, - y, - i, - ) - for i in range(self._n_jobs) - ) - - estimators, pcas, groups, transformed_data = zip(*fit) - - self.estimators_ += estimators - self._pcas += pcas - self._groups += groups - self.transformed_data_ += transformed_data - - self._n_estimators += self._n_jobs - train_time = time.time() - start_time - else: - self._n_estimators = self.n_estimators - - fit = Parallel(n_jobs=self._n_jobs)( - delayed(self._fit_estimator)( - X, - y, - i, - ) - for i in range(self._n_estimators) - ) - - self.estimators_, self._pcas, self._groups, self.transformed_data_ = zip( - *fit - ) - - return self - - def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: - """Predicts labels for sequences in X. - - Parameters - ---------- - X : 2d ndarray or DataFrame of shape (n_instances, n_atts) - The testing data. - - Returns - ------- - y : array-like of shape (n_instances) - Predicted target labels. - """ - check_is_fitted(self) - - if isinstance(X, np.ndarray) and len(X.shape) == 3 and X.shape[1] == 1: - X = np.reshape(X, (X.shape[0], -1)) - - X = self._validate_data(X=X, reset=False, dtype=np.float32) - - # replace missing values with 0 and remove useless attributes - X = X[:, self._useful_atts] - - # normalise the data. - X = (X - self._min) / self._ptp - - y_preds = Parallel(n_jobs=self._n_jobs)( - delayed(self._predict_for_estimator)( - X, - self.estimators_[i], - self._pcas[i], - self._groups[i], - ) - for i in range(self._n_estimators) - ) - - output = np.sum(y_preds, axis=0) / self._n_estimators - - return output - - def _fit_estimator(self, X, y, idx): - rs = 255 if self.random_state == 0 else self.random_state - rs = ( - None - if self.random_state is None - else (rs * 37 * (idx + 1)) % np.iinfo(np.int32).max - ) - rng = check_random_state(rs) - - groups = _generate_groups(rng, self._n_atts, self.min_group, self.max_group) - pcas = [] - - # construct the slices to fit the PCAs too. - for group in groups: - sample_ind = rng.choice( - X.shape[0], - max(1, int(X.shape[0] * self.remove_proportion)), - replace=False, - ) - X_t = X[sample_ind] - X_t = X_t[:, group] - - # try to fit the PCA if it fails, remake it, and add 10 random data - # instances. - while True: - # ignore err state on PCA because we account if it fails. - with np.errstate(divide="ignore", invalid="ignore"): - # differences between os occasionally. 
seems to happen when there - # are low amounts of cases in the fit - pca = PCA(random_state=rs).fit(X_t) - - if not np.isnan(pca.explained_variance_ratio_).all(): - break - X_t = np.concatenate( - (X_t, rng.random_sample((10, X_t.shape[1]))), axis=0 - ) - - pcas.append(pca) - - # merge all the pca_transformed data into one instance and build a classifier - # on it. - X_t = np.concatenate( - [pcas[i].transform(X[:, group]) for i, group in enumerate(groups)], axis=1 - ) - X_t = X_t.astype(np.float32) - X_t = np.nan_to_num( - X_t, False, 0, np.finfo(np.float32).max, np.finfo(np.float32).min - ) - - tree = _clone_estimator(self._base_estimator, random_state=rs) - tree.fit(X_t, y) - - return tree, pcas, groups, X_t if self.save_transformed_data else None - - def _predict_for_estimator(self, X, clf, pcas, groups): - X_t = np.concatenate( - [pcas[i].transform(X[:, group]) for i, group in enumerate(groups)], axis=1 - ) - X_t = X_t.astype(np.float32) - X_t = np.nan_to_num( - X_t, False, 0, np.finfo(np.float32).max, np.finfo(np.float32).min - ) - - return clf.predict(X_t) - - @classmethod - def get_test_params( - cls, parameter_set: Union[str, None] = None - ) -> Union[dict, List[dict]]: - """Return unit test parameter settings for the estimator. - - Parameters - ---------- - parameter_set : None or str, default=None - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - Returns - ------- - params : dict or list of dict - Parameters to create testing instances of the class. - """ - return {"n_estimators": 2} - - -def _generate_groups(rng, n_atts, min_group, max_group): - permutation = rng.permutation(np.arange(0, n_atts)) - - # select the size of each group. - group_size_count = np.zeros(max_group - min_group + 1) - n_attributes = 0 - n_groups = 0 - while n_attributes < n_atts: - n = rng.randint(group_size_count.shape[0]) - group_size_count[n] += 1 - n_attributes += min_group + n - n_groups += 1 - - groups = [] - current_attribute = 0 - current_size = 0 - for i in range(0, n_groups): - while group_size_count[current_size] == 0: - current_size += 1 - group_size_count[current_size] -= 1 - - n = min_group + current_size - groups.append(np.zeros(n, dtype=int)) - for k in range(0, n): - if current_attribute < permutation.shape[0]: - groups[i][k] = permutation[current_attribute] - else: - groups[i][k] = permutation[rng.randint(permutation.shape[0])] - current_attribute += 1 - - return groups diff --git a/tsml/vector/tests/__init__.py b/tsml/vector/tests/__init__.py deleted file mode 100644 index 8a187d0..0000000 --- a/tsml/vector/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""sklearn-like estimator test code.""" diff --git a/tsml/vector/tests/test_rotation_forest.py b/tsml/vector/tests/test_rotation_forest.py deleted file mode 100644 index 02316b0..0000000 --- a/tsml/vector/tests/test_rotation_forest.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Rotation Forest test code.""" - -__author__ = ["MatthewMiddlehurst"] - -import numpy as np - -from tsml.datasets import load_minimal_chinatown -from tsml.vector import RotationForestClassifier - - -def test_contracted_rotf(): - """Test of RotF contracting and train estimate on unit test data.""" - # load unit test data - X, y = load_minimal_chinatown(split="train") - X = np.reshape(X, (X.shape[0], -1)) - - rotf = RotationForestClassifier( - contract_max_n_estimators=5, - time_limit_in_minutes=0.25, - random_state=0, - ) - rotf.fit(X, y) - - assert len(rotf.estimators_) > 1 - 
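-    # with a 15 second contract and a cap of 5 estimators, the final ensemble
-    # size depends on hardware speed, so only a lower bound is asserted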
- # test train estimate - proba = rotf.predict_proba(X) - assert isinstance(proba, np.ndarray) - assert proba.shape == (len(X), 2) From cdb8d2cfc5b1a71f0d03a7fada206c71706dbb58 Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Sat, 11 Jan 2025 17:46:39 +0000 Subject: [PATCH 4/6] fixes --- tsml/compose/_channel_ensemble.py | 26 ++++---- tsml/compose/tests/test_channel_ensemble.py | 26 +++----- .../tests/test_interval_extraction.py | 66 ------------------- .../transformations/tests/test_periodogram.py | 20 ------ tsml/utils/testing.py | 11 ---- 5 files changed, 22 insertions(+), 127 deletions(-) delete mode 100644 tsml/transformations/tests/test_interval_extraction.py delete mode 100644 tsml/transformations/tests/test_periodogram.py diff --git a/tsml/compose/_channel_ensemble.py b/tsml/compose/_channel_ensemble.py index 1c2bfb7..847c2d3 100644 --- a/tsml/compose/_channel_ensemble.py +++ b/tsml/compose/_channel_ensemble.py @@ -204,17 +204,17 @@ class ChannelEnsembleClassifier(ClassifierMixin, _BaseChannelEnsemble): Examples -------- >>> from tsml.compose import ChannelEnsembleClassifier - >>> from tsml.interval_based import IntervalForestClassifier + >>> from tsml.dummy import DummyClassifier >>> from tsml.utils.testing import generate_3d_test_data >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, random_state=0) >>> reg = ChannelEnsembleClassifier( - ... estimators=("tsf", IntervalForestClassifier(n_estimators=2), "all-split"), + ... estimators=("d", DummyClassifier(), "all-split"), ... random_state=0, ... ) >>> reg.fit(X, y) ChannelEnsembleClassifier(...) >>> reg.predict(X) - array([0, 1, 1, 0, 0, 1, 0, 1]) + array([0, 0, 0, 0, 0, 0, 0, 0]) """ def __init__(self, estimators, remainder="drop", random_state=None): @@ -349,12 +349,12 @@ def get_test_params( params : dict or list of dict Parameters to create testing instances of the class. """ - from tsml.interval_based import IntervalForestClassifier + from tsml.dummy import DummyClassifier return { "estimators": [ - ("tsf1", IntervalForestClassifier(n_estimators=2), 0), - ("tsf2", IntervalForestClassifier(n_estimators=2), 0), + ("d1", DummyClassifier(), 0), + ("d2", DummyClassifier(), 0), ] } @@ -411,19 +411,19 @@ class ChannelEnsembleRegressor(RegressorMixin, _BaseChannelEnsemble): Examples -------- >>> from tsml.compose import ChannelEnsembleRegressor - >>> from tsml.interval_based import IntervalForestRegressor + >>> from tsml.dummy import DummyRegressor >>> from tsml.utils.testing import generate_3d_test_data >>> X, y = generate_3d_test_data(n_samples=8, series_length=10, ... regression_target=True, random_state=0) >>> reg = ChannelEnsembleRegressor( - ... estimators=("tsf", IntervalForestRegressor(n_estimators=2), "all-split"), + ... estimators=("d", DummyRegressor(), "all-split"), ... random_state=0, ... ) >>> reg.fit(X, y) ChannelEnsembleRegressor(...) >>> reg.predict(X) - array([0.31798318, 1.41426301, 1.06414747, 0.6924721 , 0.56660146, - 1.26538944, 0.52324808, 1.0939405 ]) + array([0.8672557, 0.8672557, 0.8672557, 0.8672557, 0.8672557, 0.8672557, + 0.8672557, 0.8672557], dtype=float32) """ def __init__(self, estimators, remainder="drop", random_state=None): @@ -518,12 +518,12 @@ def get_test_params( params : dict or list of dict Parameters to create testing instances of the class. 
""" - from tsml.interval_based import IntervalForestRegressor + from tsml.dummy import DummyRegressor return { "estimators": [ - ("tsf1", IntervalForestRegressor(n_estimators=2), 0), - ("tsf2", IntervalForestRegressor(n_estimators=2), 0), + ("d1", DummyRegressor(), 0), + ("d2", DummyRegressor(), 0), ] } diff --git a/tsml/compose/tests/test_channel_ensemble.py b/tsml/compose/tests/test_channel_ensemble.py index 2232c75..6f082cb 100644 --- a/tsml/compose/tests/test_channel_ensemble.py +++ b/tsml/compose/tests/test_channel_ensemble.py @@ -9,7 +9,7 @@ _check_key_type, _get_channel, ) -from tsml.interval_based import IntervalForestClassifier, IntervalForestRegressor +from tsml.dummy import DummyClassifier, DummyRegressor from tsml.utils.testing import generate_3d_test_data, generate_unequal_test_data @@ -17,17 +17,13 @@ def test_single_estimator(): """Test that a single estimator is correctly applied to all channels.""" X, y = generate_3d_test_data(n_channels=3) - ens = ChannelEnsembleClassifier( - estimators=[("tsf", IntervalForestClassifier(n_estimators=2), "all")] - ) + ens = ChannelEnsembleClassifier(estimators=[("d", DummyClassifier(), "all")]) ens.fit(X, y) assert len(ens.estimators_[0][2]) == 3 assert ens.predict(X).shape == (X.shape[0],) - ens = ChannelEnsembleRegressor( - estimators=[("tsf", IntervalForestRegressor(n_estimators=2), "all")] - ) + ens = ChannelEnsembleRegressor(estimators=[("d", DummyRegressor(), "all")]) ens.fit(X, y) assert len(ens.estimators_[0][2]) == 3 @@ -38,18 +34,14 @@ def test_single_estimator_split(): """Test that a single split estimator correctly creates an estimator per channel.""" X, y = generate_3d_test_data(n_channels=3) - ens = ChannelEnsembleClassifier( - estimators=("tsf", IntervalForestClassifier(n_estimators=2), "all-split") - ) + ens = ChannelEnsembleClassifier(estimators=("d", DummyClassifier(), "all-split")) ens.fit(X, y) assert len(ens.estimators_) == 3 assert isinstance(ens.estimators_[0][2], int) assert ens.predict(X).shape == (X.shape[0],) - ens = ChannelEnsembleRegressor( - estimators=("tsf", IntervalForestRegressor(n_estimators=2), "all-split") - ) + ens = ChannelEnsembleRegressor(estimators=("d", DummyRegressor(), "all-split")) ens.fit(X, y) assert len(ens.estimators_) == 3 @@ -62,8 +54,8 @@ def test_remainder(): X, y = generate_3d_test_data(n_channels=3) ens = ChannelEnsembleClassifier( - estimators=[("tsf", IntervalForestClassifier(n_estimators=2), 0)], - remainder=IntervalForestClassifier(n_estimators=2), + estimators=[("d", DummyClassifier(), 0)], + remainder=DummyClassifier(), ) ens.fit(X, y) @@ -71,8 +63,8 @@ def test_remainder(): assert ens.predict(X).shape == (X.shape[0],) ens = ChannelEnsembleRegressor( - estimators=[("tsf", IntervalForestRegressor(n_estimators=2), 0)], - remainder=IntervalForestRegressor(n_estimators=2), + estimators=[("d", DummyRegressor(), 0)], + remainder=DummyRegressor(), ) ens.fit(X, y) diff --git a/tsml/transformations/tests/test_interval_extraction.py b/tsml/transformations/tests/test_interval_extraction.py deleted file mode 100644 index 147d4df..0000000 --- a/tsml/transformations/tests/test_interval_extraction.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Interval extraction test code.""" - -import pytest - -from tsml.transformations import ( - Catch22Transformer, - RandomIntervalTransformer, - SevenNumberSummaryTransformer, - SupervisedIntervalTransformer, -) -from tsml.utils.numba_functions.stats import row_mean, row_median -from tsml.utils.testing import generate_3d_test_data -from tsml.utils.validation import 
_check_optional_dependency - - -def test_interval_prune(): - """Test RandomIntervalTransformer duplicate pruning.""" - X, y = generate_3d_test_data(random_state=0, n_channels=2, series_length=10) - - rit = RandomIntervalTransformer( - features=[row_mean, row_median], - n_intervals=10, - random_state=0, - ) - X_t = rit.fit_transform(X, y) - - assert X_t.shape == (10, 16) - assert rit.transform(X).shape == (10, 16) - - -def test_random_interval_transformer(): - """Test RandomIntervalTransformer.""" - X, y = generate_3d_test_data(random_state=0, n_channels=2, series_length=10) - - rit = RandomIntervalTransformer( - features=SevenNumberSummaryTransformer(), - n_intervals=5, - random_state=2, - ) - X_t = rit.fit_transform(X, y) - - assert X_t.shape == (10, 35) - assert rit.transform(X).shape == (10, 35) - - -@pytest.mark.skipif( - not _check_optional_dependency("pycatch22", "pycatch22", None, raise_error=False), - reason="pycatch22 not installed", -) -def test_supervised_transformers(): - """Test SupervisedIntervalTransformer.""" - X, y = generate_3d_test_data(random_state=0) - - sit = SupervisedIntervalTransformer( - features=[ - Catch22Transformer( - features=["DN_HistogramMode_5", "SB_BinaryStats_mean_longstretch1"] - ), - row_mean, - ], - n_intervals=2, - random_state=0, - ) - X_t = sit.fit_transform(X, y) - - assert X_t.shape == (X.shape[0], 8) diff --git a/tsml/transformations/tests/test_periodogram.py b/tsml/transformations/tests/test_periodogram.py deleted file mode 100644 index 1316cde..0000000 --- a/tsml/transformations/tests/test_periodogram.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest -from numpy.testing import assert_array_almost_equal - -from tsml.transformations import PeriodogramTransformer -from tsml.utils.testing import generate_3d_test_data -from tsml.utils.validation import _check_optional_dependency - - -@pytest.mark.skipif( - not _check_optional_dependency("pyfftw", "pyfftw", "PeriodogramTransformer", False), - reason="Only run on overnights because of intermittent fail for read/write.", -) -def test_periodogram_same_output(): - """Test that the output is the same using pyfftw and not.""" - X, y = generate_3d_test_data() - - p1 = PeriodogramTransformer() - p2 = PeriodogramTransformer(use_pyfftw=False) - - assert_array_almost_equal(p1.fit_transform(X), p2.fit_transform(X)) diff --git a/tsml/utils/testing.py b/tsml/utils/testing.py index 7e0c98e..4705b11 100644 --- a/tsml/utils/testing.py +++ b/tsml/utils/testing.py @@ -96,17 +96,6 @@ def parametrize_with_checks(estimators: List[BaseEstimator]) -> Callable: See Also -------- check_estimator : Check if estimator adheres to tsml or scikit-learn conventions. - - Examples - -------- - >>> from tsml.utils.testing import parametrize_with_checks - >>> from tsml.interval_based import IntervalForestRegressor - >>> from tsml.vector import RotationForestClassifier - >>> @parametrize_with_checks( - ... [IntervalForestRegressor(), RotationForestClassifier()] - ... ) - ... def test_tsml_compatible_estimator(estimator, check): - ... 
check(estimator) """ import pytest From e360b5f449f7622dc47c5f0fda0ab63571b61c7b Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Sat, 11 Jan 2025 18:11:47 +0000 Subject: [PATCH 5/6] temp comment scikit-fda --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9ade4e7..da87e90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ [project.optional-dependencies] all_extras = [ "grailts", - "scikit-fda>=0.7.0", + # "scikit-fda>=0.7.0", "wildboar", ] unstable_extras = [ From 461928430b658e5d398b95e0e3c7ce0f3438ca9a Mon Sep 17 00:00:00 2001 From: MatthewMiddlehurst Date: Sat, 11 Jan 2025 18:21:37 +0000 Subject: [PATCH 6/6] version --- README.md | 2 +- tsml/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7609c25..f5baf03 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Please see [`tsml_eval`](https://github.com/time-series-machine-learning/tsml-ev is more of a sandbox for testing out new ideas and algorithms. It may contain some algorithms and implementations that are not available in the other toolkits. -The current release of `tsml` is v0.5.0. +The current release of `tsml` is v0.6.0. ## Installation diff --git a/tsml/__init__.py b/tsml/__init__.py index 90b5545..af5858d 100644 --- a/tsml/__init__.py +++ b/tsml/__init__.py @@ -1,3 +1,3 @@ """tsml.""" -__version__ = "0.5.0" +__version__ = "0.6.0"
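
For reference, the rotation step at the heart of the `RotationForestClassifier`/`RotationForestRegressor` estimators removed above can be sketched in a few lines of NumPy and scikit-learn. This is a simplified, hypothetical illustration (fixed equal-sized groups, no class or instance subsampling, no contracting), not the removed implementation:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 9))  # 20 cases, 9 continuous attributes

# split a random permutation of the attributes into three groups of three,
# mirroring the min_group=3 / max_group=3 defaults
groups = np.split(rng.permutation(9), 3)

# fit one PCA per attribute group, then rotate the full data group by group
pcas = [PCA().fit(X[:, g]) for g in groups]
X_rot = np.concatenate(
    [p.transform(X[:, g]) for p, g in zip(pcas, groups)], axis=1
)

assert X_rot.shape == X.shape  # the rotation preserves dimensionality
# a decision tree fit on X_rot would form one ensemble member
```

Fitting a tree on `X_rot` and repeating with fresh random groups (and, in the removed code, subsampled cases) per member is what gives the ensemble its diversity.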