diff --git a/.coverage b/.coverage index f32d8f4..4e3173d 100644 Binary files a/.coverage and b/.coverage differ diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index e170743..e942fc3 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -16,7 +16,7 @@ on: jobs: run_pytest: name: Run tests on min and max Python versions - runs-on: self-hosted + runs-on: ubuntu-latest strategy: fail-fast: true matrix: @@ -61,7 +61,7 @@ jobs: build_sdist: name: Build source distribution - runs-on: self-hosted + runs-on: ubuntu-latest needs: run_pytest steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml index cfce0bb..873987d 100644 --- a/.github/workflows/pr_build.yml +++ b/.github/workflows/pr_build.yml @@ -17,7 +17,7 @@ jobs: build_test_sdist: name: Test source distribution - runs-on: self-hosted + runs-on: ubuntu-latest needs: run_pytest strategy: fail-fast: true diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 811dfd1..243c494 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,7 +7,7 @@ on: jobs: pytest: name: Run pytest - runs-on: self-hosted + runs-on: ubuntu-latest strategy: fail-fast: true matrix: diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a0e668..3002826 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,33 @@ # Changelog +All notable changes to this project will be documented in this file. + +## [1.4.0] - 2025-06-19 + +### Contributors + +- [@quentinhaenn](Quentin Haenn) - Main developer and maintainer + +### Added + +- Added support for custom MDS solvers in the `RadiusClustering` class. +- Updated the documentation to include examples of using custom MDS solvers. +- Added more examples and tutorials to the documentation. + +### Changed + +- Improved documentation and examples for the `RadiusClustering` class. 
+- Updated the README to reflect the new features and improvements in version 1.4.0. +- Updated the test cases to ensure compatibility with the new features. +- Refactored the main codebase to improve readability and maintainability. +- Prepared the codebase for future additions of MDS solvers and/or clustering algorithms. + ## [1.3.0] - 2025-06-18 +### Contributors + +- [@quentinhaenn](Quentin Haenn) - Main developer and maintainer + ### Added - Full test coverage for the entire codebase. @@ -17,3 +43,21 @@ - Updated all the attributes in the `RadiusClustering` class to fit `scikit-learn` standards and conventions. - Updated the tests cases to reflect the changes in the `RadiusClustering` class. - Updated README and documentation to reflect the new `radius` parameter and the deprecation of `threshold`. + +## [1.2.0] - 2024-10 + +### Contributors + +- [@quentinhaenn](Quentin Haenn) - Main developer and maintainer +- [@mickaelbaron](Mickaël Baron) - Contributor and maintainer + +### Added + +- Added CI/CD pipelines with GitHub Actions for automated testing and deployment. +- Added package metadata for better integration with PyPI. +- Added a badge for the GitHub Actions workflow status in the README. +- Added a badge for the Python version supported in the README. +- Added a badge for the code style (Ruff) in the README. +- Added a badge for the license in the README. +- Added CI/CD pipelines for PyPI deployment (including test coverage, compiling extensions and wheels, and uploading to PyPI). +- Resolved issues with compiling Cython extensions on Windows and macOS. 
diff --git a/README.md b/README.md index b389e50..2b1b09e 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,24 @@ Radius clustering is a Python package that implements clustering under radius co - Compatible with scikit-learn's API for clustering algorithms - Supports radius-constrained clustering - Provides options for exact and approximate solutions +- Easy to use and integrate with existing Python data science workflows +- Includes comprehensive documentation and examples +- Full test coverage to ensure reliability and correctness +- Supports custom MDS solvers for flexibility in clustering approaches +- Provides a user-friendly interface for clustering tasks + +> [!CAUTION] +> **Deprecation Notice**: The `threshold` parameter in the `RadiusClustering` class has been deprecated. Please use the `radius` parameter instead for specifying the radius for clustering. It is planned to be completely removed in version 2.0.0. The `radius` parameter is now the standard way to define the radius for clustering, aligning with our objective of making the parameters' name more intuitive and user-friendly. + +> [!NOTE] +> **NEW VERSIONS**: The package is currently under active development for new features and improvements, including some refactoring and enhancements to the existing codebase. Backwards compatibility is not guaranteed, so please check the [CHANGELOG](CHANGELOG.md) for details on changes and updates. 
+ +## Roadmap + +- [x] Version 1.4.0: + - [x] Add support for custom MDS solvers + - [x] Improve documentation and examples + - [x] Add more examples and tutorials ## Installation @@ -38,7 +56,7 @@ from radius_clustering import RadiusClustering X = np.random.rand(100, 2) # Generate random data # Create an instance of MdsClustering -rad_clustering = RadiusClustering(manner="approx", threshold=0.5) +rad_clustering = RadiusClustering(manner="approx", radius=0.5) # Fit the model to the data rad_clustering.fit(X) @@ -109,5 +127,4 @@ The Radius Clustering work has been funded by: - [1] [An iterated greedy algorithm for finding the minimum dominating set in graphs](https://www.sciencedirect.com/science/article/pii/S0378475422005055) - [2] [An exact algorithm for the minimum dominating set problem](https://dl.acm.org/doi/abs/10.24963/ijcai.2023/622) - - +- [3] [Clustering under radius constraint using minimum dominating set](https://link.springer.com/chapter/10.1007/978-3-031-62700-2_2) diff --git a/docs/source/api.rst b/docs/source/api.rst index c38ac4c..e715c46 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -1,7 +1,19 @@ API Reference ============= -.. automodule:: radius_clustering +This page documents the implementation details of the `radius_clustering` package. + +RadiusClustering Class +---------------------- + +.. autoclass:: radius_clustering.RadiusClustering + :members: + :undoc-members: + :show-inheritance: + +Algorithms Module +----------------- +.. automodule:: radius_clustering.algorithms :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 1826840..f340b98 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -1,7 +1,20 @@ Usage ===== -Here's a basic example of how to use Radius Clustering: +This page provides a quick guide on how to use the `radius_clustering` package for clustering tasks. 
The package provides a simple interface for performing radius-based clustering on datasets based on the Minimum Dominating Set (MDS) algorithm. + +This page is divided into three main sections: +1. **Basic Usage**: A quick example of how to use the `RadiusClustering` class and perform clustering with several parameters. +2. **Custom Dissimilarity Function**: How to use a custom dissimilarity function with the `RadiusClustering` class. +3. **Custom MDS Solver**: How to implement a custom MDS solver for more advanced clustering tasks, possibly with fewer guarantees on the results. + + +Basic Usage +----------------- + +The `RadiusClustering` class provides a straightforward way to perform clustering based on a specified radius. You can choose between an approximate and an exact method for clustering, depending on your needs. + +Here's a basic example of how to use Radius Clustering with the `RadiusClustering` class, using the approximate method: .. code-block:: python @@ -22,4 +35,97 @@ Here's a basic example of how to use Radius Clustering: # Get cluster labels labels = rad.labels_ - print(labels) \ No newline at end of file + print(labels) + +Similarly, you can use the exact method by changing the `manner` parameter to `"exact"`: +.. code-block:: python + # [...] Exact same code as above + rad = RadiusClustering(manner="exact", radius=0.5) #change this parameter + # [...] Exact same code as above + +Custom Dissimilarity Function +----------------------------- + +The main reason behind the `radius_clustering` package is that users may need to use a dissimilarity function that is not a metric (or distance) function. Plus, sometimes context requires a domain-specific dissimilarity function that is not provided by default, and needs to be implemented by the user. + +To use a custom dissimilarity function, you can pass it as a parameter to the `RadiusClustering` class. Here's an example of how to do this: +.. 
code-block:: python + + from radius_clustering import RadiusClustering + import numpy as np + + # Generate random data + X = np.random.rand(100, 2) + + # Define a custom dissimilarity function + def dummy_dissimilarity(x, y): + return np.linalg.norm(x - y) + 0.1 # Example: add a constant to the distance + + # Create an instance of RadiusClustering with the custom dissimilarity function + rad = RadiusClustering(manner="approx", radius=0.5, metric=dummy_dissimilarity) + + # Fit the model to the data + rad.fit(X) + + # Get cluster labels + labels = rad.labels_ + + print(labels) + + +.. note:: + The custom dissimilarity function will be passed to scikit-learn's `pairwise_distances` function, so it should be compatible with the expected input format and return type. See the scikit-learn documentation for more details on how to implement custom metrics. + +Custom MDS Solver +----------------- + +The two default solvers provided by the current implementation of the `radius_clustering` package are focused on exactness (or proximity to exactness) of the results of an NP-hard problem. So, they may not be suitable for all use cases, especially when performance is a concern. +If you have your own implementation of a Minimum Dominating Set (MDS) solver, you can use it with the `RadiusClustering` class by using the :py:func:`RadiusClustering.set_solver` method. It will check that the solver is compatible with the expected input format and return type, and will use it to perform clustering. + +.. versionadded:: 1.4.0 + The :py:func:`RadiusClustering.set_solver` method was added to allow users to set a custom MDS solver. + It is *NOT* backward compatible with previous versions of the package, as it comes with new structure and methods to handle custom solvers. + +Here's an example of how to implement a custom MDS solver and use it with the `RadiusClustering` class, using the NetworkX implementation of the dominating set problem: + +.. 
code-block:: python + + from radius_clustering import RadiusClustering + import time + import numpy as np + import networkx as nx + + # Generate random data + X = np.random.rand(100, 2) + + # Define a custom MDS solver using NetworkX + def custom_mds_solver(n, edges, nb_edges, random_state=None): + start = time.time() + graph = nx.Graph(edges) + centers = list(nx.algorithms.dominating_set(graph)) + centers.sort() + end = time.time() + return centers, end - start + + # Create an instance of MdsClustering with the custom MDS solver + rad = RadiusClustering(manner="approx", radius=0.5) + rad.set_solver(custom_mds_solver) + + # Fit the model to the data + rad.fit(X) + + # Get cluster labels + labels = rad.labels_ + + print(labels) + +.. note:: + The custom MDS solver should accept the same parameters as the default solvers, including the number of points `n`, the edges of the graph `edges`, the number of edges `nb_edges`, and an optional `random_state` parameter for reproducibility. It should return a list of centers and the time taken to compute them. + The `set_solver` method will check that the custom solver is compatible with the expected input format and return type, and will use it to perform clustering. + If the custom solver is not compatible, it will raise a `ValueError` with a descriptive message. + +.. attention:: + We cannot guarantee that the custom MDS solver will produce the same results as the default solvers, especially if it is not purposely designed to solve the Minimum Dominating Set problem but rather just finds a dominating set. The results may vary depending on the implementation and the specific characteristics of the dataset. 
+ As an example, a benchmark of our solutions and a custom one using NetworkX is available in the `Example Gallery` section of the documentation, which shows that the custom solver may produce different results than the default solvers, especially in terms of the number of clusters and the time taken to compute them (see :ref:`sphx_glr_auto_examples_plot_benchmark_custom.py`). + However, it can be useful for specific use cases where performance is a concern or when you have a custom implementation that fits your needs better. + diff --git a/examples/plot_benchmark_custom.py b/examples/plot_benchmark_custom.py new file mode 100644 index 0000000..866825d --- /dev/null +++ b/examples/plot_benchmark_custom.py @@ -0,0 +1,230 @@ +""" +===================================================================================== +Benchmark of Radius Clustering using multiple datasets and comparison with custom MDS +===================================================================================== + +This example demonstrates how to implement a custom solver for the MDS problem +and use it within the Radius Clustering framework. +Plus, it compares the results of a naive implementation using the +`NetworkX` library with the Radius Clustering implementation. + +The example includes: + 1. Defining the custom MDS solver. + 2. Defining datasets to test the clustering. + 3. Applying Radius clustering on the datasets using the custom MDS solver. + 4. Ensure this solution works. + 5. Establish a benchmark procedure to compare the Radius clustering with a naive implementation using `NetworkX`. + 6. Comparing the results in terms of : + - Execution time + - Number of cluster found + 7. Visualizing the benchmark results. + 8. Visualizing the clustering results. + +This example is useful for understanding how to implement a custom MDS solver +and how to perform an advanced usage of the package. 
+""" +# Author: Haenn Quentin +# SPDX-License-Identifier: MIT + +# %% +# Import necessary libraries +# -------------------------- +# +# Since this example is a benchmark, we need to import the necessary libraries +# to perform the benchmark, including `NetworkX` for the naive implementation, +# `matplotlib` for visualization, and `sklearn` for the datasets. + + +import networkx as nx +import numpy as np +import matplotlib.pyplot as plt +import time +import warnings + +from sklearn.datasets import fetch_openml +from radius_clustering import RadiusClustering +from sklearn.metrics import pairwise_distances_argmin + +warnings.filterwarnings("ignore", category=RuntimeWarning, module="sklearn") +# %% +# Define a custom MDS solver +# -------------------------- +# +# We define a custom MDS solver that uses the `NetworkX` library to compute the MDS. +# Note the signature of the function is identical to the one used in the `RadiusClustering` class. + + +def custom_solver(n: int, edges: np.ndarray, nb_edges: int, random_state=None): + """ + Custom MDS solver using NetworkX to compute the MDS problem. + + Parameters: + ----------- + n : int + The number of points in the dataset. + edges : np.ndarray + The edges of the graph, flattened into a 1D array. + nb_edges : int + The number of edges in the graph. + random_state : int | None + The random state to use for reproducibility. + + Returns: + -------- + centers : list + A sorted list of the centers of the clusters. + mds_exec_time : float + The execution time of the MDS algorithm in seconds. + """ + G = nx.Graph() + G.add_edges_from(edges) + + start_time = time.time() + centers = list(nx.algorithms.dominating.dominating_set(G)) + mds_exec_time = time.time() - start_time + + centers = sorted(centers) + + return centers, mds_exec_time + + +# %% +# Define datasets to test the clustering +# -------------------------------------- +# +# We will use 4 datasets to test the clustering: +# 1. Iris dataset +# 2. Wine dataset +# 3. 
Breast Cancer dataset (WDBC) +# 4. Vehicle dataset +# These are common datasets used in machine learning and lead to pretty fast results. +# Structure of the variable `DATASETS`: +# - The key is the name of the dataset. +# - The value is a tuple containing: +# - The dataset fetched from OpenML. +# - The radius to use for the Radius clustering. (determined in literature, see references on home page) +# + + +DATASETS = { + "iris": (fetch_openml(name="iris", version=1, as_frame=False), 1.43), + "wine": (fetch_openml(name="wine", version=1, as_frame=False), 232.09), + "glass": (fetch_openml(name="glass", version=1, as_frame=False), 3.94), + "ionosphere": (fetch_openml(name="ionosphere", version=1, as_frame=False), 5.46), + "breast_cancer": (fetch_openml(name="wdbc", version=1, as_frame=False), 1197.42), + "synthetic": (fetch_openml(name="synthetic_control", version=1, as_frame=False), 70.12), + "vehicle": (fetch_openml(name="vehicle", version=1, as_frame=False), 155.05), + "yeast": (fetch_openml(name="yeast", version=1, as_frame=False), 0.4235), +} + +# %% +# Define the benchmark procedure +# -------------------------------------- +# +# We define a function to perform the benchmark on the datasets. +# The procedure is as follows: +# 1. Creates an instance of RadiusClustering for each solver. +# 2. For each instance, fit the algorithm on each dataset. +# 3. Store the execution time and the number of clusters found for each dataset. +# 4. Return the results as a dictionary. 
+ + +def benchmark_radius_clustering(): + results = {} + exact = RadiusClustering(manner="exact", radius=1.43) + approx = RadiusClustering(manner="approx", radius=1.43) + custom = RadiusClustering( + manner="custom", radius=1.43 + ) + custom.set_solver(custom_solver) # Set the custom solver + algorithms = [exact, approx, custom] + # Loop through each algorithm and dataset + for algo in algorithms: + algo_results = {} + time_algo = [] + clusters_algo = [] + # Loop through each dataset + for name, (dataset, radius) in DATASETS.items(): + X = dataset.data + # set the radius for the dataset considered + algo.radius = radius + # Fit the algorithm (perf_counter: monotonic, high-resolution clock for intervals) + t0 = time.perf_counter() + algo.fit(X) + t_algo = time.perf_counter() - t0 + + # Store the results + time_algo.append(t_algo) + clusters_algo.append(len(algo.centers_)) + algo_results["time"] = time_algo + algo_results["clusters"] = clusters_algo + results[algo.manner] = algo_results + + return results + + +# %% +# Run the benchmark and plot the results +# -------------------------------------- +# We run the benchmark and plot the results for each dataset. 
+ + +results = benchmark_radius_clustering() + +# Plot the results +fig, axs = plt.subplot_mosaic( + [ + ["time", "time", "time", "time"], + ["iris", "wine", "breast_cancer", "vehicle"], + ["glass", "ionosphere", "synthetic", "yeast"], + ], + layout="constrained", + figsize=(12, 8), +) +fig.suptitle("Benchmark of Radius Clustering Solvers", fontsize=16) + +axs['time'].set_yscale('log') # Use logarithmic scale for better visibility +for algo, algo_results in results.items(): + # Plot execution time + axs['time'].plot( + DATASETS.keys(), + algo_results["time"], + marker='o', + label=algo, + ) + # Plot number of clusters + +for i, (name, (dataset, _)) in enumerate(DATASETS.items()): + axs[name].bar( + results.keys(), + [results[algo]["clusters"][i] for algo in results.keys()], + label=name, + ) + axs[name].axhline( + y=len(set(dataset.target)), # Number of unique classes in the dataset + label="True number of clusters", + color='r', + linestyle='--', + ) + axs[name].set_title(name) + axs[name].set_xlabel("Algorithms") + +axs["iris"].set_ylabel("Number of clusters") +axs["glass"].set_ylabel("Number of clusters") + +axs['time'].set_title("Execution Time (log scale)") +axs['time'].set_xlabel("Datasets") +axs['time'].set_ylabel("Time (seconds)") +axs['time'].legend(title="Algorithms") +plt.tight_layout() +plt.show() + + +# %% +# Conclusion +# ---------- +# +# In this example, we implemented a custom MDS solver based on `NetworkX` and plugged it into `RadiusClustering` with `set_solver`. +# We benchmarked the exact, approximate and custom solvers on several OpenML datasets. +# We compared the execution time of each solver and the number of clusters it found on each dataset. +# The bar plots also show the number of unique classes of each dataset as a reference line. 
diff --git a/pyproject.toml b/pyproject.toml index 0ee2d8d..deb1997 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dev = [ ] doc = [ + "networkx>=3.3", "sphinx>=8.1.3", "sphinx_gallery>=0.18.0", "sphinx-copybutton>=0.5.2", diff --git a/src/radius_clustering/__init__.py b/src/radius_clustering/__init__.py index 9609e48..57c5d53 100644 --- a/src/radius_clustering/__init__.py +++ b/src/radius_clustering/__init__.py @@ -2,4 +2,4 @@ from .radius_clustering import RadiusClustering __all__ = ["RadiusClustering"] -__version__ = "1.3.0" +__version__ = "1.4.0" diff --git a/src/radius_clustering/algorithms.py b/src/radius_clustering/algorithms.py new file mode 100644 index 0000000..0e71165 --- /dev/null +++ b/src/radius_clustering/algorithms.py @@ -0,0 +1,112 @@ +""" +This module contains the implementation of the clustering algorithms. +It provides two main functions: `clustering_approx` and `clustering_exact`. + +These functions can be replaced in the `RadiusClustering` class +to perform clustering using another algorithm. + +.. versionadded:: 1.4.0 + Refactoring the structure of the code to separate the clustering algorithms. + This allows for easier maintenance and extensibility of the codebase. + +""" +from __future__ import annotations + +import numpy as np + +from .utils._mds_approx import solve_mds +from .utils._emos import py_emos_main + +def clustering_approx( + n: int, edges: np.ndarray, nb_edges: int, + random_state: int | None = None) -> tuple[list, float]: + """ + Perform approximate MDS clustering. + This method uses a pretty trick to set the seed for + the random state of the C++ code of the MDS solver. + + .. tip:: + The random state is used to ensure reproducibility of the results + when using the approximate method. + If `random_state` is None, a default value of 42 is used. + + .. important:: + The trick to set the random state is : + + 1. 
Use the `check_random_state` function to get a `RandomState` singleton + instance, set up with the provided `random_state`. + + 2. Use the `randint` method of the `RandomState` instance to generate a + random integer. + + 3. Use this random integer as the seed for the C++ code of the MDS solver. + + + This ensures that the seed passed to the C++ code is always an integer, + which is required by the MDS solver, and allows for + reproducibility of the results. + + .. note:: + This function uses the approximation method to solve the MDS problem. + See [casado]_ for more details. + + Parameters: + ----------- + n : int + The number of points in the dataset. + edges : np.ndarray + The edges of the graph, flattened into a 1D array. + nb_edges : int + The number of edges in the graph. + random_state : int | None + The random state to use for reproducibility. + If None, a default value of 42 is used. + Returns: + -------- + centers : list + A sorted list of the centers of the clusters. + mds_exec_time : float + The execution time of the MDS algorithm in seconds. + """ + result = solve_mds( + n, edges.flatten().astype(np.int32), nb_edges, random_state + ) + centers = sorted(result["solution_set"]) + mds_exec_time = result["Time"] + return centers, mds_exec_time + +def clustering_exact(n: int, edges: np.ndarray, nb_edges: int, seed: None = None) -> tuple[list, float]: + """ + Perform exact MDS clustering. + + This function uses the EMOS algorithm to solve the MDS problem. + + .. important:: + The EMOS algorithm is an exact algorithm for solving the MDS problem. + It is a branch and bound algorithm that uses graph theory tricks + to efficiently cut the search space. See [jiang]_ for more details. + + Parameters: + ----------- + n : int + The number of points in the dataset. + edges : np.ndarray + The edges of the graph, flattened into a 1D array. + nb_edges : int + The number of edges in the graph. 
+ seed : None + This parameter is not used in the exact method, but it is kept for + compatibility with the approximate method. + + Returns: + -------- + centers : list + A sorted list of the centers of the clusters. + mds_exec_time : float + The execution time of the MDS algorithm in seconds. + """ + centers, mds_exec_time = py_emos_main( + edges.flatten(), n, nb_edges + ) + centers.sort() + return centers, mds_exec_time \ No newline at end of file diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index 33d42c1..aa63fc0 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -18,8 +18,7 @@ from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_random_state, validate_data -from radius_clustering.utils._emos import py_emos_main -from radius_clustering.utils._mds_approx import solve_mds +from .algorithms import clustering_approx, clustering_exact DIR_PATH = os.path.dirname(os.path.realpath(__file__)) @@ -53,26 +52,38 @@ class RadiusClustering(ClusterMixin, BaseEstimator): .. note:: The `random_state_` attribute is not used when the `manner` is set to "exact". + + .. versionchanged:: 1.4.0 + The `RadiusClustering` class has been refactored. + Clustering algorithms are now separated into their own module + (`algorithms.py`) to improve maintainability and extensibility. + + .. versionadded:: 1.4.0 + The `set_solver` method was added to allow users to set a custom solver + for the MDS problem. This allows for flexibility in how the MDS problem is solved + and enables users to use their own implementations of MDS clustering algorithms. .. versionadded:: 1.3.0 - The *random_state* parameter was added to allow reproducibility in - the approximate method. + + - The *random_state* parameter was added to allow reproducibility in the approximate method. 
+ + - The `radius` parameter replaces the `threshold` parameter for setting the dissimilarity threshold for better clarity and consistency. .. versionchanged:: 1.3.0 All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`). This is particularly useful for compatibility with scikit-learn's API. - .. versionadded:: 1.3.0 - The `radius` parameter replaces the `threshold` parameter for setting - the dissimilarity threshold for better clarity and consistency. - .. deprecated:: 1.3.0 The `threshold` parameter is deprecated. Use `radius` instead. Will be removed in a future version. """ _estimator_type = "clusterer" + _algorithms = { + "exact": clustering_exact, + "approx": clustering_approx, + } def __init__( self, @@ -102,7 +113,7 @@ def _check_symmetric(self, a: np.ndarray, tol: float = 1e-8) -> bool: return False return np.allclose(a, a.T, atol=tol) - def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": + def fit(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean") -> "RadiusClustering": """ Fit the MDS clustering model to the input data. @@ -130,6 +141,35 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": y : Ignored Not used, present here for API consistency by convention. + metric : str | callable, optional (default="euclidean") + The metric to use when computing the distance matrix. + The default is "euclidean". + This should be a valid metric string from + `sklearn.metrics.pairwise_distances` or a callable that computes + the distance between two points. + + .. note:: + The metric parameter *MUST* be a valid metric string from + `sklearn.metrics.pairwise_distances` or a callable that computes + the distance between two points. + Valid metric strings include : + - "euclidean" + - "manhattan" + - "cosine" + - "minkowski" + - and many more supported by scikit-learn. + please refer to the + `sklearn.metrics.pairwise_distances` documentation for a full list. + + .. 
attention:: + If the input is a distance matrix, the metric parameter is ignored. + The distance matrix should be symmetric and square. + + .. warning:: + If the parameter is a callable, it should : + - Accept two 1D arrays as input. + - Return a single float value representing the distance between the two points. + Returns: -------- self : object @@ -157,10 +197,13 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": # Create dist and adj matrices if not self._check_symmetric(self.X_checked_): - dist_mat = pairwise_distances(self.X_checked_, metric="euclidean") + dist_mat = pairwise_distances(self.X_checked_, metric=metric) else: dist_mat = self.X_checked_ - + + if not self._check_symmetric(dist_mat): + raise ValueError("Input distance matrix must be symmetric. Got a non-symmetric matrix.") + self.dist_mat_ = dist_mat if not isinstance(self.radius, (float, int)): raise ValueError("Radius must be a positive float.") if self.radius <= 0: @@ -177,15 +220,14 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": np.uint32 ) # Edges in the adjacency matrix # uint32 is used to use less memory. Max number of features is 2^32-1 - self.dist_mat_ = dist_mat - + self.clusterer_ = self._algorithms.get(self.manner, self._algorithms["approx"]) self._clustering() self._compute_effective_radius() self._compute_labels() return self - def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: + def fit_predict(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean") -> np.ndarray: """ Fit the model and return the cluster labels. @@ -201,13 +243,18 @@ def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. + + metric : str | callable, optional (default="euclidean") + The metric to use when computing the distance matrix. + The default is "euclidean". 
+ Refer to the `fit` method for more details on valid metrics. Returns: -------- labels : array, shape (n_samples,) The cluster labels for each point in X. """ - self.fit(X) + self.fit(X, metric=metric) return self.labels_ def _clustering(self): @@ -215,76 +262,16 @@ def _clustering(self): Perform the clustering using either the exact or approximate MDS method. """ n = self.X_checked_.shape[0] - if self.manner != "exact" and self.manner != "approx": - print(f"Invalid manner: {self.manner}. Defaulting to 'approx'.") - raise ValueError("Invalid manner. Choose either 'exact' or 'approx'.") - if self.manner == "exact": - self._clustering_exact(n) + if self.manner not in self._algorithms: + raise ValueError(f"Invalid manner. Please choose in {list(self._algorithms.keys())}.") + if self.clusterer_ == clustering_approx: + if self.random_state is None: + self.random_state = 42 + self.random_state_ = check_random_state(self.random_state) + seed = self.random_state_.randint(np.iinfo(np.int32).max) else: - self._clustering_approx(n) - - def _clustering_exact(self, n: int) -> None: - """ - Perform exact MDS clustering. - - Parameters: - ----------- - n : int - The number of points in the dataset. - - Notes: - ------ - This function uses the EMOS algorithm to solve the MDS problem. - See: [jiang]_ for more details. - """ - self.centers_, self.mds_exec_time_ = py_emos_main( - self.edges_.flatten(), n, self.nb_edges_ - ) - self.centers_.sort() # Sort the centers to ensure consistent order - - def _clustering_approx(self, n: int) -> None: - """ - Perform approximate MDS clustering. - This method uses a pretty trick to set the seed for - the random state of the C++ code of the MDS solver. - - .. tip:: - The random state is used to ensure reproducibility of the results - when using the approximate method. - If `random_state` is None, a default value of 42 is used. - - .. important:: - :collapsible: closed - The trick to set the random state is : - 1. 
Use the `check_random_state` function to get a `RandomState`singleton - instance, set up with the provided `random_state`. - 2. Use the `randint` method of the `RandomState` instance to generate a - random integer. - 3. Use this random integer as the seed for the C++ code of the MDS solver. - - This ensures that the seed passed to the C++ code is always an integer, - which is required by the MDS solver, and allows for - reproducibility of the results. - - Parameters: - ----------- - n : int - The number of points in the dataset. - - Notes: - ------ - This function uses the approximation method to solve the MDS problem. - See [casado]_ for more details. - """ - if self.random_state is None: - self.random_state = 42 - self.random_state_ = check_random_state(self.random_state) - seed = self.random_state_.randint(np.iinfo(np.int32).max) - result = solve_mds( - n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed - ) - self.centers_ = sorted([x for x in result["solution_set"]]) - self.mds_exec_time_ = result["Time"] + seed = None + self.centers_, self.mds_exec_time_ = self.clusterer_(n, self.edges_, self.nb_edges_, seed) def _compute_effective_radius(self): """ @@ -304,3 +291,57 @@ def _compute_labels(self): min_dist = np.min(distances, axis=1) self.labels_[min_dist > self.radius] = -1 + + def set_solver(self, solver: callable) -> None: + """ + Set a custom solver for resolving the MDS problem. + This method allows users to replace the default MDS solver with a custom one. + + An example is provided below and in the example gallery : + :ref:`sphx_glr_auto_examples_plot_benchmark_custom.py` + + .. important:: + The custom solver must accept the same parameters as the default solvers + and return a tuple containing the cluster centers and the execution time. 
+ e.g., it should have the signature: + + >>> def custom_solver( + >>> n: int, + >>> edges: np.ndarray, + >>> nb_edges: int, + >>> random_state: int | None = None + >>> ) -> tuple[list, float]: + >>> # Custom implementation details + >>> centers = [...] + >>> exec_time = ... + >>> # Return the centers and execution time + >>> return centers, exec_time + + This allows for flexibility in how the MDS problem is solved. + + Parameters: + ----------- + solver : callable + The custom solver function to use for MDS clustering. + It should accept the same parameters as the default solvers + and return a tuple containing the cluster centers and the execution time. + + Raises: + ------- + ValueError + If the provided solver does not have the correct signature. + + """ + if not callable(solver): + raise ValueError("The provided solver must be callable.") + + # Check if the solver has the correct signature + try: + n = 3 + edges = np.array([[0, 1], [1, 2], [2, 0]]) + nb_edges = edges.shape[0] + solver(n, edges, nb_edges, random_state=None) + except Exception as e: + raise ValueError(f"The provided solver does not have the correct signature: {e}") from e + self.manner = "custom" + self._algorithms["custom"] = solver \ No newline at end of file diff --git a/tests/test_unit.py b/tests/test_unit.py index 52e874f..bf846be 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -1,11 +1,11 @@ from radius_clustering import RadiusClustering import pytest +import numpy as np def test_symmetric(): """ Test that the RadiusClustering class can handle symmetric distance matrices. """ - import numpy as np # Check 1D array input @@ -35,12 +35,11 @@ def test_symmetric(): assert not clustering._check_symmetric(X_non_square), "The matrix should not be symmetric." -def test_fit(): +def test_fit_distance_matrix(): """ - Test that the RadiusClustering class can fit to a distance matrix and to a feature matrix. + Test that the RadiusClustering class can fit to a distance matrix. 
This test checks both the exact and approximate methods of clustering. """ - import numpy as np # Create a symmetric distance matrix X = np.array([[0, 1, 2], @@ -55,14 +54,27 @@ def test_fit(): assert clustering.nb_edges_ > 0, "There should be edges in the graph." assert np.array_equal(clustering.X_checked_, clustering.dist_mat_), "X_checked_ should be equal to dist_mat_ because X is a distance matrix." +@pytest.mark.parametrize( + "test_data", [ + ("euclidean",1.5), + ("manhattan", 2.1), + ("cosine", 1.0) + ] +) +def test_fit_features(test_data): + """ + Test that the RadiusClustering class can fit to feature data. + This test checks both the exact and approximate methods of clustering + and multiple metrics methods. + """ # Create a feature matrix X_features = np.array([[0, 1], [1, 0], [2, 1]]) + metric, radius = test_data - clustering = RadiusClustering(manner="approx", radius=1.5) - clustering.fit(X_features) - + clustering = RadiusClustering(manner="approx", radius=radius) + clustering.fit(X_features, metric=metric) # Check that the labels are assigned correctly assert len(clustering.labels_) == X_features.shape[0], "Labels length should match number of samples." assert clustering.nb_edges_ > 0, "There should be edges in the graph." @@ -72,10 +84,10 @@ def test_radius_clustering_invalid_manner(): """ Test that an error is raised when an invalid manner is provided. """ - with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + with pytest.raises(ValueError): RadiusClustering(manner="invalid", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) - with pytest.raises(ValueError, match="Invalid manner. 
Choose either 'exact' or 'approx'."): + with pytest.raises(ValueError): RadiusClustering(manner="", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) @@ -90,4 +102,60 @@ def test_radius_clustering_invalid_radius(): RadiusClustering(manner="approx", radius=0.0).fit([[0, 1], [1, 0], [2, 1]]) with pytest.raises(ValueError, match="Radius must be a positive float."): - RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) \ No newline at end of file + RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) + +def test_radius_clustering_fit_without_data(): + """ + Test that an error is raised when fitting without data. + """ + clustering = RadiusClustering(manner="exact", radius=1.5) + with pytest.raises(ValueError): + clustering.fit(None) + +def test_radius_clustering_new_clusterer(): + """ + Test that a custom clusterer can be set within the RadiusClustering class. + """ + def custom_clusterer(n, edges, nb_edges, random_state=None): + # A mock custom clusterer that returns a fixed set of centers + # and a fixed execution time + return [0, 1], 0.1 + clustering = RadiusClustering(manner="exact", radius=1.5) + # Set the custom clusterer + assert hasattr(clustering, 'set_solver'), "RadiusClustering should have a set_solver method." + assert callable(clustering.set_solver), "set_solver should be callable." + clustering.set_solver(custom_clusterer) + # Fit the clustering with the custom clusterer + X = np.array([[0, 1], + [1, 0], + [2, 1]]) + clustering.fit(X) + assert clustering.clusterer_ == custom_clusterer, "The custom clusterer should be set correctly." + # Check that the labels are assigned correctly + assert len(clustering.labels_) == X.shape[0], "Labels length should match number of samples." + assert clustering.nb_edges_ > 0, "There should be edges in the graph." + assert clustering.centers_ == [0, 1], "The centers should match the custom clusterer's output." 
+ assert clustering.mds_exec_time_ == 0.1, "The MDS execution time should match the custom clusterer's output." + +def test_invalid_clusterer(): + """ + Test that an error is raised when an invalid clusterer is set. + """ + clustering = RadiusClustering(manner="exact", radius=1.5) + with pytest.raises(ValueError, match="The provided solver must be callable."): + clustering.set_solver("not_a_callable") + + with pytest.raises(ValueError, match="The provided solver must be callable."): + clustering.set_solver(12345) # Not a callable + with pytest.raises(ValueError, match="The provided solver must be callable."): + clustering.set_solver(None) + + def invalid_signature(): + return [0, 1], 0.1 + + with pytest.raises(ValueError): + clustering.set_solver(invalid_signature) + def invalid_clusterer(n, edges, nb_edges): + return [0, 1], 0.1 + with pytest.raises(ValueError): + clustering.set_solver(invalid_clusterer) \ No newline at end of file