diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..78e4203 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,75 @@ +name: Bug Report +description: Create a report to help us improve +title: "[Bug]: " +labels: ["bug", "triage"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + + - type: textarea + id: what-happened + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. + placeholder: Tell us what you see! + validations: + required: true + + - type: textarea + id: reproduce + attributes: + label: To Reproduce + description: "Steps to reproduce the behavior. Please provide a minimal, self-contained code sample." + placeholder: | + ```python + import numpy as np + from radius_clustering import RadiusClustering + + # Your code here that triggers the bug + ``` + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behavior + description: A clear and concise description of what you expected to happen. + validations: + required: true + + - type: dropdown + id: os + attributes: + label: Operating System + description: What operating system are you using? + options: + - Windows + - macOS + - Linux + validations: + required: true + + - type: input + id: python-version + attributes: + label: Python Version + placeholder: "e.g. 3.11.4" + validations: + required: true + + - type: input + id: package-version + attributes: + label: Package Version + placeholder: "e.g. 1.4.0" + validations: + required: true + + - type: textarea + id: additional-context + attributes: + label: Additional context + description: Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/doc_improvement.yml b/.github/ISSUE_TEMPLATE/doc_improvement.yml new file mode 100644 index 0000000..2393976 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/doc_improvement.yml @@ -0,0 +1,17 @@ +name: Documentation improvement +description: Create a report to help us improve the documentation. Alternatively you can just open a pull request with the suggested change. +labels: ["documentation", "triage"] + +body: +- type: textarea + attributes: + label: Describe the issue linked to the documentation + description: > + Tell us about the confusion introduced in the documentation. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..624cf4b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,25 @@ +name: Feature Request +description: Suggest an idea for this project +title: "[Feature]: " +labels: ["enhancement"] +body: + - type: textarea + attributes: + label: Is your feature request related to a problem? Please describe. + description: A clear and concise description of what the problem is. Ex. "I'm always frustrated when..." + validations: + required: true + - type: textarea + attributes: + label: Describe the solution you'd like + description: A clear and concise description of what you want to happen. + validations: + required: true + - type: textarea + attributes: + label: Describe alternatives you've considered + description: A clear and concise description of any alternative solutions or features you've considered. + - type: textarea + attributes: + label: Additional context + description: Add any other context or screenshots about the feature request here. 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..f15649c --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,32 @@ +## Description + +Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. + +Fixes # (issue) + +## Type of change + +Please delete options that are not relevant. + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) +- [ ] This change requires a documentation update + +## How Has This Been Tested? + +Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. + +- [ ] Test A +- [ ] Test B + +## Checklist: + +- [ ] My code follows the style guidelines of this project +- [ ] I have performed a self-review of my own code +- [ ] I have commented my code, particularly in hard-to-understand areas +- [ ] I have made corresponding changes to the documentation +- [ ] My changes generate no new warnings +- [ ] I have added tests that prove my fix is effective or that my feature works +- [ ] New and existing unit tests pass locally with my changes +- [ ] Any dependent changes have been merged and published in downstream modules diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index e942fc3..7976ecc 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -3,15 +3,8 @@ name: Build and upload to PyPI on: workflow_dispatch: push: - branches: - - main - paths: - - "src/radius_clustering/**" - - "tests/**" - - "pyproject.toml" - release: - types: - - published + tags: + - "v*" jobs: run_pytest: @@ -93,7 +86,7 @@ jobs: 
attestations: write #if: github.event_name == 'release' && github.event.action == 'published' # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) - #if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') steps: - name: Download all dists uses: actions/download-artifact@v4 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..b0fdc5b --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,32 @@ +name: Lint and Format + +on: + workflow_call: + workflow_dispatch: + +jobs: + lint-and-format: + name: Run Linters and Formatters + runs-on: ubuntu-latest + steps: + - name: checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[dev]" + + - name: Run ruff linter + run: | + ruff check src/radius_clustering tests + + - name: Run black formatter + run: | + black src/radius_clustering tests --check + diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index e407f41..f159ff8 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -23,7 +23,7 @@ jobs: sudo apt-get update sudo apt-get install build-essential pip install --upgrade pip - pip install -e ".[doc]" + pip install ".[doc]" pushd docs make html popd diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 243c494..50d84c9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,3 +28,9 @@ jobs: - name: Run tests with pytest run: | pytest -v + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5.4.3 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: scikit-learn-contrib/radius_clustering diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml new file mode 100644 index 0000000..1a01ffc --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + +- repo: https://github.com/psf/black-pre-commit-mirror + rev: 24.8.0 + hooks: + - id: black + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.5 + hooks: + - id: ruff + args: ["--fix", "--output-format=full"] diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..97a1673 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +- Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances of + any kind +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email address, + without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement: +[Send Report](mailto:quentin.haenn.pro@gmail.com). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the +[Contributor Covenant](https://www.contributor-covenant.org/), version 2.1, +available at +<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the FAQ at +<https://www.contributor-covenant.org/faq>. Translations are available at +<https://www.contributor-covenant.org/translations>. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..571b0bc --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,51 @@ +# Contributing to Radius Clustering + +First off, thank you for considering contributing to Radius Clustering! It's people like you that make open source such a great community. + +## Where do I go from here? + +If you've noticed a bug or have a feature request, [make one](https://github.com/scikit-learn-contrib/radius_clustering/issues/new)! It's generally best if you get confirmation of your bug or approval for your feature request this way before starting to code. + +### Fork & create a branch + +If you've decided to contribute, you'll need to fork the repository and create a new branch. + +```bash +git checkout -b my-new-feature +``` + +## Getting started + +To get started with the development, you need to install the package in an editable mode with all the development dependencies. It is highly recommended to do this in a virtual environment. + +```bash +pip install -e ".[dev]" +``` + +This will install the package and all the tools needed for testing and linting. + +## Running Tests + +To ensure that your changes don't break anything, please run the test suite. 
+ +```bash +pytest +``` + +## Code Style + +This project uses `ruff` for linting and `black` for formatting. We use `pre-commit` to automatically run these tools before each commit. + +To set up `pre-commit`, run: + +```bash +pre-commit install +``` + +This will ensure your contributions match the project's code style. + +## Submitting a Pull Request + +When you're ready to submit your changes, please write a clear and concise pull request message. Make sure to link any relevant issues. + +Thank you for your contribution! diff --git a/README.md b/README.md index 2b1b09e..88e5c1c 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ Code style: Ruff GitHub Actions Workflow Status Python version supported +Codecov +Binder +

@@ -93,12 +96,35 @@ If you want to know more about the experiments conducted with the package, pleas ## Contributing -Contributions to Radius Clustering are welcome! Please feel free to submit a Pull Request. +Contributions to Radius Clustering are welcome! + +Please read the [CONTRIBUTING.md](CONTRIBUTING.md) file for details on how to contribute to the project. +Please note that the project is released with a [Code of Conduct](CODE_OF_CONDUCT.md), and we expect all contributors to adhere to it. ## License This project is licensed under the GNU General Public License v3.0 - see the LICENSE file for details. +## How to cite this work + +If you use Radius Clustering in your research, please cite the following paper and the software itself: + +```bibtex +@inproceedings{haenn_clustering2024, + TITLE = {{Clustering Under Radius Constraints Using Minimum Dominating Sets}}, + AUTHOR = {Haenn, Quentin and Chardin, Brice and Baron, Micka{\"e}l}, + URL = {https://hal.science/hal-04533921}, + BOOKTITLE = {{Lecture Notes in Artificial Intelligence}}, + ADDRESS = {Poitiers, France}, + PUBLISHER = {{Springer}}, + YEAR = {2024}, + MONTH = Jun, + KEYWORDS = {Constrained Clustering ; Radius Based Clustering ; Minimum Dominating Set ; Constrained Clustering Radius Based Clustering Minimum Dominating Set}, + PDF = {https://hal.science/hal-04533921v1/file/clustering_under_radius_using_mds.pdf}, + HAL_ID = {hal-04533921}, + HAL_VERSION = {v1}, +} +``` ## Acknowledgments diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..da487bf --- /dev/null +++ b/environment.yml @@ -0,0 +1,7 @@ +name: radius_clustering +dependencies: + - matplotlib>=3.6.2 + - numpy>=2.0 + - scikit-learn>=1.2.2 + - scipy>=1.12.0 + - pandas>=2.0.3 diff --git a/notebooks/comparison_example.ipynb b/notebooks/comparison_example.ipynb new file mode 100644 index 0000000..fb84132 --- /dev/null +++ b/notebooks/comparison_example.ipynb @@ -0,0 +1,487 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "id": "4acb9df3", + "metadata": {}, + "source": [ + "# Comparison of Radius Clustering with KMeans on Sample Datasets\n", + "\n", + "\n", + "This example is meant to illustrate the use of the Radius clustering library on several datasets.\n", + "\n", + "The example includes:\n", + "1. Loading the datasets\n", + "2. Applying Radius clustering and k-means clustering\n", + "3. Visualizing the clustering results\n", + "\n", + "This example serves as a simple introduction to using the Radius clustering library on well-known datasets.\n", + "\n", + "**Author: Haenn Quentin**\n", + "\n", + "**@SPDX-License-Identifier: MIT**\n", + "\n", + "\n", + "\n", + "## 1. Load the Iris dataset\n", + "\n", + "We start by loading the Iris dataset using the `load_iris` function from `sklearn.datasets`.\n", + "The Iris dataset is a well-known dataset that contains 150 samples of iris flowers.\n", + "Each sample has 4 features: sepal length, sepal width, petal length, and petal width.\n", + "The dataset is labeled with 3 classes: setosa, versicolor, and virginica." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e28a516b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn import datasets\n", + "from radius_clustering import RadiusClustering\n", + "\n", + "# Load the Iris dataset\n", + "iris = datasets.load_iris()\n", + "X = iris[\"data\"]\n", + "y = iris.target" + ] + }, + { + "cell_type": "markdown", + "id": "b84938fd", + "metadata": {}, + "source": [ + "\n", + "## 2. Visualize the Iris dataset\n", + "\n", + "\n", + "We can visualize the Iris dataset by plotting the dataset. We use PCA to reduce the dimensionality to 3D and plot the dataset in a 3D scatter plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28f37b15", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.decomposition import PCA\n", + "import mpl_toolkits.mplot3d\n", + "\n", + "# Reduce the dimensionality of the dataset to 3D using PCA\n", + "pca = PCA(n_components=3)\n", + "iris_reduced = pca.fit_transform(X)\n", + "fig = plt.figure(figsize=(8, 6))\n", + "ax = fig.add_subplot(111, projection=\"3d\", elev=48, azim=134)\n", + "ax.scatter(\n", + " iris_reduced[:, 0],\n", + " iris_reduced[:, 1],\n", + " iris_reduced[:, 2],\n", + " c=y,\n", + " cmap=\"Dark2\",\n", + " s=40,\n", + ")\n", + "# Set plot labels\n", + "ax.set_title(\"Iris dataset in first 3 PCA components\")\n", + "ax.set_xlabel(\"1st eigenvector\")\n", + "ax.set_ylabel(\"2nd eigenvector\")\n", + "ax.set_zlabel(\"3rd eigenvector\")\n", + "\n", + "# Hide tick labels\n", + "ax.xaxis.set_ticklabels([])\n", + "ax.yaxis.set_ticklabels([])\n", + "ax.zaxis.set_ticklabels([])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "cd38d50b", + "metadata": {}, + "source": [ + "\n", + "## 3. Compute Clustering with Radius Clustering\n", + "\n", + "We can now apply Radius clustering to the Iris dataset.\n", + "We create an instance of the `RadiusClustering` class and fit it to the Iris dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9282ec34", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "rad = RadiusClustering(manner=\"exact\", radius=1.43)\n", + "t0 = time.time()\n", + "rad.fit(X)\n", + "t_rad = time.time() - t0" + ] + }, + { + "cell_type": "markdown", + "id": "2653845e", + "metadata": {}, + "source": [ + "\n", + "## 4. 
Compute KMeans Clustering for Comparison\n", + "\n", + "We also apply KMeans clustering to the Iris dataset for comparison.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7e993f5", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from sklearn.cluster import KMeans\n", + "\n", + "k_means = KMeans(n_clusters=3, n_init=10)\n", + "t0 = time.time()\n", + "k_means.fit(X)\n", + "t_kmeans = time.time() - t0" + ] + }, + { + "cell_type": "markdown", + "id": "d1072a7f", + "metadata": {}, + "source": [ + "## 5. Establishing parity between clusters\n", + "\n", + "We want to have the same color for the same cluster in both plots.\n", + "We can achieve this by matching the cluster labels of the Radius clustering and the KMeans clustering.\n", + "First we define a function to retrieve the cluster centers from the Radius clustering and KMeans clustering and\n", + "match them pairwise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ac48cdf", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def get_order_labels(kmeans, rad, data):\n", + " centers1_cpy = kmeans.cluster_centers_.copy()\n", + " centers2_cpy = data[rad.centers_].copy()\n", + " order = []\n", + " # For each center in the first clustering, find the closest center in the second clustering\n", + " for center in centers1_cpy:\n", + " match = pairwise_distances_argmin([center], centers2_cpy)\n", + " # if there is only one center left, assign it to the last cluster label not yet assigned\n", + " if len(centers2_cpy) == 1:\n", + " for i in range(len(centers1_cpy)):\n", + " if i not in order:\n", + " order.append(i)\n", + " break\n", + " break\n", + " # get coordinates of the center in the second clustering\n", + " coordinates = centers2_cpy[match]\n", + " # find the closest point in the data to the center to get the cluster label\n", + " closest_point = pairwise_distances_argmin(coordinates, data)\n", + " match_label = rad.labels_[closest_point]\n", + 
" # remove the center from the second clustering\n", + " centers2_cpy = np.delete(centers2_cpy, match, axis=0)\n", + " # add the cluster label to the order\n", + " order.append(int(match_label[0]))\n", + " return order\n", + "\n", + "\n", + "from sklearn.metrics.pairwise import pairwise_distances_argmin\n", + "\n", + "rad_centers_index = np.array(rad.centers_)\n", + "order = get_order_labels(k_means, rad, X)\n", + "\n", + "kmeans_centers = k_means.cluster_centers_\n", + "rad_centers = rad_centers_index[order]\n", + "rad_centers_coordinates = X[rad_centers]\n", + "\n", + "# Pair the cluster labels\n", + "kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)\n", + "rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)" + ] + }, + { + "cell_type": "markdown", + "id": "b428447c", + "metadata": {}, + "source": [ + "### Plotting the results and the difference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69c095ee", + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(12, 6))\n", + "fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\n", + "colors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n", + "\n", + "# KMeans\n", + "ax = fig.add_subplot(1, 3, 1, projection=\"3d\", elev=48, azim=134, roll=0)\n", + "\n", + "ax.scatter(\n", + " iris_reduced[:, 0],\n", + " iris_reduced[:, 1],\n", + " iris_reduced[:, 2],\n", + " c=kmeans_labels,\n", + " cmap=\"Dark2\",\n", + " s=40,\n", + ")\n", + "# adapting center coordinates to the 3D plot\n", + "kmeans_centers = pca.transform(kmeans_centers)\n", + "ax.scatter(\n", + " kmeans_centers[:, 0],\n", + " kmeans_centers[:, 1],\n", + " kmeans_centers[:, 2],\n", + " c=\"r\",\n", + " s=200,\n", + ")\n", + "ax.set_title(\"KMeans\")\n", + "ax.set_xticks(())\n", + "ax.set_yticks(())\n", + "ax.set_zticks(())\n", + "\n", + "ax.text3D(-3.5, 3, 1.0, \"train time: %.2fs\\ninertia: %f\" % (t_kmeans, k_means.inertia_))\n", + "\n", + "# MDS\n", + "ax = fig.add_subplot(1, 3, 
2, projection=\"3d\", elev=48, azim=134, roll=0)\n", + "ax.scatter(\n", + " iris_reduced[:, 0],\n", + " iris_reduced[:, 1],\n", + " iris_reduced[:, 2],\n", + " c=rad_labels,\n", + " cmap=\"Dark2\",\n", + " s=40,\n", + ")\n", + "# adapting center coordinates to the 3D plot\n", + "rad_centers_coordinates = pca.transform(rad_centers_coordinates)\n", + "ax.scatter(\n", + " rad_centers_coordinates[:, 0],\n", + " rad_centers_coordinates[:, 1],\n", + " rad_centers_coordinates[:, 2],\n", + " c=\"r\",\n", + " s=200,\n", + ")\n", + "ax.set_title(\"MDS Clustering\")\n", + "ax.set_xticks(())\n", + "ax.set_yticks(())\n", + "ax.set_zticks(())\n", + "ax.text3D(-3.5, 3, 0.0, \"train time: %.2fs\" % t_rad)\n", + "\n", + "# Initialize the different array to all False\n", + "different = rad_labels == 4\n", + "ax = fig.add_subplot(1, 3, 3, projection=\"3d\", elev=48, azim=134, roll=0)\n", + "\n", + "for k in range(3):\n", + " different += (kmeans_labels == k) != (rad_labels == k)\n", + "\n", + "identical = np.logical_not(different)\n", + "ax.scatter(\n", + " iris_reduced[identical, 0], iris_reduced[identical, 1], color=\"#bbbbbb\", marker=\".\"\n", + ")\n", + "ax.scatter(iris_reduced[different, 0], iris_reduced[different, 1], color=\"m\")\n", + "ax.set_title(\"Difference\")\n", + "ax.set_xticks(())\n", + "ax.set_yticks(())\n", + "ax.set_zticks(())\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "3d1c4fcf", + "metadata": {}, + "source": [ + "## Another difference plot\n", + "\n", + "As we saw, the difference plot is not very informative using Iris.\n", + "We'll use a different dataset to show the difference plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea3d0438", + "metadata": {}, + "outputs": [], + "source": [ + "wine = datasets.load_wine()\n", + "X = wine.data\n", + "y = wine.target\n", + "pca = PCA(n_components=3)\n", + "wine_reduced = pca.fit_transform(X)\n", + "\n", + "# Compute clustering with MDS\n", + "\n", + "rad = RadiusClustering(manner=\"exact\", radius=232.09)\n", + "t0 = time.time()\n", + "rad.fit(X)\n", + "t_rad = time.time() - t0\n", + "\n", + "# Compute KMeans clustering for comparison\n", + "\n", + "k_means = KMeans(n_clusters=3, n_init=10)\n", + "t0 = time.time()\n", + "k_means.fit(X)\n", + "t_kmeans = time.time() - t0" + ] + }, + { + "cell_type": "markdown", + "id": "3929dee4", + "metadata": {}, + "source": [ + "## Reapplying the same process as before" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24449b3a", + "metadata": {}, + "outputs": [], + "source": [ + "rad_centers_index = np.array(rad.centers_)\n", + "order = get_order_labels(k_means, rad, X)\n", + "\n", + "kmeans_centers = k_means.cluster_centers_\n", + "rad_centers = rad_centers_index[order]\n", + "rad_centers_coordinates = X[rad_centers]\n", + "\n", + "# Pair the cluster labels\n", + "kmeans_labels = pairwise_distances_argmin(X, kmeans_centers)\n", + "rad_labels = pairwise_distances_argmin(X, rad_centers_coordinates)" + ] + }, + { + "cell_type": "markdown", + "id": "3accac5b", + "metadata": {}, + "source": [ + "## Plotting the results and the difference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39235d3c", + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(12, 6))\n", + "fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)\n", + "colors = [\"#4EACC5\", \"#FF9C34\", \"#4E9A06\"]\n", + "\n", + "# KMeans\n", + "ax = fig.add_subplot(1, 3, 1, projection=\"3d\", elev=48, azim=134, roll=0)\n", + "\n", + "ax.scatter(\n", + " wine_reduced[:, 0],\n", + " wine_reduced[:, 1],\n", 
+ " wine_reduced[:, 2],\n", + " c=kmeans_labels,\n", + " cmap=\"Dark2\",\n", + " s=40,\n", + ")\n", + "# adapting center coordinates to the 3D plot\n", + "kmeans_centers = pca.transform(kmeans_centers)\n", + "ax.scatter(\n", + " kmeans_centers[:, 0],\n", + " kmeans_centers[:, 1],\n", + " kmeans_centers[:, 2],\n", + " c=\"r\",\n", + " s=200,\n", + ")\n", + "ax.set_title(\"KMeans\")\n", + "ax.set_xticks(())\n", + "ax.set_yticks(())\n", + "ax.set_zticks(())\n", + "\n", + "ax.text3D(\n", + " 60.0, 80.0, 0.0, \"train time: %.2fs\\ninertia: %f\" % (t_kmeans, k_means.inertia_)\n", + ")\n", + "\n", + "# MDS\n", + "ax = fig.add_subplot(1, 3, 2, projection=\"3d\", elev=48, azim=134, roll=0)\n", + "ax.scatter(\n", + " wine_reduced[:, 0],\n", + " wine_reduced[:, 1],\n", + " wine_reduced[:, 2],\n", + " c=rad_labels,\n", + " cmap=\"Dark2\",\n", + " s=40,\n", + ")\n", + "# adapting center coordinates to the 3D plot\n", + "rad_centers_coordinates = pca.transform(rad_centers_coordinates)\n", + "ax.scatter(\n", + " rad_centers_coordinates[:, 0],\n", + " rad_centers_coordinates[:, 1],\n", + " rad_centers_coordinates[:, 2],\n", + " c=\"r\",\n", + " s=200,\n", + ")\n", + "ax.set_title(\"MDS Clustering\")\n", + "ax.set_xticks(())\n", + "ax.set_yticks(())\n", + "ax.set_zticks(())\n", + "ax.text3D(60.0, 80.0, 0.0, \"train time: %.2fs\" % t_rad)\n", + "\n", + "# Initialize the different array to all False\n", + "different = rad_labels == 4\n", + "ax = fig.add_subplot(1, 3, 3, projection=\"3d\", elev=48, azim=134, roll=0)\n", + "\n", + "for k in range(3):\n", + " different += (kmeans_labels == k) != (rad_labels == k)\n", + "\n", + "identical = np.logical_not(different)\n", + "ax.scatter(\n", + " wine_reduced[identical, 0], wine_reduced[identical, 1], color=\"#bbbbbb\", marker=\".\"\n", + ")\n", + "ax.scatter(wine_reduced[different, 0], wine_reduced[different, 1], color=\"m\")\n", + "ax.set_title(\"Difference\")\n", + "ax.set_xticks(())\n", + "ax.set_yticks(())\n", + "ax.set_zticks(())\n", + 
"\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c1172f38", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "In this example, we applied Radius clustering to the Iris and Wine datasets and compared it with KMeans clustering.\n", + "We visualized the clustering results and the difference between the two clustering algorithms.\n", + "We saw that Radius Clustering can lead to smaller clusters than KMeans, which produces much more balanced clusters.\n", + "The difference plot can be very useful to see where the two clustering algorithms differ." + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 2bc50a9..04fe3f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,10 +11,13 @@ authors = [ {name = "Quentin Haenn"}, {name = "Lias Laboratory"} ] +maintainers = [ + {name = "Quentin Haenn", email = "quentin.haenn.pro@gmail.com"} + ] dependencies = [ "matplotlib>=3.6.2", - "numpy>=1.23", + "numpy>=2.0", "scikit-learn>=1.2.2", "scipy>=1.12.0", ] @@ -22,6 +25,7 @@ dependencies = [ requires-python = ">=3.9" license = {file = "LICENSE"} classifiers=[ + "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", "Intended Audience :: Developers", "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", @@ -30,6 +34,8 @@ classifiers=[ "Programming Language :: Python", "Topic :: Software Development", "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Machine Learning", + "Topic :: Scientific/Engineering :: Mathematics", "Operating System :: Microsoft :: Windows", "Operating System :: POSIX", "Operating System :: Unix", @@ -42,15 +48,16 @@ classifiers=[ "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", ] -keywords = ["Unsupervised learning","clustering", "minimum dominating sets","clustering under radius 
constraint"] +keywords = ["Unsupervised learning", "clustering", "minimum dominating sets","clustering under radius constraint"] [project.urls] -source = "https://github.com/lias-laboratory/radius_clustering" -tracker = "https://github.com/lias-laboratory/radius_clustering/issues" -documentation = "https://lias-laboratory.github.io/radius_clustering/" +source = "https://github.com/scikit-learn-contrib/radius_clustering" +tracker = "https://github.com/scikit-learn-contrib/radius_clustering/issues" +documentation = "https://contrib.scikit-learn.org/radius_clustering/" [project.optional-dependencies] dev = [ + "pre-commit>=3.8.0", "pytest>=8.3.3", "pytest-cov>=5.0.0", "pandas",