From 3dceb50eedd44a1ac2c240d59eecf3623ab3e511 Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 10:59:24 +0200 Subject: [PATCH 1/7] Implement metric choice in main class --- .coverage | Bin 53248 -> 53248 bytes src/radius_clustering/radius_clustering.py | 46 ++++++++++++++++++--- tests/test_unit.py | 26 ++++++++---- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/.coverage b/.coverage index f32d8f4b5a185863981f35324f83b3c048d4fa13..9464e96e22905eafd5847f6c4716b1f8f94ed866 100644 GIT binary patch delta 3521 zcmYk-du-L^9mnzKIi>VD-{ZRxT0RFsQ=fPf$uu@uTpq#RFgr=_(wdIjVn3dlwI zQ4n3SOcPmh=3=@oV@o!(0cqmGEyM|wi*k_-8DVpCfhBHki^;68?VNpoeJ<&Lug~v! z+S8o&^u4z)Yj0oHkr91a2cYa=PTypIrZ>qO?TzxlBkCLVclCw3sV=KC>bN?hex}}2 z@2KynJ!-33uUb{DTBR1Nd1{&(uZmPeWy|~WU-GWJC9lYj4 zrXiZR1Fx9-@$_-nZmt}saoA>_FQ*CEYP>Xk7hX1ALYKfx&P!p7@gjWwqH!5r1pa1s zPzIZf7vjN2<5Id1HW)9UrSO7r30(l|wbOTL39K{Ei|BG#Yh0MV2i?X6v=F+CqqG1z zwS8JZqtKy8`W_y%n@c&g2-=KkW+bdJ&PKNyXQ5k+Lo^GTowK3Im{1Ch+9X2>JntNW zhHQW6*F@4J1NG+MKFxvE+L@6g4Rz-Ex0&%!YdlE4g`~MNo(w{bd48MR4_61qznxpD zaFyYW6jYiUH_0uiF#e3(^dVu!4f0v|IdlJN3gYJRr{o%}GQLJWg>vT`u+sPnxeCu3 zUnW;zh4ClkGCX7aG5G|R8($(H!!m83T*4PvY7Q=vOR&WF0=Wo_jR(kiD0995i;T~b zbFk2P07{L|kh8GB_(O6AN{mmF58-L!ljJnax9|TcJb20-oC?nijPK#G6wEb`kCBsX z4jMmx9ldO}R(nVI$v~k{iX02i(p*Pwvzhw5yO%wo)!EHvXm$0n>E>A%^QURHce2N| zdfVAlt+sV+idI9CP1b7dXOC&Mw6aNB%`I%AR#P*ZpjF?%9@VO?XXCY!wQQVLO_Gh( z@@tyd7|rT9dqk@u&K}lE#Mx-AsyKT{tFnp}n|qZkX6{w6BCU7@d(bGs3bp!c*aPNS z8;fd{C!hc|^#7-XmaOO{rLp7MBMv9-i+X2%jR}p!b#c7w`9wfNeaNxA6anv2s8~ z)&IOQ?}~TUJMJC!4tei-Z+ZK@9o|N-&8zdKc&ofQUcUNT{S_jR?J+FYZS_-iO`TIG z)DP9`>TR_%d{1ps?P_ZHE45n1v0ii47ksX+= zm2!q$5dM>l%5mZAGA#Zpz7YQuzr%2y5Wf<;#rMUJ#hYS<*d*478c`yqivq#;Klq>d z0RJsN&Uf-Y{!4xki?xAQ@p*h1pTbA;5#DGR#8&+QuEGcK5xfUKhqqy01lB?$EQZ-I z31Z<>;q#cQ_n8kPgII;KGUPx&MgzVskoa*4dPVlTaV|yn@!&q5G&YeQ>trL@JM?|< zAYQG%Wx%RELth${2eb8>8`!N^&G17M%eC7bVYznM5u|Hp@ERffX4tMB_JCr%Ua=#M z_1bO+#e8jZ1N*hr4Gh@J!^YB=h6cWVh+@OGxFd|%i*78UWt-i=ifwWOGq%wU?AQi3 zFk~;-5tMAb9TZcx&JAqYS}}4c7x*J!&AQzY=B&#N%vq-$LC-qupcu4vH?U}JZeY^Z zxPeV;bpxZ;GHhh~&2C}Wn(PR|);Mg0o_7Pw*5C%Ft=^8HZL94d7`Hk%ux_<>WH4_@ zJBW{gtFgahEL^qojZ~Gpg^jCp10z@A21YL7zT|4^Id_1Wi@SlHTQzLlD0joh)U9+2 zTlcIR7`qj2VC|l919P{0*toXL4eZ@gH!yfh+`!^39yTtN>EUCOFnWvJ(OELE&<)I9 zsTG_hvbO$F%Vh0%%bMgk$ELBC)$uWw74b)qKK?VKp+x*)`?xAT+Oo3h zA;6}v+0Urq{D29je5waT+vq?;{2@w$j3O)!CQB$VQ z^fJ|^HRF%~0flB_%rN6RopC#*(?Kx7E{?Q5s$@X_SkEZ$S z^ZlK3Z|2;&d%wFz@^+2n{p4FCc|CApUv#7_SgOiYshX|0ipVSSpYn70XZag>T7D#d zDl>9e4$3~cOZLbvxly*ulw2h%WVtMng)%Cxic8{x_>=gRI4+Kecg2rHzj#%ATWl2@ zL|W8}O0ihX7sWz~2>*)zji2YgPOzV`3>#tt>@~KNb+JyCX0@!6Eny2-F%!&#|G+=s!jb&gB|nJv%tt5}q=? 
zLcWG4ohM`(tLyC*(8eFg}?}!W!cVbbII^5+SECw3&r*avIXk6VPgWikyTNP`3{w+{o@xyBrKM@UWR5B17f*$wv?zfRtOCB}U?U+m_6{@tPRV7(c*%Zz);>vR#?^CIn8 zx=<_K?w5zc|DPefep%>9q=Ou$cj^tTS$c<7ODnxyt1U}!(@M9|1!gr(=WAtK=sd0F z4tlFreTvT2Y8s=ZT8&M#M602Z2DfOY8t5FYWIerEt1d}z(yFbaH)_??(qgl(hQ_rL zNqU1(iq6&=OVA>%>Sj7itE!q7YQ=N3Kr1M&qWNZ6Ok-LX<5X!C#i`UPETTfIppbH{ zSOH~PDn>y|D(Y(qNvReWG^)jTbS4U)HH*#&NBhoZ%Yz>1ChD^KQe9AI;pgxdww3*s ze}}&>7s^*uky@iZRpaVIl~sq;LA6`Gs&=YvYLlu^DOIJ)RjJG=kpGqw@(X!h{$B2p zV^{<4$|2b+o8)$xkelUPxmM1W%f&V6i+_nfVgdY8WW`5pT^xAIP2!~V`!@P)hpKmIe=6?UHem>p%q zY!80$o7sAnV9Qt;o5S*048DRd;8QpS@4*oC!}p*Yc9Q@5J5mzs@&kV+K92DF-xG>@ zsFyhki{N?{yyOCQ!HX_n8SHQY+u#Ko!aCT_!W-a=kA3jGUGT6Fx@`y>VOtpRhk)C# z6T0kD*a=%*z*2Z_T9|s)1+0ZCGLm#`-~T)?7O;{p~% zy9?M9ZPUU;+6C;2Ru`}=T3o=kXm$bX;?e(wfQ&b}gpJW?Ls%INHW2KLlndAy^)6s( zJYqu*Tcgefg0)fW9>Qp5wexVM#%&6)I+8A7cO+cE@~CzJ+hdix!NE+GTfq8w*ahs5 zhg`q{dC&!HkOy4A3aK<9h!8B1``ywYIkeIRtdSKiV2>jIX^Jz=>1{u8W|CDWGr7Q3VNX6|+i*eG`aUgKS_ltq^5_J!eg{27IvQf`f< zQf9ZbX799YX}!a;EqgnJ?|*o|blYuq+>%~knQfVG+1xSDvOaaIWz*PP%f_Zs%ZA1h z%T&WHmdW}#mUYRS4TDH+-A#5}Q+uOjvZmNFk&HX0Zm=9n%(kp bool: return False return np.allclose(a, a.T, atol=tol) - def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": + def fit(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean") -> "RadiusClustering": """ Fit the MDS clustering model to the input data. @@ -130,6 +130,35 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": y : Ignored Not used, present here for API consistency by convention. + metric : str | callable, optional (default="euclidean") + The metric to use when computing the distance matrix. + The default is "euclidean". + This should be a valid metric string from + `sklearn.metrics.pairwise_distances` or a callable that computes + the distance between two points. + + .. note:: + The metric parameter *MUST* be a valid metric string from + `sklearn.metrics.pairwise_distances` or a callable that computes + the distance between two points. + Valid metric strings include : + - "euclidean" + - "manhattan" + - "cosine" + - "minkowski" + - and many more supported by scikit-learn. + please refer to the + `sklearn.metrics.pairwise_distances` documentation for a full list. + + .. attention:: + If the input is a distance matrix, the metric parameter is ignored. + The distance matrix should be symmetric and square. + + .. warning:: + If the parameter is a callable, it should : + - Accept two 1D arrays as input. + - Return a single float value representing the distance between the two points. + Returns: -------- self : object @@ -157,10 +186,13 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": # Create dist and adj matrices if not self._check_symmetric(self.X_checked_): - dist_mat = pairwise_distances(self.X_checked_, metric="euclidean") + dist_mat = pairwise_distances(self.X_checked_, metric=metric) else: dist_mat = self.X_checked_ - + + if not self._check_symmetric(dist_mat): + raise ValueError("Input distance matrix must be symmetric. Got a non-symmetric matrix.") + self.dist_mat_ = dist_mat if not isinstance(self.radius, (float, int)): raise ValueError("Radius must be a positive float.") if self.radius <= 0: @@ -177,7 +209,6 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": np.uint32 ) # Edges in the adjacency matrix # uint32 is used to use less memory. 
Max number of features is 2^32-1 - self.dist_mat_ = dist_mat self._clustering() self._compute_effective_radius() @@ -185,7 +216,7 @@ def fit(self, X: np.ndarray, y: None = None) -> "RadiusClustering": return self - def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: + def fit_predict(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean") -> np.ndarray: """ Fit the model and return the cluster labels. @@ -201,6 +232,11 @@ def fit_predict(self, X: np.ndarray, y: None = None) -> np.ndarray: the distance matrix will be computed. y : Ignored Not used, present here for API consistency by convention. + + metric : str | callable, optional (default="euclidean") + The metric to use when computing the distance matrix. + The default is "euclidean". + Refer to the `fit` method for more details on valid metrics. Returns: -------- diff --git a/tests/test_unit.py b/tests/test_unit.py index 52e874f..ee0cfec 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -1,11 +1,11 @@ from radius_clustering import RadiusClustering import pytest +import numpy as np def test_symmetric(): """ Test that the RadiusClustering class can handle symmetric distance matrices. """ - import numpy as np # Check 1D array input @@ -35,12 +35,11 @@ def test_symmetric(): assert not clustering._check_symmetric(X_non_square), "The matrix should not be symmetric." -def test_fit(): +def test_fit_distance_matrix(): """ - Test that the RadiusClustering class can fit to a distance matrix and to a feature matrix. + Test that the RadiusClustering class can fit to a distance matrix. This test checks both the exact and approximate methods of clustering. """ - import numpy as np # Create a symmetric distance matrix X = np.array([[0, 1, 2], @@ -55,14 +54,27 @@ def test_fit(): assert clustering.nb_edges_ > 0, "There should be edges in the graph." assert np.array_equal(clustering.X_checked_, clustering.dist_mat_), "X_checked_ should be equal to dist_mat_ because X is a distance matrix." +@pytest.mark.parametrize( + "test_data", [ + ("euclidean",1.5), + ("manhattan", 2.1), + ("cosine", 1.0) + ] +) +def test_fit_features(test_data): + """ + Test that the RadiusClustering class can fit to feature data. + This test checks both the exact and approximate methods of clustering + and multiple metrics methods. + """ # Create a feature matrix X_features = np.array([[0, 1], [1, 0], [2, 1]]) + metric, radius = test_data - clustering = RadiusClustering(manner="approx", radius=1.5) - clustering.fit(X_features) - + clustering = RadiusClustering(manner="approx", radius=radius) + clustering.fit(X_features, metric=metric) # Check that the labels are assigned correctly assert len(clustering.labels_) == X_features.shape[0], "Labels length should match number of samples." assert clustering.nb_edges_ > 0, "There should be edges in the graph." 
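
Patch 1/7 exercises the new `metric` parameter only inside the unit tests. A minimal standalone usage sketch follows; the data, the radius value and the `chebyshev` helper are illustrative assumptions, not code from the patch:

```python
import numpy as np
from radius_clustering import RadiusClustering

X = np.random.rand(50, 2)  # illustrative feature matrix

# A scikit-learn metric string is forwarded to pairwise_distances(...).
clus = RadiusClustering(manner="approx", radius=0.5)
clus.fit(X, metric="manhattan")
print(clus.labels_)

# A callable metric must accept two 1-D arrays and return a single float.
def chebyshev(a, b):
    return float(np.max(np.abs(a - b)))

clus_cb = RadiusClustering(manner="approx", radius=0.5)
clus_cb.fit(X, metric=chebyshev)
print(clus_cb.labels_)
```
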
From 8c605e18c612308c2e03c97542f6af9e21bac826 Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 15:28:13 +0200 Subject: [PATCH 2/7] refactoring algorithms + doc enhancement --- .coverage | Bin 53248 -> 53248 bytes docs/source/api.rst | 14 +++- src/radius_clustering/algorithms.py | 86 +++++++++++++++++++ src/radius_clustering/radius_clustering.py | 92 ++++----------------- 4 files changed, 116 insertions(+), 76 deletions(-) create mode 100644 src/radius_clustering/algorithms.py diff --git a/.coverage b/.coverage index 9464e96e22905eafd5847f6c4716b1f8f94ed866..3257cab877088ff7e27eada74743303d857685d0 100644 GIT binary patch delta 3528 zcmZA23vg7`83*ul@7-+9{m$bCNC+gL;VB6TNeuE7$HXu#uoLi>J%+Bxs zopbl@=;@3Xr1S#Ja$rUAYmIlpphd5Z5pJS5{~ZY~NWZig2IYpei@=O>IvIkc`? zzIxq?4V_)<6IGF3amWGW9CDBo>T?Ydz1Zw{ed~g zdgKyvq_%mF9v+ypw2?yb)@De-+-LMf>LJLG;9y|nPFbPJ(P{8&e z`rqH-Q5pT%O^@?5!!~drl^$n8*a)O>oO(0EntKk%iDczm90$qrN*uQ(E6Q;kt7!#} zr%Rp_)1YLTc(OFf(g7R~NS2i1xOIoFRa}DOrW_X+lEhOihHSTDXC3LajPrBDw)JYJM3F*wvoJ_i-s{S1DdhLPYKO1YLr# z^2g{CSg3pveQbx+_79V=K#k9%ewd$@b?4(p{pdW*Qw#m*V#0a7sO$sud#F^iv*-hO z#CJbbD4#)Rp_nGwiQL-J zgNvowI`9;!_8wfMX6?98D%OT4OZCKXfmCZJ&X;P4;z?31z4$(<<`z6r{#MPVW;{U- z8=LTWsrm*yPO7dRkClqn;W1LRQJkmLi0_rE3E_LBLN$1_QZ*hW6$#;7sc;DANL9Nb zJW{f%8jnyLt8lj37{SA(LJ^#$6vmlS^TT+UR89&HmCBuuhe%z`!5L~b7dukfIrz@~ z(=fBLv3LJ861xv=bNAS5k@KzdrSn(kQ!<)7LW6V``;1S4D^AGicTPEPi^h7*`H{27 zdCJ-1bU86+nzPs`cP5KQ%Hn7EF}T35IKVH6BK#M>CK~A#zMt=fGkh1{&MV;}}jD4Tg!LQjAHXDA<#;{EKAC^wP zq<^6oL?I>UFX>yfhdxSo(XDh9jnePYBAQ43O|FxV$Y*pIIYLg6gXC4xB%0`HvYD(T zRb(0|AQ{Ahn{drO0aM_|uorg0c31=LPzf`~!V-wUI3SR1r|f^)U)X=N`|aP_J@!F6 zVLxN9wRec7UWgON-b!}^DYOTIUh|lOFAza-J&W!^&udrCMbBxMSEAk873Jtz?eYrr zL+x|r=o#&@6xyX-I)HX+mz1KXJ-gn?ic8Q_dZD-&{Xo007{#@V3Q@QBIt@{bg+;FB zK;e^`xa zMJuiMeKNUD9VU8dt!8xT8qHAaYR%r-$Eo)mUcHEJa=ZK_+G&*^L_Iy`2T@Neb@2LW zg+8+=s7}3Lh=y9O!=R|B4jl~9QSE*ZCDrDKV15cO^ONYQm>)z@wfey_0*RiZ*{&$2 zWixdb z>7t?L`g=r0Ro)f)#XU!U$w-t`g&(|@Dn2&eYpI9)p{S`jeh@wNpdUn0&Gv(6s#$&z zRW;KOqN`^3;V9~x?!S#A$!Y$AsH+FmRp^SsnyRnJ*s?NjzqlT~#=2iyR930p($Q0* z+1634+1@inv#q^IGuBq9*%Or!7qcp>zT+QlGj%HQ$NXs>*Ch^CnwViScqAc_h5PthIsfd@n10k delta 3279 zcmYk-du&tJ9S87pt_d9f&OPTsAtsH}lt&6F??-u*CV>D&8^Jhd zkK7{H%NCiCtK?F-P|lD=GGC6AIpU7^kGL+bi1XqD@q6)(_=QM`SH*6zNwkStQ6UzK zxni;i3r_^xTkb#I&)iGy8F#=v?7rpx)cujWuhre=u6LWNe4F85- z=U4bS{s;aZ{~hn+Z}9K&UA%|4@&sSW%lI5ViRW|49k>Z!!e8MMoQC6Y2z~`=cnw~L z9k3DBLL4ff96$KXU~)unP7V!H7RZk19fo2wkPY?*P!4s-nRKw%(LR-Cz0^NTorXr% zOBCY`EU6TUu{~PV@Dj>#99o+>0xxPOXf5p4uAvFog--g1VzdT!szv4qjRkl3rVfo> zgYEi!By$Y5=_AKz1h(q=3L1qi+RHQ7;RWqwbQwHvy&N`cFTwe9+U0Z!BsbYbIrL~R z#>GbMGP)QxXqVD5cvicFmcn}F%yn7<-Flu+SHL>$yv%Lr($1xM(5W4wxzM4Uq`5Q% z?J8t$bebc=QX-K_1<05n-#H(one(!pxYp)^$K z6PL*qsMP+HT#i9hhavK5aJ4>vAq^2d{)AkFRoWNHCs1KM1S_@AlMC=&?Q`UON*tU% z_hWqCay|Qqd<@I9&ytT|srG4d2Fk6^rZyfNpFBxEgi<|wpPYme?GxmEcuISKoPb5z zgE)WE&If}FedDJ;nuY~>e3T5Z`Dpy%wD+-jN^R}ICww_hnj8(zRqQ6ot89)s(ACFg zD|K|SSxTLKY^I)dvKdNk9c;Q%UmKgI)Y{FaD%Ho?6s4Aa_PA1W3oBM?YG#v_8k^W8 zUrFlJ)w9RcFj2>fl;R0CQK=@*CTKOX@k+4>8>dtmVUH?BBdk!VI>H`Ns;XuM`dk$Y z>vNSXUnx?_9`>(C{5M6SEKd#lYuH%5Ze<~*iYVlw{5wYpEn{QUG@NCll?p1@LrT}e zY?R&>u#rl6Vdg1?@|aZ04KbmV`r)A+Z{8@)h~I+!?$rdt{rO7Q7+X z$OzWN0y$m`NG^^@BEA&=6`zSq;wR!Q@dL40>=Er)4=cqiQ5yW42#JZoOCsprb3b?g z?fwZ1;<)>+yW9P-`%CxLHSTxZ9(S!<3@y6g&TqtgbgjSdt1-e@<0VsW(DfX&fr16Idc8?ZZ4KR-SeKWR>r z9b#EDn&7uZ!vn#2#s;j5dK<7W>P+wpV~q&}8>7|+tc-*SS?r9s3V6dI*cvtF3&q-q zSr4VF%~J>#N0klO9F;a;b42Y^E~Ho61$IZo1}u+N4}_r#6-MLxV}Go)E5ATK9GJZT z3*>1##0FVz16If~8?Zx`J`m2PrVkeRB~oUG*dnDiV2za6fIae*4Ok?LY#1PePuh=k zG`-L+uuB%0-&A|we8aZ(dH!*{V*ENuK4C&PxjNUJ>FS$f*wHoHu(NNLVMpgo!?unY 
zhJ9_*4O_dX8P>G#m+-H*FFSDt+*o+&SCL7k(Pclr@J!TkB6dBgUCt5a6 zFpNdU8&*cf8Ach859}`RDs> zlWXST>=?s>iqVGG!Ve*f@m5HkaKR`u&I^w;4CQ%-xuMkd6R$zQ` None: + """ + Perform approximate MDS clustering. + This method uses a pretty trick to set the seed for + the random state of the C++ code of the MDS solver. + + .. tip:: + The random state is used to ensure reproducibility of the results + when using the approximate method. + If `random_state` is None, a default value of 42 is used. + + .. important:: + The trick to set the random state is : + + 1. Use the `check_random_state` function to get a `RandomState`singleton + instance, set up with the provided `random_state`. + + 2. Use the `randint` method of the `RandomState` instance to generate a + random integer. + + 3. Use this random integer as the seed for the C++ code of the MDS solver. + + This ensures that the seed passed to the C++ code is always an integer, + which is required by the MDS solver, and allows for + reproducibility of the results. + + Parameters: + ----------- + n : int + The number of points in the dataset. + + Notes: + ------ + This function uses the approximation method to solve the MDS problem. + See [casado]_ for more details. + """ + result = solve_mds( + n, edges.flatten().astype(np.int32), nb_edges, random_state + ) + centers = sorted([x for x in result["solution_set"]]) + mds_exec_time = result["Time"] + return centers, mds_exec_time + +def clustering_exact(n: int, edges: np.ndarray, nb_edges: int) -> None: + """ + Perform exact MDS clustering. + + This function uses the EMOs algorithm to solve the MDS problem. + + .. important:: + The EMOS algorithm is an exact algorithm for solving the MDS problem. + It is a branch and bound algorithm that uses graph theory tricks + to efficiently cut the search space. See [jiang]_ for more details. + + Parameters: + ----------- + n : int + The number of points in the dataset. + """ + centers, mds_exec_time = py_emos_main( + edges.flatten(), n, nb_edges + ) + centers.sort() + return centers, mds_exec_time \ No newline at end of file diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index ee6cb0f..2c58c87 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -18,8 +18,7 @@ from sklearn.metrics import pairwise_distances from sklearn.utils.validation import check_random_state, validate_data -from radius_clustering.utils._emos import py_emos_main -from radius_clustering.utils._mds_approx import solve_mds +from .algorithms import clustering_approx, clustering_exact DIR_PATH = os.path.dirname(os.path.realpath(__file__)) @@ -53,20 +52,23 @@ class RadiusClustering(ClusterMixin, BaseEstimator): .. note:: The `random_state_` attribute is not used when the `manner` is set to "exact". + + .. versionchanged:: 2.0.0 + The `RadiusClustering` class has been refactored. + Clustering algorithms are now separated into their own module + (`algorithms.py`) to improve maintainability and extensibility. .. versionadded:: 1.3.0 - The *random_state* parameter was added to allow reproducibility in - the approximate method. + + - The *random_state* parameter was added to allow reproducibility in the approximate method. + + - The `radius` parameter replaces the `threshold` parameter for setting the dissimilarity threshold for better clarity and consistency. .. versionchanged:: 1.3.0 All publicly accessible attributes are now suffixed with an underscore (e.g., `centers_`, `labels_`). 
This is particularly useful for compatibility with scikit-learn's API. - .. versionadded:: 1.3.0 - The `radius` parameter replaces the `threshold` parameter for setting - the dissimilarity threshold for better clarity and consistency. - .. deprecated:: 1.3.0 The `threshold` parameter is deprecated. Use `radius` instead. Will be removed in a future version. @@ -243,7 +245,7 @@ def fit_predict(self, X: np.ndarray, y: None = None, metric: str | callable = "e labels : array, shape (n_samples,) The cluster labels for each point in X. """ - self.fit(X) + self.fit(X, metric=metric) return self.labels_ def _clustering(self): @@ -252,75 +254,15 @@ def _clustering(self): """ n = self.X_checked_.shape[0] if self.manner != "exact" and self.manner != "approx": - print(f"Invalid manner: {self.manner}. Defaulting to 'approx'.") raise ValueError("Invalid manner. Choose either 'exact' or 'approx'.") if self.manner == "exact": - self._clustering_exact(n) + self.centers_, self.mds_exec_time_ = clustering_exact(n, self.edges_, self.nb_edges_) else: - self._clustering_approx(n) - - def _clustering_exact(self, n: int) -> None: - """ - Perform exact MDS clustering. - - Parameters: - ----------- - n : int - The number of points in the dataset. - - Notes: - ------ - This function uses the EMOS algorithm to solve the MDS problem. - See: [jiang]_ for more details. - """ - self.centers_, self.mds_exec_time_ = py_emos_main( - self.edges_.flatten(), n, self.nb_edges_ - ) - self.centers_.sort() # Sort the centers to ensure consistent order - - def _clustering_approx(self, n: int) -> None: - """ - Perform approximate MDS clustering. - This method uses a pretty trick to set the seed for - the random state of the C++ code of the MDS solver. - - .. tip:: - The random state is used to ensure reproducibility of the results - when using the approximate method. - If `random_state` is None, a default value of 42 is used. - - .. important:: - :collapsible: closed - The trick to set the random state is : - 1. Use the `check_random_state` function to get a `RandomState`singleton - instance, set up with the provided `random_state`. - 2. Use the `randint` method of the `RandomState` instance to generate a - random integer. - 3. Use this random integer as the seed for the C++ code of the MDS solver. - - This ensures that the seed passed to the C++ code is always an integer, - which is required by the MDS solver, and allows for - reproducibility of the results. - - Parameters: - ----------- - n : int - The number of points in the dataset. - - Notes: - ------ - This function uses the approximation method to solve the MDS problem. - See [casado]_ for more details. 
- """ - if self.random_state is None: - self.random_state = 42 - self.random_state_ = check_random_state(self.random_state) - seed = self.random_state_.randint(np.iinfo(np.int32).max) - result = solve_mds( - n, self.edges_.flatten().astype(np.int32), self.nb_edges_, seed - ) - self.centers_ = sorted([x for x in result["solution_set"]]) - self.mds_exec_time_ = result["Time"] + if self.random_state is None: + self.random_state = 42 + self.random_state_ = check_random_state(self.random_state) + seed = self.random_state_.randint(np.iinfo(np.int32).max) + self.centers_, self.mds_exec_time_ = clustering_approx(n, self.edges_, self.nb_edges_, seed) def _compute_effective_radius(self): """ From c7447ef36ff8cee23b354fdc71c6bfff26ac18e3 Mon Sep 17 00:00:00 2001 From: Quentin Date: Wed, 18 Jun 2025 17:49:08 +0200 Subject: [PATCH 3/7] adding support for custom solvers --- .coverage | Bin 53248 -> 53248 bytes README.md | 8 +++ src/radius_clustering/algorithms.py | 38 ++++++++++-- src/radius_clustering/radius_clustering.py | 67 ++++++++++++++++++--- tests/test_unit.py | 62 ++++++++++++++++++- 5 files changed, 159 insertions(+), 16 deletions(-) diff --git a/.coverage b/.coverage index 3257cab877088ff7e27eada74743303d857685d0..4e3173d80d125ac231e7a533fb7ab87355052f04 100644 GIT binary patch literal 53248 zcmeI5dvILUdBD%TyV9O}?n|~TFP1R6m>64Fwh#_6kMQsdKlC)RkTz8y>cednHg_MYE8_q+S;de&WQT2sZOH;~I`;ze&EsUn(2uJ=4bhygz~ z{HR9+AgluTTbsbALxU{%VuS7W5UcWD;@)I;xbv-t?2FCkt&H&%^P7>lk$?^OKm%w1 z4WNNPZ3F%qD-xYCL%aRfV!SV%Eau~hWM2HXm)^L1@!I9y+Qmy+mwTeko69_S%%ATq z_VT$M-f%MS4W!aZFO}_2CE~?YcF-#hB~|Z{Lb9LV=%5X5b-d$(^8TD+seah2m>h&5 zhV!XRJip7^n%vb`?jWD7nA}+u4KPSDHJIf+yt#7s1}~o+NamB-M6w``Vs5Ixfi>*e z%_7n2YVEFMc~0?s0{)njW&;i5Q&}|t10{0#etz_Q`FJ)llq@uQ@nD9D931D)qL^(y zmuV{IymTsCo@Ta?DyDK-FS#?B7%3+EPdULxE=@f^-o)9!SeyBTHj6#wb4{=_zVj*e zl-=dd18{KJoYFeJBY$exxqQsokxU;b-!hUJ<|haC5Kmmpoo>(5?OV*r$^OCT<`n00 ze7=)Tr&7+(sMu>Ho7y{HNdH@;_Mc^cVVqsJ!!D`2)+8NISvBd{uyI}6*=@Omp2 zA1UU_aSTp=tQme;cTAy?XnnnQ*S7LP;d)|ysbV1jW747tnoqNI`1aGU8-A5cUL^b~ zN~gWtK>3p4dwb&Kz-pd&%`_<)@jSoKyqxR^M|xIQ8$Vb$2Cpxj>zlmR$}6UjgbM_6 zf$BCel1=bSrNHNq$Zb#NkqlBC*WJWHx;L~eoU%#Tz?7uTI`IhadiMyg1~YeC)g#4t!2UW%TNr(;YZJ zgjh5GUAd=MFCX$41a_uXZM`6Yg$;4jzhU;K@y#!9HO@ z^^&i3SOYKcHTOdTfAE0@&;S}h184vZpaC?12G9T+Km%w14Sezpgfv}?@cLhO-y-gN z@CzSk01co4G=K)s02)98XaEhM0W^RH(7?HAz|uoAUG+~qnq4ha7nT1AV9|mr7c?`N zmF^xT?os!D&&>(qQKJDgfCkV28bAYR01co4G=K)s02)98%77J`si}7XbQY>J5)P)nTK!AX)5Fsr)V81rqYF`SS*!I6=N~@x_}!) 
zw0()f&09XEaB%aL3I_(SP`GK}@&JcCg&X6G6~;GyG{F8v3VY)pQP>l2QrI7FRM^+Q zP+@Q10);)jF@^Eo`2qGcDBRF zTMFyunF?#`m_n!4PzZ^?Ldg8(nyH%+xn}B>8Y9A&QAqy7lEO71NqShP)xAbV=b!(F z-JcQnefK^0R{*%70W^RH&;S}h184vZpaC?12G9T+Km#9Y1EDZ{;eRN^A9Vh}zyJS# E0S#_RJOBUy diff --git a/README.md b/README.md index b389e50..5b8af24 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,14 @@ Radius clustering is a Python package that implements clustering under radius co - Supports radius-constrained clustering - Provides options for exact and approximate solutions +## Roadmap + +- [ ] Version 2.0.0beta: + - [x] Add support for custom MDS solvers + - [ ] Improve documentation and examples + - [ ] Add iterative algorithm in both exact and approximate versions, which will allow to use the package in a more flexible way, especially when not knowing the radius beforehand. + - [ ] Add more examples and tutorials + ## Installation You can install Radius Clustering using pip: diff --git a/src/radius_clustering/algorithms.py b/src/radius_clustering/algorithms.py index d94ed8a..1df668a 100644 --- a/src/radius_clustering/algorithms.py +++ b/src/radius_clustering/algorithms.py @@ -45,16 +45,28 @@ def clustering_approx( This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results. + + .. note:: + This function uses the approximation method to solve the MDS problem. + See [casado]_ for more details. Parameters: ----------- n : int The number of points in the dataset. - - Notes: - ------ - This function uses the approximation method to solve the MDS problem. - See [casado]_ for more details. + edges : np.ndarray + The edges of the graph, flattened into a 1D array. + nb_edges : int + The number of edges in the graph. + random_state : int | None + The random state to use for reproducibility. + If None, a default value of 42 is used. + Returns: + -------- + centers : list + A sorted list of the centers of the clusters. + mds_exec_time : float + The execution time of the MDS algorithm in seconds. """ result = solve_mds( n, edges.flatten().astype(np.int32), nb_edges, random_state @@ -63,7 +75,7 @@ def clustering_approx( mds_exec_time = result["Time"] return centers, mds_exec_time -def clustering_exact(n: int, edges: np.ndarray, nb_edges: int) -> None: +def clustering_exact(n: int, edges: np.ndarray, nb_edges: int, seed: None = None) -> None: """ Perform exact MDS clustering. @@ -78,6 +90,20 @@ def clustering_exact(n: int, edges: np.ndarray, nb_edges: int) -> None: ----------- n : int The number of points in the dataset. + edges : np.ndarray + The edges of the graph, flattened into a 1D array. + nb_edges : int + The number of edges in the graph. + seed : None + This parameter is not used in the exact method, but it is kept for + compatibility with the approximate method. + + Returns: + -------- + centers : list + A sorted list of the centers of the clusters. + mds_exec_time : float + The execution time of the MDS algorithm in seconds. 
""" centers, mds_exec_time = py_emos_main( edges.flatten(), n, nb_edges diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index 2c58c87..a5fab56 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -75,6 +75,10 @@ class RadiusClustering(ClusterMixin, BaseEstimator): """ _estimator_type = "clusterer" + _algorithms = { + "exact": clustering_exact, + "approx": clustering_approx, + } def __init__( self, @@ -211,7 +215,7 @@ def fit(self, X: np.ndarray, y: None = None, metric: str | callable = "euclidean np.uint32 ) # Edges in the adjacency matrix # uint32 is used to use less memory. Max number of features is 2^32-1 - + self.clusterer_ = self._algorithms.get(self.manner, self._algorithms["approx"]) self._clustering() self._compute_effective_radius() self._compute_labels() @@ -253,16 +257,16 @@ def _clustering(self): Perform the clustering using either the exact or approximate MDS method. """ n = self.X_checked_.shape[0] - if self.manner != "exact" and self.manner != "approx": - raise ValueError("Invalid manner. Choose either 'exact' or 'approx'.") - if self.manner == "exact": - self.centers_, self.mds_exec_time_ = clustering_exact(n, self.edges_, self.nb_edges_) - else: + if self.manner not in self._algorithms: + raise ValueError(f"Invalid manner. Please choose in {list(self._algorithms.keys())}.") + if self.clusterer_ == clustering_approx: if self.random_state is None: self.random_state = 42 self.random_state_ = check_random_state(self.random_state) seed = self.random_state_.randint(np.iinfo(np.int32).max) - self.centers_, self.mds_exec_time_ = clustering_approx(n, self.edges_, self.nb_edges_, seed) + else: + seed = None + self.centers_, self.mds_exec_time_ = self.clusterer_(n, self.edges_, self.nb_edges_, seed) def _compute_effective_radius(self): """ @@ -282,3 +286,52 @@ def _compute_labels(self): min_dist = np.min(distances, axis=1) self.labels_[min_dist > self.radius] = -1 + + def set_solver(self, solver: callable) -> None: + """ + Set a custom solver for resolving the MDS problem. + This method allows users to replace the default MDS solver with a custom one. + + .. important:: + The custom solver must accept the same parameters as the default solvers + and return a tuple containing the cluster centers and the execution time. + e.g., it should have the signature: + ```python + def custom_solver( + n: int, + edges: np.ndarray, + nb_edges: int, + random_state: int | None = None + ) -> tuple[list, float]: + # Custom implementation details + centers = [...] + exec_time = ... + # Return the centers and execution time + return centers, exec_time + ``` + + Parameters: + ---------- + solver : callable + The custom solver function to use for MDS clustering. + It should accept the same parameters as the default solvers + and return a tuple containing the cluster centers and the execution time. + + Raises: + ------- + ValueError + If the provided solver does not have the correct signature. 
+ """ + if not callable(solver): + raise ValueError("The provided solver must be callable.") + + # Check if the solver has the correct signature + try: + n = 3 + edges = np.array([[0, 1], [1, 2], [2, 0]]) + nb_edges = edges.shape[0] + solver(n, edges, nb_edges, random_state=None) + except Exception as e: + raise ValueError(f"The provided solver does not have the correct signature: {e}") from e + self.manner = "custom" + self._algorithms["custom"] = solver \ No newline at end of file diff --git a/tests/test_unit.py b/tests/test_unit.py index ee0cfec..bf846be 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -84,10 +84,10 @@ def test_radius_clustering_invalid_manner(): """ Test that an error is raised when an invalid manner is provided. """ - with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + with pytest.raises(ValueError): RadiusClustering(manner="invalid", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) - with pytest.raises(ValueError, match="Invalid manner. Choose either 'exact' or 'approx'."): + with pytest.raises(ValueError): RadiusClustering(manner="", radius=1.43).fit([[0, 1], [1, 0], [2, 1]]) @@ -102,4 +102,60 @@ def test_radius_clustering_invalid_radius(): RadiusClustering(manner="approx", radius=0.0).fit([[0, 1], [1, 0], [2, 1]]) with pytest.raises(ValueError, match="Radius must be a positive float."): - RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) \ No newline at end of file + RadiusClustering(manner="exact", radius="invalid").fit([[0, 1], [1, 0], [2, 1]]) + +def test_radius_clustering_fit_without_data(): + """ + Test that an error is raised when fitting without data. + """ + clustering = RadiusClustering(manner="exact", radius=1.5) + with pytest.raises(ValueError): + clustering.fit(None) + +def test_radius_clustering_new_clusterer(): + """ + Test that a custom clusterer can be set within the RadiusClustering class. + """ + def custom_clusterer(n, edges, nb_edges, random_state=None): + # A mock custom clusterer that returns a fixed set of centers + # and a fixed execution time + return [0, 1], 0.1 + clustering = RadiusClustering(manner="exact", radius=1.5) + # Set the custom clusterer + assert hasattr(clustering, 'set_solver'), "RadiusClustering should have a set_solver method." + assert callable(clustering.set_solver), "set_solver should be callable." + clustering.set_solver(custom_clusterer) + # Fit the clustering with the custom clusterer + X = np.array([[0, 1], + [1, 0], + [2, 1]]) + clustering.fit(X) + assert clustering.clusterer_ == custom_clusterer, "The custom clusterer should be set correctly." + # Check that the labels are assigned correctly + assert len(clustering.labels_) == X.shape[0], "Labels length should match number of samples." + assert clustering.nb_edges_ > 0, "There should be edges in the graph." + assert clustering.centers_ == [0, 1], "The centers should match the custom clusterer's output." + assert clustering.mds_exec_time_ == 0.1, "The MDS execution time should match the custom clusterer's output." + +def test_invalid_clusterer(): + """ + Test that an error is raised when an invalid clusterer is set. 
+ """ + clustering = RadiusClustering(manner="exact", radius=1.5) + with pytest.raises(ValueError, match="The provided solver must be callable."): + clustering.set_solver("not_a_callable") + + with pytest.raises(ValueError, match="The provided solver must be callable."): + clustering.set_solver(12345) # Not a callable + with pytest.raises(ValueError, match="The provided solver must be callable."): + clustering.set_solver(None) + + def invalid_signature(): + return [0, 1], 0.1 + + with pytest.raises(ValueError): + clustering.set_solver(invalid_signature) + def invalid_clusterer(n, edges, nb_edges): + return [0, 1], 0.1 + with pytest.raises(ValueError): + clustering.set_solver(invalid_clusterer) \ No newline at end of file From 1069610c694ca94a31c9839d798223ec710f04cd Mon Sep 17 00:00:00 2001 From: Quentin Date: Thu, 19 Jun 2025 11:12:06 +0200 Subject: [PATCH 4/7] updated doc for new API --- docs/source/usage.rst | 106 +++++++++- examples/plot_benchmark_custom.py | 230 +++++++++++++++++++++ pyproject.toml | 1 + src/radius_clustering/algorithms.py | 1 + src/radius_clustering/radius_clustering.py | 32 +-- 5 files changed, 353 insertions(+), 17 deletions(-) create mode 100644 examples/plot_benchmark_custom.py diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 1826840..b6b3e50 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -1,7 +1,20 @@ Usage ===== -Here's a basic example of how to use Radius Clustering: +This page provides a quick guide on how to use the `radius_clustering` package for clustering tasks. The package provides a simple interface for performing radius-based clustering on datasets based on the Minimum Dominating Set (MDS) algorithm. + +This page is divided into three main sections: +1. **Basic Usage**: A quick example of how to use the `RadiusClustering` class and perform clustering with several parameters. +2. **Custom Dissimilarity Function**: How to use a custom dissimilarity function with the `RadiusClustering` class. +3. **Custom MDS Solver**: How to implement a custom MDS solver for more advanced clustering tasks, eventually with less guarantees on the results. + + +Basic Usage +----------------- + +The `RadiusClustering` class provides a straightforward way to perform clustering based on a specified radius. You can choose between an approximate or exact method for clustering, depending on your needs. + +Here's a basic example of how to use Radius Clustering with the `RadiusClustering` class, using the approximate method: .. code-block:: python @@ -22,4 +35,93 @@ Here's a basic example of how to use Radius Clustering: # Get cluster labels labels = rad.labels_ - print(labels) \ No newline at end of file + print(labels) + +Similarly, you can use the exact method by changing the `manner` parameter to `"exact"`: +.. code-block:: python + # [...] Exact same code as above + rad = RadiusClustering(manner="exact", radius=0.5) #change this parameter + # [...] Exact same code as above + +Custom Dissimilarity Function +----------------------------- + +The main reason behind the `radius_clustering` package is that users eventually needs to use a dissimilarity function that is not a metric (or distance) function. Plus, sometimes context requires a domain-specific dissimilarity function that is not provided by default, and needs to be implemented by the user. + +To use a custom dissimilarity function, you can pass it as a parameter to the `RadiusClustering` class. Here's an example of how to do this: +.. 
code-block:: python + + from radius_clustering import RadiusClustering + import numpy as np + + # Generate random data + X = np.random.rand(100, 2) + + # Define a custom dissimilarity function + def dummy_dissimilarity(x, y): + return np.linalg.norm(x - y) + 0.1 # Example: add a constant to the distance + + # Create an instance of MdsClustering with the custom dissimilarity function + rad = RadiusClustering(manner="approx", radius=0.5, metric=dummy_dissimilarity) + + # Fit the model to the data + rad.fit(X) + + # Get cluster labels + labels = rad.labels_ + + print(labels) + + +.. note:: + The custom dissimilarity function will be passed to scikit-learn's `pairwise_distances` function, so it should be compatible with the expected input format and return type. See the scikit-learn documentation for more details on how to implement custom metrics. + +Custom MDS Solver +----------------- + +The two default solvers provided by the actual implementation of the `radius_clustering` package are focused on exactness (or proximity to exactness) of the results of a NP-hard problem. So, they may not be suitable for all use cases, especially when performance is a concern. +If you have your own implementation of a Minimum Dominating Set (MDS) solver, you can use it with the `RadiusClustering` class ny using the :py:func:'RadiusClustering.set_solver' method. It will check that the solver is compatible with the expected input format and return type, and will use it to perform clustering. + +Here's an example of how to implement a custom MDS solver and use it with the `RadiusClustering` class, using NetworkX implementation of the dominating set problem : + +.. code-block:: python + + from radius_clustering import RadiusClustering + import time + import numpy as np + import networkx as nx + + # Generate random data + X = np.random.rand(100, 2) + + # Define a custom MDS solver using NetworkX + def custom_mds_solver(n, edges, nb_edges, random_state=None): + start = time.time() + graph = nx.Graph(edges) + centers = list(nx.algorithms.dominating_set(graph)) + centers.sort() + end = time.time() + return centers, end - start + + # Create an instance of MdsClustering with the custom MDS solver + rad = RadiusClustering(manner="approx", radius=0.5) + rad.set_solver(custom_mds_solver) + + # Fit the model to the data + rad.fit(X) + + # Get cluster labels + labels = rad.labels_ + + print(labels) + +.. note:: + The custom MDS solver should accept the same parameters as the default solvers, including the number of points `n`, the edges of the graph `edges`, the number of edges `nb_edges`, and an optional `random_state` parameter for reproducibility. It should return a list of centers and the time taken to compute them. + The `set_solver` method will check that the custom solver is compatible with the expected input format and return type, and will use it to perform clustering. + If the custom solver is not compatible, it will raise a `ValueError` with a descriptive message. + +.. attention:: + We cannot guarantee that the custom MDS solver will produce the same results as the default solvers, especially if it is not purposely designed to solve the Minimum Dominating Set problem but rather just finds a dominating set. The results may vary depending on the implementation and the specific characteristics of the dataset. 
+ As an example, a benchmark of our solutions and a custom one using NetworkX is available in the `Example Gallery` section of the documentation, which shows that the custom solver may produce different results than the default solvers, especially in terms of the number of clusters and the time taken to compute them. + However, it can be useful for specific use cases where performance is a concern or when you have a custom implementation that fits your needs better. + diff --git a/examples/plot_benchmark_custom.py b/examples/plot_benchmark_custom.py new file mode 100644 index 0000000..866825d --- /dev/null +++ b/examples/plot_benchmark_custom.py @@ -0,0 +1,230 @@ +""" +===================================================================================== +Benchmark of Radius Clustering using multiple datasets and comparison with custom MDS +===================================================================================== + +This example demonstrates how to implement a custom solver for the MDS problem +and use it within the Radius Clustering framework. +Plus, it compares the results of a naive implementation using the +`NetworkX` library with the Radius Clustering implementation. + +The example includes: + 1. Defining the custom MDS solver. + 2. Defining datasets to test the clustering. + 3. Applying Radius clustering on the datasets using the custom MDS solver. + 4. Ensure this solution works. + 5. Establish a benchmark procedure to compare the Radius clustering with a naive implementation using `NetworkX`. + 6. Comparing the results in terms of : + - Execution time + - Number of cluster found + 7. Visualizing the benchmark results. + 8. Visualizing the clustering results. + +This example is useful for understanding how to implement a custom MDS solver +and how to perform an advanced usage of the package. +""" +# Author: Haenn Quentin +# SPDX-License-Identifier: MIT + +# %% +# Import necessary libraries +# -------------------------- +# +# Since this example is a benchmark, we need to import the necessary libraries +# to perform the benchmark, including `NetworkX` for the naive implementation, +# `matplotlib` for visualization, and `sklearn` for the datasets. + + +import networkx as nx +import numpy as np +import matplotlib.pyplot as plt +import time +import warnings + +from sklearn.datasets import fetch_openml +from radius_clustering import RadiusClustering +from sklearn.metrics import pairwise_distances_argmin + +warnings.filterwarnings("ignore", category=RuntimeWarning, module="sklearn") +# %% +# Define a custom MDS solver +# -------------------------- +# +# We define a custom MDS solver that uses the `NetworkX` library to compute the MDS. +# Note the signature of the function is identical to the one used in the `RadiusClustering` class. + + +def custom_solver(n: int, edges: np.ndarray, nb_edges: int, random_state=None): + """ + Custom MDS solver using NetworkX to compute the MDS problem. + + Parameters: + ----------- + n : int + The number of points in the dataset. + edges : np.ndarray + The edges of the graph, flattened into a 1D array. + nb_edges : int + The number of edges in the graph. + random_state : int | None + The random state to use for reproducibility. + + Returns: + -------- + centers : list + A sorted list of the centers of the clusters. + mds_exec_time : float + The execution time of the MDS algorithm in seconds. 
+ """ + G = nx.Graph() + G.add_edges_from(edges) + + start_time = time.time() + centers = list(nx.algorithms.dominating.dominating_set(G)) + mds_exec_time = time.time() - start_time + + centers = sorted(centers) + + return centers, mds_exec_time + + +# %% +# Define datasets to test the clustering +# -------------------------------------- +# +# We will use 4 datasets to test the clustering: +# 1. Iris dataset +# 2. Wine dataset +# 3. Breast Cancer dataset (WDBC) +# 4. Vehicle dataset +# These are common datasets used in machine learning and lead to pretty fast results. +# Structure of the variable `DATASETS`: +# - The key is the name of the dataset. +# - The value is a tuple containing: +# - The dataset fetched from OpenML. +# - The radius to use for the Radius clustering. (determined in literature, see references on home page) +# + + +DATASETS = { + "iris": (fetch_openml(name="iris", version=1, as_frame=False), 1.43), + "wine": (fetch_openml(name="wine", version=1, as_frame=False), 232.09), + "glass": (fetch_openml(name="glass", version=1, as_frame=False), 3.94), + "ionosphere": (fetch_openml(name="ionosphere", version=1, as_frame=False), 5.46), + "breast_cancer": (fetch_openml(name="wdbc", version=1, as_frame=False), 1197.42), + "synthetic": (fetch_openml(name="synthetic_control", version=1, as_frame=False), 70.12), + "vehicle": (fetch_openml(name="vehicle", version=1, as_frame=False), 155.05), + "yeast": (fetch_openml(name="yeast", version=1, as_frame=False), 0.4235), +} + +# %% +# Define the benchmark procedure +# -------------------------------------- +# +# We define a function to perform the benchmark on the datasets. +# The procedure is as follows: +# 1. Creates an instance of RadiusClustering for each solver. +# 2. For each instance, fit the algorithm on each dataset. +# 3. Store the execution time and the number of clusters found for each dataset. +# 4. Return the results as a dictionary. + + +def benchmark_radius_clustering(): + results = {} + exact = RadiusClustering(manner="exact", radius=1.43) + approx = RadiusClustering(manner="approx", radius=1.43) + custom = RadiusClustering( + manner="custom", radius=1.43 + ) + custom.set_solver(custom_solver) # Set the custom solver + algorithms = [exact, approx, custom] + # Loop through each algorithm and dataset + for algo in algorithms: + algo_results = {} + time_algo = [] + clusters_algo = [] + # Loop through each dataset + for name, (dataset, radius) in DATASETS.items(): + X = dataset.data + # set the radius for the dataset considered + setattr(algo, "radius", radius) + # Fit the algorithm + t0 = time.time() + algo.fit(X) + t_algo = time.time() - t0 + + # Store the results + time_algo.append(t_algo) + clusters_algo.append(len(algo.centers_)) + algo_results["time"] = time_algo + algo_results["clusters"] = clusters_algo + results[algo.manner] = algo_results + + return results + + +# %% +# Run the benchmark and plot the results +# -------------------------------------- +# We run the benchmark and plot the results for each dataset. 
+ + +results = benchmark_radius_clustering() + +# Plot the results +fig, axs = plt.subplot_mosaic( + [ + ["time", "time", "time", "time"], + ["iris", "wine", "breast_cancer", "vehicle"], + ["glass", "ionosphere", "synthetic", "yeast"], + ], + layout="constrained", + figsize=(12, 8), +) +fig.suptitle("Benchmark of Radius Clustering Solvers", fontsize=16) + +axs['time'].set_yscale('log') # Use logarithmic scale for better visibility +for algo, algo_results in results.items(): + # Plot execution time + axs['time'].plot( + DATASETS.keys(), + algo_results["time"], + marker='o', + label=algo, + ) + # Plot number of clusters + +for i, (name, (dataset, _)) in enumerate(DATASETS.items()): + axs[name].bar( + results.keys(), + [results[algo]["clusters"][i] for algo in results.keys()], + label=name, + ) + axs[name].axhline( + y=len(set(dataset.target)), # Number of unique classes in the dataset + label="True number of clusters", + color='r', + linestyle='--', + ) + axs[name].set_title(name) + axs[name].set_xlabel("Algorithms") + +axs["iris"].set_ylabel("Number of clusters") +axs["glass"].set_ylabel("Number of clusters") + +axs['time'].set_title("Execution Time (log scale)") +axs['time'].set_xlabel("Datasets") +axs['time'].set_ylabel("Time (seconds)") +axs['time'].legend(title="Algorithms") +plt.tight_layout() +plt.show() + + +# %% +# Conclusion +# ---------- +# +# In this example, we applied Radius clustering to the Iris and Wine datasets and compared it with KMeans clustering. +# We visualized the clustering results and the difference between the two clustering algorithms. +# We saw that Radius Clustering can lead to smaller clusters than kmeans, which produces much more equilibrate clusters. +# The difference plot can be very useful to see where the two clustering algorithms differ. diff --git a/pyproject.toml b/pyproject.toml index 0ee2d8d..deb1997 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dev = [ ] doc = [ + "networkx>=3.3", "sphinx>=8.1.3", "sphinx_gallery>=0.18.0", "sphinx-copybutton>=0.5.2", diff --git a/src/radius_clustering/algorithms.py b/src/radius_clustering/algorithms.py index 1df668a..a30cd0b 100644 --- a/src/radius_clustering/algorithms.py +++ b/src/radius_clustering/algorithms.py @@ -42,6 +42,7 @@ def clustering_approx( 3. Use this random integer as the seed for the C++ code of the MDS solver. + This ensures that the seed passed to the C++ code is always an integer, which is required by the MDS solver, and allows for reproducibility of the results. diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index a5fab56..0ac9e63 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -296,22 +296,23 @@ def set_solver(self, solver: callable) -> None: The custom solver must accept the same parameters as the default solvers and return a tuple containing the cluster centers and the execution time. e.g., it should have the signature: - ```python - def custom_solver( - n: int, - edges: np.ndarray, - nb_edges: int, - random_state: int | None = None - ) -> tuple[list, float]: - # Custom implementation details - centers = [...] - exec_time = ... - # Return the centers and execution time - return centers, exec_time - ``` - + + >>> def custom_solver( + >>> n: int, + >>> edges: np.ndarray, + >>> nb_edges: int, + >>> random_state: int | None = None + >>> ) -> tuple[list, float]: + >>> # Custom implementation details + >>> centers = [...] + >>> exec_time = ... 
+ >>> # Return the centers and execution time + >>> return centers, exec_time + + This allows for flexibility in how the MDS problem is solved. + Parameters: - ---------- + ----------- solver : callable The custom solver function to use for MDS clustering. It should accept the same parameters as the default solvers @@ -321,6 +322,7 @@ def custom_solver( ------- ValueError If the provided solver does not have the correct signature. + """ if not callable(solver): raise ValueError("The provided solver must be callable.") From 5fc108385090cc3400b2159864603d0015075481 Mon Sep 17 00:00:00 2001 From: Quentin Date: Thu, 19 Jun 2025 11:22:16 +0200 Subject: [PATCH 5/7] enhancement of documentation and version 1.4.0 --- README.md | 7 +++---- docs/source/usage.rst | 6 +++++- src/radius_clustering/algorithms.py | 5 ++--- src/radius_clustering/radius_clustering.py | 10 +++++++++- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 5b8af24..56239ff 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,10 @@ Radius clustering is a Python package that implements clustering under radius co ## Roadmap -- [ ] Version 2.0.0beta: +- [x] Version 1.4.0: - [x] Add support for custom MDS solvers - - [ ] Improve documentation and examples - - [ ] Add iterative algorithm in both exact and approximate versions, which will allow to use the package in a more flexible way, especially when not knowing the radius beforehand. - - [ ] Add more examples and tutorials + - [x] Improve documentation and examples + - [x] Add more examples and tutorials ## Installation diff --git a/docs/source/usage.rst b/docs/source/usage.rst index b6b3e50..f340b98 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -82,6 +82,10 @@ Custom MDS Solver The two default solvers provided by the actual implementation of the `radius_clustering` package are focused on exactness (or proximity to exactness) of the results of a NP-hard problem. So, they may not be suitable for all use cases, especially when performance is a concern. If you have your own implementation of a Minimum Dominating Set (MDS) solver, you can use it with the `RadiusClustering` class ny using the :py:func:'RadiusClustering.set_solver' method. It will check that the solver is compatible with the expected input format and return type, and will use it to perform clustering. +.. versionadded:: 1.4.0 + The :py:func:`RadiusClustering.set_solver` method was added to allow users to set a custom MDS solver. + It is *NOT* backward compatible with previous versions of the package, as it comes with new structure and methods to handle custom solvers. + Here's an example of how to implement a custom MDS solver and use it with the `RadiusClustering` class, using NetworkX implementation of the dominating set problem : .. code-block:: python @@ -122,6 +126,6 @@ Here's an example of how to implement a custom MDS solver and use it with the `R .. attention:: We cannot guarantee that the custom MDS solver will produce the same results as the default solvers, especially if it is not purposely designed to solve the Minimum Dominating Set problem but rather just finds a dominating set. The results may vary depending on the implementation and the specific characteristics of the dataset. 
- As an example, a benchmark of our solutions and a custom one using NetworkX is available in the `Example Gallery` section of the documentation, which shows that the custom solver may produce different results than the default solvers, especially in terms of the number of clusters and the time taken to compute them. + As an example, a benchmark of our solutions and a custom one using NetworkX is available in the `Example Gallery` section of the documentation, which shows that the custom solver may produce different results than the default solvers, especially in terms of the number of clusters and the time taken to compute them (see :ref:`sphx_glr_auto_examples_plot_benchmark_custom.py`). However, it can be useful for specific use cases where performance is a concern or when you have a custom implementation that fits your needs better. diff --git a/src/radius_clustering/algorithms.py b/src/radius_clustering/algorithms.py index a30cd0b..0e71165 100644 --- a/src/radius_clustering/algorithms.py +++ b/src/radius_clustering/algorithms.py @@ -5,11 +5,10 @@ These functions can be replaced in the `RadiusClustering` class to perform clustering using another algorithm. -.. versionadded:: 2.0.0 +.. versionadded:: 1.4.0 Refactoring the structure of the code to separate the clustering algorithms This allows for easier maintenance and extensibility of the codebase. - Plus, this allows for the addition of new clustering algorithms - such as `Curgraph` added in this version. + """ from __future__ import annotations diff --git a/src/radius_clustering/radius_clustering.py b/src/radius_clustering/radius_clustering.py index 0ac9e63..aa63fc0 100644 --- a/src/radius_clustering/radius_clustering.py +++ b/src/radius_clustering/radius_clustering.py @@ -53,10 +53,15 @@ class RadiusClustering(ClusterMixin, BaseEstimator): .. note:: The `random_state_` attribute is not used when the `manner` is set to "exact". - .. versionchanged:: 2.0.0 + .. versionchanged:: 1.4.0 The `RadiusClustering` class has been refactored. Clustering algorithms are now separated into their own module (`algorithms.py`) to improve maintainability and extensibility. + + .. versionadded:: 1.4.0 + The `set_solver` method was added to allow users to set a custom solver + for the MDS problem. This allows for flexibility in how the MDS problem is solved + and enables users to use their own implementations of MDS clustering algorithms. .. versionadded:: 1.3.0 @@ -292,6 +297,9 @@ def set_solver(self, solver: callable) -> None: Set a custom solver for resolving the MDS problem. This method allows users to replace the default MDS solver with a custom one. + An example is provided below and in the example gallery : + :ref:`sphx_glr_auto_examples_plot_benchmark_custom.py` + .. important:: The custom solver must accept the same parameters as the default solvers and return a tuple containing the cluster centers and the execution time. From 9787153eb34c0bc39bca34190c58d8b4bc27966a Mon Sep 17 00:00:00 2001 From: Quentin Date: Thu, 19 Jun 2025 11:57:17 +0200 Subject: [PATCH 6/7] updated changelogs, README and version to 1.4.0 --- CHANGELOG.md | 44 +++++++++++++++++++++++++++++++ README.md | 16 ++++++++--- src/radius_clustering/__init__.py | 2 +- 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a0e668..3002826 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,33 @@ # Changelog +All notable changes to this project will be documented in this file. 
+
+## [1.4.0] - 2025-06-19
+
+### Contributors
+
+- @quentinhaenn (Quentin Haenn) - Main developer and maintainer
+
+### Added
+
+- Added support for custom MDS solvers in the `RadiusClustering` class.
+- Updated the documentation to include examples of using custom MDS solvers.
+- Added more examples and tutorials to the documentation.
+
+### Changed
+
+- Improved documentation and examples for the `RadiusClustering` class.
+- Updated the README to reflect the new features and improvements in version 1.4.0.
+- Updated the test cases to ensure compatibility with the new features.
+- Refactored the main codebase to improve readability and maintainability.
+- Prepared the codebase for future additions of MDS solvers and/or clustering algorithms.
+
 ## [1.3.0] - 2025-06-18

+### Contributors
+
+- @quentinhaenn (Quentin Haenn) - Main developer and maintainer
+
 ### Added

 - Full test coverage for the entire codebase.
@@ -17,3 +43,21 @@
 - Updated all the attributes in the `RadiusClustering` class to fit `scikit-learn` standards and conventions.
 - Updated the test cases to reflect the changes in the `RadiusClustering` class.
 - Updated README and documentation to reflect the new `radius` parameter and the deprecation of `threshold`.
+
+## [1.2.0] - 2024-10
+
+### Contributors
+
+- @quentinhaenn (Quentin Haenn) - Main developer and maintainer
+- @mickaelbaron (Mickaël Baron) - Contributor and maintainer
+
+### Added
+
+- Added CI/CD pipelines with GitHub Actions for automated testing and deployment.
+- Added package metadata for better integration with PyPI.
+- Added a badge for the GitHub Actions workflow status in the README.
+- Added a badge for the supported Python versions in the README.
+- Added a badge for the code style (Ruff) in the README.
+- Added a badge for the license in the README.
+- Added CI/CD pipelines for PyPI deployment (including test coverage, compiling extensions and wheels, and uploading to PyPI).
+- Resolved issues with compiling Cython extensions on Windows and macOS.
diff --git a/README.md b/README.md
index 56239ff..2b1b09e 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,17 @@ Radius clustering is a Python package that implements clustering under radius co
 - Compatible with scikit-learn's API for clustering algorithms
 - Supports radius-constrained clustering
 - Provides options for exact and approximate solutions
+- Easy to use and integrate with existing Python data science workflows
+- Includes comprehensive documentation and examples
+- Full test coverage to ensure reliability and correctness
+- Supports custom MDS solvers for flexibility in clustering approaches
+- Provides a user-friendly interface for clustering tasks
+
+> [!CAUTION]
+> **Deprecation Notice**: The `threshold` parameter in the `RadiusClustering` class has been deprecated. Use the `radius` parameter instead to specify the clustering radius; `threshold` will be removed entirely in version 2.0.0. The `radius` parameter is now the standard way to define the radius for clustering, in line with our goal of making parameter names more intuitive and user-friendly.
+
+> [!NOTE]
+> **NEW VERSIONS**: The package is under active development, including refactoring and enhancements to the existing codebase. Backward compatibility is not guaranteed, so please check the [CHANGELOG](CHANGELOG.md) for details on changes and updates.
 ## Roadmap

@@ -45,7 +56,7 @@ from radius_clustering import RadiusClustering

 X = np.random.rand(100, 2) # Generate random data

 # Create an instance of RadiusClustering
-rad_clustering = RadiusClustering(manner="approx", threshold=0.5)
+rad_clustering = RadiusClustering(manner="approx", radius=0.5)

 # Fit the model to the data
 rad_clustering.fit(X)
@@ -116,5 +127,4 @@ The Radius Clustering work has been funded by:

 - [1] [An iterated greedy algorithm for finding the minimum dominating set in graphs](https://www.sciencedirect.com/science/article/pii/S0378475422005055)
 - [2] [An exact algorithm for the minimum dominating set problem](https://dl.acm.org/doi/abs/10.24963/ijcai.2023/622)
-
-
+- [3] [Clustering under radius constraint using minimum dominating set](https://link.springer.com/chapter/10.1007/978-3-031-62700-2_2)
diff --git a/src/radius_clustering/__init__.py b/src/radius_clustering/__init__.py
index 9609e48..57c5d53 100644
--- a/src/radius_clustering/__init__.py
+++ b/src/radius_clustering/__init__.py
@@ -2,4 +2,4 @@
 from .radius_clustering import RadiusClustering

 __all__ = ["RadiusClustering"]
-__version__ = "1.3.0"
+__version__ = "1.4.0"

From cbd0904043f1f6f15c192adbbe79f4281ecd107d Mon Sep 17 00:00:00 2001
From: Quentin
Date: Thu, 19 Jun 2025 12:06:27 +0200
Subject: [PATCH 7/7] temporary deprecation of self-hosted CI/CD

---
 .github/workflows/build_wheels.yml | 4 ++--
 .github/workflows/pr_build.yml | 2 +-
 .github/workflows/tests.yml | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index e170743..e942fc3 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -16,7 +16,7 @@ on:
 jobs:
 run_pytest:
 name: Run tests on min and max Python versions
- runs-on: self-hosted
+ runs-on: ubuntu-latest
 strategy:
 fail-fast: true
 matrix:
@@ -61,7 +61,7 @@ jobs:

 build_sdist:
 name: Build source distribution
- runs-on: self-hosted
+ runs-on: ubuntu-latest
 needs: run_pytest
 steps:
 - uses: actions/checkout@v4
diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml
index cfce0bb..873987d 100644
--- a/.github/workflows/pr_build.yml
+++ b/.github/workflows/pr_build.yml
@@ -17,7 +17,7 @@ jobs:

 build_test_sdist:
 name: Test source distribution
- runs-on: self-hosted
+ runs-on: ubuntu-latest
 needs: run_pytest
 strategy:
 fail-fast: true
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 811dfd1..243c494 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,7 +7,7 @@ on:
 jobs:
 pytest:
 name: Run pytest
- runs-on: self-hosted
+ runs-on: ubuntu-latest
 strategy:
 fail-fast: true
 matrix:
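
A note on the `set_solver` hook documented in patches 4 and 5: the docstrings only require that a custom solver accept the same parameters as the built-in solvers and return a `(centers, exec_time)` tuple. The sketch below shows what such a callable could look like using NetworkX's dominating-set routine, in the spirit of the benchmark example referenced in `docs/source/usage.rst`. The parameter names (`n`, `edges`, `nb_edges`, `random_state`) and the usage lines are illustrative assumptions, not the package's confirmed signature; check the `set_solver` docstring before relying on them.

```python
import time

import networkx as nx


def networkx_mds_solver(n, edges, nb_edges, random_state=None):
    """Hypothetical custom solver: greedy dominating set via NetworkX.

    Parameter names are assumed for illustration; the documented contract is
    only "same parameters as the default solvers" and a (centers, exec_time)
    return value.
    """
    start = time.perf_counter()
    graph = nx.Graph()
    graph.add_nodes_from(range(n))  # keep isolated points so each can be its own center
    graph.add_edges_from((int(u), int(v)) for u, v in edges)  # edges of the radius graph
    centers = sorted(int(c) for c in nx.dominating_set(graph))
    exec_time = time.perf_counter() - start
    return centers, exec_time


# Hypothetical usage, mirroring the example referenced in the docs:
# clusterer = RadiusClustering(manner="approx", radius=0.5)
# clusterer.set_solver(networkx_mds_solver)
# labels = clusterer.fit_predict(X)
```

Because `nx.dominating_set` is a fast greedy heuristic rather than an exact MDS solver, it may return more centers than the default solvers, which matches the caveat in the `.. attention::` block of `usage.rst`.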