Fix the RENN stopping criterion: also stop when a class falls below the minority count or disappears

Guillaume Lemaitre · Guillaume Lemaitre · commit 7a1459e18353 · 2016-08-31T10:13:33.000+02:00
Conflicts:
	imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py
diff --git a/imblearn/under_sampling/edited_nearest_neighbours.py b/imblearn/under_sampling/edited_nearest_neighbours.py
@@ -330,14 +330,56 @@ def _sample(self, X, y):
 
             prev_len = y_.shape[0]
             if self.return_indices:
-                X_, y_, idx_ = self.enn_.fit_sample(X_, y_)
-                idx_under = idx_under[idx_]
+                X_enn, y_enn, idx_enn = self.enn_.fit_sample(X_, y_)
             else:
-                X_, y_ = self.enn_.fit_sample(X_, y_)
-
-            if prev_len == y_.shape[0]:
+                X_enn, y_enn = self.enn_.fit_sample(X_, y_)
+
+            # Check the stopping criteria; stop as soon as one of them holds:
+            # 1. The number of samples in the vector y did not change
+            # 2. The number of samples in one of the other classes becomes
+            # smaller than the minority-class count stored in self.stats_c_
+            # 3. One of the classes has disappeared
+
+            # Case 1
+            b_conv = (prev_len == y_enn.shape[0])
+
+            # Case 2
+            stats_enn = Counter(y_enn)
+            self.logger.debug('Current ENN stats: %s', stats_enn)
+            # Get the number of samples in the non-minority classes
+            count_non_min = np.array([val for val, key
+                                      in zip(stats_enn.itervalues(),
+                                             stats_enn.iterkeys())
+                                      if key != self.min_c_])
+            self.logger.debug('Number of samples in the non-majority'
+                              ' classes: %s', count_non_min)
+            # Check whether the minority class stops being the minority
+            b_min_bec_maj = np.any(count_non_min < self.stats_c_[self.min_c_])
+
+            # Case 3
+            b_remove_maj_class = (len(stats_enn) < len(self.stats_c_))
+
+            if b_conv or b_min_bec_maj or b_remove_maj_class:
+                # If this is a normal convergence, get the last data
+                if b_conv:
+                    if self.return_indices:
+                        X_, y_, = X_enn, y_enn
+                        idx_under = idx_under[idx_enn]
+                    else:
+                        X_, y_, = X_enn, y_enn
+                # Log the variables to explain the stop of the algorithm
+                self.logger.debug('RENN converged: %s', b_conv)
+                self.logger.debug('RENN minority become majority: %s',
+                                  b_min_bec_maj)
+                self.logger.debug('RENN remove one class: %s',
+                                  b_remove_maj_class)
                 break
 
+            # Update the data for the next iteration
+            X_, y_, = X_enn, y_enn
+            if self.return_indices:
+                idx_under = idx_under[idx_enn]
+
         self.logger.info('Under-sampling performed: %s', Counter(y_))
 
         X_resampled, y_resampled = X_, y_
diff --git a/imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py b/imblearn/under_sampling/tests/test_repeated_edited_nearest_neighbours.py
@@ -9,6 +9,8 @@
 from numpy.testing import assert_array_equal
 from numpy.testing import assert_warns
 
+from collections import Counter
+
 from sklearn.datasets import make_classification
 from sklearn.utils.estimator_checks import check_estimator
 
@@ -140,3 +142,31 @@ def test_renn_sample_wrong_X():
     renn.fit(X, Y)
     assert_raises(RuntimeError, renn.sample, np.random.random((100, 40)),
                   np.array([0] * 50 + [1] * 50))
+
+
+def test_continuous_error():
+    """Test that a warning is raised when the target is of continuous
+    type"""
+
+    # continuous case
+    y = np.linspace(0, 1, 5000)
+    enn = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
+    assert_warns(UserWarning, enn.fit, X, y)
+
+
+def test_multiclass_fit_sample():
+    """Test fit sample method with multiclass target"""
+
+    # Make y to be multiclass
+    y = Y.copy()
+    y[0:1000] = 2
+
+    # Resample the data
+    enn = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
+    X_resampled, y_resampled = enn.fit_sample(X, y)
+
+    # Check the size of y
+    count_y_res = Counter(y_resampled)
+    assert_equal(count_y_res[0], 400)
+    assert_equal(count_y_res[1], 3600)
+    assert_equal(count_y_res[2], 1000)