From 46e70ef4983142dd795797fe8921e5e56f0f0ed6 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 17:34:38 +0000 Subject: [PATCH 1/6] implement the exception --- pandas/core/sample.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/sample.py b/pandas/core/sample.py index 4f12563e3c5e2..d44409d08abf4 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,6 +150,14 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") + if weights is not None: + is_max_weight_dominating = size * weights.max() > 1 + if is_max_weight_dominating and not replace: + raise ValueError( + "Invalid weights: If `replace`=False, " + "total unit probabilities have to be less than 1" + ) + return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( np.intp, copy=False ) From d18c74b7dfd9a2a121fd9f50c7bc7786abda9cbb Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 17:36:39 +0000 Subject: [PATCH 2/6] remove impossible test scenario --- pandas/tests/frame/methods/test_sample.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index a9d56cbfd2b46..95aa0c0f60778 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -113,9 +113,6 @@ def test_sample_invalid_weight_lengths(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=[0.5] * 11) - with pytest.raises(ValueError, match="Fewer non-zero entries in p than size"): - obj.sample(n=4, weights=Series([0, 0, 0.2])) - def test_sample_negative_weights(self, obj): # Check won't accept negative weights bad_weights = [-0.1] * 10 From f5623848a6073661068f1f0abcca46da17b5c3a4 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 17:37:10 +0000 Subject: [PATCH 3/6] add new tests --- pandas/tests/frame/methods/test_sample.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 95aa0c0f60778..a7a916da14bc4 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -134,6 +134,28 @@ def test_sample_inf_weights(self, obj): with pytest.raises(ValueError, match=msg): obj.sample(n=3, weights=weights_with_ninf) + def test_sample_unit_probabilities_raises(self, obj): + # GH#61516 + high_variance_weights = [1] * 10 + high_variance_weights[0] = 100 + msg = ( + "Invalid weights: If `replace`=False, " + "total unit probabilities have to be less than 1" + ) + with pytest.raises(ValueError, match=msg): + obj.sample(n=2, weights=high_variance_weights, replace=False) + + # edge case, n*max(weights)/sum(weights) == 1 + edge_variance_weights = [1] * 10 + edge_variance_weights[0] = 9 + # should not raise + obj.sample(n=2, weights=edge_variance_weights, replace=False) + + low_variance_weights = [1] * 10 + low_variance_weights[0] = 8 + # should not raise + obj.sample(n=2, weights=low_variance_weights, replace=False) + def test_sample_zero_weights(self, obj): # All zeros raises errors From e4e2cbf664c2189a01b9bcce7007df253b919364 Mon Sep 17 00:00:00 2001 From: microslaw Date: Thu, 12 Jun 2025 22:32:24 +0000 Subject: [PATCH 4/6] update documentation --- doc/source/user_guide/indexing.rst | 4 ++-- doc/source/whatsnew/v0.16.1.rst | 6 +++--- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/generic.py | 2 ++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 605f9501c5b23..47ff92c163b01 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -700,7 +700,7 @@ to have different probabilities, you can pass the ``sample`` function sampling w s = pd.Series([0, 1, 2, 3, 4, 5]) example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - s.sample(n=3, weights=example_weights) + s.sample(n=2, weights=example_weights) # Weights will be re-normalized automatically example_weights2 = [0.5, 0, 0, 0, 0, 0] @@ -714,7 +714,7 @@ as a string. df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]}) - df2.sample(n=3, weights='weight_column') + df2.sample(n=2, weights='weight_column') ``sample`` also allows users to sample columns instead of rows using the ``axis`` argument. diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index b376530358f53..be5d0b396fd51 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -196,7 +196,7 @@ facilitate replication. (:issue:`2419`) # weights are accepted. example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - example_series.sample(n=3, weights=example_weights) + example_series.sample(n=2, weights=example_weights) # weights will also be normalized if they do not sum to one, # and missing values will be treated as zeros. @@ -209,8 +209,8 @@ when sampling from rows. .. ipython:: python - df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights="weight_column") + df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) + df.sample(n=2, weights="weight_column") .. _whatsnew_0161.enhancements.string: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4e0e497379fa2..d0f4a2b3fc0a1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -910,6 +910,7 @@ Other - Bug in :meth:`DataFrame.query` where using duplicate column names led to a ``TypeError``. (:issue:`59950`) - Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`) +- Bug in :meth:`DataFrame.sample` with ``replace=False`` and ``(n * max(weights) / sum(weights)) > 1``, the method would return biased results. Now raises ``ValueError``. (:issue:`61516`) - Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7f1ccc482f70f..b47975e1d1b59 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5814,6 +5814,8 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. + When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, + in order to avoid biased results. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. From c4241fb23fdcfde55b5b7e0fa4022b8a236b0b73 Mon Sep 17 00:00:00 2001 From: microslaw Date: Sun, 29 Jun 2025 17:24:58 +0000 Subject: [PATCH 5/6] tmp --- pandas/core/generic.py | 6 +++++- pandas/core/sample.py | 10 +++++----- pandas/tests/frame/methods/test_sample.py | 9 +++++++-- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b47975e1d1b59..8be7986dd5db7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5815,7 +5815,7 @@ def sample( Missing values in the weights column will be treated as zero. Infinite values not allowed. When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, - in order to avoid biased results. + in order to avoid biased results. See the Notes below for more details. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. If np.random.RandomState or np.random.Generator, use as given. @@ -5852,6 +5852,10 @@ def sample( ----- If `frac` > 1, `replacement` should be set to `True`. + When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, + since that would cause results to be biased. E.g. sampling 2 items without replacement, + with weights [100, 1, 1] would yield two last items in 1/2 of cases, instead of 1/102 + Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/sample.py b/pandas/core/sample.py index d44409d08abf4..4f476540cf406 100644 --- a/pandas/core/sample.py +++ b/pandas/core/sample.py @@ -150,12 +150,12 @@ def sample( else: raise ValueError("Invalid weights: weights sum to zero") - if weights is not None: - is_max_weight_dominating = size * weights.max() > 1 - if is_max_weight_dominating and not replace: + assert weights is not None # for mypy + if not replace and size * weights.max() > 1: raise ValueError( - "Invalid weights: If `replace`=False, " - "total unit probabilities have to be less than 1" + "Weighted sampling cannot be achieved with replace=False. Either " + "set replace=True or use smaller weights. See the docstring of " + "sample for details." ) return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype( diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index a7a916da14bc4..9b6660778508e 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -139,18 +139,23 @@ def test_sample_unit_probabilities_raises(self, obj): high_variance_weights = [1] * 10 high_variance_weights[0] = 100 msg = ( - "Invalid weights: If `replace`=False, " - "total unit probabilities have to be less than 1" + "Weighted sampling cannot be achieved with replace=False. Either " + "set replace=True or use smaller weights. See the docstring of " + "sample for details." ) with pytest.raises(ValueError, match=msg): obj.sample(n=2, weights=high_variance_weights, replace=False) + def test_sample_unit_probabilities_edge_case_do_not_raise(self, obj): + # GH#61516 # edge case, n*max(weights)/sum(weights) == 1 edge_variance_weights = [1] * 10 edge_variance_weights[0] = 9 # should not raise obj.sample(n=2, weights=edge_variance_weights, replace=False) + def test_sample_unit_normal_probabilities_do_not_raise(self, obj): + # GH#61516 low_variance_weights = [1] * 10 low_variance_weights[0] = 8 # should not raise From 7b9aac4404ea129ff8b7ad1f8d17dcf85054cc58 Mon Sep 17 00:00:00 2001 From: microslaw Date: Tue, 8 Jul 2025 18:52:37 +0000 Subject: [PATCH 6/6] doc: minior adjustments --- doc/source/whatsnew/v0.16.1.rst | 4 ++-- pandas/core/generic.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index be5d0b396fd51..c15f56ba61447 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -209,8 +209,8 @@ when sampling from rows. .. ipython:: python - df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) - df.sample(n=2, weights="weight_column") + df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) + df.sample(n=2, weights="weight_column") .. _whatsnew_0161.enhancements.string: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8be7986dd5db7..8708de68c0860 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5814,7 +5814,7 @@ def sample( If weights do not sum to 1, they will be normalized to sum to 1. Missing values in the weights column will be treated as zero. Infinite values not allowed. - When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, + When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1`` in order to avoid biased results. See the Notes below for more details. random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional If int, array-like, or BitGenerator, seed for random number generator. @@ -5853,8 +5853,9 @@ def sample( If `frac` > 1, `replacement` should be set to `True`. When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``, - since that would cause results to be biased. E.g. sampling 2 items without replacement, - with weights [100, 1, 1] would yield two last items in 1/2 of cases, instead of 1/102 + since that would cause results to be biased. E.g. sampling 2 items without replacement + with weights [100, 1, 1] would yield two last items in 1/2 of cases, instead of 1/102. + This is similar to specifying `n=4` without replacement on a Series with 3 elements. Examples --------