# Patch summary (pandas GH#62043): ranking for masked (numpy-nullable) arrays.
#
# NOTE(review): the hunk bodies below appear to have had their line breaks
# collapsed into spaces in this copy -- confirm against the upstream patch and
# re-split the hunks before trying to apply it.
#
# Files touched:
#
# 1. doc/source/whatsnew/v3.0.0.rst
#    Adds one "Other enhancements" entry describing the new rank behaviour
#    for numpy-nullable dtypes.
#
# 2. pandas/core/arrays/masked.py
#    * Imports ``algos as libalgos`` from ``pandas._libs``.
#    * Adds a ``_rank`` method (inserted between ``copy`` and ``duplicated``)
#      taking keyword-only ``axis``, ``method``, ``na_option``, ``ascending``
#      and ``pct``:
#        - raises NotImplementedError unless ``axis == 0`` and ``self.ndim == 1``;
#        - views boolean-kind ``self._data`` as ``uint8`` before ranking;
#        - calls ``libalgos.rank_1d`` directly with ``mask=self.isna()``
#          (per the in-code comment, to avoid the copy made by ``ensure_data``
#          in ``algorithms.rank``);
#        - result mask is all-False when ``na_option`` is "top" or "bottom",
#          otherwise a copy of ``self._mask``;
#        - when ``method != "average"`` and ``pct`` is false, the result is
#          cast to ``uint64`` and wrapped in ``IntegerArray``; masked slots are
#          first set to 0 (only when ``na_option`` is not "top"/"bottom") so
#          the cast does not warn;
#        - in every other case the result is wrapped in ``FloatingArray``.
#
# 3. pandas/tests/series/methods/test_rank.py
#    * ``expected_dtype``: new branch for "Float64"/"Int64" -> "Float64" when
#      ``method == "average"`` or ``pct``, else "UInt64".
#    * ``test_rank_nullable_integer``: expected result is now a "Float64"
#      Series holding ``None`` instead of a default-dtype Series with ``np.nan``.
#    * ``test_rank_tie_methods_on_infs_nans``: new "Float64" branch ("Float64"
#      for ``rank_method == "average"``, else "UInt64"); the expected Series
#      is bound to ``exp_ser`` before the comparison.
#    * ``test_rank_ea_small_values``: expected result is now a "UInt64" Series
#      holding ``NA``.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3191c077d3c36..85d8e11568f2b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -81,6 +81,7 @@ Other enhancements - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`) +- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`) - :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index fefd70fef35c9..9c5965951da68 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -12,6 +12,7 @@ import numpy as np from pandas._libs import ( + algos as libalgos, lib, missing as libmissing, ) @@ -992,6 +993,49 @@ def copy(self) -> Self: mask = self._mask.copy() return self._simple_new(data, mask) + def _rank( + self, + *, + axis: AxisInt = 0, + method: str = "average", + na_option: str = "keep", + ascending: bool = True, + pct: bool = False, + ): + # GH#62043 Avoid going through copy-making ensure_data in algorithms.rank + if axis != 0 or self.ndim != 1: + raise NotImplementedError + + from pandas.core.arrays import FloatingArray + + data = self._data + if data.dtype.kind == "b": + data = 
data.view("uint8") + + result = libalgos.rank_1d( + data, + is_datetimelike=False, + ties_method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + mask=self.isna(), + ) + if na_option in ["top", "bottom"]: + mask = np.zeros(self.shape, dtype=bool) + else: + mask = self._mask.copy() + + if method != "average" and not pct: + if na_option not in ["top", "bottom"]: + result[self._mask] = 0 # avoid warning on casting + result = result.astype("uint64", copy=False) + from pandas.core.arrays import IntegerArray + + return IntegerArray(result, mask=mask) + + return FloatingArray(result, mask=mask) + @doc(ExtensionArray.duplicated) def duplicated( self, keep: Literal["first", "last", False] = "first" diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 81af7ae2aea45..ecd52b2c8498a 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -68,6 +68,11 @@ def expected_dtype(dtype, method, pct=False): exp_dtype = "double[pyarrow]" else: exp_dtype = "uint64[pyarrow]" + elif dtype in ["Float64", "Int64"]: + if method == "average" or pct: + exp_dtype = "Float64" + else: + exp_dtype = "UInt64" return exp_dtype @@ -257,7 +262,7 @@ def test_rank_nullable_integer(self): exp = Series([None, 2, None, 3, 3, 2, 3, 1], dtype="Int64") result = exp.rank(na_option="keep") - expected = Series([np.nan, 2.5, np.nan, 5.0, 5.0, 2.5, 5.0, 1.0]) + expected = Series([None, 2.5, None, 5.0, 5.0, 2.5, 5.0, 1.0], dtype="Float64") tm.assert_series_equal(result, expected) @@ -302,6 +307,12 @@ def test_rank_tie_methods_on_infs_nans( exp_dtype = "float64[pyarrow]" else: exp_dtype = "uint64[pyarrow]" + elif dtype == "Float64": + # GH#62043 + if rank_method == "average": + exp_dtype = "Float64" + else: + exp_dtype = "UInt64" else: exp_dtype = "float64" @@ -327,7 +338,8 @@ def test_rank_tie_methods_on_infs_nans( result = iseries.rank( method=rank_method, na_option=na_option, ascending=ascending ) - 
tm.assert_series_equal(result, Series(expected, dtype=exp_dtype)) + exp_ser = Series(expected, dtype=exp_dtype) + tm.assert_series_equal(result, exp_ser) def test_rank_desc_mix_nans_infs(self): # GH 19538 @@ -439,7 +451,7 @@ def test_rank_ea_small_values(self): dtype="Float64", ) result = ser.rank(method="min") - expected = Series([4, 1, 3, np.nan, 2]) + expected = Series([4, 1, 3, NA, 2], dtype="UInt64") tm.assert_series_equal(result, expected)