diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index a494b61fa7e3d..eebd9331ab393 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -65,6 +65,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + initial: object = ..., ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b855d64d0be18..04343671af677 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -672,6 +672,7 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + object initial=0, ) -> None: """ Only aggregates on axis=0 using Kahan summation @@ -689,9 +690,15 @@ def group_sum( raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) - # the below is equivalent to `np.zeros_like(out)` but faster - sumx = np.zeros((out).shape, dtype=(out).base.dtype) - compensation = np.zeros((out).shape, dtype=(out).base.dtype) + if initial == 0: + # the below is equivalent to `np.zeros_like(out)` but faster + sumx = np.zeros((out).shape, dtype=(out).base.dtype) + compensation = np.zeros((out).shape, dtype=(out).base.dtype) + else: + # in practice this path is only taken for strings to use empty string as initial + assert sum_t is object + sumx = np.full((out).shape, initial, dtype=object) + # object code path does not use `compensation` N, K = (values).shape diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 28a95ce1784a2..62458b89f9c08 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2366,6 +2366,7 @@ def _groupby_op( kind = WrappedCythonOp.get_kind_from_how(how) op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) + initial: Any = 0 # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray @@ -2389,6 +2390,7 @@ def _groupby_op( arr = self if op.how == "sum": + initial = "" # https://github.com/pandas-dev/pandas/issues/60229 # All NA should result in the empty string. if min_count == 0: @@ -2405,6 +2407,7 @@ def _groupby_op( ngroups=ngroups, comp_ids=ids, mask=None, + initial=initial, **kwargs, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e2ddf9aa5c0c1..e44c28f656ecc 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -11,6 +11,7 @@ import functools from typing import ( TYPE_CHECKING, + Any, Callable, Generic, final, @@ -317,6 +318,7 @@ def _cython_op_ndim_compat( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None = None, result_mask: npt.NDArray[np.bool_] | None = None, + initial: Any = 0, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -333,6 +335,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, + initial=initial, **kwargs, ) if res.shape[0] == 1: @@ -348,6 +351,7 @@ def _cython_op_ndim_compat( comp_ids=comp_ids, mask=mask, result_mask=result_mask, + initial=initial, **kwargs, ) @@ -361,6 +365,7 @@ def _call_cython_op( comp_ids: np.ndarray, mask: npt.NDArray[np.bool_] | None, result_mask: npt.NDArray[np.bool_] | None, + initial: Any = 0, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -415,6 +420,10 @@ def _call_cython_op( "first", "sum", ]: + if self.how == "sum": + # pass in through kwargs only for sum (other functions don't have + # the keyword) + kwargs["initial"] = initial func( out=result, counts=counts, diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cba02ae869889..48f09d4a4a292 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -32,6 +32,14 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) + if isinstance(fill_value, dict): + # fill_value is a dict mapping column names to fill values + # -> reindex column by column (reindex itself does not support this) + res = {} + for col in result.columns: + res[col] = result[col].reindex(index, fill_value=fill_value[col]) + return DataFrame(res, index=index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() @@ -340,7 +348,7 @@ def test_apply(ordered): @pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") -def test_observed(request, using_infer_string, observed): +def test_observed(observed, using_infer_string): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -348,10 +356,6 @@ def test_observed(request, using_infer_string, observed): # gh-8138 (back-compat) # gh-8869 - if using_infer_string and not observed: - # TODO(infer_string) this fails with filling the string column with 0 - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) @@ -379,7 +383,10 @@ def test_observed(request, using_infer_string, observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2], list("AB"), fill_value=0 + expected, + [cat1, cat2], + list("AB"), + fill_value={"values": 0, "C": ""} if using_infer_string else 0, ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 3bae719e01b73..0dc2e84c55953 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -10,8 +10,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -75,10 +73,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: - # TODO(infer_string) resample sum introduces 0's - # https://github.com/pandas-dev/pandas/issues/60229 - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_groupby_with_timegrouper(self): + def test_groupby_with_timegrouper(self, using_infer_string): # GH 4161 # TimeGrouper requires a sorted index # also verifies that the resultant index has the correct name @@ -112,11 +107,13 @@ def test_groupby_with_timegrouper(self): unit=df.index.unit, ) expected = DataFrame( - {"Buyer": 0, "Quantity": 0}, + {"Buyer": "" if using_infer_string else 0, "Quantity": 0}, index=exp_dti, ) # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl" expected = expected.astype({"Buyer": object}) + if using_infer_string: + expected = expected.astype({"Buyer": "str"}) expected.iloc[0, 0] = "CarlCarlCarl" expected.iloc[6, 0] = "CarlCarl" expected.iloc[18, 0] = "Joe"