
Commit d1c8ce6

[backport 2.3.x] BUG: fix fill value for grouped sum in case of unobserved categories for string dtype (empty string instead of 0) (#61909) (#61963)
1 parent 9442298 commit d1c8ce6

File tree: 7 files changed, +65 additions, -17 deletions

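For context, a minimal reproduction of the behaviour this commit changes (the frame, option call, and column names below are illustrative, not taken from the diff): summing a string column grouped by a categorical with an unobserved category should fill that category with the empty string rather than 0.

import pandas as pd

pd.set_option("future.infer_string", True)  # opt in to the string dtype this fix targets

df = pd.DataFrame(
    {
        "key": pd.Categorical(["a", "a", "b"], categories=["a", "b", "z"]),
        "text": ["x", "y", "z"],  # inferred as the new string dtype
        "num": [1, 2, 3],
    }
)
result = df.groupby("key", observed=False).sum()
# Before this fix the unobserved category "z" showed 0 in the "text" column;
# with the fix it shows the empty string (the numeric column still shows 0):
#      text  num
# key
# a      xy    3
# b       z    3
# z            0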

pandas/_libs/groupby.pyi

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ def group_sum(
     result_mask: np.ndarray | None = ...,
     min_count: int = ...,
     is_datetimelike: bool = ...,
+    initial: object = ...,
 ) -> None: ...
 def group_prod(
     out: np.ndarray,  # int64float_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 10 additions & 3 deletions
@@ -672,6 +672,7 @@ def group_sum(
     uint8_t[:, ::1] result_mask=None,
     Py_ssize_t min_count=0,
     bint is_datetimelike=False,
+    object initial=0,
 ) -> None:
     """
     Only aggregates on axis=0 using Kahan summation
@@ -689,9 +690,15 @@ def group_sum(
         raise ValueError("len(index) != len(labels)")
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    # the below is equivalent to `np.zeros_like(out)` but faster
-    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
-    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    if initial == 0:
+        # the below is equivalent to `np.zeros_like(out)` but faster
+        sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+        compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    else:
+        # in practice this path is only taken for strings to use empty string as initial
+        assert sum_t is object
+        sumx = np.full((<object>out).shape, initial, dtype=object)
+        # object code path does not use `compensation`
 
     N, K = (<object>values).shape

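The `initial` keyword exists because groups that receive no values keep whatever the accumulator was initialised with, and that value then surfaces in the result. A minimal pure-Python sketch of that behaviour (not the Cython kernel itself, and the toy data is made up):

values = ["x", "y"]
labels = [0, 0]          # both rows fall into group 0; group 1 stays empty
ngroups = 2

def grouped_sum(initial):
    sums = [initial] * ngroups
    nobs = [0] * ngroups
    for lab, val in zip(labels, values):
        nobs[lab] += 1
        # mirrors the object path above: the first value replaces the initial,
        # later values are added on top
        sums[lab] = val if nobs[lab] == 1 else sums[lab] + val
    return sums

print(grouped_sum(0))    # ['xy', 0]  -> the empty group leaks the integer 0
print(grouped_sum(""))   # ['xy', ''] -> with initial="" it leaks "" instead
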
pandas/core/arrays/base.py

Lines changed: 3 additions & 0 deletions
@@ -2366,6 +2366,7 @@ def _groupby_op(
         kind = WrappedCythonOp.get_kind_from_how(how)
         op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)
 
+        initial: Any = 0
         # GH#43682
         if isinstance(self.dtype, StringDtype):
             # StringArray
@@ -2389,6 +2390,7 @@ def _groupby_op(
 
             arr = self
             if op.how == "sum":
+                initial = ""
                 # https://github.com/pandas-dev/pandas/issues/60229
                 # All NA should result in the empty string.
                 if min_count == 0:
@@ -2405,6 +2407,7 @@ def _groupby_op(
                 ngroups=ngroups,
                 comp_ids=ids,
                 mask=None,
+                initial=initial,
                 **kwargs,
             )

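The effect of the new `initial: Any = 0` default is that only string arrays summing opt into a different starting value. A small sketch of that dispatch using the public dtype check (`pick_initial` is a hypothetical helper, not part of the pandas API):

import pandas as pd

def pick_initial(arr, how: str):
    # string dtypes summing should start from "", everything else from 0
    if how == "sum" and isinstance(arr.dtype, pd.StringDtype):
        return ""
    return 0

print(repr(pick_initial(pd.array(["x", "y"], dtype="string"), "sum")))  # ''
print(repr(pick_initial(pd.array([1, 2]), "sum")))                      # 0
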
pandas/core/groupby/groupby.py

Lines changed: 21 additions & 1 deletion
@@ -34,6 +34,7 @@ class providing the base-class of operations.
 
 import numpy as np
 
+from pandas._config import using_string_dtype
 from pandas._config.config import option_context
 
 from pandas._libs import (
@@ -3156,7 +3157,7 @@ def sum(
                     npfunc=np.sum,
                 )
 
-            return self._reindex_output(result, fill_value=0)
+            return self._reindex_output(result, fill_value=0, method="sum")
 
     @final
     @doc(
@@ -5574,6 +5575,7 @@ def _reindex_output(
         output: OutputFrameOrSeries,
         fill_value: Scalar = np.nan,
         qs: npt.NDArray[np.float64] | None = None,
+        method: str | None = None,
     ) -> OutputFrameOrSeries:
         """
         If we have categorical groupers, then we might want to make sure that
@@ -5634,6 +5636,24 @@ def _reindex_output(
             "copy": False,
             "fill_value": fill_value,
         }
+        if using_string_dtype() and method == "sum":
+            if isinstance(output, Series) and isinstance(output.dtype, StringDtype):
+                d["fill_value"] = ""
+                return output.reindex(**d)  # type: ignore[return-value, arg-type]
+            elif isinstance(output, DataFrame) and any(
+                isinstance(dtype, StringDtype) for dtype in output.dtypes
+            ):
+                orig_dtypes = output.dtypes
+                indices = np.nonzero(output.dtypes == "string")[0]
+                for idx in indices:
+                    output.isetitem(idx, output.iloc[:, idx].astype(object))
+                output = output.reindex(**d)  # type: ignore[assignment, arg-type]
+                for idx in indices:
+                    col = output.iloc[:, idx]
+                    output.isetitem(
+                        idx, col.mask(col == 0, "").astype(orig_dtypes.iloc[idx])
+                    )
+                return output  # type: ignore[return-value]
         return output.reindex(**d)  # type: ignore[arg-type]
 
     # GH 13204
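
The DataFrame branch above cannot reindex string columns with the numeric fill_value directly, so it round-trips through object dtype and then masks the fills back to the empty string. A self-contained sketch of that dance using the public API (the frame, index, and column names are illustrative):

import numpy as np
import pandas as pd

out = pd.DataFrame(
    {"text": pd.array(["xy", "z"], dtype="string"), "num": [3, 3]},
    index=pd.Index(["a", "b"], name="key"),
)
full_index = pd.Index(["a", "b", "z"], name="key")  # includes an unobserved category

orig_dtypes = out.dtypes
str_positions = np.nonzero(out.dtypes == "string")[0]
for idx in str_positions:
    # string columns cannot hold the numeric fill value, so go through object
    out.isetitem(idx, out.iloc[:, idx].astype(object))
out = out.reindex(full_index, fill_value=0)
for idx in str_positions:
    col = out.iloc[:, idx]
    # swap the temporary 0 fills back to "" and restore the string dtype
    out.isetitem(idx, col.mask(col == 0, "").astype(orig_dtypes.iloc[idx]))
print(out)
# Roughly:
#       text  num
# key
# a       xy    3
# b        z    3
# z             0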

pandas/core/groupby/ops.py

Lines changed: 9 additions & 0 deletions
@@ -11,6 +11,7 @@
 import functools
 from typing import (
     TYPE_CHECKING,
+    Any,
     Callable,
     Generic,
     final,
@@ -317,6 +318,7 @@ def _cython_op_ndim_compat(
         comp_ids: np.ndarray,
         mask: npt.NDArray[np.bool_] | None = None,
         result_mask: npt.NDArray[np.bool_] | None = None,
+        initial: Any = 0,
         **kwargs,
     ) -> np.ndarray:
         if values.ndim == 1:
@@ -333,6 +335,7 @@ def _cython_op_ndim_compat(
                 comp_ids=comp_ids,
                 mask=mask,
                 result_mask=result_mask,
+                initial=initial,
                 **kwargs,
             )
             if res.shape[0] == 1:
@@ -348,6 +351,7 @@ def _cython_op_ndim_compat(
             comp_ids=comp_ids,
             mask=mask,
             result_mask=result_mask,
+            initial=initial,
             **kwargs,
         )
 
@@ -361,6 +365,7 @@ def _call_cython_op(
         comp_ids: np.ndarray,
         mask: npt.NDArray[np.bool_] | None,
         result_mask: npt.NDArray[np.bool_] | None,
+        initial: Any = 0,
         **kwargs,
     ) -> np.ndarray:  # np.ndarray[ndim=2]
         orig_values = values
@@ -415,6 +420,10 @@ def _call_cython_op(
                     "first",
                     "sum",
                 ]:
+                    if self.how == "sum":
+                        # pass in through kwargs only for sum (other functions don't have
+                        # the keyword)
+                        kwargs["initial"] = initial
                     func(
                         out=result,
                         counts=counts,
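
The kwargs guard above exists because only the sum kernel grew the `initial` keyword; forwarding it to the other kernels would raise a TypeError. A tiny sketch of that pattern (the stand-in functions below are illustrative, not the real Cython kernels):

def fake_group_sum(out, initial=0):       # accepts the new keyword
    out.append(initial)

def fake_group_prod(out):                 # has no `initial` keyword
    out.append(1)

def call(func, how, initial, **kwargs):
    if how == "sum":
        # only sum understands `initial`, so only forward it there
        kwargs["initial"] = initial
    func(out=[], **kwargs)

call(fake_group_sum, "sum", initial="")    # ok
call(fake_group_prod, "prod", initial="")  # ok: `initial` is not forwarded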

pandas/tests/groupby/test_categorical.py

Lines changed: 17 additions & 6 deletions
@@ -32,6 +32,14 @@ def f(a):
         return a
 
     index = MultiIndex.from_product(map(f, args), names=names)
+    if isinstance(fill_value, dict):
+        # fill_value is a dict mapping column names to fill values
+        # -> reindex column by column (reindex itself does not support this)
+        res = {}
+        for col in result.columns:
+            res[col] = result[col].reindex(index, fill_value=fill_value[col])
+        return DataFrame(res, index=index).sort_index()
+
     return result.reindex(index, fill_value=fill_value).sort_index()
 
 
@@ -340,18 +348,14 @@ def test_apply(ordered):
 
 
 @pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
-def test_observed(request, using_infer_string, observed):
+def test_observed(observed, using_infer_string):
     # multiple groupers, don't re-expand the output space
     # of the grouper
     # gh-14942 (implement)
     # gh-10132 (back-compat)
     # gh-8138 (back-compat)
     # gh-8869
 
-    if using_infer_string and not observed:
-        # TODO(infer_string) this fails with filling the string column with 0
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
-
     cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
     cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
     df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
@@ -379,11 +383,18 @@ def test_observed(request, using_infer_string, observed):
     result = gb.sum()
     if not observed:
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2], list("AB"), fill_value=0
+            expected,
+            [cat1, cat2],
+            list("AB"),
+            fill_value={"values": 0, "C": ""} if using_infer_string else 0,
        )
 
     tm.assert_frame_equal(result, expected)
 
+    result = gb["C"].sum()
+    expected = expected["C"]
+    tm.assert_series_equal(result, expected)
+
     # https://github.com/pandas-dev/pandas/issues/8138
     d = {
         "cat": Categorical(

pandas/tests/groupby/test_timegrouper.py

Lines changed: 4 additions & 7 deletions
@@ -10,8 +10,6 @@
 import pytest
 import pytz
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -75,10 +73,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
 
 
 class TestGroupBy:
-    # TODO(infer_string) resample sum introduces 0's
-    # https://github.com/pandas-dev/pandas/issues/60229
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_groupby_with_timegrouper(self):
+    def test_groupby_with_timegrouper(self, using_infer_string):
         # GH 4161
         # TimeGrouper requires a sorted index
         # also verifies that the resultant index has the correct name
@@ -112,11 +107,13 @@ def test_groupby_with_timegrouper(self):
             unit=df.index.unit,
         )
         expected = DataFrame(
-            {"Buyer": 0, "Quantity": 0},
+            {"Buyer": "" if using_infer_string else 0, "Quantity": 0},
             index=exp_dti,
         )
         # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
         expected = expected.astype({"Buyer": object})
+        if using_infer_string:
+            expected = expected.astype({"Buyer": "str"})
         expected.iloc[0, 0] = "CarlCarlCarl"
         expected.iloc[6, 0] = "CarlCarl"
         expected.iloc[18, 0] = "Joe"
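
What the adjusted timegrouper test asserts, in a small illustrative form (dates, names, and the 5-day frequency below are made up, and the exact output requires a pandas build containing this fix): empty time bins produced by a Grouper sum are filled with "" for string columns and 0 for numeric ones.

import pandas as pd

pd.set_option("future.infer_string", True)

df = pd.DataFrame(
    {"Buyer": ["Carl", "Joe"], "Quantity": [1, 5]},
    index=pd.to_datetime(["2013-10-01", "2013-10-15"]),
)
result = df.groupby(pd.Grouper(freq="5D")).sum()
# The bin with no rows between the two purchases now shows "" in the Buyer
# column instead of 0, while Quantity keeps its numeric fill of 0.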
