Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def group_sum(
result_mask: np.ndarray | None = ...,
min_count: int = ...,
is_datetimelike: bool = ...,
is_string: bool = ...,
skipna: bool = ...,
) -> None: ...
def group_prod(
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,7 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint is_string=False,
bint skipna=True,
) -> None:
"""
Expand All @@ -729,6 +730,10 @@ def group_sum(
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

if is_string:
# for strings start with empty string instead of 0 as `initial` value
sumx = np.full((<object>out).shape, "", dtype=object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would passing “initial” be more general/idiomatic?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was initially thinking that as well, but then because this would in practice only be used for the specific case of strings, I thought to be more explicit about that fact. But in both cases I have to pass it down a few layers, so whether it is then called initial or is_string actually does not matter much, and initial at least makes it clearer where it is called from the EA _groupby_ops what the purpose is of passing it down.
So updated to use initial in the last commit


N, K = (<object>values).shape
if uses_mask:
nan_val = 0
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2608,6 +2608,7 @@ def _groupby_op(
kind = WrappedCythonOp.get_kind_from_how(how)
op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

is_string = False
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
Expand All @@ -2632,6 +2633,7 @@ def _groupby_op(

arr = self
if op.how == "sum":
is_string = True
# https://github.com/pandas-dev/pandas/issues/60229
# All NA should result in the empty string.
assert "skipna" in kwargs
Expand All @@ -2649,6 +2651,7 @@ def _groupby_op(
ngroups=ngroups,
comp_ids=ids,
mask=None,
is_string=is_string,
**kwargs,
)

Expand Down
8 changes: 8 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ def _cython_op_ndim_compat(
comp_ids: np.ndarray,
mask: npt.NDArray[np.bool_] | None = None,
result_mask: npt.NDArray[np.bool_] | None = None,
is_string: bool = False,
**kwargs,
) -> np.ndarray:
if values.ndim == 1:
Expand All @@ -335,6 +336,7 @@ def _cython_op_ndim_compat(
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
is_string=is_string,
**kwargs,
)
if res.shape[0] == 1:
Expand All @@ -350,6 +352,7 @@ def _cython_op_ndim_compat(
comp_ids=comp_ids,
mask=mask,
result_mask=result_mask,
is_string=is_string,
**kwargs,
)

Expand All @@ -363,6 +366,7 @@ def _call_cython_op(
comp_ids: np.ndarray,
mask: npt.NDArray[np.bool_] | None,
result_mask: npt.NDArray[np.bool_] | None,
is_string: bool = False,
**kwargs,
) -> np.ndarray: # np.ndarray[ndim=2]
orig_values = values
Expand Down Expand Up @@ -420,6 +424,10 @@ def _call_cython_op(
"sum",
"median",
]:
if self.how == "sum":
# pass in through kwargs only for sum (other functions don't have
# the keyword)
kwargs["is_string"] = is_string
func(
out=result,
counts=counts,
Expand Down
16 changes: 10 additions & 6 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ def f(a):
return a

index = MultiIndex.from_product(map(f, args), names=names)
if isinstance(fill_value, dict):
# fill_value is a dict mapping column names to fill values
# -> reindex column by column (reindex itself does not support this)
res = {}
for col in result.columns:
res[col] = result[col].reindex(index, fill_value=fill_value[col])
return DataFrame(res, index=index).sort_index()

return result.reindex(index, fill_value=fill_value).sort_index()


Expand Down Expand Up @@ -317,18 +325,14 @@ def test_apply(ordered):
tm.assert_series_equal(result, expected)


def test_observed(request, using_infer_string, observed):
def test_observed(observed):
# multiple groupers, don't re-expand the output space
# of the grouper
# gh-14942 (implement)
# gh-10132 (back-compat)
# gh-8138 (back-compat)
# gh-8869

if using_infer_string and not observed:
# TODO(infer_string) this fails with filling the string column with 0
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))

cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
Expand Down Expand Up @@ -356,7 +360,7 @@ def test_observed(request, using_infer_string, observed):
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
expected, [cat1, cat2], list("AB"), fill_value={"values": 0, "C": ""}
)

tm.assert_frame_equal(result, expected)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_groupby_with_timegrouper(self, using_infer_string):
unit=df.index.unit,
)
expected = DataFrame(
{"Buyer": 0, "Quantity": 0},
{"Buyer": "" if using_infer_string else 0, "Quantity": 0},
index=exp_dti,
)
# Cast to object/str to avoid implicit cast when setting
Expand Down
Loading