From 38f378404f571820fb39c0edb1930e265e9304a4 Mon Sep 17 00:00:00 2001 From: Adriano Leao Date: Wed, 9 Jul 2025 00:11:40 -0300 Subject: [PATCH 1/4] Fix DataFrame.aggregate to preserve extension dtypes with callable functions --- pandas/core/apply.py | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e228d20b359c6..feec72b4673f9 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -291,6 +291,8 @@ def agg(self) -> DataFrame | Series | None: elif is_list_like(func): # we require a list, but not a 'str' return self.agg_list_like() + elif callable(func): + return self.agg_callable() # caller can react return None @@ -797,6 +799,86 @@ def _apply_str(self, obj, func: str, *args, **kwargs): msg = f"'{func}' is not a valid function for '{type(obj).__name__}' object" raise AttributeError(msg) + def agg_callable(self) -> DataFrame | Series: + """ + Compute aggregation in the case of a callable argument. + + This method handles callable functions while preserving extension dtypes + by delegating to the same infrastructure used for string aggregations. + + Returns + ------- + Result of aggregation. + """ + obj = self.obj + func = self.func + + if obj.ndim == 1: + return func(obj, *self.args, **self.kwargs) + + # Use _reduce to preserve extension dtypes like on string aggregation + try: + result = obj._reduce( + func, + name=getattr(func, '__name__', ''), + axis=self.axis, + skipna=True, + numeric_only=False, + **self.kwargs + ) + return result + + except (AttributeError, TypeError): + # If _reduce fails, fallback to column-wise + return self._agg_callable_fallback() + + def _agg_callable_fallback(self) -> DataFrame | Series: + """ + Fallback method for callable aggregation when _reduce fails. + + This method applies the function column-wise while preserving dtypes, + but avoids the performance overhead of row-by-row processing. 
+ """ + obj = self.obj + func = self.func + + if self.axis == 1: + # For row-wise aggregation, transpose and recurse + transposed_result = obj.T._aggregate(func, axis=0, *self.args, **self.kwargs) + return transposed_result + + from pandas import Series + + try: + # Apply function to each column + results = {} + for name in obj.columns: + col = obj._get_column_reference(name) + result_val = func(col, *self.args, **self.kwargs) + results[name] = result_val + + result = Series(results, name=None) + + # Preserve extension dtypes where possible + for name in result.index: + if name in obj.columns: + original_dtype = obj.dtypes[name] + if hasattr(original_dtype, 'construct_array_type'): + try: + array_type = original_dtype.construct_array_type() + if hasattr(array_type, '_from_sequence'): + preserved_val = array_type._from_sequence( + [result[name]], dtype=original_dtype + )[0] + result.loc[name] = preserved_val + except Exception: + # If dtype preservation fails, keep the computed value + pass + + return result + + except Exception: + return None class NDFrameApply(Apply): """ From c2fed653a2d1663726dff957f6151c545754ecc6 Mon Sep 17 00:00:00 2001 From: Adriano Leao Date: Wed, 9 Jul 2025 00:25:20 -0300 Subject: [PATCH 2/4] FIX linting --- pandas/core/apply.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index feec72b4673f9..9a8d20fc03dfa 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -802,10 +802,10 @@ def _apply_str(self, obj, func: str, *args, **kwargs): def agg_callable(self) -> DataFrame | Series: """ Compute aggregation in the case of a callable argument. - + This method handles callable functions while preserving extension dtypes by delegating to the same infrastructure used for string aggregations. - + Returns ------- Result of aggregation. 
@@ -815,19 +815,19 @@ def agg_callable(self) -> DataFrame | Series: if obj.ndim == 1: return func(obj, *self.args, **self.kwargs) - + # Use _reduce to preserve extension dtypes like on string aggregation try: result = obj._reduce( - func, - name=getattr(func, '__name__', ''), + func, + name=getattr(func, "__name__", ""), axis=self.axis, skipna=True, numeric_only=False, **self.kwargs ) return result - + except (AttributeError, TypeError): # If _reduce fails, fallback to column-wise return self._agg_callable_fallback() @@ -835,20 +835,25 @@ def agg_callable(self) -> DataFrame | Series: def _agg_callable_fallback(self) -> DataFrame | Series: """ Fallback method for callable aggregation when _reduce fails. - + This method applies the function column-wise while preserving dtypes, but avoids the performance overhead of row-by-row processing. """ obj = self.obj func = self.func - + if self.axis == 1: # For row-wise aggregation, transpose and recurse - transposed_result = obj.T._aggregate(func, axis=0, *self.args, **self.kwargs) + transposed_result = obj.T._aggregate( + func, + *self.args, + axis=0, + **self.kwargs + ) return transposed_result - + from pandas import Series - + try: # Apply function to each column results = {} @@ -856,17 +861,17 @@ def _agg_callable_fallback(self) -> DataFrame | Series: col = obj._get_column_reference(name) result_val = func(col, *self.args, **self.kwargs) results[name] = result_val - + result = Series(results, name=None) - + # Preserve extension dtypes where possible for name in result.index: if name in obj.columns: original_dtype = obj.dtypes[name] - if hasattr(original_dtype, 'construct_array_type'): + if hasattr(original_dtype, "construct_array_type"): try: array_type = original_dtype.construct_array_type() - if hasattr(array_type, '_from_sequence'): + if hasattr(array_type, "_from_sequence"): preserved_val = array_type._from_sequence( [result[name]], dtype=original_dtype )[0] @@ -874,9 +879,9 @@ def _agg_callable_fallback(self) -> 
DataFrame | Series: except Exception: # If dtype preservation fails, keep the computed value pass - + return result - + except Exception: return None From 1b542a6a5f2a95f5872e9844a328a7523a9334df Mon Sep 17 00:00:00 2001 From: Adriano Leao Date: Wed, 9 Jul 2025 00:45:25 -0300 Subject: [PATCH 3/4] FIX accept pre-commit changes --- pandas/core/apply.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9a8d20fc03dfa..6e14f71398017 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -824,7 +824,7 @@ def agg_callable(self) -> DataFrame | Series: axis=self.axis, skipna=True, numeric_only=False, - **self.kwargs + **self.kwargs, ) return result @@ -845,10 +845,7 @@ def _agg_callable_fallback(self) -> DataFrame | Series: if self.axis == 1: # For row-wise aggregation, transpose and recurse transposed_result = obj.T._aggregate( - func, - *self.args, - axis=0, - **self.kwargs + func, *self.args, axis=0, **self.kwargs ) return transposed_result @@ -885,6 +882,7 @@ def _agg_callable_fallback(self) -> DataFrame | Series: except Exception: return None + class NDFrameApply(Apply): """ Methods shared by FrameApply and SeriesApply but From 74fcedf7f4dd6eccd80b51b965a9e28130ce075e Mon Sep 17 00:00:00 2001 From: Adriano Leao Date: Wed, 9 Jul 2025 00:48:59 -0300 Subject: [PATCH 4/4] DOCS update whatsnew with DataFrame.aggregate bugfix --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4e0e497379fa2..5e1b2b1d1a18e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -932,6 +932,7 @@ Other - Bug in ``Series.list`` methods not preserving the original name. 
(:issue:`60522`) - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) +- Bug in :meth:`DataFrame.aggregate` not preserving extension dtypes (such as pyarrow-backed dtypes) when the aggregation function is a callable such as a lambda (:issue:`61812`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)