ENH: Preserve nullable boolean dtype in pivot_table (GH#62244)

skalwaghe-56 · skalwaghe-56 · commit e2d2a6f245d5 · 2025-09-06T12:35:50.000+05:30
- Convert bool columns, and object columns containing only booleans, to BooleanDtype
- Skip dtype conversion for margin columns that are DataFrames
- Update test_pivot_table_bool_preserves_boolean_dtype with safe assertions
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -13,6 +13,7 @@
 
 from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 from pandas.core.dtypes.common import (
+    is_bool_dtype,
     is_list_like,
     is_nested_list_like,
     is_scalar,
@@ -23,6 +24,7 @@
     ABCSeries,
 )
 
+from pandas.core.arrays.boolean import BooleanDtype
 import pandas.core.common as com
 from pandas.core.groupby import Grouper
 from pandas.core.indexes.api import (
@@ -409,6 +411,17 @@ def __internal_pivot_table(
     if isinstance(table, ABCDataFrame) and dropna:
         table = table.dropna(how="all", axis=1)
 
+    # GH#62244: Preserve boolean dtype instead of upcasting to float
+    if isinstance(table, ABCDataFrame):
+        for col in table.columns:
+            val = table[col]
+            if isinstance(val, ABCSeries):
+                # bool column, or object column whose non-NA values are all True/False
+                if is_bool_dtype(val.dtype) or (
+                    val.dtype == object and val.dropna().isin([True, False]).all()
+                ):
+                    table[col] = val.astype(BooleanDtype())
+
     return table
 
 
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -24,6 +24,7 @@
 from pandas.core.dtypes.common import (
     ensure_platform_int,
     is_1d_only_ea_dtype,
+    is_bool_dtype,
     is_integer,
     needs_i8_conversion,
 )
@@ -241,13 +242,38 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
         if value_columns is None and values.shape[1] != 1:  # pragma: no cover
             raise ValueError("must pass column labels for multi-column data")
 
-        new_values, _ = self.get_new_values(values, fill_value)
+        new_values, new_mask = self.get_new_values(values, fill_value)
         columns = self.get_new_columns(value_columns)
         index = self.new_index
 
-        result = self.constructor(
-            new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False
-        )
+        # If the original values have a bool dtype, respect the missing mask and
+        # build a nullable boolean (BooleanDtype) result. For other dtypes, fall
+        # back to the fast construction path.
+        from pandas.core.dtypes.common import is_bool_dtype
+
+        if is_bool_dtype(values.dtype):
+            # Build an object array from new_values so pd.NA can be inserted at the
+            # missing positions, then construct the DataFrame and cast it to boolean.
+            import pandas as pd
+
+            # Ensure we have an object array to insert pd.NA
+            tmp = new_values.astype(object, copy=True)
+            # new_mask is True where a value exists; missing positions are ~new_mask
+            tmp[~new_mask] = pd.NA
+
+            # Construct DataFrame from the tmp array, then convert to boolean dtype.
+            result = self.constructor(tmp, index=index, columns=columns, copy=False)
+            # Cast every column of the result to the nullable boolean dtype
+            result = result.astype("boolean")
+        else:
+            result = self.constructor(
+                new_values,
+                index=index,
+                columns=columns,
+                dtype=new_values.dtype,
+                copy=False,
+            )
+
         if isinstance(values, np.ndarray):
             base, new_base = values.base, new_values.base
         elif isinstance(values, NDArrayBackedExtensionArray):
@@ -297,6 +323,25 @@ def get_new_values(self, values, fill_value=None):
             if not mask_all:
                 new_values[:] = fill_value
         else:
+            # GH#62244: special-case for bool to avoid upcasting to object
+            if is_bool_dtype(dtype):
+                data = np.empty(result_shape, dtype="bool")
+                new_mask = np.zeros(result_shape, dtype=bool)
+
+                libreshape.unstack(
+                    sorted_values.astype("bool", copy=False),
+                    mask.view("u1"),
+                    stride,
+                    length,
+                    width,
+                    data,
+                    new_mask.view("u1"),
+                )
+
+                # Return raw bool data plus the mask; get_result uses it to insert NA
+                return data, new_mask
+
+            # default path for non-bool dtypes
             if not mask_all:
                 dtype, fill_value = maybe_promote(dtype, fill_value)
             new_values = np.empty(result_shape, dtype=dtype)
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -559,10 +559,10 @@ def test_unstack_bool(self):
         )
         rs = df.unstack()
         xp = DataFrame(
-            np.array([[False, np.nan], [np.nan, False]], dtype=object),
+            [[False, pd.NA], [pd.NA, False]],
             index=["a", "b"],
             columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
-        )
+        ).astype("boolean")
         tm.assert_frame_equal(rs, xp)
 
     @pytest.mark.filterwarnings(
@@ -2734,3 +2734,40 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex):
         )
     expected = Series(1, index=expected_index)
     tm.assert_series_equal(result, expected)
+
+
+class TestUnstackBool:
+    """Regression tests for GH#62244 (unstack bool dtype upcasting)."""
+
+    def test_unstack_bool_dataframe_preserves_boolean_dtype(self):
+        df = DataFrame(
+            {"level_0": ["foo", "toto"], "level_1": ["A", "B"], "val": [True, False]}
+        ).set_index(["level_0", "level_1"])
+
+        result = df.unstack("level_0")
+
+        assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)
+
+        assert result.loc["A", ("val", "foo")]
+        assert pd.isna(result.loc["A", ("val", "toto")])
+        assert not result.loc["B", ("val", "toto")]
+
+    def test_unstack_bool_series_preserves_boolean_dtype(self):
+        s = Series([True, False], index=MultiIndex.from_product([["x", "y"], ["A"]]))
+        result = s.unstack(0)
+
+        assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)
+
+    def test_unstack_bool_memory_usage_smaller_than_object(self):
+        df = DataFrame({"a": ["x", "y"], "b": [True, False]}).set_index("a")
+
+        obj_unstack = df.astype("object").unstack("a")
+        bool_unstack = df.astype("boolean").unstack("a")
+
+        obj_mem = obj_unstack.memory_usage(deep=True)
+        bool_mem = bool_unstack.memory_usage(deep=True)
+
+        obj_total = obj_mem.sum() if hasattr(obj_mem, "sum") else int(obj_mem)
+        bool_total = bool_mem.sum() if hasattr(bool_mem, "sum") else int(bool_mem)
+
+        assert bool_total < obj_total
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -2618,6 +2618,25 @@ def test_pivot_table_margins_include_nan_groups(self):
         expected.columns.name = "g2"
         tm.assert_frame_equal(result, expected, check_dtype=False)
 
+    def test_pivot_table_bool_preserves_boolean_dtype(self):
+        # GH#62244
+        df = DataFrame(
+            {
+                "A": ["foo", "foo", "bar"],
+                "B": ["x", "y", "x"],
+                "val": [True, False, True],
+            }
+        )
+
+        result = pivot_table(df, values="val", index="A", columns="B", aggfunc="any")
+
+        assert all(str(dtype) == "boolean" for dtype in result.dtypes)
+
+        assert result.loc["foo", "x"]
+        assert not result.loc["foo", "y"]
+        assert result.loc["bar", "x"]
+        assert pd.isna(result.loc["bar", "y"])
+
 
 class TestPivot:
     def test_pivot(self):