Skip to content

Commit e2d2a6f

Browse files
committed
ENH: Preserve nullable boolean dtype in pivot_table (GH#62244)
- Convert bool/object columns to BooleanDtype
- Skip dtype conversion for margin columns that are DataFrames
- Update test_pivot_table_bool_preserves_boolean_dtype with safe assertions
1 parent 7f670c1 commit e2d2a6f

File tree

4 files changed: 120 additions and 6 deletions (+120 -6 lines changed)

pandas/core/reshape/pivot.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313

1414
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
1515
from pandas.core.dtypes.common import (
16+
is_bool_dtype,
1617
is_list_like,
1718
is_nested_list_like,
1819
is_scalar,
@@ -23,6 +24,7 @@
2324
ABCSeries,
2425
)
2526

27+
from pandas.core.arrays.boolean import BooleanDtype
2628
import pandas.core.common as com
2729
from pandas.core.groupby import Grouper
2830
from pandas.core.indexes.api import (
@@ -409,6 +411,17 @@ def __internal_pivot_table(
409411
if isinstance(table, ABCDataFrame) and dropna:
410412
table = table.dropna(how="all", axis=1)
411413

414+
# GH#62244: Preserve boolean dtype instead of upcasting to float
415+
if isinstance(table, ABCDataFrame):
416+
for col in table.columns:
417+
val = table[col]
418+
if isinstance(val, ABCSeries):
419+
# if the column is bool or was coerced to object with booleans
420+
if is_bool_dtype(val.dtype) or (
421+
val.dtype == object and val.dropna().isin([True, False]).all()
422+
):
423+
table[col] = val.astype(BooleanDtype())
424+
412425
return table
413426

414427

pandas/core/reshape/reshape.py

Lines changed: 49 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
from pandas.core.dtypes.common import (
2525
ensure_platform_int,
2626
is_1d_only_ea_dtype,
27+
is_bool_dtype,
2728
is_integer,
2829
needs_i8_conversion,
2930
)
@@ -241,13 +242,38 @@ def get_result(self, obj, value_columns, fill_value) -> DataFrame:
241242
if value_columns is None and values.shape[1] != 1: # pragma: no cover
242243
raise ValueError("must pass column labels for multi-column data")
243244

244-
new_values, _ = self.get_new_values(values, fill_value)
245+
new_values, new_mask = self.get_new_values(values, fill_value)
245246
columns = self.get_new_columns(value_columns)
246247
index = self.new_index
247248

248-
result = self.constructor(
249-
new_values, index=index, columns=columns, dtype=new_values.dtype, copy=False
250-
)
249+
# If original values were numpy-bool, we need to respect the missing mask
250+
# and produce a nullable boolean column (BooleanDtype). For other dtypes
251+
# fall back to the fast construction path.
252+
from pandas.core.dtypes.common import is_bool_dtype
253+
254+
if is_bool_dtype(values.dtype):
255+
# Build an object array from new_values so we can insert pd.NA where masked,
256+
# then construct DataFrame and cast to nullable boolean dtype.
257+
import pandas as pd
258+
259+
# Ensure we have an object array to insert pd.NA
260+
tmp = new_values.astype(object, copy=True)
261+
# new_mask is True where a value exists; missing positions are ~new_mask
262+
tmp[~new_mask] = pd.NA
263+
264+
# Construct DataFrame from the tmp array, then convert to boolean dtype.
265+
result = self.constructor(tmp, index=index, columns=columns, copy=False)
266+
# Cast the whole result frame to the nullable boolean dtype
267+
result = result.astype("boolean")
268+
else:
269+
result = self.constructor(
270+
new_values,
271+
index=index,
272+
columns=columns,
273+
dtype=new_values.dtype,
274+
copy=False,
275+
)
276+
251277
if isinstance(values, np.ndarray):
252278
base, new_base = values.base, new_values.base
253279
elif isinstance(values, NDArrayBackedExtensionArray):
@@ -297,6 +323,25 @@ def get_new_values(self, values, fill_value=None):
297323
if not mask_all:
298324
new_values[:] = fill_value
299325
else:
326+
# GH#62244: special-case for bool to avoid upcasting to object
327+
if is_bool_dtype(dtype):
328+
data = np.empty(result_shape, dtype="bool")
329+
new_mask = np.zeros(result_shape, dtype=bool)
330+
331+
libreshape.unstack(
332+
sorted_values.astype("bool", copy=False),
333+
mask.view("u1"),
334+
stride,
335+
length,
336+
width,
337+
data,
338+
new_mask.view("u1"),
339+
)
340+
341+
# Return the raw boolean data and the presence mask (True where a value exists); the caller (e.g. get_result) converts them to nullable boolean
342+
return data, new_mask
343+
344+
# default path for non-bool dtypes
300345
if not mask_all:
301346
dtype, fill_value = maybe_promote(dtype, fill_value)
302347
new_values = np.empty(result_shape, dtype=dtype)

pandas/tests/frame/test_stack_unstack.py

Lines changed: 39 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -559,10 +559,10 @@ def test_unstack_bool(self):
559559
)
560560
rs = df.unstack()
561561
xp = DataFrame(
562-
np.array([[False, np.nan], [np.nan, False]], dtype=object),
562+
[[False, pd.NA], [pd.NA, False]],
563563
index=["a", "b"],
564564
columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
565-
)
565+
).astype("boolean")
566566
tm.assert_frame_equal(rs, xp)
567567

568568
@pytest.mark.filterwarnings(
@@ -2734,3 +2734,40 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex):
27342734
)
27352735
expected = Series(1, index=expected_index)
27362736
tm.assert_series_equal(result, expected)
2737+
2738+
2739+
class TestUnstackBool:
2740+
"""Regression tests for GH#62244 (unstack bool dtype upcasting)."""
2741+
2742+
def test_unstack_bool_dataframe_preserves_boolean_dtype(self):
2743+
df = DataFrame(
2744+
{"level_0": ["foo", "toto"], "level_1": ["A", "B"], "val": [True, False]}
2745+
).set_index(["level_0", "level_1"])
2746+
2747+
result = df.unstack("level_0")
2748+
2749+
assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)
2750+
2751+
assert result.loc["A", ("val", "foo")]
2752+
assert pd.isna(result.loc["A", ("val", "toto")])
2753+
assert not result.loc["B", ("val", "toto")]
2754+
2755+
def test_unstack_bool_series_preserves_boolean_dtype(self):
2756+
s = Series([True, False], index=MultiIndex.from_product([["x", "y"], ["A"]]))
2757+
result = s.unstack(0)
2758+
2759+
assert all(str(dtype) in {"bool", "boolean"} for dtype in result.dtypes)
2760+
2761+
def test_unstack_bool_memory_usage_smaller_than_object(self):
2762+
df = DataFrame({"a": ["x", "y"], "b": [True, False]}).set_index("a")
2763+
2764+
obj_unstack = df.astype("object").unstack("a")
2765+
bool_unstack = df.astype("boolean").unstack("a")
2766+
2767+
obj_mem = obj_unstack.memory_usage(deep=True)
2768+
bool_mem = bool_unstack.memory_usage(deep=True)
2769+
2770+
obj_total = obj_mem.sum() if hasattr(obj_mem, "sum") else int(obj_mem)
2771+
bool_total = bool_mem.sum() if hasattr(bool_mem, "sum") else int(bool_mem)
2772+
2773+
assert bool_total < obj_total

pandas/tests/reshape/test_pivot.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2618,6 +2618,25 @@ def test_pivot_table_margins_include_nan_groups(self):
26182618
expected.columns.name = "g2"
26192619
tm.assert_frame_equal(result, expected, check_dtype=False)
26202620

2621+
def test_pivot_table_bool_preserves_boolean_dtype(self):
2622+
# GH#62244
2623+
df = DataFrame(
2624+
{
2625+
"A": ["foo", "foo", "bar"],
2626+
"B": ["x", "y", "x"],
2627+
"val": [True, False, True],
2628+
}
2629+
)
2630+
2631+
result = pivot_table(df, values="val", index="A", columns="B", aggfunc="any")
2632+
2633+
assert all(str(dtype) == "boolean" for dtype in result.dtypes)
2634+
2635+
assert result.loc["foo", "x"]
2636+
assert not result.loc["foo", "y"]
2637+
assert result.loc["bar", "x"]
2638+
assert pd.isna(result.loc["bar", "y"])
2639+
26212640

26222641
class TestPivot:
26232642
def test_pivot(self):

0 commit comments

Comments
 (0)