diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3c83f2a9758c1..f91d40c4d9ea9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -603,6 +603,7 @@ Other API changes an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining the dtype of the resulting Index (:issue:`60797`) - :class:`IncompatibleFrequency` now subclasses ``TypeError`` instead of ``ValueError``. As a result, joins with mismatched frequencies now cast to object like other non-comparable joins, and arithmetic with indexes with mismatched frequencies align (:issue:`55782`) +- :meth:`CategoricalIndex.append` no longer attempts to cast different-dtype indexes to the caller's dtype (:issue:`41626`) - :meth:`ExtensionDtype.construct_array_type` is now a regular method instead of a ``classmethod`` (:issue:`58663`) - Comparison operations between :class:`Index` and :class:`Series` now consistently return :class:`Series` regardless of which object is on the left or right (:issue:`36759`) - Numpy functions like ``np.isinf`` that return a bool dtype when called on a :class:`Index` object now return a bool-dtype :class:`Index` instead of ``np.ndarray`` (:issue:`52676`) @@ -974,8 +975,8 @@ Indexing - Bug in reindexing of :class:`DataFrame` with :class:`PeriodDtype` columns in case of consolidated block (:issue:`60980`, :issue:`60273`) - Bug in :meth:`DataFrame.loc.__getitem__` and :meth:`DataFrame.iloc.__getitem__` with a :class:`CategoricalDtype` column with integer categories raising when trying to index a row containing a ``NaN`` entry (:issue:`58954`) - Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`) +- Bug in adding new rows with :meth:`DataFrame.loc.__setitem__` or :meth:`Series.loc.__setitem__` which failed to retain dtype on the object's index in some cases (:issue:`41626`) - Bug in indexing on a :class:`DatetimeIndex` with a ``timestamp[pyarrow]`` dtype or on a 
:class:`TimedeltaIndex` with a ``duration[pyarrow]`` dtype (:issue:`62277`) -- Missing ^^^^^^^ @@ -1094,7 +1095,7 @@ Reshaping - Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`) - Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) - Bug in :meth:`DataFrame.merge` with :class:`CategoricalDtype` columns incorrectly raising ``RecursionError`` (:issue:`56376`) -- +- Bug in :meth:`DataFrame.merge` with a ``float32`` index incorrectly casting the index to ``float64`` (:issue:`41626`) Sparse ^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c1f8be1381b23..e649f4bd60e31 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -10918,6 +10918,13 @@ def _append_internal( ), ) row_df = other.to_frame().T + if isinstance(self.index.dtype, ExtensionDtype): + # GH#41626 retain e.g. 
CategoricalDtype if reached via + # df.loc[key] = item + row_df.index = self.index.array._cast_pointwise_result( + row_df.index._values + ) + # infer_objects is needed for # test_append_empty_frame_to_series_with_dateutil_tz other = row_df.infer_objects().rename_axis(index.names) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index bf4dd5a649ffe..cbefaac77dd82 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -18,7 +18,6 @@ ) from pandas.core.dtypes.common import is_scalar -from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, @@ -519,17 +518,3 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): """ mapped = self._values.map(mapper, na_action=na_action) return Index(mapped, name=self.name) - - def _concat(self, to_concat: list[Index], name: Hashable) -> Index: - # if calling index is category, don't check dtype of others - try: - cat = Categorical._concat_same_type( - [self._is_dtype_compat(c) for c in to_concat] - ) - except TypeError: - # not all to_concat elements are among our categories (or NA) - - res = concat_compat([x._values for x in to_concat]) - return Index(res, name=name) - else: - return type(self)._simple_new(cat, name=name) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 8786ce361c900..f35b0ef197288 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1483,7 +1483,11 @@ def _create_join_index( mask = indexer == -1 if np.any(mask): fill_value = na_value_for_dtype(index.dtype, compat=False) - index = index.append(Index([fill_value])) + if not index._can_hold_na: + new_index = Index([fill_value]) + else: + new_index = Index([fill_value], dtype=index.dtype) + index = index.append(new_index) if indexer is None: return index.copy() return index.take(indexer) diff --git 
a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 185d6d750cace..b4dbe04b04374 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_hashable + import pandas as pd import pandas._testing as tm @@ -310,6 +312,22 @@ def test_setitem_expand_with_extension(self, data): result.loc[:, "B"] = data tm.assert_frame_equal(result, expected) + def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data): + # GH#41626 retain index.dtype in setitem-with-expansion + if not is_hashable(data[0]): + pytest.skip("Test does not apply to non-hashable data.") + data = data.unique() + expected = pd.DataFrame({"A": range(len(data))}, index=data) + df = expected.iloc[:-1] + ser = df["A"] + item = data[-1] + + df.loc[item] = len(data) - 1 + tm.assert_frame_equal(df, expected) + + ser.loc[item] = len(data) - 1 + tm.assert_series_equal(ser, expected["A"]) + def test_setitem_frame_invalid_length(self, data): df = pd.DataFrame({"A": [1] * len(data)}) xpr = ( diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1e91ef734da8a..e9d014a0eb29d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1067,6 +1067,15 @@ def test_comp_masked_numpy(self, masked_dtype, comparison_op): expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_())) tm.assert_series_equal(result, expected) + def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_date(pa_dtype): + mark = pytest.mark.xfail( + reason="GH#62343 incorrectly casts to timestamp[ms][pyarrow]" + ) + request.applymarker(mark) + super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data) + class TestLogicalOps: """Various Series and DataFrame logical ops methods.""" diff --git 
a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 011bf0b2016b2..8d437fc5d238b 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -126,6 +126,13 @@ def test_EA_types(self, engine, data, request): def test_astype_str(self, data): super().test_astype_str(data) + @pytest.mark.xfail( + reason="Test is invalid for IntervalDtype, needs to be adapted for " + "this dtype with an index with index._index_as_unique." + ) + def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data): + super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 034ddb351a7ab..e3764b2514680 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -360,3 +360,9 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): ) ) tm.assert_series_equal(result, expected) + + def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request): + if data.dtype.kind == "b": + mark = pytest.mark.xfail(reason="GH#62344 incorrectly casts to object") + request.applymarker(mark) + super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 79cfb736941d6..5fe761cd702b1 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -421,6 +421,12 @@ def test_index_from_listlike_with_dtype(self, data): def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) + def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request): + if isinstance(data[-1], tuple): + mark = pytest.mark.xfail(reason="Unpacks tuple") + 
request.applymarker(mark) + super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data) + class Test2DCompat(base.NDArrayBacked2DTests): pass diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 34d0ee9f819a0..c49063a5a9a68 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -257,6 +257,14 @@ def test_arith_series_with_array( request.applymarker(mark) super().test_arith_series_with_array(data, all_arithmetic_operators) + def test_loc_setitem_with_expansion_preserves_ea_index_dtype( + self, data, request, using_infer_string + ): + if not using_infer_string and data.dtype.storage == "python": + mark = pytest.mark.xfail(reason="Casts to object") + request.applymarker(mark) + super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data) + class Test2DCompat(base.Dim2CompatTests): @pytest.fixture(autouse=True) diff --git a/pandas/tests/indexes/categorical/test_append.py b/pandas/tests/indexes/categorical/test_append.py index b48c3219f5111..d4dbd91815cc1 100644 --- a/pandas/tests/indexes/categorical/test_append.py +++ b/pandas/tests/indexes/categorical/test_append.py @@ -36,9 +36,11 @@ def test_append_mismatched_categories(self, ci): ci.append(ci.values.reorder_categories(list("abc"))) def test_append_category_objects(self, ci): + # GH#41626 pre-3.0 this used to cast the object-dtype index to + # ci.dtype # with objects result = ci.append(Index(["c", "a"])) - expected = CategoricalIndex(list("aabbcaca"), categories=ci.categories) + expected = Index(list("aabbcaca")) tm.assert_index_equal(result, expected, exact=True) def test_append_non_categories(self, ci): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 207dd76ce4953..05f7b9cc7c40e 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -16,6 +16,7 @@ from pandas._libs import index as libindex from pandas.errors import IndexingError 
+import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -1963,6 +1964,22 @@ def test_loc_drops_level(self): class TestLocSetitemWithExpansion: + @td.skip_if_no("pyarrow") + def test_loc_setitem_with_expansion_preserves_ea_dtype(self): + # GH#41626 retain index.dtype in setitem-with-expansion + idx = Index([Timestamp(0).date()], dtype="date32[pyarrow]") + df = DataFrame({"A": range(1)}, index=idx) + item = Timestamp("1970-01-02").date() + + df.loc[item] = 1 + + exp_index = Index([idx[0], item], dtype=idx.dtype) + tm.assert_index_equal(df.index, exp_index) + + ser = df["A"].iloc[:-1] + ser.loc[item] = 1 + tm.assert_index_equal(ser.index, exp_index) + def test_loc_setitem_with_expansion_large_dataframe(self, monkeypatch): # GH#10692 size_cutoff = 50 diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 48bdf70a47ec1..d3bef4c863b28 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1378,6 +1378,9 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index): # GH 24212 # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that # -1 is interpreted as a missing value instead of the last element + if index.dtype == "float32" and expected_index.dtype == "float64": + # GH#41626 + expected_index = expected_index.astype("float32") df1 = DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index) df2 = DataFrame({"b": [0, 1, 2, 3, 4, 5]}) result = df1.merge(df2, left_on="key", right_index=True, how=how)