From 98bedc4635718eefe50dfcf109f7c45d5f92f49a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 17:49:00 -0700 Subject: [PATCH 1/9] BUG: read_csv with engine=pyarrow and numpy-nullable dtype --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/arrow_parser_wrapper.py | 74 +++++++++++++++---- .../io/parser/dtypes/test_dtypes_basic.py | 4 - pandas/tests/io/parser/test_na_values.py | 17 ++++- 4 files changed, 74 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ec5027840dfd5..0f8e026761db0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -814,6 +814,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. 
(:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 8cadde1ad6537..e446f7f4fb897 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,6 +3,8 @@ from typing import TYPE_CHECKING import warnings +import numpy as np + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -12,8 +14,13 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, +) from pandas.core.dtypes.inference import is_integer +from pandas.core.arrays.string_ import StringDtype + from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase @@ -140,20 +147,7 @@ def handle_warning(invalid_row) -> str: "encoding": self.encoding, } - def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: - """ - Processes data read in based on kwargs. - - Parameters - ---------- - frame: DataFrame - The DataFrame to process. - - Returns - ------- - DataFrame - The processed DataFrame. - """ + def _finalize_column_names(self, frame: DataFrame) -> DataFrame: num_cols = len(frame.columns) multi_index_named = True if self.header is None: @@ -196,6 +190,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: if self.header is None and not multi_index_named: frame.index.names = [None] * len(frame.index.names) + return frame + + def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: + """ + Processes data read in based on kwargs. + + Parameters + ---------- + frame: DataFrame + The DataFrame to process. + + Returns + ------- + DataFrame + The processed DataFrame. 
+ """ + if self.dtype is not None: # Ignore non-existent columns from dtype mapping # like other parsers do @@ -282,6 +293,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) + workaround = False + pass_backend = dtype_backend + if self.dtype is not None and dtype_backend != "pyarrow": + # We pass dtype_backend="pyarrow" and subsequently cast + # to avoid lossy conversion e.g. GH#56136 + workaround = True + pass_backend = "numpy_nullable" + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -289,7 +308,32 @@ def read(self) -> DataFrame: DeprecationWarning, ) frame = arrow_table_to_pandas( - table, dtype_backend=dtype_backend, null_to_int64=True + table, dtype_backend=pass_backend, null_to_int64=True ) + frame = self._finalize_column_names(frame) + + if workaround and dtype_backend != "numpy_nullable": + old_dtype = self.dtype + if not isinstance(old_dtype, dict): + # e.g. test_categorical_dtype_utf16 + old_dtype = dict.fromkeys(frame.columns, old_dtype) + + # _finalize_pandas_output will call astype, but we need to make + # sure all keys are populated appropriately. + new_dtype = {} + for key in frame.columns: + ser = frame[key] + if isinstance(ser.dtype, BaseMaskedDtype): + new_dtype[key] = ser.dtype.numpy_dtype + elif isinstance(ser.dtype, StringDtype): + # We cast here in case the user passed "category" in + # order to get the correct dtype.categories.dtype + # e.g. 
test_categorical_dtype_utf16 + new_dtype[key] = StringDtype(na_value=np.nan) + frame[key] = frame[key].astype(new_dtype[key]) + + new_dtype.update(old_dtype) + self.dtype = new_dtype + return self._finalize_pandas_output(frame) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 75b7cf0d42cb8..e4563afc631c5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): tm.assert_frame_equal(result, expected) -# pyarrow engine failing: -# https://github.com/pandas-dev/pandas/issues/56136 -@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers @@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 213fa2c01cef4..d60074243a526 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) -def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): +def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request): # see gh-20377 parser = all_parsers + if parser.engine == "pyarrow" and na_filter is False: + mark = pytest.mark.xfail(reason="mismatched shape") + request.applymarker(mark) + data = "a,b,c\n1,,3\n4,5,6" # na_filter=True --> missing value becomes NaN. 
@@ -798,7 +801,15 @@ def test_bool_and_nan_to_int(all_parsers): True False """ - with pytest.raises(ValueError, match="convert|NoneType"): + msg = ( + "cannot safely convert passed user dtype of int64 for " + " dtyped data in column 0 due to NA values" + ) + if parser.engine == "python": + msg = "Unable to convert column 0 to type int64" + elif parser.engine == "pyarrow": + msg = r"cannot convert NA to integer" + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype="int") From 7aa640d2c30c4a99170110c4b97bd816649147c3 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 18:14:49 -0700 Subject: [PATCH 2/9] mypy fixup, error message compat for 32bit builds --- pandas/io/parsers/arrow_parser_wrapper.py | 3 ++- pandas/tests/io/parser/test_na_values.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index e446f7f4fb897..75cb16a93c493 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -330,7 +330,8 @@ def read(self) -> DataFrame: # We cast here in case the user passed "category" in # order to get the correct dtype.categories.dtype # e.g. 
test_categorical_dtype_utf16 - new_dtype[key] = StringDtype(na_value=np.nan) + sdt = StringDtype(na_value=np.nan) + new_dtype[key] = sdt # type: ignore[assignment] frame[key] = frame[key].astype(new_dtype[key]) new_dtype.update(old_dtype) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d60074243a526..d0cc92c5a73af 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -802,11 +802,11 @@ def test_bool_and_nan_to_int(all_parsers): False """ msg = ( - "cannot safely convert passed user dtype of int64 for " + "cannot safely convert passed user dtype of int(64|32) for " " dtyped data in column 0 due to NA values" ) if parser.engine == "python": - msg = "Unable to convert column 0 to type int64" + msg = "Unable to convert column 0 to type int(64|32)" elif parser.engine == "pyarrow": msg = r"cannot convert NA to integer" with pytest.raises(ValueError, match=msg): From e5b752ef6580486b8273f921d9a246bc32180bf8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 19:06:31 -0700 Subject: [PATCH 3/9] minimum version compat --- pandas/tests/io/parser/test_na_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index d0cc92c5a73af..5f08f5ef466cf 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -803,7 +803,7 @@ def test_bool_and_nan_to_int(all_parsers): """ msg = ( "cannot safely convert passed user dtype of int(64|32) for " - " dtyped data in column 0 due to NA values" + " dtyped data in column 0 due to NA values" ) if parser.engine == "python": msg = "Unable to convert column 0 to type int(64|32)" From 323414c504446a30b3aa9a4f6fbdc286273a1a8d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 5 Aug 2025 20:57:03 -0700 Subject: [PATCH 4/9] not-infer-string compat --- pandas/io/parsers/arrow_parser_wrapper.py | 27 
++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 75cb16a93c493..039841747c9a8 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -5,6 +5,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -13,7 +15,10 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.common import ( + is_string_dtype, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ( BaseMaskedDtype, ) @@ -326,13 +331,29 @@ def read(self) -> DataFrame: ser = frame[key] if isinstance(ser.dtype, BaseMaskedDtype): new_dtype[key] = ser.dtype.numpy_dtype + if ( + key in old_dtype + and not using_string_dtype() + and is_string_dtype(old_dtype[key]) + and not isinstance(old_dtype[key], StringDtype) + and ser.array._hasna + ): + # Cast to make sure we get "NaN" string instead of "NA" + frame[key] = ser.astype(old_dtype[key]) + frame.loc[ser.isna(), key] = np.nan + old_dtype[key] = object # Avoid re-casting elif isinstance(ser.dtype, StringDtype): # We cast here in case the user passed "category" in # order to get the correct dtype.categories.dtype # e.g. 
test_categorical_dtype_utf16 - sdt = StringDtype(na_value=np.nan) + if not using_string_dtype(): + sdt = np.dtype(object) + frame[key] = ser.astype(sdt) + frame.loc[ser.isna(), key] = np.nan + else: + sdt = StringDtype(na_value=np.nan) + frame[key] = frame[key].astype(sdt) new_dtype[key] = sdt # type: ignore[assignment] - frame[key] = frame[key].astype(new_dtype[key]) new_dtype.update(old_dtype) self.dtype = new_dtype From 96bed9d5258a2b45858c4c16d5301dbcfe666882 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 6 Aug 2025 07:22:05 -0700 Subject: [PATCH 5/9] mypy fixup --- pandas/io/parsers/arrow_parser_wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 039841747c9a8..09759d4127ac8 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -351,9 +351,9 @@ def read(self) -> DataFrame: frame[key] = ser.astype(sdt) frame.loc[ser.isna(), key] = np.nan else: - sdt = StringDtype(na_value=np.nan) + sdt = StringDtype(na_value=np.nan) # type: ignore[assignment] frame[key] = frame[key].astype(sdt) - new_dtype[key] = sdt # type: ignore[assignment] + new_dtype[key] = sdt new_dtype.update(old_dtype) self.dtype = new_dtype From bf6970bf24ae741460a92ec92aa4ffad046d236b Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Aug 2025 10:36:52 -0700 Subject: [PATCH 6/9] update usage --- pandas/io/parsers/arrow_parser_wrapper.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 703688429feb2..e27e971d9df72 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -239,21 +239,23 @@ def _finalize_dtype(self, frame: DataFrame) -> DataFrame: raise ValueError(str(err)) from err return frame - def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: + 
def _finalize_pandas_output( + self, frame: DataFrame, multi_index_named: bool + ) -> DataFrame: """ Processes data read in based on kwargs. Parameters ---------- - frame: DataFrame + frame : DataFrame The DataFrame to process. + multi_index_named : bool Returns ------- DataFrame The processed DataFrame. """ - frame, multi_index_named = self._adjust_column_names(frame) frame = self._do_date_conversions(frame.columns, frame) frame = self._finalize_index(frame, multi_index_named) frame = self._finalize_dtype(frame) @@ -329,7 +331,7 @@ def read(self) -> DataFrame: table, dtype_backend=pass_backend, null_to_int64=True ) - frame = self._finalize_column_names(frame) + frame, multi_index_named = self._adjust_column_names(frame) if workaround and dtype_backend != "numpy_nullable": old_dtype = self.dtype @@ -371,4 +373,4 @@ def read(self) -> DataFrame: new_dtype.update(old_dtype) self.dtype = new_dtype - return self._finalize_pandas_output(frame) + return self._finalize_pandas_output(frame, multi_index_named) From 03c6d00ab6dc84b0c9bf2da6ec0dad985fd22fce Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Aug 2025 10:41:39 -0700 Subject: [PATCH 7/9] CLN: remove redundant check --- pandas/io/parsers/arrow_parser_wrapper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index e27e971d9df72..37785e2810f7b 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -179,8 +179,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]: multi_index_named = True if self.header is None: if self.names is None: - if self.header is None: - self.names = range(num_cols) + self.names = range(num_cols) if len(self.names) != num_cols: # usecols is passed through to pyarrow, we only handle index col here # The only way self.names is not the same length as number of cols is From 221328d4ee5138c84766a8699ebba2688562f549 
Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Aug 2025 11:29:33 -0700 Subject: [PATCH 8/9] Use Matts idea --- pandas/io/_util.py | 82 ++++++++++++++++++++++- pandas/io/parsers/arrow_parser_wrapper.py | 76 ++++----------------- pandas/tests/io/parser/test_na_values.py | 5 +- 3 files changed, 96 insertions(+), 67 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 6827fbe9c998e..7f21b45265da6 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -16,14 +16,23 @@ ) from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.common import pandas_dtype + import pandas as pd if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import ( + Callable, + Hashable, + Sequence, + ) import pyarrow - from pandas._typing import DtypeBackend + from pandas._typing import ( + DtypeArg, + DtypeBackend, + ) def _arrow_dtype_mapping() -> dict: @@ -64,6 +73,8 @@ def arrow_table_to_pandas( dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, null_to_int64: bool = False, to_pandas_kwargs: dict | None = None, + dtype: DtypeArg | None = None, + names: Sequence[Hashable] | None = None, ) -> pd.DataFrame: pa = import_optional_dependency("pyarrow") @@ -82,12 +93,77 @@ def arrow_table_to_pandas( elif using_string_dtype(): if pa_version_under19p0: types_mapper = _arrow_string_types_mapper() + elif dtype is not None: + # GH#56136 Avoid lossy conversion to float64 + # We'll convert back to numpy in _post_convert_dtypes unless requested + types_mapper = { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + }.get else: types_mapper = None elif dtype_backend is lib.no_default or dtype_backend == "numpy": - types_mapper = None + if dtype is not None: + # GH#56136 Avoid lossy conversion to float64 + # We'll convert back to numpy in _post_convert_dtypes unless requested + types_mapper = { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): 
pd.Int64Dtype(), + }.get + else: + types_mapper = None else: raise NotImplementedError df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return _post_convert_dtypes(df, dtype_backend, dtype, names) + + +def _post_convert_dtypes( + df: pd.DataFrame, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault, + dtype: DtypeArg | None, + names: Sequence[Hashable] | None, +) -> pd.DataFrame: + if dtype is not None and ( + dtype_backend is lib.no_default or dtype_backend == "numpy" + ): + # GH#56136 apply any user-provided dtype, and convert any IntegerDtype + # columns the user didn't explicitly ask for. + if isinstance(dtype, dict): + if names is not None: + df.columns = names + + cmp_dtypes = { + pd.Int8Dtype(), + pd.Int16Dtype(), + pd.Int32Dtype(), + pd.Int64Dtype(), + } + for col in df.columns: + if col not in dtype and df[col].dtype in cmp_dtypes: + # Any key that the user didn't explicitly specify + # that got converted to IntegerDtype now gets converted + # to numpy dtype. 
+ dtype[col] = df[col].dtype.numpy_dtype + + # Ignore non-existent columns from dtype mapping + # like other parsers do + dtype = { + key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns + } + + else: + dtype = pandas_dtype(dtype) + + try: + df = df.astype(dtype) + except TypeError as err: + # GH#44901 reraise to keep api consistent + raise ValueError(str(err)) from err + return df diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 37785e2810f7b..e61f9bcec0d62 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,10 +3,6 @@ from typing import TYPE_CHECKING import warnings -import numpy as np - -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,20 +12,16 @@ from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( - is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import ( - BaseMaskedDtype, -) from pandas.core.dtypes.inference import is_integer -from pandas.core.arrays.string_ import StringDtype - from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: + import pyarrow as pa + from pandas._typing import ReadBuffer from pandas import DataFrame @@ -174,8 +166,8 @@ def _get_convert_options(self): return convert_options - def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]: - num_cols = len(frame.columns) + def _adjust_column_names(self, table: pa.Table) -> bool: + num_cols = len(table.columns) multi_index_named = True if self.header is None: if self.names is None: @@ -188,8 +180,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]: columns_prefix = [str(x) for x in range(num_cols - len(self.names))] self.names = columns_prefix + self.names multi_index_named = 
False - frame.columns = self.names - return frame, multi_index_named + return multi_index_named def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame: if self.index_col is not None: @@ -312,13 +303,7 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - workaround = False - pass_backend = dtype_backend - if self.dtype is not None and dtype_backend != "pyarrow": - # We pass dtype_backend="pyarrow" and subsequently cast - # to avoid lossy conversion e.g. GH#56136 - workaround = True - pass_backend = "numpy_nullable" + multi_index_named = self._adjust_column_names(table) with warnings.catch_warnings(): warnings.filterwarnings( @@ -327,49 +312,14 @@ def read(self) -> DataFrame: DeprecationWarning, ) frame = arrow_table_to_pandas( - table, dtype_backend=pass_backend, null_to_int64=True + table, + dtype_backend=dtype_backend, + null_to_int64=True, + dtype=self.dtype, + names=self.names, ) - frame, multi_index_named = self._adjust_column_names(frame) - - if workaround and dtype_backend != "numpy_nullable": - old_dtype = self.dtype - if not isinstance(old_dtype, dict): - # e.g. test_categorical_dtype_utf16 - old_dtype = dict.fromkeys(frame.columns, old_dtype) - - # _finalize_pandas_output will call astype, but we need to make - # sure all keys are populated appropriately. - new_dtype = {} - for key in frame.columns: - ser = frame[key] - if isinstance(ser.dtype, BaseMaskedDtype): - new_dtype[key] = ser.dtype.numpy_dtype - if ( - key in old_dtype - and not using_string_dtype() - and is_string_dtype(old_dtype[key]) - and not isinstance(old_dtype[key], StringDtype) - and ser.array._hasna - ): - # Cast to make sure we get "NaN" string instead of "NA" - frame[key] = ser.astype(old_dtype[key]) - frame.loc[ser.isna(), key] = np.nan - old_dtype[key] = object # Avoid re-casting - elif isinstance(ser.dtype, StringDtype): - # We cast here in case the user passed "category" in - # order to get the correct dtype.categories.dtype - # e.g. 
test_categorical_dtype_utf16 - if not using_string_dtype(): - sdt = np.dtype(object) - frame[key] = ser.astype(sdt) - frame.loc[ser.isna(), key] = np.nan - else: - sdt = StringDtype(na_value=np.nan) # type: ignore[assignment] - frame[key] = frame[key].astype(sdt) - new_dtype[key] = sdt - - new_dtype.update(old_dtype) - self.dtype = new_dtype + if self.header is None: + frame.columns = self.names return self._finalize_pandas_output(frame, multi_index_named) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 5f08f5ef466cf..a07e065e40bbf 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -808,7 +808,10 @@ def test_bool_and_nan_to_int(all_parsers): if parser.engine == "python": msg = "Unable to convert column 0 to type int(64|32)" elif parser.engine == "pyarrow": - msg = r"cannot convert NA to integer" + msg = ( + r"int\(\) argument must be a string, a bytes-like object or a " + "real number, not 'NoneType" + ) with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype="int") From 1005e0d9f7f5ddaca312d2e0cd12d8423ff6e755 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Aug 2025 17:58:12 -0700 Subject: [PATCH 9/9] re-xfail --- pandas/tests/io/parser/test_na_values.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index a07e065e40bbf..11b54692fe2e1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -671,10 +671,12 @@ def test_inf_na_values_with_int_index(all_parsers): @pytest.mark.parametrize("na_filter", [True, False]) -def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request): +def test_na_values_with_dtype_str_and_na_filter( + all_parsers, na_filter, using_infer_string, request +): # see gh-20377 parser = all_parsers - if parser.engine == "pyarrow" and 
na_filter is False: + if parser.engine == "pyarrow" and (na_filter is False or not using_infer_string): mark = pytest.mark.xfail(reason="mismatched shape") request.applymarker(mark)