From f43b3e2bb0d9d91d8b78e8db418ed2a6fa708564 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Thu, 17 Jul 2025 00:58:06 -0400 Subject: [PATCH 1/3] BUG: Fix warning for extra fields in read_csv with on_bad_lines callable --- doc/source/whatsnew/v3.0.0.rst | 4 ++++ pandas/io/parsers/base_parser.py | 8 +++----- .../tests/io/parser/test_python_parser_only.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 977186d808e81..894af830550a1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -814,6 +814,10 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) +- Bug in :func:`read_csv` with ``engine="python"`` and callable ``on_bad_lines`` + where a ``ParserWarning`` for extra fields returned by the callable was only + raised when ``index_col`` was ``None``. Now the warning is consistently raised + regardless of ``index_col`` (:issue:`#61837`) Period ^^^^^^ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 23efc9c87e07c..dc52daad7f470 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -615,15 +615,13 @@ def _check_data_length( data: list of array-likes containing the data column-wise. 
""" if not self.index_col and len(columns) != len(data) and columns: - empty_str = is_object_dtype(data[-1]) and data[-1] == "" # error: No overload variant of "__ror__" of "ndarray" matches # argument type "ExtensionArray" - empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator] - if len(columns) == len(data) - 1 and np.all(empty_str_or_na): + if len(data) > len(columns) : return warnings.warn( - "Length of header or names does not match length of data. This leads " - "to a loss of data with index_col=False.", + f"Length of header or names ({len(columns)}) does not match number of " + f"fields in line ({len(data)}). Extra field will be dropped.", ParserWarning, stacklevel=find_stack_level(), ) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a5bb151e84f47..a0a949a515bea 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -322,6 +322,23 @@ def test_malformed_skipfooter(python_parser_only): parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1) +def test_on_bad_lines_extra_fields_warns(python_parser_only): + parser = python_parser_only + data = """id,field_1,field_2 +101,A,B +102,C,D, E +103,F,G +""" + + def line_fixer(_line): + return ["1", "2", "3", "4", "5"] + for index_col in [None, 0]: + with tm.assert_produces_warning(ParserWarning): + parser.read_csv( + StringIO(data), on_bad_lines=line_fixer, index_col=index_col + ) + + def test_python_engine_file_no_next(python_parser_only): parser = python_parser_only From 75ad64b7786a017b0444c5524fd8617c8dee3be3 Mon Sep 17 00:00:00 2001 From: Jay <2594jaypatel@gmail.com> Date: Thu, 17 Jul 2025 01:53:45 -0400 Subject: [PATCH 2/3] mend --- doc/source/whatsnew/v3.0.0.rst | 5 +---- pandas/io/parsers/base_parser.py | 6 +----- pandas/tests/io/parser/test_python_parser_only.py | 5 +++-- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git 
a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 894af830550a1..51ec6fbc82b03 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -814,10 +814,7 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) -- Bug in :func:`read_csv` with ``engine="python"`` and callable ``on_bad_lines`` - where a ``ParserWarning`` for extra fields returned by the callable was only - raised when ``index_col`` was ``None``. Now the warning is consistently raised - regardless of ``index_col`` (:issue:`#61837`) +- Bug in :func:`read_csv` with ``engine="python"`` and callable ``on_bad_lines`` where a ``ParserWarning`` for extra fields returned by the callable was only raised when ``index_col`` was ``None``. Now the warning is consistently raised regardless of ``index_col`` (:issue:`61837`) Period ^^^^^^ diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index dc52daad7f470..d59e49056fd59 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -614,11 +614,7 @@ def _check_data_length( columns: list of column names data: list of array-likes containing the data column-wise. """ - if not self.index_col and len(columns) != len(data) and columns: - # error: No overload variant of "__ror__" of "ndarray" matches - # argument type "ExtensionArray" - if len(data) > len(columns) : - return + if columns and len(data)!=len(columns): warnings.warn( f"Length of header or names ({len(columns)}) does not match number of " f"fields in line ({len(data)}). 
Extra field will be dropped.", diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index a0a949a515bea..d72cf98c4cc33 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -332,11 +332,12 @@ def test_on_bad_lines_extra_fields_warns(python_parser_only): def line_fixer(_line): return ["1", "2", "3", "4", "5"] + for index_col in [None, 0]: with tm.assert_produces_warning(ParserWarning): parser.read_csv( - StringIO(data), on_bad_lines=line_fixer, index_col=index_col - ) + StringIO(data), on_bad_lines=line_fixer, index_col=index_col + ) def test_python_engine_file_no_next(python_parser_only): From de500b94f9bef414560aea4a178da5a7b428e29b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Jul 2025 18:17:46 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/io/parsers/base_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index d59e49056fd59..acb458efa71b2 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -614,7 +614,7 @@ def _check_data_length( columns: list of column names data: list of array-likes containing the data column-wise. """ - if columns and len(data)!=len(columns): + if columns and len(data) != len(columns): warnings.warn( f"Length of header or names ({len(columns)}) does not match number of " f"fields in line ({len(data)}). Extra field will be dropped.",