From f3829fd19334b2839c33bf0d2acb60665555dca3 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Fri, 25 Jul 2025 16:47:34 +0530 Subject: [PATCH 01/25] BUG : Fix Series.str.contains with compiled regex on Arrow string --- pandas/core/strings/accessor.py | 40 +++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d1cf1e7504ece..eb21592c23494 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1350,13 +1350,39 @@ def contains( 4 False dtype: bool """ - if regex and re.compile(pat).groups: - warnings.warn( - "This pattern is interpreted as a regular expression, and has " - "match groups. To actually get the groups, use str.extract.", - UserWarning, - stacklevel=find_stack_level(), - ) + from pandas.core.dtypes.dtypes import ArrowDtype + import re + + # --- Handle Arrow-backed string arrays with compiled regex patterns --- + # Arrow backend does not support compiled regex objects or Python regex flags. + # If a compiled regex is passed, only allow it if no flags are set. + + if isinstance(self._data.dtype, ArrowDtype) and isinstance(pat, re.Pattern): + if flags != 0: + raise NotImplementedError( + "Series.str.contains() with a compiled regex pattern and flag is " + "not supported for Arrow-backed string arrays." + ) + pat = pat.pattern + regex = True + + if regex: + try: + _compiled = pat if isinstance(pat, re.Pattern) else re.compile( + pat, flags=flags + ) + if _compiled.groups: + warnings.warn( + "This pattern is interpreted as a regular expression, and has " + "match groups. To actually get the groups, use str.extract.", + UserWarning, + stacklevel=find_stack_level(), + ) + except re.error as e: + raise ValueError( + f"Invalid regex pattern passed to str.contains(): {e}" + ) from e + result = self._data.array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) From c2a64facfe917c895df8f529945b80639f0d387a Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Fri, 25 Jul 2025 18:24:32 +0530 Subject: [PATCH 02/25] BUG: Fix handling of compiled regex in Series.str.contains for Arrow-backed strings --- pandas/core/strings/accessor.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index eb21592c23494..4d93c2717dfba 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1,5 +1,7 @@ from __future__ import annotations +from pandas.core.dtypes.dtypes import ArrowDtype + import codecs from functools import wraps import re @@ -1350,13 +1352,6 @@ def contains( 4 False dtype: bool """ - from pandas.core.dtypes.dtypes import ArrowDtype - import re - - # --- Handle Arrow-backed string arrays with compiled regex patterns --- - # Arrow backend does not support compiled regex objects or Python regex flags. - # If a compiled regex is passed, only allow it if no flags are set. - if isinstance(self._data.dtype, ArrowDtype) and isinstance(pat, re.Pattern): if flags != 0: raise NotImplementedError( @@ -1383,7 +1378,6 @@ def contains( f"Invalid regex pattern passed to str.contains(): {e}" ) from e - result = self._data.array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) From 838b1c57278398fce27dc56838a4c6919fcffa3c Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Fri, 25 Jul 2025 18:43:32 +0530 Subject: [PATCH 03/25] BUG: Fix handling of compiled regex in Series.str.contains for Arrow-backed strings --- pandas/core/strings/accessor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 4d93c2717dfba..a4dc4dcea765d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1,7 +1,5 @@ from __future__ import annotations -from pandas.core.dtypes.dtypes import ArrowDtype - import codecs from functools import wraps import re From 563f1f1e1bb3eb3710fb53c389d1de04a369c01b Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Fri, 25 Jul 2025 19:44:44 +0530 Subject: [PATCH 04/25] STYLE: Fix formatting and docstring issues in str.contains --- pandas/core/strings/accessor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index a4dc4dcea765d..3541525d6a2fa 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1361,8 +1361,10 @@ def contains( if regex: try: - _compiled = pat if isinstance(pat, re.Pattern) else re.compile( - pat, flags=flags + _compiled = ( + pat + if isinstance(pat, re.Pattern) + else re.compile(pat, flags=flags) ) if _compiled.groups: warnings.warn( From fda56199143e07d5c9b520a9879e1dfa1018b7b1 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Fri, 25 Jul 2025 20:07:18 +0530 Subject: [PATCH 05/25] Fixed ruff format --- pandas/core/strings/accessor.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 3541525d6a2fa..934aa2d4aa5c7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1362,9 +1362,7 @@ def contains( if regex: try: _compiled = ( - pat - if isinstance(pat, re.Pattern) - else re.compile(pat, flags=flags) + pat if isinstance(pat, re.Pattern) else re.compile(pat, flags=flags) ) if _compiled.groups: warnings.warn( From 324e6094a60044a146d95fdc2a5f151c47eceb4e Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Sun, 27 Jul 2025 00:36:51 +0530 Subject: [PATCH 06/25] Move fix into _str_contains of ArrowExtensionArray --- pandas/core/arrays/arrow/array.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 919453b29b7f9..190670d21ab6f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2446,6 +2446,36 @@ def _convert_int_result(self, result): def _convert_rank_result(self, result): return type(self)(result) + def _str_contains(self, pat, case=True, flags=0, na=lib.no_default, regex=True): + import re + + if isinstance(pat, re.Pattern): + if flags != 0: + # fallback to python object implementation + return BaseStringArrayMethods._str_contains( + self, pat, case, flags, na, regex + ) + pat = pat.pattern + regex = True + + try: + if not regex: + result = pc.match_substring(self._pa_array, pat, ignore_case=not case) + else: + result = pc.match_substring_regex( + self._pa_array, pat, ignore_case=not case, options=None + ) + return self._convert_bool_result(result, na=na, method_name="contains") + except (AttributeError, NotImplementedError, pa.ArrowNotImplementedError): + return BaseStringArrayMethods._str_contains( + self, pat, case, flags, na, regex + ) + + def _str_count(self, pat: str, flags: int = 0) -> Self: + if flags: + raise NotImplementedError(f"count not implemented with {flags=}") + return type(self)(pc.count_substring_regex(self._pa_array, pat)) + def _str_count(self, pat: str, flags: int = 0) -> Self: if flags: raise NotImplementedError(f"count not implemented with {flags=}") From b474604e912578a94f8710f0d9633ed4848cf1df Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Sun, 27 Jul 2025 01:36:33 +0530 Subject: [PATCH 07/25] Move fix into _str_contains of ArrowExtensionArray --- pandas/core/arrays/arrow/array.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 190670d21ab6f..8a03b954559c7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2447,8 +2447,6 @@ def _convert_rank_result(self, result): return type(self)(result) def _str_contains(self, pat, case=True, flags=0, na=lib.no_default, regex=True): - import re - if isinstance(pat, re.Pattern): if flags != 0: # fallback to python object implementation @@ -2476,11 +2474,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_count(self, pat: str, flags: int = 0) -> Self: - if flags: - raise NotImplementedError(f"count not implemented with {flags=}") - return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_repeat(self, repeats: int | Sequence[int]) -> Self: if not isinstance(repeats, int): raise NotImplementedError( From 3345bc750516187e81101c19e63d813aed4e296d Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Sun, 27 Jul 2025 02:17:12 +0530 Subject: [PATCH 08/25] Revert changes to pandas/core/strings/accessor.py from PR #61946 --- pandas/core/strings/accessor.py | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 934aa2d4aa5c7..d1cf1e7504ece 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1350,31 +1350,13 @@ def contains( 4 False dtype: bool """ - if isinstance(self._data.dtype, ArrowDtype) and isinstance(pat, re.Pattern): - if flags != 0: - raise NotImplementedError( - "Series.str.contains() with a compiled regex pattern and flag is " - "not supported for Arrow-backed string arrays." - ) - pat = pat.pattern - regex = True - - if regex: - try: - _compiled = ( - pat if isinstance(pat, re.Pattern) else re.compile(pat, flags=flags) - ) - if _compiled.groups: - warnings.warn( - "This pattern is interpreted as a regular expression, and has " - "match groups. To actually get the groups, use str.extract.", - UserWarning, - stacklevel=find_stack_level(), - ) - except re.error as e: - raise ValueError( - f"Invalid regex pattern passed to str.contains(): {e}" - ) from e + if regex and re.compile(pat).groups: + warnings.warn( + "This pattern is interpreted as a regular expression, and has " + "match groups. To actually get the groups, use str.extract.", + UserWarning, + stacklevel=find_stack_level(), + ) result = self._data.array._str_contains(pat, case, flags, na, regex) return self._wrap_result(result, fill_value=na, returns_string=False) From 9f06042c16ed6e43ef76b690e77346ed44609b83 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Sun, 27 Jul 2025 02:58:03 +0530 Subject: [PATCH 09/25] Move fix into _str_contains of ArrowExtensionArray --- pandas/core/arrays/arrow/array.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8a03b954559c7..221f620bac748 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2449,19 +2449,23 @@ def _convert_rank_result(self, result): def _str_contains(self, pat, case=True, flags=0, na=lib.no_default, regex=True): if isinstance(pat, re.Pattern): if flags != 0: - # fallback to python object implementation return BaseStringArrayMethods._str_contains( self, pat, case, flags, na, regex ) pat = pat.pattern regex = True + elif flags != 0: + raise NotImplementedError( + "ArrowExtensionArray does not support str.contains() with flags " + "for string patterns" + ) try: if not regex: result = pc.match_substring(self._pa_array, pat, ignore_case=not case) else: result = pc.match_substring_regex( - self._pa_array, pat, ignore_case=not case, options=None + self._pa_array, pat, ignore_case=not case ) return self._convert_bool_result(result, na=na, method_name="contains") except (AttributeError, NotImplementedError, pa.ArrowNotImplementedError): From cbab096d443d3066118a122b9c8a0d0aa4139d62 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Sun, 27 Jul 2025 03:21:39 +0530 Subject: [PATCH 10/25] Move fix into _str_contains of ArrowExtensionArray --- pandas/core/arrays/arrow/array.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 221f620bac748..66991ad25fba6 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2455,9 +2455,8 @@ def _str_contains(self, pat, case=True, flags=0, na=lib.no_default, regex=True): pat = pat.pattern regex = True elif flags != 0: - raise NotImplementedError( - "ArrowExtensionArray does not support str.contains() with flags " - "for string patterns" + return BaseStringArrayMethods._str_contains( + self, pat, case, flags, na, regex ) try: From d88f8d12f5cf0d88e03ab0db1c2074b4fa2a3da9 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Mon, 28 Jul 2025 16:24:40 +0530 Subject: [PATCH 11/25] BUG: Fix Series.str.contains with compiled regex and arrow strings (#61942) --- pandas/core/arrays/string_arrow.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 2ca12870709f0..7f3eae9807c09 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -344,6 +344,9 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): + if isinstance(pat, re.Pattern) and regex: + return super()._str_contains(pat, case, flags, na, regex) + if flags: return super()._str_contains(pat, case, flags, na, regex) From a0decbc2d13a1afbc31ca9200cd3818a9ee1ddfe Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Mon, 28 Jul 2025 16:29:20 +0530 Subject: [PATCH 12/25] Revert changes to pandas/core/arrays/arrow/array.py in PR --- pandas/core/arrays/arrow/array.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 66991ad25fba6..919453b29b7f9 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2446,32 +2446,6 @@ def _convert_int_result(self, result): def _convert_rank_result(self, result): return type(self)(result) - def _str_contains(self, pat, case=True, flags=0, na=lib.no_default, regex=True): - if isinstance(pat, re.Pattern): - if flags != 0: - return BaseStringArrayMethods._str_contains( - self, pat, case, flags, na, regex - ) - pat = pat.pattern - regex = True - elif flags != 0: - return BaseStringArrayMethods._str_contains( - self, pat, case, flags, na, regex - ) - - try: - if not regex: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - else: - result = pc.match_substring_regex( - self._pa_array, pat, ignore_case=not case - ) - return self._convert_bool_result(result, na=na, method_name="contains") - except (AttributeError, NotImplementedError, pa.ArrowNotImplementedError): - return BaseStringArrayMethods._str_contains( - self, pat, case, flags, na, regex - ) - def _str_count(self, pat: str, flags: int = 0) -> Self: if flags: raise NotImplementedError(f"count not implemented with {flags=}") From 8fc81e063ce8f00c3bf0a246700735c08ead8b0c Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Tue, 29 Jul 2025 01:26:47 +0530 Subject: [PATCH 13/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- doc/source/whatsnew/v2.3.2 | 11 +++++++++++ pandas/core/arrays/string_arrow.py | 5 +---- pandas/tests/strings/test_strings.py | 10 ++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 doc/source/whatsnew/v2.3.2 diff --git a/doc/source/whatsnew/v2.3.2 b/doc/source/whatsnew/v2.3.2 new file mode 100644 index 0000000000000..a870a8eaac698 --- /dev/null +++ b/doc/source/whatsnew/v2.3.2 @@ -0,0 +1,11 @@ +.. _whatsnew_232: + +These are the changes in pandas 2.3.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +Bug fixes +^^^^^^^^^ + +- Fixed ``Series.str.contains`` with compiled regex on Arrow string dtype, which now correctly delegates to the object-dtype implementation. (:issue:`61942`) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 7f3eae9807c09..01011018bb655 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -344,10 +344,7 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): - if isinstance(pat, re.Pattern) and regex: - return super()._str_contains(pat, case, flags, na, regex) - - if flags: + if (isinstance(pat, re.Pattern) and regex) or flags: return super()._str_contains(pat, case, flags, na, regex) return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 025f837982595..929e505e4ba75 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -2,10 +2,12 @@ datetime, timedelta, ) +import re import numpy as np import pytest +import pandas as pd from pandas import ( DataFrame, Index, @@ -176,6 +178,14 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.translate(table)) +def test_str_contains_compiled_regex_arrow(): + ser = Series(["foo", "bar", "baz", None], dtype="string[pyarrow]") + pat = re.compile(r"ba.") + result = ser.str.contains(pat) + expected = Series([False, True, True, pd.NA], dtype="boolean[pyarrow]") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "method, expected", [ From 8e226cd39909dfef2197ae798629539c8f09e2b0 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Tue, 29 Jul 2025 02:58:42 +0530 Subject: [PATCH 14/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/tests/strings/test_strings.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 929e505e4ba75..729e5e8e93729 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -7,8 +7,10 @@ import numpy as np import pytest -import pandas as pd +pytest.importorskip("pyarrow") + from pandas import ( + NA, DataFrame, Index, MultiIndex, @@ -179,10 +181,11 @@ def test_empty_str_methods(any_string_dtype): def test_str_contains_compiled_regex_arrow(): + # GH#61942 ser = Series(["foo", "bar", "baz", None], dtype="string[pyarrow]") pat = re.compile(r"ba.") result = ser.str.contains(pat) - expected = Series([False, True, True, pd.NA], dtype="boolean[pyarrow]") + expected = Series([False, True, True, NA], dtype="boolean[pyarrow]") tm.assert_series_equal(result, expected) From 6768fb16ec4f645bb86bd84faa91002af5a64197 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Tue, 29 Jul 2025 12:59:19 +0530 Subject: [PATCH 15/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/strings/test_strings.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 01011018bb655..86b80fcbb75ef 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -344,7 +344,7 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): - if (isinstance(pat, re.Pattern) and regex) or flags: + if regex and (isinstance(pat, re.Pattern) or flags): return super()._str_contains(pat, case, flags, na, regex) return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 729e5e8e93729..9b801c5316a9f 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -10,7 +10,6 @@ pytest.importorskip("pyarrow") from pandas import ( - NA, DataFrame, Index, MultiIndex, @@ -180,13 +179,14 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.translate(table)) -def test_str_contains_compiled_regex_arrow(): - # GH#61942 - ser = Series(["foo", "bar", "baz", None], dtype="string[pyarrow]") - pat = re.compile(r"ba.") +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_str_contains_compiled_regex_arrow_dtype(dtype): + ser = Series(["foo", "bar", "baz"], dtype=dtype) + pat = re.compile("ba.") result = ser.str.contains(pat) - expected = Series([False, True, True, NA], dtype="boolean[pyarrow]") - tm.assert_series_equal(result, expected) + assert str(result.dtype) == "bool[pyarrow]" + expected = Series([False, True, True], dtype="bool[pyarrow]") + tm.testing.assert_series_equal(result, expected) @pytest.mark.parametrize( From 05ae24f67273e17a98516dace061bb616fb9b43c Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Tue, 29 Jul 2025 16:23:32 +0530 Subject: [PATCH 16/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/core/arrays/string_arrow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 86b80fcbb75ef..ef6ade34d59d0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -344,8 +344,8 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): - if regex and (isinstance(pat, re.Pattern) or flags): - return super()._str_contains(pat, case, flags, na, regex) + if isinstance(pat, re.Pattern): + pat = pat.pattern return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) From 702384dc611139947490c7f46c390fb16d8dcc6b Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Tue, 29 Jul 2025 17:13:11 +0530 Subject: [PATCH 17/25] Revert changes to test_strings.py --- pandas/core/arrays/string_arrow.py | 4 ++-- pandas/tests/strings/test_strings.py | 13 ------------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ef6ade34d59d0..01011018bb655 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -344,8 +344,8 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): - if isinstance(pat, re.Pattern): - pat = pat.pattern + if (isinstance(pat, re.Pattern) and regex) or flags: + return super()._str_contains(pat, case, flags, na, regex) return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 9b801c5316a9f..025f837982595 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -2,13 +2,10 @@ datetime, timedelta, ) -import re import numpy as np import pytest -pytest.importorskip("pyarrow") - from pandas import ( DataFrame, Index, @@ -179,16 +176,6 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.translate(table)) -@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) -def test_str_contains_compiled_regex_arrow_dtype(dtype): - ser = Series(["foo", "bar", "baz"], dtype=dtype) - pat = re.compile("ba.") - result = ser.str.contains(pat) - assert str(result.dtype) == "bool[pyarrow]" - expected = Series([False, True, True], dtype="bool[pyarrow]") - tm.testing.assert_series_equal(result, expected) - - @pytest.mark.parametrize( "method, expected", [ From 0be9a18d60e7ac13d2bb0cab7c001c118248ca55 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 00:14:48 +0530 Subject: [PATCH 18/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/tests/strings/test_strings.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 025f837982595..ff7c2aa5b6db0 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -176,6 +176,16 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.translate(table)) +@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) +def test_str_contains_compiled_regex_arrow_dtype(dtype): + ser = Series(["foo", "bar", "baz"], dtype=dtype) + pat = re.compile("ba.") + result = ser.str.contains(pat) + assert str(result.dtype) == "bool[pyarrow]" + expected = Series([False, True, True], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "method, expected", [ From 4ddc7db4e52ac177c359f20b1f478f4926af080e Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 00:56:09 +0530 Subject: [PATCH 19/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/tests/strings/test_strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index ff7c2aa5b6db0..61f71c1c51075 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -2,6 +2,7 @@ datetime, timedelta, ) +import re import numpy as np import pytest From 9a7e640e335da5068c0a2442993c736e2791dd3a Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 02:03:51 +0530 Subject: [PATCH 20/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/core/arrays/string_arrow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 01011018bb655..0b88e7d422d04 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -344,8 +344,10 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): - if (isinstance(pat, re.Pattern) and regex) or flags: + if flags: return super()._str_contains(pat, case, flags, na, regex) + if isinstance(pat, re.pattern): + pat = pat.pattern return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) From b00fbe04f75ff11b9a273f3ef17412e9aa780848 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 02:46:23 +0530 Subject: [PATCH 21/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) and add whatsnew note --- pandas/core/arrays/string_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 0b88e7d422d04..ea13062448dfe 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -346,7 +346,7 @@ def _str_contains( ): if flags: return super()._str_contains(pat, case, flags, na, regex) - if isinstance(pat, re.pattern): + if isinstance(pat, re.Pattern): pat = pat.pattern return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) From 76f741c9a074836e41bd17f802f98c8920a88e32 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 03:37:23 +0530 Subject: [PATCH 22/25] Revert test_strings.py changes and remove accidental whatsnew file --- doc/source/whatsnew/v2.3.2 | 11 ----------- pandas/tests/strings/test_find_replace.py | 9 +++++++++ pandas/tests/strings/test_strings.py | 11 ----------- 3 files changed, 9 insertions(+), 22 deletions(-) delete mode 100644 doc/source/whatsnew/v2.3.2 diff --git a/doc/source/whatsnew/v2.3.2 b/doc/source/whatsnew/v2.3.2 deleted file mode 100644 index a870a8eaac698..0000000000000 --- a/doc/source/whatsnew/v2.3.2 +++ /dev/null @@ -1,11 +0,0 @@ -.. _whatsnew_232: - -These are the changes in pandas 2.3.2. See :ref:`release` for a full changelog -including other versions of pandas. - -{{ header }} - -Bug fixes -^^^^^^^^^ - -- Fixed ``Series.str.contains`` with compiled regex on Arrow string dtype, which now correctly delegates to the object-dtype implementation. (:issue:`61942`) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 30e6ebf0eed13..14a4223e658e1 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -281,6 +281,15 @@ def test_contains_nan(any_string_dtype): tm.assert_series_equal(result, expected) +def test_str_contains_compiled_regex_arrow_dtype(any_string_dtype): + ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype) + pat = re.compile("ba.") + result = ser.str.contains(pat) + assert str(result.dtype) == "bool[pyarrow]" + expected = Series([False, True, True], dtype="bool[pyarrow]") + tm.assert_series_equal(result, expected) + + # -------------------------------------------------------------------------------------- # str.startswith # -------------------------------------------------------------------------------------- diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 61f71c1c51075..025f837982595 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -2,7 +2,6 @@ datetime, timedelta, ) -import re import numpy as np import pytest @@ -177,16 +176,6 @@ def test_empty_str_methods(any_string_dtype): tm.assert_series_equal(empty_str, empty.str.translate(table)) -@pytest.mark.parametrize("dtype", ["string[pyarrow]"]) -def test_str_contains_compiled_regex_arrow_dtype(dtype): - ser = Series(["foo", "bar", "baz"], dtype=dtype) - pat = re.compile("ba.") - result = ser.str.contains(pat) - assert str(result.dtype) == "bool[pyarrow]" - expected = Series([False, True, True], dtype="bool[pyarrow]") - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize( "method, expected", [ From 8e65078e0b4867d15778327a74535b2000847e52 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 04:14:10 +0530 Subject: [PATCH 23/25] Revert test_strings.py changes and remove accidental whatsnew file --- pandas/tests/strings/test_find_replace.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 14a4223e658e1..f2615a4a252ff 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -282,11 +282,21 @@ def test_contains_nan(any_string_dtype): def test_str_contains_compiled_regex_arrow_dtype(any_string_dtype): + # GH#61942 ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype) pat = re.compile("ba.") result = ser.str.contains(pat) - assert str(result.dtype) == "bool[pyarrow]" - expected = Series([False, True, True], dtype="bool[pyarrow]") + # Determine expected dtype and values + if any_string_dtype == "string[pyarrow]": + expected_dtype = "bool[pyarrow]" + elif any_string_dtype == "string": + expected_dtype = "boolean" + elif any_string_dtype == "str": + expected_dtype = bool + else: + expected_dtype = object + expected = Series([False, True, True], dtype=expected_dtype) + assert str(result.dtype) == str(expected.dtype) tm.assert_series_equal(result, expected) From 0e620ca70d23943faff7528c464d3cb0206105e6 Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Wed, 30 Jul 2025 21:18:12 +0530 Subject: [PATCH 24/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) --- doc/source/whatsnew/v2.3.2.rst | 1 + pandas/tests/strings/test_find_replace.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst index 53a8d28687518..0d99ac289f6c9 100644 --- a/doc/source/whatsnew/v2.3.2.rst +++ b/doc/source/whatsnew/v2.3.2.rst @@ -26,6 +26,7 @@ Bug fixes "string" type in the JSON Table Schema for :class:`StringDtype` columns (:issue:`61889`) - Boolean operations (``|``, ``&``, ``^``) with bool-dtype objects on the left and :class:`StringDtype` objects on the right now cast the string to bool, with a deprecation warning (:issue:`60234`) +- Fixed ``Series.str.contains`` with compiled regex on Arrow string dtype, which now correctly delegates to the object-dtype implementation. (:issue:`61942`) .. --------------------------------------------------------------------------- .. _whatsnew_232.contributors: diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index f2615a4a252ff..21adf908238f3 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -283,18 +283,17 @@ def test_contains_nan(any_string_dtype): def test_str_contains_compiled_regex_arrow_dtype(any_string_dtype): # GH#61942 + if any_string_dtype == "string[pyarrow]": + pytest.importorskip("pyarrow") ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype) pat = re.compile("ba.") result = ser.str.contains(pat) # Determine expected dtype and values - if any_string_dtype == "string[pyarrow]": - expected_dtype = "bool[pyarrow]" - elif any_string_dtype == "string": - expected_dtype = "boolean" - elif any_string_dtype == "str": - expected_dtype = bool - else: - expected_dtype = object + expected_dtype = { + "string[pyarrow]": "bool[pyarrow]", + "string": "boolean", + "str": bool, + }.get(any_string_dtype, object) expected = Series([False, True, True], dtype=expected_dtype) assert str(result.dtype) == str(expected.dtype) tm.assert_series_equal(result, expected) From 915b38f6fac396dbba0f98ec142108e11dd6e1df Mon Sep 17 00:00:00 2001 From: Aniket Singh Yadav Date: Thu, 31 Jul 2025 04:05:19 +0530 Subject: [PATCH 25/25] BUG: Fix Series.str.contains with compiled regex on Arrow string dtype (#61942) --- pandas/tests/strings/test_find_replace.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 21adf908238f3..425030ed63fb5 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -283,8 +283,6 @@ def test_contains_nan(any_string_dtype): def test_str_contains_compiled_regex_arrow_dtype(any_string_dtype): # GH#61942 - if any_string_dtype == "string[pyarrow]": - pytest.importorskip("pyarrow") ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype) pat = re.compile("ba.") result = ser.str.contains(pat) @@ -295,7 +293,6 @@ def test_str_contains_compiled_regex_arrow_dtype(any_string_dtype): "str": bool, }.get(any_string_dtype, object) expected = Series([False, True, True], dtype=expected_dtype) - assert str(result.dtype) == str(expected.dtype) tm.assert_series_equal(result, expected)