From 920aa107baee87f8c4fb4e74e9e1593c53b925b8 Mon Sep 17 00:00:00 2001 From: Khemkaran Date: Sat, 26 Jul 2025 19:04:29 +0530 Subject: [PATCH 1/5] str.fullmatch() and str.match() with compiled regex fix issue 61952 --- pandas/core/arrays/_arrow_string_mixins.py | 8 +++++++- pandas/tests/extension/test_arrow.py | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 07cbf489cfe1c..a64850d4ff1a8 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -304,11 +304,14 @@ def _str_contains( def _str_match( self, - pat: str, + pat, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): + if isinstance(pat, re.Pattern): + # GH#61952 + pat = pat.pattern if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) @@ -320,6 +323,9 @@ def _str_fullmatch( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): + if isinstance(pat, re.Pattern): + # GH#61952 + pat = pat.pattern if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4d766d6664218..efc357a05b64d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1857,6 +1857,10 @@ def test_str_repeat(): ["ab", False, True, [True, True]], ["a[a-z]{1}", False, None, [True, None]], ["A[a-z]{1}", True, None, [False, None]], + # GH#61952 + [re.compile(r"ab"), False, None, [True, None]], + [re.compile(r"Abc"), True, None, [False, None]], + [re.compile(r"a[a-z]{1}"), False, None, [True, None]], ], ) def test_str_match(pat, case, na, exp): @@ -1880,6 +1884,10 @@ def test_str_match(pat, case, na, exp): ["abc\\$", False, None, [False, True, False, None]], ["Abc$", True, None, [False, False, False, None]], 
["Abc\\$", True, None, [False, False, False, None]], + # GH#61952 + [re.compile(r"abc"), False, None, [True, True, False, None]], + [re.compile(r"abc$"), False, None, [True, False, False, None]], + [re.compile(r"a[a-z]{2}"), False, None, [True, True, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): From bf6da3b76d9e61557a6058560a05e67e6c5ffe24 Mon Sep 17 00:00:00 2001 From: Khemkaran Date: Sun, 27 Jul 2025 14:59:41 +0530 Subject: [PATCH 2/5] moved new tests to strings/test_find_replace.py --- pandas/core/strings/object_array.py | 8 +++++--- pandas/tests/extension/test_arrow.py | 8 -------- pandas/tests/strings/test_find_replace.py | 20 ++++++++++++++++++++ 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 0adb7b51cf2b7..242f34bac1bfd 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -248,14 +248,15 @@ def rep(x, r): def _str_match( self, - pat: str, + pat, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): if not case: flags |= re.IGNORECASE - + if isinstance(pat, re.Pattern): + pat = pat.pattern regex = re.compile(pat, flags=flags) f = lambda x: regex.match(x) is not None @@ -270,7 +271,8 @@ def _str_fullmatch( ): if not case: flags |= re.IGNORECASE - + if isinstance(pat, re.Pattern): + pat = pat.pattern regex = re.compile(pat, flags=flags) f = lambda x: regex.fullmatch(x) is not None diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index efc357a05b64d..4d766d6664218 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1857,10 +1857,6 @@ def test_str_repeat(): ["ab", False, True, [True, True]], ["a[a-z]{1}", False, None, [True, None]], ["A[a-z]{1}", True, None, [False, None]], - # GH#61952 - [re.compile(r"ab"), False, None, [True, None]], - [re.compile(r"Abc"), True, None, [False, None]], - 
[re.compile(r"a[a-z]{1}"), False, None, [True, None]], ], ) def test_str_match(pat, case, na, exp): @@ -1884,10 +1880,6 @@ def test_str_match(pat, case, na, exp): ["abc\\$", False, None, [False, True, False, None]], ["Abc$", True, None, [False, False, False, None]], ["Abc\\$", True, None, [False, False, False, None]], - # GH#61952 - [re.compile(r"abc"), False, None, [True, True, False, None]], - [re.compile(r"abc$"), False, None, [True, False, False, None]], - [re.compile(r"a[a-z]{2}"), False, None, [True, True, False, None]], ], ) def test_str_fullmatch(pat, case, na, exp): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 30e6ebf0eed13..55a4f2cb677d1 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -818,6 +818,16 @@ def test_match_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) +def test_match_compiled_regex(any_string_dtype): + values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) + result = values.str.match(re.compile(r"ab"), case=False) + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, True, True, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # -------------------------------------------------------------------------------------- # str.fullmatch # -------------------------------------------------------------------------------------- @@ -887,6 +897,16 @@ def test_fullmatch_case_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) +def test_fullmatch_compiled_regex(any_string_dtype): + values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) + result = values.str.fullmatch(re.compile(r"ab"), case=False) + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + expected = Series([True, True, False, False], dtype=expected_dtype) + 
tm.assert_series_equal(result, expected) + + # -------------------------------------------------------------------------------------- # str.findall # -------------------------------------------------------------------------------------- From f08bfd12e0098d6b624eaa432e150dcb0ecd784c Mon Sep 17 00:00:00 2001 From: Khemkaran Date: Sun, 27 Jul 2025 15:20:06 +0530 Subject: [PATCH 3/5] added docs to accessor.py/match() --- pandas/core/strings/accessor.py | 2 +- pandas/tests/strings/test_find_replace.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 14dadd9b41772..e39aff7c418f5 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1374,7 +1374,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): Parameters ---------- pat : str - Character sequence. + Character sequence or regular expression. case : bool, default True If True, case sensitive. 
flags : int, default 0 (no flags) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 55a4f2cb677d1..567ef315366b1 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -819,6 +819,7 @@ def test_match_case_kwarg(any_string_dtype): def test_match_compiled_regex(any_string_dtype): + # GH#61952 values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.match(re.compile(r"ab"), case=False) expected_dtype = ( @@ -898,6 +899,7 @@ def test_fullmatch_case_kwarg(any_string_dtype): def test_fullmatch_compiled_regex(any_string_dtype): + # GH#61952 values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) result = values.str.fullmatch(re.compile(r"ab"), case=False) expected_dtype = ( From a336ccbf7af45c70a1e2b3eed9ea79d918da676e Mon Sep 17 00:00:00 2001 From: Khemkaran Date: Wed, 30 Jul 2025 12:07:28 +0530 Subject: [PATCH 4/5] added type annotation and minor docstring changes --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- pandas/core/strings/accessor.py | 2 +- pandas/core/strings/object_array.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index a64850d4ff1a8..54fec5769e065 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -304,7 +304,7 @@ def _str_contains( def _str_match( self, - pat, + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 510187a63fe3a..21e6e2efbe778 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1361,7 +1361,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0, na=lib.no_default): Parameters ---------- - pat : str + pat : str or compiled regex Character 
sequence or regular expression. case : bool, default True If True, case sensitive. diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 242f34bac1bfd..c1d81fc3d7223 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -248,7 +248,7 @@ def rep(x, r): def _str_match( self, - pat, + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, From 4aaa8893dd0f49494784c2a7d3b1e21b8bb407bb Mon Sep 17 00:00:00 2001 From: Khemkaran Date: Wed, 30 Jul 2025 14:44:01 +0530 Subject: [PATCH 5/5] fixed mypy check --- pandas/core/arrays/_arrow_string_mixins.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 54fec5769e065..5dda2d914366c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -312,13 +312,13 @@ def _str_match( if isinstance(pat, re.Pattern): # GH#61952 pat = pat.pattern - if not pat.startswith("^"): + if isinstance(pat, str) and not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) def _str_fullmatch( self, - pat, + pat: str | re.Pattern, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, @@ -326,7 +326,7 @@ def _str_fullmatch( if isinstance(pat, re.Pattern): # GH#61952 pat = pat.pattern - if not pat.endswith("$") or pat.endswith("\\$"): + if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")): pat = f"{pat}$" return self._str_match(pat, case, flags, na)