From 261209fc57751adc71b36931fb0ada50a39ac806 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 26 Jul 2025 11:52:27 +0200 Subject: [PATCH] BUG: fix .str.isdigit to honor unicode superscript for older pyarrow --- doc/source/whatsnew/v2.3.2.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 7 +++++++ pandas/tests/strings/test_strings.py | 7 ++++--- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst index faa61cf4bd3bc..88bd63d8942ea 100644 --- a/doc/source/whatsnew/v2.3.2.rst +++ b/doc/source/whatsnew/v2.3.2.rst @@ -22,7 +22,8 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ -- +- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript + characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`) .. --------------------------------------------------------------------------- .. _whatsnew_232.contributors: diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 07cbf489cfe1c..ad91d60aae922 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -15,6 +15,7 @@ HAS_PYARROW, pa_version_under13p0, pa_version_under17p0, + pa_version_under21p0, ) if HAS_PYARROW: @@ -261,6 +262,12 @@ def _str_isdecimal(self): return self._convert_bool_result(result) def _str_isdigit(self): + if pa_version_under21p0: + # https://github.com/pandas-dev/pandas/issues/61466 + res_list = self._apply_elementwise(str.isdigit) + return self._convert_bool_result( + pa.chunked_array(res_list, type=pa.bool_()) + ) result = pc.utf8_is_digit(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 025f837982595..2ed00703212ca 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -240,8 +240,9 @@ def 
test_ismethods(method, expected, any_string_dtype): @pytest.mark.parametrize( "method, expected", [ - ("isnumeric", [False, True, True, False, True, True, False]), - ("isdecimal", [False, True, False, False, False, True, False]), + ("isnumeric", [False, True, True, True, False, True, True, False]), + ("isdecimal", [False, True, False, False, False, False, True, False]), + ("isdigit", [False, True, True, False, False, False, True, False]), ], ) def test_isnumeric_unicode(method, expected, any_string_dtype): @@ -250,7 +251,7 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: ３ Em 3 # noqa: RUF003 ser = Series( - ["A", "3", "¼", "★", "፸", "３", "four"], # noqa: RUF001 + ["A", "3", "³", "¼", "★", "፸", "３", "four"], # noqa: RUF001 dtype=any_string_dtype, ) expected_dtype = (