-
-
Notifications
You must be signed in to change notification settings - Fork 19.3k
DEPR: Deprecate non-ISO date string formats in DatetimeIndex.loc #62991
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ccb5d50
7906b17
f3a1879
d12229d
8f84b2b
dcc5f93
467e1cb
d0ff888
2ac6b5b
3e16e12
5223f3f
ab4b56a
23ba779
936847d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
|
|
||
| import datetime as dt | ||
| import operator | ||
| import re | ||
| from typing import ( | ||
| TYPE_CHECKING, | ||
| Self, | ||
|
|
@@ -110,6 +111,42 @@ def _new_DatetimeIndex(cls, d): | |
| return result | ||
|
|
||
|
|
||
| def _is_iso_format_string(date_str: str) -> bool: | ||
| """ | ||
| Check if a date string follows ISO8601 format. | ||
|
|
||
| Uses date.fromisoformat() to validate full ISO dates, with fallback to regex | ||
| for reduced precision dates (YYYY or YYYY-MM) which are not supported by | ||
| fromisoformat(). | ||
|
|
||
| Examples of ISO format (True): | ||
| - 2024 (reduced precision) | ||
| - 2024-01 (reduced precision) | ||
| - 2024-01-10 | ||
| - 2024-01-10T00:00:00 | ||
|
|
||
| Examples of non-ISO format (False): | ||
| - 2024/01/10 (/ separator) | ||
| - 2024 01 10 (space separator) | ||
| - 01/10/2024 (MM/DD/YYYY) | ||
| - 10/01/2024 (DD/MM/YYYY) | ||
| - 01-10-2024 (MM-DD-YYYY) | ||
| """ | ||
| try: | ||
| # Standard library validates full ISO dates (YYYY-MM-DD format) | ||
| dt.date.fromisoformat(date_str) | ||
| return True | ||
| except (ValueError, TypeError): | ||
| # Fallback regex for reduced precision dates not supported by fromisoformat() | ||
| # Checks if string starts with ISO pattern (YYYY, YYYY-MM, YYYY-MM-DD, etc.) | ||
| # Pattern: ^\d{4}(?:-|T|$) | ||
| # - Requires exactly 4 digits at start (year) | ||
| # - Followed by: hyphen (YYYY-), T (YYYY-T...), or end (YYYY) | ||
| # Examples that match: "2024", "2024-01", "2024-01-10", "2024-01-10T00:00:00" | ||
| # Examples that don't: "01/10/2024", "2024 01 10", "1/1/2024" | ||
| return re.match(r"^\d{4}(?:-|T|$)", date_str) is not None | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This regular expression seems like it would catch way more than what is expected. Wouldn't this match something like |
||
|
|
||
|
|
||
| @inherit_names( | ||
| DatetimeArray._field_ops | ||
| + [ | ||
|
|
@@ -613,6 +650,14 @@ def get_loc(self, key): | |
| parsed, reso = self._parse_with_reso(key) | ||
| except ValueError as err: | ||
| raise KeyError(key) from err | ||
| # GH#58302 - Deprecate non-ISO string formats in .loc indexing | ||
| if not _is_iso_format_string(key): | ||
| msg = ( | ||
| "Parsing non-ISO datetime strings in .loc is deprecated " | ||
| "and will be removed in a future version. Use ISO format " | ||
| f"(YYYY-MM-DD) instead. Got '{key}'." | ||
| ) | ||
| warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level()) | ||
| self._disallow_mismatched_indexing(parsed) | ||
|
|
||
| if self._can_partial_date_slice(reso): | ||
|
|
@@ -688,6 +733,23 @@ def slice_indexer(self, start=None, end=None, step=None): | |
| def check_str_or_none(point) -> bool: | ||
| return point is not None and not isinstance(point, str) | ||
|
|
||
| # GH#58302 - Deprecate non-ISO string formats in .loc indexing | ||
| if isinstance(start, str) and not _is_iso_format_string(start): | ||
| msg = ( | ||
| "Parsing non-ISO datetime strings in .loc is deprecated " | ||
| "and will be removed in a future version. Use ISO format " | ||
| f"(YYYY-MM-DD) instead. Got '{start}'." | ||
| ) | ||
| warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level()) | ||
|
|
||
| if isinstance(end, str) and not _is_iso_format_string(end): | ||
| msg = ( | ||
| "Parsing non-ISO datetime strings in .loc is deprecated " | ||
| "and will be removed in a future version. Use ISO format " | ||
| f"(YYYY-MM-DD) instead. Got '{end}'." | ||
| ) | ||
| warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level()) | ||
|
|
||
| # GH#33146 if start and end are combinations of str and None and Index is not | ||
| # monotonic, we can not use Index.slice_indexer because it does not honor the | ||
| # actual elements, is only searching for start and end | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,6 +72,9 @@ def seed_df(seed_nans, n, m): | |
| @pytest.mark.parametrize("bins", [None, [0, 5]], ids=repr) | ||
| @pytest.mark.parametrize("isort", [True, False]) | ||
| @pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")]) | ||
| @pytest.mark.filterwarnings( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why does this test need to filter warnings? It seems unrelated to the change?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When we group by keys='2nd' (the date column - from parameterized tests), it triggers the deprecation warning internally during the groupby operation. Without it, those test cases fail in CI. So, added the filterwarnings.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm so is that a problem with the groupby internals or the test data? This feels like something we shouldn't have to filter on this test |
||
| "ignore:Parsing non-ISO datetime strings:pandas.errors.Pandas4Warning" | ||
| ) | ||
| def test_series_groupby_value_counts( | ||
| seed_nans, | ||
| num_rows, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2851,6 +2851,9 @@ def test_groupby_with_Time_Grouper(unit): | |
| tm.assert_frame_equal(result, expected_output) | ||
|
|
||
|
|
||
| @pytest.mark.filterwarnings( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is another one that I don't think should have warnings |
||
| "ignore:Parsing non-ISO datetime strings:pandas.errors.Pandas4Warning" | ||
| ) | ||
| def test_groupby_series_with_datetimeindex_month_name(): | ||
| # GH 48509 | ||
| s = Series([0, 1, 0], index=date_range("2022-01-01", periods=3), name="jan") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,8 @@ | |
| import numpy as np | ||
| import pytest | ||
|
|
||
| from pandas.errors import Pandas4Warning | ||
|
|
||
| from pandas import ( | ||
| DataFrame, | ||
| DatetimeIndex, | ||
|
|
@@ -19,6 +21,10 @@ | |
|
|
||
|
|
||
| class TestSlicing: | ||
| pytestmark = pytest.mark.filterwarnings( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this apply to the entire class? Can we apply to the failing tests instead? |
||
| "ignore:Parsing non-ISO datetime strings:pandas.errors.Pandas4Warning" | ||
| ) | ||
|
|
||
| def test_string_index_series_name_converted(self): | ||
| # GH#1644 | ||
| df = DataFrame( | ||
|
|
@@ -464,3 +470,94 @@ def test_slice_reduce_to_series(self): | |
| ) | ||
| result = df.loc["2000", "A"] | ||
| tm.assert_series_equal(result, expected) | ||
|
|
||
|
|
||
| class TestDatetimeIndexNonISODeprecation: | ||
| """Tests for deprecation of non-ISO string formats in .loc indexing. GH#58302""" | ||
|
|
||
| @pytest.fixture | ||
| def ser_daily(self): | ||
| """Create a Series with daily DatetimeIndex for testing.""" | ||
| return Series( | ||
| range(15), | ||
| index=DatetimeIndex(date_range(start="2024-01-01", freq="D", periods=15)), | ||
| ) | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "date_string", | ||
| [ | ||
| "1/10/2024", # MM/DD/YYYY format | ||
| "01/10/2024", # MM/DD/YYYY format with leading zero | ||
| ], | ||
| ) | ||
| def test_loc_indexing_non_iso_single_key_deprecation(self, ser_daily, date_string): | ||
| # GH#58302 | ||
| msg = "Parsing non-ISO datetime strings in .loc is deprecated" | ||
|
|
||
| with tm.assert_produces_warning(Pandas4Warning, match=msg): | ||
| result = ser_daily.loc[date_string] | ||
| assert result == 9 | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "date_string,expected", | ||
| [ | ||
| ("2024-01-10", 9), # YYYY-MM-DD (ISO format) | ||
| ], | ||
| ) | ||
| def test_loc_indexing_iso_format_no_warning(self, ser_daily, date_string, expected): | ||
| # GH#58302 - ISO format (YYYY-MM-DD) should NOT warn | ||
| with tm.assert_produces_warning(None): | ||
| result = ser_daily.loc[date_string] | ||
| assert result == expected | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "start_string", | ||
| [ | ||
| "1/10/2024", # MM/DD/YYYY format | ||
| "01/10/2024", # MM/DD/YYYY format with leading zero | ||
| ], | ||
| ) | ||
| def test_loc_slicing_non_iso_start_deprecation(self, ser_daily, start_string): | ||
| # GH#58302 - Non-ISO start in slice should warn | ||
| msg = "Parsing non-ISO datetime strings in .loc is deprecated" | ||
|
|
||
| with tm.assert_produces_warning(Pandas4Warning, match=msg): | ||
| result = ser_daily.loc[start_string:"2024-01-15"] | ||
| assert len(result) > 0 | ||
|
|
||
| @pytest.mark.parametrize( | ||
| "end_string", | ||
| [ | ||
| "5-01-2024", # DD-MM-YYYY format | ||
| "05-01-2024", # DD-MM-YYYY format with leading zero | ||
| ], | ||
| ) | ||
| def test_loc_slicing_non_iso_end_deprecation(self, ser_daily, end_string): | ||
| # GH#58302 - Non-ISO end in slice should warn | ||
| msg = "Parsing non-ISO datetime strings in .loc is deprecated" | ||
|
|
||
| with tm.assert_produces_warning(Pandas4Warning, match=msg): | ||
| result = ser_daily.loc["2024-01-01":end_string] | ||
| assert len(result) > 0 | ||
|
|
||
| def test_loc_slicing_both_non_iso_deprecation(self, ser_daily): | ||
| # GH#58302 - Both non-ISO should warn (twice) | ||
| msg = "Parsing non-ISO datetime strings in .loc is deprecated" | ||
|
|
||
| with tm.assert_produces_warning( | ||
| Pandas4Warning, match=msg, check_stacklevel=False | ||
| ): | ||
| result = ser_daily.loc["1/10/2024":"5-01-2024"] | ||
| assert len(result) > 0 | ||
|
|
||
| def test_loc_slicing_iso_formats_no_warning(self, ser_daily): | ||
| # GH#58302 - ISO slice formats should NOT warn | ||
| with tm.assert_produces_warning(None): | ||
| result = ser_daily.loc["2024-01-05":"2024-01-10"] | ||
| assert len(result) == 6 | ||
|
|
||
| def test_loc_non_string_keys_no_warning(self, ser_daily): | ||
| # GH#58302 - Non-string keys should not warn | ||
| with tm.assert_produces_warning(None): | ||
| result = ser_daily.loc[Timestamp("2024-01-10")] | ||
| assert result == 9 | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Credit to @stefmolin for the original idea, but I don't think we should try and roll our own regex here if we can avoid it.
I see the standard library provides
date.fromisostring, although that is documented to not work with "Reduced Precision" dates:https://docs.python.org/3/library/datetime.html
Even still I wonder if we can't use that first and only fallback when it fails
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @stefmolin @WillAyd!. Switched to using date.fromisoformat() like you suggested. Added a regex fallback to handle the reduced precision dates (YYYY and YYYY-MM) that fromisoformat doesn't support .