From 93ebe3d1cfa280db8df58bc4e6fd829cb7c56336 Mon Sep 17 00:00:00 2001 From: "Falk B. Schimweg" Date: Sat, 4 May 2024 10:39:43 +0200 Subject: [PATCH] Add totality validation #58547 --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 8 ++ pandas/core/reshape/merge.py | 137 +++++++++++++++-------- pandas/tests/reshape/merge/test_merge.py | 89 ++++++++++++++- 4 files changed, 190 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 278971ef88a0f..94eddedf5217f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -32,6 +32,7 @@ Other enhancements - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`) - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) +- :meth:`DataFrame.merge` now supports validation of (left-/right-)totality (:issue:`58547`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96943eb71c7bd..6512813286a85 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -393,6 +393,14 @@ * "many_to_one" or "m:1": check if merge keys are unique in right dataset. * "many_to_many" or "m:m": allowed, but does not result in checks. 
+ * "total": check if all merge keys on each side are also present + on the other side + * "left_total": check if merge keys on the left side are all present + on the right side + * "right_total": check if merge keys on the right side are all present + on the left side + + More than one merge type can be passed when separated by a ``+``. Returns ------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e84c2135b82..76ace8e8e41ad 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1623,62 +1623,111 @@ def _validate_left_right_on(self, left_on, right_on): @final def _validate_validate_kwd(self, validate: str) -> None: + # Split validation string + validations = validate.split("+") + # Check uniqueness of each if self.left_index: - left_unique = self.orig_left.index.is_unique + left_merge_index = self.orig_left.index else: - left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique + left_merge_index = MultiIndex.from_arrays(self.left_join_keys) + left_unique = left_merge_index.is_unique if self.right_index: - right_unique = self.orig_right.index.is_unique + right_merge_index = self.orig_right.index else: - right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique + right_merge_index = MultiIndex.from_arrays(self.right_join_keys) + right_unique = right_merge_index.is_unique + + # Check totality of each + intersect_index = left_merge_index.intersection(right_merge_index) + right_total = right_merge_index.sort_values().drop(intersect_index).empty + left_total = left_merge_index.sort_values().drop(intersect_index).empty # Check data integrity - if validate in ["one_to_one", "1:1"]: - if not left_unique and not right_unique: - raise MergeError( - "Merge keys are not unique in either left " - "or right dataset; not a one-to-one merge" - ) - if not left_unique: - raise MergeError( - "Merge keys are not unique in left dataset; not a one-to-one merge" - ) - if not right_unique: - raise 
MergeError( - "Merge keys are not unique in right dataset; not a one-to-one merge" - ) + for validation in validations: + if validation in ["one_to_one", "1:1"]: + if not left_unique and not right_unique: + raise MergeError( + "Merge keys are not unique in either left " + "or right dataset; not a one-to-one merge" + ) + if not left_unique: + raise MergeError( + "Merge keys are not unique in left dataset; " + "not a one-to-one merge" + ) + if not right_unique: + raise MergeError( + "Merge keys are not unique in right dataset; " + "not a one-to-one merge" + ) - elif validate in ["one_to_many", "1:m"]: - if not left_unique: - raise MergeError( - "Merge keys are not unique in left dataset; not a one-to-many merge" - ) + elif validation in ["one_to_many", "1:m"]: + if not left_unique: + raise MergeError( + "Merge keys are not unique in left dataset; " + "not a one-to-many merge" + ) - elif validate in ["many_to_one", "m:1"]: - if not right_unique: - raise MergeError( - "Merge keys are not unique in right dataset; " - "not a many-to-one merge" - ) + elif validation in ["many_to_one", "m:1"]: + if not right_unique: + raise MergeError( + "Merge keys are not unique in right dataset; " + "not a many-to-one merge" + ) - elif validate in ["many_to_many", "m:m"]: - pass + elif validation in ["many_to_many", "m:m"]: + pass - else: - raise ValueError( - f'"{validate}" is not a valid argument. ' - "Valid arguments are:\n" - '- "1:1"\n' - '- "1:m"\n' - '- "m:1"\n' - '- "m:m"\n' - '- "one_to_one"\n' - '- "one_to_many"\n' - '- "many_to_one"\n' - '- "many_to_many"' - ) + elif validation in ["total"]: + if not left_total and not right_total: + raise MergeError( + "Neither the merge keys in the left dataset are all present " + "in the right dataset, nor the merge keys in the right " + "dataset all present in the left dataset; not a total merge." 
+ ) + if not left_total: + raise MergeError( + "Merge keys in left dataset are not all present in the right " + "dataset; not a total merge" + ) + if not right_total: + raise MergeError( + "Merge keys in right dataset are not all present " + "in the left dataset; not a total merge" + ) + + elif validation in ["left_total"]: + if not left_total: + raise MergeError( + "Merge keys in left dataset are not all present " + "in the right dataset; not a left total merge" + ) + + elif validation in ["right_total"]: + if not right_total: + raise MergeError( + "Merge keys in right dataset are not all present " + "in the left dataset; not a right total merge" + ) + + else: + raise ValueError( + f'"{validation}" is not a valid argument. ' + "Valid arguments are:\n" + '- "1:1"\n' + '- "1:m"\n' + '- "m:1"\n' + '- "m:m"\n' + '- "one_to_one"\n' + '- "one_to_many"\n' + '- "many_to_one"\n' + '- "many_to_many"\n' + '- "total"\n' + '- "left_total"\n' + '- "right_total"' + ) def get_join_indexers( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5c5c06dea0008..648ea8ba59981 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1203,6 +1203,44 @@ def test_validation(self): ) tm.assert_frame_equal(result, expected_3) + # Make sure left totality works + result = merge( + left, + right, + left_index=True, + right_index=True, + validate="one_to_one+left_total", + ) + tm.assert_frame_equal(result, expected) + + # Make sure right totality raises exception + msg = ( + "Merge keys in right dataset are not all present in the left dataset; " + "not a right total merge" + ) + with pytest.raises(MergeError, match=msg): + merge( + left, + right, + left_index=True, + right_index=True, + validate="one_to_one+right_total", + ) + + # Make sure general totality raises exception + msg = ( + "Merge keys in right dataset are not all present in the left dataset; " + "not a total merge" + ) + with 
pytest.raises(MergeError, match=msg): + merge( + left, + right, + left_index=True, + right_index=True, + validate="one_to_one+total", + ) + # Dups on right right_w_dups = concat([right, DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])]) merge( @@ -1213,6 +1251,14 @@ def test_validation(self): validate="one_to_many", ) + merge( + left, + right_w_dups, + left_index=True, + right_index=True, + validate="left_total", + ) + msg = "Merge keys are not unique in right dataset; not a one-to-one merge" with pytest.raises(MergeError, match=msg): merge( @@ -1237,6 +1283,13 @@ def test_validation(self): right_index=True, validate="many_to_one", ) + merge( + left_w_dups, + right, + left_index=True, + right_index=True, + validate="left_total", + ) msg = "Merge keys are not unique in left dataset; not a one-to-one merge" with pytest.raises(MergeError, match=msg): @@ -1279,7 +1332,10 @@ def test_validation(self): '- "one_to_one"\n' '- "one_to_many"\n' '- "many_to_one"\n' - '- "many_to_many"' + '- "many_to_many"\n' + '- "total"\n' + '- "left_total"\n' + '- "right_total"' ) with pytest.raises(ValueError, match=msg): merge(left, right, on="a", validate="jibberish") @@ -1323,6 +1379,37 @@ def test_validation(self): result = merge(left, right, on=["a", "b"], validate="1:1") tm.assert_frame_equal(result, expected_multi) + right_total_ext = concat( + [right, DataFrame({"a": ["b"], "b": [1], "d": ["neigh"]}, index=[3])], + sort=True, + ) + expected_total_ext = DataFrame( + { + "a": ["a", "a", "b", "b"], + "b": [0, 1, 0, 1], + "c": ["cat", "dog", "weasel", "horse"], + "d": ["meow", "bark", "um... 
weasel noise?", "neigh"], + }, + index=range(4), + ) + result = merge(left, right_total_ext, on=["a", "b"], validate="1:1+total") + tm.assert_frame_equal(result, expected_total_ext) + + # Ensure not left total raises error + right_reduced = right.drop_duplicates(subset=["b"]) + + msg = ( + "Merge keys in left dataset are not all present in the right dataset; " + "not a left total merge" + ) + with pytest.raises(MergeError, match=msg): + merge( + left, + right_reduced, + on=["a", "b"], + validate="left_total", + ) + def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = DataFrame({"a": [], "b": [], "c": []})