Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 9ffb136

Browse files
authored
Merge pull request #585 from dlawin/issue_527
continue --dbt diff when null PKs exist
2 parents 5a8ec27 + 253cc68 commit 9ffb136

File tree

4 files changed

+27
-3
lines changed

4 files changed

+27
-3
lines changed

data_diff/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ def diff_tables(
7979
materialize_all_rows: bool = False,
8080
# Maximum number of rows to write when materializing, per thread. (joindiff only)
8181
table_write_limit: int = TABLE_WRITE_LIMIT,
82+
# Skips diffing any rows with null keys. (joindiff only)
83+
skip_null_keys: bool = False,
8284
) -> Iterator:
8385
"""Finds the diff between table1 and table2.
8486
@@ -107,6 +109,7 @@ def diff_tables(
107109
materialize_to_table (Union[str, DbPath], optional): Path of new table to write diff results to. Disabled if not provided. Used for `JOINDIFF`.
108110
materialize_all_rows (bool): Materialize every row, not just those that are different. (used for `JOINDIFF`. default: False)
109111
table_write_limit (int): Maximum number of rows to write when materializing, per thread.
112+
skip_null_keys (bool): Skips diffing any rows with null PKs (displays a warning if any are null) (used for `JOINDIFF`. default: False)
110113
111114
Note:
112115
The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
@@ -168,6 +171,7 @@ def diff_tables(
168171
materialize_to_table=materialize_to_table,
169172
materialize_all_rows=materialize_all_rows,
170173
table_write_limit=table_write_limit,
174+
skip_null_keys=skip_null_keys,
171175
)
172176
else:
173177
raise ValueError(f"Unknown algorithm: {algorithm}")

data_diff/dbt.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ def _local_diff(diff_vars: TDiffVars) -> None:
238238
algorithm=Algorithm.JOINDIFF,
239239
extra_columns=extra_columns,
240240
where=diff_vars.where_filter,
241+
skip_null_keys=True,
241242
)
242243

243244
if list(diff):

data_diff/joindiff_tables.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,13 +131,15 @@ class JoinDiffer(TableDiffer):
131131
materialize_to_table (DbPath, optional): Path of new table to write diff results to. Disabled if not provided.
132132
materialize_all_rows (bool): Materialize every row, not just those that are different. (default: False)
133133
table_write_limit (int): Maximum number of rows to write when materializing, per thread.
134+
skip_null_keys (bool): Skips diffing any rows with null PKs (displays a warning if any are null) (default: False)
134135
"""
135136

136137
validate_unique_key: bool = True
137138
sample_exclusive_rows: bool = False
138139
materialize_to_table: DbPath = None
139140
materialize_all_rows: bool = False
140141
table_write_limit: int = TABLE_WRITE_LIMIT
142+
skip_null_keys: bool = False
141143

142144
stats: dict = {}
143145

@@ -209,7 +211,11 @@ def _diff_segments(
209211
if is_xa and is_xb:
210212
# Can't both be exclusive, meaning a pk is NULL
211213
# This can happen if the explicit null test didn't finish running yet
212-
raise ValueError("NULL values in one or more primary keys")
214+
if self.skip_null_keys:
215+
# warning is thrown in explicit null test
216+
continue
217+
else:
218+
raise ValueError("NULL values in one or more primary keys")
213219
# _is_diff, a_row, b_row = _slice_tuple(x, len(is_diff_cols), len(a_cols), len(b_cols))
214220
_is_diff, ab_row = _slice_tuple(x, len(is_diff_cols), len(a_cols) + len(b_cols))
215221
a_row, b_row = ab_row[::2], ab_row[1::2]
@@ -252,7 +258,12 @@ def _test_null_keys(self, table1, table2):
252258
q = t.select(*this[key_columns]).where(or_(this[k] == None for k in key_columns))
253259
nulls = ts.database.query(q, list)
254260
if nulls:
255-
raise ValueError(f"NULL values in one or more primary keys of {ts.table_path}")
261+
if self.skip_null_keys:
262+
logger.warning(
263+
f"NULL values in one or more primary keys of {ts.table_path}. Skipping rows with NULL keys."
264+
)
265+
else:
266+
raise ValueError(f"NULL values in one or more primary keys of {ts.table_path}")
256267

257268
def _collect_stats(self, i, table_seg: TableSegment, info_tree: InfoTree):
258269
logger.debug(f"Collecting stats for table #{i}")

tests/test_dbt.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ def test_local_diff(self, mock_diff_tables):
505505
algorithm=Algorithm.JOINDIFF,
506506
extra_columns=ANY,
507507
where=where,
508+
skip_null_keys=True,
508509
)
509510
self.assertEqual(len(mock_diff_tables.call_args[1]["extra_columns"]), 2)
510511
self.assertEqual(mock_connect.call_count, 2)
@@ -549,6 +550,7 @@ def test_local_diff_types_differ(self, mock_diff_tables):
549550
algorithm=Algorithm.JOINDIFF,
550551
extra_columns=ANY,
551552
where=where,
553+
skip_null_keys=True,
552554
)
553555
self.assertEqual(len(mock_diff_tables.call_args[1]["extra_columns"]), 1)
554556
self.assertEqual(mock_connect.call_count, 2)
@@ -584,7 +586,13 @@ def test_local_diff_no_diffs(self, mock_diff_tables):
584586
_local_diff(diff_vars)
585587

586588
mock_diff_tables.assert_called_once_with(
587-
mock_table1, mock_table2, threaded=True, algorithm=Algorithm.JOINDIFF, extra_columns=ANY, where=where
589+
mock_table1,
590+
mock_table2,
591+
threaded=True,
592+
algorithm=Algorithm.JOINDIFF,
593+
extra_columns=ANY,
594+
where=where,
595+
skip_null_keys=True,
588596
)
589597
self.assertEqual(len(mock_diff_tables.call_args[1]["extra_columns"]), 2)
590598
self.assertEqual(mock_connect.call_count, 2)

0 commit comments

Comments
 (0)