From 3080dd09d5893975bb77712c8d5ec48f48b1a0b0 Mon Sep 17 00:00:00 2001 From: Ilia Pinchuk Date: Fri, 25 Nov 2022 15:05:43 +0600 Subject: [PATCH 1/2] return all duplicated rows --- data_diff/hashdiff_tables.py | 7 ++++-- tests/test_diff_tables.py | 44 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/data_diff/hashdiff_tables.py b/data_diff/hashdiff_tables.py index 0395506c..bac8fd60 100644 --- a/data_diff/hashdiff_tables.py +++ b/data_diff/hashdiff_tables.py @@ -29,10 +29,13 @@ def diff_sets(a: set, b: set) -> Iterator: s2 = set(b) d = defaultdict(list) + diff_ab = (row for row in a if row in s1 - s2) + diff_ba = (row for row in b if row in s2 - s1) + # The first item is always the key (see TableDiffer.relevant_columns) - for i in s1 - s2: + for i in diff_ab: d[i[0]].append(("-", i)) - for i in s2 - s1: + for i in diff_ba: d[i[0]].append(("+", i)) for _k, v in sorted(d.items(), key=lambda i: i[0]): diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py index 2a3e5bac..d0c7aebe 100644 --- a/tests/test_diff_tables.py +++ b/tests/test_diff_tables.py @@ -745,3 +745,47 @@ def test_info_tree_root(self): assert info_tree.info.is_diff assert info_tree.info.diff_count == 1000 self.assertEqual(info_tree.info.rowcounts, {1: 1000, 2: 2000}) + + +class TestDuplicateTables(DiffTestCase): + db_cls = db.MySQL + + src_schema = {"id": int, "data": str} + dst_schema = {"id": int, "data": str} + + def setUp(self): + """ + table 1: + (12, 'ABCDE'), + (12, 'ABCDE'); + table 2: + (4,'ABCDEF'), + (4,'ABCDE'), + (4,'ABCDE'), + (6,'ABCDE'), + (6,'ABCDE'), + (6,'ABCDE'); + """ + + super().setUp() + + src_values = [(12, "ABCDE"), (12, "ABCDE")] + dst_values = [(4, "ABCDEF"), (4, "ABCDE"), (4, "ABCDE"), (6, "ABCDE"), (6, "ABCDE"), (6, "ABCDE")] + + self.diffs = [("-", (str(r[0]), r[1])) for r in src_values] + [("+", (str(r[0]), r[1])) for r in dst_values] + + self.connection.query([self.src_table.insert_rows(src_values), self.dst_table.insert_rows(dst_values), commit]) + + self.a = _table_segment( + self.connection, self.table_src_path, "id", extra_columns=("data",), case_sensitive=False + ) + self.b = _table_segment( + self.connection, self.table_dst_path, "id", extra_columns=("data",), case_sensitive=False + ) + + def test_duplicates(self): + """If there are duplicates in data, we want to return them as well""" + + differ = HashDiffer(bisection_factor=2, bisection_threshold=4) + diff = list(differ.diff_tables(self.a, self.b)) + self.assertEqual(diff, self.diffs) From d304e1a7442ef9c7c21dff0ef23fdf221a5a4f6d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 25 Nov 2022 09:40:29 -0300 Subject: [PATCH 2/2] Adjust PR #314 --- data_diff/hashdiff_tables.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/data_diff/hashdiff_tables.py b/data_diff/hashdiff_tables.py index bac8fd60..bc9bd0a4 100644 --- a/data_diff/hashdiff_tables.py +++ b/data_diff/hashdiff_tables.py @@ -25,18 +25,18 @@ def diff_sets(a: set, b: set) -> Iterator: - s1 = set(a) - s2 = set(b) - d = defaultdict(list) - - diff_ab = (row for row in a if row in s1 - s2) - diff_ba = (row for row in b if row in s2 - s1) + sa = set(a) + sb = set(b) # The first item is always the key (see TableDiffer.relevant_columns) - for i in diff_ab: - d[i[0]].append(("-", i)) - for i in diff_ba: - d[i[0]].append(("+", i)) + # TODO update when we add compound keys to hashdiff + d = defaultdict(list) + for row in a: + if row not in sb: + d[row[0]].append(("-", row)) + for row in b: + if row not in sa: + d[row[0]].append(("+", row)) for _k, v in sorted(d.items(), key=lambda i: i[0]): yield from v