Add option to limit diff to significant figures

The --significance option lets you decide how many numerical digits of precision to care about, including negative precision for order-of-magnitude comparisons.
larsyencken · Jul 20, 2017 · 74c4ea4 · 74c4ea4
1 parent 435d235
commit 74c4ea4
Show file tree

Hide file tree

Showing 6 changed files with 75 additions and 8 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -3,6 +3,11 @@
 History
 -------
 
+dev
+~~~
+
+* Add the --significance option to limit to significant figures.
+
 0.3.2 (2017-07-20)
 ~~~~~~~~~~~~~~~~~~
 

diff --git a/README.rst b/README.rst
@@ -102,11 +102,20 @@ Or look at the full diff pretty printed, to make it more readable::
 
 If you want to ignore a column from the comparison then you can do so by specifying a comma-separated list of column names to ignore. For example::
 
-    $ csvdiff --style=summary --ignore_columns=amount id a.csv b.csv
+    $ csvdiff --style=summary --ignore-columns=amount id a.csv b.csv
     1 rows removed (20.0%)
     1 rows added (20.0%)
     0 rows changed (0%)
 
+You can also choose to compare numeric fields only up to a certain number of significant figures. Use a negative value to compare at coarser powers of ten (for example, ``-1`` compares values only down to the tens)::
+
+    $ csvdiff --style=summary id a.csv c.csv
+    0 rows removed (0.0%)
+    0 rows added (0.0%)
+    2 rows changed (40.0%)
+    $ csvdiff --style=summary id --significance=-1 a.csv c.csv
+    files are identical
+
 
 Diffs generated this way contain all the data that's changed, and can be reapplied later if the original data changes. For example, suppose more data gets added to ``a.csv``, giving us ``a-plus.csv``::
 

diff --git a/csvdiff/__init__.py b/csvdiff/__init__.py
@@ -127,10 +127,12 @@ def __repr__(self):
               help="Don't output anything, just use exit codes")
 @click.option('--sep', default=',',
               help='Separator to use between fields [default: comma]')
-@click.option('--ignore_columns', '-i', type=CSVType(),
+@click.option('--ignore-columns', '-i', type=CSVType(),
               help='a comma seperated list of columns to ignore from the comparison')
+@click.option('--significance', type=int,
+              help='Ignore numeric changes less than this number of significant figures')
 def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
-                sep=',', quiet=False, ignore_columns=None):
+                sep=',', quiet=False, ignore_columns=None, significance=None):
     """
     Compare two csv files to see what rows differ between them. The files
     are each expected to have a header row, and for each row to be uniquely
@@ -149,11 +151,13 @@ def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
     try:
         if style == 'summary':
             _diff_and_summarize(from_csv, to_csv, index_columns, ostream,
-                                sep=sep, ignored_columns=ignore_columns)
+                                sep=sep, ignored_columns=ignore_columns,
+                                significance=significance)
         else:
             compact = (style == 'compact')
             _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
-                                  compact=compact, sep=sep, ignored_columns=ignore_columns)
+                                  compact=compact, sep=sep, ignored_columns=ignore_columns,
+                                  significance=significance)
 
     except records.InvalidKeyError as e:
         error.abort(e.args[0])
@@ -163,8 +167,13 @@ def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
 
 
 def _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
-                          compact=False, sep=',', ignored_columns=None):
+                          compact=False, sep=',', ignored_columns=None,
+                          significance=None):
     diff = diff_files(from_csv, to_csv, index_columns, sep=sep, ignored_columns=ignored_columns)
+
+    if significance is not None:
+        diff = patch.filter_significance(diff, significance)
+
     patch.save(diff, ostream, compact=compact)
     exit_code = (EXIT_SAME
                  if patch.is_empty(diff)
@@ -173,13 +182,17 @@ def _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
 
 
 def _diff_and_summarize(from_csv, to_csv, index_columns, stream=sys.stdout,
-                        sep=',', ignored_columns=None):
+                        sep=',', ignored_columns=None, significance=None):
     """
     Print a summary of the difference between the two files.
     """
     from_records = list(records.load(from_csv, sep=sep))
     to_records = records.load(to_csv, sep=sep)
+
     diff = patch.create(from_records, to_records, index_columns, ignored_columns)
+    if significance is not None:
+        diff = patch.filter_significance(diff, significance)
+
     _summarize_diff(diff, len(from_records), stream=stream)
     exit_code = (EXIT_SAME
                  if patch.is_empty(diff)

diff --git a/csvdiff/patch.py b/csvdiff/patch.py
@@ -299,3 +299,37 @@ def _iter_record_fields(recs):
 
 class InvalidPatchError(Exception):
     pass
+
+
+def filter_significance(diff, significance):
+    """
+    Prune any changes in the patch which are due to numeric changes less than this level of
+    significance.
+    """
+    changed = diff['changed']
+
+    # remove individual field changes that are not significant
+    reduced = [{'key': delta['key'],
+                'fields': {k: v
+                           for k, v in delta['fields'].items()
+                           if _is_significant(v, significance)}}
+               for delta in changed]
+
+    # call a key changed only if it still has significant changes
+    filtered = [delta for delta in reduced if delta['fields']]
+
+    diff = diff.copy()
+    diff['changed'] = filtered
+    return diff
+
+
+def _is_significant(change, significance):
+    "Return True if a change is genuinely significant given our tolerance."
+    try:
+        a = float(change['from'])
+        b = float(change['to'])
+
+    except ValueError:
+        return True
+
+    return int(a * 10 ** significance) != int(b * 10 ** significance)
diff --git a/tests/examples/c.csv b/tests/examples/c.csv
@@ -0,0 +1,6 @@
+id,name,amount
+1,bob,24
+2,eva,63
+3,sarah,7
+4,jeff,15
+6,fred,10
diff --git a/tests/test_csvdiff.py b/tests/test_csvdiff.py
@@ -55,7 +55,7 @@ def csvdiff_summary_cmd(self, *args, **kwargs):
                 csvdiff.csvdiff_cmd,
                 ('--output', t.name,
                  '--style', 'summary',
-                 '--ignore_columns', kwargs['ignore_columns']) + args
+                 '--ignore-columns', kwargs['ignore_columns']) + args
             )
         else:
             result = self.runner.invoke(
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,6 +3,11 @@ @@
     History
     -------
+    dev
+    ~~~
+    * Add the --significance option to limit to significant figures.
     0.3.2 (2017-07-20)
     ~~~~~~~~~~~~~~~~~~
@@ Expand Down @@