Skip to content
This repository has been archived by the owner on Feb 18, 2021. It is now read-only.

Commit

Permalink
Add option to limit diff to significant figures
Browse files Browse the repository at this point in the history
The --significance option lets you decide how many numerical digits of
precision to care about, including negative precision for orders of
magnitude comparisons.
  • Loading branch information
larsyencken committed Jul 20, 2017
1 parent 435d235 commit 74c4ea4
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 8 deletions.
5 changes: 5 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
History
-------

dev
~~~

* Add the --significance option to limit to significant figures.

0.3.2 (2017-07-20)
~~~~~~~~~~~~~~~~~~

Expand Down
11 changes: 10 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,20 @@ Or look at the full diff pretty printed, to make it more readable::

If you want to ignore a column from the comparison then you can do so by specifying a comma separated list of column names to ignore. For example::

$ csvdiff --style=summary --ignore_columns=amount id a.csv b.csv
$ csvdiff --style=summary --ignore-columns=amount id a.csv b.csv
1 rows removed (20.0%)
1 rows added (20.0%)
0 rows changed (0%)

You can also choose to compare numeric fields only up to a certain number of significant figures. Use negative significant figures for orders of magnitude::

$ csvdiff --style=summary id a.csv c.csv
0 rows removed (0.0%)
0 rows added (0.0%)
2 rows changed (40.0%)
$ csvdiff --style=summary id --significance=-1 a.csv c.csv
files are identical


Diffs generated this way contain all the data that's changed, and can be reapplied later if the original data changes. For example, suppose more data gets added to ``a.csv``, giving us ``a-plus.csv``::

Expand Down
25 changes: 19 additions & 6 deletions csvdiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,12 @@ def __repr__(self):
help="Don't output anything, just use exit codes")
@click.option('--sep', default=',',
help='Separator to use between fields [default: comma]')
@click.option('--ignore_columns', '-i', type=CSVType(),
@click.option('--ignore-columns', '-i', type=CSVType(),
help='a comma seperated list of columns to ignore from the comparison')
@click.option('--significance', type=int,
help='Ignore numeric changes less than this number of significant figures')
def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
sep=',', quiet=False, ignore_columns=None):
sep=',', quiet=False, ignore_columns=None, significance=None):
"""
Compare two csv files to see what rows differ between them. The files
are each expected to have a header row, and for each row to be uniquely
Expand All @@ -149,11 +151,13 @@ def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,
try:
if style == 'summary':
_diff_and_summarize(from_csv, to_csv, index_columns, ostream,
sep=sep, ignored_columns=ignore_columns)
sep=sep, ignored_columns=ignore_columns,
significance=significance)
else:
compact = (style == 'compact')
_diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
compact=compact, sep=sep, ignored_columns=ignore_columns)
compact=compact, sep=sep, ignored_columns=ignore_columns,
significance=significance)

except records.InvalidKeyError as e:
error.abort(e.args[0])
Expand All @@ -163,8 +167,13 @@ def csvdiff_cmd(index_columns, from_csv, to_csv, style=None, output=None,


def _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,
compact=False, sep=',', ignored_columns=None):
compact=False, sep=',', ignored_columns=None,
significance=None):
diff = diff_files(from_csv, to_csv, index_columns, sep=sep, ignored_columns=ignored_columns)

if significance is not None:
diff = patch.filter_significance(diff, significance)

patch.save(diff, ostream, compact=compact)
exit_code = (EXIT_SAME
if patch.is_empty(diff)
Expand All @@ -173,13 +182,17 @@ def _diff_files_to_stream(from_csv, to_csv, index_columns, ostream,


def _diff_and_summarize(from_csv, to_csv, index_columns, stream=sys.stdout,
sep=',', ignored_columns=None):
sep=',', ignored_columns=None, significance=None):
"""
Print a summary of the difference between the two files.
"""
from_records = list(records.load(from_csv, sep=sep))
to_records = records.load(to_csv, sep=sep)

diff = patch.create(from_records, to_records, index_columns, ignored_columns)
if significance is not None:
diff = patch.filter_significance(diff, significance)

_summarize_diff(diff, len(from_records), stream=stream)
exit_code = (EXIT_SAME
if patch.is_empty(diff)
Expand Down
34 changes: 34 additions & 0 deletions csvdiff/patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,37 @@ def _iter_record_fields(recs):

class InvalidPatchError(Exception):
pass


def filter_significance(diff, significance):
    """
    Return a copy of the patch keeping only numerically significant changes.

    Each entry in ``diff['changed']`` is reduced to the field changes that
    ``_is_significant`` accepts at the given ``significance``; entries left
    with no field changes are dropped altogether. The input patch is not
    modified: a shallow copy is returned with its ``'changed'`` list replaced.
    """
    kept = []
    for delta in diff['changed']:
        key = delta['key']

        # keep only the field changes that are still significant
        significant_fields = {}
        for name, change in delta['fields'].items():
            if _is_significant(change, significance):
                significant_fields[name] = change

        # a record only counts as changed if something significant remains
        if significant_fields:
            kept.append({'key': key, 'fields': significant_fields})

    result = diff.copy()
    result['changed'] = kept
    return result


def _is_significant(change, significance):
"Return True if a change is genuinely significant given our tolerance."
try:
a = float(change['from'])
b = float(change['to'])

except ValueError:
return True

return int(a * 10 ** significance) != int(b * 10 ** significance)
6 changes: 6 additions & 0 deletions tests/examples/c.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
id,name,amount
1,bob,24
2,eva,63
3,sarah,7
4,jeff,15
6,fred,10
2 changes: 1 addition & 1 deletion tests/test_csvdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def csvdiff_summary_cmd(self, *args, **kwargs):
csvdiff.csvdiff_cmd,
('--output', t.name,
'--style', 'summary',
'--ignore_columns', kwargs['ignore_columns']) + args
'--ignore-columns', kwargs['ignore_columns']) + args
)
else:
result = self.runner.invoke(
Expand Down

0 comments on commit 74c4ea4

Please sign in to comment.