Skip to content

Commit

Permalink
Added support for JSON inputs, closes #12
Browse files Browse the repository at this point in the history
  • Loading branch information
simonw committed Feb 23, 2021
1 parent 5a598ca commit 328718f
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 10 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
[![Tests](https://github.com/simonw/csv-diff/workflows/Test/badge.svg)](https://github.com/simonw/csv-diff/actions?query=workflow%3ATest)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/simonw/csv-diff/blob/main/LICENSE)

Tool for viewing the difference between two CSV files. See [Generating a commit log for San Francisco’s official list of trees](https://simonwillison.net/2019/Mar/13/tree-history/) (and the [sf-tree-history repo commit log](https://github.com/simonw/sf-tree-history/commits)) for background information on this project.
Tool for viewing the difference between two CSV, TSV or JSON files. See [Generating a commit log for San Francisco’s official list of trees](https://simonwillison.net/2019/Mar/13/tree-history/) (and the [sf-tree-history repo commit log](https://github.com/simonw/sf-tree-history/commits)) for background information on this project.

## Installation

Expand Down Expand Up @@ -53,6 +53,8 @@ The `--key=id` option means that the `id` column should be treated as the unique

The tool will automatically detect if your files are comma- or tab-separated. You can override this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.

You can also feed it JSON files, provided they are a JSON array of objects where each object has the same keys. JSON input is not auto-detected, so pass `--format=json` when your input files are JSON.

Use `--show-unchanged` to include full details of the unchanged values for rows with at least one change in the diff output:

% csv-diff one.csv two.csv --key=id --show-unchanged
Expand Down
20 changes: 20 additions & 0 deletions csv_diff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,26 @@ def load_csv(fp, key=None, dialect=None):
return {keyfn(r): r for r in rows}


def load_json(fp, key=None):
    """Load a JSON array of objects from ``fp`` into a dict of rows.

    Args:
        fp: Readable file-like object containing a JSON document.
        key: Name of the property whose value identifies each row. When
            omitted (or falsy), each row is keyed by the SHA-1 hex digest
            of its JSON serialization with sorted keys.

    Returns:
        A dict mapping each row's key to that row, after the row has been
        passed through ``_simplify_json_row``.

    Raises:
        ValueError: If the top-level JSON value is not an array.
        KeyError: If ``key`` is given and a row does not contain it.
    """
    raw_list = json.load(fp)
    # Validate explicitly rather than with ``assert``: assertions are removed
    # when Python runs with -O, which would let a non-array document through.
    if not isinstance(raw_list, list):
        raise ValueError(
            "JSON input must be an array of objects, got {}".format(
                type(raw_list).__name__
            )
        )
    if key:
        keyfn = lambda r: r[key]
    else:
        # No key column: derive a stable identifier from the row's content.
        keyfn = lambda r: hashlib.sha1(
            json.dumps(r, sort_keys=True).encode("utf8")
        ).hexdigest()
    # The key is computed from the raw row before it is simplified.
    return {keyfn(r): _simplify_json_row(r) for r in raw_list}


def _simplify_json_row(r):
    """Return a copy of row ``r`` with nested values serialized to JSON.

    Values that are dicts, tuples or lists are replaced by their
    ``json.dumps`` string; every other value is carried over unchanged.
    Key order is preserved.

    The input dict is not modified, so callers can keep using the
    original row after simplification.
    """
    return {
        key: json.dumps(value) if isinstance(value, (dict, tuple, list)) else value
        for key, value in r.items()
    }


def compare(previous, current, show_unchanged=False):
result = {
"added": [],
Expand Down
17 changes: 10 additions & 7 deletions csv_diff/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import click
import json as std_json
from . import load_csv, compare, human_text
from . import load_csv, load_json, compare, human_text


@click.command()
Expand All @@ -18,9 +18,9 @@
)
@click.option(
"--format",
type=click.Choice(["csv", "tsv"]),
type=click.Choice(["csv", "tsv", "json"]),
default=None,
help="Explicitly specify input format (csv, tsv) instead of auto-detecting",
help="Explicitly specify input format (csv, tsv, json) instead of auto-detecting",
)
@click.option(
"--json", type=bool, default=False, help="Output changes as JSON", is_flag=True
Expand All @@ -43,16 +43,19 @@
help="Show unchanged fields for rows with at least one change",
)
def cli(previous, current, key, format, json, singular, plural, show_unchanged):
"Diff two CSV files"
"Diff two CSV or JSON files"
dialect = {
"csv": "excel",
"tsv": "excel-tab",
}

def load(filename):
    """Load ``filename`` with the loader matching ``format``.

    Uses ``load_json`` when ``format`` is "json"; otherwise uses
    ``load_csv`` with the dialect mapped from ``format`` (``None`` when
    no format was given). The file is closed once the rows are loaded.
    """
    if format == "json":
        with open(filename) as fp:
            return load_json(fp, key=key)
    # newline="" leaves line-ending handling to the csv module.
    with open(filename, newline="") as fp:
        return load_csv(fp, key=key, dialect=dialect.get(format))

diff = compare(load(previous), load(current), show_unchanged)
if json:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def get_long_description():

setup(
name="csv-diff",
description="Python CLI tool and library for diffing CSV files",
description="Python CLI tool and library for diffing CSV and JSON files",
long_description=get_long_description(),
long_description_content_type="text/markdown",
author="Simon Willison",
Expand Down
47 changes: 46 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from click.testing import CliRunner
from csv_diff import cli
from csv_diff import cli, load_csv
import csv
import pytest
from .test_csv_diff import ONE, ONE_TSV, TWO, TWO_TSV, THREE, FIVE
import io
import json
from textwrap import dedent

Expand All @@ -15,6 +17,29 @@ def tsv_files(tmpdir):
return str(one), str(two)


@pytest.fixture
def json_files(tmpdir):
    """Write two JSON documents into ``tmpdir`` and return their paths.

    The second document differs from the first in the nested value of
    row 1 and in the name of row 2.
    """
    before = [
        {"id": 1, "name": "Cleo", "nested": {"foo": 3}},
        {"id": 2, "name": "Pancakes", "nested": {"foo": 3}},
    ]
    after = [
        {"id": 1, "name": "Cleo", "nested": {"foo": 3, "bar": 5}},
        {"id": 2, "name": "Pancakes!", "nested": {"foo": 3}},
    ]
    written = []
    for filename, records in (("one.json", before), ("two.json", after)):
        target = tmpdir / filename
        target.write(json.dumps(records))
        written.append(str(target))
    return written[0], written[1]


def test_human_cli(tmpdir):
one = tmpdir / "one.csv"
one.write(ONE)
Expand Down Expand Up @@ -101,6 +126,26 @@ def test_tsv_files(tsv_files):
} == json.loads(result.output.strip())


def test_json_files(json_files):
    """Diffing two JSON inputs keyed on ``id`` reports both changed rows."""
    previous_path, current_path = json_files
    cli_args = [
        previous_path,
        current_path,
        "--key",
        "id",
        "--json",
        "--format",
        "json",
    ]
    expected = {
        "added": [],
        "removed": [],
        "changed": [
            # Nested values are compared as their JSON-serialized strings.
            {"key": 1, "changes": {"nested": ['{"foo": 3}', '{"foo": 3, "bar": 5}']}},
            {"key": 2, "changes": {"name": ["Pancakes", "Pancakes!"]}},
        ],
        "columns_added": [],
        "columns_removed": [],
    }
    result = CliRunner().invoke(cli.cli, cli_args, catch_exceptions=False)
    assert result.exit_code == 0
    assert json.loads(result.output.strip()) == expected


def test_sniff_format(tsv_files):
one, two = tsv_files
result = CliRunner().invoke(cli.cli, [one, two, "--key", "id", "--json"])
Expand Down

0 comments on commit 328718f

Please sign in to comment.