Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add drop duplicates #5089

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ Dataset contents
Dataset.expand_dims
Dataset.drop_vars
Dataset.drop_dims
Dataset.drop_duplicates
Dataset.set_coords
Dataset.reset_coords

Expand Down Expand Up @@ -291,6 +292,7 @@ DataArray contents
DataArray.swap_dims
DataArray.expand_dims
DataArray.drop_vars
DataArray.drop_duplicates
DataArray.reset_coords
DataArray.copy

Expand Down
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ New Features
:py:class:`~core.groupby.DataArrayGroupBy`, inspired by pandas'
:py:meth:`~pandas.core.groupby.GroupBy.get_group`.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Implement :py:meth:`Dataset.drop_duplicates` and :py:meth:`DataArray.drop_duplicates`
to remove duplicate dimension values (:pull:`5089`).
By `Andrew Huang <https://github.com/ahuang11>`_.
- Disable the `cfgrib` backend if the `eccodes` library is not installed (:pull:`5083`). By `Baudouin Raoult <https://github.com/b8raoult>`_.

Breaking changes
Expand Down
28 changes: 28 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4418,6 +4418,34 @@ def query(
)
return ds[self.name]

def drop_duplicates(
    self,
    dims: Union[Hashable, Iterable[Hashable], None] = None,
    keep: Union[str, bool] = "first",
) -> "DataArray":
    """Returns a new DataArray with duplicate dimension values removed.

    Parameters
    ----------
    dims : dimension label or sequence of labels, optional
        Only consider certain dimensions for identifying duplicates, by
        default use all dimensions.
    keep : {"first", "last", False}, default: "first"
        Determines which duplicates (if any) to keep.

        - ``"first"`` : Drop duplicates except for the first occurrence.
        - ``"last"`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.

    Returns
    -------
    DataArray

    See Also
    --------
    Dataset.drop_duplicates
    """
    # Delegate to the Dataset implementation on a temporary single-variable
    # dataset, then convert the deduplicated result back to a DataArray.
    ds = self._to_temp_dataset().drop_duplicates(dims=dims, keep=keep)
    return self._from_temp_dataset(ds)

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor)
Expand Down
37 changes: 37 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7074,5 +7074,42 @@ def query(
# apply the selection
return self.isel(indexers, missing_dims=missing_dims)

def drop_duplicates(
    self,
    dims: Union[Hashable, Iterable[Hashable], None] = None,
    keep: Union[str, bool] = "first",
) -> "Dataset":
    """Returns a new dataset with duplicate dimension values removed.

    Parameters
    ----------
    dims : dimension label or sequence of labels, optional
        Only consider certain dimensions for identifying duplicates, by
        default use all dimensions.
    keep : {"first", "last", False}, default: "first"
        Determines which duplicates (if any) to keep.

        - ``"first"`` : Drop duplicates except for the first occurrence.
        - ``"last"`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.

    Returns
    -------
    Dataset

    Raises
    ------
    ValueError
        If any of the requested labels is not a dimension of the dataset.
    """
    if dims is None:
        # Default to the dataset's dimensions, not its coordinates: a
        # non-dimension coordinate has no index to deduplicate along and
        # would fail the `dim not in self.dims` check below.
        dims = list(self.dims)
    elif isinstance(dims, str) or not isinstance(dims, Iterable):
        # A single dimension label; str is Iterable, so test it explicitly.
        dims = [dims]
    else:
        dims = list(dims)

    # Build a boolean indexer per dimension that keeps only the wanted
    # occurrence of each duplicated index value.
    indexes = {}
    for dim in dims:
        if dim not in self.dims:
            raise ValueError(f"'{dim}' not found in dimensions")
        indexes[dim] = ~self.get_index(dim).duplicated(keep=keep)

    return self.isel(indexes)


ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
51 changes: 51 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -7259,3 +7259,54 @@ def test_deepcopy_obj_array():
x0 = DataArray(np.array([object()]))
x1 = deepcopy(x0)
assert x0.values[0] is not x1.values[0]


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates(keep):
    # DataArray whose "time" index repeats the value 0 at positions 0 and 1.
    da = xr.DataArray(
        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
    )

    # Expected (data, time) for each deduplication policy.
    data, time = {
        "first": ([0, 6, 7], [0, 1, 2]),
        "last": ([5, 6, 7], [0, 1, 2]),
        False: ([6, 7], [1, 2]),
    }[keep]

    expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test")
    assert_equal(expected, da.drop_duplicates("time", keep=keep))


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_multi_dim(keep):
    # 5x5 array whose "lat" and "lon" indexes each contain one duplicate.
    values = np.stack([np.arange(0, 5) * i for i in np.arange(0, 5)])
    lats = [0, 1, 2, 2, 3]
    lons = [0, 1, 3, 3, 4]
    da = xr.DataArray(
        values,
        coords={"lat": lats, "lon": lons},
        dims=["lat", "lon"],
        name="test",
    )

    # Positions that survive deduplication (identical for both dimensions
    # because the duplicate sits at index 2/3 in each).
    idx = {"first": [0, 1, 2, 4], "last": [0, 1, 3, 4], False: [0, 1, 4]}[keep]

    expected = xr.DataArray(
        values[idx][:, idx],
        dims=["lat", "lon"],
        coords={"lat": [lats[i] for i in idx], "lon": [lons[i] for i in idx]},
        name="test",
    )
    assert_equal(expected, da.drop_duplicates(["lat", "lon"], keep=keep))
53 changes: 53 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6845,3 +6845,56 @@ def test_deepcopy_obj_array():
x0 = Dataset(dict(foo=DataArray(np.array([object()]))))
x1 = deepcopy(x0)
assert x0["foo"].values[0] is not x1["foo"].values[0]


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates(keep):
    # Single-variable dataset whose "time" index repeats the value 0.
    ds = xr.DataArray(
        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
    ).to_dataset()

    # Expected (data, time) for each deduplication policy.
    data, time = {
        "first": ([0, 6, 7], [0, 1, 2]),
        "last": ([5, 6, 7], [0, 1, 2]),
        False: ([6, 7], [1, 2]),
    }[keep]

    expected = xr.DataArray(
        data, dims="time", coords={"time": time}, name="test"
    ).to_dataset()
    assert_equal(expected, ds.drop_duplicates("time", keep=keep))


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_multi_dim(keep):
    # 5x5 dataset whose "lat" and "lon" indexes each contain one duplicate.
    values = np.stack([np.arange(0, 5) * i for i in np.arange(0, 5)])
    lats = [0, 1, 2, 2, 3]
    lons = [0, 1, 3, 3, 4]
    ds = xr.DataArray(
        values,
        coords={"lat": lats, "lon": lons},
        dims=["lat", "lon"],
        name="test",
    ).to_dataset()

    # Positions that survive deduplication (identical for both dimensions
    # because the duplicate sits at index 2/3 in each).
    idx = {"first": [0, 1, 2, 4], "last": [0, 1, 3, 4], False: [0, 1, 4]}[keep]

    expected = xr.DataArray(
        values[idx][:, idx],
        dims=["lat", "lon"],
        coords={"lat": [lats[i] for i in idx], "lon": [lons[i] for i in idx]},
        name="test",
    ).to_dataset()
    assert_equal(expected, ds.drop_duplicates(["lat", "lon"], keep=keep))