diff --git a/doc/api.rst b/doc/api.rst
index 1bd4eee9b12..fbd0a75bb41 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -292,6 +292,7 @@ DataArray contents
    DataArray.swap_dims
    DataArray.expand_dims
    DataArray.drop_vars
+   DataArray.drop_duplicates
    DataArray.reset_coords
    DataArray.copy
 
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 7994cad72e5..c07fecfb092 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -21,6 +21,10 @@ v0.18.1 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
+
+- Implement :py:meth:`DataArray.drop_duplicates`
+  to remove duplicate dimension values (:pull:`5239`).
+  By `Andrew Huang <https://github.com/ahuang11>`_.
 - allow passing ``combine_attrs`` strategy names to the ``keep_attrs`` parameter of
   :py:func:`apply_ufunc` (:pull:`5041`)
   By `Justus Magin <https://github.com/keewis>`_.
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 831d0d24ccb..dcda20dc451 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4573,6 +4573,43 @@ def curvefit(
             kwargs=kwargs,
         )
 
+    def drop_duplicates(
+        self,
+        dim: Hashable,
+        keep: Union[
+            str,
+            bool,
+        ] = "first",
+    ) -> "DataArray":
+        """Returns a new DataArray with duplicate dimension values removed.
+
+        Parameters
+        ----------
+        dim : dimension label
+            Dimension whose index is checked for duplicated values.
+        keep : {"first", "last", False}, default: "first"
+            Determines which duplicates (if any) to keep.
+
+            - ``"first"`` : Drop duplicates except for the first occurrence.
+            - ``"last"`` : Drop duplicates except for the last occurrence.
+            - False : Drop all duplicates.
+
+        Returns
+        -------
+        DataArray
+            New object containing only the positions along ``dim`` that are kept.
+
+        Raises
+        ------
+        ValueError
+            If ``dim`` is not one of this array's dimensions.
+        """
+        if dim not in self.dims:
+            raise ValueError(f"'{dim}' not found in dimensions")
+        # ``duplicated`` flags the entries to drop; invert it to get a keep-mask.
+        indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)}
+        return self.isel(indexes)
+
     # this needs to be at the end, or mypy will confuse with `str`
     # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
     str = utils.UncachedAccessor(StringAccessor)
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index e6c479896e9..8012fad18d0 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -7434,3 +7434,30 @@ def test_clip(da):
     # Unclear whether we want this work, OK to adjust the test when we have decided.
     with pytest.raises(ValueError, match="arguments without labels along dimension"):
         result = da.clip(min=da.mean("x"), max=da.mean("a").isel(x=[0, 1]))
+
+
+@pytest.mark.parametrize("keep", ["first", "last", False])
+def test_drop_duplicates(keep):
+    da = xr.DataArray(
+        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
+    )
+
+    if keep == "first":
+        data = [0, 6, 7]
+        time = [0, 1, 2]
+    elif keep == "last":
+        data = [5, 6, 7]
+        time = [0, 1, 2]
+    else:
+        data = [6, 7]
+        time = [1, 2]
+
+    expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test")
+    result = da.drop_duplicates("time", keep=keep)
+    assert_equal(expected, result)
+
+
+def test_drop_duplicates_missing_dim():
+    da = xr.DataArray([0, 1], dims="time", coords={"time": [0, 1]})
+    with pytest.raises(ValueError, match="not found in dimensions"):
+        da.drop_duplicates("space")