Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add drop duplicates #5089

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ Dataset contents
Dataset.expand_dims
Dataset.drop_vars
Dataset.drop_dims
Dataset.drop_duplicates
Dataset.set_coords
Dataset.reset_coords

Expand Down Expand Up @@ -291,6 +292,7 @@ DataArray contents
DataArray.swap_dims
DataArray.expand_dims
DataArray.drop_vars
DataArray.drop_duplicates
DataArray.reset_coords
DataArray.copy

Expand Down
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ New Features
:py:class:`~core.groupby.DataArrayGroupBy`, inspired by pandas'
:py:meth:`~pandas.core.groupby.GroupBy.get_group`.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Implement :py:meth:`Dataset.drop_duplicates` and :py:meth:`DataArray.drop_duplicates`
to remove duplicate dimension values (:pull:`5089`).
By `Andrew Huang <https://github.com/ahuang11>`_.
- Disable the `cfgrib` backend if the `eccodes` library is not installed (:pull:`5083`). By `Baudouin Raoult <https://github.com/b8raoult>`_.

Breaking changes
Expand Down
28 changes: 28 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4418,6 +4418,34 @@ def query(
)
return ds[self.name]

def drop_duplicates(
    self,
    dims: Union[Hashable, Iterable[Hashable], None] = None,
    keep: Union[str, bool] = "first",
) -> "DataArray":
    """Returns a new DataArray with duplicate dimension values removed.

    Parameters
    ----------
    dims : dimension label or sequence of labels, optional
        Only consider certain dimensions for identifying duplicates, by
        default use all dimensions.
    keep : {"first", "last", False}, default: "first"
        Determines which duplicates (if any) to keep.

        - ``"first"`` : Drop duplicates except for the first occurrence.
        - ``"last"`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.

    Returns
    -------
    DataArray

    See Also
    --------
    Dataset.drop_duplicates
    """
    # Delegate to the Dataset implementation on a temporary single-variable
    # dataset, then convert the deduplicated result back to a DataArray.
    ds = self._to_temp_dataset().drop_duplicates(dims=dims, keep=keep)
    return self._from_temp_dataset(ds)

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
str = utils.UncachedAccessor(StringAccessor)
Expand Down
37 changes: 37 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7074,5 +7074,42 @@ def query(
# apply the selection
return self.isel(indexers, missing_dims=missing_dims)

def drop_duplicates(
    self,
    dims: Union[Hashable, Iterable[Hashable], None] = None,
    keep: Union[str, bool] = "first",
) -> "Dataset":
    """Returns a new dataset with duplicate dimension values removed.

    Parameters
    ----------
    dims : dimension label or sequence of labels, optional
        Only consider certain dimensions for identifying duplicates, by
        default use all dimensions.
    keep : {"first", "last", False}, default: "first"
        Determines which duplicates (if any) to keep.

        - ``"first"`` : Drop duplicates except for the first occurrence.
        - ``"last"`` : Drop duplicates except for the last occurrence.
        - False : Drop all duplicates.

    Returns
    -------
    Dataset

    Raises
    ------
    ValueError
        If any of the requested labels is not a dimension of the dataset.
    """
    if dims is None:
        # Default to the dataset's dimensions, not its coordinates: a
        # non-dimension coordinate has no index to deduplicate along and
        # would fail the `dim not in self.dims` check below.
        dims = list(self.dims)
    elif isinstance(dims, str) or not isinstance(dims, Iterable):
        # A single dimension label; str is Iterable, so test it explicitly.
        dims = [dims]
    else:
        dims = list(dims)

    # Build a boolean indexer per dimension that keeps only the wanted
    # occurrence of each duplicated index value.
    indexes = {}
    for dim in dims:
        if dim not in self.dims:
            raise ValueError(f"'{dim}' not found in dimensions")
        indexes[dim] = ~self.get_index(dim).duplicated(keep=keep)

    return self.isel(indexes)


ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False)
51 changes: 51 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -7259,3 +7259,54 @@ def test_deepcopy_obj_array():
x0 = DataArray(np.array([object()]))
x1 = deepcopy(x0)
assert x0.values[0] is not x1.values[0]


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates(keep):
    # DataArray whose "time" index repeats the value 0 at positions 0 and 1.
    da = xr.DataArray(
        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
    )

    # Expected (data, time) for each deduplication policy.
    data, time = {
        "first": ([0, 6, 7], [0, 1, 2]),
        "last": ([5, 6, 7], [0, 1, 2]),
        False: ([6, 7], [1, 2]),
    }[keep]

    expected = xr.DataArray(data, dims="time", coords={"time": time}, name="test")
    assert_equal(expected, da.drop_duplicates("time", keep=keep))


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_multi_dim(keep):
    # 5x5 array whose "lat" and "lon" indexes each contain one duplicate.
    values = np.stack([np.arange(0, 5) * i for i in np.arange(0, 5)])
    lats = [0, 1, 2, 2, 3]
    lons = [0, 1, 3, 3, 4]
    da = xr.DataArray(
        values,
        coords={"lat": lats, "lon": lons},
        dims=["lat", "lon"],
        name="test",
    )

    # Positions that survive deduplication (identical for both dimensions
    # because the duplicate sits at index 2/3 in each).
    idx = {"first": [0, 1, 2, 4], "last": [0, 1, 3, 4], False: [0, 1, 4]}[keep]

    expected = xr.DataArray(
        values[idx][:, idx],
        dims=["lat", "lon"],
        coords={"lat": [lats[i] for i in idx], "lon": [lons[i] for i in idx]},
        name="test",
    )
    assert_equal(expected, da.drop_duplicates(["lat", "lon"], keep=keep))
53 changes: 53 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6845,3 +6845,56 @@ def test_deepcopy_obj_array():
x0 = Dataset(dict(foo=DataArray(np.array([object()]))))
x1 = deepcopy(x0)
assert x0["foo"].values[0] is not x1["foo"].values[0]


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates(keep):
    # Single-variable dataset whose "time" index repeats the value 0.
    ds = xr.DataArray(
        [0, 5, 6, 7], dims="time", coords={"time": [0, 0, 1, 2]}, name="test"
    ).to_dataset()

    # Expected (data, time) for each deduplication policy.
    data, time = {
        "first": ([0, 6, 7], [0, 1, 2]),
        "last": ([5, 6, 7], [0, 1, 2]),
        False: ([6, 7], [1, 2]),
    }[keep]

    expected = xr.DataArray(
        data, dims="time", coords={"time": time}, name="test"
    ).to_dataset()
    assert_equal(expected, ds.drop_duplicates("time", keep=keep))


@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_multi_dim(keep):
    # 5x5 dataset whose "lat" and "lon" indexes each contain one duplicate.
    values = np.stack([np.arange(0, 5) * i for i in np.arange(0, 5)])
    lats = [0, 1, 2, 2, 3]
    lons = [0, 1, 3, 3, 4]
    ds = xr.DataArray(
        values,
        coords={"lat": lats, "lon": lons},
        dims=["lat", "lon"],
        name="test",
    ).to_dataset()

    # Positions that survive deduplication (identical for both dimensions
    # because the duplicate sits at index 2/3 in each).
    idx = {"first": [0, 1, 2, 4], "last": [0, 1, 3, 4], False: [0, 1, 4]}[keep]

    expected = xr.DataArray(
        values[idx][:, idx],
        dims=["lat", "lon"],
        coords={"lat": [lats[i] for i in idx], "lon": [lons[i] for i in idx]},
        name="test",
    ).to_dataset()
    assert_equal(expected, ds.drop_duplicates(["lat", "lon"], keep=keep))