From a08f911c8325b531fa18ef948b328c4447d4368e Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 10:37:47 +0200 Subject: [PATCH 1/7] add Dataset.dtypes property --- xarray/core/dataset.py | 38 ++++++++++++++++++++++++------------ xarray/tests/test_dataset.py | 23 ++++++++++++++++++++-- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4e2caf8e1cb..de8a5ff779a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -35,9 +35,9 @@ from ..coding.calendar_ops import convert_calendar, interp_calendar from ..coding.cftimeindex import CFTimeIndex, _parse_array_of_cftime_strings from ..plot.dataset_plot import _Dataset_PlotMethods +from . import alignment +from . import dtypes as xrdtypes from . import ( - alignment, - dtypes, duck_array_ops, formatting, formatting_html, @@ -677,6 +677,18 @@ def sizes(self) -> Frozen[Hashable, int]: """ return self.dims + @property + def dtypes(self) -> Frozen[Hashable, np.dtype]: + """Mapping from variable names to xrdtypes. + + Cannot be modified directly, but is updated when adding new variables. + + See Also + -------- + DataArray.dtype + """ + return Frozen({n: v.dtype for n, v in self._variables.items()}) + def load(self: T_Dataset, **kwargs) -> T_Dataset: """Manually trigger loading and/or computation of this dataset's data from disk or a remote source into memory and return this dataset. @@ -2791,7 +2803,7 @@ def reindex_like( method: ReindexMethodOptions = None, tolerance: int | float | Iterable[int | float] | None = None, copy: bool = True, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, ) -> T_Dataset: """Conform this object onto the indexes of another object, filling in missing values with ``fill_value``. The default fill value is NaN. 
@@ -2857,7 +2869,7 @@ def reindex( method: ReindexMethodOptions = None, tolerance: int | float | Iterable[int | float] | None = None, copy: bool = True, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, **indexers_kwargs: Any, ) -> T_Dataset: """Conform this object onto a new set of indexes, filling in @@ -3073,7 +3085,7 @@ def _reindex( method: str = None, tolerance: int | float | Iterable[int | float] | None = None, copy: bool = True, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, sparse: bool = False, **indexers_kwargs: Any, ) -> T_Dataset: @@ -4531,7 +4543,7 @@ def _unstack_full_reindex( def unstack( self: T_Dataset, dim: Hashable | Iterable[Hashable] | None = None, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, sparse: bool = False, ) -> T_Dataset: """ @@ -4676,7 +4688,7 @@ def merge( overwrite_vars: Hashable | Iterable[Hashable] = frozenset(), compat: CompatOptions = "no_conflicts", join: JoinOptions = "outer", - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, combine_attrs: CombineAttrsOptions = "override", ) -> T_Dataset: """Merge the arrays of two datasets into a single dataset. @@ -5885,7 +5897,7 @@ def _set_sparse_data_from_dataframe( # missing values and needs a fill_value. For consistency, don't # special case the rare exceptions (e.g., dtype=int without a # MultiIndex). 
- dtype, fill_value = dtypes.maybe_promote(values.dtype) + dtype, fill_value = xrdtypes.maybe_promote(values.dtype) values = np.asarray(values, dtype=dtype) data = COO( @@ -5923,7 +5935,7 @@ def _set_numpy_data_from_dataframe( # fill in missing values: # https://stackoverflow.com/a/35049899/809705 if missing_values: - dtype, fill_value = dtypes.maybe_promote(values.dtype) + dtype, fill_value = xrdtypes.maybe_promote(values.dtype) data = np.full(shape, fill_value, dtype) else: # If there are no missing values, keep the existing dtype @@ -6414,7 +6426,7 @@ def diff( def shift( self: T_Dataset, shifts: Mapping[Any, int] | None = None, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, **shifts_kwargs: int, ) -> T_Dataset: @@ -6469,7 +6481,7 @@ def shift( for name, var in self.variables.items(): if name in self.data_vars: fill_value_ = ( - fill_value.get(name, dtypes.NA) + fill_value.get(name, xrdtypes.NA) if isinstance(fill_value, dict) else fill_value ) @@ -7743,7 +7755,7 @@ def idxmin( self: T_Dataset, dim: Hashable | None = None, skipna: bool | None = None, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, keep_attrs: bool | None = None, ) -> T_Dataset: """Return the coordinate label of the minimum value along a dimension. @@ -7840,7 +7852,7 @@ def idxmax( self: T_Dataset, dim: Hashable | None = None, skipna: bool | None = None, - fill_value: Any = dtypes.NA, + fill_value: Any = xrdtypes.NA, keep_attrs: bool | None = None, ) -> T_Dataset: """Return the coordinate label of the maximum value along a dimension. 
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index a2c1ae1fc12..dd03d1e5785 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -571,15 +571,30 @@ def test_constructor_with_coords(self) -> None: def test_properties(self) -> None: ds = create_test_data() - assert ds.dims == {"dim1": 8, "dim2": 9, "dim3": 10, "time": 20} - assert ds.sizes == ds.dims + # dims / sizes # These exact types aren't public API, but this makes sure we don't # change them inadvertently: assert isinstance(ds.dims, utils.Frozen) assert isinstance(ds.dims.mapping, dict) assert type(ds.dims.mapping) is dict + assert ds.dims == {"dim1": 8, "dim2": 9, "dim3": 10, "time": 20} + assert ds.sizes == ds.dims + # dtypes + assert isinstance(ds.dtypes, utils.Frozen) + assert isinstance(ds.dtypes.mapping, dict) + assert ds.dtypes == { + "dim2": np.dtype("float64"), + "dim3": np.dtype(" None: assert "numbers" not in ds.data_vars assert len(ds.data_vars) == 3 + # xindexes assert set(ds.xindexes) == {"dim2", "dim3", "time"} assert len(ds.xindexes) == 3 assert "dim2" in repr(ds.xindexes) assert all([isinstance(idx, Index) for idx in ds.xindexes.values()]) + # indexes assert set(ds.indexes) == {"dim2", "dim3", "time"} assert len(ds.indexes) == 3 assert "dim2" in repr(ds.indexes) assert all([isinstance(idx, pd.Index) for idx in ds.indexes.values()]) + # coords assert list(ds.coords) == ["dim2", "dim3", "time", "numbers"] assert "dim2" in ds.coords assert "numbers" in ds.coords @@ -611,6 +629,7 @@ def test_properties(self) -> None: assert "dim1" not in ds.coords assert len(ds.coords) == 4 + # nbytes assert ( Dataset({"x": np.int64(1), "y": np.array([1, 2], dtype=np.float32)}).nbytes == 16 From e6d178b9554b6f3c9f4253265104aa67f4a7bc0d Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 10:42:00 +0200 Subject: [PATCH 2/7] add Dataset.dtypes to whats-new --- doc/whats-new.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats-new.rst 
b/doc/whats-new.rst index 3c53d3bfb04..2403db00b24 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,9 @@ v2022.06.0 (unreleased) New Features ~~~~~~~~~~~~ +- Add :py:meth:`Dataset.dtypes` property: Mapping from variable names to dtypes. + (:pull:`6706`) + By `Michael Niklas <https://github.com/headtr1ck>`_. Deprecations ~~~~~~~~~~~~ From c4ee212cd476788c4f9f59989efe604990c2aeb8 Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 10:42:51 +0200 Subject: [PATCH 3/7] add Dataset.dtypes to api --- doc/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/api.rst b/doc/api.rst index 644b86cdebb..810c1a92682 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -61,6 +61,7 @@ Attributes Dataset.dims Dataset.sizes + Dataset.dtypes Dataset.data_vars Dataset.coords Dataset.attrs From 27b5314aac7e6d938c8a672b47f484f20ff68048 Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 10:45:05 +0200 Subject: [PATCH 4/7] fix typo --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index de8a5ff779a..624e002c1f6 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -679,7 +679,7 @@ def sizes(self) -> Frozen[Hashable, int]: @property def dtypes(self) -> Frozen[Hashable, np.dtype]: - """Mapping from variable names to xrdtypes. + """Mapping from variable names to dtypes. Cannot be modified directly, but is updated when adding new variables. 
From aee5901d70217251cd4781b049eed4e88bbf9ba0 Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 10:51:14 +0200 Subject: [PATCH 5/7] fix mypy issue --- xarray/core/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 624e002c1f6..59dd2ba0596 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6942,7 +6942,9 @@ def differentiate( dim = coord_var.dims[0] if _contains_datetime_like_objects(coord_var): if coord_var.dtype.kind in "mM" and datetime_unit is None: - datetime_unit, _ = np.datetime_data(coord_var.dtype) + datetime_unit = cast( + "DatetimeUnitOptions", np.datetime_data(coord_var.dtype)[0] + ) elif datetime_unit is None: datetime_unit = "s" # Default to seconds for cftime objects coord_var = coord_var._to_numeric(datetime_unit=datetime_unit) From 1ec566586994ca2b667dc75af67272f47834e58a Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 21:46:51 +0200 Subject: [PATCH 6/7] dtypes property for DataArrayCoordinates, DataVariables and DatasetCoordinates --- xarray/core/coordinates.py | 34 ++++++++++++++++++++++ xarray/core/dataarray.py | 4 +-- xarray/core/dataset.py | 22 ++++++++++++-- xarray/tests/test_dataarray.py | 10 +++++-- xarray/tests/test_dataset.py | 52 +++++++++++++++++++++++----------- 5 files changed, 100 insertions(+), 22 deletions(-) diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index fac1e64cd94..cd80d8e2fb0 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -38,6 +38,10 @@ def _names(self) -> set[Hashable]: def dims(self) -> Mapping[Hashable, int] | tuple[Hashable, ...]: raise NotImplementedError() + @property + def dtypes(self) -> Frozen[Hashable, np.dtype]: + raise NotImplementedError() + @property def indexes(self) -> Indexes[pd.Index]: return self._data.indexes # type: ignore[attr-defined] @@ -242,6 +246,24 @@ def _names(self) -> set[Hashable]: def dims(self) -> 
Mapping[Hashable, int]: return self._data.dims + @property + def dtypes(self) -> Frozen[Hashable, np.dtype]: + """Mapping from coordinate names to dtypes. + + Cannot be modified directly, but is updated when adding new variables. + + See Also + -------- + Dataset.dtypes + """ + return Frozen( + { + n: v.dtype + for n, v in self._data._variables.items() + if n in self._data._coord_names + } + ) + @property def variables(self) -> Mapping[Hashable, Variable]: return Frozen( @@ -313,6 +335,18 @@ def __init__(self, dataarray: DataArray): def dims(self) -> tuple[Hashable, ...]: return self._data.dims + @property + def dtypes(self) -> Frozen[Hashable, np.dtype]: + """Mapping from coordinate names to dtypes. + + Cannot be modified directly, but is updated when adding new variables. + + See Also + -------- + DataArray.dtype + """ + return Frozen({n: v.dtype for n, v in self._data._coords.items()}) + @property def _names(self) -> set[Hashable]: return set(self._data._coords) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index fc4ff2d5783..0f34d6681ba 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -102,7 +102,7 @@ def _infer_coords_and_dims( shape, coords, dims -) -> tuple[dict[Any, Variable], tuple[Hashable, ...]]: +) -> tuple[dict[Hashable, Variable], tuple[Hashable, ...]]: """All the logic for creating a new DataArray""" if ( @@ -140,7 +140,7 @@ def _infer_coords_and_dims( if not isinstance(d, str): raise TypeError(f"dimension {d} is not a string") - new_coords: dict[Any, Variable] = {} + new_coords: dict[Hashable, Variable] = {} if utils.is_dict_like(coords): for k, v in coords.items(): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 59dd2ba0596..ce9500e72b0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -385,6 +385,18 @@ def variables(self) -> Mapping[Hashable, Variable]: all_variables = self._dataset.variables return Frozen({k: all_variables[k] for k in self}) + @property + def 
dtypes(self) -> Frozen[Hashable, np.dtype]: + """Mapping from data variable names to dtypes. + + Cannot be modified directly, but is updated when adding new variables. + + See Also + -------- + Dataset.dtype + """ + return self._dataset.dtypes + def _ipython_key_completions_(self): """Provide method for the key-autocompletions in IPython.""" return [ @@ -679,7 +691,7 @@ def sizes(self) -> Frozen[Hashable, int]: @property def dtypes(self) -> Frozen[Hashable, np.dtype]: - """Mapping from variable names to dtypes. + """Mapping from data variable names to dtypes. Cannot be modified directly, but is updated when adding new variables. @@ -687,7 +699,13 @@ def dtypes(self) -> Frozen[Hashable, np.dtype]: -------- DataArray.dtype """ - return Frozen({n: v.dtype for n, v in self._variables.items()}) + return Frozen( + { + n: v.dtype + for n, v in self._variables.items() + if n not in self._coord_names + } + ) def load(self: T_Dataset, **kwargs) -> T_Dataset: """Manually trigger loading and/or computation of this dataset's data diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d4f7b0f096f..3b1b1da651f 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1321,9 +1321,11 @@ def test_coords(self) -> None: ] da = DataArray(np.random.randn(2, 3), coords, name="foo") - assert 2 == len(da.coords) + # len + assert len(da.coords) == 2 - assert ["x", "y"] == list(da.coords) + # iter + assert list(da.coords) == ["x", "y"] assert coords[0].identical(da.coords["x"]) assert coords[1].identical(da.coords["y"]) @@ -1337,6 +1339,7 @@ def test_coords(self) -> None: with pytest.raises(KeyError): da.coords["foo"] + # repr expected_repr = dedent( """\ Coordinates: @@ -1346,6 +1349,9 @@ def test_coords(self) -> None: actual = repr(da.coords) assert expected_repr == actual + # dtypes + assert da.coords.dtypes == {"x": np.dtype("int64"), "y": np.dtype("int64")} + del da.coords["x"] da._indexes = filter_indexes_from_coords(da.xindexes, 
set(da.coords)) expected = DataArray(da.values, {"y": [0, 1, 2]}, dims=["x", "y"], name="foo") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index dd03d1e5785..1d5c7787fef 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -30,6 +30,7 @@ from xarray.coding.cftimeindex import CFTimeIndex from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like +from xarray.core.coordinates import DatasetCoordinates from xarray.core.indexes import Index from xarray.core.pycompat import integer_types, sparse_array_type from xarray.core.utils import is_scalar @@ -585,13 +586,9 @@ def test_properties(self) -> None: assert isinstance(ds.dtypes, utils.Frozen) assert isinstance(ds.dtypes.mapping, dict) assert ds.dtypes == { - "dim2": np.dtype("float64"), - "dim3": np.dtype(" None: {"a": ("x", np.array([4, 5], "int64")), "b": np.int64(-10)}, ) - assert 4 == len(data.coords) + coords = data.coords + assert isinstance(coords, DatasetCoordinates) - assert ["x", "y", "a", "b"] == list(data.coords) + # len + assert len(coords) == 4 - assert_identical(data.coords["x"].variable, data["x"].variable) - assert_identical(data.coords["y"].variable, data["y"].variable) + # iter + assert list(coords) == ["x", "y", "a", "b"] - assert "x" in data.coords - assert "a" in data.coords - assert 0 not in data.coords - assert "foo" not in data.coords + assert_identical(coords["x"].variable, data["x"].variable) + assert_identical(coords["y"].variable, data["y"].variable) + + assert "x" in coords + assert "a" in coords + assert 0 not in coords + assert "foo" not in coords with pytest.raises(KeyError): - data.coords["foo"] + coords["foo"] with pytest.raises(KeyError): - data.coords[0] + coords[0] + # repr expected = dedent( """\ Coordinates: @@ -741,10 +744,19 @@ def test_coords_properties(self) -> None: a (x) int64 4 5 b int64 -10""" ) - actual = repr(data.coords) + actual = repr(coords) assert expected == actual 
- assert {"x": 2, "y": 3} == data.coords.dims + # dims + assert coords.dims == {"x": 2, "y": 3} + + # dtypes + assert coords.dtypes == { + "x": np.dtype("int64"), + "y": np.dtype("int64"), + "a": np.dtype("int64"), + "b": np.dtype("int64"), + } def test_coords_modify(self) -> None: data = Dataset( @@ -912,11 +924,13 @@ def test_data_vars_properties(self) -> None: ds["foo"] = (("x",), [1.0]) ds["bar"] = 2.0 + # iter assert set(ds.data_vars) == {"foo", "bar"} assert "foo" in ds.data_vars assert "x" not in ds.data_vars assert_identical(ds["foo"], ds.data_vars["foo"]) + # repr expected = dedent( """\ Data variables: @@ -926,6 +940,12 @@ def test_data_vars_properties(self) -> None: actual = repr(ds.data_vars) assert expected == actual + # dtypes + assert ds.data_vars.dtypes == { + "foo": np.dtype("float64"), + "bar": np.dtype("float64"), + } + def test_equals_and_identical(self) -> None: data = create_test_data(seed=42) assert data.equals(data) From 364e8550b13166b30621e8e83302f673d991291d Mon Sep 17 00:00:00 2001 From: Michael Niklas Date: Sun, 19 Jun 2022 21:49:00 +0200 Subject: [PATCH 7/7] update whats new --- doc/whats-new.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2403db00b24..b2a20bade3b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,7 +22,8 @@ v2022.06.0 (unreleased) New Features ~~~~~~~~~~~~ -- Add :py:meth:`Dataset.dtypes` property: Mapping from variable names to dtypes. +- Add :py:meth:`Dataset.dtypes`, :py:meth:`DatasetCoordinates.dtypes`, + :py:meth:`DataArrayCoordinates.dtypes` properties: Mapping from variable names to dtypes. (:pull:`6706`) By `Michael Niklas <https://github.com/headtr1ck>`_.