Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace SortedKeysDict with dict #4753

Merged
merged 19 commits into from
May 19, 2021
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ Internal Changes
``Dataset.xindexes`` / ``DataArray.xindexes`` properties. Also rename
``PandasIndexAdapter`` to ``PandasIndex``, which now inherits from
``xarray.Index`` (:pull:`5102`). By `Benoit Bovy <https://github.com/benbovy>`_.
- Replace ``SortedKeysDict`` with python's ``dict``, given dicts now preserve insertion order.
max-sixty marked this conversation as resolved.
Show resolved Hide resolved
By `Maximilian Roos <https://github.com/max-sixty>`_.

.. _whats-new.0.18.0:

Expand Down
24 changes: 11 additions & 13 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def combine_by_coords(
variable names to fill values. Use a data array's name to
refer to its values. If None, raises a ValueError if
the passed Datasets do not create a complete hypercube.
join : {"outer", "inner", "left", "right", "exact"}, optional
join : {"outer", "inner", "left", "right", "exact", "override"}, optional
String indicating how to combine differing indexes in objects

- "outer": use the union of object indexes
Expand Down Expand Up @@ -676,9 +676,6 @@ def combine_by_coords(
they are concatenated based on the values in their dimension coordinates,
not on their position in the list passed to `combine_by_coords`.

>>> import numpy as np
>>> import xarray as xr

>>> x1 = xr.Dataset(
... {
... "temperature": (("y", "x"), 20 * np.random.rand(6).reshape(2, 3)),
Expand All @@ -703,7 +700,7 @@ def combine_by_coords(

>>> x1
<xarray.Dataset>
Dimensions: (x: 3, y: 2)
Dimensions: (y: 2, x: 3)
dcherian marked this conversation as resolved.
Show resolved Hide resolved
Coordinates:
* y (y) int64 0 1
* x (x) int64 10 20 30
Expand All @@ -713,7 +710,7 @@ def combine_by_coords(

>>> x2
<xarray.Dataset>
Dimensions: (x: 3, y: 2)
Dimensions: (y: 2, x: 3)
Coordinates:
* y (y) int64 2 3
* x (x) int64 10 20 30
Expand All @@ -723,7 +720,7 @@ def combine_by_coords(

>>> x3
<xarray.Dataset>
Dimensions: (x: 3, y: 2)
Dimensions: (y: 2, x: 3)
Coordinates:
* y (y) int64 2 3
* x (x) int64 40 50 60
Expand All @@ -733,7 +730,7 @@ def combine_by_coords(

>>> xr.combine_by_coords([x2, x1])
<xarray.Dataset>
Dimensions: (x: 3, y: 4)
Dimensions: (y: 4, x: 3)
Coordinates:
* y (y) int64 0 1 2 3
* x (x) int64 10 20 30
Expand All @@ -743,17 +740,18 @@ def combine_by_coords(

>>> xr.combine_by_coords([x3, x1])
<xarray.Dataset>
Dimensions: (x: 6, y: 4)
Dimensions: (y: 4, x: 6)
Coordinates:
* x (x) int64 10 20 30 40 50 60
* y (y) int64 0 1 2 3
* x (x) int64 10 20 30 40 50 60
Data variables:
temperature (y, x) float64 10.98 14.3 12.06 nan ... nan 18.89 10.44 8.293
precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176

# FIXME
>>> xr.combine_by_coords([x3, x1], join="override")
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the existing behavior here a bug?

In [16]: x1.dims
Out[16]: Frozen({'y': 2, 'x': 3})

In [17]: x3.dims
Out[17]: Frozen({'y': 2, 'x': 3})

In [18]: xr.combine_by_coords([x3, x1], join="override")
Out[18]:
    <xarray.Dataset>
    Dimensions:        (y: 4, x: 3)
    Coordinates:
      * x              (x) int64 10 20 30
      * y              (y) int64 0 1 2 3
    Data variables:
        temperature    (y, x) float64 10.98 14.3 12.06 10.9 ... 18.89 10.44 8.293
        precipitation  (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176

and override is documented as:

"override": if indexes are of same size, rewrite indexes to be
those of the first object with that dimension. Indexes for the same
dimension must have the same size in all objects.

So should the result y dim have length 4?

This is changing behavior because the order of dims is changing.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking the repr of x1 and x2 reveals that both x and y are valid concat dimensions (given join="override") and it just chooses the first one (arbitrarily). I think this PR did not break it but just revealed a problem (also #4824). This should be fixed in another PR.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for that reference. I agree.

Copy link
Member

@TomNicholas TomNicholas May 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a different bug from #4824?

<xarray.Dataset>
Dimensions: (x: 3, y: 4)
Dimensions: (y: 4, x: 3)
Coordinates:
* x (x) int64 10 20 30
* y (y) int64 0 1 2 3
Expand All @@ -763,10 +761,10 @@ def combine_by_coords(

>>> xr.combine_by_coords([x1, x2, x3])
<xarray.Dataset>
Dimensions: (x: 6, y: 4)
Dimensions: (y: 4, x: 6)
Coordinates:
* x (x) int64 10 20 30 40 50 60
* y (y) int64 0 1 2 3
* x (x) int64 10 20 30 40 50 60
Data variables:
temperature (y, x) float64 10.98 14.3 12.06 nan ... 18.89 10.44 8.293
precipitation (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
Expand Down
44 changes: 24 additions & 20 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
Default,
Frozen,
HybridMappingProxy,
SortedKeysDict,
OrderedSet,
_default,
decode_numpy_dict_values,
drop_dims_from_indexers,
Expand Down Expand Up @@ -657,7 +657,7 @@ class Dataset(DataWithCoords, DatasetArithmetic, Mapping):
... )
>>> ds
<xarray.Dataset>
Dimensions: (time: 3, x: 2, y: 2)
Dimensions: (x: 2, y: 2, time: 3)
Coordinates:
lon (x, y) float64 -99.83 -99.32 -99.79 -99.23
lat (x, y) float64 42.25 42.21 42.63 42.59
Expand Down Expand Up @@ -807,7 +807,7 @@ def dims(self) -> Mapping[Hashable, int]:
See `Dataset.sizes` and `DataArray.sizes` for consistently named
properties.
"""
return Frozen(SortedKeysDict(self._dims))
return Frozen(self._dims)

@property
def sizes(self) -> Mapping[Hashable, int]:
Expand Down Expand Up @@ -1348,7 +1348,7 @@ def _copy_listed(self, names: Iterable[Hashable]) -> "Dataset":
if (var_name,) == var.dims:
indexes[var_name] = var._to_xindex()

needed_dims: Set[Hashable] = set()
needed_dims: OrderedSet[Hashable] = OrderedSet()
for v in variables.values():
needed_dims.update(v.dims)

Expand Down Expand Up @@ -1996,7 +1996,7 @@ def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
"This can be fixed by calling unify_chunks()."
)
chunks[dim] = c
return Frozen(SortedKeysDict(chunks))
return Frozen(chunks)

def chunk(
self,
Expand Down Expand Up @@ -6372,30 +6372,32 @@ def filter_by_attrs(self, **kwargs):

Examples
--------
>>> # Create an example dataset:
>>> temp = 15 + 8 * np.random.randn(2, 2, 3)
>>> precip = 10 * np.random.rand(2, 2, 3)
>>> lon = [[-99.83, -99.32], [-99.79, -99.23]]
>>> lat = [[42.25, 42.21], [42.63, 42.59]]
>>> dims = ["x", "y", "time"]
>>> temp_attr = dict(standard_name="air_potential_temperature")
>>> precip_attr = dict(standard_name="convective_precipitation_flux")

>>> ds = xr.Dataset(
... {
... "temperature": (dims, temp, temp_attr),
... "precipitation": (dims, precip, precip_attr),
... },
... coords={
... "lon": (["x", "y"], lon),
... "lat": (["x", "y"], lat),
... "time": pd.date_range("2014-09-06", periods=3),
... "reference_time": pd.Timestamp("2014-09-05"),
... },
... dict(
... temperature=(dims, temp, temp_attr),
... precipitation=(dims, precip, precip_attr),
... ),
... coords=dict(
... lon=(["x", "y"], lon),
... lat=(["x", "y"], lat),
... time=pd.date_range("2014-09-06", periods=3),
... reference_time=pd.Timestamp("2014-09-05"),
... ),
... )
>>> # Get variables matching a specific standard_name.

Get variables matching a specific standard_name:

>>> ds.filter_by_attrs(standard_name="convective_precipitation_flux")
<xarray.Dataset>
Dimensions: (time: 3, x: 2, y: 2)
Dimensions: (x: 2, y: 2, time: 3)
Coordinates:
lon (x, y) float64 -99.83 -99.32 -99.79 -99.23
lat (x, y) float64 42.25 42.21 42.63 42.59
Expand All @@ -6404,11 +6406,13 @@ def filter_by_attrs(self, **kwargs):
Dimensions without coordinates: x, y
Data variables:
precipitation (x, y, time) float64 5.68 9.256 0.7104 ... 7.992 4.615 7.805
>>> # Get all variables that have a standard_name attribute.

Get all variables that have a standard_name attribute:

>>> standard_name = lambda v: v is not None
>>> ds.filter_by_attrs(standard_name=standard_name)
<xarray.Dataset>
Dimensions: (time: 3, x: 2, y: 2)
Dimensions: (x: 2, y: 2, time: 3)
Coordinates:
lon (x, y) float64 -99.83 -99.32 -99.79 -99.23
lat (x, y) float64 42.25 42.21 42.63 42.59
Expand Down
2 changes: 1 addition & 1 deletion xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,7 +636,7 @@ def quantile(
* x (x) int64 0 1
>>> ds.groupby("y").quantile([0, 0.5, 1], dim=...)
<xarray.Dataset>
Dimensions: (quantile: 3, y: 2)
Dimensions: (y: 2, quantile: 3)
Coordinates:
* quantile (quantile) float64 0.0 0.5 1.0
* y (y) int64 1 2
Expand Down
34 changes: 0 additions & 34 deletions xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,40 +481,6 @@ def __len__(self) -> int:
return len(self._keys)


class SortedKeysDict(MutableMapping[K, V]):
    """A wrapper for dictionary-like objects that always iterates over its
    items in sorted order by key but is otherwise equivalent to the underlying
    mapping.

    All reads and mutations are delegated to the wrapped ``mapping``; only
    the iteration order differs from it.
    """

    __slots__ = ("mapping",)

    def __init__(self, mapping: MutableMapping[K, V] = None):
        # Wrap the given mapping, or start with a fresh empty dict when
        # none is provided.
        self.mapping = {} if mapping is None else mapping

    def __getitem__(self, key: K) -> V:
        return self.mapping[key]

    def __setitem__(self, key: K, value: V) -> None:
        self.mapping[key] = value

    def __delitem__(self, key: K) -> None:
        del self.mapping[key]

    def __iter__(self) -> Iterator[K]:
        # Keys are re-sorted on every iteration, so the order stays correct
        # even after the underlying mapping is mutated.
        # see #4571 for the reason of the type ignore
        return iter(sorted(self.mapping))  # type: ignore[type-var]

    def __len__(self) -> int:
        return len(self.mapping)

    def __contains__(self, key: object) -> bool:
        # Delegate membership tests to the wrapped mapping rather than
        # relying on MutableMapping's __getitem__-based default.
        return key in self.mapping

    def __repr__(self) -> str:
        return "{}({!r})".format(type(self).__name__, self.mapping)


class OrderedSet(MutableSet[T]):
"""A simple ordered set.

Expand Down
62 changes: 32 additions & 30 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ def create_test_data(seed=None, add_attrs=True):
_dims = {"dim1": 8, "dim2": 9, "dim3": 10}

obj = Dataset()
obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
obj["dim2"] = ("dim2", 0.5 * np.arange(_dims["dim2"]))
obj["dim3"] = ("dim3", list("abcdefghij"))
obj["time"] = ("time", pd.date_range("2000-01-01", periods=20))
for v, dims in sorted(_vars.items()):
data = rs.normal(size=tuple(_dims[d] for d in dims))
obj[v] = (dims, data)
Expand Down Expand Up @@ -205,11 +205,11 @@ def test_repr(self):
expected = dedent(
"""\
<xarray.Dataset>
Dimensions: (dim1: 8, dim2: 9, dim3: 10, time: 20)
Dimensions: (dim2: 9, dim3: 10, time: 20, dim1: 8)
Coordinates:
* time (time) datetime64[ns] 2000-01-01 2000-01-02 ... 2000-01-20
* dim2 (dim2) float64 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0
* dim3 (dim3) %s 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j'
* time (time) datetime64[ns] 2000-01-01 2000-01-02 ... 2000-01-20
numbers (dim3) int64 0 1 2 0 0 1 1 2 2 3
Dimensions without coordinates: dim1
Data variables:
Expand Down Expand Up @@ -357,14 +357,14 @@ def test_info(self):
"""\
xarray.Dataset {
dimensions:
\tdim1 = 8 ;
\tdim2 = 9 ;
\tdim3 = 10 ;
\ttime = 20 ;
\tdim1 = 8 ;
\tdim3 = 10 ;

variables:
\tdatetime64[ns] time(time) ;
\tfloat64 dim2(dim2) ;
\tdatetime64[ns] time(time) ;
\tfloat64 var1(dim1, dim2) ;
\t\tvar1:foo = variable ;
\tfloat64 var2(dim1, dim2) ;
Expand Down Expand Up @@ -560,14 +560,13 @@ def test_constructor_with_coords(self):
def test_properties(self):
ds = create_test_data()
assert ds.dims == {"dim1": 8, "dim2": 9, "dim3": 10, "time": 20}
assert list(ds.dims) == sorted(ds.dims)
assert ds.sizes == ds.dims

# These exact types aren't public API, but this makes sure we don't
# change them inadvertently:
assert isinstance(ds.dims, utils.Frozen)
assert isinstance(ds.dims.mapping, utils.SortedKeysDict)
assert type(ds.dims.mapping.mapping) is dict
assert isinstance(ds.dims.mapping, dict)
assert type(ds.dims.mapping) is dict

assert list(ds) == list(ds.data_vars)
assert list(ds.keys()) == list(ds.data_vars)
Expand All @@ -593,7 +592,7 @@ def test_properties(self):
assert "dim2" in repr(ds.indexes)
assert all([isinstance(idx, pd.Index) for idx in ds.indexes.values()])

assert list(ds.coords) == ["time", "dim2", "dim3", "numbers"]
assert list(ds.coords) == ["dim2", "dim3", "time", "numbers"]
assert "dim2" in ds.coords
assert "numbers" in ds.coords
assert "var1" not in ds.coords
Expand Down Expand Up @@ -1035,14 +1034,14 @@ def test_isel(self):
ValueError,
match=r"Dimensions {'not_a_dim'} do not exist. Expected "
r"one or more of "
r"[\w\W]*'time'[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*",
r"[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*'time'[\w\W]*'dim\d'[\w\W]*",
):
data.isel(not_a_dim=slice(0, 2))
with pytest.warns(
UserWarning,
match=r"Dimensions {'not_a_dim'} do not exist. "
r"Expected one or more of "
r"[\w\W]*'time'[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*",
r"[\w\W]*'dim\d'[\w\W]*'dim\d'[\w\W]*'time'[\w\W]*'dim\d'[\w\W]*",
):
data.isel(not_a_dim=slice(0, 2), missing_dims="warn")
assert_identical(data, data.isel(not_a_dim=slice(0, 2), missing_dims="ignore"))
Expand Down Expand Up @@ -4823,10 +4822,10 @@ def test_reduce(self):
assert_equal(data.min(dim=["dim1"]), data.min(dim="dim1"))

for reduct, expected in [
("dim2", ["dim1", "dim3", "time"]),
(["dim2", "time"], ["dim1", "dim3"]),
(("dim2", "time"), ["dim1", "dim3"]),
((), ["dim1", "dim2", "dim3", "time"]),
("dim2", ["dim3", "time", "dim1"]),
(["dim2", "time"], ["dim3", "dim1"]),
(("dim2", "time"), ["dim3", "dim1"]),
((), ["dim2", "dim3", "time", "dim1"]),
]:
actual = list(data.min(dim=reduct).dims)
assert actual == expected
Expand Down Expand Up @@ -4876,21 +4875,24 @@ def test_reduce_cumsum(self):
)
assert_identical(expected, data.cumsum())

def test_reduce_cumsum_test_dims(self):
@pytest.mark.parametrize(
"reduct, expected",
[
("dim1", ["dim2", "dim3", "time", "dim1"]),
("dim2", ["dim3", "time", "dim1", "dim2"]),
("dim3", ["dim2", "time", "dim1", "dim3"]),
("time", ["dim2", "dim3", "dim1"]),
],
)
@pytest.mark.parametrize("func", ["cumsum", "cumprod"])
def test_reduce_cumsum_test_dims(self, reduct, expected, func):
data = create_test_data()
for cumfunc in ["cumsum", "cumprod"]:
with pytest.raises(ValueError, match=r"Dataset does not contain"):
getattr(data, cumfunc)(dim="bad_dim")

# ensure dimensions are correct
for reduct, expected in [
("dim1", ["dim1", "dim2", "dim3", "time"]),
("dim2", ["dim1", "dim2", "dim3", "time"]),
("dim3", ["dim1", "dim2", "dim3", "time"]),
("time", ["dim1", "dim2", "dim3"]),
]:
actual = getattr(data, cumfunc)(dim=reduct).dims
assert list(actual) == expected
with pytest.raises(ValueError, match=r"Dataset does not contain"):
getattr(data, func)(dim="bad_dim")

# ensure dimensions are correct
actual = getattr(data, func)(dim=reduct).dims
assert list(actual) == expected

def test_reduce_non_numeric(self):
data1 = create_test_data(seed=44)
Expand Down
Loading