Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add encoder, decoder for MultiIndexes. #321

Merged
merged 9 commits into from
Apr 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cf_xarray/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from .accessor import CFAccessor # noqa
from .coding import ( # noqa
decode_compress_to_multi_index,
encode_multi_index_as_compress,
)
from .geometry import cf_to_shapely, shapely_to_cf # noqa
from .helpers import bounds_to_vertices, vertices_to_bounds # noqa
from .options import set_options # noqa
Expand Down
120 changes: 120 additions & 0 deletions cf_xarray/coding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""
Encoders and decoders for CF conventions not implemented by Xarray.
"""
import numpy as np
import pandas as pd
import xarray as xr


def encode_multi_index_as_compress(ds, idxnames=None):
"""
Encode a MultiIndexed dimension using the "compression by gathering" CF convention.

Parameters
----------
ds : xarray.Dataset
Dataset with at least one MultiIndexed dimension
idxnames : hashable or iterable of hashable, optional
Dimensions that are MultiIndex-ed. If None, will detect all MultiIndex-ed dimensions.

Returns
-------
xarray.Dataset
Encoded Dataset with ``name`` as a integer coordinate with a ``"compress"`` attribute.

References
----------
CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
"""
if idxnames is None:
idxnames = tuple(
name
for name, idx in ds.indexes.items()
if isinstance(idx, pd.MultiIndex)
# After the flexible indexes refactor, all MultiIndex Levels
# have a MultiIndex but the name won't match.
# Prior to that refactor, there is only a single MultiIndex with name=None
and (idx.name == name if idx.name is not None else True)
)
elif isinstance(idxnames, str):
idxnames = (idxnames,)

if not idxnames:
raise ValueError("No MultiIndex-ed dimensions found in Dataset.")

encoded = ds.reset_index(idxnames)
for idxname in idxnames:
mindex = ds.indexes[idxname]
coords = dict(zip(mindex.names, mindex.levels))
encoded.update(coords)
encoded[idxname] = np.ravel_multi_index(mindex.codes, mindex.levshape)
encoded[idxname].attrs = ds[idxname].attrs
if (
"compress" in encoded[idxname].encoding
or "compress" in encoded[idxname].attrs
):
raise ValueError(
f"Does not support the 'compress' attribute in {idxname}.encoding or {idxname}.attrs. "
"This is generated automatically."
)
encoded[idxname].attrs["compress"] = " ".join(mindex.names)
return encoded


def decode_compress_to_multi_index(encoded, idxnames=None):
"""
Decode a compressed variable to a pandas MultiIndex.

Parameters
----------
encoded : xarray.Dataset
Encoded Dataset with variables that use "compression by gathering".capitalize
idxnames : hashable or iterable of hashable, optional
Variable names that represents a compressed dimension. These variables must have
the attribute ``"compress"``. If None, will detect all indexes with a ``"compress"``
attribute and decode those.

Returns
-------
xarray.Dataset
Decoded Dataset with ``name`` as a MultiIndexed dimension.

References
----------
CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
"""
decoded = xr.Dataset()
if idxnames is None:
idxnames = tuple(
name for name in encoded.indexes if "compress" in encoded[name].attrs
)
elif isinstance(idxnames, str):
idxnames = (idxnames,)

for idxname in idxnames:
if "compress" not in encoded[idxname].attrs:
raise ValueError("Attribute 'compress' not found in provided Dataset.")

if not isinstance(encoded, xr.Dataset):
raise ValueError(
f"Must provide a Dataset. Received {type(encoded)} instead."
)

names = encoded[idxname].attrs["compress"].split(" ")
shape = [encoded.sizes[dim] for dim in names]
indices = np.unravel_index(encoded.landpoint.data, shape)
arrays = [encoded[dim].data[index] for dim, index in zip(names, indices)]
mindex = pd.MultiIndex.from_arrays(arrays, names=names)

decoded.coords[idxname] = mindex
decoded.coords[idxname].attrs = encoded[idxname].attrs.copy()
del decoded[idxname].attrs["compress"]

for varname in encoded.data_vars:
if idxname in encoded[varname].dims:
decoded[varname] = (
idxname,
encoded[varname].data,
encoded[varname].attrs,
)
return decoded
34 changes: 34 additions & 0 deletions cf_xarray/tests/test_coding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import numpy as np
import pandas as pd
import pytest
import xarray as xr

import cf_xarray as cfxr


@pytest.mark.parametrize(
"mindex",
[
pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("lat", "lon")),
pd.MultiIndex.from_arrays(
[["a", "b", "c", "d"], [1, 2, 4, 10]], names=("lat", "lon")
),
pd.MultiIndex.from_arrays(
[["a", "b", "b", "a"], [1, 2, 1, 2]], names=("lat", "lon")
),
],
)
@pytest.mark.parametrize("idxnames", ["landpoint", ("landpoint",), None])
def test_compression_by_gathering_multi_index_roundtrip(mindex, idxnames):
dataset = xr.Dataset(
{"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})},
{"landpoint": ("landpoint", mindex, {"long_name": "land point number"})},
)
encoded = cfxr.encode_multi_index_as_compress(dataset, idxnames)
roundtrip = cfxr.decode_compress_to_multi_index(encoded, idxnames)
assert "compress" not in roundtrip["landpoint"].encoding
xr.testing.assert_identical(roundtrip, dataset)

dataset["landpoint"].attrs["compress"] = "lat lon"
with pytest.raises(ValueError):
cfxr.encode_multi_index_as_compress(dataset, idxnames)
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Top-level API
shapely_to_cf
cf_to_shapely
set_options
encode_multi_index_as_compress
decode_compress_to_multi_index

.. currentmodule:: xarray

Expand Down
70 changes: 70 additions & 0 deletions doc/coding.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
jupytext:
text_representation:
format_name: myst
kernelspec:
display_name: Python 3
name: python3
---
```{eval-rst}
.. currentmodule:: cf_xarray
```
```{code-cell}
---
tags: [remove-cell]
---
import cf_xarray as cfxr
import numpy as np
import pandas as pd
import xarray as xr
xr.set_options(display_expand_data=False)
```


# Encoding and decoding

`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray.

## Compression by gathering

The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering)
convention could be used for either {py:class}`pandas.MultiIndex` objects or `pydata/sparse` arrays.

### MultiIndex

``cf_xarray`` provides {py:func}`encode_multi_index_as_compress` and {py:func}`decode_compress_to_multi_index` to encode MultiIndex-ed
dimensions using "compression by gethering".

Here's a test dataset
```{code-cell}
ds = xr.Dataset(
{"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})},
{
"landpoint": pd.MultiIndex.from_product(
[["a", "b"], [1, 2]], names=("lat", "lon")
)
},
)
ds
```
First encode (note the `"compress"` attribute on the `landpoint` variable)
```{code-cell}
encoded = cfxr.encode_multi_index_as_compress(ds, "landpoint")
encoded
```

At this point, we can write `encoded` to a CF-compliant dataset using {py:func}`xarray.Dataset.to_netcdf` for example.
After reading that file, decode using
```{code-cell}
decoded = cfxr.decode_compress_to_multi_index(encoded, "landpoint")
decoded
```

We roundtrip perfectly
```{code-cell}
ds.identical(decoded)
```

### Sparse arrays

This is unsupported currently but a pull request is welcome!
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"xarray": ("https://xarray.pydata.org/en/stable/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
}

autosummary_generate = True
Expand Down
1 change: 1 addition & 0 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ or using ``conda``
units
parametricz
bounds
coding
dsg
geometry
plotting
Expand Down
3 changes: 2 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ What's New

v 0.7.1 (unreleased)
====================
- added encoder and decoder for writing pandas MultiIndex-es to file using "compression by gathering".
See :doc:`coding` for more. By `Deepak Cherian`_.
- added another type of vertical coordinate to decode: ``ocean_sigma_coordinate``. By `Kristen Thyng`_.


v0.7.0 (January 24, 2022)
=========================
- Many improvements to autoguessing for plotting. By `Deepak Cherian`_
Expand Down