Skip to content

Commit

Permalink
Add encoder, decoder for MultiIndexes. (#321)
Browse files Browse the repository at this point in the history
  • Loading branch information
dcherian authored Apr 2, 2022
1 parent 244cc26 commit f458020
Show file tree
Hide file tree
Showing 8 changed files with 234 additions and 1 deletion.
4 changes: 4 additions & 0 deletions cf_xarray/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from .accessor import CFAccessor # noqa
from .coding import ( # noqa
decode_compress_to_multi_index,
encode_multi_index_as_compress,
)
from .geometry import cf_to_shapely, shapely_to_cf # noqa
from .helpers import bounds_to_vertices, vertices_to_bounds # noqa
from .options import set_options # noqa
Expand Down
120 changes: 120 additions & 0 deletions cf_xarray/coding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
"""
Encoders and decoders for CF conventions not implemented by Xarray.
"""
import numpy as np
import pandas as pd
import xarray as xr


def encode_multi_index_as_compress(ds, idxnames=None):
    """
    Encode MultiIndexed dimensions with the CF "compression by gathering" convention.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with at least one MultiIndexed dimension.
    idxnames : hashable or iterable of hashable, optional
        Dimensions that are MultiIndex-ed. If None, every MultiIndex-ed dimension
        found in ``ds.indexes`` is encoded.

    Returns
    -------
    xarray.Dataset
        Encoded Dataset in which each name in ``idxnames`` is an integer coordinate
        with a ``"compress"`` attribute listing the MultiIndex level names.

    Raises
    ------
    ValueError
        If no MultiIndex-ed dimension is selected, or if the variable being encoded
        already carries a ``"compress"`` entry in its attrs or encoding.

    References
    ----------
    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
    """
    if isinstance(idxnames, str):
        idxnames = (idxnames,)
    elif idxnames is None:
        # After the flexible indexes refactor, all MultiIndex levels have a
        # MultiIndex but the name won't match. Prior to that refactor, there is
        # only a single MultiIndex with name=None.
        idxnames = tuple(
            name
            for name, idx in ds.indexes.items()
            if isinstance(idx, pd.MultiIndex)
            and (idx.name is None or idx.name == name)
        )

    if not idxnames:
        raise ValueError("No MultiIndex-ed dimensions found in Dataset.")

    encoded = ds.reset_index(idxnames)
    for idxname in idxnames:
        mindex = ds.indexes[idxname]

        # Each level becomes its own coordinate holding the unique level values.
        level_coords = {
            level_name: level
            for level_name, level in zip(mindex.names, mindex.levels)
        }
        encoded.update(level_coords)

        # Collapse the per-level integer codes into one flat index per point.
        flat_index = np.ravel_multi_index(mindex.codes, mindex.levshape)
        encoded[idxname] = flat_index
        encoded[idxname].attrs = ds[idxname].attrs

        gathered = encoded[idxname]
        has_compress = "compress" in gathered.encoding or "compress" in gathered.attrs
        if has_compress:
            raise ValueError(
                f"Does not support the 'compress' attribute in {idxname}.encoding or {idxname}.attrs. "
                "This is generated automatically."
            )
        encoded[idxname].attrs["compress"] = " ".join(mindex.names)
    return encoded


def decode_compress_to_multi_index(encoded, idxnames=None):
    """
    Decode a compressed variable to a pandas MultiIndex.

    Parameters
    ----------
    encoded : xarray.Dataset
        Encoded Dataset with variables that use "compression by gathering".
    idxnames : hashable or iterable of hashable, optional
        Variable names that represent a compressed dimension. These variables must have
        the attribute ``"compress"``. If None, will detect all indexes with a ``"compress"``
        attribute and decode those.

    Returns
    -------
    xarray.Dataset
        Decoded Dataset with each name in ``idxnames`` as a MultiIndexed dimension.
        Only data variables that have a decoded dimension are carried over; each is
        re-created along that single dimension with its original attrs.

    Raises
    ------
    ValueError
        If ``encoded`` is not a Dataset, or if a requested variable has no
        ``"compress"`` attribute.

    References
    ----------
    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
    """
    # Validate the container before touching any Dataset-only attributes.
    if not isinstance(encoded, xr.Dataset):
        raise ValueError(
            f"Must provide a Dataset. Received {type(encoded)} instead."
        )

    decoded = xr.Dataset()
    if idxnames is None:
        idxnames = tuple(
            name for name in encoded.indexes if "compress" in encoded[name].attrs
        )
    elif isinstance(idxnames, str):
        idxnames = (idxnames,)

    for idxname in idxnames:
        if "compress" not in encoded[idxname].attrs:
            raise ValueError("Attribute 'compress' not found in provided Dataset.")

        # "compress" is a blank-separated list of the dimensions that were gathered;
        # split() without an argument tolerates repeated whitespace.
        names = encoded[idxname].attrs["compress"].split()
        shape = [encoded.sizes[dim] for dim in names]

        # Use the variable being decoded (not a hard-coded name) to recover the
        # per-level positions from the flattened integer index.
        indices = np.unravel_index(encoded[idxname].data, shape)
        arrays = [encoded[dim].data[index] for dim, index in zip(names, indices)]
        mindex = pd.MultiIndex.from_arrays(arrays, names=names)

        decoded.coords[idxname] = mindex
        # Copy so that removing "compress" does not modify the encoded Dataset.
        decoded.coords[idxname].attrs = encoded[idxname].attrs.copy()
        del decoded[idxname].attrs["compress"]

        for varname in encoded.data_vars:
            if idxname in encoded[varname].dims:
                decoded[varname] = (
                    idxname,
                    encoded[varname].data,
                    encoded[varname].attrs,
                )
    return decoded
34 changes: 34 additions & 0 deletions cf_xarray/tests/test_coding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import numpy as np
import pandas as pd
import pytest
import xarray as xr

import cf_xarray as cfxr


@pytest.mark.parametrize(
    "mindex",
    [
        pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("lat", "lon")),
        pd.MultiIndex.from_arrays(
            [["a", "b", "c", "d"], [1, 2, 4, 10]], names=("lat", "lon")
        ),
        pd.MultiIndex.from_arrays(
            [["a", "b", "b", "a"], [1, 2, 1, 2]], names=("lat", "lon")
        ),
    ],
)
@pytest.mark.parametrize("idxnames", ["landpoint", ("landpoint",), None])
def test_compression_by_gathering_multi_index_roundtrip(mindex, idxnames):
    """Encoding then decoding a MultiIndexed Dataset must return an identical Dataset."""
    data_vars = {"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})}
    coords = {"landpoint": ("landpoint", mindex, {"long_name": "land point number"})}
    original = xr.Dataset(data_vars, coords)

    # Encode -> decode should be lossless and must not leak "compress" into encoding.
    compressed = cfxr.encode_multi_index_as_compress(original, idxnames)
    restored = cfxr.decode_compress_to_multi_index(compressed, idxnames)
    assert "compress" not in restored["landpoint"].encoding
    xr.testing.assert_identical(restored, original)

    # A user-supplied "compress" attribute is rejected by the encoder.
    original["landpoint"].attrs["compress"] = "lat lon"
    with pytest.raises(ValueError):
        cfxr.encode_multi_index_as_compress(original, idxnames)
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Top-level API
shapely_to_cf
cf_to_shapely
set_options
encode_multi_index_as_compress
decode_compress_to_multi_index

.. currentmodule:: xarray

Expand Down
70 changes: 70 additions & 0 deletions doc/coding.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
jupytext:
text_representation:
format_name: myst
kernelspec:
display_name: Python 3
name: python3
---
```{eval-rst}
.. currentmodule:: cf_xarray
```
```{code-cell}
---
tags: [remove-cell]
---
import cf_xarray as cfxr
import numpy as np
import pandas as pd
import xarray as xr
xr.set_options(display_expand_data=False)
```


# Encoding and decoding

`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray.

## Compression by gathering

The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering)
convention could be used for either {py:class}`pandas.MultiIndex` objects or `pydata/sparse` arrays.

### MultiIndex

``cf_xarray`` provides {py:func}`encode_multi_index_as_compress` and {py:func}`decode_compress_to_multi_index` to encode MultiIndex-ed
dimensions using "compression by gathering".

Here's a test dataset
```{code-cell}
ds = xr.Dataset(
{"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})},
{
"landpoint": pd.MultiIndex.from_product(
[["a", "b"], [1, 2]], names=("lat", "lon")
)
},
)
ds
```
First encode (note the `"compress"` attribute on the `landpoint` variable)
```{code-cell}
encoded = cfxr.encode_multi_index_as_compress(ds, "landpoint")
encoded
```

At this point, we can write `encoded` to a CF-compliant dataset using {py:meth}`xarray.Dataset.to_netcdf` for example.
After reading that file, decode using
```{code-cell}
decoded = cfxr.decode_compress_to_multi_index(encoded, "landpoint")
decoded
```

We roundtrip perfectly
```{code-cell}
ds.identical(decoded)
```

### Sparse arrays

This is unsupported currently but a pull request is welcome!
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"xarray": ("https://xarray.pydata.org/en/stable/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
}

autosummary_generate = True
Expand Down
1 change: 1 addition & 0 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ or using ``conda``
units
parametricz
bounds
coding
dsg
geometry
plotting
Expand Down
3 changes: 2 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@ What's New

v 0.7.1 (unreleased)
====================
- added encoder and decoder for writing pandas MultiIndex-es to file using "compression by gathering".
See :doc:`coding` for more. By `Deepak Cherian`_.
- added another type of vertical coordinate to decode: ``ocean_sigma_coordinate``. By `Kristen Thyng`_.


v0.7.0 (January 24, 2022)
=========================
- Many improvements to autoguessing for plotting. By `Deepak Cherian`_
Expand Down

0 comments on commit f458020

Please sign in to comment.