Add encoder, decoder for MultiIndexes. (#321)

xarray-contrib · Apr 2, 2022 · f458020 · f458020
1 parent 244cc26
commit f458020
Show file tree

Hide file tree

Showing 8 changed files with 234 additions and 1 deletion.
diff --git a/cf_xarray/__init__.py b/cf_xarray/__init__.py
@@ -1,4 +1,8 @@
 from .accessor import CFAccessor  # noqa
+from .coding import (  # noqa
+    decode_compress_to_multi_index,
+    encode_multi_index_as_compress,
+)
 from .geometry import cf_to_shapely, shapely_to_cf  # noqa
 from .helpers import bounds_to_vertices, vertices_to_bounds  # noqa
 from .options import set_options  # noqa

diff --git a/cf_xarray/coding.py b/cf_xarray/coding.py
@@ -0,0 +1,120 @@
+"""
+Encoders and decoders for CF conventions not implemented by Xarray.
+"""
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+
+def encode_multi_index_as_compress(ds, idxnames=None):
+    """
+    Encode a MultiIndexed dimension using the "compression by gathering" CF convention.
+
+    Parameters
+    ----------
+    ds : xarray.Dataset
+        Dataset with at least one MultiIndexed dimension
+    idxnames : hashable or iterable of hashable, optional
+        Dimensions that are MultiIndex-ed. If None, will detect all MultiIndex-ed dimensions.
+
+    Returns
+    -------
+    xarray.Dataset
+        Encoded Dataset with ``name`` as a integer coordinate with a ``"compress"`` attribute.
+
+    References
+    ----------
+    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
+    """
+    if idxnames is None:
+        idxnames = tuple(
+            name
+            for name, idx in ds.indexes.items()
+            if isinstance(idx, pd.MultiIndex)
+            # After the flexible indexes refactor, all MultiIndex Levels
+            # have a MultiIndex but the name won't match.
+            # Prior to that refactor, there is only a single MultiIndex with name=None
+            and (idx.name == name if idx.name is not None else True)
+        )
+    elif isinstance(idxnames, str):
+        idxnames = (idxnames,)
+
+    if not idxnames:
+        raise ValueError("No MultiIndex-ed dimensions found in Dataset.")
+
+    encoded = ds.reset_index(idxnames)
+    for idxname in idxnames:
+        mindex = ds.indexes[idxname]
+        coords = dict(zip(mindex.names, mindex.levels))
+        encoded.update(coords)
+        encoded[idxname] = np.ravel_multi_index(mindex.codes, mindex.levshape)
+        encoded[idxname].attrs = ds[idxname].attrs
+        if (
+            "compress" in encoded[idxname].encoding
+            or "compress" in encoded[idxname].attrs
+        ):
+            raise ValueError(
+                f"Does not support the 'compress' attribute in {idxname}.encoding or {idxname}.attrs. "
+                "This is generated automatically."
+            )
+        encoded[idxname].attrs["compress"] = " ".join(mindex.names)
+    return encoded
+
+
+def decode_compress_to_multi_index(encoded, idxnames=None):
+    """
+    Decode a compressed variable to a pandas MultiIndex.
+
+    Parameters
+    ----------
+    encoded : xarray.Dataset
+        Encoded Dataset with variables that use "compression by gathering".capitalize
+    idxnames : hashable or iterable of hashable, optional
+        Variable names that represents a compressed dimension. These variables must have
+        the attribute ``"compress"``. If None, will detect all indexes with a ``"compress"``
+        attribute and decode those.
+
+    Returns
+    -------
+    xarray.Dataset
+        Decoded Dataset with ``name`` as a MultiIndexed dimension.
+
+    References
+    ----------
+    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
+    """
+    decoded = xr.Dataset()
+    if idxnames is None:
+        idxnames = tuple(
+            name for name in encoded.indexes if "compress" in encoded[name].attrs
+        )
+    elif isinstance(idxnames, str):
+        idxnames = (idxnames,)
+
+    for idxname in idxnames:
+        if "compress" not in encoded[idxname].attrs:
+            raise ValueError("Attribute 'compress' not found in provided Dataset.")
+
+        if not isinstance(encoded, xr.Dataset):
+            raise ValueError(
+                f"Must provide a Dataset. Received {type(encoded)} instead."
+            )
+
+        names = encoded[idxname].attrs["compress"].split(" ")
+        shape = [encoded.sizes[dim] for dim in names]
+        indices = np.unravel_index(encoded.landpoint.data, shape)
+        arrays = [encoded[dim].data[index] for dim, index in zip(names, indices)]
+        mindex = pd.MultiIndex.from_arrays(arrays, names=names)
+
+        decoded.coords[idxname] = mindex
+        decoded.coords[idxname].attrs = encoded[idxname].attrs.copy()
+        del decoded[idxname].attrs["compress"]
+
+        for varname in encoded.data_vars:
+            if idxname in encoded[varname].dims:
+                decoded[varname] = (
+                    idxname,
+                    encoded[varname].data,
+                    encoded[varname].attrs,
+                )
+    return decoded
diff --git a/cf_xarray/tests/test_coding.py b/cf_xarray/tests/test_coding.py
@@ -0,0 +1,34 @@
+import numpy as np
+import pandas as pd
+import pytest
+import xarray as xr
+
+import cf_xarray as cfxr
+
+
+@pytest.mark.parametrize(
+    "mindex",
+    [
+        pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("lat", "lon")),
+        pd.MultiIndex.from_arrays(
+            [["a", "b", "c", "d"], [1, 2, 4, 10]], names=("lat", "lon")
+        ),
+        pd.MultiIndex.from_arrays(
+            [["a", "b", "b", "a"], [1, 2, 1, 2]], names=("lat", "lon")
+        ),
+    ],
+)
+@pytest.mark.parametrize("idxnames", ["landpoint", ("landpoint",), None])
+def test_compression_by_gathering_multi_index_roundtrip(mindex, idxnames):
+    dataset = xr.Dataset(
+        {"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})},
+        {"landpoint": ("landpoint", mindex, {"long_name": "land point number"})},
+    )
+    encoded = cfxr.encode_multi_index_as_compress(dataset, idxnames)
+    roundtrip = cfxr.decode_compress_to_multi_index(encoded, idxnames)
+    assert "compress" not in roundtrip["landpoint"].encoding
+    xr.testing.assert_identical(roundtrip, dataset)
+
+    dataset["landpoint"].attrs["compress"] = "lat lon"
+    with pytest.raises(ValueError):
+        cfxr.encode_multi_index_as_compress(dataset, idxnames)
diff --git a/doc/api.rst b/doc/api.rst
@@ -14,6 +14,8 @@ Top-level API
     shapely_to_cf
     cf_to_shapely
     set_options
+    encode_multi_index_as_compress
+    decode_compress_to_multi_index
 
 .. currentmodule:: xarray
 

diff --git a/doc/coding.md b/doc/coding.md
@@ -0,0 +1,70 @@
+---
+jupytext:
+  text_representation:
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  name: python3
+---
+```{eval-rst}
+.. currentmodule:: cf_xarray
+```
+```{code-cell}
+---
+tags: [remove-cell]
+---
+import cf_xarray as cfxr
+import numpy as np
+import pandas as pd
+import xarray as xr
+xr.set_options(display_expand_data=False)
+```
+
+
+# Encoding and decoding
+
+`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray. 
+
+## Compression by gathering
+
+The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering) 
+convention could be used for either {py:class}`pandas.MultiIndex` objects or `pydata/sparse` arrays.
+
+### MultiIndex
+
+``cf_xarray`` provides {py:func}`encode_multi_index_as_compress` and {py:func}`decode_compress_to_multi_index` to encode MultiIndex-ed
+dimensions using "compression by gethering". 
+
+Here's a test dataset
+```{code-cell}
+ds = xr.Dataset(
+    {"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})},
+    {
+        "landpoint": pd.MultiIndex.from_product(
+            [["a", "b"], [1, 2]], names=("lat", "lon")
+        )
+    },
+)
+ds
+```
+First encode (note the `"compress"` attribute on the `landpoint` variable)
+```{code-cell}
+encoded = cfxr.encode_multi_index_as_compress(ds, "landpoint")
+encoded
+```
+
+At this point, we can write `encoded` to a CF-compliant dataset using {py:func}`xarray.Dataset.to_netcdf` for example.
+After reading that file, decode using
+```{code-cell}
+decoded = cfxr.decode_compress_to_multi_index(encoded, "landpoint")
+decoded
+```
+
+We roundtrip perfectly
+```{code-cell}
+ds.identical(decoded)
+```
+
+### Sparse arrays
+
+This is unsupported currently but a pull request is welcome!
diff --git a/doc/conf.py b/doc/conf.py
@@ -251,6 +251,7 @@
 intersphinx_mapping = {
     "python": ("https://docs.python.org/3/", None),
     "xarray": ("https://xarray.pydata.org/en/stable/", None),
+    "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
 }
 
 autosummary_generate = True

diff --git a/doc/index.rst b/doc/index.rst
@@ -50,6 +50,7 @@ or using ``conda``
    units
    parametricz
    bounds
+   coding
    dsg
    geometry
    plotting

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -5,9 +5,10 @@ What's New
 
 v 0.7.1 (unreleased)
 ====================
+- added encoder and decoder for writing pandas MultiIndex-es to file using "compression by gathering".
+  See :doc:`coding` for more. By `Deepak Cherian`_.
 - added another type of vertical coordinate to decode: ``ocean_sigma_coordinate``. By `Kristen Thyng`_.
 
-
 v0.7.0 (January 24, 2022)
 =========================
 - Many improvements to autoguessing for plotting. By `Deepak Cherian`_