diff --git a/cf_xarray/__init__.py b/cf_xarray/__init__.py index d6a299eb..42ad707a 100644 --- a/cf_xarray/__init__.py +++ b/cf_xarray/__init__.py @@ -1,4 +1,8 @@ from .accessor import CFAccessor # noqa +from .coding import ( # noqa + decode_compress_to_multi_index, + encode_multi_index_as_compress, +) from .geometry import cf_to_shapely, shapely_to_cf # noqa from .helpers import bounds_to_vertices, vertices_to_bounds # noqa from .options import set_options # noqa diff --git a/cf_xarray/coding.py b/cf_xarray/coding.py new file mode 100644 index 00000000..85372ac0 --- /dev/null +++ b/cf_xarray/coding.py @@ -0,0 +1,120 @@ +""" +Encoders and decoders for CF conventions not implemented by Xarray. +""" +import numpy as np +import pandas as pd +import xarray as xr + + +def encode_multi_index_as_compress(ds, idxnames=None): + """ + Encode a MultiIndexed dimension using the "compression by gathering" CF convention. + + Parameters + ---------- + ds : xarray.Dataset + Dataset with at least one MultiIndexed dimension + idxnames : hashable or iterable of hashable, optional + Dimensions that are MultiIndex-ed. If None, will detect all MultiIndex-ed dimensions. + + Returns + ------- + xarray.Dataset + Encoded Dataset with ``name`` as a integer coordinate with a ``"compress"`` attribute. + + References + ---------- + CF conventions on `compression by gathering `_ + """ + if idxnames is None: + idxnames = tuple( + name + for name, idx in ds.indexes.items() + if isinstance(idx, pd.MultiIndex) + # After the flexible indexes refactor, all MultiIndex Levels + # have a MultiIndex but the name won't match. + # Prior to that refactor, there is only a single MultiIndex with name=None + and (idx.name == name if idx.name is not None else True) + ) + elif isinstance(idxnames, str): + idxnames = (idxnames,) + + if not idxnames: + raise ValueError("No MultiIndex-ed dimensions found in Dataset.") + + encoded = ds.reset_index(idxnames) + for idxname in idxnames: + mindex = ds.indexes[idxname] + coords = dict(zip(mindex.names, mindex.levels)) + encoded.update(coords) + encoded[idxname] = np.ravel_multi_index(mindex.codes, mindex.levshape) + encoded[idxname].attrs = ds[idxname].attrs + if ( + "compress" in encoded[idxname].encoding + or "compress" in encoded[idxname].attrs + ): + raise ValueError( + f"Does not support the 'compress' attribute in {idxname}.encoding or {idxname}.attrs. " + "This is generated automatically." + ) + encoded[idxname].attrs["compress"] = " ".join(mindex.names) + return encoded + + +def decode_compress_to_multi_index(encoded, idxnames=None): + """ + Decode a compressed variable to a pandas MultiIndex. + + Parameters + ---------- + encoded : xarray.Dataset + Encoded Dataset with variables that use "compression by gathering".capitalize + idxnames : hashable or iterable of hashable, optional + Variable names that represents a compressed dimension. These variables must have + the attribute ``"compress"``. If None, will detect all indexes with a ``"compress"`` + attribute and decode those. + + Returns + ------- + xarray.Dataset + Decoded Dataset with ``name`` as a MultiIndexed dimension. + + References + ---------- + CF conventions on `compression by gathering `_ + """ + decoded = xr.Dataset() + if idxnames is None: + idxnames = tuple( + name for name in encoded.indexes if "compress" in encoded[name].attrs + ) + elif isinstance(idxnames, str): + idxnames = (idxnames,) + + for idxname in idxnames: + if "compress" not in encoded[idxname].attrs: + raise ValueError("Attribute 'compress' not found in provided Dataset.") + + if not isinstance(encoded, xr.Dataset): + raise ValueError( + f"Must provide a Dataset. Received {type(encoded)} instead." + ) + + names = encoded[idxname].attrs["compress"].split(" ") + shape = [encoded.sizes[dim] for dim in names] + indices = np.unravel_index(encoded.landpoint.data, shape) + arrays = [encoded[dim].data[index] for dim, index in zip(names, indices)] + mindex = pd.MultiIndex.from_arrays(arrays, names=names) + + decoded.coords[idxname] = mindex + decoded.coords[idxname].attrs = encoded[idxname].attrs.copy() + del decoded[idxname].attrs["compress"] + + for varname in encoded.data_vars: + if idxname in encoded[varname].dims: + decoded[varname] = ( + idxname, + encoded[varname].data, + encoded[varname].attrs, + ) + return decoded diff --git a/cf_xarray/tests/test_coding.py b/cf_xarray/tests/test_coding.py new file mode 100644 index 00000000..cde33671 --- /dev/null +++ b/cf_xarray/tests/test_coding.py @@ -0,0 +1,34 @@ +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +import cf_xarray as cfxr + + +@pytest.mark.parametrize( + "mindex", + [ + pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("lat", "lon")), + pd.MultiIndex.from_arrays( + [["a", "b", "c", "d"], [1, 2, 4, 10]], names=("lat", "lon") + ), + pd.MultiIndex.from_arrays( + [["a", "b", "b", "a"], [1, 2, 1, 2]], names=("lat", "lon") + ), + ], +) +@pytest.mark.parametrize("idxnames", ["landpoint", ("landpoint",), None]) +def test_compression_by_gathering_multi_index_roundtrip(mindex, idxnames): + dataset = xr.Dataset( + {"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})}, + {"landpoint": ("landpoint", mindex, {"long_name": "land point number"})}, + ) + encoded = cfxr.encode_multi_index_as_compress(dataset, idxnames) + roundtrip = cfxr.decode_compress_to_multi_index(encoded, idxnames) + assert "compress" not in roundtrip["landpoint"].encoding + xr.testing.assert_identical(roundtrip, dataset) + + dataset["landpoint"].attrs["compress"] = "lat lon" + with pytest.raises(ValueError): + cfxr.encode_multi_index_as_compress(dataset, idxnames) diff --git a/doc/api.rst b/doc/api.rst index c1351fe2..26e6cfb9 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -14,6 +14,8 @@ Top-level API shapely_to_cf cf_to_shapely set_options + encode_multi_index_as_compress + decode_compress_to_multi_index .. currentmodule:: xarray diff --git a/doc/coding.md b/doc/coding.md new file mode 100644 index 00000000..44a391ba --- /dev/null +++ b/doc/coding.md @@ -0,0 +1,70 @@ +--- +jupytext: + text_representation: + format_name: myst +kernelspec: + display_name: Python 3 + name: python3 +--- +```{eval-rst} +.. currentmodule:: cf_xarray +``` +```{code-cell} +--- +tags: [remove-cell] +--- +import cf_xarray as cfxr +import numpy as np +import pandas as pd +import xarray as xr +xr.set_options(display_expand_data=False) +``` + + +# Encoding and decoding + +`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray. + +## Compression by gathering + +The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering) +convention could be used for either {py:class}`pandas.MultiIndex` objects or `pydata/sparse` arrays. + +### MultiIndex + +``cf_xarray`` provides {py:func}`encode_multi_index_as_compress` and {py:func}`decode_compress_to_multi_index` to encode MultiIndex-ed +dimensions using "compression by gethering". + +Here's a test dataset +```{code-cell} +ds = xr.Dataset( + {"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})}, + { + "landpoint": pd.MultiIndex.from_product( + [["a", "b"], [1, 2]], names=("lat", "lon") + ) + }, +) +ds +``` +First encode (note the `"compress"` attribute on the `landpoint` variable) +```{code-cell} +encoded = cfxr.encode_multi_index_as_compress(ds, "landpoint") +encoded +``` + +At this point, we can write `encoded` to a CF-compliant dataset using {py:func}`xarray.Dataset.to_netcdf` for example. +After reading that file, decode using +```{code-cell} +decoded = cfxr.decode_compress_to_multi_index(encoded, "landpoint") +decoded +``` + +We roundtrip perfectly +```{code-cell} +ds.identical(decoded) +``` + +### Sparse arrays + +This is unsupported currently but a pull request is welcome! diff --git a/doc/conf.py b/doc/conf.py index 4c8ec5d4..5edbf43c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -251,6 +251,7 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), "xarray": ("https://xarray.pydata.org/en/stable/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), } autosummary_generate = True diff --git a/doc/index.rst b/doc/index.rst index 1c26e46f..4f209d6a 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -50,6 +50,7 @@ or using ``conda`` units parametricz bounds + coding dsg geometry plotting diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f911cb56..9e4b0e9f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -5,9 +5,10 @@ What's New v 0.7.1 (unreleased) ==================== +- added encoder and decoder for writing pandas MultiIndex-es to file using "compression by gathering". + See :doc:`coding` for more. By `Deepak Cherian`_. - added another type of vertical coordinate to decode: ``ocean_sigma_coordinate``. By `Kristen Thyng`_. - v0.7.0 (January 24, 2022) ========================= - Many improvements to autoguessing for plotting. By `Deepak Cherian`_