-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add encoder, decoder for MultiIndexes. (#321)
- Loading branch information
Showing
8 changed files
with
234 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
""" | ||
Encoders and decoders for CF conventions not implemented by Xarray. | ||
""" | ||
import numpy as np | ||
import pandas as pd | ||
import xarray as xr | ||
|
||
|
||
def encode_multi_index_as_compress(ds, idxnames=None):
    """
    Encode a MultiIndexed dimension using the "compression by gathering" CF convention.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with at least one MultiIndexed dimension.
    idxnames : hashable or iterable of hashable, optional
        Dimensions that are MultiIndex-ed. If None, will detect all
        MultiIndex-ed dimensions.

    Returns
    -------
    xarray.Dataset
        Encoded Dataset with ``name`` as an integer coordinate with a
        ``"compress"`` attribute.

    References
    ----------
    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
    """
    if isinstance(idxnames, str):
        idxnames = (idxnames,)
    elif idxnames is None:
        detected = []
        for name, index in ds.indexes.items():
            if not isinstance(index, pd.MultiIndex):
                continue
            # After the flexible-indexes refactor, every MultiIndex level has
            # a MultiIndex but only the dimension coordinate's name matches;
            # prior to that refactor there is a single MultiIndex with
            # name=None.
            if index.name is None or index.name == name:
                detected.append(name)
        idxnames = tuple(detected)

    if not idxnames:
        raise ValueError("No MultiIndex-ed dimensions found in Dataset.")

    encoded = ds.reset_index(idxnames)
    for idxname in idxnames:
        mindex = ds.indexes[idxname]
        # Expose each MultiIndex level as its own 1D coordinate variable.
        encoded.update(dict(zip(mindex.names, mindex.levels)))
        # The compressed coordinate is the flat index into the level grid.
        encoded[idxname] = np.ravel_multi_index(mindex.codes, mindex.levshape)
        encoded[idxname].attrs = ds[idxname].attrs
        already_compressed = (
            "compress" in encoded[idxname].encoding
            or "compress" in encoded[idxname].attrs
        )
        if already_compressed:
            raise ValueError(
                f"Does not support the 'compress' attribute in {idxname}.encoding or {idxname}.attrs. "
                "This is generated automatically."
            )
        encoded[idxname].attrs["compress"] = " ".join(mindex.names)
    return encoded
|
||
|
||
def decode_compress_to_multi_index(encoded, idxnames=None):
    """
    Decode a compressed variable to a pandas MultiIndex.

    Parameters
    ----------
    encoded : xarray.Dataset
        Encoded Dataset with variables that use "compression by gathering".
    idxnames : hashable or iterable of hashable, optional
        Variable names that represent a compressed dimension. These variables
        must have the attribute ``"compress"``. If None, will detect all
        indexes with a ``"compress"`` attribute and decode those.

    Returns
    -------
    xarray.Dataset
        Decoded Dataset with ``name`` as a MultiIndexed dimension.

    References
    ----------
    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
    """
    # Validate the input type up front; the original checked inside the loop,
    # after `encoded[idxname]` had already been accessed (and never at all
    # when idxnames turned out empty).
    if not isinstance(encoded, xr.Dataset):
        raise ValueError(
            f"Must provide a Dataset. Received {type(encoded)} instead."
        )

    decoded = xr.Dataset()
    if idxnames is None:
        idxnames = tuple(
            name for name in encoded.indexes if "compress" in encoded[name].attrs
        )
    elif isinstance(idxnames, str):
        idxnames = (idxnames,)

    for idxname in idxnames:
        if "compress" not in encoded[idxname].attrs:
            raise ValueError("Attribute 'compress' not found in provided Dataset.")

        names = encoded[idxname].attrs["compress"].split(" ")
        shape = [encoded.sizes[dim] for dim in names]
        # BUG FIX: use the variable named by `idxname` rather than the
        # hard-coded `encoded.landpoint`, which only worked when the
        # compressed dimension happened to be called "landpoint".
        indices = np.unravel_index(encoded[idxname].data, shape)
        arrays = [encoded[dim].data[index] for dim, index in zip(names, indices)]
        mindex = pd.MultiIndex.from_arrays(arrays, names=names)

        decoded.coords[idxname] = mindex
        decoded.coords[idxname].attrs = encoded[idxname].attrs.copy()
        # "compress" is an encoding detail; don't carry it into decoded attrs.
        del decoded[idxname].attrs["compress"]

        for varname in encoded.data_vars:
            if idxname in encoded[varname].dims:
                decoded[varname] = (
                    idxname,
                    encoded[varname].data,
                    encoded[varname].attrs,
                )
    return decoded
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
import xarray as xr | ||
|
||
import cf_xarray as cfxr | ||
|
||
|
||
@pytest.mark.parametrize(
    "mindex",
    [
        pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("lat", "lon")),
        pd.MultiIndex.from_arrays(
            [["a", "b", "c", "d"], [1, 2, 4, 10]], names=("lat", "lon")
        ),
        pd.MultiIndex.from_arrays(
            [["a", "b", "b", "a"], [1, 2, 1, 2]], names=("lat", "lon")
        ),
    ],
)
@pytest.mark.parametrize("idxnames", ["landpoint", ("landpoint",), None])
def test_compression_by_gathering_multi_index_roundtrip(mindex, idxnames):
    # Encode to "compression by gathering", decode back, and require an
    # identical round trip (values, names, and attributes).
    ds = xr.Dataset(
        data_vars={
            "landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})
        },
        coords={
            "landpoint": ("landpoint", mindex, {"long_name": "land point number"})
        },
    )

    encoded = cfxr.encode_multi_index_as_compress(ds, idxnames)
    decoded = cfxr.decode_compress_to_multi_index(encoded, idxnames)
    assert "compress" not in decoded["landpoint"].encoding
    xr.testing.assert_identical(decoded, ds)

    # A pre-existing "compress" attribute must be rejected by the encoder.
    ds["landpoint"].attrs["compress"] = "lat lon"
    with pytest.raises(ValueError):
        cfxr.encode_multi_index_as_compress(ds, idxnames)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
--- | ||
jupytext: | ||
text_representation: | ||
format_name: myst | ||
kernelspec: | ||
display_name: Python 3 | ||
name: python3 | ||
--- | ||
```{eval-rst} | ||
.. currentmodule:: cf_xarray | ||
``` | ||
```{code-cell} | ||
--- | ||
tags: [remove-cell] | ||
--- | ||
import cf_xarray as cfxr | ||
import numpy as np | ||
import pandas as pd | ||
import xarray as xr | ||
xr.set_options(display_expand_data=False) | ||
``` | ||
|
||
|
||
# Encoding and decoding | ||
|
||
`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray. | ||
|
||
## Compression by gathering | ||
|
||
The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering) | ||
convention could be used for either {py:class}`pandas.MultiIndex` objects or `pydata/sparse` arrays. | ||
|
||
### MultiIndex | ||
|
||
``cf_xarray`` provides {py:func}`encode_multi_index_as_compress` and {py:func}`decode_compress_to_multi_index` to encode MultiIndex-ed | ||
dimensions using "compression by gathering".
|
||
Here's a test dataset | ||
```{code-cell} | ||
ds = xr.Dataset( | ||
{"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})}, | ||
{ | ||
"landpoint": pd.MultiIndex.from_product( | ||
[["a", "b"], [1, 2]], names=("lat", "lon") | ||
) | ||
}, | ||
) | ||
ds | ||
``` | ||
First encode (note the `"compress"` attribute on the `landpoint` variable) | ||
```{code-cell} | ||
encoded = cfxr.encode_multi_index_as_compress(ds, "landpoint") | ||
encoded | ||
``` | ||
|
||
At this point, we can write `encoded` to a CF-compliant dataset using {py:func}`xarray.Dataset.to_netcdf` for example. | ||
After reading that file, decode using | ||
```{code-cell} | ||
decoded = cfxr.decode_compress_to_multi_index(encoded, "landpoint") | ||
decoded | ||
``` | ||
|
||
We roundtrip perfectly | ||
```{code-cell} | ||
ds.identical(decoded) | ||
``` | ||
|
||
### Sparse arrays | ||
|
||
This is unsupported currently but a pull request is welcome! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,6 +50,7 @@ or using ``conda`` | |
units | ||
parametricz | ||
bounds | ||
coding | ||
dsg | ||
geometry | ||
plotting | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters