
Feature: N-dimensional auto_combine #2553

Merged: 40 commits, merged Dec 13, 2018

Changes shown are from 10 of the 40 commits.

Commits
88ee12a
concatenates along a single dimension
TomNicholas Nov 5, 2018
1aaa075
Wrote function to find correct tile_IDs from nested list of datasets
TomNicholas Nov 6, 2018
dbb371d
Wrote function to check that combined_tile_ids structure is valid
TomNicholas Nov 7, 2018
cc4d743
Added test of 2d-concatenation
TomNicholas Nov 7, 2018
d2fc7e7
Tests now check that dataset ordering is correct
TomNicholas Nov 8, 2018
e3f3699
Test concatenation along a new dimension
TomNicholas Nov 8, 2018
55bf685
Started generalising auto_combine to N-D by integrating the N-D conca…
TomNicholas Nov 9, 2018
845206c
All unit tests now passing
TomNicholas Nov 9, 2018
fb66626
Merge branch 'real_master' into feature/nd_combine
TomNicholas Nov 10, 2018
f4e9aad
Fixed a failing test which I didn't notice because I don't have pseud…
TomNicholas Nov 10, 2018
00004a1
Began updating open_mfdataset to handle N-D input
TomNicholas Nov 14, 2018
b41e374
Refactored to remove duplicate logic in open_mfdataset & auto_combine
TomNicholas Nov 14, 2018
8672a79
Implemented Shoyer's suggestion in #2553 to rewrite the recursive nest…
TomNicholas Nov 14, 2018
4f56b24
--amend
TomNicholas Nov 14, 2018
4cfaf2e
Now raises ValueError if input not ordered correctly before concatena…
TomNicholas Nov 14, 2018
9fd1413
Added some more prototype tests defining desired behaviour more clearly
TomNicholas Nov 22, 2018
8ad0121
Now raises informative errors on invalid forms of input
TomNicholas Nov 24, 2018
4b2c544
Refactoring to also merge along each dimension
TomNicholas Nov 25, 2018
3d0061e
Refactored to literally just apply the old auto_combine along each di…
TomNicholas Nov 25, 2018
60c93ba
Added unit tests for open_mfdataset
TomNicholas Nov 26, 2018
1824538
Removed TODOs
TomNicholas Nov 26, 2018
d380815
Removed format strings
TomNicholas Nov 30, 2018
c4bb8d0
test_get_new_tile_ids now doesn't assume dicts are ordered
TomNicholas Nov 30, 2018
6b7f889
Fixed failing tests on python3.5 caused by accidentally assuming dict…
TomNicholas Nov 30, 2018
58a3648
Test for getting new tile id
TomNicholas Nov 30, 2018
a12a34a
Fixed itertoolz import so that it's compatible with older versions
TomNicholas Nov 30, 2018
ada1f4a
Increased test coverage
TomNicholas Dec 1, 2018
ef0a30e
Added toolz as an explicit dependency to pass tests on python2.7
TomNicholas Dec 1, 2018
3be70bc
Updated 'what's new'
TomNicholas Dec 1, 2018
f266bc3
No longer attempts to shortcut all concatenation at once if concat_di…
TomNicholas Dec 1, 2018
cf49c2b
Merge branch 'master' into feature/nd_combine
TomNicholas Dec 1, 2018
878e1f9
Rewrote using itertools.groupby instead of toolz.itertoolz.groupby to…
TomNicholas Dec 1, 2018
7dea14f
Merged changes from master
TomNicholas Dec 1, 2018
e6f25a3
Fixed erroneous removal of utils import
TomNicholas Dec 1, 2018
f856485
Updated docstrings to include an example of multidimensional concaten…
TomNicholas Dec 2, 2018
6305d83
Clarified auto_combine docstring for N-D behaviour
TomNicholas Dec 5, 2018
ce59da1
Added unit test for nested list of Datasets with different variables
TomNicholas Dec 10, 2018
9fb34cf
Minor spelling and pep8 fixes
TomNicholas Dec 10, 2018
f47486f
Reverted API so that N-D generalisation is hidden
TomNicholas Dec 11, 2018
ebbe47f
Removed infer_order_from_coords argument
TomNicholas Dec 12, 2018
6 changes: 3 additions & 3 deletions xarray/backends/api.py
@@ -480,7 +480,7 @@ def close(self):
_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT,
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
autoclose=None, parallel=False, **kwargs):
@@ -620,11 +620,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,

# close datasets in case of a ValueError
try:
if concat_dim is _CONCAT_DIM_DEFAULT:
if concat_dims is _CONCAT_DIM_DEFAULT:
combined = auto_combine(datasets, compat=compat,
data_vars=data_vars, coords=coords)
else:
combined = auto_combine(datasets, concat_dim=concat_dim,
combined = auto_combine(datasets, concat_dims=concat_dims,
compat=compat,
data_vars=data_vars, coords=coords)
except ValueError:
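For illustration, a minimal sketch of the renamed keyword as it stands at this revision. The file and dimension names below are hypothetical, nested path lists are assumed to be accepted as the N-D commits intend, and later commits in this PR revert the public keyword back to concat_dim:

import xarray as xr

# Hypothetical 2x2 grid of files: two ensemble members, each split into
# two time chunks.  The nesting mirrors the layout of the desired result.
paths = [['ens0_t0.nc', 'ens0_t1.nc'],
         ['ens1_t0.nc', 'ens1_t1.nc']]

# At this revision the keyword is plural, taking one dimension name per
# level of nesting (outermost level first).
ds = xr.open_mfdataset(paths, concat_dims=['ensemble', 'time'])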
227 changes: 198 additions & 29 deletions xarray/core/combine.py
@@ -1,6 +1,7 @@
from __future__ import absolute_import, division, print_function

import warnings
import toolz.itertoolz as itertoolz

import pandas as pd

@@ -369,34 +370,178 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


def _infer_concat_order_from_nested_list(datasets, concat_dims):

combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {})

# Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call
# TODO might be faster in this case to just work out the concat_dims once here
tile_id, ds = list(combined_ids.items())[0]
n_dims = len(tile_id)
if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT:
concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims
else:
if len(concat_dims) != n_dims:
raise ValueError("concat_dims has length " + str(len(concat_dims))
+ " but the datasets passed are nested in a " +
str(n_dims) + "-dimensional structure")

return combined_ids, concat_dims


def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids):
"""
Given a list of lists (of lists...) of datasets, returns a dictionary
with the index of each dataset in the nested list structure as the key.

Recursively traverses the given structure, while keeping track of the
current position.

Parameters
----------
entry : list[list[xarray.Dataset, xarray.Dataset, ...]]
List of lists of arbitrary depth, containing datasets in the order they
are to be concatenated.

Returns
-------
combined_tile_ids : dict[tuple(int, ...), xarray.Dataset]
"""

from .dataset import Dataset

if isinstance(entry, list):
# Dive down tree and recursively open the next list
current_pos.append(0)
for i, item in enumerate(entry):
current_pos[-1] = i
combined_tile_ids = _infer_tile_ids_from_nested_list\
(item, current_pos, combined_tile_ids)

# Move back up tree
del current_pos[-1]
return combined_tile_ids

elif isinstance(entry, Dataset):
# Termination condition
combined_tile_ids[tuple(current_pos)] = entry
return combined_tile_ids

else:
raise TypeError("Element at position " + str(tuple(current_pos)) +
" is of type " + str(type(entry)) + ", which is "
"neither a list nor an xarray.Dataset")

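As a rough illustration of the mapping this recursion produces, here is a sketch with tiny in-memory datasets (not taken from the PR's test suite, and it assumes the private helper is importable from this feature branch):

import xarray as xr
from xarray.core.combine import _infer_tile_ids_from_nested_list

ds = [xr.Dataset({'a': ('x', [n])}) for n in range(4)]
nested = [[ds[0], ds[1]],
          [ds[2], ds[3]]]

tile_ids = _infer_tile_ids_from_nested_list(nested, [], {})
# Each key records a dataset's position in the nested structure,
# outermost level first:
# {(0, 0): ds[0], (0, 1): ds[1], (1, 0): ds[2], (1, 1): ds[3]}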

def _check_shape_tile_ids(combined_tile_ids):
# TODO create custom exception class instead of using asserts?
# Is this function even necessary?

tile_ids = combined_tile_ids.keys()

# Check all tuples are the same length
lengths = [len(id) for id in tile_ids]
assert set(lengths) == {lengths[0]}

# Check only datasets are contained
from .dataset import Dataset
for v in combined_tile_ids.values():
assert isinstance(v, Dataset)


def _data_vars(combined_id):
id, ds = combined_id
return tuple(sorted(ds.data_vars))


def _combine_nd(combined_IDs, concat_dims, data_vars='all',
coords='different', compat='no_conflicts'):
"""
Concatenates and merges an N-dimensional structure of datasets.

No checks are performed on the consistency of the datasets, concat_dims or
tile_IDs, because it is assumed that this has already been done.

Parameters
----------
combined_IDs : Dict[Tuple[int, ...], xarray.Dataset]
Structure containing all datasets to be concatenated with "tile_IDs" as
keys, which specify position within the desired final combined result.
concat_dims : sequence of str
The dimensions along which the datasets should be concatenated. Must be
in order, and the length must match

Returns
-------

"""

# Organise by data variables
grouped_by_data_vars = itertoolz.groupby(_data_vars,
combined_IDs.items()).values()

concatenated_datasets = []
for tiled_datasets_group in grouped_by_data_vars:

# Convert list of tuples back into a dictionary
concatenated_ids = dict(tiled_datasets_group)

# Perform N-D dimensional concatenation
# Each iteration of this loop reduces the length of the tile_IDs tuples
# by one. It always removes the first
for concat_dim in concat_dims:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim

concatenated_ids = _concat_along_first_dim(concatenated_ids,
dim=dim,
data_vars=data_vars,
coords=coords)
concatenated_datasets = concatenated_datasets \
+ list(concatenated_ids.values())
return merge(concatenated_datasets, compat=compat)
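For a 2x2 grid this procedure amounts to the following explicit sequence with the public API (a sketch with made-up dimension names 'y' and 'x'; since every tile here carries the same variable, the final merge step is trivial):

import xarray as xr

# Four tiles of a 2x2 grid, each holding one value of variable 'a'.
tiles = {(i, j): xr.Dataset({'a': (('y', 'x'), [[10 * i + j]])})
         for i in range(2) for j in range(2)}

# First pass: concatenate along 'y', collapsing the first tile-ID index.
cols = [xr.concat([tiles[(0, j)], tiles[(1, j)]], dim='y') for j in range(2)]

# Second pass: concatenate the intermediate results along 'x'.
combined = xr.concat(cols, dim='x')
# combined['a'] is now the full 2x2 array [[0, 1], [10, 11]].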


def _new_tile_id(single_id_ds_pair):
# TODO maybe replace with something like lambda x: x[0][1:]?
tile_id, ds = single_id_ds_pair
return tile_id[1:]


def _concat_along_first_dim(combined_IDs, dim, data_vars='all',
coords='different'):
grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items())
new_combined_IDs = {}

# TODO Would there be any point in parallelizing this concatenation step?
for new_ID, group in grouped.items():
to_concat = [ds for old_ID, ds in group]
new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim,
data_vars=data_vars,
coords=coords)
return new_combined_IDs
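A toy, xarray-free sketch of the grouping that drives this reduction (strings stand in for datasets, and joining the labels stands in for _auto_concat; a later commit in this PR swaps toolz for itertools.groupby, which is what is used here):

from itertools import groupby

tiles = {(0, 0): 'ds00', (0, 1): 'ds01',
         (1, 0): 'ds10', (1, 1): 'ds11'}

def new_tile_id(item):
    tile_id, _ = item
    return tile_id[1:]          # drop the first index

# Entries that differ only in their first index fall into the same group,
# and each group is "concatenated" into one entry keyed by the shorter ID.
reduced = {}
for new_id, group in groupby(sorted(tiles.items(), key=new_tile_id),
                             key=new_tile_id):
    reduced[new_id] = '+'.join(ds for _, ds in group)

# reduced == {(0,): 'ds00+ds10', (1,): 'ds01+ds11'}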


def auto_combine(datasets,
concat_dim=_CONCAT_DIM_DEFAULT,
concat_dims=_CONCAT_DIM_DEFAULT,
compat='no_conflicts',
data_vars='all', coords='different'):
data_vars='all', coords='different',
infer_order_from_coords=False):
"""Attempt to auto-magically combine the given datasets into one.

This method attempts to combine a list of datasets into a single entity by
inspecting metadata and using a combination of concat and merge.

It does not concatenate along more than one dimension or sort data under
any circumstances. It does align coordinates, but different variables on
datasets can cause it to fail under some scenarios. In complex cases, you
may need to clean up your data and use ``concat``/``merge`` explicitly.

``auto_combine`` works well if you have N years of data and M data
variables, and each combination of a distinct time period and set of data
variables is saved its own dataset.
This method attempts to combine a list (or nested list of lists) of
datasets into a single entity by inspecting metadata and using a
combination of concat and merge.

Parameters
----------
datasets : sequence of xarray.Dataset
Dataset objects to merge.
concat_dim : str or DataArray or Index, optional
Dimension along which to concatenate variables, as used by
concat_dims : list of str or DataArray or Index, optional
Dimensions along which to concatenate variables, as used by
:py:func:`xarray.concat`. You only need to provide this argument if
the dimension along which you want to concatenate is not a dimension
in the original datasets, e.g., if you want to stack a collection of
2D arrays along a third dimension.
any of the dimensions along which you want to concatenate are not a
dimension in the original datasets, e.g., if you want to stack a
collection of 2D arrays along a third dimension.
By default, xarray attempts to infer this argument by examining
component files. Set ``concat_dim=None`` explicitly to disable
concatenation.
Expand All @@ -415,8 +560,14 @@ def auto_combine(datasets,
of all non-null values.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
Details are in the documentation of concat
coords : {'minimal', 'different', 'all' o list of str}, optional
coords : {'minimal', 'different', 'all' or list of str}, optional
Details are in the documentation of concat
infer_order_from_coords : bool, optional
If true attempt to deduce the order in which the datasets should be
concatenated from their coordinates. To do this the coordinates should
be monotonic along the dimension to be concatenated.
If false instead read the order from the structure the datasets are
supplied in. This structure should be a nested list of lists.

Returns
-------
Expand All @@ -427,15 +578,33 @@ def auto_combine(datasets,
concat
Dataset.merge
"""
from toolz import itertoolz
if concat_dim is not None:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
datasets).values()
concatenated = [_auto_concat(ds, dim=dim,
data_vars=data_vars, coords=coords)
for ds in grouped]

# TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here?

if concat_dims is not None:

# TODO this could be where we would optionally check alignment, as in #2039?

# Organise datasets in concatenation order in N-D
if infer_order_from_coords:
# TODO Use coordinates to determine tile_ID for each dataset in N-D
# i.e. (shoyer's (1) from discussion in #2159)
raise NotImplementedError
# Once this is implemented I think it should be the default
else:
# Determine tile_IDs by structure of input in N-D
# (i.e. ordering in list-of-lists)
combined_ids, concat_dims = _infer_concat_order_from_nested_list\
(datasets, concat_dims)

# Check that the combined_ids are sensible
_check_shape_tile_ids(combined_ids)

# Repeatedly concatenate then merge along each dimension
combined = _combine_nd(combined_ids, concat_dims, compat=compat,
data_vars=data_vars, coords=coords)
else:
# Case of no concatenation wanted
concatenated = datasets
merged = merge(concatenated, compat=compat)
return merged
combined = merge(concatenated, compat=compat)
return combined
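A minimal sketch of the N-D call as the public API stands at this revision (the concat_dims keyword and nested-list input shown here are hidden again by the later "Reverted API" commit, so this reflects an intermediate state rather than the merged behaviour):

import numpy as np
import xarray as xr

def block(t, x):
    return xr.Dataset({'v': (('t', 'x'), np.random.rand(len(t), len(x)))},
                      coords={'t': t, 'x': x})

t1, t2 = np.arange(0, 5), np.arange(5, 10)
x1, x2 = np.arange(0, 3), np.arange(3, 6)

# The outer level of the nesting is 't', the inner level is 'x'.
datasets = [[block(t1, x1), block(t1, x2)],
            [block(t2, x1), block(t2, x2)]]

combined = xr.auto_combine(datasets, concat_dims=['t', 'x'])
# combined['v'] now covers the full 10 x 6 grid.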
8 changes: 8 additions & 0 deletions xarray/testing.py
@@ -138,3 +138,11 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True):
else:
raise TypeError('{} not supported by assertion comparison'
.format(type(a)))


def assert_combined_tile_ids_equal(dict1, dict2):
assert len(dict1) == len(dict2)
for k, v in dict1.items():
assert k in dict2.keys()
assert_equal(dict1[k], dict2[k])
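For example, a test can compare an expected mapping of tile IDs to datasets against the one actually produced (a sketch; the single shared dataset here is just a placeholder):

import xarray as xr
from xarray.testing import assert_combined_tile_ids_equal

ds = xr.Dataset({'a': ('x', [1])})
expected = {(0,): ds, (1,): ds}
actual = {(1,): ds, (0,): ds}   # same contents, different insertion order

assert_combined_tile_ids_equal(expected, actual)   # passes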

2 changes: 1 addition & 1 deletion xarray/tests/__init__.py
@@ -15,7 +15,7 @@
from xarray.core import utils
from xarray.core.indexing import ExplicitlyIndexed
from xarray.testing import (assert_equal, assert_identical, # noqa: F401
assert_allclose)
assert_allclose, assert_combined_tile_ids_equal)
from xarray.plot.utils import import_seaborn

try:
6 changes: 3 additions & 3 deletions xarray/tests/test_backends.py
@@ -2234,7 +2234,7 @@ def test_open_mfdataset_concat_dim_none(self):
data = Dataset({'x': 0})
data.to_netcdf(tmp1)
Dataset({'x': np.nan}).to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual:
with open_mfdataset([tmp1, tmp2], concat_dims=None) as actual:
assert_identical(data, actual)

def test_open_dataset(self):
@@ -2261,7 +2261,7 @@ def test_open_single_dataset(self):
{'baz': [100]})
with create_tmp_file() as tmp:
original.to_netcdf(tmp)
with open_mfdataset([tmp], concat_dim=dim) as actual:
with open_mfdataset([tmp], concat_dims=[dim]) as actual:
assert_identical(expected, actual)

def test_dask_roundtrip(self):
@@ -2625,7 +2625,7 @@ def test_uamiv_format_mfread(self):
['example.uamiv',
'example.uamiv'],
engine='pseudonetcdf',
concat_dim='TSTEP',
concat_dims=['TSTEP'],
backend_kwargs={'format': 'uamiv'})

data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5)