
Feature: N-dimensional auto_combine #2553

Merged: 40 commits, merged Dec 13, 2018

Changes shown are from 10 of the 40 commits.

Commits
88ee12a
concatenates along a single dimension
TomNicholas Nov 5, 2018
1aaa075
Wrote function to find correct tile_IDs from nested list of datasets
TomNicholas Nov 6, 2018
dbb371d
Wrote function to check that combined_tile_ids structure is valid
TomNicholas Nov 7, 2018
cc4d743
Added test of 2d-concatenation
TomNicholas Nov 7, 2018
d2fc7e7
Tests now check that dataset ordering is correct
TomNicholas Nov 8, 2018
e3f3699
Test concatenation along a new dimension
TomNicholas Nov 8, 2018
55bf685
Started generalising auto_combine to N-D by integrating the N-D conca…
TomNicholas Nov 9, 2018
845206c
All unit tests now passing
TomNicholas Nov 9, 2018
fb66626
Merge branch 'real_master' into feature/nd_combine
TomNicholas Nov 10, 2018
f4e9aad
Fixed a failing test which I didn't notice because I don't have pseud…
TomNicholas Nov 10, 2018
00004a1
Began updating open_mfdataset to handle N-D input
TomNicholas Nov 14, 2018
b41e374
Refactored to remove duplicate logic in open_mfdataset & auto_combine
TomNicholas Nov 14, 2018
8672a79
Implemented Shoyer's suggestion in #2553 to rewrite the recursive nest…
TomNicholas Nov 14, 2018
4f56b24
--amend
TomNicholas Nov 14, 2018
4cfaf2e
Now raises ValueError if input not ordered correctly before concatena…
TomNicholas Nov 14, 2018
9fd1413
Added some more prototype tests defining desired behaviour more clearly
TomNicholas Nov 22, 2018
8ad0121
Now raises informative errors on invalid forms of input
TomNicholas Nov 24, 2018
4b2c544
Refactoring to also merge along each dimension
TomNicholas Nov 25, 2018
3d0061e
Refactored to literally just apply the old auto_combine along each di…
TomNicholas Nov 25, 2018
60c93ba
Added unit tests for open_mfdataset
TomNicholas Nov 26, 2018
1824538
Removed TODOs
TomNicholas Nov 26, 2018
d380815
Removed format strings
TomNicholas Nov 30, 2018
c4bb8d0
test_get_new_tile_ids now doesn't assume dicts are ordered
TomNicholas Nov 30, 2018
6b7f889
Fixed failing tests on python3.5 caused by accidentally assuming dict…
TomNicholas Nov 30, 2018
58a3648
Test for getting new tile id
TomNicholas Nov 30, 2018
a12a34a
Fixed itertoolz import so that it's compatible with older versions
TomNicholas Nov 30, 2018
ada1f4a
Increased test coverage
TomNicholas Dec 1, 2018
ef0a30e
Added toolz as an explicit dependency to pass tests on python2.7
TomNicholas Dec 1, 2018
3be70bc
Updated 'what's new'
TomNicholas Dec 1, 2018
f266bc3
No longer attempts to shortcut all concatenation at once if concat_di…
TomNicholas Dec 1, 2018
cf49c2b
Merge branch 'master' into feature/nd_combine
TomNicholas Dec 1, 2018
878e1f9
Rewrote using itertools.groupby instead of toolz.itertoolz.groupby to…
TomNicholas Dec 1, 2018
7dea14f
Merged changes from master
TomNicholas Dec 1, 2018
e6f25a3
Fixed erroneous removal of utils import
TomNicholas Dec 1, 2018
f856485
Updated docstrings to include an example of multidimensional concaten…
TomNicholas Dec 2, 2018
6305d83
Clarified auto_combine docstring for N-D behaviour
TomNicholas Dec 5, 2018
ce59da1
Added unit test for nested list of Datasets with different variables
TomNicholas Dec 10, 2018
9fb34cf
Minor spelling and pep8 fixes
TomNicholas Dec 10, 2018
f47486f
Reverted API so that N-D generalisation is hidden
TomNicholas Dec 11, 2018
ebbe47f
Removed infer_order_from_coords argument
TomNicholas Dec 12, 2018
6 changes: 3 additions & 3 deletions xarray/backends/api.py
@@ -480,7 +480,7 @@ def close(self):
_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
def open_mfdataset(paths, chunks=None, concat_dims=_CONCAT_DIM_DEFAULT,
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
autoclose=None, parallel=False, **kwargs):
@@ -620,11 +620,11 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,

# close datasets in case of a ValueError
try:
if concat_dim is _CONCAT_DIM_DEFAULT:
if concat_dims is _CONCAT_DIM_DEFAULT:
combined = auto_combine(datasets, compat=compat,
data_vars=data_vars, coords=coords)
else:
combined = auto_combine(datasets, concat_dim=concat_dim,
combined = auto_combine(datasets, concat_dims=concat_dims,
compat=compat,
data_vars=data_vars, coords=coords)
except ValueError:
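For illustration, a minimal sketch of the renamed keyword as it stands at this revision. The file and dimension names below are hypothetical, nested path lists are assumed to be accepted as the N-D commits intend, and later commits in this PR revert the public keyword back to concat_dim:

import xarray as xr

# Hypothetical 2x2 grid of files: two ensemble members, each split into
# two time chunks.  The nesting mirrors the layout of the desired result.
paths = [['ens0_t0.nc', 'ens0_t1.nc'],
         ['ens1_t0.nc', 'ens1_t1.nc']]

# At this revision the keyword is plural, taking one dimension name per
# level of nesting (outermost level first).
ds = xr.open_mfdataset(paths, concat_dims=['ensemble', 'time'])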
227 changes: 198 additions & 29 deletions xarray/core/combine.py
@@ -1,6 +1,7 @@
from __future__ import absolute_import, division, print_function

import warnings
import toolz.itertoolz as itertoolz

import pandas as pd

@@ -369,34 +370,178 @@ def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


def _infer_concat_order_from_nested_list(datasets, concat_dims):

combined_ids = _infer_tile_ids_from_nested_list(datasets, [], {})

# Currently if concat_dims is not supplied then _auto_concat attempts to deduce it on every call
# TODO might be faster in this case to just work out the concat_dims once here
tile_id, ds = list(combined_ids.items())[0]
n_dims = len(tile_id)
if concat_dims is None or concat_dims == _CONCAT_DIM_DEFAULT:
concat_dims = [_CONCAT_DIM_DEFAULT]*n_dims
else:
if len(concat_dims) != n_dims:
raise ValueError("concat_dims has length " + str(len(concat_dims))
+ " but the datasets passed are nested in a " +
str(n_dims) + "-dimensional structure")

return combined_ids, concat_dims


def _infer_tile_ids_from_nested_list(entry, current_pos, combined_tile_ids):
"""
Given a list of lists (of lists...) of datasets, returns a dictionary
with the index of each dataset in the nested list structure as the key.

Recursively traverses the given structure, while keeping track of the
current position.

Parameters
----------
entry : list[list[xarray.Dataset, xarray.Dataset, ...]]
List of lists of arbitrary depth, containing datasets in the order they
are to be concatenated.

Returns
-------
combined_tile_ids : dict[tuple(int, ...), xarray.Dataset]
"""

from .dataset import Dataset

if isinstance(entry, list):
# Dive down tree and recursively open the next list
current_pos.append(0)
for i, item in enumerate(entry):
current_pos[-1] = i
combined_tile_ids = _infer_tile_ids_from_nested_list\
(item, current_pos, combined_tile_ids)

# Move back up tree
del current_pos[-1]
return combined_tile_ids

elif isinstance(entry, Dataset):
# Termination condition
combined_tile_ids[tuple(current_pos)] = entry
return combined_tile_ids

else:
raise TypeError("Element at position " + str(tuple(current_pos)) +
" is of type " + str(type(entry)) + ", which is "
"neither a list nor an xarray.Dataset")

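As a rough illustration of the mapping this recursion produces, here is a sketch with tiny in-memory datasets (not taken from the PR's test suite, and it assumes the private helper is importable from this feature branch):

import xarray as xr
from xarray.core.combine import _infer_tile_ids_from_nested_list

ds = [xr.Dataset({'a': ('x', [n])}) for n in range(4)]
nested = [[ds[0], ds[1]],
          [ds[2], ds[3]]]

tile_ids = _infer_tile_ids_from_nested_list(nested, [], {})
# Each key records a dataset's position in the nested structure,
# outermost level first:
# {(0, 0): ds[0], (0, 1): ds[1], (1, 0): ds[2], (1, 1): ds[3]}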

def _check_shape_tile_ids(combined_tile_ids):
# TODO create custom exception class instead of using asserts?
# Is this function even necessary?

tile_ids = combined_tile_ids.keys()

# Check all tuples are the same length
lengths = [len(id) for id in tile_ids]
assert set(lengths) == {lengths[0]}

# Check only datasets are contained
from .dataset import Dataset
for v in combined_tile_ids.values():
assert isinstance(v, Dataset)


def _data_vars(combined_id):
id, ds = combined_id
return tuple(sorted(ds.data_vars))


def _combine_nd(combined_IDs, concat_dims, data_vars='all',
coords='different', compat='no_conflicts'):
"""
Concatenates and merges an N-dimensional structure of datasets.

No checks are performed on the consistency of the datasets, concat_dims or
tile_IDs, because it is assumed that this has already been done.

Parameters
----------
combined_IDs : Dict[Tuple[int, ...], xarray.Dataset]
Structure containing all datasets to be concatenated with "tile_IDs" as
keys, which specify position within the desired final combined result.
concat_dims : sequence of str
The dimensions along which the datasets should be concatenated. Must be
in order, and the length must match

Returns
-------

"""

# Organise by data variables
grouped_by_data_vars = itertoolz.groupby(_data_vars,
combined_IDs.items()).values()

concatenated_datasets = []
for tiled_datasets_group in grouped_by_data_vars:

# Convert list of tuples back into a dictionary
concatenated_ids = dict(tiled_datasets_group)

# Perform N-D dimensional concatenation
# Each iteration of this loop reduces the length of the tile_IDs tuples
# by one. It always removes the first
for concat_dim in concat_dims:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim

concatenated_ids = _concat_along_first_dim(concatenated_ids,
dim=dim,
data_vars=data_vars,
coords=coords)
concatenated_datasets = concatenated_datasets \
+ list(concatenated_ids.values())
return merge(concatenated_datasets, compat=compat)
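For a 2x2 grid this procedure amounts to the following explicit sequence with the public API (a sketch with made-up dimension names 'y' and 'x'; since every tile here carries the same variable, the final merge step is trivial):

import xarray as xr

# Four tiles of a 2x2 grid, each holding one value of variable 'a'.
tiles = {(i, j): xr.Dataset({'a': (('y', 'x'), [[10 * i + j]])})
         for i in range(2) for j in range(2)}

# First pass: concatenate along 'y', collapsing the first tile-ID index.
cols = [xr.concat([tiles[(0, j)], tiles[(1, j)]], dim='y') for j in range(2)]

# Second pass: concatenate the intermediate results along 'x'.
combined = xr.concat(cols, dim='x')
# combined['a'] is now the full 2x2 array [[0, 1], [10, 11]].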


def _new_tile_id(single_id_ds_pair):
# TODO maybe replace with something like lambda x: x[0][1:]?
tile_id, ds = single_id_ds_pair
return tile_id[1:]


def _concat_along_first_dim(combined_IDs, dim, data_vars='all',
coords='different'):
grouped = itertoolz.groupby(_new_tile_id, combined_IDs.items())
new_combined_IDs = {}

# TODO Would there be any point in parallelizing this concatenation step?
for new_ID, group in grouped.items():
to_concat = [ds for old_ID, ds in group]
new_combined_IDs[new_ID] = _auto_concat(to_concat, dim=dim,
data_vars=data_vars,
coords=coords)
return new_combined_IDs
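A toy, xarray-free sketch of the grouping that drives this reduction (strings stand in for datasets, and joining the labels stands in for _auto_concat; a later commit in this PR swaps toolz for itertools.groupby, which is what is used here):

from itertools import groupby

tiles = {(0, 0): 'ds00', (0, 1): 'ds01',
         (1, 0): 'ds10', (1, 1): 'ds11'}

def new_tile_id(item):
    tile_id, _ = item
    return tile_id[1:]          # drop the first index

# Entries that differ only in their first index fall into the same group,
# and each group is "concatenated" into one entry keyed by the shorter ID.
reduced = {}
for new_id, group in groupby(sorted(tiles.items(), key=new_tile_id),
                             key=new_tile_id):
    reduced[new_id] = '+'.join(ds for _, ds in group)

# reduced == {(0,): 'ds00+ds10', (1,): 'ds01+ds11'}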


def auto_combine(datasets,
concat_dim=_CONCAT_DIM_DEFAULT,
concat_dims=_CONCAT_DIM_DEFAULT,
compat='no_conflicts',
data_vars='all', coords='different'):
data_vars='all', coords='different',
infer_order_from_coords=False):
"""Attempt to auto-magically combine the given datasets into one.

This method attempts to combine a list of datasets into a single entity by
inspecting metadata and using a combination of concat and merge.

It does not concatenate along more than one dimension or sort data under
any circumstances. It does align coordinates, but different variables on
datasets can cause it to fail under some scenarios. In complex cases, you
may need to clean up your data and use ``concat``/``merge`` explicitly.

``auto_combine`` works well if you have N years of data and M data
variables, and each combination of a distinct time period and set of data
variables is saved its own dataset.
This method attempts to combine a list (or nested list of lists) of
datasets into a single entity by inspecting metadata and using a
combination of concat and merge.

Parameters
----------
datasets : sequence of xarray.Dataset
Dataset objects to merge.
concat_dim : str or DataArray or Index, optional
Dimension along which to concatenate variables, as used by
concat_dims : list of str or DataArray or Index, optional
Dimensions along which to concatenate variables, as used by
:py:func:`xarray.concat`. You only need to provide this argument if
the dimension along which you want to concatenate is not a dimension
in the original datasets, e.g., if you want to stack a collection of
2D arrays along a third dimension.
any of the dimensions along which you want to concatenate are not a
dimension in the original datasets, e.g., if you want to stack a
collection of 2D arrays along a third dimension.
By default, xarray attempts to infer this argument by examining
component files. Set ``concat_dim=None`` explicitly to disable
concatenation.
Expand All @@ -415,8 +560,14 @@ def auto_combine(datasets,
of all non-null values.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
Details are in the documentation of concat
coords : {'minimal', 'different', 'all' o list of str}, optional
coords : {'minimal', 'different', 'all' or list of str}, optional
Details are in the documentation of concat
infer_order_from_coords : bool, optional
If true attempt to deduce the order in which the datasets should be
concatenated from their coordinates. To do this the coordinates should
be monotonic along the dimension to be concatenated.
If false instead read the order from the structure the datasets are
supplied in. This structure should be a nested list of lists.

Returns
-------
Expand All @@ -427,15 +578,33 @@ def auto_combine(datasets,
concat
Dataset.merge
"""
from toolz import itertoolz
if concat_dim is not None:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
datasets).values()
concatenated = [_auto_concat(ds, dim=dim,
data_vars=data_vars, coords=coords)
for ds in grouped]

# TODO perform some of the checks from _calc_concat_dim_coord on concat_dims here?

if concat_dims is not None:

# TODO this could be where we would optionally check alignment, as in #2039?

# Organise datasets in concatenation order in N-D
if infer_order_from_coords:
# TODO Use coordinates to determine tile_ID for each dataset in N-D
# i.e. (shoyer's (1) from discussion in #2159)
raise NotImplementedError
# Once this is implemented I think it should be the default
else:
# Determine tile_IDs by structure of input in N-D
# (i.e. ordering in list-of-lists)
combined_ids, concat_dims = _infer_concat_order_from_nested_list\
(datasets, concat_dims)

# Check that the combined_ids are sensible
_check_shape_tile_ids(combined_ids)

# Repeatedly concatenate then merge along each dimension
combined = _combine_nd(combined_ids, concat_dims, compat=compat,
data_vars=data_vars, coords=coords)
else:
# Case of no concatenation wanted
concatenated = datasets
merged = merge(concatenated, compat=compat)
return merged
combined = merge(concatenated, compat=compat)
return combined
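A minimal sketch of the N-D call as the public API stands at this revision (the concat_dims keyword and nested-list input shown here are hidden again by the later "Reverted API" commit, so this reflects an intermediate state rather than the merged behaviour):

import numpy as np
import xarray as xr

def block(t, x):
    return xr.Dataset({'v': (('t', 'x'), np.random.rand(len(t), len(x)))},
                      coords={'t': t, 'x': x})

t1, t2 = np.arange(0, 5), np.arange(5, 10)
x1, x2 = np.arange(0, 3), np.arange(3, 6)

# The outer level of the nesting is 't', the inner level is 'x'.
datasets = [[block(t1, x1), block(t1, x2)],
            [block(t2, x1), block(t2, x2)]]

combined = xr.auto_combine(datasets, concat_dims=['t', 'x'])
# combined['v'] now covers the full 10 x 6 grid.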
8 changes: 8 additions & 0 deletions xarray/testing.py
@@ -138,3 +138,11 @@ def assert_allclose(a, b, rtol=1e-05, atol=1e-08, decode_bytes=True):
else:
raise TypeError('{} not supported by assertion comparison'
.format(type(a)))


def assert_combined_tile_ids_equal(dict1, dict2):
assert len(dict1) == len(dict2)
for k, v in dict1.items():
assert k in dict2.keys()
assert_equal(dict1[k], dict2[k])
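For example, a test can compare an expected mapping of tile IDs to datasets against the one actually produced (a sketch; the single shared dataset here is just a placeholder):

import xarray as xr
from xarray.testing import assert_combined_tile_ids_equal

ds = xr.Dataset({'a': ('x', [1])})
expected = {(0,): ds, (1,): ds}
actual = {(1,): ds, (0,): ds}   # same contents, different insertion order

assert_combined_tile_ids_equal(expected, actual)   # passes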

2 changes: 1 addition & 1 deletion xarray/tests/__init__.py
@@ -15,7 +15,7 @@
from xarray.core import utils
from xarray.core.indexing import ExplicitlyIndexed
from xarray.testing import (assert_equal, assert_identical, # noqa: F401
assert_allclose)
assert_allclose, assert_combined_tile_ids_equal)
from xarray.plot.utils import import_seaborn

try:
6 changes: 3 additions & 3 deletions xarray/tests/test_backends.py
@@ -2234,7 +2234,7 @@ def test_open_mfdataset_concat_dim_none(self):
data = Dataset({'x': 0})
data.to_netcdf(tmp1)
Dataset({'x': np.nan}).to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2], concat_dim=None) as actual:
with open_mfdataset([tmp1, tmp2], concat_dims=None) as actual:
assert_identical(data, actual)

def test_open_dataset(self):
@@ -2261,7 +2261,7 @@ def test_open_single_dataset(self):
{'baz': [100]})
with create_tmp_file() as tmp:
original.to_netcdf(tmp)
with open_mfdataset([tmp], concat_dim=dim) as actual:
with open_mfdataset([tmp], concat_dims=[dim]) as actual:
assert_identical(expected, actual)

def test_dask_roundtrip(self):
@@ -2625,7 +2625,7 @@ def test_uamiv_format_mfread(self):
['example.uamiv',
'example.uamiv'],
engine='pseudonetcdf',
concat_dim='TSTEP',
concat_dims=['TSTEP'],
backend_kwargs={'format': 'uamiv'})

data1 = np.arange(20, dtype='f').reshape(1, 1, 4, 5)