diff --git a/doc/api.rst b/doc/api.rst index c66e61dddf8..ba85f9491ee 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -106,6 +106,9 @@ Indexing Dataset.squeeze Dataset.reindex Dataset.reindex_like + Dataset.set_index + Dataset.reset_index + Dataset.reorder_levels Computation ----------- @@ -239,6 +242,9 @@ Indexing DataArray.squeeze DataArray.reindex DataArray.reindex_like + DataArray.set_index + DataArray.reset_index + DataArray.reorder_levels Comparisons ----------- diff --git a/doc/reshaping.rst b/doc/reshaping.rst index 0dcf461de87..092aade590e 100644 --- a/doc/reshaping.rst +++ b/doc/reshaping.rst @@ -4,7 +4,7 @@ Reshaping and reorganizing data ############################### -These methods allow you to reorganize +These methods allow you to reorganize .. ipython:: python :suppress: @@ -95,23 +95,79 @@ always succeeds, even if the multi-index being unstacked does not contain all possible levels. Missing levels are filled in with ``NaN`` in the resulting object: .. ipython:: python - + stacked2 = stacked[::2] - stacked2 + stacked2 stacked2.unstack('z') However, xarray's ``stack`` has an important difference from pandas: unlike pandas, it does not automatically drop missing values. Compare: .. ipython:: python - + array = xr.DataArray([[np.nan, 1], [2, 3]], dims=['x', 'y']) - array.stack(z=('x', 'y')) + array.stack(z=('x', 'y')) array.to_pandas().stack() We departed from pandas's behavior here because predictable shapes for new array dimensions is necessary for :ref:`dask`. +.. _reshape.set_index: + +Set and reset index +------------------- + +Complementary to stack / unstack, xarray's ``.set_index``, ``.reset_index`` and +``.reorder_levels`` allow easy manipulation of ``DataArray`` or ``Dataset`` +multi-indexes without modifying the data and its dimensions. + +You can create a multi-index from several 1-dimensional variables and/or +coordinates using :py:meth:`~xarray.DataArray.set_index`: + +.. ipython:: python + + da = xr.DataArray(np.random.rand(4), + coords={'band': ('x', ['a', 'a', 'b', 'b']), + 'wavenumber': ('x', np.linspace(200, 400, 4))}, + dims='x') + da + mda = da.set_index(x=['band', 'wavenumber']) + mda + +These coordinates can now be used for indexing, e.g., + +.. ipython:: python + + mda.sel(band='a') + +Conversely, you can use :py:meth:`~xarray.DataArray.reset_index` +to extract multi-index levels as coordinates (this is mainly useful +for serialization): + +.. ipython:: python + + mda.reset_index('x') + +:py:meth:`~xarray.DataArray.reorder_levels` allows changing the order +of multi-index levels: + +.. ipython:: python + + mda.reorder_levels(x=['wavenumber', 'band']) + +As of xarray v0.9 coordinate labels for each dimension are optional. +You can also use ``.set_index`` / ``.reset_index`` to add / remove +labels for one or several dimensions: + +.. ipython:: python + + array = xr.DataArray([1, 2, 3], dims='x') + array + array['c'] = ('x', ['a', 'b', 'c']) + array.set_index(x='c') + array.set_index(x='c', inplace=True) + array.reset_index('x', drop=True) + Shift and roll -------------- diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 350b472af56..9f1ca8e5080 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -104,6 +104,9 @@ Enhancements as keyword arguments, e.g., ``ds.sel(time='2000-01')`` (see :ref:`multi-level indexing`). By `Benoit Bovy `_. +- Added ``set_index``, ``reset_index`` and ``reorder_levels`` methods to + easily create and manipulate (multi-)indexes (see :ref:`reshape.set_index`). + By `Benoit Bovy `_. 
- Added the ``compat`` option ``'no_conflicts'`` to ``merge``, allowing the combination of xarray objects with disjoint (:issue:`742`) or overlapping (:issue:`835`) coordinates as long as all present data agrees. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index c755d4508fb..55a60b9f21d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -18,7 +18,7 @@ from .common import AbstractArray, BaseDataObject from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource, Indexes) -from .dataset import Dataset +from .dataset import Dataset, merge_indexes, split_indexes from .pycompat import iteritems, basestring, OrderedDict, zip, range from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, @@ -846,6 +846,103 @@ def swap_dims(self, dims_dict): ds = self._to_temp_dataset().swap_dims(dims_dict) return self._from_temp_dataset(ds) + def set_index(self, append=False, inplace=False, **indexes): + """Set DataArray (multi-)indexes using one or more existing coordinates. + + Parameters + ---------- + append : bool, optional + If True, append the supplied index(es) to the existing index(es). + Otherwise replace the existing index(es) (default). + inplace : bool, optional + If True, set new index(es) in-place. Otherwise, return a new DataArray + object. + **indexes : {dim: index, ...} + Keyword arguments with names matching dimensions and values given + by (lists of) the names of existing coordinates or variables to set + as new (multi-)index. + + Returns + ------- + obj : DataArray + Another dataarray, with this dataarray's data but replaced coordinates. + + See Also + -------- + DataArray.reset_index + """ + coords, _ = merge_indexes(indexes, self._coords, set(), append=append) + if inplace: + self._coords = coords + else: + return self._replace(coords=coords) + + def reset_index(self, dims_or_levels, drop=False, inplace=False): + """Reset the specified index(es) or multi-index level(s). + + Parameters + ---------- + dims_or_levels : str or list + Name(s) of the dimension(s) and/or multi-index level(s) that will + be reset. + drop : bool, optional + If True, remove the specified indexes and/or multi-index levels + instead of extracting them as new coordinates (default: False). + inplace : bool, optional + If True, modify the dataarray in-place. Otherwise, return a new + DataArray object. + + Returns + ------- + obj : DataArray + Another dataarray, with this dataarray's data but replaced + coordinates. + + See Also + -------- + DataArray.set_index + """ + coords, _ = split_indexes(dims_or_levels, self._coords, set(), + self._level_coords, drop=drop) + if inplace: + self._coords = coords + else: + return self._replace(coords=coords) + + def reorder_levels(self, inplace=False, **dim_order): + """Rearrange index levels using input order. + + Parameters + ---------- + inplace : bool, optional + If True, modify the dataarray in-place. Otherwise, return a new + DataArray object. + **dim_order : optional + Keyword arguments with names matching dimensions and values given + by lists representing new level orders. Every given dimension + must have a multi-index. + + Returns + ------- + obj : DataArray + Another dataarray, with this dataarray's data but replaced + coordinates. 
+ """ + replace_coords = {} + for dim, order in dim_order.items(): + coord = self._coords[dim] + index = coord.to_index() + if not isinstance(index, pd.MultiIndex): + raise ValueError("coordinate %r has no MultiIndex" % dim) + replace_coords[dim] = IndexVariable(coord.dims, + index.reorder_levels(order)) + coords = self._coords.copy() + coords.update(replace_coords) + if inplace: + self._coords = coords + else: + return self._replace(coords=coords) + def stack(self, **dimensions): """ Stack any number of existing dimensions into a single new dimension. diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9a3e2117d80..aa7a67c5783 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2,7 +2,7 @@ from __future__ import division from __future__ import print_function import functools -from collections import Mapping +from collections import Mapping, defaultdict from numbers import Number import numpy as np @@ -22,7 +22,8 @@ merge_data_and_coords) from .utils import (Frozen, SortedKeysDict, maybe_wrap_array, hashable, decode_numpy_dict_values, ensure_us_time_resolution) -from .variable import (Variable, as_variable, IndexVariable, broadcast_variables) +from .variable import (Variable, as_variable, IndexVariable, + broadcast_variables) from .pycompat import (iteritems, basestring, OrderedDict, dask_array_type, range) from .combine import concat @@ -114,6 +115,125 @@ def calculate_dimensions(variables): return dims +def merge_indexes( + indexes, # type: Dict[Any, Union[Any, List[Any]]] + variables, # type: Dict[Any, Variable] + coord_names, # type: Set + append=False, # type: bool + ): + # type: (...) -> Tuple[OrderedDict[Any, Variable], Set] + """Merge variables into multi-indexes. + + Not public API. Used in Dataset and DataArray set_index + methods. + """ + vars_to_replace = {} + vars_to_remove = [] + + for dim, var_names in indexes.items(): + if isinstance(var_names, basestring): + var_names = [var_names] + + names, labels, levels = [], [], [] + current_index_variable = variables.get(dim) + + if current_index_variable is not None and append: + current_index = current_index_variable.to_index() + if isinstance(current_index, pd.MultiIndex): + names.extend(current_index.names) + labels.extend(current_index.labels) + levels.extend(current_index.levels) + else: + names.append('%s_level_0' % dim) + cat = pd.Categorical(current_index.values, ordered=True) + labels.append(cat.codes) + levels.append(cat.categories) + + for n in var_names: + names.append(n) + var = variables[n] + if (current_index_variable is not None and + var.dims != current_index_variable.dims): + raise ValueError( + "dimension mismatch between %r %s and %r %s" + % (dim, current_index_variable.dims, n, var.dims)) + else: + cat = pd.Categorical(var.values, ordered=True) + labels.append(cat.codes) + levels.append(cat.categories) + + idx = pd.MultiIndex(labels=labels, levels=levels, names=names) + vars_to_replace[dim] = IndexVariable(dim, idx) + vars_to_remove.extend(var_names) + + new_variables = OrderedDict([(k, v) for k, v in iteritems(variables) + if k not in vars_to_remove]) + new_variables.update(vars_to_replace) + new_coord_names = coord_names | set(vars_to_replace) + new_coord_names -= set(vars_to_remove) + + return new_variables, new_coord_names + + +def split_indexes( + dims_or_levels, # type: Union[Any, List[Any]] + variables, # type: Dict[Any, Variable] + coord_names, # type: Set + level_coords, # type: Dict[Any, Any] + drop=False, # type: bool + ): + # type: (...) 
-> Tuple[OrderedDict[Any, Variable], Set] + """Extract (multi-)indexes (levels) as variables. + + Not public API. Used in Dataset and DataArray reset_index + methods. + """ + if isinstance(dims_or_levels, basestring): + dims_or_levels = [dims_or_levels] + + dim_levels = defaultdict(list) + dims = [] + for k in dims_or_levels: + if k in level_coords: + dim_levels[level_coords[k]].append(k) + else: + dims.append(k) + + vars_to_replace = {} + vars_to_create = OrderedDict() + vars_to_remove = [] + + for d in dims: + index = variables[d].to_index() + if isinstance(index, pd.MultiIndex): + dim_levels[d] = index.names + else: + vars_to_remove.append(d) + if not drop: + vars_to_create[d + '_'] = Variable(d, index) + + for d, levs in dim_levels.items(): + index = variables[d].to_index() + if len(levs) == index.nlevels: + vars_to_remove.append(d) + else: + vars_to_replace[d] = IndexVariable(d, index.droplevel(levs)) + + if not drop: + for lev in levs: + idx = index.get_level_values(lev) + vars_to_create[idx.name] = Variable(d, idx) + + new_variables = variables.copy() + for v in set(vars_to_remove): + del new_variables[v] + new_variables.update(vars_to_replace) + new_variables.update(vars_to_create) + new_coord_names = (coord_names | set(vars_to_create)) - set(vars_to_remove) + + return new_variables, new_coord_names + + def _assert_empty(args, msg='%s'): if args: raise ValueError(msg % args) @@ -1262,7 +1382,6 @@ def rename(self, name_dict, inplace=False): See Also -------- - Dataset.swap_dims DataArray.rename """ @@ -1342,6 +1461,99 @@ def swap_dims(self, dims_dict, inplace=False): return self._replace_vars_and_dims(variables, coord_names, inplace=inplace) + def set_index(self, append=False, inplace=False, **indexes): + """Set Dataset (multi-)indexes using one or more existing coordinates or + variables. + + Parameters + ---------- + append : bool, optional + If True, append the supplied index(es) to the existing index(es). + Otherwise replace the existing index(es) (default). + inplace : bool, optional + If True, set new index(es) in-place. Otherwise, return a new + Dataset object. + **indexes : {dim: index, ...} + Keyword arguments with names matching dimensions and values given + by (lists of) the names of existing coordinates or variables to set + as new (multi-)index. + + Returns + ------- + obj : Dataset + Another dataset, with this dataset's data but replaced coordinates. + + See Also + -------- + Dataset.reset_index + """ + variables, coord_names = merge_indexes(indexes, self._variables, + self._coord_names, + append=append) + return self._replace_vars_and_dims(variables, coord_names=coord_names, + inplace=inplace) + + def reset_index(self, dims_or_levels, drop=False, inplace=False): + """Reset the specified index(es) or multi-index level(s). + + Parameters + ---------- + dims_or_levels : str or list + Name(s) of the dimension(s) and/or multi-index level(s) that will + be reset. + drop : bool, optional + If True, remove the specified indexes and/or multi-index levels + instead of extracting them as new coordinates (default: False). + inplace : bool, optional + If True, modify the dataset in-place. Otherwise, return a new + Dataset object. + + Returns + ------- + obj : Dataset + Another dataset, with this dataset's data but replaced coordinates. 
+ + See Also + -------- + Dataset.set_index + """ + variables, coord_names = split_indexes(dims_or_levels, self._variables, + self._coord_names, + self._level_coords, drop=drop) + return self._replace_vars_and_dims(variables, coord_names=coord_names, + inplace=inplace) + + def reorder_levels(self, inplace=False, **dim_order): + """Rearrange index levels using input order. + + Parameters + ---------- + inplace : bool, optional + If True, modify the dataset in-place. Otherwise, return a new + DataArray object. + **dim_order : optional + Keyword arguments with names matching dimensions and values given + by lists representing new level orders. Every given dimension + must have a multi-index. + + Returns + ------- + obj : Dataset + Another dataset, with this dataset's data but replaced + coordinates. + """ + replace_variables = {} + for dim, order in dim_order.items(): + coord = self._variables[dim] + index = coord.to_index() + if not isinstance(index, pd.MultiIndex): + raise ValueError("coordinate %r has no MultiIndex" % dim) + replace_variables[dim] = IndexVariable(coord.dims, + index.reorder_levels(order)) + variables = self._variables.copy() + variables.update(replace_variables) + return self._replace_vars_and_dims(variables, inplace=inplace) + def _stack_once(self, dims, new_dim): variables = OrderedDict() for name, var in self.variables.items(): diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index bd6c5ccff92..d9269d57c4f 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -909,6 +909,83 @@ def test_swap_dims(self): actual = array.swap_dims({'x': 'y'}) self.assertDataArrayIdentical(expected, actual) + def test_set_index(self): + indexes = [self.mindex.get_level_values(n) for n in self.mindex.names] + coords = {idx.name: ('x', idx) for idx in indexes} + array = DataArray(self.mda.values, coords=coords, dims='x') + expected = self.mda.copy() + level_3 = ('x', [1, 2, 3, 4]) + array['level_3'] = level_3 + expected['level_3'] = level_3 + + obj = array.set_index(x=self.mindex.names) + self.assertDataArrayIdentical(obj, expected) + + obj = obj.set_index(x='level_3', append=True) + expected = array.set_index(x=['level_1', 'level_2', 'level_3']) + self.assertDataArrayIdentical(obj, expected) + + array.set_index(x=['level_1', 'level_2', 'level_3'], inplace=True) + self.assertDataArrayIdentical(array, expected) + + array2d = DataArray(np.random.rand(2, 2), + coords={'x': ('x', [0, 1]), + 'level': ('y', [1, 2])}, + dims=('x', 'y')) + with self.assertRaisesRegexp(ValueError, 'dimension mismatch'): + array2d.set_index(x='level') + + def test_reset_index(self): + indexes = [self.mindex.get_level_values(n) for n in self.mindex.names] + coords = {idx.name: ('x', idx) for idx in indexes} + expected = DataArray(self.mda.values, coords=coords, dims='x') + + obj = self.mda.reset_index('x') + self.assertDataArrayIdentical(obj, expected) + obj = self.mda.reset_index(self.mindex.names) + self.assertDataArrayIdentical(obj, expected) + obj = self.mda.reset_index(['x', 'level_1']) + self.assertDataArrayIdentical(obj, expected) + + coords = {'x': ('x', self.mindex.droplevel('level_1')), + 'level_1': ('x', self.mindex.get_level_values('level_1'))} + expected = DataArray(self.mda.values, coords=coords, dims='x') + obj = self.mda.reset_index(['level_1']) + self.assertDataArrayIdentical(obj, expected) + + expected = DataArray(self.mda.values, dims='x') + obj = self.mda.reset_index('x', drop=True) + self.assertDataArrayIdentical(obj, expected) + + array = 
self.mda.copy() + array.reset_index(['x'], drop=True, inplace=True) + self.assertDataArrayIdentical(array, expected) + + # single index + array = DataArray([1, 2], coords={'x': ['a', 'b']}, dims='x') + expected = DataArray([1, 2], coords={'x_': ('x', ['a', 'b'])}, + dims='x') + self.assertDataArrayIdentical(array.reset_index('x'), expected) + + def test_reorder_levels(self): + midx = self.mindex.reorder_levels(['level_2', 'level_1']) + expected = DataArray(self.mda.values, coords={'x': midx}, dims='x') + + obj = self.mda.reorder_levels(x=['level_2', 'level_1']) + self.assertDataArrayIdentical(obj, expected) + + array = self.mda.copy() + array.reorder_levels(x=['level_2', 'level_1'], inplace=True) + self.assertDataArrayIdentical(array, expected) + + array = DataArray([1, 2], dims='x') + with self.assertRaises(KeyError): + array.reorder_levels(x=['level_1', 'level_2']) + + array['x'] = [0, 1] + with self.assertRaisesRegexp(ValueError, 'has no MultiIndex'): + array.reorder_levels(x=['level_1', 'level_2']) + def test_dataset_getitem(self): dv = self.ds['foo'] self.assertDataArrayIdentical(dv, self.dv) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 1a970fe718d..06c9c9d8b02 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -1454,6 +1454,48 @@ def test_swap_dims(self): with self.assertRaisesRegexp(ValueError, 'replacement dimension'): original.swap_dims({'x': 'z'}) + def test_set_index(self): + expected = create_test_multiindex() + mindex = expected['x'].to_index() + indexes = [mindex.get_level_values(n) for n in mindex.names] + coords = {idx.name: ('x', idx) for idx in indexes} + ds = Dataset({}, coords=coords) + + obj = ds.set_index(x=mindex.names) + self.assertDatasetIdentical(obj, expected) + + ds.set_index(x=mindex.names, inplace=True) + self.assertDatasetIdentical(ds, expected) + + def test_reset_index(self): + ds = create_test_multiindex() + mindex = ds['x'].to_index() + indexes = [mindex.get_level_values(n) for n in mindex.names] + coords = {idx.name: ('x', idx) for idx in indexes} + expected = Dataset({}, coords=coords) + + obj = ds.reset_index('x') + self.assertDatasetIdentical(obj, expected) + + ds.reset_index('x', inplace=True) + self.assertDatasetIdentical(ds, expected) + + def test_reorder_levels(self): + ds = create_test_multiindex() + mindex = ds['x'].to_index() + midx = mindex.reorder_levels(['level_2', 'level_1']) + expected = Dataset({}, coords={'x': midx}) + + reindexed = ds.reorder_levels(x=['level_2', 'level_1']) + self.assertDatasetIdentical(reindexed, expected) + + ds.reorder_levels(x=['level_2', 'level_1'], inplace=True) + self.assertDatasetIdentical(ds, expected) + + ds = Dataset({}, coords={'x': [1, 2]}) + with self.assertRaisesRegexp(ValueError, 'has no MultiIndex'): + ds.reorder_levels(x=['level_1', 'level_2']) + def test_stack(self): ds = Dataset({'a': ('x', [0, 1]), 'b': (('x', 'y'), [[0, 1], [2, 3]]),
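For quick reference, a minimal sketch of the user-facing API this patch introduces, mirroring the ``doc/reshaping.rst`` examples above; it assumes a build of xarray that includes these methods:

```python
import numpy as np
import xarray as xr

# Two 1-d coordinates along 'x' that we want to combine into a MultiIndex.
da = xr.DataArray(np.random.rand(4),
                  coords={'band': ('x', ['a', 'a', 'b', 'b']),
                          'wavenumber': ('x', np.linspace(200, 400, 4))},
                  dims='x')

mda = da.set_index(x=['band', 'wavenumber'])          # build the MultiIndex
print(mda.sel(band='a'))                              # select on a single level

print(mda.reset_index('x'))                           # levels back to plain coordinates
print(mda.reorder_levels(x=['wavenumber', 'band']))   # permute the level order
```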
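Internally, ``merge_indexes`` factorizes each coordinate with ``pd.Categorical`` and assembles a ``pd.MultiIndex`` from the resulting codes and categories. Below is a standalone pandas sketch of that construction, not taken from the patch (note the patch passes ``labels=``, which newer pandas spells ``codes=``):

```python
import numpy as np
import pandas as pd

band = np.array(['a', 'a', 'b', 'b'])
wavenumber = np.linspace(200, 400, 4)

# .codes gives integer positions into .categories (the level values).
cats = [pd.Categorical(band, ordered=True),
        pd.Categorical(wavenumber, ordered=True)]

idx = pd.MultiIndex(levels=[c.categories for c in cats],
                    codes=[c.codes for c in cats],    # 'labels=' on pandas < 0.24
                    names=['band', 'wavenumber'])

# Same result as the higher-level constructor:
assert idx.equals(pd.MultiIndex.from_arrays([band, wavenumber],
                                            names=['band', 'wavenumber']))
```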
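One detail of ``split_indexes`` worth noting: resetting a plain (single-level) index keeps its values but stores them under the name ``<dim>_`` (e.g. ``x_``), since they can no longer serve as the dimension's index; with ``drop=True`` they are discarded. A small sketch, again assuming a build with this patch:

```python
import xarray as xr

arr = xr.DataArray([1, 2], coords={'x': ['a', 'b']}, dims='x')

# The old index values survive as a non-index coordinate named 'x_' ...
print(arr.reset_index('x'))

# ... unless drop=True, in which case they are removed entirely.
print(arr.reset_index('x', drop=True))
```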
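The same operations work at the ``Dataset`` level, roughly following the new tests; the coordinate values below are illustrative rather than copied from ``create_test_multiindex``:

```python
import pandas as pd
import xarray as xr

ds = xr.Dataset(coords={'level_1': ('x', ['a', 'a', 'b', 'b']),
                        'level_2': ('x', [1, 2, 1, 2])})

ds_mi = ds.set_index(x=['level_1', 'level_2'])
assert isinstance(ds_mi.indexes['x'], pd.MultiIndex)

# reset_index undoes the merge; reorder_levels permutes levels, in place if asked.
print(ds_mi.reset_index('x'))
ds_mi.reorder_levels(x=['level_2', 'level_1'], inplace=True)
print(ds_mi)
```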