From 39c7e34617807525433dca91323297bea6770a99 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 12 Aug 2019 18:06:51 -0700 Subject: [PATCH] sparse=True option for from_dataframe and from_series Fixes https://github.com/pydata/xarray/issues/3206 Example usage: In [3]: import pandas as pd ...: import numpy as np ...: import xarray ...: df = pd.DataFrame({ ...: 'w': range(10), ...: 'x': list('abcdefghij'), ...: 'y': np.arange(0, 100, 10), ...: 'z': np.ones(10), ...: }).set_index(['w', 'x', 'y']) ...: In [4]: ds = xarray.Dataset.from_dataframe(df, sparse=True) In [5]: ds.z.data Out[5]: --- doc/whats-new.rst | 6 +++ xarray/core/dataarray.py | 6 +-- xarray/core/dataset.py | 77 +++++++++++++++++++++++++++++----- xarray/tests/test_dataarray.py | 12 ++++++ xarray/tests/test_dataset.py | 21 ++++++++++ 5 files changed, 109 insertions(+), 13 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index cab754965dd..3a314232179 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,6 +32,12 @@ New functions/methods By `Nezar Abdennur `_ and `Guido Imperiale `_. +- :py:meth:`~Dataset.from_dataframe` and :py:meth:`~DataArray.from_series` now + support ``sparse=True`` for converting pandas objects into xarray objects + wrapping sparse arrays. This is particularly useful with sparsely populated + hierarchical indexes. (:issue:`3206`) + By `Stephan Hoyer `_. + - The xarray package is now discoverable by mypy (although typing hints coverage is not complete yet). mypy type checking is now enforced by CI. 
Libraries that depend on xarray and use mypy can now remove from their setup.cfg the lines:: diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 72e000ec609..a3b4b13f67f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2295,7 +2295,7 @@ def from_dict(cls, d: dict) -> "DataArray": return obj @classmethod - def from_series(cls, series: pd.Series) -> "DataArray": + def from_series(cls, series: pd.Series, sparse: bool = False) -> "DataArray": """Convert a pandas.Series into an xarray.DataArray. If the series's index is a MultiIndex, it will be expanded into a @@ -2303,10 +2303,10 @@ def from_series(cls, series: pd.Series) -> "DataArray": values with NaN). Thus this operation should be the inverse of the `to_series` method. """ - # TODO: add a 'name' parameter + # TODO: add a 'name' parameter? name = series.name df = pd.DataFrame({name: series}) - ds = Dataset.from_dataframe(df) + ds = Dataset.from_dataframe(df, sparse=sparse) return ds[name] def to_cdms2(self) -> "cdms2_Variable": diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 4250bf9564e..efab7e123b5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4071,8 +4071,55 @@ def to_dataframe(self): """ return self._to_dataframe(self.dims) + def _set_sparse_data_from_dataframe(self, dataframe, dims, shape): + from sparse import COO + + idx = dataframe.index + if isinstance(idx, pd.MultiIndex): + try: + codes = idx.codes + except AttributeError: + # deprecated since pandas 0.24 + codes = idx.labels + coords = np.stack([np.asarray(code) for code in codes], axis=0) + is_sorted = idx.is_lexsorted() # True iff the codes are lexsorted + else: + coords = np.arange(idx.size).reshape(1, -1) # positions 0..n-1 + is_sorted = True + + for name, series in dataframe.items(): + values = np.asarray(series) + + # In virtually all real use cases, the sparse array will now have + # missing values and needs a fill_value.
For consistency, don't + # special case the rare exceptions (e.g., dtype=int without a + # MultiIndex). + dtype, fill_value = dtypes.maybe_promote(values.dtype) + values = np.asarray(values, dtype=dtype) + + data = COO( + coords, + values, + shape, + has_duplicates=False, + sorted=is_sorted, + fill_value=fill_value, + ) + self[name] = (dims, data) + + def _set_numpy_data_from_dataframe(self, dataframe, dims, shape): + idx = dataframe.index + if isinstance(idx, pd.MultiIndex): + # expand the DataFrame to include the product of all levels + full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names) + dataframe = dataframe.reindex(full_idx) + + for name, series in dataframe.items(): + data = np.asarray(series).reshape(shape) + self[name] = (dims, data) + @classmethod - def from_dataframe(cls, dataframe): + def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset": """Convert a pandas.DataFrame into an xarray.Dataset Each column will be converted into an independent variable in the @@ -4081,7 +4128,20 @@ def from_dataframe(cls, dataframe): values with NaN). This method will produce a Dataset very similar to that on which the 'to_dataframe' method was called, except with possibly redundant dimensions (since all dataset variables will have - the same dimensionality). + the same dimensionality) + + Parameters + ---------- + dataframe : pandas.DataFrame + DataFrame from which to copy data and indices. + sparse : bool + If true, create sparse arrays instead of dense numpy arrays. This + can potentially save a large amount of memory if the DataFrame has + a MultiIndex. Requires the sparse package (sparse.pydata.org). + + Returns + ------- + New Dataset.
""" # TODO: Add an option to remove dimensions along which the variables # are constant, to enable consistent serialization to/from a dataframe, @@ -4094,10 +4154,6 @@ def from_dataframe(cls, dataframe): obj = cls() if isinstance(idx, pd.MultiIndex): - # it's a multi-index - # expand the DataFrame to include the product of all levels - full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names) - dataframe = dataframe.reindex(full_idx) dims = [ name if name is not None else "level_%i" % n for n, name in enumerate(idx.names) @@ -4108,11 +4164,12 @@ def from_dataframe(cls, dataframe): else: dims = (idx.name if idx.name is not None else "index",) obj[dims[0]] = (dims, idx) - shape = -1 + shape = [idx.size] - for name, series in dataframe.items(): - data = np.asarray(series).reshape(shape) - obj[name] = (dims, data) + if sparse: + obj._set_sparse_data_from_dataframe(dataframe, dims, shape) + else: + obj._set_numpy_data_from_dataframe(dataframe, dims, shape) return obj def to_dask_dataframe(self, dim_order=None, set_index=False): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 71734f09cfd..4afddc47728 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3374,6 +3374,18 @@ def test_to_and_from_series(self): expected_da = self.dv.rename(None) assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"])) + def test_from_series_sparse(self): + sparse = pytest.importorskip("sparse") + + series = pd.Series([1, 2], index=[("a", 1), ("b", 2)]) + + actual_sparse = DataArray.from_series(series, sparse=True) + actual_dense = DataArray.from_series(series, sparse=False) + + assert isinstance(actual_sparse.data, sparse.COO) + actual_sparse.data = actual_sparse.data.todense() + assert_identical(actual_sparse, actual_dense) + def test_to_and_from_empty_series(self): # GH697 expected = pd.Series([]) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index
75325a77b36..cc37aa1d20d 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3674,6 +3674,27 @@ def test_to_and_from_dataframe(self): expected = pd.DataFrame([[]], index=idx) assert expected.equals(actual), (expected, actual) + def test_from_dataframe_sparse(self): + sparse = pytest.importorskip("sparse") # skip when sparse is not installed + + df_base = pd.DataFrame( + {"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)} + ) + + ds_sparse = Dataset.from_dataframe(df_base.set_index("x"), sparse=True) + ds_dense = Dataset.from_dataframe(df_base.set_index("x"), sparse=False) + assert isinstance(ds_sparse["y"].data, sparse.COO) + assert isinstance(ds_sparse["z"].data, sparse.COO) + ds_sparse["y"].data = ds_sparse["y"].data.todense() # densify before comparing + ds_sparse["z"].data = ds_sparse["z"].data.todense() + assert_identical(ds_dense, ds_sparse) + + ds_sparse = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=True) + ds_dense = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=False) + assert isinstance(ds_sparse["z"].data, sparse.COO) + ds_sparse["z"].data = ds_sparse["z"].data.todense() # densify before comparing + assert_identical(ds_dense, ds_sparse) + def test_to_and_from_empty_dataframe(self): # GH697 expected = pd.DataFrame({"foo": []})