Skip to content

Commit

Permalink
sparse=True option for from_dataframe and from_series
Browse files Browse the repository at this point in the history
Fixes pydata#3206

Example usage:

    In [3]: import pandas as pd
       ...: import numpy as np
       ...: import xarray
       ...: df = pd.DataFrame({
       ...:     'w': range(10),
       ...:     'x': list('abcdefghij'),
       ...:     'y': np.arange(0, 100, 10),
       ...:     'z': np.ones(10),
       ...: }).set_index(['w', 'x', 'y'])
       ...:

    In [4]: ds = xarray.Dataset.from_dataframe(df, sparse=True)

    In [5]: ds.z.data
    Out[5]: <COO: shape=(10, 10, 10), dtype=float64, nnz=10, fill_value=nan>
  • Loading branch information
shoyer committed Aug 13, 2019
1 parent c782637 commit 39c7e34
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 13 deletions.
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ New functions/methods
By `Nezar Abdennur <https://github.com/nvictus>`_
and `Guido Imperiale <https://github.com/crusaderky>`_.

- :py:meth:`~Dataset.from_dataframe` and :py:meth:`~DataArray.from_series` now
support ``sparse=True`` for converting pandas objects into xarray objects
wrapping sparse arrays. This is particularly useful with sparsely populated
hierarchical indexes. (:issue:`3206`)
By `Stephan Hoyer <https://github.com/shoyer>`_.

- The xarray package is now discoverable by mypy (although typing hints coverage is not
complete yet). mypy type checking is now enforced by CI. Libraries that depend on
xarray and use mypy can now remove from their setup.cfg the lines::
Expand Down
6 changes: 3 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2295,18 +2295,18 @@ def from_dict(cls, d: dict) -> "DataArray":
return obj

@classmethod
def from_series(cls, series: pd.Series) -> "DataArray":
def from_series(cls, series: pd.Series, sparse: bool = False) -> "DataArray":
"""Convert a pandas.Series into an xarray.DataArray.
If the series's index is a MultiIndex, it will be expanded into a
tensor product of one-dimensional coordinates (filling in missing
values with NaN). Thus this operation should be the inverse of the
`to_series` method.
"""
# TODO: add a 'name' parameter
# TODO: add a 'name' parameter?
name = series.name
df = pd.DataFrame({name: series})
ds = Dataset.from_dataframe(df)
ds = Dataset.from_dataframe(df, sparse=sparse)
return ds[name]

def to_cdms2(self) -> "cdms2_Variable":
Expand Down
77 changes: 67 additions & 10 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4071,8 +4071,55 @@ def to_dataframe(self):
"""
return self._to_dataframe(self.dims)

def _set_sparse_data_from_dataframe(self, dataframe, dims, shape):
    """Store every column of ``dataframe`` on this object as a ``sparse.COO``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the values; its index supplies the COO coordinates.
    dims : sequence
        Dimension names stored alongside each array as ``(dims, data)``.
    shape : sequence of int
        Shape passed to the ``COO`` constructor.

    Notes
    -----
    ``sparse`` is imported inside the method, so it is only needed when
    this method is actually called.
    """
    from sparse import COO

    idx = dataframe.index
    if isinstance(idx, pd.MultiIndex):
        try:
            codes = idx.codes
        except AttributeError:
            # deprecated since pandas 0.24
            codes = idx.labels
        # One row of integer codes per index level -> (n_levels, n_rows).
        coords = np.stack([np.asarray(code) for code in codes], axis=0)
        # ``is_lexsorted`` is a method and must be called: the bare bound
        # method is always truthy, which would tell COO that unsorted
        # coordinates are already sorted.
        is_sorted = idx.is_lexsorted()
    else:
        # A flat index maps row i to position i, which is sorted by
        # construction.
        coords = np.arange(idx.size).reshape(1, -1)
        is_sorted = True

    for name, series in dataframe.items():
        values = np.asarray(series)

        # In virtually all real use cases, the sparse array will now have
        # missing values and needs a fill_value. For consistency, don't
        # special case the rare exceptions (e.g., dtype=int without a
        # MultiIndex).
        dtype, fill_value = dtypes.maybe_promote(values.dtype)
        values = np.asarray(values, dtype=dtype)

        data = COO(
            coords,
            values,
            shape,
            has_duplicates=False,
            sorted=is_sorted,
            fill_value=fill_value,
        )
        self[name] = (dims, data)

def _set_numpy_data_from_dataframe(self, dataframe, dims, shape):
    """Store every column of ``dataframe`` on this object as a dense array.

    For a MultiIndex the frame is first reindexed onto the full product of
    its levels, then each column is reshaped to ``shape`` and assigned as
    ``(dims, data)``.
    """
    index = dataframe.index
    if isinstance(index, pd.MultiIndex):
        # Expand to the product of all levels so each column has one entry
        # per combination of level values before it is reshaped.
        product_index = pd.MultiIndex.from_product(index.levels, names=index.names)
        dataframe = dataframe.reindex(product_index)

    for column_name, column in dataframe.items():
        self[column_name] = (dims, np.asarray(column).reshape(shape))

@classmethod
def from_dataframe(cls, dataframe):
def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
"""Convert a pandas.DataFrame into an xarray.Dataset
Each column will be converted into an independent variable in the
Expand All @@ -4081,7 +4128,20 @@ def from_dataframe(cls, dataframe):
values with NaN). This method will produce a Dataset very similar to
that on which the 'to_dataframe' method was called, except with
possibly redundant dimensions (since all dataset variables will have
the same dimensionality).
the same dimensionality).
Parameters
----------
dataframe : pandas.DataFrame
DataFrame from which to copy data and indices.
sparse : bool
If true, create sparse arrays instead of dense numpy arrays. This
can potentially save a large amount of memory if the DataFrame has
a MultiIndex. Requires the sparse package (sparse.pydata.org).
Returns
-------
New Dataset.
"""
# TODO: Add an option to remove dimensions along which the variables
# are constant, to enable consistent serialization to/from a dataframe,
Expand All @@ -4094,10 +4154,6 @@ def from_dataframe(cls, dataframe):
obj = cls()

if isinstance(idx, pd.MultiIndex):
# it's a multi-index
# expand the DataFrame to include the product of all levels
full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
dataframe = dataframe.reindex(full_idx)
dims = [
name if name is not None else "level_%i" % n
for n, name in enumerate(idx.names)
Expand All @@ -4108,11 +4164,12 @@ def from_dataframe(cls, dataframe):
else:
dims = (idx.name if idx.name is not None else "index",)
obj[dims[0]] = (dims, idx)
shape = -1
shape = [idx.size]

for name, series in dataframe.items():
data = np.asarray(series).reshape(shape)
obj[name] = (dims, data)
if sparse:
obj._set_sparse_data_from_dataframe(dataframe, dims, shape)
else:
obj._set_numpy_data_from_dataframe(dataframe, dims, shape)
return obj

def to_dask_dataframe(self, dim_order=None, set_index=False):
Expand Down
12 changes: 12 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3374,6 +3374,18 @@ def test_to_and_from_series(self):
expected_da = self.dv.rename(None)
assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"]))

def test_from_series_sparse(self):
    """from_series(sparse=True) must match the dense result once densified."""
    sparse = pytest.importorskip("sparse")

    series = pd.Series([1, 2], index=[("a", 1), ("b", 2)])

    actual_sparse = DataArray.from_series(series, sparse=True)
    # The reference must come from the dense code path; building it with
    # sparse=True would leave the dense path out of the comparison.
    actual_dense = DataArray.from_series(series, sparse=False)

    assert isinstance(actual_sparse.data, sparse.COO)
    # Densify so both operands hold plain arrays before comparing.
    actual_sparse.data = actual_sparse.data.todense()
    assert_identical(actual_sparse, actual_dense)

def test_to_and_from_empty_series(self):
# GH697
expected = pd.Series([])
Expand Down
21 changes: 21 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3674,6 +3674,27 @@ def test_to_and_from_dataframe(self):
expected = pd.DataFrame([[]], index=idx)
assert expected.equals(actual), (expected, actual)

def test_from_dataframe_sparse(self):
    """from_dataframe(sparse=True) must match the dense result once densified."""
    sparse = pytest.importorskip("sparse")

    df_base = pd.DataFrame(
        {"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)}
    )

    # Each case: (columns used as the index, data variables left over).
    # The first uses a flat index, the second a two-level MultiIndex.
    cases = [
        ("x", ("y", "z")),
        (["x", "y"], ("z",)),
    ]
    for index_columns, variable_names in cases:
        ds_sparse = Dataset.from_dataframe(
            df_base.set_index(index_columns), sparse=True
        )
        ds_dense = Dataset.from_dataframe(
            df_base.set_index(index_columns), sparse=False
        )

        # Check every variable is sparse before any of them is densified.
        for variable in variable_names:
            assert isinstance(ds_sparse[variable].data, sparse.COO)
        for variable in variable_names:
            ds_sparse[variable].data = ds_sparse[variable].data.todense()

        assert_identical(ds_dense, ds_sparse)

def test_to_and_from_empty_dataframe(self):
# GH697
expected = pd.DataFrame({"foo": []})
Expand Down

0 comments on commit 39c7e34

Please sign in to comment.