diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 34f002086da..75a273bfdb4 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -43,6 +43,12 @@ New functions/methods
   This requires `sparse>=0.8.0`. By `Nezar Abdennur`_
   and `Guido Imperiale`_.
+- :py:meth:`~Dataset.from_dataframe` and :py:meth:`~DataArray.from_series` now
+  support ``sparse=True`` for converting pandas objects into xarray objects
+  wrapping sparse arrays. This is particularly useful with sparsely populated
+  hierarchical indexes. (:issue:`3206`)
+  By `Stephan Hoyer`_.
+
 - The xarray package is now discoverable by mypy (although typing hints
   coverage is not complete yet). mypy type checking is now enforced by CI.
   Libraries that depend on xarray and use mypy can now remove from their
   setup.cfg the lines::
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 4b8d8acb513..4f78ae7d021 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -733,7 +733,7 @@ def reset_coords(
         else:
             if self.name is None:
                 raise ValueError(
-                    "cannot reset_coords with drop=False " "on an unnamed DataArrray"
+                    "cannot reset_coords with drop=False on an unnamed DataArray"
                 )
             dataset[self.name] = self.variable
             return dataset
@@ -1448,9 +1448,7 @@ def expand_dims(
             This object, but with an additional dimension(s).
         """
         if isinstance(dim, int):
-            raise TypeError(
-                "dim should be hashable or sequence/mapping of " "hashables"
-            )
+            raise TypeError("dim should be hashable or sequence/mapping of hashables")
         elif isinstance(dim, Sequence) and not isinstance(dim, str):
             if len(dim) != len(set(dim)):
                 raise ValueError("dims should not contain duplicate values.")
@@ -2277,19 +2275,27 @@ def from_dict(cls, d: dict) -> "DataArray":
         return obj
 
     @classmethod
-    def from_series(cls, series: pd.Series) -> "DataArray":
+    def from_series(cls, series: pd.Series, sparse: bool = False) -> "DataArray":
         """Convert a pandas.Series into an xarray.DataArray.
 
         If the series's index is a MultiIndex, it will be expanded into a
         tensor product of one-dimensional coordinates (filling in missing
         values with NaN). Thus this operation should be the inverse of the
         `to_series` method.
+
+        If sparse=True, creates a sparse array instead of a dense NumPy array.
+        Requires the pydata/sparse package.
+
+        See also
+        --------
+        xarray.Dataset.from_dataframe
         """
-        # TODO: add a 'name' parameter
-        name = series.name
-        df = pd.DataFrame({name: series})
-        ds = Dataset.from_dataframe(df)
-        return ds[name]
+        temp_name = "__temporary_name"
+        df = pd.DataFrame({temp_name: series})
+        ds = Dataset.from_dataframe(df, sparse=sparse)
+        result = cast(DataArray, ds[temp_name])
+        result.name = series.name
+        return result
 
     def to_cdms2(self) -> "cdms2_Variable":
         """Convert this array into a cdms2.Variable
@@ -2704,7 +2710,7 @@ def dot(
         """
         if isinstance(other, Dataset):
             raise NotImplementedError(
-                "dot products are not yet supported " "with Dataset objects."
+                "dot products are not yet supported with Dataset objects."
             )
         if not isinstance(other, DataArray):
             raise TypeError("dot only operates on DataArrays.")
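
For reviewers, here is roughly how the new `from_series` keyword is expected to behave. This snippet is illustrative only (not part of the patch) and assumes this branch plus the `sparse` package are installed:

```python
import pandas as pd
import xarray as xr

# Only two of the four (x, y) combinations are populated; the missing ones
# become the promoted dtype's fill value (NaN for floats).
index = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["x", "y"])
series = pd.Series([1.0, 2.0], index=index)

arr = xr.DataArray.from_series(series, sparse=True)
print(type(arr.data))      # sparse.COO
print(arr.data.todense())  # [[ 1. nan]
                           #  [nan  2.]]
```
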
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index e62b6612ae6..14237a244fd 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -1214,12 +1214,13 @@ def loc(self) -> _LocIndexer:
         """
         return _LocIndexer(self)
 
-    def __getitem__(self, key: object) -> "Union[DataArray, Dataset]":
+    def __getitem__(self, key: Any) -> "Union[DataArray, Dataset]":
         """Access variables or coordinates this dataset as a
         :py:class:`~xarray.DataArray`.
 
         Indexing with a list of names will return a new ``Dataset`` object.
         """
+        # TODO(shoyer): type this properly: https://github.com/python/mypy/issues/7328
        if utils.is_dict_like(key):
             return self.isel(**cast(Mapping, key))
@@ -3916,8 +3917,61 @@ def to_dataframe(self):
         """
         return self._to_dataframe(self.dims)
 
+    def _set_sparse_data_from_dataframe(
+        self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...]
+    ) -> None:
+        from sparse import COO
+
+        idx = dataframe.index
+        if isinstance(idx, pd.MultiIndex):
+            try:
+                codes = idx.codes
+            except AttributeError:
+                # MultiIndex.codes was called .labels before pandas 0.24
+                codes = idx.labels
+            coords = np.stack([np.asarray(code) for code in codes], axis=0)
+            is_sorted = idx.is_lexsorted()
+        else:
+            coords = np.arange(idx.size).reshape(1, -1)
+            is_sorted = True
+
+        for name, series in dataframe.items():
+            # Cast to a NumPy array first, in case the Series is a pandas
+            # Extension array (which doesn't have a valid NumPy dtype)
+            values = np.asarray(series)
+
+            # In virtually all real use cases, the sparse array will now have
+            # missing values and needs a fill_value. For consistency, don't
+            # special case the rare exceptions (e.g., dtype=int without a
+            # MultiIndex).
+            dtype, fill_value = dtypes.maybe_promote(values.dtype)
+            values = np.asarray(values, dtype=dtype)
+
+            data = COO(
+                coords,
+                values,
+                shape,
+                has_duplicates=False,
+                sorted=is_sorted,
+                fill_value=fill_value,
+            )
+            self[name] = (dims, data)
+
+    def _set_numpy_data_from_dataframe(
+        self, dataframe: pd.DataFrame, dims: tuple, shape: Tuple[int, ...]
+    ) -> None:
+        idx = dataframe.index
+        if isinstance(idx, pd.MultiIndex):
+            # expand the DataFrame to include the product of all levels
+            full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
+            dataframe = dataframe.reindex(full_idx)
+
+        for name, series in dataframe.items():
+            data = np.asarray(series).reshape(shape)
+            self[name] = (dims, data)
+
     @classmethod
-    def from_dataframe(cls, dataframe):
+    def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> "Dataset":
         """Convert a pandas.DataFrame into an xarray.Dataset
 
         Each column will be converted into an independent variable in the
@@ -3926,7 +3980,24 @@ def from_dataframe(cls, dataframe):
         values with NaN). This method will produce a Dataset very similar to
         that on which the 'to_dataframe' method was called, except with
         possibly redundant dimensions (since all dataset variables will have
         the same dimensionality).
+
+        Parameters
+        ----------
+        dataframe : pandas.DataFrame
+            DataFrame from which to copy data and indices.
+        sparse : bool
+            If True, create sparse arrays instead of dense numpy arrays. This
+            can potentially save a large amount of memory if the DataFrame has
+            a MultiIndex. Requires the sparse package (sparse.pydata.org).
+
+        Returns
+        -------
+        New Dataset.
+
+        See also
+        --------
+        xarray.DataArray.from_series
         """
         # TODO: Add an option to remove dimensions along which the variables
         # are constant, to enable consistent serialization to/from a dataframe,
         # even if some variables have different dimensionality. (GH1298)
         idx = dataframe.index
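
The `_set_sparse_data_from_dataframe` method above never materializes the full tensor product: a MultiIndex's integer codes are already COO coordinates, since level *k*'s codes give each value's position along dimension *k*. A standalone sketch of that mapping (not part of the patch; assumes pandas >= 0.24 for `MultiIndex.codes` and the `sparse` package):

```python
import numpy as np
import pandas as pd
import sparse

idx = pd.MultiIndex.from_tuples([("a", 0), ("b", 1)], names=["x", "y"])

# One row of codes per level: the two values land at positions (0, 0) and (1, 1).
coords = np.stack([np.asarray(code) for code in idx.codes], axis=0)
shape = tuple(lev.size for lev in idx.levels)  # (2, 2)

data = sparse.COO(coords, np.array([1.0, 2.0]), shape, fill_value=np.nan)
print(data.todense())  # [[ 1. nan]
                       #  [nan  2.]]
```

The dense path, by contrast, reindexes to `MultiIndex.from_product(idx.levels)`, which allocates the full hypercube before reshaping.
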
@@ -3939,25 +4010,23 @@ def from_dataframe(cls, dataframe):
         obj = cls()
 
         if isinstance(idx, pd.MultiIndex):
-            # it's a multi-index
-            # expand the DataFrame to include the product of all levels
-            full_idx = pd.MultiIndex.from_product(idx.levels, names=idx.names)
-            dataframe = dataframe.reindex(full_idx)
-            dims = [
+            dims = tuple(
                 name if name is not None else "level_%i" % n
                 for n, name in enumerate(idx.names)
-            ]
+            )
             for dim, lev in zip(dims, idx.levels):
                 obj[dim] = (dim, lev)
-            shape = [lev.size for lev in idx.levels]
+            shape = tuple(lev.size for lev in idx.levels)
         else:
-            dims = (idx.name if idx.name is not None else "index",)
-            obj[dims[0]] = (dims, idx)
-            shape = -1
+            index_name = idx.name if idx.name is not None else "index"
+            dims = (index_name,)
+            obj[index_name] = (dims, idx)
+            shape = (idx.size,)
 
-        for name, series in dataframe.items():
-            data = np.asarray(series).reshape(shape)
-            obj[name] = (dims, data)
+        if sparse:
+            obj._set_sparse_data_from_dataframe(dataframe, dims, shape)
+        else:
+            obj._set_numpy_data_from_dataframe(dataframe, dims, shape)
         return obj
 
     def to_dask_dataframe(self, dim_order=None, set_index=False):
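
End to end, the new keyword on `Dataset.from_dataframe` should behave like this (illustrative only, not part of the patch; assumes the `sparse` package is installed):

```python
import pandas as pd
import xarray as xr

# 3 rows over a 2 x 3 grid of (x, y) combinations.
df = pd.DataFrame(
    {"x": ["a", "a", "b"], "y": [0, 1, 2], "z": [1.0, 2.0, 3.0]}
).set_index(["x", "y"])

ds = xr.Dataset.from_dataframe(df, sparse=True)
print(type(ds["z"].data))    # sparse.COO
print(ds["z"].data.density)  # 0.5: 3 stored values in a 2x3 array
```
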
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index fb4f8200e08..ab1d2714b9d 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -84,6 +84,7 @@ def LooseVersion(vstring):
 has_iris, requires_iris = _importorskip("iris")
 has_cfgrib, requires_cfgrib = _importorskip("cfgrib")
 has_numbagg, requires_numbagg = _importorskip("numbagg")
+has_sparse, requires_sparse = _importorskip("sparse")
 
 # some special cases
 has_h5netcdf07, requires_h5netcdf07 = _importorskip("h5netcdf", minversion="0.7")
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index f623ec9976f..532cc32376a 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -29,6 +29,7 @@
     requires_np113,
     requires_numbagg,
     requires_scipy,
+    requires_sparse,
     source_ndarray,
 )
 
@@ -3398,6 +3399,19 @@ def test_to_and_from_series(self):
         expected_da = self.dv.rename(None)
         assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"]))
 
+    @requires_sparse
+    def test_from_series_sparse(self):
+        import sparse
+
+        series = pd.Series([1, 2], index=[("a", 1), ("b", 2)])
+
+        actual_sparse = DataArray.from_series(series, sparse=True)
+        actual_dense = DataArray.from_series(series, sparse=False)
+
+        assert isinstance(actual_sparse.data, sparse.COO)
+        actual_sparse.data = actual_sparse.data.todense()
+        assert_identical(actual_sparse, actual_dense)
+
     def test_to_and_from_empty_series(self):
         # GH697
         expected = pd.Series([])
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index a5d9a65d020..3c2b9b6ce8f 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -46,6 +46,7 @@
     requires_dask,
     requires_numbagg,
     requires_scipy,
+    requires_sparse,
     source_ndarray,
 )
 
@@ -3653,6 +3654,28 @@ def test_to_and_from_dataframe(self):
         expected = pd.DataFrame([[]], index=idx)
         assert expected.equals(actual), (expected, actual)
 
+    @requires_sparse
+    def test_from_dataframe_sparse(self):
+        import sparse
+
+        df_base = pd.DataFrame(
+            {"x": range(10), "y": list("abcdefghij"), "z": np.arange(0, 100, 10)}
+        )
+
+        ds_sparse = Dataset.from_dataframe(df_base.set_index("x"), sparse=True)
+        ds_dense = Dataset.from_dataframe(df_base.set_index("x"), sparse=False)
+        assert isinstance(ds_sparse["y"].data, sparse.COO)
+        assert isinstance(ds_sparse["z"].data, sparse.COO)
+        ds_sparse["y"].data = ds_sparse["y"].data.todense()
+        ds_sparse["z"].data = ds_sparse["z"].data.todense()
+        assert_identical(ds_dense, ds_sparse)
+
+        ds_sparse = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=True)
+        ds_dense = Dataset.from_dataframe(df_base.set_index(["x", "y"]), sparse=False)
+        assert isinstance(ds_sparse["z"].data, sparse.COO)
+        ds_sparse["z"].data = ds_sparse["z"].data.todense()
+        assert_identical(ds_dense, ds_sparse)
+
     def test_to_and_from_empty_dataframe(self):
         # GH697
         expected = pd.DataFrame({"foo": []})
diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
index 725cfe3d506..766a391b57f 100644
--- a/xarray/tests/test_duck_array_ops.py
+++ b/xarray/tests/test_duck_array_ops.py
@@ -245,9 +245,9 @@ def construct_dataarray(dim_num, dtype, contains_nan, dask):
 
 
 def from_series_or_scalar(se):
-    try:
+    if isinstance(se, pd.Series):
         return DataArray.from_series(se)
-    except AttributeError:  # scalar case
+    else:  # scalar case
         return DataArray(se)
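
The tests above verify dense/sparse equivalence via `.todense()`. The motivating payoff, as noted in the whats-new entry, is memory: for a sparsely populated MultiIndex the dense path allocates the full product of levels, while `sparse=True` stores only the observed entries. A hypothetical comparison (not part of the patch):

```python
import numpy as np
import pandas as pd
import xarray as xr

# A "diagonal" MultiIndex: 1,000 rows out of 1,000,000 possible combinations.
n = 1000
index = pd.MultiIndex.from_arrays([np.arange(n), np.arange(n)], names=["i", "j"])
df = pd.DataFrame({"v": np.ones(n)}, index=index)

ds = xr.Dataset.from_dataframe(df, sparse=True)
print(ds["v"].data.nbytes)  # ~24 kB: data plus COO coordinates for 1,000 values
# The dense path would reindex to the full 1000 x 1000 product:
# xr.Dataset.from_dataframe(df, sparse=False)["v"].data.nbytes == 8_000_000
```
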