Skip to content

Commit

Permalink
Deprecate SparseDataFrame and SparseSeries (#26137)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and jreback committed May 29, 2019
1 parent 7629a18 commit e7ad884
Show file tree
Hide file tree
Showing 40 changed files with 488 additions and 175 deletions.
326 changes: 204 additions & 122 deletions doc/source/user_guide/sparse.rst

Large diffs are not rendered by default.

27 changes: 26 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -299,14 +299,39 @@ Other API Changes
Deprecations
~~~~~~~~~~~~

Sparse Subclasses
^^^^^^^^^^^^^^^^^

The ``SparseSeries`` and ``SparseDataFrame`` subclasses are deprecated. Their functionality is better provided
by a ``Series`` or ``DataFrame`` with sparse values.

**Previous Way**

.. ipython:: python
:okwarning:
df = pd.SparseDataFrame({"A": [0, 0, 1, 2]})
df.dtypes
**New Way**

.. ipython:: python
df = pd.DataFrame({"A": pd.SparseArray([0, 0, 1, 2])})
df.dtypes
The memory usage of the two approaches is identical. See :ref:`sparse.migration` for more (:issue:`19239`).

Other Deprecations
^^^^^^^^^^^^^^^^^^

- The deprecated ``.ix[]`` indexer now raises a more visible FutureWarning instead of DeprecationWarning (:issue:`26438`).
- Deprecated passing ``unit='M'`` (months) or ``unit='Y'`` (years) as the ``unit`` argument of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`)
- The :attr:`SparseArray.values` attribute is deprecated. You can use ``np.asarray(...)`` or
the :meth:`SparseArray.to_dense` method instead (:issue:`26421`).
- The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`)
- The :meth:`DataFrame.compound` and :meth:`Series.compound` methods are deprecated and will be removed in a future version (:issue:`26405`).


.. _whatsnew_0250.prior_deprecations:

Removal of prior version deprecations/changes
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2014,9 +2014,9 @@ def from_coo(cls, A, dense_index=False):
from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
from pandas import Series

result = _coo_to_sparse_series(A, dense_index=dense_index)
# SparseSeries -> Series[sparse]
result = Series(result.values, index=result.index, copy=False)
result = _coo_to_sparse_series(A, dense_index=dense_index,
sparse_series=False)
result = Series(result.array, index=result.index, copy=False)

return result

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1930,13 +1930,13 @@ def to_sparse(self, fill_value=None, kind='block'):
>>> type(df)
<class 'pandas.core.frame.DataFrame'>
>>> sdf = df.to_sparse()
>>> sdf
>>> sdf = df.to_sparse() # doctest: +SKIP
>>> sdf # doctest: +SKIP
0 1
0 NaN NaN
1 1.0 NaN
2 NaN 1.0
>>> type(sdf)
>>> type(sdf) # doctest: +SKIP
<class 'pandas.core.sparse.frame.SparseDataFrame'>
"""
from pandas.core.sparse.api import SparseDataFrame
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5589,7 +5589,7 @@ def ftypes(self):
3 float64:dense
dtype: object
>>> pd.SparseDataFrame(arr).ftypes
>>> pd.SparseDataFrame(arr).ftypes # doctest: +SKIP
0 float64:sparse
1 float64:sparse
2 float64:sparse
Expand Down
1 change: 0 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1586,7 +1586,6 @@ def to_sparse(self, kind='block', fill_value=None):
SparseSeries
Sparse representation of the Series.
"""
# TODO: deprecate
from pandas.core.sparse.series import SparseSeries

values = SparseArray(self, kind=kind, fill_value=fill_value)
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/sparse/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,24 @@
from pandas.core.sparse.series import SparseSeries

_shared_doc_kwargs = dict(klass='SparseDataFrame')
# Deprecation notice that ``SparseDataFrame.__init__`` passes to
# ``warnings.warn`` as a FutureWarning. The trailing backslashes inside the
# literal are line continuations: they suppress the leading newline and keep
# the documentation URL on a single line in the emitted message.
depr_msg = """\
SparseDataFrame is deprecated and will be removed in a future version.
Use a regular DataFrame whose columns are SparseArrays instead.
See http://pandas.pydata.org/pandas-docs/stable/\
user_guide/sparse.html#migrating for more.
"""


class SparseDataFrame(DataFrame):
"""
DataFrame containing sparse floating point data in the form of SparseSeries
objects
.. deprecated:: 0.25.0
Use a DataFrame with sparse values instead.
Parameters
----------
data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
Expand All @@ -56,6 +67,7 @@ class SparseDataFrame(DataFrame):
def __init__(self, data=None, index=None, columns=None, default_kind=None,
default_fill_value=None, dtype=None, copy=False):

warnings.warn(depr_msg, FutureWarning, stacklevel=2)
# pick up the defaults from the Sparse structures
if isinstance(data, SparseDataFrame):
if index is None:
Expand Down
24 changes: 21 additions & 3 deletions pandas/core/sparse/scipy_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,14 +116,32 @@ def _sparse_series_to_coo(ss, row_levels=(0, ), column_levels=(1, ),
return sparse_matrix, rows, columns


def _coo_to_sparse_series(A, dense_index=False):
def _coo_to_sparse_series(A, dense_index: bool = False,
sparse_series: bool = True):
"""
Convert a scipy.sparse.coo_matrix to a SparseSeries.
Use the defaults given in the SparseSeries constructor.
Parameters
----------
A : scipy.sparse.coo.coo_matrix
dense_index : bool, default False
sparse_series : bool, default True
Returns
-------
Series or SparseSeries
"""
from pandas import SparseDtype

s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
s = s.sort_index()
s = s.to_sparse() # TODO: specify kind?
if sparse_series:
# TODO(SparseSeries): remove this and the sparse_series keyword.
# This is just here to avoid the SparseSeries FutureWarning when
# _coo_to_sparse_series is called via Series.sparse.from_coo
s = s.to_sparse() # TODO: specify kind?
else:
s = s.astype(SparseDtype(s.dtype))
if dense_index:
# is there a better constructor method to use here?
i = range(A.shape[0])
Expand Down
16 changes: 16 additions & 0 deletions pandas/core/sparse/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,24 @@
optional_labels='', optional_axis='')


# Deprecation notice that ``SparseSeries.__init__`` passes to
# ``warnings.warn`` as a FutureWarning. The trailing backslashes inside the
# literal are line continuations: they suppress the leading newline and keep
# the documentation URL on a single line in the emitted message.
depr_msg = """\
SparseSeries is deprecated and will be removed in a future version.
Use a Series with sparse values instead.
>>> series = pd.Series(pd.SparseArray(...))
See http://pandas.pydata.org/pandas-docs/stable/\
user_guide/sparse.html#migrating for more.
"""


class SparseSeries(Series):
"""Data structure for labeled, sparse floating point data
.. deprecated:: 0.25.0
Use a Series with sparse values instead.
Parameters
----------
data : {array-like, Series, SparseSeries, dict}
Expand All @@ -60,6 +75,7 @@ class SparseSeries(Series):
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
fill_value=None, name=None, dtype=None, copy=False,
fastpath=False):
warnings.warn(depr_msg, FutureWarning, stacklevel=2)
# TODO: Most of this should be refactored and shared with Series
# 1. BlockManager -> array
# 2. Series.index, Series.name, index, name reconciliation
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/arrays/sparse/test_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,21 @@ def test_density(self):
res = df.sparse.density
expected = 0.75
assert res == expected

@pytest.mark.parametrize("dtype", ['int64', 'float64'])
@pytest.mark.parametrize("dense_index", [True, False])
@td.skip_if_no_scipy
def test_series_from_coo(self, dtype, dense_index):
    """Check Series.sparse.from_coo on a 3x3 identity matrix in COO form.

    The result is compared against a sparse-valued Series of ones keyed
    by the diagonal (row, col) pairs; with ``dense_index`` the expectation
    is reindexed onto the full product of the index levels.
    """
    import scipy.sparse

    # Build the expectation first: ones on the diagonal positions.
    diagonal = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)])
    ones = np.array([1, 1, 1], dtype=dtype)
    expected = pd.Series(pd.SparseArray(ones), index=diagonal)
    if dense_index:
        full_grid = pd.MultiIndex.from_product(diagonal.levels)
        expected = expected.reindex(full_grid)

    coo_identity = scipy.sparse.eye(3, format='coo', dtype=dtype)
    result = pd.Series.sparse.from_coo(coo_identity,
                                       dense_index=dense_index)

    tm.assert_series_equal(result, expected)
1 change: 1 addition & 0 deletions pandas/tests/arrays/sparse/test_arithmetics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas.util.testing as tm


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
class TestSparseArrayArithmetics:

_base = np.array
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def test_scalar_with_index_infer_dtype(self, scalar, dtype):
assert exp.dtype == dtype

@pytest.mark.parametrize("fill", [1, np.nan, 0])
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
def test_sparse_series_round_trip(self, kind, fill):
# see gh-13999
arr = SparseArray([np.nan, 1, np.nan, 2, 3],
Expand All @@ -231,6 +232,7 @@ def test_sparse_series_round_trip(self, kind, fill):
tm.assert_sp_array_equal(arr, res)

@pytest.mark.parametrize("fill", [True, False, np.nan])
@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
def test_sparse_series_round_trip2(self, kind, fill):
# see gh-13999
arr = SparseArray([True, False, True, True], dtype=np.bool,
Expand Down Expand Up @@ -1098,6 +1100,7 @@ def test_npoints(self):
assert arr.npoints == 1


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
class TestAccessor:

@pytest.mark.parametrize('attr', [
Expand Down
28 changes: 24 additions & 4 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
from pandas.core.sparse.api import SparseDtype
import pandas.util.testing as tm

ignore_sparse_warning = pytest.mark.filterwarnings(
"ignore:Sparse:FutureWarning"
)


# EA & Actual Dtypes
def to_ea_dtypes(dtypes):
Expand Down Expand Up @@ -146,6 +150,7 @@ def test_is_object():
@pytest.mark.parametrize("check_scipy", [
False, pytest.param(True, marks=td.skip_if_no_scipy)
])
@ignore_sparse_warning
def test_is_sparse(check_scipy):
assert com.is_sparse(pd.SparseArray([1, 2, 3]))
assert com.is_sparse(pd.SparseSeries([1, 2, 3]))
Expand All @@ -158,6 +163,7 @@ def test_is_sparse(check_scipy):


@td.skip_if_no_scipy
@ignore_sparse_warning
def test_is_scipy_sparse():
from scipy.sparse import bsr_matrix
assert com.is_scipy_sparse(bsr_matrix([1, 2, 3]))
Expand Down Expand Up @@ -529,6 +535,7 @@ def test_is_bool_dtype():
@pytest.mark.parametrize("check_scipy", [
False, pytest.param(True, marks=td.skip_if_no_scipy)
])
@ignore_sparse_warning
def test_is_extension_type(check_scipy):
assert not com.is_extension_type([1, 2, 3])
assert not com.is_extension_type(np.array([1, 2, 3]))
Expand Down Expand Up @@ -595,8 +602,6 @@ def test_is_offsetlike():
(pd.DatetimeIndex([1, 2]).dtype, np.dtype('=M8[ns]')),
('<M8[ns]', np.dtype('<M8[ns]')),
('datetime64[ns, Europe/London]', DatetimeTZDtype('ns', 'Europe/London')),
(pd.SparseSeries([1, 2], dtype='int32'), SparseDtype('int32')),
(pd.SparseSeries([1, 2], dtype='int32').dtype, SparseDtype('int32')),
(PeriodDtype(freq='D'), PeriodDtype(freq='D')),
('period[D]', PeriodDtype(freq='D')),
(IntervalDtype(), IntervalDtype()),
Expand All @@ -605,6 +610,14 @@ def test__get_dtype(input_param, result):
assert com._get_dtype(input_param) == result


@ignore_sparse_warning
def test__get_dtype_sparse():
    """_get_dtype maps a SparseSeries, and its dtype, to SparseDtype."""
    sparse_ser = pd.SparseSeries([1, 2], dtype='int32')
    expected = SparseDtype('int32')
    # Same expectation whether the object or its dtype is passed in.
    for candidate in (sparse_ser, sparse_ser.dtype):
        assert com._get_dtype(candidate) == expected


@pytest.mark.parametrize('input_param,expected_error_message', [
(None, "Cannot deduce dtype from null object"),
(1, "data type not understood"),
Expand Down Expand Up @@ -640,8 +653,7 @@ def test__get_dtype_fails(input_param, expected_error_message):
(pd.DatetimeIndex(['2000'], tz='Europe/London').dtype,
pd.Timestamp),
('datetime64[ns, Europe/London]', pd.Timestamp),
(pd.SparseSeries([1, 2], dtype='int32'), np.int32),
(pd.SparseSeries([1, 2], dtype='int32').dtype, np.int32),
(PeriodDtype(freq='D'), pd.Period),
('period[D]', pd.Period),
(IntervalDtype(), pd.Interval),
Expand All @@ -652,3 +664,11 @@ def test__get_dtype_fails(input_param, expected_error_message):
])
def test__is_dtype_type(input_param, result):
assert com._is_dtype_type(input_param, lambda tipo: tipo == result)


@ignore_sparse_warning
def test__is_dtype_type_sparse():
    """_is_dtype_type sees int32 for a SparseSeries and for its dtype."""
    int32 = np.dtype('int32')

    def matches_int32(tipo):
        return tipo == int32

    sparse_ser = pd.SparseSeries([1, 2], dtype='int32')
    assert com._is_dtype_type(sparse_ser, matches_int32)
    assert com._is_dtype_type(sparse_ser.dtype, matches_int32)
7 changes: 6 additions & 1 deletion pandas/tests/dtypes/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,6 @@ def test_registry_find(dtype, expected):
(pd.Series([1, 2]), False),
(np.array([True, False]), True),
(pd.Series([True, False]), True),
(pd.SparseSeries([True, False]), True),
(pd.SparseArray([True, False]), True),
(SparseDtype(bool), True)
])
Expand All @@ -879,6 +878,12 @@ def test_is_bool_dtype(dtype, expected):
assert result is expected


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
def test_is_bool_dtype_sparse():
    """is_bool_dtype returns exactly True for a boolean SparseSeries."""
    bool_sparse = pd.SparseSeries([True, False])
    assert is_bool_dtype(bool_sparse) is True


@pytest.mark.parametrize("check", [
is_categorical_dtype,
is_datetime64tz_dtype,
Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/dtypes/test_generic.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from warnings import catch_warnings
from warnings import catch_warnings, simplefilter

import numpy as np

Expand All @@ -17,9 +17,12 @@ class TestABCClasses:
categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
sparse_series = pd.Series([1, 2, 3]).to_sparse()
with catch_warnings():
simplefilter('ignore', FutureWarning)
sparse_series = pd.Series([1, 2, 3]).to_sparse()
sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})

sparse_array = pd.SparseArray(np.random.randn(10))
sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})
datetime_array = pd.core.arrays.DatetimeArray(datetime_index)
timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index)

Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas.util.testing as tm


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
class TestDataFrameAlterAxes:

def test_set_index_directly(self, float_string_frame):
Expand Down Expand Up @@ -1376,6 +1377,7 @@ def test_droplevel(self):
tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
class TestIntervalIndex:

def test_setitem(self):
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2073,6 +2073,7 @@ def test_loc_duplicates(self):
df.loc[trange[bool_idx], "A"] += 6
tm.assert_frame_equal(df, expected)

@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
def test_iloc_sparse_propegate_fill_value(self):
from pandas.core.sparse.api import SparseDataFrame
df = SparseDataFrame({'A': [999, 1]}, default_fill_value=999)
Expand Down
Loading

0 comments on commit e7ad884

Please sign in to comment.