pandas-dev · jreback · May 29, 2019 · Mar 15, 2019 · Mar 12, 2019 · May 14, 2019
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
@@ -6,27 +6,28 @@
 Sparse data structures
 **********************
 
-We have implemented "sparse" versions of ``Series`` and ``DataFrame``. These are not sparse
-in the typical "mostly 0". Rather, you can view these objects as being "compressed"
-where any data matching a specific value (``NaN`` / missing value, though any value
-can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been
-"sparsified". This will make much more sense with an example. All of the standard pandas
-data structures have a ``to_sparse`` method:
+.. note::
 
-.. ipython:: python
-
-   ts = pd.Series(np.random.randn(10))
-   ts[2:-2] = np.nan
-   sts = ts.to_sparse()
-   sts
+   ``SparseSeries`` and ``SparseDataFrame`` have been deprecated. Their purpose
+   is served equally well by a :class:`Series` or :class:`DataFrame` with
+   sparse values. See :ref:`sparse.migration` for tips on migrating.
 
-The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see
-below) and a ``fill_value``. So if we had a mostly zero ``Series``, we could
-convert it to sparse with ``fill_value=0``:
+Pandas provides data structures for efficiently storing sparse data.
+These are not necessarily sparse in the typical "mostly 0" sense. Rather, you can view these
+objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value
+can be chosen, including 0) is omitted. A special ``SparseIndex`` object tracks where data has been
+"sparsified". For example,
 
 .. ipython:: python
 
-   ts.fillna(0).to_sparse(fill_value=0)
+   arr = np.random.randn(10)
+   arr[2:-2] = np.nan
+   ts = pd.Series(pd.SparseArray(arr))
+   ts
+
+Notice the dtype, ``Sparse[float64, nan]``. The ``nan`` means that elements in the
+array that are ``nan`` aren't actually stored; only the non-``nan`` elements are.
+Those non-``nan`` elements have a ``float64`` dtype.
 
 The sparse objects exist for memory efficiency reasons. Suppose you had a
 large, mostly NA ``DataFrame``:
@@ -35,21 +36,64 @@ large, mostly NA ``DataFrame``:
 
    df = pd.DataFrame(np.random.randn(10000, 4))
    df.iloc[:9998] = np.nan
-   sdf = df.to_sparse()
+   sdf = df.astype(pd.SparseDtype("float", np.nan))
    sdf
-   sdf.density
+   sdf.sparse.density
 
 As you can see, the density (% of values that have not been "compressed") is
 extremely low. This sparse object takes up much less memory on disk (pickled)
 and in the Python interpreter. Functionally, their behavior should be nearly
 identical to their dense counterparts.
 
-Any sparse object can be converted back to the standard dense form by calling
-``to_dense``:
+.. _sparse.array:
+
+SparseArray
+-----------
+
+:class:`SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray`
+for storing an array of sparse values (see :ref:`basics.dtypes` for more
+on extension arrays). It is a 1-dimensional ndarray-like object storing
+only values distinct from the ``fill_value``:
+
+.. ipython:: python
+
+   arr = np.random.randn(10)
+   arr[2:5] = np.nan
+   arr[7:8] = np.nan
+   sparr = pd.SparseArray(arr)
+   sparr
+
+A sparse array can be converted to a regular (dense) ndarray with :func:`numpy.asarray`:
+
+.. ipython:: python
+
+   np.asarray(sparr)
+
+The :attr:`SparseArray.dtype` property stores two pieces of information:
+
+1. The dtype of the non-sparse values
+2. The scalar fill value
+
+A :class:`SparseDtype` may be constructed by passing each of these
+
+.. ipython:: python
+
+   pd.SparseDtype(np.dtype('datetime64[ns]'))
+
+The default fill value for a given NumPy dtype is the "missing" value for that dtype,
+though it may be overridden.
+
+.. ipython:: python
+
+   pd.SparseDtype(np.dtype('datetime64[ns]'),
+                  fill_value=pd.Timestamp('2017-01-01'))
+
+Finally, the string alias ``'Sparse[dtype]'`` may be used to specify a sparse dtype
+in many places
 
 .. ipython:: python
 
-   sts.to_dense()
+   pd.array([1, 0, 0, 2], dtype='Sparse[int]')
 
 .. _sparse.accessor:
 
@@ -71,30 +115,11 @@ attributes and methods that are specific to sparse data.
 This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
 class itself for creating a Series with sparse data from a scipy COO matrix with.
 
-.. _sparse.array:
-
-SparseArray
------------
 
-``SparseArray`` is the base layer for all of the sparse indexed data
-structures. It is a 1-dimensional ndarray-like object storing only values
-distinct from the ``fill_value``:
-
-.. ipython:: python
-
-   arr = np.random.randn(10)
-   arr[2:5] = np.nan
-   arr[7:8] = np.nan
-   sparr = pd.SparseArray(arr)
-   sparr
-
-Like the indexed objects (SparseSeries, SparseDataFrame), a ``SparseArray``
-can be converted back to a regular ndarray by calling ``to_dense``:
-
-.. ipython:: python
-
-   sparr.to_dense()
+.. versionadded:: 0.25.0
 
+A ``.sparse`` accessor has been added for :class:`DataFrame` as well.
+See :ref:`api.dataframe.sparse` for more.
-See :ref:`api.dataframe.sparse` for more.
+See :ref:`api.frame.sparse` for more.
-See :ref:`api.dataframe.sparse` for more.
+See :ref:`api.frame.sparse` for more.
 
 SparseIndex objects
 -------------------
@@ -105,84 +130,115 @@ keeps an arrays of all of the locations where the data are not equal to the
 fill value. The ``block`` format tracks only the locations and sizes of blocks
 of data.
 
-.. _sparse.dtype:
+.. _sparse.calculation:
+
+Sparse Calculation
+------------------
 
-Sparse Dtypes
--------------
+You can apply NumPy *ufuncs* to ``SparseArray`` and get a ``SparseArray`` as a result.
 
-Sparse data should have the same dtype as its dense representation. Currently,
-``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original
-dtype, ``fill_value`` default changes:
+.. ipython:: python
+
+   arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan])
+   np.abs(arr)
 
-* ``float64``: ``np.nan``
-* ``int64``: ``0``
-* ``bool``: ``False``
+
+The *ufunc* is also applied to ``fill_value``. This is needed to get
+the correct dense result.
 
 .. ipython:: python
 
-   s = pd.Series([1, np.nan, np.nan])
-   s
-   s.to_sparse()
+   arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1)
+   np.abs(arr)
+   np.abs(arr).to_dense()
 
-   s = pd.Series([1, 0, 0])
-   s
-   s.to_sparse()
+.. _sparse.migration:
 
-   s = pd.Series([True, False, True])
-   s
-   s.to_sparse()
+Migrating
+---------
+
+In older versions of pandas, the ``SparseSeries`` and ``SparseDataFrame`` classes (documented below)
+were the preferred way to work with sparse data. With the advent of extension arrays, these subclasses
+are no longer needed. Their purpose is better served by using a regular Series or DataFrame with
+sparse values instead.
+
+**There's no performance or memory penalty to using a Series or DataFrame with sparse values,
+rather than a SparseSeries or SparseDataFrame**.
+
+This section provides some guidance on migrating your code to the new style. As a reminder, you can
+use the Python ``warnings`` module to control warnings. If you wish to ignore the warnings,
+
+.. code-block:: python
+
+   >>> import warnings
+
+   >>> warnings.filterwarnings('ignore', 'Sparse', FutureWarning)
+   >>> pd.SparseSeries()  # No warning message
+   Series([], dtype: Sparse[float64, nan])
+   BlockIndex
+   Block locations: array([], dtype=int32)
+   Block lengths: array([], dtype=int32)
+
+But we recommend modifying your code, rather than ignoring the warning.
+
+**Construction**
 
-You can change the dtype using ``.astype()``, the result is also sparse. Note that
-``.astype()`` also affects to the ``fill_value`` to keep its dense representation.
+From an array-like, use the regular :class:`Series` or
+:class:`DataFrame` constructors with :class:`SparseArray` values.
 
+.. code-block:: python
+
+   # Old way
+   >>> pd.SparseDataFrame({"A": [0, 1]})
 
 .. ipython:: python
 
-   s = pd.Series([1, 0, 0, 0, 0])
-   s
-   ss = s.to_sparse()
-   ss
-   ss.astype(np.float64)
+   # New way
+   pd.DataFrame({"A": pd.SparseArray([0, 1])})
 
-It raises if any value cannot be coerced to specified dtype.
+From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`,
 
-.. code-block:: ipython
+.. code-block:: python
 
-   In [1]: ss = pd.Series([1, np.nan, np.nan]).to_sparse()
-   Out[1]:
-   0    1.0
-   1    NaN
-   2    NaN
-   dtype: float64
-   BlockIndex
-   Block locations: array([0], dtype=int32)
-   Block lengths: array([1], dtype=int32)
+   # Old way
+   df = pd.SparseDataFrame(sp_matrix, columns=['A', 'B', 'C'])
 
-   In [2]: ss.astype(np.int64)
-   Out[2]:
-   ValueError: unable to coerce current fill_value nan to int64 dtype
+.. ipython:: python
 
-.. _sparse.calculation:
+   # New way
+   from scipy import sparse
+   mat = sparse.eye(3)
+   df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['A', 'B', 'C'])
+   df
 
-Sparse Calculation
-------------------
+**Conversion**
 
-You can apply NumPy *ufuncs* to ``SparseArray`` and get a ``SparseArray`` as a result.
+From sparse to dense, use the ``.sparse`` accessors
 
 .. ipython:: python
 
-   arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan])
-   np.abs(arr)
+   df.sparse.to_dense()
+   df.sparse.to_coo()
+   df['A']
 
+From dense to sparse, use :meth:`DataFrame.astype` with a :class:`SparseDtype`.
 
-The *ufunc* is also applied to ``fill_value``. This is needed to get
-the correct dense result.
+.. ipython:: python
+
+   dense = pd.DataFrame({"A": [1, 0, 0, 1]})
+   dtype = pd.SparseDtype(int, fill_value=0)
+   dense.astype(dtype)['A']
+
+**Sparse Properties**
+
+Sparse-specific properties, like ``density``, are available on the ``.sparse`` accessor.
 
 .. ipython:: python
 
-   arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1)
-   np.abs(arr)
-   np.abs(arr).to_dense()
+   df.sparse.density
+
+The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes
+have no replacement.
 
 .. _sparse.scipysparse:
 
@@ -197,6 +253,7 @@ SparseDataFrame
 Pandas supports creating sparse dataframes directly from ``scipy.sparse`` matrices.
 
 .. ipython:: python
+   :okwarning:
 
    from scipy.sparse import csr_matrix
 
@@ -291,3 +348,5 @@ row and columns coordinates of the matrix. Note that this will consume a signifi
 
    ss_dense = pd.SparseSeries.from_coo(A, dense_index=True)
    ss_dense
+
+
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -226,6 +226,7 @@ Deprecations
 
 - Deprecated the ``units=M`` (months) and ``units=Y`` (year) parameters for ``units`` of :func:`pandas.to_timedelta`, :func:`pandas.Timedelta` and :func:`pandas.TimedeltaIndex` (:issue:`16344`)
 - The functions :func:`pandas.to_datetime` and :func:`pandas.to_timedelta` have deprecated the ``box`` keyword. Instead, use :meth:`to_numpy` or :meth:`Timestamp.to_datetime64` or :meth:`Timedelta.to_timedelta64`. (:issue:`24416`)
+- The ``SparseSeries`` and ``SparseDataFrame`` subclasses are deprecated. Use a ``DataFrame`` or ``Series`` with sparse values instead. See :ref:`sparse.migration` for more (:issue:`19239`).
 
 .. _whatsnew_0250.prior_deprecations:
 

diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -28,6 +28,13 @@
 from pandas.core.sparse.series import SparseSeries
 
 _shared_doc_kwargs = dict(klass='SparseDataFrame')
+depr_msg = """\
+SparseDataFrame is deprecated and will be removed in a future version.
+Use a DataFrame with sparse values instead.
+
+See http://pandas.pydata.org/pandas-docs/stable/\
+user_guide/sparse.html#migrating for more.
+"""
 
 
 class SparseDataFrame(DataFrame):
@@ -56,6 +63,7 @@ class SparseDataFrame(DataFrame):
     def __init__(self, data=None, index=None, columns=None, default_kind=None,
                  default_fill_value=None, dtype=None, copy=False):
 
+        warnings.warn(depr_msg, FutureWarning, stacklevel=2)
         # pick up the defaults from the Sparse structures
         if isinstance(data, SparseDataFrame):
             if index is None:

diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
@@ -32,6 +32,15 @@
                           optional_labels='', optional_axis='')
 
 
+depr_msg = """\
+SparseSeries is deprecated and will be removed in a future version.
+Use a Series with sparse values instead.
+
+See http://pandas.pydata.org/pandas-docs/stable/\
+user_guide/sparse.html#migrating for more.
+"""
+
+
 class SparseSeries(Series):
     """Data structure for labeled, sparse floating point data
 
@@ -60,6 +69,7 @@ class SparseSeries(Series):
     def __init__(self, data=None, index=None, sparse_index=None, kind='block',
                  fill_value=None, name=None, dtype=None, copy=False,
                  fastpath=False):
+        warnings.warn(depr_msg, FutureWarning, stacklevel=2)
         # TODO: Most of this should be refactored and shared with Series
         # 1. BlockManager -> array
         # 2. Series.index, Series.name, index, name reconciliation

diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py
@@ -8,6 +8,7 @@
 import pandas.util.testing as tm
 
 
+@pytest.mark.filterwarnings("ignore:Sparse:FutureWarning")
 class TestSparseArrayArithmetics:
 
     _base = np.array