From 44b743d0bc1dfbc383d366d515ca810580d75ac5 Mon Sep 17 00:00:00 2001
From: Tom Augspurger <TomAugspurger@users.noreply.github.com>
Date: Thu, 25 Oct 2018 20:28:55 -0500
Subject: [PATCH] API: Add sparse Accessor (#23183)

---
 doc/source/api.rst                       |  16 +++
 doc/source/sparse.rst                    |  20 +++
 doc/source/whatsnew/v0.24.0.txt          |   8 +-
 pandas/core/accessor.py                  |  11 +-
 pandas/core/arrays/sparse.py             | 174 +++++++++++++++++++++++
 pandas/core/indexes/accessors.py         |   1 -
 pandas/core/series.py                    |   4 +-
 pandas/core/sparse/series.py             |  93 +-----------
 pandas/tests/arrays/sparse/test_array.py |  49 +++++++
 9 files changed, 281 insertions(+), 95 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 1ec2a56dcd0941..6e8eb83577c466 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -851,6 +851,22 @@ Sparse
    SparseSeries.to_coo
    SparseSeries.from_coo
 
+.. autosummary::
+   :toctree: generated/
+   :template: autosummary/accessor_attribute.rst
+
+   Series.sparse.npoints
+   Series.sparse.density
+   Series.sparse.fill_value
+   Series.sparse.sp_values
+
+
+.. autosummary::
+   :toctree: generated/
+
+   Series.sparse.from_coo
+   Series.sparse.to_coo
+
 .. _api.dataframe:
 
 DataFrame
diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
index 2bb99dd1822b68..884512981e1c90 100644
--- a/doc/source/sparse.rst
+++ b/doc/source/sparse.rst
@@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling
 
    sts.to_dense()
 
+.. _sparse.accessor:
+
+Sparse Accessor
+---------------
+
+.. versionadded:: 0.24.0
+
+Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat``
+for categorical data, and ``.dt`` for datetime-like data. This namespace provides
+attributes and methods that are specific to sparse data.
+
+.. ipython:: python
+
+   s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
+   s.sparse.density
+   s.sparse.fill_value
+
+This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
+class itself for creating a Series with sparse data from a scipy COO matrix.
+
 .. _sparse.array:
 
 SparseArray
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index fd34fef886a16e..6b074611ef3e96 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -533,7 +533,6 @@ changes were made:
 - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
 - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.
 
-
 Some new warnings are issued for operations that require or are likely to materialize a large dense array:
 
 - A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array.
@@ -541,6 +540,13 @@ Some new warnings are issued for operations that require or are likely to materi
 
 In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made <whatsnew_0240.bug_fixes.sparse>`.
 
+Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`.
+
+.. ipython:: python
+
+   s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
+   s.sparse.density
+
 .. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:
 
 Raise ValueError in ``DataFrame.to_dict(orient='index')``
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
index eab529584d1fb5..bc91372e3ac7d4 100644
--- a/pandas/core/accessor.py
+++ b/pandas/core/accessor.py
@@ -113,15 +113,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False):
 
     Parameters
     ----------
-    delegate : the class to get methods/properties & doc-strings
-    acccessors : string list of accessors to add
-    typ : 'property' or 'method'
+    delegate : object
+        the class to get methods/properties & doc-strings
+    accessors : Sequence[str]
+        List of accessors to add
+    typ : {'property', 'method'}
     overwrite : boolean, default False
        overwrite the method/property in the target class if it exists
 
     Returns
     -------
-    decorator
+    callable
+        A class decorator.
 
     Examples
     --------
diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
index 920a9f8286f0dd..72527cfa5d12e7 100644
--- a/pandas/core/arrays/sparse.py
+++ b/pandas/core/arrays/sparse.py
@@ -17,6 +17,7 @@
 from pandas.errors import PerformanceWarning
 from pandas.compat.numpy import function as nv
 
+from pandas.core.accessor import PandasDelegate, delegate_names
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
 import pandas.core.common as com
 from pandas.core.dtypes.base import ExtensionDtype
@@ -178,6 +179,7 @@ def _is_boolean(self):
 
     @property
     def kind(self):
+        """The sparse kind: either 'integer' or 'block'."""
         return self.subtype.kind
 
     @property
@@ -648,10 +650,22 @@ def _from_factorized(cls, values, original):
     # ------------------------------------------------------------------------
     @property
     def sp_index(self):
+        """
+        The SparseIndex containing the location of non- ``fill_value`` points.
+        """
         return self._sparse_index
 
     @property
     def sp_values(self):
+        """
+        An ndarray containing the non- ``fill_value`` values.
+
+        Examples
+        --------
+        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
+        >>> s.sp_values
+        array([1, 2])
+        """
         return self._sparse_values
 
     @property
@@ -704,6 +718,31 @@ def _fill_value_matches(self, fill_value):
     def nbytes(self):
         return self.sp_values.nbytes + self.sp_index.nbytes
 
+    @property
+    def density(self):
+        """The fraction of non- ``fill_value`` points, as a decimal.
+
+        Examples
+        --------
+        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
+        >>> s.density
+        0.6
+        """
+        r = float(self.sp_index.npoints) / float(self.sp_index.length)
+        return r
+
+    @property
+    def npoints(self):
+        """The number of non- ``fill_value`` points.
+
+        Examples
+        --------
+        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
+        >>> s.npoints
+        3
+        """
+        return self.sp_index.npoints
+
     @property
     def values(self):
         """
@@ -1744,3 +1783,138 @@ def _make_index(length, indices, kind):
     else:  # pragma: no cover
         raise ValueError('must be block or integer type')
     return index
+
+
+# ----------------------------------------------------------------------------
+# Accessor
+
+@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
+                              'sp_values'],
+                typ='property')
+class SparseAccessor(PandasDelegate):
+    def __init__(self, data=None):
+        self._validate(data)
+        # Store the Series since we need that for to_coo
+        self._parent = data
+
+    @staticmethod
+    def _validate(data):
+        if not isinstance(data.dtype, SparseDtype):
+            msg = "Can only use the '.sparse' accessor with Sparse data."
+            raise AttributeError(msg)
+
+    def _delegate_property_get(self, name, *args, **kwargs):
+        return getattr(self._parent.values, name)
+
+    def _delegate_method(self, name, *args, **kwargs):
+        if name == 'from_coo':
+            return self.from_coo(*args, **kwargs)
+        elif name == 'to_coo':
+            return self.to_coo(*args, **kwargs)
+        else:
+            raise ValueError
+
+    @classmethod
+    def from_coo(cls, A, dense_index=False):
+        """
+        Create a SparseSeries from a scipy.sparse.coo_matrix.
+
+        Parameters
+        ----------
+        A : scipy.sparse.coo_matrix
+        dense_index : bool, default False
+            If False (default), the SparseSeries index consists of only the
+            coords of the non-null entries of the original coo_matrix.
+            If True, the SparseSeries index consists of the full sorted
+            (row, col) coordinates of the coo_matrix.
+
+        Returns
+        -------
+        s : SparseSeries
+
+        Examples
+        --------
+        >>> from scipy import sparse
+        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
+        ...                       shape=(3, 4))
+        >>> A
+        <3x4 sparse matrix of type '<class 'numpy.float64'>'
+                with 3 stored elements in COOrdinate format>
+        >>> A.todense()
+        matrix([[ 0.,  0.,  1.,  2.],
+                [ 3.,  0.,  0.,  0.],
+                [ 0.,  0.,  0.,  0.]])
+        >>> ss = pd.SparseSeries.from_coo(A)
+        >>> ss
+        0  2    1
+           3    2
+        1  0    3
+        dtype: float64
+        BlockIndex
+        Block locations: array([0], dtype=int32)
+        Block lengths: array([3], dtype=int32)
+        """
+        from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
+        from pandas import Series
+
+        result = _coo_to_sparse_series(A, dense_index=dense_index)
+        # SparseSeries -> Series[sparse]
+        result = Series(result.values, index=result.index, copy=False)
+
+        return result
+
+    def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+        """
+        Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
+
+        Use row_levels and column_levels to determine the row and column
+        coordinates respectively. row_levels and column_levels are the names
+        (labels) or numbers of the levels. {row_levels, column_levels} must be
+        a partition of the MultiIndex level names (or numbers).
+
+        Parameters
+        ----------
+        row_levels : tuple/list
+        column_levels : tuple/list
+        sort_labels : bool, default False
+            Sort the row and column labels before forming the sparse matrix.
+
+        Returns
+        -------
+        y : scipy.sparse.coo_matrix
+        rows : list (row labels)
+        columns : list (column labels)
+
+        Examples
+        --------
+        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
+        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
+        ...                                      (1, 2, 'a', 1),
+        ...                                      (1, 1, 'b', 0),
+        ...                                      (1, 1, 'b', 1),
+        ...                                      (2, 1, 'b', 0),
+        ...                                      (2, 1, 'b', 1)],
+        ...                                     names=['A', 'B', 'C', 'D'])
+        >>> ss = s.to_sparse()
+        >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'],
+        ...                              column_levels=['C', 'D'],
+        ...                              sort_labels=True)
+        >>> A
+        <3x4 sparse matrix of type '<class 'numpy.float64'>'
+                with 3 stored elements in COOrdinate format>
+        >>> A.todense()
+        matrix([[ 0.,  0.,  1.,  3.],
+                [ 3.,  0.,  0.,  0.],
+                [ 0.,  0.,  0.,  0.]])
+        >>> rows
+        [(1, 1), (1, 2), (2, 1)]
+        >>> columns
+        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
+        """
+        from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
+
+        A, rows, columns = _sparse_series_to_coo(self._parent,
+                                                 row_levels,
+                                                 column_levels,
+                                                 sort_labels=sort_labels)
+        return A, rows, columns
diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
index 35b9799579628d..c3b94c297652af 100644
--- a/pandas/core/indexes/accessors.py
+++ b/pandas/core/indexes/accessors.py
@@ -1,7 +1,6 @@
 """
 datetimelike delegation
 """
-
 import numpy as np
 
 from pandas.core.dtypes.generic import ABCSeries
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d3ea005d3aae70..d813d8430d9e9b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -26,6 +26,7 @@
 from pandas.core.accessor import CachedAccessor
 from pandas.core.arrays import ExtensionArray, period_array
 from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
+from pandas.core.arrays.sparse import SparseAccessor
 from pandas.core.config import get_option
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
@@ -142,7 +143,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
         Copy input data
     """
     _metadata = ['name']
-    _accessors = {'dt', 'cat', 'str'}
+    _accessors = {'dt', 'cat', 'str', 'sparse'}
     _deprecations = generic.NDFrame._deprecations | frozenset(
         ['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value',
          'from_csv', 'valid'])
@@ -4151,6 +4152,7 @@ def to_period(self, freq=None, copy=True):
     dt = CachedAccessor("dt", CombinedDatetimelikeProperties)
     cat = CachedAccessor("cat", CategoricalAccessor)
     plot = CachedAccessor("plot", gfx.SeriesPlotMethods)
+    sparse = CachedAccessor("sparse", SparseAccessor)
 
     # ----------------------------------------------------------------------
     # Add plotting methods to Series
diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
index 5a747c6e4b1d10..ff32712f9056a3 100644
--- a/pandas/core/sparse/series.py
+++ b/pandas/core/sparse/series.py
@@ -27,6 +27,7 @@
 from pandas.core.arrays import (
     SparseArray,
 )
+from pandas.core.arrays.sparse import SparseAccessor
 from pandas._libs.sparse import BlockIndex, IntIndex
 import pandas._libs.sparse as splib
 
@@ -183,7 +184,7 @@ def sp_values(self):
 
     @property
     def npoints(self):
-        return self.sp_index.npoints
+        return self.values.npoints
 
     @classmethod
     def from_array(cls, arr, index=None, name=None, copy=False,
@@ -452,8 +453,7 @@ def to_dense(self):
 
     @property
     def density(self):
-        r = float(self.sp_index.npoints) / float(self.sp_index.length)
-        return r
+        return self.values.density
 
     def copy(self, deep=True):
         """
@@ -580,99 +580,16 @@ def combine_first(self, other):
         dense_combined = self.to_dense().combine_first(other)
         return dense_combined.to_sparse(fill_value=self.fill_value)
 
+    @Appender(SparseAccessor.to_coo.__doc__)
     def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
-        """
-        Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
-
-        Use row_levels and column_levels to determine the row and column
-        coordinates respectively. row_levels and column_levels are the names
-        (labels) or numbers of the levels. {row_levels, column_levels} must be
-        a partition of the MultiIndex level names (or numbers).
-
-        Parameters
-        ----------
-        row_levels : tuple/list
-        column_levels : tuple/list
-        sort_labels : bool, default False
-            Sort the row and column labels before forming the sparse matrix.
-
-        Returns
-        -------
-        y : scipy.sparse.coo_matrix
-        rows : list (row labels)
-        columns : list (column labels)
-
-        Examples
-        --------
-        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
-        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
-                                                (1, 2, 'a', 1),
-                                                (1, 1, 'b', 0),
-                                                (1, 1, 'b', 1),
-                                                (2, 1, 'b', 0),
-                                                (2, 1, 'b', 1)],
-                                                names=['A', 'B', 'C', 'D'])
-        >>> ss = s.to_sparse()
-        >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'],
-                                         column_levels=['C', 'D'],
-                                         sort_labels=True)
-        >>> A
-        <3x4 sparse matrix of type '<class 'numpy.float64'>'
-                with 3 stored elements in COOrdinate format>
-        >>> A.todense()
-        matrix([[ 0.,  0.,  1.,  3.],
-        [ 3.,  0.,  0.,  0.],
-        [ 0.,  0.,  0.,  0.]])
-        >>> rows
-        [(1, 1), (1, 2), (2, 1)]
-        >>> columns
-        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
-        """
         A, rows, columns = _sparse_series_to_coo(self, row_levels,
                                                  column_levels,
                                                  sort_labels=sort_labels)
         return A, rows, columns
 
     @classmethod
+    @Appender(SparseAccessor.from_coo.__doc__)
     def from_coo(cls, A, dense_index=False):
-        """
-        Create a SparseSeries from a scipy.sparse.coo_matrix.
-
-        Parameters
-        ----------
-        A : scipy.sparse.coo_matrix
-        dense_index : bool, default False
-            If False (default), the SparseSeries index consists of only the
-            coords of the non-null entries of the original coo_matrix.
-            If True, the SparseSeries index consists of the full sorted
-            (row, col) coordinates of the coo_matrix.
-
-        Returns
-        -------
-        s : SparseSeries
-
-        Examples
-        ---------
-        >>> from scipy import sparse
-        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
-                               shape=(3, 4))
-        >>> A
-        <3x4 sparse matrix of type '<class 'numpy.float64'>'
-                with 3 stored elements in COOrdinate format>
-        >>> A.todense()
-        matrix([[ 0.,  0.,  1.,  2.],
-                [ 3.,  0.,  0.,  0.],
-                [ 0.,  0.,  0.,  0.]])
-        >>> ss = pd.SparseSeries.from_coo(A)
-        >>> ss
-        0  2    1
-           3    2
-        1  0    3
-        dtype: float64
-        BlockIndex
-        Block locations: array([0], dtype=int32)
-        Block lengths: array([3], dtype=int32)
-        """
         return _coo_to_sparse_series(A, dense_index=dense_index)
 
 
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index e211b8626b53c0..cc9512c0759fc5 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -996,6 +996,55 @@ def test_asarray_datetime64(self):
         )
         np.asarray(s)
 
+    def test_density(self):
+        arr = SparseArray([0, 1])
+        assert arr.density == 0.5
+
+    def test_npoints(self):
+        arr = SparseArray([0, 1])
+        assert arr.npoints == 1
+
+
+class TestAccessor(object):
+
+    @pytest.mark.parametrize('attr', [
+        'npoints', 'density', 'fill_value', 'sp_values',
+    ])
+    def test_get_attributes(self, attr):
+        arr = SparseArray([0, 1])
+        ser = pd.Series(arr)
+
+        result = getattr(ser.sparse, attr)
+        expected = getattr(arr, attr)
+        assert result == expected
+
+    def test_from_coo(self):
+        sparse = pytest.importorskip("scipy.sparse")
+
+        row = [0, 3, 1, 0]
+        col = [0, 3, 1, 2]
+        data = [4, 5, 7, 9]
+        sp_array = sparse.coo_matrix(data, (row, col))
+        result = pd.Series.sparse.from_coo(sp_array)
+
+        index = pd.MultiIndex.from_product([[0], [0, 1, 2, 3]])
+        expected = pd.Series(data, index=index, dtype='Sparse[int]')
+        tm.assert_series_equal(result, expected)
+
+    def test_to_coo(self):
+        sparse = pytest.importorskip("scipy.sparse")
+        ser = pd.Series([1, 2, 3],
+                        index=pd.MultiIndex.from_product([[0], [1, 2, 3]],
+                                                         names=['a', 'b']),
+                        dtype='Sparse[int]')
+        A, _, _ = ser.sparse.to_coo()
+        assert isinstance(A, sparse.coo.coo_matrix)
+
+    def test_non_sparse_raises(self):
+        ser = pd.Series([1, 2, 3])
+        with tm.assert_raises_regex(AttributeError, '.sparse'):
+            ser.sparse.density
+
 
 def test_setting_fill_value_fillna_still_works():
     # This is why letting users update fill_value / dtype is bad