API: Add sparse Acessor (pandas-dev#23183)

tm9k1 · Nov 19, 2018 · 44b743d · 44b743d
1 parent eb02b72
commit 44b743d
Show file tree

Hide file tree

Showing 9 changed files with 281 additions and 95 deletions.
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -851,6 +851,22 @@ Sparse
    SparseSeries.to_coo
    SparseSeries.from_coo
 
+.. autosummary::
+   :toctree: generated/
+   :template: autosummary/accessor_attribute.rst
+
+   Series.sparse.npoints
+   Series.sparse.density
+   Series.sparse.fill_value
+   Series.sparse.sp_values
+
+
+.. autosummary::
+   :toctree: generated/
+
+   Series.sparse.from_coo
+   Series.sparse.to_coo
+
 .. _api.dataframe:
 
 DataFrame

diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst
@@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling
 
    sts.to_dense()
 
+.. _sparse.accessor:
+
+Sparse Accessor
+---------------
+
+.. versionadded:: 0.24.0
+
+Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat``
+for categorical data, and ``.dt`` for datetime-like data. This namespace provides
+attributes and methods that are specific to sparse data.
+
+.. ipython:: python
+
+   s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
+   s.sparse.density
+   s.sparse.fill_value
+
+This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
+class itself for creating a Series with sparse data from a scipy COO matrix with.
+
 .. _sparse.array:
 
 SparseArray

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -533,14 +533,20 @@ changes were made:
 - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
 - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.
 
-
 Some new warnings are issued for operations that require or are likely to materialize a large dense array:
 
 - A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array.
 - A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used.
 
 In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made <whatsnew_0240.bug_fixes.sparse>`.
 
+Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`.
+
+.. ipython:: python
+
+   s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
+   s.sparse.density
+
 .. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:
 
 Raise ValueError in ``DataFrame.to_dict(orient='index')``

diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
@@ -113,15 +113,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False):
 
     Parameters
     ----------
-    delegate : the class to get methods/properties & doc-strings
-    acccessors : string list of accessors to add
-    typ : 'property' or 'method'
+    delegate : object
+        the class to get methods/properties & doc-strings
+    acccessors : Sequence[str]
+        List of accessor to add
+    typ : {'property', 'method'}
     overwrite : boolean, default False
        overwrite the method/property in the target class if it exists
 
     Returns
     -------
-    decorator
+    callable
+        A class decorator.
 
     Examples
     --------

diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py
@@ -17,6 +17,7 @@
 from pandas.errors import PerformanceWarning
 from pandas.compat.numpy import function as nv
 
+from pandas.core.accessor import PandasDelegate, delegate_names
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
 import pandas.core.common as com
 from pandas.core.dtypes.base import ExtensionDtype
@@ -178,6 +179,7 @@ def _is_boolean(self):
 
     @property
     def kind(self):
+        """The sparse kind. Either 'integer', or 'block'."""
         return self.subtype.kind
 
     @property
@@ -648,10 +650,22 @@ def _from_factorized(cls, values, original):
     # ------------------------------------------------------------------------
     @property
     def sp_index(self):
+        """
+        The SparseIndex containing the location of non- ``fill_value`` points.
+        """
         return self._sparse_index
 
     @property
     def sp_values(self):
+        """
+        An ndarray containing the non- ``fill_value`` values.
+
+        Examples
+        --------
+        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
+        >>> s.sp_values
+        array([1, 2])
+        """
         return self._sparse_values
 
     @property
@@ -704,6 +718,31 @@ def _fill_value_matches(self, fill_value):
     def nbytes(self):
         return self.sp_values.nbytes + self.sp_index.nbytes
 
+    @property
+    def density(self):
+        """The percent of non- ``fill_value`` points, as decimal.
+
+        Examples
+        --------
+        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
+        >>> s.density
+        0.6
+        """
+        r = float(self.sp_index.npoints) / float(self.sp_index.length)
+        return r
+
+    @property
+    def npoints(self):
+        """The number of non- ``fill_value`` points.
+
+        Examples
+        --------
+        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
+        >>> s.npoints
+        3
+        """
+        return self.sp_index.npoints
+
     @property
     def values(self):
         """
@@ -1744,3 +1783,138 @@ def _make_index(length, indices, kind):
     else:  # pragma: no cover
         raise ValueError('must be block or integer type')
     return index
+
+
+# ----------------------------------------------------------------------------
+# Accessor
+
+@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
+                              'sp_values'],
+                typ='property')
+class SparseAccessor(PandasDelegate):
+    def __init__(self, data=None):
+        self._validate(data)
+        # Store the Series since we need that for to_coo
+        self._parent = data
+
+    @staticmethod
+    def _validate(data):
+        if not isinstance(data.dtype, SparseDtype):
+            msg = "Can only use the '.sparse' accessor with Sparse data."
+            raise AttributeError(msg)
+
+    def _delegate_property_get(self, name, *args, **kwargs):
+        return getattr(self._parent.values, name)
+
+    def _delegate_method(self, name, *args, **kwargs):
+        if name == 'from_coo':
+            return self.from_coo(*args, **kwargs)
+        elif name == 'to_coo':
+            return self.to_coo(*args, **kwargs)
+        else:
+            raise ValueError
+
+    @classmethod
+    def from_coo(cls, A, dense_index=False):
+        """
+        Create a SparseSeries from a scipy.sparse.coo_matrix.
+
+        Parameters
+        ----------
+        A : scipy.sparse.coo_matrix
+        dense_index : bool, default False
+            If False (default), the SparseSeries index consists of only the
+            coords of the non-null entries of the original coo_matrix.
+            If True, the SparseSeries index consists of the full sorted
+            (row, col) coordinates of the coo_matrix.
+
+        Returns
+        -------
+        s : SparseSeries
+
+        Examples
+        ---------
+        >>> from scipy import sparse
+        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
+                               shape=(3, 4))
+        >>> A
+        <3x4 sparse matrix of type '<class 'numpy.float64'>'
+                with 3 stored elements in COOrdinate format>
+        >>> A.todense()
+        matrix([[ 0.,  0.,  1.,  2.],
+                [ 3.,  0.,  0.,  0.],
+                [ 0.,  0.,  0.,  0.]])
+        >>> ss = pd.SparseSeries.from_coo(A)
+        >>> ss
+        0  2    1
+           3    2
+        1  0    3
+        dtype: float64
+        BlockIndex
+        Block locations: array([0], dtype=int32)
+        Block lengths: array([3], dtype=int32)
+        """
+        from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
+        from pandas import Series
+
+        result = _coo_to_sparse_series(A, dense_index=dense_index)
+        # SparseSeries -> Series[sparse]
+        result = Series(result.values, index=result.index, copy=False)
+
+        return result
+
+    def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
+        """
+        Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex.
+
+        Use row_levels and column_levels to determine the row and column
+        coordinates respectively. row_levels and column_levels are the names
+        (labels) or numbers of the levels. {row_levels, column_levels} must be
+        a partition of the MultiIndex level names (or numbers).
+
+        Parameters
+        ----------
+        row_levels : tuple/list
+        column_levels : tuple/list
+        sort_labels : bool, default False
+            Sort the row and column labels before forming the sparse matrix.
+
+        Returns
+        -------
+        y : scipy.sparse.coo_matrix
+        rows : list (row labels)
+        columns : list (column labels)
+
+        Examples
+        --------
+        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
+        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
+                                                (1, 2, 'a', 1),
+                                                (1, 1, 'b', 0),
+                                                (1, 1, 'b', 1),
+                                                (2, 1, 'b', 0),
+                                                (2, 1, 'b', 1)],
+                                                names=['A', 'B', 'C', 'D'])
+        >>> ss = s.to_sparse()
+        >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'],
+                                         column_levels=['C', 'D'],
+                                         sort_labels=True)
+        >>> A
+        <3x4 sparse matrix of type '<class 'numpy.float64'>'
+                with 3 stored elements in COOrdinate format>
+        >>> A.todense()
+        matrix([[ 0.,  0.,  1.,  3.],
+        [ 3.,  0.,  0.,  0.],
+        [ 0.,  0.,  0.,  0.]])
+        >>> rows
+        [(1, 1), (1, 2), (2, 1)]
+        >>> columns
+        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
+        """
+        from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo
+
+        A, rows, columns = _sparse_series_to_coo(self._parent,
+                                                 row_levels,
+                                                 column_levels,
+                                                 sort_labels=sort_labels)
+        return A, rows, columns
diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
@@ -1,7 +1,6 @@
 """
 datetimelike delegation
 """
-
 import numpy as np
 
 from pandas.core.dtypes.generic import ABCSeries

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -26,6 +26,7 @@
 from pandas.core.accessor import CachedAccessor
 from pandas.core.arrays import ExtensionArray, period_array
 from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
+from pandas.core.arrays.sparse import SparseAccessor
 from pandas.core.config import get_option
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
@@ -142,7 +143,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
         Copy input data
     """
     _metadata = ['name']
-    _accessors = {'dt', 'cat', 'str'}
+    _accessors = {'dt', 'cat', 'str', 'sparse'}
     _deprecations = generic.NDFrame._deprecations | frozenset(
         ['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value',
          'from_csv', 'valid'])
@@ -4151,6 +4152,7 @@ def to_period(self, freq=None, copy=True):
     dt = CachedAccessor("dt", CombinedDatetimelikeProperties)
     cat = CachedAccessor("cat", CategoricalAccessor)
     plot = CachedAccessor("plot", gfx.SeriesPlotMethods)
+    sparse = CachedAccessor("sparse", SparseAccessor)
 
     # ----------------------------------------------------------------------
     # Add plotting methods to Series