Skip to content

Commit

Permalink
API: Add sparse Accessor (pandas-dev#23183)
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger authored and tm9k1 committed Nov 19, 2018
1 parent eb02b72 commit 44b743d
Show file tree
Hide file tree
Showing 9 changed files with 281 additions and 95 deletions.
16 changes: 16 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,22 @@ Sparse
SparseSeries.to_coo
SparseSeries.from_coo

.. autosummary::
:toctree: generated/
:template: autosummary/accessor_attribute.rst

Series.sparse.npoints
Series.sparse.density
Series.sparse.fill_value
Series.sparse.sp_values


.. autosummary::
:toctree: generated/

Series.sparse.from_coo
Series.sparse.to_coo

.. _api.dataframe:

DataFrame
Expand Down
20 changes: 20 additions & 0 deletions doc/source/sparse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling
sts.to_dense()
.. _sparse.accessor:

Sparse Accessor
---------------

.. versionadded:: 0.24.0

Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat``
for categorical data, and ``.dt`` for datetime-like data. This namespace provides
attributes and methods that are specific to sparse data.

.. ipython:: python

   s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]")
   s.sparse.density
   s.sparse.fill_value

This accessor is available only on data with ``SparseDtype``, and on the :class:`Series`
class itself for creating a Series with sparse data from a scipy COO matrix.

.. _sparse.array:

SparseArray
Expand Down
8 changes: 7 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -533,14 +533,20 @@ changes were made:
- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed.


Some new warnings are issued for operations that require or are likely to materialize a large dense array:

- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array.
- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used.

In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made <whatsnew_0240.bug_fixes.sparse>`.

Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`.

.. ipython:: python

s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]')
s.sparse.density

.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient:

Raise ValueError in ``DataFrame.to_dict(orient='index')``
Expand Down
11 changes: 7 additions & 4 deletions pandas/core/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False):
Parameters
----------
delegate : the class to get methods/properties & doc-strings
acccessors : string list of accessors to add
typ : 'property' or 'method'
delegate : object
the class to get methods/properties & doc-strings
accessors : Sequence[str]
    List of accessors to add
typ : {'property', 'method'}
overwrite : boolean, default False
overwrite the method/property in the target class if it exists
Returns
-------
decorator
callable
A class decorator.
Examples
--------
Expand Down
174 changes: 174 additions & 0 deletions pandas/core/arrays/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from pandas.errors import PerformanceWarning
from pandas.compat.numpy import function as nv

from pandas.core.accessor import PandasDelegate, delegate_names
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.dtypes.base import ExtensionDtype
Expand Down Expand Up @@ -178,6 +179,7 @@ def _is_boolean(self):

@property
def kind(self):
    """
    Return the ``kind`` attribute of this dtype's ``subtype``.

    NOTE(review): this was previously documented as "either 'integer' or
    'block'"; confirm that ``subtype.kind`` really yields those values
    (and not, e.g., a NumPy dtype kind code) before relying on that.
    """
    subtype = self.subtype
    return subtype.kind

@property
Expand Down Expand Up @@ -648,10 +650,22 @@ def _from_factorized(cls, values, original):
# ------------------------------------------------------------------------
@property
def sp_index(self):
    """
    The SparseIndex recording where the non- ``fill_value`` points are.

    This is a read-only view of the ``_sparse_index`` attribute.
    """
    return self._sparse_index

@property
def sp_values(self):
    """
    An ndarray holding only the non- ``fill_value`` values.

    This is a read-only view of the ``_sparse_values`` attribute.

    Examples
    --------
    >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
    >>> s.sp_values
    array([1, 2])
    """
    return self._sparse_values

@property
Expand Down Expand Up @@ -704,6 +718,31 @@ def _fill_value_matches(self, fill_value):
def nbytes(self):
return self.sp_values.nbytes + self.sp_index.nbytes

@property
def density(self):
    """
    The share of points that are not ``fill_value``, as a decimal fraction.

    Computed as ``sp_index.npoints / sp_index.length`` using float
    division, so the result is a ratio (e.g. ``0.6``), not a value
    multiplied by 100.

    Examples
    --------
    >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
    >>> s.density
    0.6
    """
    index = self.sp_index
    # Explicit float() on both operands forces true division.
    return float(index.npoints) / float(index.length)

@property
def npoints(self):
    """
    The count of non- ``fill_value`` points.

    Read directly from ``sp_index.npoints``.

    Examples
    --------
    >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
    >>> s.npoints
    3
    """
    index = self.sp_index
    return index.npoints

@property
def values(self):
"""
Expand Down Expand Up @@ -1744,3 +1783,138 @@ def _make_index(length, indices, kind):
else: # pragma: no cover
raise ValueError('must be block or integer type')
return index


# ----------------------------------------------------------------------------
# Accessor

@delegate_names(SparseArray, ['npoints', 'density', 'fill_value',
                              'sp_values'],
                typ='property')
class SparseAccessor(PandasDelegate):
    """
    Accessor for sparse-specific attributes and methods of a Series.

    The properties ``npoints``, ``density``, ``fill_value`` and
    ``sp_values`` are delegated to the Series' underlying values; the
    ``from_coo`` / ``to_coo`` methods convert from / to
    ``scipy.sparse.coo_matrix``.

    Parameters
    ----------
    data : Series
        Must have a ``SparseDtype``; otherwise ``AttributeError`` is raised.
    """

    def __init__(self, data=None):
        self._validate(data)
        # Store the Series since we need that for to_coo
        self._parent = data

    @staticmethod
    def _validate(data):
        """Raise ``AttributeError`` unless ``data.dtype`` is a SparseDtype."""
        if not isinstance(data.dtype, SparseDtype):
            msg = "Can only use the '.sparse' accessor with Sparse data."
            raise AttributeError(msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        """Look up the delegated property ``name`` on the Series' values."""
        return getattr(self._parent.values, name)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Raises
        ------
        ValueError
            If ``name`` is neither ``'from_coo'`` nor ``'to_coo'``.
        """
        if name == 'from_coo':
            return self.from_coo(*args, **kwargs)
        elif name == 'to_coo':
            return self.to_coo(*args, **kwargs)
        else:
            # Same exception type as before, but say what went wrong.
            raise ValueError(
                "'{name}' is not a method supported by the '.sparse' "
                "accessor; expected 'from_coo' or 'to_coo'.".format(name=name)
            )

    @classmethod
    def from_coo(cls, A, dense_index=False):
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse
        >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])),
        ...                       shape=(3, 4))
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
                with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0.,  0.,  1.,  2.],
                [ 3.,  0.,  0.,  0.],
                [ 0.,  0.,  0.,  0.]])
        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series
        from pandas import Series

        result = _coo_to_sparse_series(A, dense_index=dense_index)
        # SparseSeries -> Series[sparse]
        result = Series(result.values, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0),
        ...                                      (1, 2, 'a', 1),
        ...                                      (1, 1, 'b', 0),
        ...                                      (1, 1, 'b', 1),
        ...                                      (2, 1, 'b', 0),
        ...                                      (2, 1, 'b', 1)],
        ...                                     names=['A', 'B', 'C', 'D'])
        >>> ss = s.astype("Sparse[float64]")
        >>> A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'],
        ...                                     column_levels=['C', 'D'],
        ...                                     sort_labels=True)
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
                with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[ 0.,  0.,  1.,  3.],
                [ 3.,  0.,  0.,  0.],
                [ 0.,  0.,  0.,  0.]])
        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo

        A, rows, columns = _sparse_series_to_coo(self._parent,
                                                 row_levels,
                                                 column_levels,
                                                 sort_labels=sort_labels)
        return A, rows, columns
1 change: 0 additions & 1 deletion pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
datetimelike delegation
"""

import numpy as np

from pandas.core.dtypes.generic import ABCSeries
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import ExtensionArray, period_array
from pandas.core.arrays.categorical import Categorical, CategoricalAccessor
from pandas.core.arrays.sparse import SparseAccessor
from pandas.core.config import get_option
from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na,
Expand Down Expand Up @@ -142,7 +143,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
Copy input data
"""
_metadata = ['name']
_accessors = {'dt', 'cat', 'str'}
_accessors = {'dt', 'cat', 'str', 'sparse'}
_deprecations = generic.NDFrame._deprecations | frozenset(
['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value',
'from_csv', 'valid'])
Expand Down Expand Up @@ -4151,6 +4152,7 @@ def to_period(self, freq=None, copy=True):
dt = CachedAccessor("dt", CombinedDatetimelikeProperties)
cat = CachedAccessor("cat", CategoricalAccessor)
plot = CachedAccessor("plot", gfx.SeriesPlotMethods)
sparse = CachedAccessor("sparse", SparseAccessor)

# ----------------------------------------------------------------------
# Add plotting methods to Series
Expand Down
Loading

0 comments on commit 44b743d

Please sign in to comment.