From 44b743d0bc1dfbc383d366d515ca810580d75ac5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 25 Oct 2018 20:28:55 -0500 Subject: [PATCH] API: Add sparse Accessor (#23183) --- doc/source/api.rst | 16 +++ doc/source/sparse.rst | 20 +++ doc/source/whatsnew/v0.24.0.txt | 8 +- pandas/core/accessor.py | 11 +- pandas/core/arrays/sparse.py | 174 +++++++++++++++++++++++ pandas/core/indexes/accessors.py | 1 - pandas/core/series.py | 4 +- pandas/core/sparse/series.py | 93 +----------- pandas/tests/arrays/sparse/test_array.py | 49 +++++++ 9 files changed, 281 insertions(+), 95 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 1ec2a56dcd0941..6e8eb83577c466 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -851,6 +851,22 @@ Sparse SparseSeries.to_coo SparseSeries.from_coo +.. autosummary:: + :toctree: generated/ + :template: autosummary/accessor_attribute.rst + + Series.sparse.npoints + Series.sparse.density + Series.sparse.fill_value + Series.sparse.sp_values + + +.. autosummary:: + :toctree: generated/ + + Series.sparse.from_coo + Series.sparse.to_coo + .. _api.dataframe: DataFrame diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 2bb99dd1822b68..884512981e1c90 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -62,6 +62,26 @@ Any sparse object can be converted back to the standard dense form by calling sts.to_dense() +.. _sparse.accessor: + +Sparse Accessor +--------------- + +.. versionadded:: 0.24.0 + +Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` +for categorical data, and ``.dt`` for datetime-like data. This namespace provides +attributes and methods that are specific to sparse data. + +.. 
ipython:: python + + s = pd.Series([0, 0, 1, 2], dtype="Sparse[int]") + s.sparse.density + s.sparse.fill_value + +This accessor is available only on data with ``SparseDtype``, and on the :class:`Series` +class itself for creating a Series with sparse data from a scipy COO matrix. + .. _sparse.array: SparseArray diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index fd34fef886a16e..6b074611ef3e96 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -533,7 +533,6 @@ changes were made: - ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. - Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - Some new warnings are issued for operations that require or are likely to materialize a large dense array: - A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. @@ -541,6 +540,13 @@ Some new warnings are issued for operations that require or are likely to materi In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. +Finally, a ``Series.sparse`` accessor was added to provide sparse-specific methods like :meth:`Series.sparse.from_coo`. + +.. ipython:: python + + s = pd.Series([0, 0, 1, 1, 1], dtype='Sparse[int]') + s.sparse.density + .. 
_whatsnew_0240.api_breaking.frame_to_dict_index_orient: Raise ValueError in ``DataFrame.to_dict(orient='index')`` diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index eab529584d1fb5..bc91372e3ac7d4 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -113,15 +113,18 @@ def delegate_names(delegate, accessors, typ, overwrite=False): Parameters ---------- - delegate : the class to get methods/properties & doc-strings - acccessors : string list of accessors to add - typ : 'property' or 'method' + delegate : object + the class to get methods/properties & doc-strings + acccessors : Sequence[str] + List of accessor to add + typ : {'property', 'method'} overwrite : boolean, default False overwrite the method/property in the target class if it exists Returns ------- - decorator + callable + A class decorator. Examples -------- diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 920a9f8286f0dd..72527cfa5d12e7 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -17,6 +17,7 @@ from pandas.errors import PerformanceWarning from pandas.compat.numpy import function as nv +from pandas.core.accessor import PandasDelegate, delegate_names from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com from pandas.core.dtypes.base import ExtensionDtype @@ -178,6 +179,7 @@ def _is_boolean(self): @property def kind(self): + """The sparse kind. Either 'integer', or 'block'.""" return self.subtype.kind @property @@ -648,10 +650,22 @@ def _from_factorized(cls, values, original): # ------------------------------------------------------------------------ @property def sp_index(self): + """ + The SparseIndex containing the location of non- ``fill_value`` points. + """ return self._sparse_index @property def sp_values(self): + """ + An ndarray containing the non- ``fill_value`` values. 
+ + Examples + -------- + >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0) + >>> s.sp_values + array([1, 2]) + """ return self._sparse_values @property @@ -704,6 +718,31 @@ def _fill_value_matches(self, fill_value): def nbytes(self): return self.sp_values.nbytes + self.sp_index.nbytes + @property + def density(self): + """The percent of non- ``fill_value`` points, as decimal. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.density + 0.6 + """ + r = float(self.sp_index.npoints) / float(self.sp_index.length) + return r + + @property + def npoints(self): + """The number of non- ``fill_value`` points. + + Examples + -------- + >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0) + >>> s.npoints + 3 + """ + return self.sp_index.npoints + @property def values(self): """ @@ -1744,3 +1783,138 @@ def _make_index(length, indices, kind): else: # pragma: no cover raise ValueError('must be block or integer type') return index + + +# ---------------------------------------------------------------------------- +# Accessor + +@delegate_names(SparseArray, ['npoints', 'density', 'fill_value', + 'sp_values'], + typ='property') +class SparseAccessor(PandasDelegate): + def __init__(self, data=None): + self._validate(data) + # Store the Series since we need that for to_coo + self._parent = data + + @staticmethod + def _validate(data): + if not isinstance(data.dtype, SparseDtype): + msg = "Can only use the '.sparse' accessor with Sparse data." + raise AttributeError(msg) + + def _delegate_property_get(self, name, *args, **kwargs): + return getattr(self._parent.values, name) + + def _delegate_method(self, name, *args, **kwargs): + if name == 'from_coo': + return self.from_coo(*args, **kwargs) + elif name == 'to_coo': + return self.to_coo(*args, **kwargs) + else: + raise ValueError + + @classmethod + def from_coo(cls, A, dense_index=False): + """ + Create a SparseSeries from a scipy.sparse.coo_matrix. 
+ + Parameters + ---------- + A : scipy.sparse.coo_matrix + dense_index : bool, default False + If False (default), the SparseSeries index consists of only the + coords of the non-null entries of the original coo_matrix. + If True, the SparseSeries index consists of the full sorted + (row, col) coordinates of the coo_matrix. + + Returns + ------- + s : SparseSeries + + Examples + --------- + >>> from scipy import sparse + >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), + shape=(3, 4)) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 2.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> ss = pd.SparseSeries.from_coo(A) + >>> ss + 0 2 1 + 3 2 + 1 0 3 + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([3], dtype=int32) + """ + from pandas.core.sparse.scipy_sparse import _coo_to_sparse_series + from pandas import Series + + result = _coo_to_sparse_series(A, dense_index=dense_index) + # SparseSeries -> Series[sparse] + result = Series(result.values, index=result.index, copy=False) + + return result + + def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): + """ + Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. + + Use row_levels and column_levels to determine the row and column + coordinates respectively. row_levels and column_levels are the names + (labels) or numbers of the levels. {row_levels, column_levels} must be + a partition of the MultiIndex level names (or numbers). + + Parameters + ---------- + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. 
+ + Returns + ------- + y : scipy.sparse.coo_matrix + rows : list (row labels) + columns : list (column labels) + + Examples + -------- + >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) + >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), + (1, 2, 'a', 1), + (1, 1, 'b', 0), + (1, 1, 'b', 1), + (2, 1, 'b', 0), + (2, 1, 'b', 1)], + names=['A', 'B', 'C', 'D']) + >>> ss = s.to_sparse() + >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], + column_levels=['C', 'D'], + sort_labels=True) + >>> A + <3x4 sparse matrix of type '' + with 3 stored elements in COOrdinate format> + >>> A.todense() + matrix([[ 0., 0., 1., 3.], + [ 3., 0., 0., 0.], + [ 0., 0., 0., 0.]]) + >>> rows + [(1, 1), (1, 2), (2, 1)] + >>> columns + [('a', 0), ('a', 1), ('b', 0), ('b', 1)] + """ + from pandas.core.sparse.scipy_sparse import _sparse_series_to_coo + + A, rows, columns = _sparse_series_to_coo(self._parent, + row_levels, + column_levels, + sort_labels=sort_labels) + return A, rows, columns diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 35b9799579628d..c3b94c297652af 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -1,7 +1,6 @@ """ datetimelike delegation """ - import numpy as np from pandas.core.dtypes.generic import ABCSeries diff --git a/pandas/core/series.py b/pandas/core/series.py index d3ea005d3aae70..d813d8430d9e9b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -26,6 +26,7 @@ from pandas.core.accessor import CachedAccessor from pandas.core.arrays import ExtensionArray, period_array from pandas.core.arrays.categorical import Categorical, CategoricalAccessor +from pandas.core.arrays.sparse import SparseAccessor from pandas.core.config import get_option from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, @@ -142,7 +143,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Copy input data """ _metadata = ['name'] - 
_accessors = {'dt', 'cat', 'str'} + _accessors = {'dt', 'cat', 'str', 'sparse'} _deprecations = generic.NDFrame._deprecations | frozenset( ['asobject', 'sortlevel', 'reshape', 'get_value', 'set_value', 'from_csv', 'valid']) @@ -4151,6 +4152,7 @@ def to_period(self, freq=None, copy=True): dt = CachedAccessor("dt", CombinedDatetimelikeProperties) cat = CachedAccessor("cat", CategoricalAccessor) plot = CachedAccessor("plot", gfx.SeriesPlotMethods) + sparse = CachedAccessor("sparse", SparseAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 5a747c6e4b1d10..ff32712f9056a3 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -27,6 +27,7 @@ from pandas.core.arrays import ( SparseArray, ) +from pandas.core.arrays.sparse import SparseAccessor from pandas._libs.sparse import BlockIndex, IntIndex import pandas._libs.sparse as splib @@ -183,7 +184,7 @@ def sp_values(self): @property def npoints(self): - return self.sp_index.npoints + return self.values.npoints @classmethod def from_array(cls, arr, index=None, name=None, copy=False, @@ -452,8 +453,7 @@ def to_dense(self): @property def density(self): - r = float(self.sp_index.npoints) / float(self.sp_index.length) - return r + return self.values.density def copy(self, deep=True): """ @@ -580,99 +580,16 @@ def combine_first(self, other): dense_combined = self.to_dense().combine_first(other) return dense_combined.to_sparse(fill_value=self.fill_value) + @Appender(SparseAccessor.to_coo.__doc__) def to_coo(self, row_levels=(0, ), column_levels=(1, ), sort_labels=False): - """ - Create a scipy.sparse.coo_matrix from a SparseSeries with MultiIndex. - - Use row_levels and column_levels to determine the row and column - coordinates respectively. row_levels and column_levels are the names - (labels) or numbers of the levels. 
{row_levels, column_levels} must be - a partition of the MultiIndex level names (or numbers). - - Parameters - ---------- - row_levels : tuple/list - column_levels : tuple/list - sort_labels : bool, default False - Sort the row and column labels before forming the sparse matrix. - - Returns - ------- - y : scipy.sparse.coo_matrix - rows : list (row labels) - columns : list (column labels) - - Examples - -------- - >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - >>> s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) - >>> ss = s.to_sparse() - >>> A, rows, columns = ss.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 3.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> rows - [(1, 1), (1, 2), (2, 1)] - >>> columns - [('a', 0), ('a', 1), ('b', 0), ('b', 1)] - """ A, rows, columns = _sparse_series_to_coo(self, row_levels, column_levels, sort_labels=sort_labels) return A, rows, columns @classmethod + @Appender(SparseAccessor.from_coo.__doc__) def from_coo(cls, A, dense_index=False): - """ - Create a SparseSeries from a scipy.sparse.coo_matrix. - - Parameters - ---------- - A : scipy.sparse.coo_matrix - dense_index : bool, default False - If False (default), the SparseSeries index consists of only the - coords of the non-null entries of the original coo_matrix. - If True, the SparseSeries index consists of the full sorted - (row, col) coordinates of the coo_matrix. 
- - Returns - ------- - s : SparseSeries - - Examples - --------- - >>> from scipy import sparse - >>> A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) - >>> A - <3x4 sparse matrix of type '' - with 3 stored elements in COOrdinate format> - >>> A.todense() - matrix([[ 0., 0., 1., 2.], - [ 3., 0., 0., 0.], - [ 0., 0., 0., 0.]]) - >>> ss = pd.SparseSeries.from_coo(A) - >>> ss - 0 2 1 - 3 2 - 1 0 3 - dtype: float64 - BlockIndex - Block locations: array([0], dtype=int32) - Block lengths: array([3], dtype=int32) - """ return _coo_to_sparse_series(A, dense_index=dense_index) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index e211b8626b53c0..cc9512c0759fc5 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -996,6 +996,55 @@ def test_asarray_datetime64(self): ) np.asarray(s) + def test_density(self): + arr = SparseArray([0, 1]) + assert arr.density == 0.5 + + def test_npoints(self): + arr = SparseArray([0, 1]) + assert arr.npoints == 1 + + +class TestAccessor(object): + + @pytest.mark.parametrize('attr', [ + 'npoints', 'density', 'fill_value', 'sp_values', + ]) + def test_get_attributes(self, attr): + arr = SparseArray([0, 1]) + ser = pd.Series(arr) + + result = getattr(ser.sparse, attr) + expected = getattr(arr, attr) + assert result == expected + + def test_from_coo(self): + sparse = pytest.importorskip("scipy.sparse") + + row = [0, 3, 1, 0] + col = [0, 3, 1, 2] + data = [4, 5, 7, 9] + sp_array = sparse.coo_matrix(data, (row, col)) + result = pd.Series.sparse.from_coo(sp_array) + + index = pd.MultiIndex.from_product([[0], [0, 1, 2, 3]]) + expected = pd.Series(data, index=index, dtype='Sparse[int]') + tm.assert_series_equal(result, expected) + + def test_to_coo(self): + sparse = pytest.importorskip("scipy.sparse") + ser = pd.Series([1, 2, 3], + index=pd.MultiIndex.from_product([[0], [1, 2, 3]], + names=['a', 'b']), + dtype='Sparse[int]') 
+ A, _, _ = ser.sparse.to_coo() + assert isinstance(A, sparse.coo.coo_matrix) + + def test_non_sparse_raises(self): + ser = pd.Series([1, 2, 3]) + with tm.assert_raises_regex(AttributeError, '.sparse'): + ser.sparse.density + def test_setting_fill_value_fillna_still_works(): # This is why letting users update fill_value / dtype is bad