diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 7322bd9fe3327..7c15a64765e3d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -282,6 +282,7 @@ Other Enhancements - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) +- :func:`pandas.merge` now supports merging of :class:`SparseDataFrame` with both :class:`SparseDataFrame` and :class:`DataFrame` (:issue:`13665`) .. _whatsnew_0230.api_breaking: @@ -654,6 +655,7 @@ Sparse - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`) - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`) - Bug in :class:`SparseSeries.memory_usage` which caused segfault by accessing non sparse elements (:issue:`19368`) +- Bug in :meth:`SparseSeries.__array__` which returned only the non-fill values (:issue:`13665`) Reshaping ^^^^^^^^^ @@ -690,3 +692,4 @@ Other ^^^^^ - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`) +- Improved ``algorithms.take_1d`` handling of ``SparseArray`` (:issue:`19506`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c754c063fce8e..9224d171ae3a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1315,6 +1315,14 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, undefined if allow_fill == False and -1 is present in indexer. 
""" + if is_sparse(arr): + return arr.take(indexer, axis=axis, allow_fill=allow_fill, + fill_value=fill_value) + + # return take_nd(arr.get_values(), indexer, axis=axis, out=out, + # fill_value=fill_value, mask_info=mask_info, + # allow_fill=allow_fill) + # dispatch to internal type takes if is_categorical(arr): return arr.take_nd(indexer, fill_value=fill_value, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1e1bb0d49b3df..97aaa13fa5c69 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -14,7 +14,7 @@ from pandas.core.accessor import CachedAccessor from pandas.core.dtypes.generic import ( - ABCSeries, ABCDataFrame, + ABCSeries, ABCDataFrame, ABCSparseArray, ABCMultiIndex, ABCPeriodIndex, ABCDateOffset) @@ -618,6 +618,9 @@ def where(self, cond, other=None): if other is None: other = self._na_value + if isinstance(other, ABCSparseArray): + other = other.values + dtype = self.dtype values = self.values diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f553e1a02c9d6..8a7bc75f4c6e3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3089,6 +3089,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, # GH#19265 pyarrow is passing this warnings.warn("fastpath argument is deprecated, will be removed " "in a future release.", DeprecationWarning) + if klass is None: dtype = dtype or values.dtype klass = get_block_type(values, dtype) @@ -5304,6 +5305,22 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): elif is_uniform_join_units(join_units): b = join_units[0].block.concat_same_type( [ju.block for ju in join_units], placement=placement) + elif is_sparse_join_units(join_units): + values = concatenate_join_units(join_units, concat_axis, copy=copy) + + if len(values.shape) == 2: + values = values[0] + else: + assert len(values.shape) == 1 + + block = join_units[0].block + + if block: + fill_value = block.fill_value + else: + fill_value = 
np.nan + array = SparseArray(values, fill_value=fill_value) + b = make_block(array, klass=SparseBlock, placement=placement) else: b = make_block( concatenate_join_units(join_units, concat_axis, copy=copy), @@ -5313,6 +5330,18 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): return BlockManager(blocks, axes) +def is_sparse_join_units(join_units): + """ + Check if all of the join units are sparse. This leads to building + SparseArray over dense array representations so that we can merge + SparseSeries / SparseDataFrame + + This is very similar to how pandas.concat works for concatenating two + SparseDataFrame / SparseSeries + """ + return all(type(ju.block) is SparseBlock for ju in join_units) + + def is_uniform_join_units(join_units): """ Check if the join units consist of blocks of uniform type that can @@ -5686,7 +5715,10 @@ def is_na(self): def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: # No upcasting is necessary - fill_value = self.block.fill_value + + # You would think that you want self.block.fill_value here + # But in reality that will fill with a bunch of wrong values + fill_value = np.nan values = self.block.get_values() else: fill_value = upcasted_na diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9dbb327e3d956..809ff2130d3ed 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -38,6 +38,8 @@ concatenate_block_managers) from pandas.util._decorators import Appender, Substitution +from pandas.core.dtypes.generic import ABCSparseArray + from pandas.core.sorting import is_int64_overflow_possible import pandas.core.algorithms as algos import pandas.core.sorting as sorting @@ -665,7 +667,6 @@ def _maybe_restore_index_levels(self, result): result.set_index(names_to_restore, inplace=True) def _maybe_add_join_keys(self, result, left_indexer, right_indexer): - left_has_missing = None right_has_missing = None @@ -731,7 +732,11 @@ def 
_maybe_add_join_keys(self, result, left_indexer, right_indexer): if mask.all(): key_col = rvals else: - key_col = Index(lvals).where(~mask, rvals) + # Might need to be IntIndex not Index + if isinstance(lvals, ABCSparseArray): + key_col = Index(lvals.get_values()).where(~mask, rvals) + else: + key_col = Index(lvals).where(~mask, rvals) if result._is_label_reference(name): result[name] = key_col diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 65aefd9fb8c0a..ae347bb46f69b 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -39,7 +39,6 @@ from pandas.util._decorators import Appender from pandas.core.indexes.base import _index_shared_docs - _sparray_doc_kwargs = dict(klass='SparseArray') @@ -271,6 +270,7 @@ def __array_wrap__(self, out_arr, context=None): ufunc, args, domain = context # to apply ufunc only to fill_value (to avoid recursive call) args = [getattr(a, 'fill_value', a) for a in args] + with np.errstate(all='ignore'): fill_value = ufunc(self.fill_value, *args[1:]) else: @@ -304,9 +304,9 @@ def __setstate__(self, state): self._fill_value = fill_value def __len__(self): - try: + if hasattr(self, 'sp_index'): return self.sp_index.length - except: + else: return 0 def __unicode__(self): diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 91dc44e3f185e..42e4a9e8e7375 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -73,6 +73,10 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if columns is None: raise Exception("cannot pass a series w/o a name or columns") data = {columns[0]: data} + elif isinstance(data, BlockManager): + fill_value_size = len(set(b.fill_value for b in data.blocks)) + if default_fill_value is None and fill_value_size == 1: + default_fill_value = data.blocks[0].fill_value if default_fill_value is None: default_fill_value = np.nan diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index 
62a467bec2683..3a724e53848b3 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -175,7 +175,7 @@ def values(self): def __array__(self, result=None): """ the array interface, return my values """ - return self.block.values + return self.block.values.values def get_values(self): """ same as values """ @@ -271,6 +271,7 @@ def __array_wrap__(self, result, context=None): See SparseArray.__array_wrap__ for detail. """ + if isinstance(context, tuple) and len(context) == 3: ufunc, args, domain = context args = [getattr(a, 'fill_value', a) for a in args] @@ -279,8 +280,18 @@ def __array_wrap__(self, result, context=None): else: fill_value = self.fill_value + # GH 14167 + # Since we are returning a dense representation of + # SparseSeries sparse_index might not align when calling + # ufunc on the array. There doesn't seem to be a better way + # to do this unfortunately. + if len(result) != self.sp_index.npoints: + sparse_index = None + else: + sparse_index = self.sp_index + return self._constructor(result, index=self.index, - sparse_index=self.sp_index, + sparse_index=sparse_index, fill_value=fill_value, copy=False).__finalize__(self) @@ -402,8 +413,8 @@ def abs(self): ------- abs: type of caller """ - return self._constructor(np.abs(self.values), - index=self.index).__finalize__(self) + + return np.abs(self) def get(self, label, default=None): """ @@ -544,7 +555,7 @@ def to_dense(self, sparse_only=False): index = self.index.take(int_index.indices) return Series(self.sp_values, index=index, name=self.name) else: - return Series(self.values.to_dense(), index=self.index, + return Series(self.get_values(), index=self.index, name=self.name) @property diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 32f83ab972be5..48d74446be482 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -7,6 +7,7 @@ import numpy as np import random import re +import 
itertools import pandas as pd from pandas.compat import lrange, lzip @@ -18,7 +19,10 @@ is_categorical_dtype, is_object_dtype, ) -from pandas import DataFrame, Index, MultiIndex, Series, Categorical +from pandas import ( + DataFrame, Index, + MultiIndex, Series, Categorical +) import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -1810,3 +1814,88 @@ def test_merge_on_indexes(self, left_df, right_df, how, sort, expected): how=how, sort=sort) tm.assert_frame_equal(result, expected) + + +class TestMergeSparseDataFrames(object): + @pytest.mark.parametrize('fill_value,how', itertools.product([0, 1, + None, + np.nan], + ['left', + 'right', + 'outer', + 'inner'])) + def test_merge_two_sparse_frames(self, fill_value, how): + dense_evens = pd.DataFrame({'A': list(range(0, 200, 2)), + 'B': np.random.randint(0, 100, size=100)}) + dense_threes = pd.DataFrame({'A': list(range(0, 300, 3)), + 'B': np.random.randint(0, 100, size=100)}) + + sparse_evens = dense_evens.to_sparse(fill_value=fill_value) + sparse_threes = dense_threes.to_sparse(fill_value=fill_value) + + to_merge_sparse = [sparse_evens, sparse_threes] + + to_merge_dense = [dense_evens, dense_threes] + + for _ in range(2): + sparse_merge = to_merge_sparse[0].merge(to_merge_sparse[1], + how=how, on='A') + + dense_merge = to_merge_dense[0].merge(to_merge_dense[1], + how=how, on='A') + + # If you merge two dense frames together it tends to default to + # float64 not the original dtype + dense_merge['B_x'] = dense_merge['B_x'].astype(dense_evens.A.dtype, + errors='ignore') + dense_merge['B_y'] = dense_merge['B_y'].astype(dense_evens.A.dtype, + errors='ignore') + + if fill_value is None or fill_value is np.nan: + assert sparse_merge.default_fill_value is np.nan + else: + tm.assert_almost_equal(sparse_merge.default_fill_value, + fill_value) + + exp = dense_merge.to_sparse(fill_value=fill_value) + tm.assert_sp_frame_equal(sparse_merge, exp, + exact_indices=False, + check_dtype=False) + + 
to_merge_sparse = to_merge_sparse[::-1] + to_merge_dense = to_merge_dense[::-1] + + @pytest.mark.parametrize('fill_value,how', itertools.product([0, 1, + None, + np.nan], + ['left', + 'right', + 'outer', + 'inner'])) + def test_merge_dense_sparse_frames(self, fill_value, how): + fill_value = np.nan + + dense_evens = pd.DataFrame({'A': list(range(0, 200, 2)), + 'B': np.random.randint(0, 100, size=100)}) + + dense_threes = pd.DataFrame({'A': list(range(0, 300, 3)), + 'B': np.random.randint(0, 100, size=100)}) + + sparse_evens = dense_evens.to_sparse(fill_value=fill_value) + + to_merge = [sparse_evens, dense_threes] + to_merge_dense = [dense_evens, dense_threes] + + for _ in range(2): + merged = to_merge[0].merge(to_merge[1], how=how, on='A') + + dense_merge = to_merge_dense[0].merge(to_merge_dense[1], + how=how, on='A') + + for column in dense_merge.columns: + dense_col = merged[column].to_dense() + tm.assert_series_equal(dense_col, dense_merge[column], + check_dtype=False) + + to_merge = to_merge[::-1] + to_merge_dense = to_merge_dense[::-1] diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index 54f567bcd2a8c..e0a854b2b79ba 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -222,27 +222,34 @@ class Unknown: '"Unknown" for data argument'): SparseDataFrame(Unknown()) - def test_constructor_preserve_attr(self): + # Cannot use None as a fill_value because it will be overwritten with zeros + @pytest.mark.parametrize('fill_value', [0, 1, np.nan]) + def test_constructor_preserve_attr(self, fill_value): # GH 13866 - arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) + arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, + fill_value=fill_value) + assert arr.dtype == np.int64 - assert arr.fill_value == 0 + tm.assert_almost_equal(arr.fill_value, fill_value) df = pd.SparseDataFrame({'x': arr}) assert df['x'].dtype == np.int64 - assert df['x'].fill_value == 0 + + 
tm.assert_almost_equal(df['x'].fill_value, fill_value) s = pd.SparseSeries(arr, name='x') assert s.dtype == np.int64 - assert s.fill_value == 0 + tm.assert_almost_equal(s.fill_value, fill_value) df = pd.SparseDataFrame(s) assert df['x'].dtype == np.int64 - assert df['x'].fill_value == 0 + + tm.assert_almost_equal(df['x'].fill_value, fill_value) df = pd.SparseDataFrame({'x': s}) assert df['x'].dtype == np.int64 - assert df['x'].fill_value == 0 + + tm.assert_almost_equal(df['x'].fill_value, fill_value) def test_constructor_nan_dataframe(self): # GH 10079 diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 3f5d5a59cc540..d005401bd7072 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -628,10 +628,12 @@ def _check_inplace_op(iop, op): getattr(operator, op)) def test_abs(self): - s = SparseSeries([1, 2, -3], name='x') - expected = SparseSeries([1, 2, 3], name='x') + s = SparseSeries([-1, -2, -3, None, np.nan], name='x') + expected = SparseSeries([1, 2, 3, None, np.nan], name='x') result = s.abs() tm.assert_sp_series_equal(result, expected) + assert result.npoints == expected.npoints + assert result.npoints == len(result.sp_values) assert result.name == 'x' result = abs(s) @@ -643,9 +645,9 @@ def test_abs(self): assert result.name == 'x' s = SparseSeries([1, -2, 2, -3], fill_value=-2, name='x') - expected = SparseSeries([1, 2, 3], sparse_index=s.sp_index, - fill_value=2, name='x') + expected = SparseSeries([1, 2, 2, 3], fill_value=2, name='x') result = s.abs() + tm.assert_sp_series_equal(result, expected) assert result.name == 'x' diff --git a/pandas/tests/sparse/test_combine_concat.py b/pandas/tests/sparse/test_combine_concat.py index 70fd1da529d46..ab3840c52e88d 100644 --- a/pandas/tests/sparse/test_combine_concat.py +++ b/pandas/tests/sparse/test_combine_concat.py @@ -170,22 +170,18 @@ def test_concat(self): res = pd.concat([sparse, sparse]) exp = 
pd.concat([self.dense1, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse2, sparse2]) exp = pd.concat([self.dense2, self.dense2]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse, sparse2]) exp = pd.concat([self.dense1, self.dense2]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse2, sparse]) exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) def test_concat_different_fill_value(self): @@ -199,7 +195,6 @@ def test_concat_different_fill_value(self): res = pd.concat([sparse2, sparse]) exp = pd.concat([self.dense2, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) def test_concat_different_columns(self): @@ -222,12 +217,10 @@ def test_concat_different_columns(self): res = pd.concat([sparse, sparse3]) exp = pd.concat([self.dense1, self.dense3]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse3, sparse]) exp = pd.concat([self.dense3, self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # different fill values @@ -266,13 +259,11 @@ def test_concat_series(self): res = pd.concat([sparse, sparse2[col]]) exp = pd.concat([self.dense1, self.dense2[col]]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse2[col], sparse]) exp = pd.concat([self.dense2[col], self.dense1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) def test_concat_axis1(self): @@ -296,13 +287,13 @@ def test_concat_axis1(self): res = pd.concat([sparse, sparse3], axis=1) exp = pd.concat([self.dense1, self.dense3], 
axis=1).to_sparse(fill_value=0) - exp._default_fill_value = np.nan + # exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) res = pd.concat([sparse3, sparse], axis=1) exp = pd.concat([self.dense3, self.dense1], axis=1).to_sparse(fill_value=0) - exp._default_fill_value = np.nan + # exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # different fill values diff --git a/pandas/tests/sparse/test_indexing.py b/pandas/tests/sparse/test_indexing.py index 37a287af71451..0ff1c439c9df7 100644 --- a/pandas/tests/sparse/test_indexing.py +++ b/pandas/tests/sparse/test_indexing.py @@ -615,32 +615,29 @@ def test_getitem(self): tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], orig.iloc[[1, 2]].to_sparse()) - def test_getitem_fill_value(self): + @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + def test_getitem_fill_value(self, fill_value): orig = pd.DataFrame([[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], columns=list('xyz')) - sparse = orig.to_sparse(fill_value=0) + sparse = orig.to_sparse(fill_value=fill_value) tm.assert_sp_series_equal(sparse['y'], - orig['y'].to_sparse(fill_value=0)) + orig['y'].to_sparse(fill_value=fill_value)) - exp = orig[['x']].to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig[['x']].to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse[['x']], exp) - exp = orig[['z', 'x']].to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig[['z', 'x']].to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse[['z', 'x']], exp) indexer = [True, False, True, True] - exp = orig[indexer].to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig[indexer].to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse[indexer], exp) - exp = orig.iloc[[1, 2]].to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig.iloc[[1, 2]].to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse.iloc[[1, 2]], exp) def test_loc(self): 
@@ -877,24 +874,22 @@ def test_take(self): tm.assert_sp_frame_equal(sparse.take([-1, -2]), orig.take([-1, -2]).to_sparse()) - def test_take_fill_value(self): + @pytest.mark.parametrize('fill_value', [0, 1, np.nan, None]) + def test_take_fill_value(self, fill_value): orig = pd.DataFrame([[1, np.nan, 0], [2, 3, np.nan], [0, np.nan, 4], [0, np.nan, 5]], columns=list('xyz')) - sparse = orig.to_sparse(fill_value=0) + sparse = orig.to_sparse(fill_value=fill_value) - exp = orig.take([0]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig.take([0]).to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse.take([0]), exp) - exp = orig.take([0, 1]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig.take([0, 1]).to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse.take([0, 1]), exp) - exp = orig.take([-1, -2]).to_sparse(fill_value=0) - exp._default_fill_value = np.nan + exp = orig.take([-1, -2]).to_sparse(fill_value=fill_value) tm.assert_sp_frame_equal(sparse.take([-1, -2]), exp) def test_reindex(self):