Allows for merging of SparseDataFrames, and fixes __array__ interface #19488
```diff
@@ -238,6 +238,7 @@ Other Enhancements
 - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`)
 - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`)
+- :func:`pandas.merge` now supports merging of :class:`SparseDataFrame` (:issue:`13665`)

 .. _whatsnew_0230.api_breaking:
```

```diff
@@ -555,7 +556,7 @@ Sparse
 - Bug in which creating a ``SparseDataFrame`` from a dense ``Series`` or an unsupported type raised an uncontrolled exception (:issue:`19374`)
 - Bug in :class:`SparseDataFrame.to_csv` causing exception (:issue:`19384`)
--
+- Bug in :class:`SparseSeries.__array__` returning only non-faills (:issue:`13665`)

 Reshaping
 ^^^^^^^^^
```

Review comment: Typo in "faills"?

```diff
@@ -591,3 +592,4 @@ Other
 ^^^^^

 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
+- Improved algorithms.take_1d handling of ``SparseArray`` (:issue:`19506`)
```
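To make the headline change concrete, here is a hedged usage sketch of what the whatsnew entry describes (the frames are made up, and this uses the 0.23-era sparse API where ``DataFrame.to_sparse`` still exists):

```python
import pandas as pd

# Per the whatsnew entry above, pd.merge can now operate on SparseDataFrames
# directly instead of requiring a round-trip through dense frames.
left = pd.DataFrame({'A': range(0, 20, 2), 'B': range(10)}).to_sparse()
right = pd.DataFrame({'A': range(0, 30, 3), 'C': range(10)}).to_sparse()

result = pd.merge(left, right, how='left', on='A')
print(type(result))    # expected: a sparse frame, not a plain DataFrame
print(result.head())
```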
```diff
@@ -1315,6 +1315,11 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
        undefined if allow_fill == False and -1 is present in indexer.
     """

+    if is_sparse(arr):
+        return take_nd(arr.get_values(), indexer, axis=axis, out=out,
+                       fill_value=fill_value, mask_info=mask_info,
+                       allow_fill=allow_fill)
+
     # dispatch to internal type takes
     if is_categorical(arr):
         return arr.take_nd(indexer, fill_value=fill_value,
```

Review comment: So something that is a bit of a hotspot IMHO is that inside of algos it iterates through the memview and could cause some weird security / segfault issues. This fills that hole but really should fix that at a lower level.

Reply: Hmm, this seems sub-optimal. We shouldn't have to densify just to slice, right? Can you take a look at […]? And since you're touching performance-related code, could you run the sparse-related ASVs?

Reply: thank you for pointing this out. exactly what I was looking for. Will run the ASVs.
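For context, a minimal sketch of what the densify-and-dispatch branch above does, using the 0.23-era sparse API (``SparseArray.get_values()`` returns the dense ndarray); the values here are made up:

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_sparse

# The sparse array is densified first, so the ordinary ndarray take path
# (including -1 / fill_value handling) applies unchanged.
arr = pd.SparseArray([1.0, np.nan, 3.0])
assert is_sparse(arr)

dense = arr.get_values()        # array([ 1., nan,  3.])
print(np.take(dense, [0, 2]))   # [1. 3.]
```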
```diff
@@ -3089,6 +3089,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None,
         # GH#19265 pyarrow is passing this
         warnings.warn("fastpath argument is deprecated, will be removed "
                       "in a future release.", DeprecationWarning)
+
     if klass is None:
         dtype = dtype or values.dtype
         klass = get_block_type(values, dtype)
```

```diff
@@ -5304,6 +5305,22 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
         elif is_uniform_join_units(join_units):
             b = join_units[0].block.concat_same_type(
                 [ju.block for ju in join_units], placement=placement)
+        elif is_sparse_join_units(join_units):
+            values = concatenate_join_units(join_units, concat_axis, copy=copy)
+
+            if len(values.shape) == 2:
+                values = values[0]
+            else:
+                assert len(values.shape) == 1
+
+            block = join_units[0].block
+
+            if block:
+                fill_value = block.fill_value
+            else:
+                fill_value = np.nan
+
+            array = SparseArray(values, fill_value=fill_value)
+            b = make_block(array, klass=SparseBlock, placement=placement)
         else:
             b = make_block(
                 concatenate_join_units(join_units, concat_axis, copy=copy),
```

Review comment (on the `elif is_sparse_join_units(join_units):` line): Do you ever go down the initial […]?

Review comment (on the `values = concatenate_join_units(...)` line): this is a mess

Reply: yes, yes it is. I'll work on cleaning this up tomorrow since there's too many branches in this code.
```diff
@@ -5313,6 +5330,18 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
     return BlockManager(blocks, axes)


+def is_sparse_join_units(join_units):
+    """
+    Check if all of the join units are sparse. This leads to building
+    SparseArray over dense array representations so that we can merge
+    SparseSeries / SparseDataFrame.
+
+    This is very similar to how pandas.concat works for concatenating two
+    SparseDataFrame / SparseSeries.
+    """
+    return all(type(ju.block) is SparseBlock for ju in join_units)
+
+
 def is_uniform_join_units(join_units):
     """
     Check if the join units consist of blocks of uniform type that can
```

Review comment (on the function definition): Docstring would be nice, at least noting that this is true if any blocks are sparse.

Review comment (on the return line): This is very similar to (and maybe could be DRYed up with) pd.concat([sparse, sparse]).
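As a point of comparison for the docstring above, a hedged sketch of the pd.concat behavior the reviewers mention (0.23-era API, made-up frames): concatenating two sparse frames already keeps the result sparse, which is the path is_sparse_join_units lets the merge machinery mirror.

```python
import pandas as pd

# pd.concat already keeps sparse inputs sparse; is_sparse_join_units lets
# concatenate_block_managers take the analogous branch when merging.
a = pd.DataFrame({'x': [0, 0, 1]}).to_sparse(fill_value=0)
b = pd.DataFrame({'x': [0, 2, 0]}).to_sparse(fill_value=0)

out = pd.concat([a, b])
print(type(out))    # expected: a sparse frame with fill_value 0
```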
```diff
@@ -5686,7 +5715,10 @@ def is_na(self):
     def get_reindexed_values(self, empty_dtype, upcasted_na):
         if upcasted_na is None:
             # No upcasting is necessary
-            fill_value = self.block.fill_value
+
+            # You would think that you want self.block.fill_value here,
+            # but in reality that will fill with a bunch of wrong values.
+            fill_value = np.nan
             values = self.block.get_values()
         else:
             fill_value = upcasted_na
```

Review comment: this was surprising to me. If you don't pass in np.nan, what ends up happening is that if you merge two sparse frames it will concat with a bunch of […]

Reply: huh?

Reply: Yea, this confused me a bunch.

```python
evens = pd.DataFrame({'A': range(0, 20, 2), 'B': range(10)})
threes = pd.DataFrame({'A': range(0, 30, 3), 'B': range(10)})
threes.merge(evens, how="left", on="A")
```

yields […] as you'd expect for a dense dataframe, but if you sparsify it with a fill_value other than np.nan / None you get weird results:

```python
evens = pd.DataFrame({'A': range(0, 20, 2), 'B': range(10)}).to_sparse(fill_value=8675309)
threes = pd.DataFrame({'A': range(0, 30, 3), 'B': range(10)}).to_sparse(fill_value=90210)
threes.merge(evens, how="left", on="A")
```

yields […]. Is that expected behavior?
```diff
@@ -38,6 +38,8 @@
                                concatenate_block_managers)
 from pandas.util._decorators import Appender, Substitution

+from pandas.core.sparse.array import SparseArray
+
 from pandas.core.sorting import is_int64_overflow_possible
 import pandas.core.algorithms as algos
 import pandas.core.sorting as sorting
```

```diff
@@ -665,7 +667,6 @@ def _maybe_restore_index_levels(self, result):
         result.set_index(names_to_restore, inplace=True)

     def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
-
         left_has_missing = None
         right_has_missing = None
```

```diff
@@ -731,7 +732,11 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
                 if mask.all():
                     key_col = rvals
                 else:
-                    key_col = Index(lvals).where(~mask, rvals)
+                    # Might need to be IntIndex not Index
+                    if isinstance(lvals, SparseArray):
+                        key_col = Index(lvals.get_values()).where(~mask, rvals)
+                    else:
+                        key_col = Index(lvals).where(~mask, rvals)

                 if result._is_label_reference(name):
                     result[name] = key_col
```

Review comment (on the `# Might need to be IntIndex not Index` comment): don't do this, use […]

Review comment: I'm not sure if this has memory or performance issues, but this is the best solution I could come to with this. The other solution would be to look at using lvals.sp_index and implementing a where on it that works. One thing I have noticed is that IntIndex doesn't act quite like Index, which makes doing these masks tricky in sparse land.

Reply: It'd be nice to avoid […]
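To illustrate the masking logic above with made-up values (these arrays stand in for the real merge internals): Index.where keeps the left value where the condition is True and takes the right value elsewhere; the sparse branch simply densifies lvals via get_values() first.

```python
import numpy as np
import pandas as pd

# Index.where(cond, other): keep self where cond is True, otherwise
# substitute the corresponding element of other.
lvals = np.array([10, 20, 30, 40])
rvals = np.array([10, 99, 30, 99])
mask = np.array([False, True, False, True])   # positions filled from the right side

key_col = pd.Index(lvals).where(~mask, rvals)
print(list(key_col))    # [10, 99, 30, 99]
```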
```diff
@@ -38,7 +38,6 @@
 from pandas.util._decorators import Appender
 from pandas.core.indexes.base import _index_shared_docs

-
 _sparray_doc_kwargs = dict(klass='SparseArray')
```

```diff
@@ -259,6 +258,7 @@ def __array_wrap__(self, out_arr, context=None):
             ufunc, args, domain = context
             # to apply ufunc only to fill_value (to avoid recursive call)
             args = [getattr(a, 'fill_value', a) for a in args]
+
             with np.errstate(all='ignore'):
                 fill_value = ufunc(self.fill_value, *args[1:])
         else:
```

```diff
@@ -292,9 +292,9 @@ def __setstate__(self, state):
         self._fill_value = fill_value

     def __len__(self):
-        try:
+        if hasattr(self, 'sp_index'):
             return self.sp_index.length
-        except:
+        else:
             return 0

     def __unicode__(self):
```

Review comment: This felt really icky to me, to try and catch something... Feel free to push back on that change.

Reply: How was this failing before?

Reply: I have no clue! 😄 I couldn't figure out why the try / except was there in the first place, tbh.

Reply: huh?

Reply: This isn't essential to this PR. I just don't understand why this code was here in the first place.

Reply: Meaning I can take it out to bring down the code changes.
```diff
@@ -73,6 +73,10 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None,
             if columns is None:
                 raise Exception("cannot pass a series w/o a name or columns")
             data = {columns[0]: data}
+        elif isinstance(data, BlockManager):
+            fill_value_size = len(set(b.fill_value for b in data.blocks))
+            if default_fill_value is None and fill_value_size == 1:
+                default_fill_value = data.blocks[0].fill_value

         if default_fill_value is None:
             default_fill_value = np.nan
```

Review comment: Ok, I need feedback on this change. Basically what this does is: if every SparseSeries's fill_value is the same, it sets the default_fill_value to that. This to me seems really intuitive, but I trust your judgement. It also makes testing a whole hell of a lot easier, which is why I did it. I would argue pretty heavily that as a user, if I added sparse series that all had fill_value=0, then I would expect the sparse data frame to have default_fill_value = 0 as well.

Reply: yes, this is the only reason to have a standalone […]
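A hedged sketch of the intent described in the comment above (0.23-era sparse API; the series are made up, and the inference happens on the BlockManager branch shown in the diff):

```python
import pandas as pd

# If all sparse columns agree on a fill_value, the frame's
# default_fill_value is inferred from them rather than defaulting to NaN.
s1 = pd.Series([0, 1, 0, 2]).to_sparse(fill_value=0)
s2 = pd.Series([0, 0, 3, 0]).to_sparse(fill_value=0)

sdf = pd.SparseDataFrame({'a': s1, 'b': s2})
print(sdf.default_fill_value)   # expected: 0
```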
```diff
@@ -175,7 +175,7 @@ def values(self):

     def __array__(self, result=None):
         """ the array interface, return my values """
-        return self.block.values
+        return self.block.values.values

     def get_values(self):
         """ same as values """
```

Review comment: This will return the dense version of SparseSeries when calling […]

Reply: we need some more asv's for sparse I think.
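A quick sketch of the fixed interface (0.23-era API; the values are made up): np.asarray on a SparseSeries now yields the full dense values, fills included, rather than only the non-fill sp_values, which is the bug the whatsnew entry records.

```python
import numpy as np
import pandas as pd

# With the fix, __array__ densifies, so the result has one entry per row
# of the series, fill values included.
s = pd.Series([1.0, np.nan, 3.0]).to_sparse()
print(np.asarray(s))   # [ 1. nan  3.] -- length 3, not just the 2 sp_values
```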
```diff
@@ -271,6 +271,7 @@ def __array_wrap__(self, result, context=None):

         See SparseArray.__array_wrap__ for detail.
         """
+
         if isinstance(context, tuple) and len(context) == 3:
             ufunc, args, domain = context
             args = [getattr(a, 'fill_value', a) for a in args]
```

```diff
@@ -279,8 +280,18 @@ def __array_wrap__(self, result, context=None):
         else:
             fill_value = self.fill_value

+        # GH 14167
+        # Since we are returning a dense representation of
+        # SparseSeries, sparse_index might not align when calling
+        # a ufunc on the array. There doesn't seem to be a better way
+        # to do this, unfortunately.
+        if len(result) != self.sp_index.npoints:
+            sparse_index = None
+        else:
+            sparse_index = self.sp_index
+
         return self._constructor(result, index=self.index,
-                                 sparse_index=self.sp_index,
+                                 sparse_index=sparse_index,
                                  fill_value=fill_value,
                                  copy=False).__finalize__(self)
```

Review comment: This wasn't intuitive to me, but the __array*__ functions defined in numpy don't work quite as expected in certain circumstances. […] What would happen to the dense version of the array is that it would have 3 npoints when really it should only have 2. By zeroing out the sparse_index it fixes the problem, because it relies on the dense index instead, going around the problem.

Reply: these should simply operate on the sp_values, not on the densified values (or maybe it's possible that some classes of ufuncs could operate on the dense, but not sure).

Reply: so the expected behavior is […]
```diff
@@ -402,8 +413,8 @@ def abs(self):
         -------
         abs: type of caller
         """
-        return self._constructor(np.abs(self.values),
-                                 index=self.index).__finalize__(self)
+
+        return np.abs(self)

     def get(self, label, default=None):
         """
```
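For illustration, a hedged sketch of what the delegation above exercises (0.23-era API; the values and expected output are mine, not from the source): np.abs now flows through __array_wrap__, which drops the stale sp_index when the result length no longer matches its npoints.

```python
import numpy as np
import pandas as pd

# abs() now delegates to the ufunc machinery; __array_wrap__ re-wraps the
# dense result and only reuses sp_index when its npoints still match.
s = pd.Series([0.0, -1.0, 0.0, 2.0]).to_sparse(fill_value=0.0)
result = s.abs()            # equivalent to np.abs(s)
print(result.to_dense())    # expected: [0.0, 1.0, 0.0, 2.0]
```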
```diff
@@ -544,7 +555,7 @@ def to_dense(self, sparse_only=False):
             index = self.index.take(int_index.indices)
             return Series(self.sp_values, index=index, name=self.name)
         else:
-            return Series(self.values.to_dense(), index=self.index,
+            return Series(self.get_values(), index=self.index,
                           name=self.name)

     @property
```
Review comment: Clarify: merging sparse to sparse? Sparse and dense?