BUG: Sparse concat results in dense
sinhrks committed Apr 17, 2016
1 parent 3bed097 commit cc07705
Showing 13 changed files with 474 additions and 289 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
@@ -120,6 +120,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseArray.to_frame()`` results in ``DataFrame``, rather than ``SparseDataFrame`` (:issue:`9850`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
- Bug in ``SparseArray.to_dense()`` incorrectly handle ``fill_value`` (:issue:`12797`)
- Bug in ``pd.concat()`` of ``SparseSeries`` results in dense (:issue:`10536`)

.. _whatsnew_0181.api:

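The whatsnew entry for :issue:`10536` above describes the user-visible fix; a minimal sketch of the corrected behavior, assuming the pandas 0.18.1 API of this commit:

import pandas as pd

s1 = pd.SparseSeries([1.0, 2.0])
s2 = pd.SparseSeries([3.0, 4.0])

# before this commit the result silently densified to a plain Series;
# with the fix the sparse subclass survives the concat
result = pd.concat([s1, s2])
print(type(result))  # expected: pandas.sparse.series.SparseSeries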
58 changes: 1 addition & 57 deletions pandas/core/categorical.py
@@ -18,7 +18,7 @@
from pandas.core.common import (
ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull,
is_dtype_equal, is_categorical_dtype, is_integer_dtype,
_possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like,
_possibly_infer_to_datetimelike, is_list_like,
is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64,
_coerce_indexer_dtype)
from pandas.types.api import CategoricalDtype
@@ -1873,59 +1873,3 @@ def _convert_to_list_like(list_like):
else:
# is this reached?
return [list_like]


def _concat_compat(to_concat, axis=0):
"""Concatenate an object/categorical array of arrays, each of which is a
single dtype
Parameters
----------
to_concat : array of arrays
axis : int
Axis to provide concatenation in the current implementation this is
always 0, e.g. we only have 1D categoricals
Returns
-------
Categorical
A single array, preserving the combined dtypes
"""

def convert_categorical(x):
# coerce to object dtype
if is_categorical_dtype(x.dtype):
return x.get_values()
return x.ravel()

if get_dtype_kinds(to_concat) - set(['object', 'category']):
# convert to object type and perform a regular concat
from pandas.core.common import _concat_compat
return _concat_compat([np.array(x, copy=False, dtype=object)
for x in to_concat], axis=0)

# we could have object blocks and categoricals here
# if we only have a single categoricals then combine everything
# else its a non-compat categorical
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

# validate the categories
categories = categoricals[0]
rawcats = categories.categories
for x in categoricals[1:]:
if not categories.is_dtype_equal(x):
raise ValueError("incompatible categories in categorical concat")

# we've already checked that all categoricals are the same, so if their
# length is equal to the input then we have all the same categories
if len(categoricals) == len(to_concat):
# concating numeric types is much faster than concating object types
# and fastpath takes a shorter path through the constructor
return Categorical(np.concatenate([x.codes for x in to_concat],
axis=0),
rawcats, ordered=categoricals[0].ordered,
fastpath=True)
else:
concatted = np.concatenate(list(map(convert_categorical, to_concat)),
axis=0)
return Categorical(concatted, rawcats)
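For reference, the fast path this removed helper took when every input is a categorical with identical categories can be sketched directly against the 0.18-era Categorical constructor (fastpath was still accepted then):

import numpy as np
from pandas import Categorical

a = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
b = Categorical(['c', 'a'], categories=['a', 'b', 'c'])

# identical categories on both sides -> concatenate the integer codes
# and rebuild cheaply, as the deleted branch above did
combined = Categorical(np.concatenate([a.codes, b.codes]),
                       categories=a.categories, ordered=a.ordered,
                       fastpath=True)
print(list(combined))  # ['a', 'b', 'c', 'a']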
102 changes: 0 additions & 102 deletions pandas/core/common.py
@@ -1918,108 +1918,6 @@ def _all_none(*args):
return True


def get_dtype_kinds(l):
"""
Parameters
----------
l : list of arrays
Returns
-------
a set of kinds that exist in this list of arrays
"""

typs = set()
for arr in l:

dtype = arr.dtype
if is_categorical_dtype(dtype):
typ = 'category'
elif is_sparse(arr):
typ = 'sparse'
elif is_datetimetz(arr):
typ = 'datetimetz'
elif is_datetime64_dtype(dtype):
typ = 'datetime'
elif is_timedelta64_dtype(dtype):
typ = 'timedelta'
elif is_object_dtype(dtype):
typ = 'object'
elif is_bool_dtype(dtype):
typ = 'bool'
else:
typ = dtype.kind
typs.add(typ)
return typs
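A quick sketch of what this helper reports; the import path assumes the function moved to pandas.types.concat, which the new imports elsewhere in this commit suggest:

import numpy as np
import pandas as pd
from pandas.types.concat import get_dtype_kinds  # assumed new home

arrays = [np.array([1, 2]),             # integer -> kind 'i'
          np.array([1.5, 2.5]),         # float   -> kind 'f'
          pd.Categorical(['a', 'b'])]   # categorical -> 'category'
print(get_dtype_kinds(arrays))          # e.g. set(['category', 'f', 'i'])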


def _concat_compat(to_concat, axis=0):
"""
provide concatenation of an array of arrays each of which is a single
'normalized' dtypes (in that for example, if it's object, then it is a
non-datetimelike and provide a combined dtype for the resulting array that
preserves the overall dtype if possible)
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
Returns
-------
a single array, preserving the combined dtypes
"""

# filter empty arrays
# 1-d dtypes always are included here
def is_nonempty(x):
try:
return x.shape[axis] > 0
except Exception:
return True

nonempty = [x for x in to_concat if is_nonempty(x)]

# If all arrays are empty, there's nothing to convert, just short-cut to
# the concatenation, #3121.
#
# Creating an empty array directly is tempting, but the winnings would be
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.

typs = get_dtype_kinds(to_concat)

# these are mandated to handle empties as well
if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs:
from pandas.tseries.common import _concat_compat
return _concat_compat(to_concat, axis=axis, typs=typs)

elif 'sparse' in typs:
from pandas.sparse.array import _concat_compat
return _concat_compat(to_concat, axis=axis)

elif 'category' in typs:
from pandas.core.categorical import _concat_compat
return _concat_compat(to_concat, axis=axis)

if not nonempty:
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
# cast this to float)
typs = get_dtype_kinds(to_concat)
if len(typs) != 1:

if (not len(typs - set(['i', 'u', 'f'])) or
not len(typs - set(['bool', 'i', 'u']))):
# let numpy coerce
pass
else:
# coerce to object
to_concat = [x.astype('object') for x in to_concat]

return np.concatenate(to_concat, axis=axis)
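The point of the dispatch above is that extension-backed inputs never fall through to a plain np.concatenate. A hedged sketch, again assuming the helper's new home in pandas.types.concat:

from pandas import Categorical
from pandas.types.concat import _concat_compat  # assumed new home

a = Categorical(['a', 'b'], categories=['a', 'b'])
b = Categorical(['b', 'a'], categories=['a', 'b'])

# the 'category' branch returns a Categorical rather than densifying
# to an object ndarray
out = _concat_compat([a, b])
print(type(out))  # expected: Categorical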


def _where_compat(mask, arr1, arr2):
if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE:
new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8'))
3 changes: 2 additions & 1 deletion pandas/core/internals.py
@@ -30,6 +30,7 @@
from pandas.tseries.index import DatetimeIndex
from pandas.formats.printing import pprint_thing
import pandas.core.common as com
import pandas.types.concat as _concat
import pandas.core.missing as missing
import pandas.core.convert as convert
from pandas.sparse.array import _maybe_to_sparse, SparseArray
@@ -4646,7 +4647,7 @@ def concatenate_join_units(join_units, concat_axis, copy):
if copy and concat_values.base is not None:
concat_values = concat_values.copy()
else:
concat_values = com._concat_compat(to_concat, axis=concat_axis)
concat_values = _concat._concat_compat(to_concat, axis=concat_axis)

return concat_values
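The copy guard above leans on numpy view semantics: a sliced array keeps a reference to its parent in .base, so writes to it would hit the parent. A minimal illustration:

import numpy as np

vals = np.array([[1, 2, 3], [4, 5, 6]])
view = vals[:1]             # slicing yields a view, not a copy
print(view.base is vals)    # True -> writes to view would touch vals
safe = view.copy()          # the defensive copy taken above
print(safe.base is None)    # True -> safe owns its own data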

4 changes: 3 additions & 1 deletion pandas/core/reshape.py
@@ -18,6 +18,7 @@
from pandas.core.groupby import get_group_index, _compress_group_index

import pandas.core.common as com
import pandas.types.concat as _concat
import pandas.core.algorithms as algos
import pandas.algos as _algos

@@ -848,7 +849,8 @@ def lreshape(data, groups, dropna=True, label=None):
pivot_cols = []

for target, names in zip(keys, values):
mdata[target] = com._concat_compat([data[col].values for col in names])
to_concat = [data[col].values for col in names]
mdata[target] = _concat._concat_compat(to_concat)
pivot_cols.append(target)

for col in id_cols:
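For context, lreshape stacks each named group of columns into one long column through the concat helper; a small usage sketch:

import pandas as pd

df = pd.DataFrame({'team': ['A', 'B'],
                   'hr2015': [10, 12],
                   'hr2016': [11, 13]})

# the two hr columns are concatenated into a single 'hr' column;
# the id column 'team' is repeated to match
long_df = pd.lreshape(df, {'hr': ['hr2015', 'hr2016']})
print(long_df.shape)  # (4, 2)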
3 changes: 2 additions & 1 deletion pandas/indexes/base.py
@@ -17,6 +17,7 @@
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate, deprecate_kwarg)
import pandas.core.common as com
import pandas.types.concat as _concat
import pandas.core.missing as missing
import pandas.core.algorithms as algos
from pandas.formats.printing import pprint_thing
@@ -1713,7 +1714,7 @@ def union(self, other):
if len(indexer) > 0:
other_diff = algos.take_nd(other._values, indexer,
allow_fill=False)
result = com._concat_compat((self.values, other_diff))
result = _concat._concat_compat((self.values, other_diff))

try:
self.values[0] < other_diff[0]
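In effect, union keeps self's values and appends only the elements unique to other, concatenated via the same helper; a quick sketch:

import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([3, 4, 5])

# 4 and 5 are taken from right and concatenated onto left's values
print(left.union(right))  # Int64Index([1, 2, 3, 4, 5], dtype='int64')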
43 changes: 0 additions & 43 deletions pandas/sparse/array.py
@@ -574,46 +574,3 @@ def _make_index(length, indices, kind):

ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
use_numexpr=False)


def _concat_compat(to_concat, axis=0):
"""
provide concatenation of an sparse/dense array of arrays each of which is a
single dtype
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
Returns
-------
a single array, preserving the combined dtypes
"""

def convert_sparse(x, axis):
# coerce to native type
if isinstance(x, SparseArray):
x = x.get_values()
x = x.ravel()
if axis > 0:
x = np.atleast_2d(x)
return x

typs = com.get_dtype_kinds(to_concat)

# we have more than one type here, so densify and regular concat
to_concat = [convert_sparse(x, axis) for x in to_concat]
result = np.concatenate(to_concat, axis=axis)

if not len(typs - set(['sparse', 'f', 'i'])):

# we can remain sparse
result = SparseArray(result.ravel())

else:

# coerce to object if needed
result = result.astype('object')

return result
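The removed helper's core move, sketched with the same 0.18-era calls it used: densify every input, concatenate, then re-wrap as SparseArray when only sparse/float/int kinds are involved:

import numpy as np
from pandas.sparse.array import SparseArray

sp = SparseArray([1.0, np.nan, 3.0])
dense = np.array([4.0, 5.0])

# densify each piece (get_values, as the deleted convert_sparse did),
# concatenate, then re-wrap so the result stays sparse
parts = [sp.get_values().ravel(), dense.ravel()]
result = SparseArray(np.concatenate(parts))
print(result.dtype, len(result))  # float64 5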