BUG: Sparse concat results in dense
sinhrks committed Apr 17, 2016
1 parent 3bed097 commit cc07705
Showing 13 changed files with 474 additions and 289 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.1.txt
@@ -120,6 +120,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseArray.to_frame()`` results in ``DataFrame``, rather than ``SparseDataFrame`` (:issue:`9850`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
- Bug in ``SparseArray.to_dense()`` incorrectly handle ``fill_value`` (:issue:`12797`)
- Bug in ``pd.concat()`` of ``SparseSeries`` results in dense (:issue:`10536`)

.. _whatsnew_0181.api:

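The whatsnew entry for :issue:`10536` above describes the user-visible fix; a minimal sketch of the corrected behavior, assuming the pandas 0.18.1 API of this commit:

import pandas as pd

s1 = pd.SparseSeries([1.0, 2.0])
s2 = pd.SparseSeries([3.0, 4.0])

# before this commit the result silently densified to a plain Series;
# with the fix the sparse subclass survives the concat
result = pd.concat([s1, s2])
print(type(result))  # expected: pandas.sparse.series.SparseSeries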
58 changes: 1 addition & 57 deletions pandas/core/categorical.py
@@ -18,7 +18,7 @@
from pandas.core.common import (
ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull,
is_dtype_equal, is_categorical_dtype, is_integer_dtype,
_possibly_infer_to_datetimelike, get_dtype_kinds, is_list_like,
_possibly_infer_to_datetimelike, is_list_like,
is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64,
_coerce_indexer_dtype)
from pandas.types.api import CategoricalDtype
@@ -1873,59 +1873,3 @@ def _convert_to_list_like(list_like):
else:
# is this reached?
return [list_like]


def _concat_compat(to_concat, axis=0):
"""Concatenate an object/categorical array of arrays, each of which is a
single dtype
Parameters
----------
to_concat : array of arrays
axis : int
Axis to provide concatenation in the current implementation this is
always 0, e.g. we only have 1D categoricals
Returns
-------
Categorical
A single array, preserving the combined dtypes
"""

def convert_categorical(x):
# coerce to object dtype
if is_categorical_dtype(x.dtype):
return x.get_values()
return x.ravel()

if get_dtype_kinds(to_concat) - set(['object', 'category']):
# convert to object type and perform a regular concat
from pandas.core.common import _concat_compat
return _concat_compat([np.array(x, copy=False, dtype=object)
for x in to_concat], axis=0)

# we could have object blocks and categoricals here
# if we only have a single categoricals then combine everything
# else its a non-compat categorical
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

# validate the categories
categories = categoricals[0]
rawcats = categories.categories
for x in categoricals[1:]:
if not categories.is_dtype_equal(x):
raise ValueError("incompatible categories in categorical concat")

# we've already checked that all categoricals are the same, so if their
# length is equal to the input then we have all the same categories
if len(categoricals) == len(to_concat):
# concating numeric types is much faster than concating object types
# and fastpath takes a shorter path through the constructor
return Categorical(np.concatenate([x.codes for x in to_concat],
axis=0),
rawcats, ordered=categoricals[0].ordered,
fastpath=True)
else:
concatted = np.concatenate(list(map(convert_categorical, to_concat)),
axis=0)
return Categorical(concatted, rawcats)
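For reference, the fast path this removed helper took when every input is a categorical with identical categories can be sketched directly against the 0.18-era Categorical constructor (fastpath was still accepted then):

import numpy as np
from pandas import Categorical

a = Categorical(['a', 'b'], categories=['a', 'b', 'c'])
b = Categorical(['c', 'a'], categories=['a', 'b', 'c'])

# identical categories on both sides -> concatenate the integer codes
# and rebuild cheaply, as the deleted branch above did
combined = Categorical(np.concatenate([a.codes, b.codes]),
                       categories=a.categories, ordered=a.ordered,
                       fastpath=True)
print(list(combined))  # ['a', 'b', 'c', 'a']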
102 changes: 0 additions & 102 deletions pandas/core/common.py
@@ -1918,108 +1918,6 @@ def _all_none(*args):
return True


def get_dtype_kinds(l):
"""
Parameters
----------
l : list of arrays
Returns
-------
a set of kinds that exist in this list of arrays
"""

typs = set()
for arr in l:

dtype = arr.dtype
if is_categorical_dtype(dtype):
typ = 'category'
elif is_sparse(arr):
typ = 'sparse'
elif is_datetimetz(arr):
typ = 'datetimetz'
elif is_datetime64_dtype(dtype):
typ = 'datetime'
elif is_timedelta64_dtype(dtype):
typ = 'timedelta'
elif is_object_dtype(dtype):
typ = 'object'
elif is_bool_dtype(dtype):
typ = 'bool'
else:
typ = dtype.kind
typs.add(typ)
return typs
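A quick sketch of what this helper reports; the import path assumes the function moved to pandas.types.concat, which the new imports elsewhere in this commit suggest:

import numpy as np
import pandas as pd
from pandas.types.concat import get_dtype_kinds  # assumed new home

arrays = [np.array([1, 2]),             # integer -> kind 'i'
          np.array([1.5, 2.5]),         # float   -> kind 'f'
          pd.Categorical(['a', 'b'])]   # categorical -> 'category'
print(get_dtype_kinds(arrays))          # e.g. set(['category', 'f', 'i'])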


def _concat_compat(to_concat, axis=0):
"""
provide concatenation of an array of arrays each of which is a single
'normalized' dtypes (in that for example, if it's object, then it is a
non-datetimelike and provide a combined dtype for the resulting array that
preserves the overall dtype if possible)
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
Returns
-------
a single array, preserving the combined dtypes
"""

# filter empty arrays
# 1-d dtypes always are included here
def is_nonempty(x):
try:
return x.shape[axis] > 0
except Exception:
return True

nonempty = [x for x in to_concat if is_nonempty(x)]

# If all arrays are empty, there's nothing to convert, just short-cut to
# the concatenation, #3121.
#
# Creating an empty array directly is tempting, but the winnings would be
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.

typs = get_dtype_kinds(to_concat)

# these are mandated to handle empties as well
if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs:
from pandas.tseries.common import _concat_compat
return _concat_compat(to_concat, axis=axis, typs=typs)

elif 'sparse' in typs:
from pandas.sparse.array import _concat_compat
return _concat_compat(to_concat, axis=axis)

elif 'category' in typs:
from pandas.core.categorical import _concat_compat
return _concat_compat(to_concat, axis=axis)

if not nonempty:
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
# cast this to float)
typs = get_dtype_kinds(to_concat)
if len(typs) != 1:

if (not len(typs - set(['i', 'u', 'f'])) or
not len(typs - set(['bool', 'i', 'u']))):
# let numpy coerce
pass
else:
# coerce to object
to_concat = [x.astype('object') for x in to_concat]

return np.concatenate(to_concat, axis=axis)
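The point of the dispatch above is that extension-backed inputs never fall through to a plain np.concatenate. A hedged sketch, again assuming the helper's new home in pandas.types.concat:

from pandas import Categorical
from pandas.types.concat import _concat_compat  # assumed new home

a = Categorical(['a', 'b'], categories=['a', 'b'])
b = Categorical(['b', 'a'], categories=['a', 'b'])

# the 'category' branch returns a Categorical rather than densifying
# to an object ndarray
out = _concat_compat([a, b])
print(type(out))  # expected: Categorical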


def _where_compat(mask, arr1, arr2):
if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE:
new_vals = np.where(mask, arr1.view('i8'), arr2.view('i8'))
3 changes: 2 additions & 1 deletion pandas/core/internals.py
@@ -30,6 +30,7 @@
from pandas.tseries.index import DatetimeIndex
from pandas.formats.printing import pprint_thing
import pandas.core.common as com
import pandas.types.concat as _concat
import pandas.core.missing as missing
import pandas.core.convert as convert
from pandas.sparse.array import _maybe_to_sparse, SparseArray
@@ -4646,7 +4647,7 @@ def concatenate_join_units(join_units, concat_axis, copy):
if copy and concat_values.base is not None:
concat_values = concat_values.copy()
else:
concat_values = com._concat_compat(to_concat, axis=concat_axis)
concat_values = _concat._concat_compat(to_concat, axis=concat_axis)

return concat_values
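The copy guard above leans on numpy view semantics: a sliced array keeps a reference to its parent in .base, so writes to it would hit the parent. A minimal illustration:

import numpy as np

vals = np.array([[1, 2, 3], [4, 5, 6]])
view = vals[:1]             # slicing yields a view, not a copy
print(view.base is vals)    # True -> writes to view would touch vals
safe = view.copy()          # the defensive copy taken above
print(safe.base is None)    # True -> safe owns its own data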

4 changes: 3 additions & 1 deletion pandas/core/reshape.py
@@ -18,6 +18,7 @@
from pandas.core.groupby import get_group_index, _compress_group_index

import pandas.core.common as com
import pandas.types.concat as _concat
import pandas.core.algorithms as algos
import pandas.algos as _algos

@@ -848,7 +849,8 @@ def lreshape(data, groups, dropna=True, label=None):
pivot_cols = []

for target, names in zip(keys, values):
mdata[target] = com._concat_compat([data[col].values for col in names])
to_concat = [data[col].values for col in names]
mdata[target] = _concat._concat_compat(to_concat)
pivot_cols.append(target)

for col in id_cols:
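For context, lreshape stacks each named group of columns into one long column through the concat helper; a small usage sketch:

import pandas as pd

df = pd.DataFrame({'team': ['A', 'B'],
                   'hr2015': [10, 12],
                   'hr2016': [11, 13]})

# the two hr columns are concatenated into a single 'hr' column;
# the id column 'team' is repeated to match
long_df = pd.lreshape(df, {'hr': ['hr2015', 'hr2016']})
print(long_df.shape)  # (4, 2)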
3 changes: 2 additions & 1 deletion pandas/indexes/base.py
@@ -17,6 +17,7 @@
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
deprecate, deprecate_kwarg)
import pandas.core.common as com
import pandas.types.concat as _concat
import pandas.core.missing as missing
import pandas.core.algorithms as algos
from pandas.formats.printing import pprint_thing
@@ -1713,7 +1714,7 @@ def union(self, other):
if len(indexer) > 0:
other_diff = algos.take_nd(other._values, indexer,
allow_fill=False)
result = com._concat_compat((self.values, other_diff))
result = _concat._concat_compat((self.values, other_diff))

try:
self.values[0] < other_diff[0]
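In effect, union keeps self's values and appends only the elements unique to other, concatenated via the same helper; a quick sketch:

import pandas as pd

left = pd.Index([1, 2, 3])
right = pd.Index([3, 4, 5])

# 4 and 5 are taken from right and concatenated onto left's values
print(left.union(right))  # Int64Index([1, 2, 3, 4, 5], dtype='int64')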
43 changes: 0 additions & 43 deletions pandas/sparse/array.py
@@ -574,46 +574,3 @@ def _make_index(length, indices, kind):

ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
use_numexpr=False)


def _concat_compat(to_concat, axis=0):
"""
provide concatenation of an sparse/dense array of arrays each of which is a
single dtype
Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
Returns
-------
a single array, preserving the combined dtypes
"""

def convert_sparse(x, axis):
# coerce to native type
if isinstance(x, SparseArray):
x = x.get_values()
x = x.ravel()
if axis > 0:
x = np.atleast_2d(x)
return x

typs = com.get_dtype_kinds(to_concat)

# we have more than one type here, so densify and regular concat
to_concat = [convert_sparse(x, axis) for x in to_concat]
result = np.concatenate(to_concat, axis=axis)

if not len(typs - set(['sparse', 'f', 'i'])):

# we can remain sparse
result = SparseArray(result.ravel())

else:

# coerce to object if needed
result = result.astype('object')

return result
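The removed helper's core move, sketched with the same 0.18-era calls it used: densify every input, concatenate, then re-wrap as SparseArray when only sparse/float/int kinds are involved:

import numpy as np
from pandas.sparse.array import SparseArray

sp = SparseArray([1.0, np.nan, 3.0])
dense = np.array([4.0, 5.0])

# densify each piece (get_values, as the deleted convert_sparse did),
# concatenate, then re-wrap so the result stays sparse
parts = [sp.get_values().ravel(), dense.ravel()]
result = SparseArray(np.concatenate(parts))
print(result.dtype, len(result))  # float64 5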