Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Allow get_dummies to return SparseDataFrame #8823

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Enhancements
df.drop(['A', 'X'], axis=1, errors='ignore')

- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`)
- The ``get_dummies`` function now accepts a ``sparse`` keyword. If set to ``True``, the returned ``DataFrame`` is sparse, e.g. a ``SparseDataFrame``. (:issue:`8823`)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

put backticks around DataFrame.
say "the return DataFrame is sparse (e.g. SparseDataFrame)"

.. _whatsnew_0161.api:

Expand Down
61 changes: 45 additions & 16 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
from pandas.core.series import Series
from pandas.core.frame import DataFrame

from pandas.core.sparse import SparseDataFrame, SparseSeries
from pandas.sparse.array import SparseArray
from pandas._sparse import IntIndex

from pandas.core.categorical import Categorical
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
isnull)
Expand Down Expand Up @@ -932,7 +936,7 @@ def melt_stub(df, stub, i, j):
return newdf.set_index([i, j])

def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
columns=None):
columns=None, sparse=False):
"""
Convert categorical variable into dummy/indicator variables

Expand All @@ -953,6 +957,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
Column names in the DataFrame to be encoded.
If `columns` is None then all the columns with
`object` or `category` dtype will be converted.
sparse : bool, default False
Whether the dummy columns should be built as sparse data (a ``SparseDataFrame``) instead of a dense ``DataFrame``.

Returns
-------
Expand Down Expand Up @@ -1039,16 +1045,17 @@ def check_len(item, name):
with_dummies = [result]
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

dummy = _get_dummies_1d(data[col], prefix=pre,
prefix_sep=sep, dummy_na=dummy_na)
dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
dummy_na=dummy_na, sparse=sparse)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
sparse=sparse)
return result


def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data), ordered=True)
levels = cat.categories
Expand All @@ -1059,19 +1066,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
index = data.index
else:
index = np.arange(len(data))
return DataFrame(index=index)

number_of_cols = len(levels)
if dummy_na:
number_of_cols += 1

dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)
if not sparse:
return DataFrame(index=index)
else:
return SparseDataFrame(index=index)

codes = cat.codes.copy()
if dummy_na:
codes[codes == -1] = len(cat.categories)
levels = np.append(cat.categories, np.nan)
else:
# reset NaN GH4446
dummy_mat[cat.codes == -1] = 0

number_of_cols = len(levels)

if prefix is not None:
dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
Expand All @@ -1084,7 +1089,31 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
else:
index = None

return DataFrame(dummy_mat, index=index, columns=dummy_cols)
if sparse:
sparse_series = {}
N = len(data)
sp_indices = [ [] for _ in range(len(dummy_cols)) ]
for ndx, code in enumerate(codes):
if code == -1:
# Blank entries if not dummy_na and code == -1, #GH4446
continue
sp_indices[code].append(ndx)

for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs),
fill_value=0)
sparse_series[col] = SparseSeries(data=sarr, index=index)

return SparseDataFrame(sparse_series, index=index, columns=dummy_cols)

else:
dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

if not dummy_na:
# reset NaN GH4446
dummy_mat[codes == -1] = 0

return DataFrame(dummy_mat, index=index, columns=dummy_cols)


def make_axis_dummies(frame, axis='minor', transform=None):
Expand Down
58 changes: 33 additions & 25 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ def test_multiindex(self):

class TestGetDummies(tm.TestCase):

sparse = False

def setUp(self):
self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
'C': [1, 2, 3]})
Expand All @@ -163,20 +165,20 @@ def test_basic(self):
expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
'b': {0: 0.0, 1: 1.0, 2: 0.0},
'c': {0: 0.0, 1: 0.0, 2: 1.0}})
assert_frame_equal(get_dummies(s_list), expected)
assert_frame_equal(get_dummies(s_series), expected)
assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected)
assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected)

expected.index = list('ABC')
assert_frame_equal(get_dummies(s_series_index), expected)
assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected)

def test_just_na(self):
just_na_list = [np.nan]
just_na_series = Series(just_na_list)
just_na_series_index = Series(just_na_list, index = ['A'])

res_list = get_dummies(just_na_list)
res_series = get_dummies(just_na_series)
res_series_index = get_dummies(just_na_series_index)
res_list = get_dummies(just_na_list, sparse=self.sparse)
res_series = get_dummies(just_na_series, sparse=self.sparse)
res_series_index = get_dummies(just_na_series_index, sparse=self.sparse)

self.assertEqual(res_list.empty, True)
self.assertEqual(res_series.empty, True)
Expand All @@ -188,20 +190,21 @@ def test_just_na(self):

def test_include_na(self):
s = ['a', 'b', np.nan]
res = get_dummies(s)
res = get_dummies(s, sparse=self.sparse)
exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0},
'b': {0: 0.0, 1: 1.0, 2: 0.0}})
assert_frame_equal(res, exp)

res_na = get_dummies(s, dummy_na=True)
# Sparse dataframes do not allow nan labelled columns, see #GH8822
res_na = get_dummies(s, dummy_na=True, sparse=self.sparse)
exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0},
'a': {0: 1.0, 1: 0.0, 2: 0.0},
'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1)
# hack (NaN handling in assert_index_equal)
exp_na.columns = res_na.columns
assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([nan], dummy_na=True)
res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse)
exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan])
assert_array_equal(res_just_na.values, exp_just_na.values)

Expand All @@ -210,21 +213,21 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
e = 'e'
eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
s = [e, eacute, eacute]
res = get_dummies(s, prefix='letter')
res = get_dummies(s, prefix='letter', sparse=self.sparse)
exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0},
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
assert_frame_equal(res, exp)

def test_dataframe_dummies_all_obj(self):
df = self.df[['A', 'B']]
result = get_dummies(df)
result = get_dummies(df, sparse=self.sparse)
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0],
'B_b': [1., 1, 0], 'B_c': [0., 0, 1]})
assert_frame_equal(result, expected)

def test_dataframe_dummies_mix_default(self):
df = self.df
result = get_dummies(df)
result = get_dummies(df, sparse=self.sparse)
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
'B_c': [0., 0, 1]})
Expand All @@ -235,18 +238,18 @@ def test_dataframe_dummies_prefix_list(self):
prefixes = ['from_A', 'from_B']
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
'C': [1, 2, 3]})
result = get_dummies(df, prefix=prefixes)
result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1],
'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0],
'from_B_c': [0., 0, 1]})
expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
'from_B_c']]
assert_frame_equal(result, expected)

def test_datafrmae_dummies_prefix_str(self):
def test_dataframe_dummies_prefix_str(self):
# not that you should do this...
df = self.df
result = get_dummies(df, prefix='bad')
result = get_dummies(df, prefix='bad', sparse=self.sparse)
expected = DataFrame([[1, 1., 0., 1., 0.],
[2, 0., 1., 1., 0.],
[3, 1., 0., 0., 1.]],
Expand All @@ -256,40 +259,40 @@ def test_datafrmae_dummies_prefix_str(self):
def test_dataframe_dummies_subset(self):
df = self.df
result = get_dummies(df, prefix=['from_A'],
columns=['A'])
columns=['A'], sparse=self.sparse)
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self):
df = self.df
result = get_dummies(df, prefix_sep='..')
result = get_dummies(df, prefix_sep='..', sparse=self.sparse)
expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
'B..c': [0., 0, 1]})
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
assert_frame_equal(result, expected)

result = get_dummies(df, prefix_sep=['..', '__'])
result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse)
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
assert_frame_equal(result, expected)

result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=self.sparse)
assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_bad_length(self):
with tm.assertRaises(ValueError):
get_dummies(self.df, prefix=['too few'])
get_dummies(self.df, prefix=['too few'], sparse=self.sparse)

def test_dataframe_dummies_prefix_sep_bad_length(self):
with tm.assertRaises(ValueError):
get_dummies(self.df, prefix_sep=['bad'])
get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse)

def test_dataframe_dummies_prefix_dict(self):
prefixes = {'A': 'from_A', 'B': 'from_B'}
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
'C': [1, 2, 3]})
result = get_dummies(df, prefix=prefixes)
result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
'C': [1, 2, 3]})
Expand All @@ -298,22 +301,22 @@ def test_dataframe_dummies_prefix_dict(self):
def test_dataframe_dummies_with_na(self):
df = self.df
df.loc[3, :] = [np.nan, np.nan, np.nan]
result = get_dummies(df, dummy_na=True)
result = get_dummies(df, dummy_na=True, sparse=self.sparse)
expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0],
'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]})
expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
'B_nan']]
assert_frame_equal(result, expected)

result = get_dummies(df, dummy_na=False)
result = get_dummies(df, dummy_na=False, sparse=self.sparse)
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
assert_frame_equal(result, expected)

def test_dataframe_dummies_with_categorical(self):
df = self.df
df['cat'] = pd.Categorical(['x', 'y', 'y'])
result = get_dummies(df)
result = get_dummies(df, sparse=self.sparse)
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
Expand All @@ -322,6 +325,11 @@ def test_dataframe_dummies_with_categorical(self):
'cat_x', 'cat_y']]
assert_frame_equal(result, expected)


class TestGetDummiesSparse(TestGetDummies):
sparse = True


class TestLreshape(tm.TestCase):

def test_pairs(self):
Expand Down