From 7173395d49e97bca597d959aa34bb471ddf09b06 Mon Sep 17 00:00:00 2001 From: Artemy Kolchinsky Date: Wed, 15 Oct 2014 14:39:51 -0400 Subject: [PATCH] ENH: Allow get_dummies to return sparse dataframe ENH: Allow get_dummies to return sparse dataframe ENH: Allow get_dummies to return sparse dataframe Fix Fix Fixes Bug in order of columns Slight speed improvement get_dummies update Release notes update Remove convert dummies test --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/core/reshape.py | 61 ++++++++++++++++++++++++--------- pandas/tests/test_reshape.py | 58 +++++++++++++++++-------------- 3 files changed, 79 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index b80e341d4156a..70fd6979ba5f9 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -48,6 +48,7 @@ Enhancements df.drop(['A', 'X'], axis=1, errors='ignore') - Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) +- The ``get_dummies`` function now accepts a ``sparse`` keyword. If set to ``True``, the returned DataFrame is sparse. (:issue:`8823`) .. 
_whatsnew_0161.api: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 291a73778197a..af98e533cb5b7 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -9,6 +9,10 @@ from pandas.core.series import Series from pandas.core.frame import DataFrame +from pandas.core.sparse import SparseDataFrame, SparseSeries +from pandas.sparse.array import SparseArray +from pandas._sparse import IntIndex + from pandas.core.categorical import Categorical from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, isnull) @@ -932,7 +936,7 @@ def melt_stub(df, stub, i, j): return newdf.set_index([i, j]) def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None): + columns=None, sparse=False): """ Convert categorical variable into dummy/indicator variables @@ -953,6 +957,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. + sparse : bool, default False + Whether the returned DataFrame should be sparse or not. 
Returns ------- @@ -1039,16 +1045,17 @@ def check_len(item, name): with_dummies = [result] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): - dummy = _get_dummies_1d(data[col], prefix=pre, - prefix_sep=sep, dummy_na=dummy_na) + dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, + dummy_na=dummy_na, sparse=sparse) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: - result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na) + result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, + sparse=sparse) return result -def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): +def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories @@ -1059,19 +1066,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): index = data.index else: index = np.arange(len(data)) - return DataFrame(index=index) - - number_of_cols = len(levels) - if dummy_na: - number_of_cols += 1 - - dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) + if not sparse: + return DataFrame(index=index) + else: + return SparseDataFrame(index=index) + codes = cat.codes.copy() if dummy_na: + codes[codes == -1] = len(cat.categories) levels = np.append(cat.categories, np.nan) - else: - # reset NaN GH4446 - dummy_mat[cat.codes == -1] = 0 + + number_of_cols = len(levels) if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) @@ -1084,7 +1089,31 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): else: index = None - return DataFrame(dummy_mat, index=index, columns=dummy_cols) + if sparse: + sparse_series = {} + N = len(data) + sp_indices = [ [] for _ in range(len(dummy_cols)) ] + for ndx, code in enumerate(codes): + if code == -1: + # Blank entries if not dummy_na and code == -1, #GH4446 + continue + sp_indices[code].append(ndx) + + for col, ixs in 
zip(dummy_cols, sp_indices): + sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), + fill_value=0) + sparse_series[col] = SparseSeries(data=sarr, index=index) + + return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) + + else: + dummy_mat = np.eye(number_of_cols).take(codes, axis=0) + + if not dummy_na: + # reset NaN GH4446 + dummy_mat[codes == -1] = 0 + + return DataFrame(dummy_mat, index=index, columns=dummy_cols) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 66f5110830c72..346c9e2598985 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -151,6 +151,8 @@ def test_multiindex(self): class TestGetDummies(tm.TestCase): + sparse = False + def setUp(self): self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) @@ -163,20 +165,20 @@ def test_basic(self): expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}, 'c': {0: 0.0, 1: 0.0, 2: 1.0}}) - assert_frame_equal(get_dummies(s_list), expected) - assert_frame_equal(get_dummies(s_series), expected) + assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) + assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) expected.index = list('ABC') - assert_frame_equal(get_dummies(s_series_index), expected) + assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected) def test_just_na(self): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index = ['A']) - res_list = get_dummies(just_na_list) - res_series = get_dummies(just_na_series) - res_series_index = get_dummies(just_na_series_index) + res_list = get_dummies(just_na_list, sparse=self.sparse) + res_series = get_dummies(just_na_series, sparse=self.sparse) + res_series_index = get_dummies(just_na_series_index, sparse=self.sparse) self.assertEqual(res_list.empty, True) 
self.assertEqual(res_series.empty, True) @@ -188,12 +190,13 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] - res = get_dummies(s) + res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) assert_frame_equal(res, exp) - res_na = get_dummies(s, dummy_na=True) + # Sparse dataframes do not allow nan labelled columns, see #GH8822 + res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, 'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1) @@ -201,7 +204,7 @@ def test_include_na(self): exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True) + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan]) assert_array_equal(res_just_na.values, exp_just_na.values) @@ -210,21 +213,21 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter') + res = get_dummies(s, prefix='letter', sparse=self.sparse) exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0}, u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}}) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) @@ -235,7 +238,7 @@ def 
test_dataframe_dummies_prefix_list(self): prefixes = ['from_A', 'from_B'] df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1]}) @@ -243,10 +246,10 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_c']] assert_frame_equal(result, expected) - def test_datafrmae_dummies_prefix_str(self): + def test_dataframe_dummies_prefix_str(self): # not that you should do this... df = self.df - result = get_dummies(df, prefix='bad') + result = get_dummies(df, prefix='bad', sparse=self.sparse) expected = DataFrame([[1, 1., 0., 1., 0.], [2, 0., 1., 1., 0.], [3, 1., 0., 0., 1.]], @@ -256,40 +259,40 @@ def test_datafrmae_dummies_prefix_str(self): def test_dataframe_dummies_subset(self): df = self.df result = get_dummies(df, prefix=['from_A'], - columns=['A']) + columns=['A'], sparse=self.sparse) expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): df = self.df - result = get_dummies(df, prefix_sep='..') + result = get_dummies(df, prefix_sep='..', sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1], 'A..b': [0., 1, 0], 'B..b': [1., 1, 0], 'B..c': [0., 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__']) + result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}) + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=self.sparse) assert_frame_equal(result, expected) 
def test_dataframe_dummies_prefix_bad_length(self): with tm.assertRaises(ValueError): - get_dummies(self.df, prefix=['too few']) + get_dummies(self.df, prefix=['too few'], sparse=self.sparse) def test_dataframe_dummies_prefix_sep_bad_length(self): with tm.assertRaises(ValueError): - get_dummies(self.df, prefix_sep=['bad']) + get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse) def test_dataframe_dummies_prefix_dict(self): prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1], 'C': [1, 2, 3]}) @@ -298,7 +301,7 @@ def test_dataframe_dummies_prefix_dict(self): def test_dataframe_dummies_with_na(self): df = self.df df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True) + result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0], 'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0], 'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]}) @@ -306,14 +309,14 @@ def test_dataframe_dummies_with_na(self): 'B_nan']] assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False) + result = get_dummies(df, dummy_na=False, sparse=self.sparse) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) def test_dataframe_dummies_with_categorical(self): df = self.df df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1], 'cat_x': [1., 0, 0], @@ -322,6 +325,11 @@ def test_dataframe_dummies_with_categorical(self): 'cat_x', 'cat_y']] assert_frame_equal(result, expected) + 
+class TestGetDummiesSparse(TestGetDummies): + sparse = True + + class TestLreshape(tm.TestCase): def test_pairs(self):