diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 1209c4a8d6be8..1b81d83bb76c7 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -240,7 +240,7 @@ values will be set to ``NaN``. df3 df3.unstack() -.. versionadded: 0.18.0 +.. versionadded:: 0.18.0 Alternatively, unstack takes an optional ``fill_value`` argument, for specifying the value of missing data. @@ -634,6 +634,17 @@ When a column contains only one level, it will be omitted in the result. pd.get_dummies(df, drop_first=True) +By default new columns will have ``np.uint8`` dtype. To choose another dtype use ``dtype`` argument: + +.. ipython:: python + + df = pd.DataFrame({'A': list('abc'), 'B': [1.1, 2.2, 3.3]}) + + pd.get_dummies(df, dtype=bool).dtypes + +.. versionadded:: 0.22.0 + + .. _reshaping.factorize: Factorizing values diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index cbd880cc90c67..4a4d60b4dfbb2 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -17,6 +17,21 @@ New features - - + +.. _whatsnew_0210.enhancements.get_dummies_dtype: + +``get_dummies`` now supports ``dtype`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) + +.. ipython:: python + + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + pd.get_dummies(df, columns=['c']).dtypes + pd.get_dummies(df, columns=['c'], dtype=bool).dtypes + + .. _whatsnew_0220.enhancements.other: Other Enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cbc259ba7bceb..782971a742b54 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -965,7 +965,7 @@ def _set_axis_name(self, name, axis=0, inplace=False): inplace : bool whether to modify `self` directly or return a copy - .. versionadded: 0.21.0 + .. versionadded:: 0.21.0 Returns ------- diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8b656d8ba25e9..5bb86885c0875 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_bool_dtype, - needs_i8_conversion, is_sparse) + needs_i8_conversion, is_sparse, is_object_dtype) from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.missing import notna @@ -697,7 +697,7 @@ def _convert_level_number(level_num, columns): def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None, sparse=False, drop_first=False): + columns=None, sparse=False, drop_first=False, dtype=None): """ Convert categorical variable into dummy/indicator variables @@ -728,6 +728,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, .. versionadded:: 0.18.0 + dtype : dtype, default np.uint8 + Data type for new columns. Only a single dtype is allowed. + + .. versionadded:: 0.22.0 + Returns ------- dummies : DataFrame or SparseDataFrame @@ -783,6 +788,12 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 3 0 0 4 0 0 + >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) + a b c + 0 1.0 0.0 0.0 + 1 0.0 1.0 0.0 + 2 0.0 0.0 1.0 + See Also -------- Series.str.get_dummies @@ -835,20 +846,29 @@ def check_len(item, name): dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, - drop_first=drop_first) + drop_first=drop_first, dtype=dtype) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, - sparse=sparse, drop_first=drop_first) + sparse=sparse, + drop_first=drop_first, + dtype=dtype) return result def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, - sparse=False, drop_first=False): + sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) + if dtype is None: + dtype = np.uint8 + dtype = np.dtype(dtype) + + if is_object_dtype(dtype): + raise ValueError("dtype=object is not a valid dtype for get_dummies") + def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index @@ -903,18 +923,18 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), + sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, - dtype=np.uint8) + dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, - dtype=np.uint8) + dtype=dtype) return out else: - dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) + dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 2722c3e92d85a..5d4aa048ae303 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -218,35 +218,51 @@ def test_multiindex(self): class TestGetDummies(object): - sparse = False - - def setup_method(self, method): - self.df = DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) + @pytest.fixture + def df(self): + return DataFrame({'A': ['a', 'b', 'a'], + 'B': ['b', 'b', 'c'], + 'C': [1, 2, 3]}) + + @pytest.fixture(params=['uint8', 'i8', np.float64, bool, None]) + def dtype(self, request): + return np.dtype(request.param) + + @pytest.fixture(params=['dense', 'sparse']) + def sparse(self, request): + # params are strings to simplify reading test results, + # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True] + return request.param == 'sparse' + + def effective_dtype(self, dtype): + if dtype is None: + return np.uint8 + return dtype + + def test_raises_on_dtype_object(self, df): + with pytest.raises(ValueError): + get_dummies(df, dtype='object') - def test_basic(self): + def test_basic(self, sparse, dtype): s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'a': {0: 1, - 1: 0, - 2: 0}, - 'b': {0: 0, - 1: 1, - 2: 0}, - 'c': {0: 0, - 1: 0, - 2: 1}}, dtype=np.uint8) - assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) - assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) + expected = DataFrame({'a': [1, 0, 0], + 'b': [0, 1, 0], + 'c': [0, 0, 1]}, + dtype=self.effective_dtype(dtype)) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) + assert_frame_equal(result, expected) + + result = get_dummies(s_series, sparse=sparse, dtype=dtype) + assert_frame_equal(result, expected) expected.index = list('ABC') - assert_frame_equal( - get_dummies(s_series_index, sparse=self.sparse), expected) + result = get_dummies(s_series_index, sparse=sparse, dtype=dtype) + assert_frame_equal(result, expected) - def test_basic_types(self): + def test_basic_types(self, sparse, dtype): # GH 10531 s_list = list('abc') s_series = Series(s_list) @@ -257,38 +273,43 @@ def test_basic_types(self): expected = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, - dtype='uint8', + dtype=self.effective_dtype(dtype), columns=list('abc')) - if not self.sparse: + if not sparse: compare = tm.assert_frame_equal else: expected = expected.to_sparse(fill_value=0, kind='integer') compare = tm.assert_sp_frame_equal - result = get_dummies(s_list, sparse=self.sparse) + result = get_dummies(s_list, sparse=sparse, dtype=dtype) compare(result, expected) - result = get_dummies(s_series, sparse=self.sparse) + result = get_dummies(s_series, sparse=sparse, dtype=dtype) compare(result, expected) - result = get_dummies(s_df, sparse=self.sparse, columns=s_df.columns) + result = get_dummies(s_df, columns=s_df.columns, + sparse=sparse, dtype=dtype) tm.assert_series_equal(result.get_dtype_counts(), - Series({'uint8': 8})) + Series({dtype.name: 8})) - result = get_dummies(s_df, sparse=self.sparse, columns=['a']) - expected = Series({'uint8': 3, 'int64': 1, 'object': 1}).sort_values() + result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype) + dtype_name = self.effective_dtype(dtype).name + + expected_counts = {'int64': 1, 'object': 1} + expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0) + + expected = Series(expected_counts).sort_values() tm.assert_series_equal(result.get_dtype_counts().sort_values(), expected) - def test_just_na(self): + def test_just_na(self, sparse): just_na_list = [np.nan] just_na_series = Series(just_na_list) just_na_series_index = Series(just_na_list, index=['A']) - res_list = get_dummies(just_na_list, sparse=self.sparse) - res_series = get_dummies(just_na_series, sparse=self.sparse) - res_series_index = get_dummies(just_na_series_index, - sparse=self.sparse) + res_list = get_dummies(just_na_list, sparse=sparse) + res_series = get_dummies(just_na_series, sparse=sparse) + res_series_index = get_dummies(just_na_series_index, sparse=sparse) assert res_list.empty assert res_series.empty @@ -298,216 +319,209 @@ def test_just_na(self): assert res_series.index.tolist() == [0] assert res_series_index.index.tolist() == ['A'] - def test_include_na(self): + def test_include_na(self, sparse, dtype): + if sparse: + pytest.xfail(reason='nan in index is problematic (GH 16894)') + s = ['a', 'b', np.nan] - res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) + res = get_dummies(s, sparse=sparse, dtype=dtype) + exp = DataFrame({'a': [1, 0, 0], + 'b': [0, 1, 0]}, + dtype=self.effective_dtype(dtype)) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 - res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, - 'a': {0: 1, 1: 0, 2: 0}, - 'b': {0: 0, 1: 1, 2: 0}}, - dtype=np.uint8) + res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype) + exp_na = DataFrame({nan: [0, 0, 1], + 'a': [1, 0, 0], + 'b': [0, 1, 0]}, + dtype=self.effective_dtype(dtype)) exp_na = exp_na.reindex(['a', 'b', nan], axis=1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) + res_just_na = get_dummies([nan], dummy_na=True, + sparse=sparse, dtype=dtype) exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], - dtype=np.uint8) + dtype=self.effective_dtype(dtype)) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) - def test_unicode(self - ): # See GH 6885 - get_dummies chokes on unicode values + def test_unicode(self, sparse): + # See GH 6885 - get_dummies chokes on unicode values import unicodedata e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter', sparse=self.sparse) - exp = DataFrame({'letter_e': {0: 1, - 1: 0, - 2: 0}, - u('letter_%s') % eacute: {0: 0, - 1: 1, - 2: 1}}, + res = get_dummies(s, prefix='letter', sparse=sparse) + exp = DataFrame({'letter_e': [1, 0, 0], + u('letter_%s') % eacute: [0, 1, 1]}, dtype=np.uint8) assert_frame_equal(res, exp) - def test_dataframe_dummies_all_obj(self): - df = self.df[['A', 'B']] - result = get_dummies(df, sparse=self.sparse) + def test_dataframe_dummies_all_obj(self, df, sparse): + df = df[['A', 'B']] + result = get_dummies(df, sparse=sparse) expected = DataFrame({'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, + dtype=np.uint8) assert_frame_equal(result, expected) - def test_dataframe_dummies_mix_default(self): - df = self.df - result = get_dummies(df, sparse=self.sparse) + def test_dataframe_dummies_mix_default(self, df, sparse, dtype): + result = get_dummies(df, sparse=sparse, dtype=dtype) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], 'B_c': [0, 0, 1]}) cols = ['A_a', 'A_b', 'B_b', 'B_c'] - expected[cols] = expected[cols].astype(np.uint8) + expected[cols] = expected[cols].astype(dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_list(self): + def test_dataframe_dummies_prefix_list(self, df, sparse): prefixes = ['from_A', 'from_B'] - df = DataFrame({'A': ['a', 'b', 'a'], - 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes, sparse=self.sparse) + result = get_dummies(df, prefix=prefixes, sparse=sparse) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], - 'from_B_c': [0, 0, 1]}) - cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', - 'from_B_c']] + 'from_B_c': [0, 0, 1]}, + dtype=np.uint8) + expected[['C']] = df[['C']] + expected = expected[['C', 'from_A_a', 'from_A_b', + 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_str(self): + def test_dataframe_dummies_prefix_str(self, df, sparse): # not that you should do this... - df = self.df - result = get_dummies(df, prefix='bad', sparse=self.sparse) + result = get_dummies(df, prefix='bad', sparse=sparse) + bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c'] expected = DataFrame([[1, 1, 0, 1, 0], [2, 0, 1, 1, 0], [3, 1, 0, 0, 1]], - columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], + columns=['C'] + bad_columns, dtype=np.uint8) expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) - def test_dataframe_dummies_subset(self): - df = self.df + def test_dataframe_dummies_subset(self, df, sparse): result = get_dummies(df, prefix=['from_A'], columns=['A'], - sparse=self.sparse) + sparse=sparse) expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], - 'C': [1, 2, 3]}) - cols = ['from_A_a', 'from_A_b'] - expected[cols] = expected[cols].astype(np.uint8) + 'C': [1, 2, 3]}, dtype=np.uint8) + expected[['C']] = df[['C']] assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_sep(self): - df = self.df - result = get_dummies(df, prefix_sep='..', sparse=self.sparse) + def test_dataframe_dummies_prefix_sep(self, df, sparse): + result = get_dummies(df, prefix_sep='..', sparse=sparse) expected = DataFrame({'C': [1, 2, 3], 'A..a': [1, 0, 1], 'A..b': [0, 1, 0], 'B..b': [1, 1, 0], - 'B..c': [0, 0, 1]}) + 'B..c': [0, 0, 1]}, + dtype=np.uint8) + expected[['C']] = df[['C']] expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] - cols = expected.columns[1:] - expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) + result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse) expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'}) assert_frame_equal(result, expected) - result = get_dummies(df, prefix_sep={'A': '..', - 'B': '__'}, sparse=self.sparse) + result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, + sparse=sparse) assert_frame_equal(result, expected) - def test_dataframe_dummies_prefix_bad_length(self): + def test_dataframe_dummies_prefix_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(self.df, prefix=['too few'], sparse=self.sparse) + get_dummies(df, prefix=['too few'], sparse=sparse) - def test_dataframe_dummies_prefix_sep_bad_length(self): + def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse): with pytest.raises(ValueError): - get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse) + get_dummies(df, prefix_sep=['bad'], sparse=sparse) - def test_dataframe_dummies_prefix_dict(self): + def test_dataframe_dummies_prefix_dict(self, sparse): prefixes = {'A': 'from_A', 'B': 'from_B'} df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes, sparse=self.sparse) + result = get_dummies(df, prefix=prefixes, sparse=sparse) + expected = DataFrame({'from_A_a': [1, 0, 1], 'from_A_b': [0, 1, 0], 'from_B_b': [1, 1, 0], 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) - cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] - expected[cols] = expected[cols].astype(np.uint8) + + columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + expected[columns] = expected[columns].astype(np.uint8) assert_frame_equal(result, expected) - def test_dataframe_dummies_with_na(self): - df = self.df + def test_dataframe_dummies_with_na(self, df, sparse, dtype): df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, sparse=self.sparse) + result = get_dummies(df, dummy_na=True, + sparse=sparse, dtype=dtype).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1, 0, 1, 0], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], 'B_b': [1, 1, 0, 0], 'B_c': [0, 0, 1, 0], - 'B_nan': [0, 0, 0, 1]}) - cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] - expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'A_a', 'A_b', 'A_nan', - 'B_b', 'B_c', 'B_nan']] + 'B_nan': [0, 0, 0, 1]}).sort_index(axis=1) + + e_dtype = self.effective_dtype(dtype) + columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[columns] = expected[columns].astype(e_dtype) assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, sparse=self.sparse) + result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) - def test_dataframe_dummies_with_categorical(self): - df = self.df + def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, sparse=self.sparse) + result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1, 0, 1], 'A_b': [0, 1, 0], 'B_b': [1, 1, 0], 'B_c': [0, 0, 1], 'cat_x': [1, 0, 0], - 'cat_y': [0, 1, 1]}) - cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] - expected[cols] = expected[cols].astype(np.uint8) - expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', - 'cat_x', 'cat_y']] + 'cat_y': [0, 1, 1]}).sort_index(axis=1) + + columns = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] + effective_dtype = self.effective_dtype(dtype) + expected[columns] = expected[columns].astype(effective_dtype) + expected.sort_index(axis=1) assert_frame_equal(result, expected) - def test_basic_drop_first(self): + def test_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case s_list = list('abc') s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'b': {0: 0, - 1: 1, - 2: 0}, - 'c': {0: 0, - 1: 0, - 2: 1}}, dtype=np.uint8) + expected = DataFrame({'b': [0, 1, 0], + 'c': [0, 0, 1]}, + dtype=np.uint8) - result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + result = get_dummies(s_list, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) expected.index = list('ABC') - result = get_dummies(s_series_index, sparse=self.sparse, - drop_first=True) + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - def test_basic_drop_first_one_level(self): + def test_basic_drop_first_one_level(self, sparse): # Test the case that categorical variable only has one level. s_list = list('aaa') s_series = Series(s_list) @@ -515,53 +529,48 @@ def test_basic_drop_first_one_level(self): expected = DataFrame(index=np.arange(3)) - result = get_dummies(s_list, sparse=self.sparse, drop_first=True) + result = get_dummies(s_list, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - result = get_dummies(s_series, sparse=self.sparse, drop_first=True) + result = get_dummies(s_series, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) expected = DataFrame(index=list('ABC')) - result = get_dummies(s_series_index, sparse=self.sparse, - drop_first=True) + result = get_dummies(s_series_index, drop_first=True, sparse=sparse) assert_frame_equal(result, expected) - def test_basic_drop_first_NA(self): + def test_basic_drop_first_NA(self, sparse): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] - res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) - exp = DataFrame({'b': {0: 0, - 1: 1, - 2: 0}}, dtype=np.uint8) + res = get_dummies(s_NA, drop_first=True, sparse=sparse) + exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8) assert_frame_equal(res, exp) - res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, - drop_first=True) - exp_na = DataFrame({'b': {0: 0, - 1: 1, - 2: 0}, - nan: {0: 0, - 1: 0, - 2: 1}}, dtype=np.uint8).reindex( - ['b', nan], axis=1) + res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, + sparse=sparse) + exp_na = DataFrame( + {'b': [0, 1, 0], + nan: [0, 0, 1]}, + dtype=np.uint8).reindex(['b', nan], axis=1) assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse, - drop_first=True) + res_just_na = get_dummies([nan], dummy_na=True, drop_first=True, + sparse=sparse) exp_just_na = DataFrame(index=np.arange(1)) assert_frame_equal(res_just_na, exp_just_na) - def test_dataframe_dummies_drop_first(self): - df = self.df[['A', 'B']] - result = get_dummies(df, sparse=self.sparse, drop_first=True) + def test_dataframe_dummies_drop_first(self, df, sparse): + df = df[['A', 'B']] + result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({'A_b': [0, 1, 0], - 'B_c': [0, 0, 1]}, dtype=np.uint8) + 'B_c': [0, 0, 1]}, + dtype=np.uint8) assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_categorical(self): - df = self.df + def test_dataframe_dummies_drop_first_with_categorical( + self, df, sparse, dtype): df['cat'] = pd.Categorical(['x', 'y', 'y']) - result = get_dummies(df, sparse=self.sparse, drop_first=True) + result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({'C': [1, 2, 3], 'A_b': [0, 1, 0], 'B_c': [0, 0, 1], @@ -571,11 +580,10 @@ def test_dataframe_dummies_drop_first_with_categorical(self): expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) - def test_dataframe_dummies_drop_first_with_na(self): - df = self.df + def test_dataframe_dummies_drop_first_with_na(self, df, sparse): df.loc[3, :] = [np.nan, np.nan, np.nan] - result = get_dummies(df, dummy_na=True, sparse=self.sparse, - drop_first=True) + result = get_dummies(df, dummy_na=True, drop_first=True, + sparse=sparse).sort_index(axis=1) expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_b': [0, 1, 0, 0], 'A_nan': [0, 0, 0, 1], @@ -583,30 +591,34 @@ def test_dataframe_dummies_drop_first_with_na(self): 'B_nan': [0, 0, 0, 1]}) cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] expected[cols] = expected[cols].astype(np.uint8) - - expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] + expected = expected.sort_index(axis=1) assert_frame_equal(result, expected) - result = get_dummies(df, dummy_na=False, sparse=self.sparse, - drop_first=True) + result = get_dummies(df, dummy_na=False, drop_first=True, + sparse=sparse) expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) def test_int_int(self): data = Series([1, 2, 1]) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], + expected = DataFrame([[1, 0], + [0, 1], + [1, 0]], + columns=[1, 2], dtype=np.uint8) tm.assert_frame_equal(result, expected) data = Series(pd.Categorical(['a', 'b', 'a'])) result = pd.get_dummies(data) - expected = DataFrame([[1, 0], [0, 1], [1, 0]], + expected = DataFrame([[1, 0], + [0, 1], + [1, 0]], columns=pd.Categorical(['a', 'b']), dtype=np.uint8) tm.assert_frame_equal(result, expected) - def test_int_df(self): + def test_int_df(self, dtype): data = DataFrame( {'A': [1, 2, 1], 'B': pd.Categorical(['a', 'b', 'a']), @@ -620,34 +632,28 @@ def test_int_df(self): [2, 2., 0, 1, 0, 1], [1, 1., 1, 0, 1, 0] ], columns=columns) - expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) - result = pd.get_dummies(data, columns=['A', 'B']) + expected[columns[2:]] = expected[columns[2:]].astype(dtype) + result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype) tm.assert_frame_equal(result, expected) - def test_dataframe_dummies_preserve_categorical_dtype(self): + def test_dataframe_dummies_preserve_categorical_dtype(self, dtype): # GH13854 for ordered in [False, True]: cat = pd.Categorical(list("xy"), categories=list("xyz"), ordered=ordered) - result = get_dummies(cat) + result = get_dummies(cat, dtype=dtype) - data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + data = np.array([[1, 0, 0], [0, 1, 0]], + dtype=self.effective_dtype(dtype)) cols = pd.CategoricalIndex(cat.categories, categories=cat.categories, ordered=ordered) - expected = DataFrame(data, columns=cols) + expected = DataFrame(data, columns=cols, + dtype=self.effective_dtype(dtype)) tm.assert_frame_equal(result, expected) -class TestGetDummiesSparse(TestGetDummies): - sparse = True - - @pytest.mark.xfail(reason='nan in index is problematic (GH 16894)') - def test_include_na(self): - super(TestGetDummiesSparse, self).test_include_na() - - class TestMakeAxisDummies(object): def test_preserve_categorical_dtype(self):