Skip to content

Commit

Permalink
ENH: GH12042 Add parameter drop_first to get_dummies to get n-1 var…
Browse files Browse the repository at this point in the history
…iables out of n levels.

closes pandas-dev#12042. Sometimes it's useful to keep only n-1 variables
out of n categorical levels.

Author: Bran Yang <yangbo.84@gmail.com>

Closes pandas-dev#12092 from BranYang/master and squashes the following commits:

0528c57 [Bran Yang] Compare with empty DataFrame, not just check empty
0d99c2a [Bran Yang] Test the case that `drop_first` is on and categorical variable only has one level.
45f14e8 [Bran Yang] ENH: GH12042 Add parameter `drop_first` to get_dummies to get k-1 variables out of n levels.
  • Loading branch information
BranYang authored and jreback committed Feb 8, 2016
1 parent 1c51051 commit 62363d2
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 6 deletions.
26 changes: 26 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,32 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict
.. versionadded:: 0.18.0

Sometimes it will be useful to only keep k-1 levels of a categorical
variable to avoid collinearity when feeding the result to statistical models.
You can switch to this mode by turning on ``drop_first``.

.. ipython:: python
s = pd.Series(list('abcaa'))
pd.get_dummies(s)
pd.get_dummies(s, drop_first=True)
When a column contains only one level, it will be omitted in the result.

.. ipython:: python
df = pd.DataFrame({'A':list('aaaaa'),'B':list('ababc')})
pd.get_dummies(df)
pd.get_dummies(df, drop_first=True)
Factorizing values
------------------

Expand Down
48 changes: 42 additions & 6 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,7 +949,7 @@ def melt_stub(df, stub, i, j):


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
columns=None, sparse=False):
columns=None, sparse=False, drop_first=False):
"""
Convert categorical variable into dummy/indicator variables
Expand All @@ -976,7 +976,11 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
Otherwise returns a DataFrame with some SparseBlocks.
.. versionadded:: 0.16.1
drop_first : bool, default False
Whether to get k-1 dummies out of k categorical levels by removing the
first level.
.. versionadded:: 0.18.0
Returns
-------
dummies : DataFrame or SparseDataFrame
Expand Down Expand Up @@ -1016,6 +1020,21 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
1 2 0 1 1 0 0
2 3 1 0 0 0 1
>>> pd.get_dummies(pd.Series(list('abcaa')))
a b c
0 1 0 0
1 0 1 0
2 0 0 1
3 1 0 0
4 1 0 0
>>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
b c
0 0 0
1 1 0
2 0 1
3 0 0
4 0 0
See also ``Series.str.get_dummies``.
"""
Expand Down Expand Up @@ -1065,23 +1084,23 @@ def check_len(item, name):
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):

dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep,
dummy_na=dummy_na, sparse=sparse)
dummy_na=dummy_na, sparse=sparse,
drop_first=drop_first)
with_dummies.append(dummy)
result = concat(with_dummies, axis=1)
else:
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na,
sparse=sparse)
sparse=sparse, drop_first=drop_first)
return result


def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
sparse=False):
sparse=False, drop_first=False):
# Series avoids inconsistent NaN handling
cat = Categorical.from_array(Series(data), ordered=True)
levels = cat.categories

# if all NaN
if not dummy_na and len(levels) == 0:
def get_empty_Frame(data, sparse):
if isinstance(data, Series):
index = data.index
else:
Expand All @@ -1091,11 +1110,19 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
else:
return SparseDataFrame(index=index)

# if all NaN
if not dummy_na and len(levels) == 0:
return get_empty_Frame(data, sparse)

codes = cat.codes.copy()
if dummy_na:
codes[codes == -1] = len(cat.categories)
levels = np.append(cat.categories, np.nan)

# if dummy_na, we just fake a nan level. drop_first will drop it again
if drop_first and len(levels) == 1:
return get_empty_Frame(data, sparse)

number_of_cols = len(levels)

if prefix is not None:
Expand All @@ -1118,6 +1145,11 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
continue
sp_indices[code].append(ndx)

if drop_first:
# remove first categorical level to avoid perfect collinearity
# GH12042
sp_indices = sp_indices[1:]
dummy_cols = dummy_cols[1:]
for col, ixs in zip(dummy_cols, sp_indices):
sarr = SparseArray(np.ones(len(ixs)),
sparse_index=IntIndex(N, ixs), fill_value=0)
Expand All @@ -1132,6 +1164,10 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
# reset NaN GH4446
dummy_mat[codes == -1] = 0

if drop_first:
# remove first GH12042
dummy_mat = dummy_mat[:, 1:]
dummy_cols = dummy_cols[1:]
return DataFrame(dummy_mat, index=index, columns=dummy_cols)


Expand Down
105 changes: 105 additions & 0 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,111 @@ def test_dataframe_dummies_with_categorical(self):
]]
assert_frame_equal(result, expected)

# GH12042 Add a new parameter `drop_first` to avoid collinearity
def test_basic_drop_first(self):
    # Basic case: three distinct values, the expected frame only carries
    # the 'b' and 'c' indicator columns.
    letters = list('abc')
    labelled = Series(letters, list('ABC'))

    expected = DataFrame({'b': {0: 0.0, 1: 1.0, 2: 0.0},
                          'c': {0: 0.0, 1: 0.0, 2: 1.0}})

    # A plain list and a default-indexed Series share one expectation.
    for data in (letters, Series(letters)):
        result = get_dummies(data, sparse=self.sparse, drop_first=True)
        assert_frame_equal(result, expected)

    # A Series with a custom index: the expectation is relabelled to match.
    expected.index = list('ABC')
    result = get_dummies(labelled, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

def test_basic_drop_first_one_level(self):
    # Categorical variable with a single level: the expected result is a
    # frame that has an index but no columns.
    repeated = list('aaa')
    labelled = Series(repeated, list('ABC'))

    expected = DataFrame(index=np.arange(3))
    for data in (repeated, Series(repeated)):
        result = get_dummies(data, sparse=self.sparse, drop_first=True)
        assert_frame_equal(result, expected)

    # Same check with a custom index on the input.
    expected = DataFrame(index=list('ABC'))
    result = get_dummies(labelled, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

def test_basic_drop_first_NA(self):
    # NA handling combined with drop_first
    values = ['a', 'b', np.nan]

    # Default dummy_na: only the 'b' column is expected.
    expected = DataFrame({'b': {0: 0.0, 1: 1.0, 2: 0.0}})
    result = get_dummies(values, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

    # dummy_na=True: a NaN indicator column is expected after 'b'.
    expected_na = DataFrame({'b': {0: 0.0, 1: 1.0, 2: 0.0},
                             nan: {0: 0.0, 1: 0.0, 2: 1.0}})
    expected_na = expected_na.reindex_axis(['b', nan], 1)
    result_na = get_dummies(values, dummy_na=True, sparse=self.sparse,
                            drop_first=True)
    assert_frame_equal(result_na, expected_na)

    # Input holding nothing but NaN: expected frame has no columns.
    expected_just_na = DataFrame(index=np.arange(1))
    result_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse,
                                 drop_first=True)
    assert_frame_equal(result_just_na, expected_just_na)

def test_dataframe_dummies_drop_first(self):
    # Encode only columns A and B of the fixture; A_b and B_c are the
    # columns expected in the result.
    frame = self.df[['A', 'B']]
    expected = DataFrame({'A_b': [0., 1, 0],
                          'B_c': [0., 0, 1]})
    result = get_dummies(frame, sparse=self.sparse, drop_first=True)
    assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(self):
    # Add a Categorical column to the fixture before encoding it.
    frame = self.df
    frame['cat'] = pd.Categorical(['x', 'y', 'y'])

    result = get_dummies(frame, sparse=self.sparse, drop_first=True)

    # Build the expectation and pin its column order in one step.
    column_order = ['C', 'A_b', 'B_c', 'cat_y']
    expected = DataFrame({'C': [1, 2, 3],
                          'A_b': [0., 1, 0],
                          'B_c': [0., 0, 1],
                          'cat_y': [0., 1, 1]})[column_order]
    assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_na(self):
    # Append an all-NaN row (label 3) to the fixture before encoding.
    frame = self.df
    frame.loc[3, :] = [np.nan] * 3

    # dummy_na=True: each encoded column also gets a *_nan indicator.
    with_na = DataFrame({'C': [1, 2, 3, np.nan],
                         'A_b': [0., 1, 0, 0],
                         'A_nan': [0., 0, 0, 1],
                         'B_c': [0., 0, 1, 0],
                         'B_nan': [0., 0, 0, 1]})
    with_na = with_na[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']]
    result = get_dummies(frame, dummy_na=True, sparse=self.sparse,
                         drop_first=True)
    assert_frame_equal(result, with_na)

    # dummy_na=False: same expectation minus the *_nan columns.
    without_na = with_na[['C', 'A_b', 'B_c']]
    result = get_dummies(frame, dummy_na=False, sparse=self.sparse,
                         drop_first=True)
    assert_frame_equal(result, without_na)


class TestGetDummiesSparse(TestGetDummies):
sparse = True
Expand Down

0 comments on commit 62363d2

Please sign in to comment.