diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
index 9044b080c45f9..07634811370c7 100644
--- a/asv_bench/benchmarks/reshape.py
+++ b/asv_bench/benchmarks/reshape.py
@@ -1,7 +1,9 @@
+import string
 from itertools import product
 
 import numpy as np
 from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long
+import pandas as pd
 
 from .pandas_vb_common import setup  # noqa
 
@@ -132,3 +134,27 @@ def setup(self):
 
     def time_pivot_table(self):
         self.df.pivot_table(index='key1', columns=['key2', 'key3'])
+
+
+class GetDummies(object):
+    """Benchmark ``pd.get_dummies`` on a 1-D categorical Series.
+
+    The same input is timed with ``sparse=False`` and ``sparse=True``.
+    """
+
+    goal_time = 0.2
+
+    def setup(self):
+        # Plain integer literal: ``1_000_000`` (PEP 515) is a SyntaxError
+        # before Python 3.6.
+        n_rows = 1000000
+        categories = list(string.ascii_letters[:12])
+        s = pd.Series(np.random.choice(categories, size=n_rows),
+                      dtype=pd.api.types.CategoricalDtype(categories))
+        self.s = s
+
+    def time_get_dummies_1d(self):
+        pd.get_dummies(self.s, sparse=False)
+
+    def time_get_dummies_1d_sparse(self):
+        pd.get_dummies(self.s, sparse=True)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 4b86f9c9dc7ef..8fe3023e9537c 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -408,7 +408,7 @@ Performance Improvements
 - Improved performance of :meth:`HDFStore.groups` (and dependent functions like
   :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster)
   (:issue:`21372`)
--
+- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
 
 .. 
_whatsnew_0240.docs:
 
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 2f2dc1264e996..f9ab813855f47 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -940,10 +940,12 @@ def get_empty_Frame(data, sparse):
         sparse_series = {}
         N = len(data)
         sp_indices = [[] for _ in range(len(dummy_cols))]
-        for ndx, code in enumerate(codes):
-            if code == -1:
-                # Blank entries if not dummy_na and code == -1, #GH4446
-                continue
+        # Skip blank entries (code == -1 when not dummy_na), GH4446
+        mask = codes != -1
+        codes = codes[mask]
+        n_idx = np.flatnonzero(mask)
+
+        for ndx, code in zip(n_idx, codes):
             sp_indices[code].append(ndx)
 
         if drop_first: