Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Series.nlargest/nsmallest take_last. #10920

Merged
merged 1 commit into from
Aug 29, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ Deprecations

- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`6511`, :issue:`8505`)
- ``Series.nsmallest`` and ``nlargest``'s ``take_last`` keyword was deprecated in favor of ``keep``. (:issue:`10792`)
- ``DataFrame.combineAdd`` and ``DataFrame.combineMult`` are deprecated. They
can easily be replaced by using the ``add`` and ``mul`` methods:
``DataFrame.add(other, fill_value=0)`` and ``DataFrame.mul(other, fill_value=1.)``
Expand Down
33 changes: 18 additions & 15 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,24 +453,24 @@ def group_position(*args):
_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}


def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
ns, = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind='mergesort')][:n]

if take_last:
if keep == 'last':
# reverse indices
return narr - 1 - inds
return inds
else:
return inds


def nsmallest(arr, n, take_last=False):
def nsmallest(arr, n, keep='first'):
'''
Find the indices of the n smallest values of a numpy array.

Note: Fails silently with NaN.

'''
if take_last:
if keep == 'last':
arr = arr[::-1]

narr = len(arr)
Expand All @@ -480,22 +480,22 @@ def nsmallest(arr, n, take_last=False):
arr = arr.view(_dtype_map.get(sdtype, sdtype))

kth_val = algos.kth_smallest(arr.copy(), n - 1)
return _finalize_nsmallest(arr, kth_val, n, take_last, narr)
return _finalize_nsmallest(arr, kth_val, n, keep, narr)


def nlargest(arr, n, take_last=False):
def nlargest(arr, n, keep='first'):
"""
Find the indices of the n largest values of a numpy array.

Note: Fails silently with NaN.
"""
sdtype = str(arr.dtype)
arr = arr.view(_dtype_map.get(sdtype, sdtype))
return nsmallest(-arr, n, take_last=take_last)
return nsmallest(-arr, n, keep=keep)


def select_n_slow(dropped, n, take_last, method):
reverse_it = take_last or method == 'nlargest'
def select_n_slow(dropped, n, keep, method):
reverse_it = (keep == 'last' or method == 'nlargest')
ascending = method == 'nsmallest'
slc = np.s_[::-1] if reverse_it else np.s_[:]
return dropped[slc].sort_values(ascending=ascending).head(n)
Expand All @@ -504,13 +504,13 @@ def select_n_slow(dropped, n, take_last, method):
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}


def select_n(series, n, take_last, method):
def select_n(series, n, keep, method):
"""Implement n largest/smallest.

Parameters
----------
n : int
take_last : bool
keep : {'first', 'last'}, default 'first'
method : str, {'nlargest', 'nsmallest'}

Returns
Expand All @@ -522,15 +522,18 @@ def select_n(series, n, take_last, method):
np.timedelta64)):
raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))

if keep not in ('first', 'last'):
raise ValueError('keep must be either "first", "last"')

if n <= 0:
return series[[]]

dropped = series.dropna()

if n >= len(series):
return select_n_slow(dropped, n, take_last, method)
return select_n_slow(dropped, n, keep, method)

inds = _select_methods[method](dropped.values, n, take_last)
inds = _select_methods[method](dropped.values, n, keep)
return dropped.iloc[inds]


Expand Down
24 changes: 14 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3163,16 +3163,16 @@ def sortlevel(self, level=0, axis=0, ascending=True,
inplace=inplace, sort_remaining=sort_remaining)


def _nsorted(self, columns, n, method, take_last):
def _nsorted(self, columns, n, method, keep):
if not com.is_list_like(columns):
columns = [columns]
columns = list(columns)
ser = getattr(self[columns[0]], method)(n, take_last=take_last)
ser = getattr(self[columns[0]], method)(n, keep=keep)
ascending = dict(nlargest=False, nsmallest=True)[method]
return self.loc[ser.index].sort_values(columns, ascending=ascending,
kind='mergesort')

def nlargest(self, n, columns, take_last=False):
def nlargest(self, n, columns, keep='first'):
"""Get the rows of a DataFrame sorted by the `n` largest
values of `columns`.

Expand All @@ -3184,8 +3184,10 @@ def nlargest(self, n, columns, take_last=False):
Number of items to retrieve
columns : list or str
Column name or names to order by
take_last : bool, optional
Where there are duplicate values, take the last duplicate
keep : {'first', 'last'}, default 'first'
Where there are duplicate values:
- ``first`` : take the first occurrence.
- ``last`` : take the last occurrence.

Returns
-------
Expand All @@ -3202,9 +3204,9 @@ def nlargest(self, n, columns, take_last=False):
1 10 b 2
2 8 d NaN
"""
return self._nsorted(columns, n, 'nlargest', take_last)
return self._nsorted(columns, n, 'nlargest', keep)

def nsmallest(self, n, columns, take_last=False):
def nsmallest(self, n, columns, keep='first'):
"""Get the rows of a DataFrame sorted by the `n` smallest
values of `columns`.

Expand All @@ -3216,8 +3218,10 @@ def nsmallest(self, n, columns, take_last=False):
Number of items to retrieve
columns : list or str
Column name or names to order by
take_last : bool, optional
Where there are duplicate values, take the last duplicate
keep : {'first', 'last'}, default 'first'
Where there are duplicate values:
- ``first`` : take the first occurrence.
- ``last`` : take the last occurrence.

Returns
-------
Expand All @@ -3234,7 +3238,7 @@ def nsmallest(self, n, columns, take_last=False):
0 1 a 1
2 8 d NaN
"""
return self._nsorted(columns, n, 'nsmallest', take_last)
return self._nsorted(columns, n, 'nsmallest', keep)

def swaplevel(self, i, j, axis=0):
"""
Expand Down
18 changes: 16 additions & 2 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
from pandas.util.decorators import cache_readonly, Appender, make_signature
from pandas.util.decorators import (cache_readonly, Appender, make_signature,
deprecate_kwarg)
import pandas.core.algorithms as algos
import pandas.core.common as com
from pandas.core.common import(_possibly_downcast_to_dtype, isnull,
Expand Down Expand Up @@ -82,7 +83,7 @@

_series_apply_whitelist = \
(_common_apply_whitelist - set(['boxplot'])) | \
frozenset(['dtype', 'unique', 'nlargest', 'nsmallest'])
frozenset(['dtype', 'unique'])

_dataframe_apply_whitelist = \
_common_apply_whitelist | frozenset(['dtypes', 'corrwith'])
Expand Down Expand Up @@ -2583,6 +2584,19 @@ def nunique(self, dropna=True):
index=self.grouper.result_index,
name=self.name)

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(Series.nlargest.__doc__)
def nlargest(self, n=5, keep='first'):
# ToDo: When we remove deprecate_kwarg, we can remove these methods
# and include nlargest and nsmallest in _series_apply_whitelist
return self.apply(lambda x: x.nlargest(n=n, keep=keep))


@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(Series.nsmallest.__doc__)
def nsmallest(self, n=5, keep='first'):
return self.apply(lambda x: x.nsmallest(n=n, keep=keep))

def value_counts(self, normalize=False, sort=True, ascending=False,
bins=None, dropna=True):

Expand Down
24 changes: 16 additions & 8 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1817,15 +1817,19 @@ def rank(self, method='average', na_option='keep', ascending=True,
ascending=ascending, pct=pct)
return self._constructor(ranks, index=self.index).__finalize__(self)

def nlargest(self, n=5, take_last=False):
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
def nlargest(self, n=5, keep='first'):
"""Return the largest `n` elements.

Parameters
----------
n : int
Return this many descending sorted values
take_last : bool
Where there are duplicate values, take the last duplicate
keep : {'first', 'last'}, default 'first'
Where there are duplicate values:
- ``first`` : take the first occurrence.
- ``last`` : take the last occurrence.
take_last : deprecated

Returns
-------
Expand All @@ -1848,17 +1852,21 @@ def nlargest(self, n=5, take_last=False):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nlargest(10) # only sorts up to the N requested
"""
return select_n(self, n=n, take_last=take_last, method='nlargest')
return select_n(self, n=n, keep=keep, method='nlargest')

def nsmallest(self, n=5, take_last=False):
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
def nsmallest(self, n=5, keep='first'):
"""Return the smallest `n` elements.

Parameters
----------
n : int
Return this many ascending sorted values
take_last : bool
Where there are duplicate values, take the last duplicate
keep : {'first', 'last'}, default 'first'
Where there are duplicate values:
- ``first`` : take the first occurrence.
- ``last`` : take the last occurrence.
take_last : deprecated

Returns
-------
Expand All @@ -1881,7 +1889,7 @@ def nsmallest(self, n=5, take_last=False):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nsmallest(10) # only sorts up to the N requested
"""
return select_n(self, n=n, take_last=take_last, method='nsmallest')
return select_n(self, n=n, keep=keep, method='nsmallest')

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
"""
Expand Down
21 changes: 20 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -4997,7 +4997,7 @@ def test_groupby_whitelist(self):
'corr', 'cov',
'diff',
'unique',
'nlargest', 'nsmallest',
# 'nlargest', 'nsmallest',
])

for obj, whitelist in zip((df, s),
Expand Down Expand Up @@ -5316,6 +5316,16 @@ def test_nlargest(self):
[3, 2, 1, 9, 5, 8]]))
tm.assert_series_equal(r, e)


a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series([3, 2, 1, 3, 3, 2],
index=MultiIndex.from_arrays([list('aaabbb'),
[2, 3, 1, 6, 5, 7]]))
assert_series_equal(gb.nlargest(3, keep='last'), e)
with tm.assert_produces_warning(FutureWarning):
assert_series_equal(gb.nlargest(3, take_last=True), e)

def test_nsmallest(self):
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list('a' * 5 + 'b' * 5))
Expand All @@ -5326,6 +5336,15 @@ def test_nsmallest(self):
[0, 4, 1, 6, 7, 8]]))
tm.assert_series_equal(r, e)

a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series([0, 1, 1, 0, 1, 2],
index=MultiIndex.from_arrays([list('aaabbb'),
[4, 1, 0, 9, 8, 7]]))
assert_series_equal(gb.nsmallest(3, keep='last'), e)
with tm.assert_produces_warning(FutureWarning):
assert_series_equal(gb.nsmallest(3, take_last=True), e)

def test_transform_doesnt_clobber_ints(self):
# GH 7972
n = 6
Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5040,11 +5040,16 @@ def test_nsmallest_nlargest(self):
for s in s_list:

assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])

assert_series_equal(s.nsmallest(2, keep='last'), s.iloc[[2, 3]])
with tm.assert_produces_warning(FutureWarning):
assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])

assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]])
assert_series_equal(s.nlargest(3, take_last=True),
s.iloc[[4, 0, 3]])

assert_series_equal(s.nlargest(3, keep='last'), s.iloc[[4, 0, 3]])
with tm.assert_produces_warning(FutureWarning):
assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]])

empty = s.iloc[0:0]
assert_series_equal(s.nsmallest(0), empty)
Expand All @@ -5062,6 +5067,12 @@ def test_nsmallest_nlargest(self):
assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])

msg = 'keep must be either "first", "last"'
with tm.assertRaisesRegexp(ValueError, msg):
s.nsmallest(keep='invalid')
with tm.assertRaisesRegexp(ValueError, msg):
s.nlargest(keep='invalid')

def test_rank(self):
tm._skip_if_no_scipy()
from scipy.stats import rankdata
Expand Down