Skip to content

Commit

Permalink
ENH: added support for Index.sort_values(key=...)
Browse files Browse the repository at this point in the history
  • Loading branch information
jacobaustin123 committed Nov 20, 2019
1 parent e50cea0 commit 619ca63
Show file tree
Hide file tree
Showing 7 changed files with 125 additions and 41 deletions.
22 changes: 7 additions & 15 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Tuple,
Type,
Union,
Callable
)
import warnings

Expand Down Expand Up @@ -4722,7 +4723,7 @@ def sort_values(
inplace=False,
kind="quicksort",
na_position="last",
key=None
key : Union[Callable, None] = None
):
inplace = validate_bool_kwarg(inplace, "inplace")
axis = self._get_axis_number(axis)
Expand All @@ -4736,29 +4737,20 @@ def sort_values(
if len(by) > 1:
from pandas.core.sorting import lexsort_indexer

if key is not None:
key_func = np.vectorize(key)
keys = [key_func(self._get_label_or_level_values(x, axis=axis)) for x in by]
else:
keys = [self._get_label_or_level_values(x, axis=axis) for x in by]

indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position, key=key)
indexer = ensure_platform_int(indexer)
else:
from pandas.core.sorting import nargsort

by = by[0]
k = self._get_label_or_level_values(by, axis=axis)

if key is not None:
key_func = np.vectorize(key)
k = key_func(k)

if isinstance(ascending, (tuple, list)):
ascending = ascending[0]

indexer = nargsort(
k, kind=kind, ascending=ascending, na_position=na_position
k, kind=kind, ascending=ascending, na_position=na_position, key=key
)

new_data = self._data.take(
Expand All @@ -4782,7 +4774,7 @@ def sort_index(
na_position="last",
sort_remaining=True,
by=None,
key=None
key : Union[Callable, None] = None
):

# TODO: this can be combined with Series.sort_index impl as
Expand All @@ -4804,7 +4796,7 @@ def sort_index(
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if key is not None:
labels = labels.map(key)
labels = labels.map(key, na_action="ignore")

# make sure that the axis is lexsorted to start
# if not we need to reconstruct to get the correct indexer
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from datetime import datetime
import operator
from textwrap import dedent
from typing import FrozenSet, Union

from typing import FrozenSet, Union, Callable
import warnings

import numpy as np
Expand Down Expand Up @@ -4424,7 +4425,7 @@ def asof_locs(self, where, mask):

return result

def sort_values(self, return_indexer=False, ascending=True):
def sort_values(self, return_indexer=False, ascending=True, key : Callable = None):
"""
Return a sorted copy of the index.
Expand All @@ -4437,6 +4438,9 @@ def sort_values(self, return_indexer=False, ascending=True):
Should the indices that would sort the index be returned.
ascending : bool, default True
Should the index values be sorted in an ascending order.
key : Callable, default None
Apply a key function to the indices before sorting, like
built-in sorted function.
Returns
-------
Expand Down Expand Up @@ -4467,7 +4471,12 @@ def sort_values(self, return_indexer=False, ascending=True):
>>> idx.sort_values(ascending=False, return_indexer=True)
(Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2]))
"""
_as = self.argsort()
if key:
idx = self.map(key, na_action="ignore")
else:
idx = self

_as = idx.argsort()
if not ascending:
_as = _as[::-1]

Expand Down Expand Up @@ -4577,9 +4586,12 @@ def argsort(self, *args, **kwargs):
>>> idx[order]
Index(['a', 'b', 'c', 'd'], dtype='object')
"""

result = self.asi8

if result is None:
result = np.array(self)

return result.argsort(*args, **kwargs)

_index_shared_docs[
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2890,7 +2890,7 @@ def sort_values(
inplace=False,
kind="quicksort",
na_position="last",
key=None
key: Callable = None
):
"""
Sort by the values.
Expand All @@ -2913,7 +2913,7 @@ def sort_values(
na_position : {'first' or 'last'}, default 'last'
Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
the end.
key : function, default None
key : Callable, default None
If not None, apply the key function to every value before
sorting. Identical to key argument in built-in sorted function.
Expand Down Expand Up @@ -3090,7 +3090,7 @@ def sort_index(
kind="quicksort",
na_position="last",
sort_remaining=True,
key=None
key : Callable = None
):
"""
Sort Series by index labels.
Expand Down Expand Up @@ -3119,7 +3119,7 @@ def sort_index(
sort_remaining : bool, default True
If True and sorting by level and index is multilevel, sort by other
levels too (in order) after sorting by specified level.
key : function, default None
key : Callable, default None
If not None, apply the key function to every index element before
sorting. Identical to key argument in built-in sorted function.
Expand Down Expand Up @@ -3226,7 +3226,7 @@ def sort_index(
index = self.index
true_index = index
if key is not None:
index = index.map(key)
index = index.map(key, na_action="ignore")

if level is not None:
new_index, indexer = index.sortlevel(
Expand Down
21 changes: 19 additions & 2 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
""" miscellaneous sorting / groupby utilities """
from typing import Callable, Union

import numpy as np

from pandas._libs import algos, hashtable, lib
Expand Down Expand Up @@ -187,7 +189,7 @@ def indexer_from_factorized(labels, shape, compress: bool = True):
return get_group_index_sorter(ids, ngroups)


def lexsort_indexer(keys, orders=None, na_position="last"):
def lexsort_indexer(keys, orders=None, na_position="last", key : Union[Callable, None] = None):
from pandas.core.arrays import Categorical

labels = []
Expand All @@ -197,6 +199,10 @@ def lexsort_indexer(keys, orders=None, na_position="last"):
elif orders is None:
orders = [True] * len(keys)

if key:
key_func = np.vectorize(key)
keys = [key_func(entry) if entry.size != 0 else entry for entry in keys]

for key, order in zip(keys, orders):

# we are already a Categorical
Expand Down Expand Up @@ -233,7 +239,7 @@ def lexsort_indexer(keys, orders=None, na_position="last"):
return indexer_from_factorized(labels, shape)


def nargsort(items, kind="quicksort", ascending: bool = True, na_position="last"):
def nargsort(items, kind="quicksort", ascending=True, na_position="last", key: Union[Callable, None] = None):
"""
This is intended to be a drop-in replacement for np.argsort which
handles NaNs. It adds ascending and na_position parameters.
Expand All @@ -247,6 +253,17 @@ def nargsort(items, kind="quicksort", ascending: bool = True, na_position="last"
else:
items = np.asanyarray(items)

if key is not None:
key_func = np.vectorize(key)
masked = np.ma.MaskedArray(items, mask)

if masked.size == 0:
vals = np.array([]) # vectorize fails on empty object arrays
else:
vals = np.asarray(key_func(masked)) # revert from masked

return nargsort(vals, kind=kind, ascending=ascending, na_position=na_position, key=None)

idx = np.arange(len(items))
non_nans = items[~mask]
non_nan_idx = idx[~mask]
Expand Down
54 changes: 41 additions & 13 deletions pandas/tests/frame/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,30 +81,34 @@ def test_sort_values(self):
with pytest.raises(ValueError, match=msg):
frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)

def test_sort_values_inplace(self):
@pytest.fixture(params=[None, lambda x : x])
def key(self, request):
return request.param

def test_sort_values_inplace(self, key):
frame = DataFrame(
np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
)

sorted_df = frame.copy()
sorted_df.sort_values(by="A", inplace=True)
expected = frame.sort_values(by="A")
tm.assert_frame_equal(sorted_df, expected)
sorted_df.sort_values(by="A", inplace=True, key=key)
expected = frame.sort_values(by="A", key=key)
assert_frame_equal(sorted_df, expected)

sorted_df = frame.copy()
sorted_df.sort_values(by=1, axis=1, inplace=True)
expected = frame.sort_values(by=1, axis=1)
tm.assert_frame_equal(sorted_df, expected)
sorted_df.sort_values(by=1, axis=1, inplace=True, key=key)
expected = frame.sort_values(by=1, axis=1, key=key)
assert_frame_equal(sorted_df, expected)

sorted_df = frame.copy()
sorted_df.sort_values(by="A", ascending=False, inplace=True)
expected = frame.sort_values(by="A", ascending=False)
tm.assert_frame_equal(sorted_df, expected)
sorted_df.sort_values(by="A", ascending=False, inplace=True, key=key)
expected = frame.sort_values(by="A", ascending=False, key=key)
assert_frame_equal(sorted_df, expected)

sorted_df = frame.copy()
sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True)
expected = frame.sort_values(by=["A", "B"], ascending=False)
tm.assert_frame_equal(sorted_df, expected)
sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True, key=key)
expected = frame.sort_values(by=["A", "B"], ascending=False, key=key)
assert_frame_equal(sorted_df, expected)

def test_sort_nan(self):
# GH3917
Expand Down Expand Up @@ -247,6 +251,23 @@ def test_sort_multi_index(self):

tm.assert_frame_equal(result, expected)

def test_sort_multi_index_key(self):
    """Check ``sort_index(level=..., key=...)`` on a three-level MultiIndex."""
    # GH 25775, testing that sorting by index works with a multi-index.
    frame = DataFrame(
        {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")}
    )
    index_cols = list("abc")
    sort_levels = list("ba")

    # Keying on element 0 of each label (the "a" values 3, 1, 2) is expected
    # to reorder the rows so that "a" ascends.
    by_first = frame.set_index(index_cols).sort_index(
        level=sort_levels, key=lambda label: label[0]
    )
    ascending_a = DataFrame(
        {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")}
    ).set_index(index_cols)
    tm.assert_frame_equal(by_first, ascending_a)

    # Keying on element 2 (the "c" values 0, 1, 2) is expected to leave the
    # frame in its original order.
    by_third = frame.set_index(index_cols).sort_index(
        level=sort_levels, key=lambda label: label[2]
    )
    untouched = frame.set_index(index_cols)
    tm.assert_frame_equal(by_third, untouched)

def test_stable_categorial(self):
# GH 16793
df = DataFrame({"x": pd.Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)})
Expand Down Expand Up @@ -626,6 +647,13 @@ def test_sort_value_key_nan(self):
expected = df.sort_values(1, key=str.lower, ascending=False)
assert_frame_equal(result, expected)

@pytest.mark.parametrize('key', [None, lambda x: x])
def test_sort_value_key_empty(self, key):
    """Smoke test: sorting an empty frame with or without a ``key``."""
    empty_frame = DataFrame(np.array([]))

    # No assertions on the results: the test passes as long as neither
    # call raises for the empty input.
    empty_frame.sort_values(by=0, key=key)
    empty_frame.sort_index(key=key)

def test_sort_index(self):
# GH13496

Expand Down
29 changes: 26 additions & 3 deletions pandas/tests/indexing/multiindex/test_sorted.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pandas import DataFrame, MultiIndex, Series
import pandas.util.testing as tm

import pytest


class TestMultiIndexSorted:
def test_getitem_multilevel_index_tuple_not_sorted(self):
Expand All @@ -28,7 +30,8 @@ def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data):
expected = df.reindex(columns=df.columns[:3])
tm.assert_frame_equal(result, expected)

def test_frame_getitem_not_sorted2(self):
@pytest.mark.parametrize('key', [None, lambda x : x])
def test_frame_getitem_not_sorted2(self, key):
# 13431
df = DataFrame(
{
Expand All @@ -47,15 +50,35 @@ def test_frame_getitem_not_sorted2(self):
assert not df2.index.is_monotonic

assert df2_original.index.equals(df2.index)
expected = df2.sort_index()
expected = df2.sort_index(key=key)
assert expected.index.is_lexsorted()
assert expected.index.is_monotonic

result = df2.sort_index(level=0)
result = df2.sort_index(level=0, key=key)
assert result.index.is_lexsorted()
assert result.index.is_monotonic
tm.assert_frame_equal(result, expected)

def test_sort_values_key(self, multiindex_dataframe_random_data):
    """Check that ``MultiIndex.sort_values`` orders labels by its ``key``."""
    # NOTE(review): the ``multiindex_dataframe_random_data`` fixture is
    # requested but never referenced in this test body.

    def build_index(first_level):
        # Pair each first-level label with an alternating "one"/"two".
        second_level = ["one", "two"] * 4
        return MultiIndex.from_tuples(zip(first_level, second_level))

    unsorted = build_index(
        ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"]
    )
    # The key maps a label to the third character of each of its parts,
    # e.g. ("bar", "one") -> ("r", "e").
    by_third_char = unsorted.sort_values(key=lambda x: (x[0][2], x[1][2]))
    result = DataFrame(range(8), index=by_third_char)

    # Third characters "o" < "r" < "x" < "z" give foo, bar, qux, baz.
    expected = DataFrame(
        range(8),
        index=build_index(
            ["foo", "foo", "bar", "bar", "qux", "qux", "baz", "baz"]
        ),
    )

    tm.assert_frame_equal(result, expected)

def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data):
frame = multiindex_dataframe_random_data
df = frame.T
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/series/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,18 @@ def test_sort_index_multiindex(self, level):
res = s.sort_index(level=level, sort_remaining=False)
tm.assert_series_equal(s, res)

def test_sort_index_multiindex_key(self):
    """Check ``Series.sort_index(key=...)`` on a MultiIndex-ed Series."""
    mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC"))
    s = Series([1, 2], mi)
    backwards = s.iloc[[1, 0]]

    # Element 2 of the labels is 3 then 1, so keying on it is expected to
    # swap the two rows.
    res = s.sort_index(key=lambda x: x[2])
    tm.assert_series_equal(backwards, res)

    # Element 1 is 1 in both labels, so the test expects the original
    # order to come back unchanged.
    res = s.sort_index(key=lambda x: x[1])
    tm.assert_series_equal(s, res)

def test_sort_index_kind(self):
# GH #14444 & #13589: Add support for sort algo choosing
series = Series(index=[3, 2, 1, 4, 3])
Expand Down

0 comments on commit 619ca63

Please sign in to comment.