From eddd918109ebeeb531120600fd7116c74c9d5b2a Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Thu, 4 Jul 2019 13:20:32 -0700 Subject: [PATCH 01/51] ENH: added df/series.sort_values(key=...) and df/series.sort_index(key=...) functionality --- doc/source/getting_started/basics.rst | 12 ++ doc/source/whatsnew/v1.0.0.rst | 28 +++++ pandas/_typing.py | 5 + pandas/conftest.py | 9 ++ pandas/core/arrays/categorical.py | 22 +++- pandas/core/frame.py | 42 +++++-- pandas/core/generic.py | 100 +++++++++------ pandas/core/groupby/generic.py | 2 +- pandas/core/indexes/base.py | 18 ++- pandas/core/indexes/datetimelike.py | 11 +- pandas/core/series.py | 97 +++++++++++++-- pandas/core/sorting.py | 74 ++++++++++-- pandas/core/strings.py | 3 + pandas/tests/extension/base/methods.py | 10 +- pandas/tests/extension/json/test_json.py | 12 +- pandas/tests/frame/methods/test_sort_index.py | 58 +++++++++ .../tests/frame/methods/test_sort_values.py | 114 ++++++++++++++++++ .../tests/indexing/multiindex/test_sorted.py | 30 ++++- .../tests/series/methods/test_sort_index.py | 46 +++++++ .../tests/series/methods/test_sort_values.py | 74 ++++++++++++ 20 files changed, 685 insertions(+), 82 deletions(-) diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 4fef5efbd1551..59331c5e8fd57 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1806,6 +1806,18 @@ argument: s.sort_values() s.sort_values(na_position='first') +Sorting also supports a ``key`` parameter that takes a callable function +to apply to the values being sorted. + +.. ipython:: python + + s1 = pd.Series(['B', 'a', 'C']) + s1.sort_values() + s1.sort_values(key=lambda x: x.str.lower()) + +`key` will be given the :class:`Series` of values and should return a ``Series`` +or array of the same shape with the transformed values. + .. 
_basics.sort_indexes_and_values: By indexes and values diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ada82987921ec..491a425c9fc01 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -209,6 +209,33 @@ method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate indices used for each window during the rolling aggregation. For more details and example usage, see the :ref:`custom window rolling documentation ` +.. _whatsnew_100.sorting_keys: + +Sorting with keys +^^^^^^^^^^^^^^^^^ + +We've added a ``key`` argument to the DataFrame and Series sorting methods, including +:meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, +and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied +to the each column of a DataFrame before sorting is performed. + +.. ipython:: python + + s = pd.Series(['C', 'a', 'B']) + s.sort_values() + s + + +Note how this is sorted with capital letters first. Now if we apply the `col.str.lower()` method, we get + +.. ipython:: python + + s.sort_values(key=lambda x: x.str.lower()) + s + +For more details, see examples and documentation in :meth:`DataFrame.sort_values`, +:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index` (:issue:`27237`) + .. 
_whatsnew_100.to_markdown: Converting to Markdown @@ -261,6 +288,7 @@ Other enhancements - :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) + Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 171b76b4d2c4b..ac7b49316fe78 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -71,3 +71,8 @@ # to maintain type information across generic functions and parametrization T = TypeVar("T") + +# types of vectorized key functions for DataFrame::sort_values and +# DataFrame::sort_index, among others +ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]] +IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] diff --git a/pandas/conftest.py b/pandas/conftest.py index 0c964452df5da..2d161c433f92f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -910,6 +910,15 @@ def index_or_series(request): return request.param +@pytest.fixture(params=[None, lambda x: x]) +def sort_by_key(request): + """ + Simple fixture for testing keys in sorting methods. + Tests None (no key) and the identity key. 
+ """ + return request.param + + @pytest.fixture def dict_subclass(): """ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9d7359dd9c614..dd4192937c5dc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,6 +1,6 @@ import operator from shutil import get_terminal_size -from typing import Dict, Hashable, List, Type, Union, cast +from typing import Callable, Dict, Hashable, List, Optional, Type, Union, cast from warnings import warn import numpy as np @@ -1546,7 +1546,13 @@ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs): """ return super().argsort(ascending=ascending, kind=kind, *args, **kwargs) - def sort_values(self, inplace=False, ascending=True, na_position="last"): + def sort_values( + self, + inplace=False, + ascending=True, + na_position="last", + key: Optional[Callable] = None, + ): """ Sort the Categorical by category value returning a new Categorical by default. @@ -1568,6 +1574,14 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"): na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Categorical`` and return a Categorical with the same shape as the input. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -1624,7 +1638,9 @@ def sort_values(self, inplace=False, ascending=True, na_position="last"): if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {repr(na_position)}") - sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) + sorted_idx = nargsort( + self, ascending=ascending, na_position=na_position, key=key + ) if inplace: self._codes = self._codes[sorted_idx] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f3a0cf3841b5b..55dadc8fe2aab 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -20,6 +20,7 @@ IO, TYPE_CHECKING, Any, + Callable, Dict, FrozenSet, Hashable, @@ -41,7 +42,17 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib, properties -from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Label, Level, Renamer +from pandas._typing import ( + Axes, + Axis, + Dtype, + FilePathOrBuffer, + IndexKeyFunc, + Label, + Level, + Renamer, + ValueKeyFunc, +) from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -126,6 +137,7 @@ ) from pandas.core.ops.missing import dispatch_fill_zeros from pandas.core.series import Series +from pandas.core.sorting import ensure_key_mapped from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt @@ -4919,10 +4931,10 @@ def f(vals): # ---------------------------------------------------------------------- # Sorting - + # TODO: Just move the sort_values doc here. 
@Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - def sort_values( + def sort_values( # type: ignore[override] # NOQA self, by, axis=0, @@ -4931,6 +4943,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key: ValueKeyFunc = None, ): inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) @@ -4941,23 +4954,26 @@ def sort_values( raise ValueError( f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) + raw = key is None if len(by) > 1: from pandas.core.sorting import lexsort_indexer - keys = [self._get_label_or_level_values(x, axis=axis) for x in by] - indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) + keys = [self._get_label_or_level_values(x, axis=axis, raw=raw) for x in by] + indexer = lexsort_indexer( + keys, orders=ascending, na_position=na_position, key=key + ) indexer = ensure_platform_int(indexer) else: from pandas.core.sorting import nargsort by = by[0] - k = self._get_label_or_level_values(by, axis=axis) + k = self._get_label_or_level_values(by, axis=axis, raw=raw) if isinstance(ascending, (tuple, list)): ascending = ascending[0] indexer = nargsort( - k, kind=kind, ascending=ascending, na_position=na_position + k, kind=kind, ascending=ascending, na_position=na_position, key=key ) new_data = self._data.take( @@ -4982,6 +4998,7 @@ def sort_index( na_position: str = "last", sort_remaining: bool = True, ignore_index: bool = False, + key: IndexKeyFunc = None, ): """ Sort object by labels (along an axis). @@ -5013,6 +5030,15 @@ def sort_index( .. versionadded:: 1.0.0 + key : callable, optional + If not None, apply the key function to the **non-missing** values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. + + .. 
versionadded:: 1.0.0 + Returns ------- sorted_obj : DataFrame or None @@ -5026,12 +5052,12 @@ def sort_index( axis = self._get_axis_number(axis) labels = self._get_axis(axis) + labels = ensure_key_mapped(labels, key) # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer labels = labels._sort_levels_monotonic() if level is not None: - new_axis, indexer = labels.sortlevel( level, ascending=ascending, sort_remaining=sort_remaining ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a2e348bf98e33..223898b4a69e8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -40,6 +40,7 @@ Label, Level, Renamer, + ValueKeyFunc, ) from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency @@ -1557,7 +1558,9 @@ def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: ) raise ValueError(msg) - def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: + def _get_label_or_level_values( + self, key: str, axis: int = 0, raw: bool_t = True + ) -> np.ndarray: """ Return a 1-D array of values associated with `key`, a label or level from the given `axis`. @@ -1576,6 +1579,8 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: Label or level name. 
axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) + raw : bool, default True + Whether to unbox the array from the Series, or return the Series object Returns ------- @@ -1596,7 +1601,9 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0])._values + values = self.xs(key, axis=other_axes[0]) + if raw: + values = values._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: @@ -4033,6 +4040,7 @@ def sort_values( kind: str = "quicksort", na_position: str = "last", ignore_index: bool_t = False, + key: ValueKeyFunc = None, ): """ Sort by the values along either axis. @@ -4060,70 +4068,86 @@ def sort_values( .. versionadded:: 1.0.0 + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + + .. versionadded:: 1.0.0 + Returns ------- - sorted_obj : DataFrame or None + DataFrame or None DataFrame with sorted values if inplace=False, None otherwise. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by the index. + Series.sort_values : Similar method for a Series. + Examples -------- >>> df = pd.DataFrame({ ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], ... 'col2': [2, 1, 9, 8, 7, 4], ... 'col3': [0, 1, 9, 4, 2, 3], + ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] ... 
}) >>> df - col1 col2 col3 - 0 A 2 0 - 1 A 1 1 - 2 B 9 9 - 3 NaN 8 4 - 4 D 7 2 - 5 C 4 3 + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F Sort by col1 >>> df.sort_values(by=['col1']) - col1 col2 col3 - 0 A 2 0 - 1 A 1 1 - 2 B 9 9 - 5 C 4 3 - 4 D 7 2 - 3 NaN 8 4 + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D Sort by multiple columns >>> df.sort_values(by=['col1', 'col2']) - col1 col2 col3 - 1 A 1 1 - 0 A 2 0 - 2 B 9 9 - 5 C 4 3 - 4 D 7 2 - 3 NaN 8 4 + col1 col2 col3 col4 + 1 A 1 1 B + 0 A 2 0 a + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D Sort Descending >>> df.sort_values(by='col1', ascending=False) - col1 col2 col3 - 4 D 7 2 - 5 C 4 3 - 2 B 9 9 - 0 A 2 0 - 1 A 1 1 - 3 NaN 8 4 + col1 col2 col3 col4 + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B + 3 NaN 8 4 D Putting NAs first >>> df.sort_values(by='col1', ascending=False, na_position='first') - col1 col2 col3 - 3 NaN 8 4 - 4 D 7 2 - 5 C 4 3 - 2 B 9 9 - 0 A 2 0 - 1 A 1 1 + col1 col2 col3 col4 + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B """ raise AbstractMethodError(self) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 27dd6e953c219..12a3682ddb79d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -330,7 +330,7 @@ def _aggregate_multiple_funcs(self, arg): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 406a258442dee..8418e2b3c17df 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import Any, Dict, FrozenSet, Hashable, Optional, Union +from typing import Any, Callable, Dict, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -72,6 +72,7 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op +from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -4466,7 +4467,9 @@ def asof_locs(self, where, mask): return result - def sort_values(self, return_indexer: bool = False, ascending: bool = True): + def sort_values( + self, return_indexer=False, ascending=True, key: Optional[Callable] = None + ): """ Return a sorted copy of the index. @@ -4479,6 +4482,11 @@ def sort_values(self, return_indexer: bool = False, ascending: bool = True): Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + key : callable, default None + Apply a key function to the indices before sorting, like + built-in sorted function. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -4509,7 +4517,9 @@ def sort_values(self, return_indexer: bool = False, ascending: bool = True): >>> idx.sort_values(ascending=False, return_indexer=True) (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ - _as = self.argsort() + idx = ensure_key_mapped(self, key) + + _as = idx.argsort() if not ascending: _as = _as[::-1] @@ -4620,8 +4630,10 @@ def argsort(self, *args, **kwargs) -> np.ndarray: Index(['a', 'b', 'c', 'd'], dtype='object') """ result = self.asi8 + if result is None: result = np.array(self) + return result.argsort(*args, **kwargs) _index_shared_docs[ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b87dd0f02252f..4bc1908409c87 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -43,6 +43,7 @@ ) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name +from pandas.core.sorting import ensure_key_mapped from pandas.core.tools.timedeltas import to_timedelta from pandas.tseries.frequencies import DateOffset, to_offset @@ -164,12 +165,16 @@ def __contains__(self, key: Any) -> bool: is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res)) ) - def sort_values(self, return_indexer=False, ascending=True): + def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return sorted copy of Index. """ + assert isinstance(self, Index) + + idx = ensure_key_mapped(self, key) + if return_indexer: - _as = self.argsort() + _as = idx.argsort() if not ascending: _as = _as[::-1] sorted_index = self.take(_as) @@ -178,7 +183,7 @@ def sort_values(self, return_indexer=False, ascending=True): # NB: using asi8 instead of _ndarray_values matters in numpy 1.18 # because the treatment of NaT has been changed to put NaT last # instead of first. 
- sorted_values = np.sort(self.asi8) + sorted_values = np.sort(idx.asi8) freq = self.freq if freq is not None and not is_period_dtype(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index e9df0938d5f98..13e45106eb742 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -23,7 +23,7 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, properties, reshape, tslibs -from pandas._typing import Label +from pandas._typing import IndexKeyFunc, Label, ValueKeyFunc from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -89,6 +89,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager +from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -2790,7 +2791,7 @@ def update(self, other) -> None: # ---------------------------------------------------------------------- # Reindexing, sorting - def sort_values( + def sort_values( # type: ignore[override] # NOQA self, axis=0, ascending=True, @@ -2798,6 +2799,7 @@ def sort_values( kind: str = "quicksort", na_position: str = "last", ignore_index: bool = False, + key: ValueKeyFunc = None, ): """ Sort by the values. @@ -2821,9 +2823,19 @@ def sort_values( Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 - .. versionadded:: 1.0.0 + key : callable, optional + If not None, apply the key function to the **no-missing** values + before sorting. 
This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` or ``Index`` and return an array-like that implements + ``argsort``. + + .. versionadded:: 1.0.0 Returns ------- @@ -2906,6 +2918,48 @@ def sort_values( 2 d 0 z dtype: object + + Sort using a key function. Your `key` function will be + given the ``Series`` of values and should return an array-like. + + >>> s = pd.Series(['a', 'B', 'c', 'D', 'e']) + >>> s.sort_values() + 1 B + 3 D + 0 a + 2 c + 4 e + dtype: object + >>> s.sort_values(key=lambda x: x.str.lower()) + 0 a + 1 B + 2 c + 3 D + 4 e + dtype: object + + NumPy ufuncs work well here. For example, we can + sort by the ``sin`` of the value + + >>> s = pd.Series([-4, -2, 0, 2, 4]) + >>> s.sort_values(key=np.sin) + 1 -2 + 4 4 + 2 0 + 0 -4 + 3 2 + dtype: int64 + + More complicated user-defined functions can be used, + as long as they expect a Series and return an array-like + + >>> s.sort_values(key=lambda x: (np.tan(x.cumsum()))) + 0 -4 + 3 2 + 4 4 + 1 -2 + 2 0 + dtype: int64 """ inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter @@ -2919,6 +2973,9 @@ def sort_values( ) def _try_kind_sort(arr): + arr = ensure_key_mapped(arr, key) + arr = getattr(arr, "_values", arr) + # easier to ask forgiveness than permission try: # if kind==mergesort, it can fail for object dtype @@ -2936,7 +2993,7 @@ def _try_kind_sort(arr): good = ~bad idx = ibase.default_index(len(self)) - argsorted = _try_kind_sort(arr[good]) + argsorted = _try_kind_sort(self[good]) if is_list_like(ascending): if len(ascending) != 1: @@ -2982,6 +3039,7 @@ def sort_index( na_position="last", sort_remaining=True, ignore_index: bool = False, + key: IndexKeyFunc = None, ): """ Sort Series by index labels. @@ -3015,6 +3073,15 @@ def sort_index( .. 
versionadded:: 1.0.0 + key : callable, optional + If not None, apply the key function to the **non-missing** values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. + + .. versionadded:: 1.0.0 + Returns ------- Series @@ -3096,22 +3163,38 @@ def sort_index( baz two 5 bar two 7 dtype: int64 + + >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8]) + >>> s.sort_index(key=lambda x : -x) + 7 8 + 6 7 + 5 6 + 4 5 + 3 4 + 2 3 + 1 2 + 0 1 + dtype: int64 """ + # TODO: this can be combined with DataFrame.sort_index impl as # almost identical inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) - index = self.index + # TODO: should ensure_key_mapped convert to an array? + index = ensure_key_mapped(self.index, key) if level is not None: new_index, indexer = index.sortlevel( level, ascending=ascending, sort_remaining=sort_remaining ) + elif isinstance(index, MultiIndex): from pandas.core.sorting import lexsort_indexer labels = index._sort_levels_monotonic() + indexer = lexsort_indexer( labels._get_codes_for_sorting(), orders=ascending, @@ -3135,7 +3218,7 @@ def sort_index( ) indexer = ensure_platform_int(indexer) - new_index = index.take(indexer) + new_index = self.index.take(indexer) new_index = new_index._sort_levels_monotonic() new_values = self._values.take(indexer) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 51c154aa47518..d70fc41310914 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,4 +1,6 @@ """ miscellaneous sorting / groupby utilities """ +from typing import Callable, Optional + import numpy as np from pandas._libs import algos, hashtable, lib @@ -189,11 +191,28 @@ def indexer_from_factorized(labels, shape, compress: bool = True): return get_group_index_sorter(ids, ngroups) -def 
lexsort_indexer(keys, orders=None, na_position: str = "last"): +def lexsort_indexer( + keys, orders=None, na_position: str = "last", key: Optional[Callable] = None +): """ + Performs lexical sorting on a set of keys + Parameters ---------- + keys : sequence of arrays + Sequence of ndarrays to be sorted by the indexer + orders : boolean or list of booleans, optional + Determines the sorting order for each element in keys. If a list, + it must be the same length as keys. This determines whether the + corresponding element in keys should be sorted in ascending + (True) or descending (False) order. if bool, applied to all + elements as above. if None, defaults to True. na_position : {'first', 'last'}, default 'last' + Determines placement of NA elements in the sorted list ("last" or "first") + key : Callable, optional + Callable key function applied to every element in keys before sorting + + .. versionadded:: 1.0.0 """ from pandas.core.arrays import Categorical @@ -204,15 +223,16 @@ def lexsort_indexer(keys, orders=None, na_position: str = "last"): elif orders is None: orders = [True] * len(keys) - for key, order in zip(keys, orders): + keys = [ensure_key_mapped(k, key) for k in keys] + for k, order in zip(keys, orders): # we are already a Categorical - if is_categorical_dtype(key): - cat = key + if is_categorical_dtype(k): + cat = k # create the Categorical else: - cat = Categorical(key, ordered=True) + cat = Categorical(k, ordered=True) if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") @@ -241,21 +261,36 @@ def lexsort_indexer(keys, orders=None, na_position: str = "last"): def nargsort( - items, kind: str = "quicksort", ascending: bool = True, na_position: str = "last" + items, + kind: str = "quicksort", + ascending: bool = True, + na_position: str = "last", + key: Optional[Callable] = None, + # raw: bool = True, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. 
- Adds ascending and na_position parameters. + Adds ascending, na_position, and key parameters. - (GH #6399, #5231) + (GH #6399, #5231, #27237) Parameters ---------- kind : str, default 'quicksort' ascending : bool, default True na_position : {'first', 'last'}, default 'last' + key : Optional[Callable], default None """ + + if key is not None: + items = ensure_key_mapped(items, key) + return nargsort( + items, kind=kind, ascending=ascending, na_position=na_position, key=None + ) + + # breakpoint() + items = extract_array(items) mask = np.asarray(isna(items)) @@ -267,6 +302,7 @@ def nargsort( idx = np.arange(len(items)) non_nans = items[~mask] non_nan_idx = idx[~mask] + nan_idx = np.nonzero(mask)[0] if not ascending: non_nans = non_nans[::-1] @@ -285,6 +321,28 @@ def nargsort( return indexer +def ensure_key_mapped(values, key: Optional[Callable]): + """ + Applies a callable key function to elements in an Index subclass or + an ndarray. Uses index.map for index subclasses and ignores nan values + in ndarrays. + + Parameters + ---------- + values : Index subclass or ndarray + key : Optional[Callable], key to be called on every index or entry in ndarray. + """ + if not key: + return values + + result = key(values) + if len(result) != len(values): + raise ValueError( + "User-provided `key` function much not change the shape of the array." + ) + return result + + class _KeyMapper: """ Map compressed group id -> key tuple. diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 18c7504f2c2f8..b5c7dec9671b6 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1943,6 +1943,9 @@ def _forbid_nonstring_types(func): @wraps(func) def wrapper(self, *args, **kwargs): + # TODO: support passing Series / Index here. 
+ # if not isinstance(self, StringMethods): + # self = StringMethods(self) if self._inferred_dtype not in allowed_types: msg = ( f"Cannot use .str.{func_name} with values of " diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4a84a21084de2..41da7917c35d6 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -76,9 +76,9 @@ def test_nargsort(self, data_missing_for_sorting, na_position, expected): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("ascending", [True, False]) - def test_sort_values(self, data_for_sorting, ascending): + def test_sort_values(self, data_for_sorting, ascending, sort_by_key): ser = pd.Series(data_for_sorting) - result = ser.sort_values(ascending=ascending) + result = ser.sort_values(ascending=ascending, key=sort_by_key) expected = ser.iloc[[2, 0, 1]] if not ascending: expected = expected[::-1] @@ -86,9 +86,11 @@ def test_sort_values(self, data_for_sorting, ascending): self.assert_series_equal(result, expected) @pytest.mark.parametrize("ascending", [True, False]) - def test_sort_values_missing(self, data_missing_for_sorting, ascending): + def test_sort_values_missing( + self, data_missing_for_sorting, ascending, sort_by_key + ): ser = pd.Series(data_missing_for_sorting) - result = ser.sort_values(ascending=ascending) + result = ser.sort_values(ascending=ascending, key=sort_by_key) if ascending: expected = ser.iloc[[2, 0, 1]] else: diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index dc03a1f1dcf72..2e7bff7fe3e22 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -202,12 +202,16 @@ def test_argsort_missing(self, data_missing_for_sorting): super().test_argsort_missing(data_missing_for_sorting) @pytest.mark.parametrize("ascending", [True, False]) - def test_sort_values(self, data_for_sorting, ascending): - 
super().test_sort_values(data_for_sorting, ascending) + def test_sort_values(self, data_for_sorting, ascending, sort_by_key): + super().test_sort_values(data_for_sorting, ascending, sort_by_key) @pytest.mark.parametrize("ascending", [True, False]) - def test_sort_values_missing(self, data_missing_for_sorting, ascending): - super().test_sort_values_missing(data_missing_for_sorting, ascending) + def test_sort_values_missing( + self, data_missing_for_sorting, ascending, sort_by_key + ): + super().test_sort_values_missing( + data_missing_for_sorting, ascending, sort_by_key + ) @pytest.mark.skip(reason="combine for JSONArray not supported") def test_combine_le(self, data_repeated): diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 2c25e1f3740a3..6ff1b13839a89 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -318,3 +318,61 @@ def test_sort_index_ignore_index_multi_index( tm.assert_frame_equal(result_df, expected_df) tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) + + +class TestDataFrameSortIndexKey: + def test_sort_multi_index_key(self): + # GH 25775, testing that sorting by index works with a multi-index. 
+ df = DataFrame( + {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} + ) + + result = df.set_index(list("abc")).sort_index( + level=list("ba"), key=lambda x: x.get_level_values(0) + ) + + expected = DataFrame( + {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} + ) + expected = expected.set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + result = df.set_index(list("abc")).sort_index( + level=list("ba"), key=lambda x: x.get_level_values(2) + ) + + expected = df.set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + def test_sort_index_key(self): # issue 27237 + df = DataFrame(np.arange(6, dtype="int64"), index=list("aaBBca")) + + result = df.sort_index() + expected = df.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: x.str.lower()) + expected = df.iloc[[0, 1, 5, 2, 3, 4]] + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: x.str.lower(), ascending=False) + expected = df.iloc[[4, 2, 3, 0, 1, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_index_key_int(self): + df = DataFrame(np.arange(6, dtype="int64"), index=np.arange(6, dtype="int64")) + + result = df.sort_index() + tm.assert_frame_equal(result, df) + + result = df.sort_index(key=lambda x: -x) + expected = df.sort_index(ascending=False) + tm.assert_frame_equal(result, expected) + + result = df.sort_index(key=lambda x: 2 * x) + tm.assert_frame_equal(result, df) + + def test_changes_length_raises(self): + df = pd.DataFrame({"A": [1, 2, 3]}) + with pytest.raises(ValueError, match="change the shape"): + df.sort_index(key=lambda x: x[:1]) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 96f4d6ed90d6b..48563ac6dd5be 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -341,6 +341,25 @@ def 
test_sort_values_nat_values_in_int_column(self): df_sorted = df.sort_values(["datetime", "float"], ascending=False) tm.assert_frame_equal(df_sorted, df) + def test_sort_nat(self): + # GH 16836 + + d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]] + d2 = [ + Timestamp(x) + for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"] + ] + df = pd.DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3]) + + d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]] + d4 = [ + Timestamp(x) + for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"] + ] + expected = pd.DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2]) + sorted_df = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(sorted_df, expected) + def test_sort_values_na_position_with_categories(self): # GH#22556 # Positioning missing value properly when column is Categorical. @@ -378,6 +397,8 @@ def test_sort_values_na_position_with_categories(self): tm.assert_frame_equal(result, expected) + # breakpoint() + # sort ascending with na last result = df.sort_values( by=column_name, ascending=True, na_position=na_position_last @@ -391,6 +412,8 @@ def test_sort_values_na_position_with_categories(self): index=category_indices + na_indices, ) + # breakpoint() + tm.assert_frame_equal(result, expected) # sort descending with na first @@ -516,3 +539,94 @@ def test_sort_values_nat_na_position_default(self): ) result = expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) + + +class TestDataFrameSortKey: # test key sorting (issue 27237), not yet implemented + def test_sort_values_inplace_key(self, sort_by_key): + frame = DataFrame( + np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] + ) + + sorted_df = frame.copy() + sorted_df.sort_values(by="A", inplace=True, key=sort_by_key) + expected = frame.sort_values(by="A", key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + 
sorted_df.sort_values(by=1, axis=1, inplace=True, key=sort_by_key) + expected = frame.sort_values(by=1, axis=1, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by="A", ascending=False, inplace=True, key=sort_by_key) + expected = frame.sort_values(by="A", ascending=False, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values( + by=["A", "B"], ascending=False, inplace=True, key=sort_by_key + ) + expected = frame.sort_values(by=["A", "B"], ascending=False, key=sort_by_key) + tm.assert_frame_equal(sorted_df, expected) + + def test_sort_values_key(self): + df = DataFrame(np.array([0, 5, np.nan, 3, 2, np.nan])) + + result = df.sort_values(0) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(0, key=lambda x: x + 5) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(0, key=lambda x: -x, ascending=False) + expected = df.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_by_key(self): + df = DataFrame( + { + "a": np.array([0, 3, np.nan, 3, 2, np.nan]), + "b": np.array([0, 2, np.nan, 5, 2, np.nan]), + } + ) + + result = df.sort_values("a", key=lambda x: -x) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=lambda x: -x) + expected = df.iloc[[3, 1, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=lambda x: -x, ascending=False) + expected = df.iloc[[0, 4, 1, 3, 2, 5]] + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_nan(self): + df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) + + result = df.sort_values(1) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values([0, 1], key=lambda col: 
col.str.lower()) + tm.assert_frame_equal(result, df) + + result = df.sort_values( + [0, 1], key=lambda col: col.str.lower(), ascending=False + ) + expected = df.sort_values(1, key=lambda col: col.str.lower(), ascending=False) + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_empty(self, sort_by_key): + df = DataFrame(np.array([])) + + df.sort_values(0, key=sort_by_key) + df.sort_index(key=sort_by_key) + + def test_changes_length_raises(self): + df = pd.DataFrame({"A": [1, 2, 3]}) + with pytest.raises(ValueError, match="change the shape"): + df.sort_values("A", key=lambda x: x[:1]) diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 4bec0f429a34e..b8d6e199a4a0b 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -1,5 +1,6 @@ import numpy as np from numpy.random import randn +import pytest from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm @@ -28,7 +29,8 @@ def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): expected = df.reindex(columns=df.columns[:3]) tm.assert_frame_equal(result, expected) - def test_frame_getitem_not_sorted2(self): + @pytest.mark.parametrize("key", [None, lambda x: x]) + def test_frame_getitem_not_sorted2(self, key): # 13431 df = DataFrame( { @@ -47,15 +49,37 @@ def test_frame_getitem_not_sorted2(self): assert not df2.index.is_monotonic assert df2_original.index.equals(df2.index) - expected = df2.sort_index() + expected = df2.sort_index(key=key) assert expected.index.is_lexsorted() assert expected.index.is_monotonic - result = df2.sort_index(level=0) + result = df2.sort_index(level=0, key=key) assert result.index.is_lexsorted() assert result.index.is_monotonic tm.assert_frame_equal(result, expected) + def test_sort_values_key(self, multiindex_dataframe_random_data): + arrays = [ + ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], + ["one", 
"two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + index = index.sort_values( + key=lambda x: x.map(lambda row: (row[0][2], row[1][2])) + ) + result = DataFrame(range(8), index=index) + + arrays = [ + ["foo", "foo", "bar", "bar", "qux", "qux", "baz", "baz"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + expected = DataFrame(range(8), index=index) + + tm.assert_frame_equal(result, expected) + def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data df = frame.T diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index 6fa4eeaee34c0..f43f72d292c67 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -166,3 +166,49 @@ def test_sort_index_ignore_index( tm.assert_series_equal(result_ser, expected) tm.assert_series_equal(ser, Series(original_list)) + + +class TestSeriesSortIndexKey: + def test_sort_index_multiindex_key(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + result = s.sort_index(key=lambda x: x.get_level_values(2)) + tm.assert_series_equal(backwards, result) + + result = s.sort_index(key=lambda x: x.get_level_values(1)) # nothing happens + tm.assert_series_equal(s, result) + + def test_sort_index_key(self): + series = Series(np.arange(6, dtype="int64"), index=list("aaBBca")) + + result = series.sort_index() + expected = series.iloc[[2, 3, 0, 1, 5, 4]] + tm.assert_series_equal(result, expected) + + result = series.sort_index(key=lambda x: x.str.lower()) + expected = series.iloc[[0, 1, 5, 2, 3, 4]] + tm.assert_series_equal(result, expected) + + result = series.sort_index(key=lambda x: x.str.lower(), ascending=False) + expected = series.iloc[[4, 2, 
3, 0, 1, 5]] + tm.assert_series_equal(result, expected) + + def test_sort_index_key_int(self): + series = Series(np.arange(6, dtype="int64"), index=np.arange(6, dtype="int64")) + + result = series.sort_index() + tm.assert_series_equal(result, series) + + result = series.sort_index(key=lambda x: -x) + expected = series.sort_index(ascending=False) + tm.assert_series_equal(result, expected) + + result = series.sort_index(key=lambda x: 2 * x) + tm.assert_series_equal(result, series) + + def test_changes_length_raises(self): + s = Series([1, 2, 3]) + with pytest.raises(ValueError, match="change the shape"): + s.sort_index(key=lambda x: x[:1]) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index caa2abd61af6a..900e9e9700e29 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -181,3 +181,77 @@ def test_sort_values_ignore_index( tm.assert_series_equal(result_ser, expected) tm.assert_series_equal(ser, Series(original_list)) + + +class TestSeriesSortingKey: + def test_sort_values_key(self): + series = Series(np.array(["Hello", "goodbye"])) + + result = series.sort_values(0) + expected = series + tm.assert_series_equal(result, expected) + + # TODO: let key=Series.str.upper work + result = series.sort_values(0, key=lambda x: x.str.lower()) + expected = series[::-1] + tm.assert_series_equal(result, expected) + + def test_sort_values_key_nan(self): + series = Series(np.array([0, 5, np.nan, 3, 2, np.nan])) + + result = series.sort_values(0) + expected = series.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_series_equal(result, expected) + + result = series.sort_values(0, key=lambda x: x + 5) + expected = series.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_series_equal(result, expected) + + result = series.sort_values(0, key=lambda x: -x, ascending=False) + expected = series.iloc[[0, 4, 3, 1, 2, 5]] + tm.assert_series_equal(result, expected) + + def 
test_sort_index_kind_key(self, sort_by_key): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(kind="mergesort", key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="quicksort", key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="heapsort", key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_kind_neg_key(self): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[4, 3, 3, 2, 1], dtype=object) + + index_sorted_series = series.sort_index(kind="mergesort", key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="quicksort", key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="heapsort", key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_na_position_key(self, sort_by_key): + series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(na_position="first", key=sort_by_key) + tm.assert_series_equal(expected_series_first, index_sorted_series) + + expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) + + index_sorted_series = series.sort_index(na_position="last", key=sort_by_key) + tm.assert_series_equal(expected_series_last, index_sorted_series) + + def test_changes_length_raises(self): + s = Series([1, 2, 3]) + with pytest.raises(ValueError, match="change the 
shape"): + s.sort_values(key=lambda x: x[:1]) From e05462a35d3a8f4895850a91d24d5939c983efd1 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 27 Jan 2020 22:44:11 -0500 Subject: [PATCH 02/51] fixed a few small bugs --- doc/source/whatsnew/v1.0.0.rst | 4 ++-- pandas/core/sorting.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 491a425c9fc01..2ce9ff65af7b6 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -217,7 +217,7 @@ Sorting with keys We've added a ``key`` argument to the DataFrame and Series sorting methods, including :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied -to the each column of a DataFrame before sorting is performed. +to the each column of a DataFrame before sorting is performed (:issue:`27237`). .. ipython:: python @@ -234,7 +234,7 @@ Note how this is sorted with capital letters first. Now if we apply the `col.str s For more details, see examples and documentation in :meth:`DataFrame.sort_values`, -:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index` (:issue:`27237`) +:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`. .. 
_whatsnew_100.to_markdown: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d70fc41310914..e30ed5fb2cdad 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -289,8 +289,6 @@ def nargsort( items, kind=kind, ascending=ascending, na_position=na_position, key=None ) - # breakpoint() - items = extract_array(items) mask = np.asarray(isna(items)) From 0f33c5c61d6400ce9cf2d3f3e23d0db05a831bb6 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 27 Jan 2020 23:23:00 -0500 Subject: [PATCH 03/51] bug fixes --- pandas/core/frame.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 55dadc8fe2aab..cdbd9dfefa04a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -20,7 +20,6 @@ IO, TYPE_CHECKING, Any, - Callable, Dict, FrozenSet, Hashable, From b7d76cdc577dafc5864c7bb58e0a9f5fd0314196 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 27 Jan 2020 23:50:05 -0500 Subject: [PATCH 04/51] fixed --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 4bc1908409c87..5f262e9e670b4 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -185,7 +185,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): # instead of first. 
sorted_values = np.sort(idx.asi8) - freq = self.freq + freq = idx.freq if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 From 8343f7696ac7c3566856ef3e9963b8e5f55c7e0f Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 28 Jan 2020 00:17:04 -0500 Subject: [PATCH 05/51] fixed --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5f262e9e670b4..b2a5d8b1d18bf 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -195,7 +195,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): if not ascending: sorted_values = sorted_values[::-1] - arr = type(self._data)._simple_new( + arr = type(idx._data)._simple_new( sorted_values, dtype=self.dtype, freq=freq ) return self._simple_new(arr, name=self.name) From 94281d320dd8abfd8e95a4030e1fab6230dd1101 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 28 Jan 2020 11:01:48 -0500 Subject: [PATCH 06/51] fixed --- pandas/core/arrays/categorical.py | 2 +- pandas/core/frame.py | 14 ++++++++++---- pandas/core/generic.py | 10 ++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index dd4192937c5dc..5ca77de88f31e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1579,7 +1579,7 @@ def sort_values( before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a - ``Categorical`` and return a Categorical with the same shape as the input. + ``Categorical`` and return an object with the same shape as the input. .. 
versionadded:: 1.0.0 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cdbd9dfefa04a..68f4cbe4290b3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4933,7 +4933,7 @@ def f(vals): # TODO: Just move the sort_values doc here. @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - def sort_values( # type: ignore[override] # NOQA + def sort_values( # type: ignore[override] # NOQA # issue 27237 self, by, axis=0, @@ -4953,11 +4953,14 @@ def sort_values( # type: ignore[override] # NOQA raise ValueError( f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) - raw = key is None if len(by) > 1: from pandas.core.sorting import lexsort_indexer - keys = [self._get_label_or_level_values(x, axis=axis, raw=raw) for x in by] + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] + + if key is not None: # need to rewrap columns in Series to apply key function + keys = [Series(k) for k in keys] + indexer = lexsort_indexer( keys, orders=ascending, na_position=na_position, key=key ) @@ -4966,7 +4969,10 @@ def sort_values( # type: ignore[override] # NOQA from pandas.core.sorting import nargsort by = by[0] - k = self._get_label_or_level_values(by, axis=axis, raw=raw) + k = self._get_label_or_level_values(by, axis=axis) + + if key is not None: # need to rewrap column in Series to apply key function + k = Series(k) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 223898b4a69e8..350440c7ccd51 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1558,9 +1558,7 @@ def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: ) raise ValueError(msg) - def _get_label_or_level_values( - self, key: str, axis: int = 0, raw: bool_t = True - ) -> np.ndarray: + def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: """ Return a 1-D array of values associated with `key`, a label or level 
from the given `axis`. @@ -1579,8 +1577,6 @@ def _get_label_or_level_values( Label or level name. axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) - raw : bool, default True - Whether to unbox the array from the Series, or return the Series object Returns ------- @@ -1601,9 +1597,7 @@ def _get_label_or_level_values( if self._is_label_reference(key, axis=axis): self._check_label_or_level_ambiguity(key, axis=axis) - values = self.xs(key, axis=other_axes[0]) - if raw: - values = values._values + values = self.xs(key, axis=other_axes[0])._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values else: From c505dd9144593d0366d6089ea7ae3bb36d535b2a Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 28 Jan 2020 11:27:15 -0500 Subject: [PATCH 07/51] updated docstrings --- pandas/core/frame.py | 8 +++++--- pandas/core/generic.py | 11 +++++++++++ pandas/core/series.py | 8 +++----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 68f4cbe4290b3..f32a6c9a0232b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4958,7 +4958,9 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 keys = [self._get_label_or_level_values(x, axis=axis) for x in by] - if key is not None: # need to rewrap columns in Series to apply key function + if ( + key is not None + ): # need to rewrap columns in Series to apply key function keys = [Series(k) for k in keys] indexer = lexsort_indexer( @@ -4971,7 +4973,7 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 by = by[0] k = self._get_label_or_level_values(by, axis=axis) - if key is not None: # need to rewrap column in Series to apply key function + if key is not None: # need to rewrap column in Series to apply key function k = Series(k) if isinstance(ascending, (tuple, list)): @@ -5036,7 +5038,7 @@ def sort_index( .. 
versionadded:: 1.0.0 key : callable, optional - If not None, apply the key function to the **non-missing** values + If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect an diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 350440c7ccd51..3412575355376 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4142,6 +4142,17 @@ def sort_values( 2 B 9 9 c 0 A 2 0 a 1 A 1 1 B + + Sorting with a key function + + >>> df.sort_values(by='col4', key=lambda col : col.str.lower()) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F """ raise AbstractMethodError(self) diff --git a/pandas/core/series.py b/pandas/core/series.py index 13e45106eb742..6bf9c88d17355 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2828,12 +2828,11 @@ def sort_values( # type: ignore[override] # NOQA .. versionadded:: 1.0.0 key : callable, optional - If not None, apply the key function to the **no-missing** values + If not None, apply the key function to the series values before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a - ``Series`` or ``Index`` and return an array-like that implements - ``argsort``. + ``Series`` and return an array-like that implements ``argsort``. .. versionadded:: 1.0.0 @@ -3074,7 +3073,7 @@ def sort_index( .. versionadded:: 1.0.0 key : callable, optional - If not None, apply the key function to the **non-missing** values + If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. 
It should expect an @@ -3182,7 +3181,6 @@ def sort_index( inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) - # TODO: should ensure_key_mapped convert to an array? index = ensure_key_mapped(self.index, key) if level is not None: From ecb6910ab6efdcb12f5e003c34a467a8447fa826 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 28 Jan 2020 13:10:13 -0500 Subject: [PATCH 08/51] fixed documentation --- pandas/core/arrays/categorical.py | 11 ++++++----- pandas/core/frame.py | 8 ++++---- pandas/core/indexes/base.py | 9 ++++++--- pandas/core/indexes/datetimelike.py | 2 -- pandas/core/series.py | 2 +- pandas/core/sorting.py | 10 +++++----- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5ca77de88f31e..886f32f53cae4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1575,11 +1575,12 @@ def sort_values( 'first' puts NaNs at the beginning 'last' puts NaNs at the end key : callable, optional - Apply the key function to the values - before sorting. This is similar to the `key` argument in the - builtin :meth:`sorted` function, with the notable difference that - this `key` function should be *vectorized*. It should expect a - ``Categorical`` and return an object with the same shape as the input. + Apply the key function to the values before sorting. + This is similar to the `key` argument in the builtin + :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect + a ``Categorical`` and return an object with the same shape + as the input. .. 
versionadded:: 1.0.0 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f32a6c9a0232b..f60b667f63d19 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4958,9 +4958,8 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 keys = [self._get_label_or_level_values(x, axis=axis) for x in by] - if ( - key is not None - ): # need to rewrap columns in Series to apply key function + # need to rewrap columns in Series to apply key function + if key is not None: keys = [Series(k) for k in keys] indexer = lexsort_indexer( @@ -4973,7 +4972,8 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 by = by[0] k = self._get_label_or_level_values(by, axis=axis) - if key is not None: # need to rewrap column in Series to apply key function + # need to rewrap column in Series to apply key function + if key is not None: k = Series(k) if isinstance(ascending, (tuple, list)): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c0d1a1290ad88..643a30ce6a775 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4480,9 +4480,12 @@ def sort_values( Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. - key : callable, default None - Apply a key function to the indices before sorting, like - built-in sorted function. + key : callable, optional + If not None, apply the key function to the index values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect an + ``Index`` and return an ``Index`` of the same shape. .. 
versionadded:: 1.0.0 diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b2a5d8b1d18bf..45926ec3bf97b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -169,8 +169,6 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return sorted copy of Index. """ - assert isinstance(self, Index) - idx = ensure_key_mapped(self, key) if return_indexer: diff --git a/pandas/core/series.py b/pandas/core/series.py index 6bf9c88d17355..bb05fc170655a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2791,7 +2791,7 @@ def update(self, other) -> None: # ---------------------------------------------------------------------- # Reindexing, sorting - def sort_values( # type: ignore[override] # NOQA + def sort_values( self, axis=0, ascending=True, diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e30ed5fb2cdad..0dcec0125fc4a 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -321,14 +321,14 @@ def nargsort( def ensure_key_mapped(values, key: Optional[Callable]): """ - Applies a callable key function to elements in an Index subclass or - an ndarray. Uses index.map for index subclasses and ignores nan values - in ndarrays. + Applies a callable key function to the values function and checks + that the resulting value has the same shape. Can be called on Index + subclasses, Series, DataFrames, or ndarrays. Parameters ---------- - values : Index subclass or ndarray - key : Optional[Callable], key to be called on every index or entry in ndarray. 
+ values : Series, DataFrame, Index subclass, or ndarray + key : Optional[Callable], key to be called on the values array """ if not key: return values From 55c444e33fd9b6958a8c2741aa8966480344949d Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Wed, 29 Jan 2020 13:32:48 -0500 Subject: [PATCH 09/51] fixed --- pandas/core/series.py | 18 +++++++----------- pandas/core/strings.py | 3 --- pandas/tests/frame/methods/test_sort_values.py | 6 +----- .../tests/series/methods/test_sort_values.py | 1 - 4 files changed, 8 insertions(+), 20 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index bb05fc170655a..822e478dc5e9a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2832,7 +2832,7 @@ def sort_values( before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect a - ``Series`` and return an array-like that implements ``argsort``. + ``Series`` and return an array-like. .. versionadded:: 1.0.0 @@ -3163,16 +3163,12 @@ def sort_index( bar two 7 dtype: int64 - >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8]) - >>> s.sort_index(key=lambda x : -x) - 7 8 - 6 7 - 5 6 - 4 5 - 3 4 - 2 3 - 1 2 - 0 1 + >>> s = pd.Series([1, 2, 3, 4], index=['A', 'b', 'C', 'd']) + >>> s.sort_index(key=lambda x : x.str.lower()) + A 1 + b 2 + C 3 + d 4 dtype: int64 """ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b5c7dec9671b6..18c7504f2c2f8 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1943,9 +1943,6 @@ def _forbid_nonstring_types(func): @wraps(func) def wrapper(self, *args, **kwargs): - # TODO: support passing Series / Index here. 
- # if not isinstance(self, StringMethods): - # self = StringMethods(self) if self._inferred_dtype not in allowed_types: msg = ( f"Cannot use .str.{func_name} with values of " diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 48563ac6dd5be..e8c2f5d229a54 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -397,8 +397,6 @@ def test_sort_values_na_position_with_categories(self): tm.assert_frame_equal(result, expected) - # breakpoint() - # sort ascending with na last result = df.sort_values( by=column_name, ascending=True, na_position=na_position_last @@ -412,8 +410,6 @@ def test_sort_values_na_position_with_categories(self): index=category_indices + na_indices, ) - # breakpoint() - tm.assert_frame_equal(result, expected) # sort descending with na first @@ -541,7 +537,7 @@ def test_sort_values_nat_na_position_default(self): tm.assert_frame_equal(result, expected) -class TestDataFrameSortKey: # test key sorting (issue 27237), not yet implemented +class TestDataFrameSortKey: # test key sorting (issue 27237) def test_sort_values_inplace_key(self, sort_by_key): frame = DataFrame( np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"] diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index 900e9e9700e29..c8f20a1c1eefc 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -191,7 +191,6 @@ def test_sort_values_key(self): expected = series tm.assert_series_equal(result, expected) - # TODO: let key=Series.str.upper work result = series.sort_values(0, key=lambda x: x.str.lower()) expected = series[::-1] tm.assert_series_equal(result, expected) From d774b15eb0ffd2b1e5d00fa7d5476536698f66f7 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 11 Feb 2020 15:26:14 -0500 Subject: [PATCH 10/51] updated docs --- 
doc/source/whatsnew/v1.1.0.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 40abb8f83de2f..2e8ba6b9bcfa6 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -36,6 +36,33 @@ For example: ser["2014"] ser.loc["May 2015"] +.. _whatsnew_110.key_sorting: + +Sorting with keys +^^^^^^^^^^^^^^^^^ + +We've added a ``key`` argument to the DataFrame and Series sorting methods, including +:meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, +and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied +to the each column of a DataFrame before sorting is performed (:issue:`27237`). + +.. ipython:: python + + s = pd.Series(['C', 'a', 'B']) + s.sort_values() + s + + +Note how this is sorted with capital letters first. Now if we apply the `col.str.lower()` method, we get + +.. ipython:: python + + s.sort_values(key=lambda x: x.str.lower()) + s + +For more details, see examples and documentation in :meth:`DataFrame.sort_values`, +:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`. + .. 
_whatsnew_110.enhancements.other: Other enhancements From 64e70b4fae5d0c3fff16963cb2bb0521f1336765 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 11 Feb 2020 15:58:33 -0500 Subject: [PATCH 11/51] linting --- pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 302930f083473..b5a9b865a649e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import TYPE_CHECKING, Any, Callable, Dict, FrozenSet, Hashable, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, FrozenSet, Hashable, Optional, Union import warnings import numpy as np diff --git a/pandas/core/series.py b/pandas/core/series.py index 6e758c3ea203a..d61895b8b3ac7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -22,8 +22,8 @@ from pandas._config import get_option -from pandas._typing import IndexKeyFunc, Label, ValueKeyFunc from pandas._libs import lib, properties, reshape, tslibs +from pandas._typing import IndexKeyFunc, Label, ValueKeyFunc from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile From 03d65735ff54b47a937bcc64789c145a14252ec3 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 11 Feb 2020 16:44:31 -0500 Subject: [PATCH 12/51] fixed tests --- pandas/tests/frame/methods/test_sort_index.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index a38e9e5c6664e..6ff1b13839a89 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -331,7 +331,6 @@ def test_sort_multi_index_key(self): level=list("ba"), key=lambda x: 
x.get_level_values(0) ) - breakpoint() expected = DataFrame( {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} ) From 81c017231e43331b55b19617abad7b1dcb7eaa37 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sun, 22 Mar 2020 18:35:06 -0400 Subject: [PATCH 13/51] reformatted --- pandas/tests/frame/methods/test_sort_index.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 6ff1b13839a89..c694ded3b8564 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -325,23 +325,22 @@ def test_sort_multi_index_key(self): # GH 25775, testing that sorting by index works with a multi-index. df = DataFrame( {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} - ) + ).set_index(list("abc")) - result = df.set_index(list("abc")).sort_index( + result = df.sort_index( level=list("ba"), key=lambda x: x.get_level_values(0) ) expected = DataFrame( {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} - ) - expected = expected.set_index(list("abc")) + ).set_index(list("abc")) tm.assert_frame_equal(result, expected) - result = df.set_index(list("abc")).sort_index( + result = df.sort_index( level=list("ba"), key=lambda x: x.get_level_values(2) ) - expected = df.set_index(list("abc")) + expected = df tm.assert_frame_equal(result, expected) def test_sort_index_key(self): # issue 27237 From 6d0d7252c0e375325dfa7e14991bbb21c5398182 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sun, 22 Mar 2020 19:01:54 -0400 Subject: [PATCH 14/51] fixed linting issue --- pandas/tests/frame/methods/test_sort_index.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index c694ded3b8564..8c18a38e3f7d9 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ 
b/pandas/tests/frame/methods/test_sort_index.py @@ -327,18 +327,14 @@ def test_sort_multi_index_key(self): {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} ).set_index(list("abc")) - result = df.sort_index( - level=list("ba"), key=lambda x: x.get_level_values(0) - ) + result = df.sort_index(level=list("ba"), key=lambda x: x.get_level_values(0)) expected = DataFrame( {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} ).set_index(list("abc")) tm.assert_frame_equal(result, expected) - result = df.sort_index( - level=list("ba"), key=lambda x: x.get_level_values(2) - ) + result = df.sort_index(level=list("ba"), key=lambda x: x.get_level_values(2)) expected = df tm.assert_frame_equal(result, expected) From 0aabf56d21e8c3fe0931c31ed37d7bf65297a072 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 27 Mar 2020 18:52:19 -0400 Subject: [PATCH 15/51] fixed formatting --- pandas/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/conftest.py b/pandas/conftest.py index d593b4a546d8d..81e182e69ce30 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1145,6 +1145,7 @@ def cython_table_items(request): """ return request.param + @pytest.fixture( params=[ getattr(pd.offsets, o) @@ -1158,6 +1159,7 @@ def tick_classes(request): """ return request.param + @pytest.fixture(params=[None, lambda x: x]) def sort_by_key(request): """ From 210df50d855afc07e87df2a7fe45e5870d143e59 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 13:13:25 -0400 Subject: [PATCH 16/51] ENH: made sort_index apply the key to each level separately --- pandas/core/frame.py | 27 ++++++++++++++-- pandas/tests/frame/methods/test_sort_index.py | 31 +++++++++++++++++-- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c091ff3c22281..71c4d51047471 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4801,7 +4801,7 @@ def sort_values( # type: ignore[override] # NOQA # 
issue 27237 # need to rewrap columns in Series to apply key function if key is not None: - keys = [Series(k) for k in keys] + keys = [Series(k, name=name) for (k, name) in zip(keys, by)] indexer = lexsort_indexer( keys, orders=ascending, na_position=na_position, key=key @@ -4887,7 +4887,8 @@ def sort_index( before sorting. This is similar to the `key` argument in the builtin :meth:`sorted` function, with the notable difference that this `key` function should be *vectorized*. It should expect an - ``Index`` and return an ``Index`` of the same shape. + ``Index`` and return an ``Index`` of the same shape. For MultiIndex + inputs, the key is applied *per level*. .. versionadded:: 1.0.0 @@ -4932,7 +4933,27 @@ def sort_index( axis = self._get_axis_number(axis) labels = self._get_axis(axis) - labels = ensure_key_mapped(labels, key) + + # apply key to each level separately and create a new index + if isinstance(labels, ABCMultiIndex): + if level is not None: + if isinstance(level, str) or isinstance(level, int): + sort_levels = [level] + else: + sort_levels = level + else: + sort_levels = labels.names + + labels = MultiIndex.from_arrays( + [ + ensure_key_mapped(labels.get_level_values(level), key) + if level in sort_levels + else labels.get_level_values(level) + for level in labels.names + ] + ) + else: + labels = ensure_key_mapped(labels, key) # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 8c18a38e3f7d9..f491b582378af 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -327,16 +327,18 @@ def test_sort_multi_index_key(self): {"a": [3, 1, 2], "b": [0, 0, 0], "c": [0, 1, 2], "d": list("abc")} ).set_index(list("abc")) - result = df.sort_index(level=list("ba"), key=lambda x: x.get_level_values(0)) + result = df.sort_index(level=list("ac"), 
key=lambda x: x) expected = DataFrame( {"a": [1, 2, 3], "b": [0, 0, 0], "c": [1, 2, 0], "d": list("bca")} ).set_index(list("abc")) tm.assert_frame_equal(result, expected) - result = df.sort_index(level=list("ba"), key=lambda x: x.get_level_values(2)) + result = df.sort_index(level=list("ac"), key=lambda x: -x) + expected = DataFrame( + {"a": [3, 2, 1], "b": [0, 0, 0], "c": [0, 2, 1], "d": list("acb")} + ).set_index(list("abc")) - expected = df tm.assert_frame_equal(result, expected) def test_sort_index_key(self): # issue 27237 @@ -367,6 +369,29 @@ def test_sort_index_key_int(self): result = df.sort_index(key=lambda x: 2 * x) tm.assert_frame_equal(result, df) + def test_sort_multi_index_key_str(self): + # GH 25775, testing that sorting by index works with a multi-index. + df = DataFrame( + {"a": ["B", "a", "C"], "b": [0, 1, 0], "c": list("abc"), "d": [0, 1, 2]} + ).set_index(list("abc")) + + result = df.sort_index(level="a", key=lambda x: x.str.lower()) + + expected = DataFrame( + {"a": ["a", "B", "C"], "b": [1, 0, 0], "c": list("bac"), "d": [1, 0, 2]} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + + result = df.sort_index( + level=list("abc"), + key=lambda x: x.str.lower() if x.dtype == "object" else -x, + ) + + expected = DataFrame( + {"a": ["a", "B", "C"], "b": [1, 0, 0], "c": list("bac"), "d": [1, 0, 2]} + ).set_index(list("abc")) + tm.assert_frame_equal(result, expected) + def test_changes_length_raises(self): df = pd.DataFrame({"A": [1, 2, 3]}) with pytest.raises(ValueError, match="change the shape"): From b40a963b3d5b192f08eafd57af4dc21732bd6908 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 14:05:43 -0400 Subject: [PATCH 17/51] fixed a bug with duplicate names --- pandas/core/frame.py | 19 +-------- pandas/core/indexes/multi.py | 42 +++++++++++++++++++ pandas/tests/frame/methods/test_sort_index.py | 4 +- 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py 
index 71c4d51047471..3b42ca79b3b4f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4936,25 +4936,10 @@ def sort_index( # apply key to each level separately and create a new index if isinstance(labels, ABCMultiIndex): - if level is not None: - if isinstance(level, str) or isinstance(level, int): - sort_levels = [level] - else: - sort_levels = level - else: - sort_levels = labels.names - - labels = MultiIndex.from_arrays( - [ - ensure_key_mapped(labels.get_level_values(level), key) - if level in sort_levels - else labels.get_level_values(level) - for level in labels.names - ] - ) + labels = labels.apply_key(key, level=level) else: labels = ensure_key_mapped(labels, key) - + # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer labels = labels._sort_levels_monotonic() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b00af4653dfe3..17bd785fda1f3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2142,6 +2142,48 @@ def cats(level_codes): for level_codes in self.codes ] + def apply_key(self, key, level=None): + """ + Returns a new MultiIndex in which key has been applied + to all levels specified in level (or all levels if level + is None). Used to sort MultiIndex. + + Parameters + ---------- + key : Callable + Function that takes a Series and returns a Series of + the same shape, usually passed by sort_index. + level : list-like, int or str, default None + Level or list of levels to apply the key function to. + If None, key function is applied to all levels. Other + levels are left unchanged. + + Returns + ------- + labels : MultiIndex + Resulting MultiIndex with modified levels. 
+ """ + from pandas.core.sorting import ensure_key_mapped + + if level is not None: + if isinstance(level, (str, int)): + sort_levels = [level] + else: + sort_levels = level + else: + sort_levels = range(self.nlevels) + + sort_levels = [self._get_level_number(lev) for lev in sort_levels] + mapped = [ + ensure_key_mapped(self.get_level_values(level), key) + if level in sort_levels else self.get_level_values(level) + for level in range(self.nlevels) + ] + + labels = MultiIndex.from_arrays(mapped) + + return labels + def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ Sort MultiIndex at the requested level. The result will respect the diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index f491b582378af..645de1dd4917f 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -383,8 +383,8 @@ def test_sort_multi_index_key_str(self): tm.assert_frame_equal(result, expected) result = df.sort_index( - level=list("abc"), - key=lambda x: x.str.lower() if x.dtype == "object" else -x, + level=list("abc"), # can refer to names + key=lambda x: x.str.lower() if x.name in ['a', 'c'] else -x, ) expected = DataFrame( From 90e2cfed9d467d50dbe020e4804ca657bb69e808 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 14:38:10 -0400 Subject: [PATCH 18/51] fixed strange bug with duplicate column names --- pandas/core/frame.py | 2 ++ pandas/core/indexes/multi.py | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3b42ca79b3b4f..f45a6cd5d1974 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4934,6 +4934,8 @@ def sort_index( axis = self._get_axis_number(axis) labels = self._get_axis(axis) + # breakpoint() + # apply key to each level separately and create a new index if isinstance(labels, ABCMultiIndex): labels = labels.apply_key(key, level=level) diff --git 
a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 17bd785fda1f3..f709cf68ea8a7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2170,13 +2170,14 @@ def apply_key(self, key, level=None): sort_levels = [level] else: sort_levels = level + + sort_levels = [self._get_level_number(lev) for lev in sort_levels] else: sort_levels = range(self.nlevels) - sort_levels = [self._get_level_number(lev) for lev in sort_levels] mapped = [ - ensure_key_mapped(self.get_level_values(level), key) - if level in sort_levels else self.get_level_values(level) + ensure_key_mapped(self._get_level_values(level), key) + if level in sort_levels else self._get_level_values(level) for level in range(self.nlevels) ] From 447c48f63af933dbbb1e2bd43a75ffaf3726491d Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 18:05:36 -0400 Subject: [PATCH 19/51] fixed bug --- pandas/core/frame.py | 2 -- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/period.py | 2 -- pandas/core/sorting.py | 4 ++-- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f45a6cd5d1974..3b42ca79b3b4f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4934,8 +4934,6 @@ def sort_index( axis = self._get_axis_number(axis) labels = self._get_axis(axis) - # breakpoint() - # apply key to each level separately and create a new index if isinstance(labels, ABCMultiIndex): labels = labels.apply_key(key, level=level) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f709cf68ea8a7..6395b1bbbf6fc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2180,7 +2180,7 @@ def apply_key(self, key, level=None): if level in sort_levels else self._get_level_values(level) for level in range(self.nlevels) ] - + labels = MultiIndex.from_arrays(mapped) return labels diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 
68d7e8dd384f0..873a8df343432 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -252,12 +252,10 @@ def _has_complex_internals(self): def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name - cache = self._cache.copy() if values is None else {} if values is None: values = self._data result = self._simple_new(values, name=name) - result._cache = cache return result def _maybe_convert_timedelta(self, other): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 93a508df0102c..de2997cdad3d8 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -331,9 +331,9 @@ def ensure_key_mapped(values, key: Optional[Callable]): key : Optional[Callable], key to be called on the values array """ if not key: - return values + return values.copy() - result = key(values) + result = key(values.copy()) if len(result) != len(values): raise ValueError( "User-provided `key` function much not change the shape of the array." 
From 46171f067b905ad3a128814ff2f56b39bc1834cd Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 18:06:24 -0400 Subject: [PATCH 20/51] fixed linting --- pandas/core/indexes/multi.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6395b1bbbf6fc..d184f1b03029d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2177,10 +2177,11 @@ def apply_key(self, key, level=None): mapped = [ ensure_key_mapped(self._get_level_values(level), key) - if level in sort_levels else self._get_level_values(level) + if level in sort_levels + else self._get_level_values(level) for level in range(self.nlevels) ] - + labels = MultiIndex.from_arrays(mapped) return labels From a44a99966ec5b12f8eb7e291b5cbd8a42535a7b0 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 18:40:04 -0400 Subject: [PATCH 21/51] fixed linting issues --- pandas/core/frame.py | 2 +- pandas/core/indexes/multi.py | 4 ++-- pandas/tests/frame/methods/test_sort_index.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3b42ca79b3b4f..e4455e8e50fa4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4939,7 +4939,7 @@ def sort_index( labels = labels.apply_key(key, level=level) else: labels = ensure_key_mapped(labels, key) - + # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer labels = labels._sort_levels_monotonic() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d184f1b03029d..32e3be857ad2b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2145,13 +2145,13 @@ def cats(level_codes): def apply_key(self, key, level=None): """ Returns a new MultiIndex in which key has been applied - to all levels specified in level (or all levels if level + to all levels specified in level (or all levels 
if level is None). Used to sort MultiIndex. Parameters ---------- key : Callable - Function that takes a Series and returns a Series of + Function that takes a Series and returns a Series of the same shape, usually passed by sort_index. level : list-like, int or str, default None Level or list of levels to apply the key function to. diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 645de1dd4917f..c5a7de174eee4 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -383,7 +383,7 @@ def test_sort_multi_index_key_str(self): tm.assert_frame_equal(result, expected) result = df.sort_index( - level=list("abc"), # can refer to names + level=list("abc"), # can refer to names key=lambda x: x.str.lower() if x.name in ['a', 'c'] else -x, ) From 94b795c07e77beca49633a24dbfc168fa65c9d43 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 19:12:06 -0400 Subject: [PATCH 22/51] disabled tests temporarily --- pandas/tests/indexes/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 964cf320a422b..5cc233160b566 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -867,6 +867,7 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError): {} in idx._engine + @pytest.mark.skip("currently causes bugs (see #32669)") def test_copy_copies_cache(self): # GH32898 idx = self.create_index() @@ -880,6 +881,7 @@ def test_copy_copies_cache(self): for key, val in idx._cache.items(): assert copy._cache[key] is val, key + @pytest.mark.skip("currently causes bugs (see #32669)") def test_shallow_copy_copies_cache(self): # GH32669 idx = self.create_index() From 6e651c0c83bc618e058d85e3fb6db7277fa502a0 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sat, 28 Mar 2020 19:30:55 -0400 Subject: [PATCH 23/51] fixed linting --- 
pandas/tests/frame/methods/test_sort_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index c5a7de174eee4..04d912109d8b8 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -384,7 +384,7 @@ def test_sort_multi_index_key_str(self): result = df.sort_index( level=list("abc"), # can refer to names - key=lambda x: x.str.lower() if x.name in ['a', 'c'] else -x, + key=lambda x: x.str.lower() if x.name in ["a", "c"] else -x, ) expected = DataFrame( From fbdfc1e20c9dce7b842c38c2f9cc9072ef049c06 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 31 Mar 2020 13:43:30 -0400 Subject: [PATCH 24/51] reverted changes due to 33134 --- pandas/core/indexes/period.py | 2 ++ pandas/tests/indexes/common.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 66fde1a415b72..8aaf828787179 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -253,10 +253,12 @@ def _has_complex_internals(self): def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name + cache = self._cache.copy() if values is None else {} if values is None: values = self._data result = self._simple_new(values, name=name) + result._cache = cache return result def _maybe_convert_timedelta(self, other): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 5cc233160b566..964cf320a422b 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -867,7 +867,6 @@ def test_contains_requires_hashable_raises(self): with pytest.raises(TypeError): {} in idx._engine - @pytest.mark.skip("currently causes bugs (see #32669)") def test_copy_copies_cache(self): # GH32898 idx = self.create_index() @@ -881,7 +880,6 @@ def 
test_copy_copies_cache(self): for key, val in idx._cache.items(): assert copy._cache[key] is val, key - @pytest.mark.skip("currently causes bugs (see #32669)") def test_shallow_copy_copies_cache(self): # GH32669 idx = self.create_index() From c56dbd6c7b9a095a69803366346d244ec41ca7ff Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Wed, 1 Apr 2020 01:21:05 -0400 Subject: [PATCH 25/51] updated documentation --- pandas/core/arrays/categorical.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 4 ++-- pandas/core/indexes/base.py | 4 ++-- pandas/core/indexes/multi.py | 8 +++++--- pandas/core/series.py | 4 ++-- 6 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e45ead137ade7..58b69ce39ff82 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1575,7 +1575,7 @@ def sort_values( a ``Categorical`` and return an object with the same shape as the input. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Returns ------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0459f50d3b338..bbfab31d6eb8f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4885,7 +4885,7 @@ def sort_index( ``Index`` and return an ``Index`` of the same shape. For MultiIndex inputs, the key is applied *per level*. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Returns ------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ddf1de25827d..e8f09218d1e02 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4131,7 +4131,7 @@ def sort_values( ``Series`` and return a Series with the same shape as the input. It will be applied to each column in `by` independently. - .. versionadded:: 1.0.0 + .. 
versionadded:: 1.1.0 Returns ------- @@ -8668,7 +8668,7 @@ def _where( self._check_inplace_setting(other) new_data = self._data.putmask( - mask=cond, new=other, align=align, axis=block_axis, + mask=cond, new=other, align=align, axis=block_axis ) self._update_inplace(new_data) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 55d328d5f6cf9..0b08593bb66cb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -281,7 +281,7 @@ def _outer_indexer(self, left, right): # Constructors def __new__( - cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs, + cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs ) -> "Index": from pandas.core.indexes.range import RangeIndex @@ -4372,7 +4372,7 @@ def sort_values( this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Returns ------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 93d00a4cd993f..da148b2b7758e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2147,13 +2147,15 @@ def apply_key(self, key, level=None): """ Returns a new MultiIndex in which key has been applied to all levels specified in level (or all levels if level - is None). Used to sort MultiIndex. + is None). Used for key sorting for MultiIndex. Parameters ---------- key : Callable - Function that takes a Series and returns a Series of - the same shape, usually passed by sort_index. + Function that takes an Index and returns an Index of + the same shape. This key is applied to each level + separately. The name of the level can be used to + distinguish different levels for application. level : list-like, int or str, default None Level or list of levels to apply the key function to. If None, key function is applied to all levels. 
Other diff --git a/pandas/core/series.py b/pandas/core/series.py index e7fcc6a5fa0d6..96be2421ca0b2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2858,7 +2858,7 @@ def sort_values( this `key` function should be *vectorized*. It should expect a ``Series`` and return an array-like. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Returns ------- @@ -3104,7 +3104,7 @@ def sort_index( this `key` function should be *vectorized*. It should expect an ``Index`` and return an ``Index`` of the same shape. - .. versionadded:: 1.0.0 + .. versionadded:: 1.1.0 Returns ------- From 620f57a51e9f1291cf1b306e4f64ef55072f5824 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 7 Apr 2020 12:54:57 -0400 Subject: [PATCH 26/51] updated docs --- doc/source/whatsnew/v1.1.0.rst | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7a80b7077a21e..62d6485e6cea3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -50,18 +50,27 @@ to the each column of a DataFrame before sorting is performed (:issue:`27237`). s = pd.Series(['C', 'a', 'B']) s.sort_values() - s -Note how this is sorted with capital letters first. Now if we apply the `col.str.lower()` method, we get +Note how this is sorted with capital letters first. If we apply the `ser.str.lower()` method, we get .. ipython:: python s.sort_values(key=lambda x: x.str.lower()) - s -For more details, see examples and documentation in :meth:`DataFrame.sort_values`, -:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`. + +When applied to a `DataFrame`, the key is applied per-column to all columns or a subset if +`by` is specified, e.g. + +..
ipython:: python + + df = pd.DataFrame({'a' : ['C', 'C', 'a', 'a', 'B', 'B'], 'b' : [1, 2, 3, 4, 5, 6]}) + df.sort_values(by=['a', 'b'], key=lambda col : col.str.lower() if col.name == 'a' else -col) + + +For :meth:`DataFrame.sort_index` with `MultiIndex`, the key function is applied per level. For +more details, see examples and documentation in :meth:`DataFrame.sort_values`, :meth:`Series.sort_values`, +and :meth:`~DataFrame.sort_index`. .. _whatsnew_110.timestamp_fold_support: From 6a5bc324a40c116a625cfc41454413abcad4e2a9 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 7 Apr 2020 13:52:08 -0400 Subject: [PATCH 27/51] fixed linting issue --- doc/source/whatsnew/v1.1.0.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 62d6485e6cea3..b14ca40953cef 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -64,8 +64,10 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or .. ipython:: python - df = pd.DataFrame({'a' : ['C', 'C', 'a', 'a', 'B', 'B'], 'b' : [1, 2, 3, 4, 5, 6]}) - df.sort_values(by=['a', 'b'], key=lambda col : col.str.lower() if col.name == 'a' else -col) + df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], + 'b': [1, 2, 3, 4, 5, 6]}) + df.sort_values(by=['a', 'b'], key=lambda col: col.str.lower() + if col.name == 'a' else -col) For :meth:`DataFrame.sort_index` with `MultiIndex`, the key function is applied per level. 
For From 5b244fbaa4237354b4e1193800e5b52c17aa17e7 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 7 Apr 2020 15:55:02 -0400 Subject: [PATCH 28/51] try to recover from invalid type in output --- pandas/core/sorting.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index de2997cdad3d8..b340a0950952b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -333,11 +333,19 @@ def ensure_key_mapped(values, key: Optional[Callable]): if not key: return values.copy() + _class = values.__class__ result = key(values.copy()) if len(result) != len(values): raise ValueError( "User-provided `key` function much not change the shape of the array." ) + + if not isinstance(result, _class): # recover from type error + try: + result = _class(result) + except TypeError: + raise TypeError("User-provided `key` function returned an invalid type.") + return result From 6f15e6650ea12c66c67cc172d99ea431bc5472ad Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 7 Apr 2020 16:12:00 -0400 Subject: [PATCH 29/51] fixed linting issue --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b340a0950952b..3959dab7b94f6 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -333,7 +333,7 @@ def ensure_key_mapped(values, key: Optional[Callable]): if not key: return values.copy() - _class = values.__class__ + _class = type(values) result = key(values.copy()) if len(result) != len(values): raise ValueError( From 7d2037b49a4d77e6973a343d5345ba054a39c802 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Tue, 7 Apr 2020 16:49:31 -0400 Subject: [PATCH 30/51] added more tests --- pandas/core/series.py | 5 ++++- pandas/tests/series/methods/test_sort_index.py | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index c07bf219d3ea7..b66cc0c5f118b 100644 
--- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3210,7 +3210,10 @@ def sort_index( inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) - index = ensure_key_mapped(self.index, key) + if isinstance(self.index, ABCMultiIndex): + index = self.index.apply_key(key, level=level) + else: + index = ensure_key_mapped(self.index, key) if level is not None: new_index, indexer = index.sortlevel( diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index 9405eb09f74e8..e11218f239717 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -178,12 +178,23 @@ def test_sort_index_multiindex_key(self): s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] - result = s.sort_index(key=lambda x: x.get_level_values(2)) + result = s.sort_index(level="C", key=lambda x: -x) + tm.assert_series_equal(s, result) + + result = s.sort_index(level="C", key=lambda x: x) # nothing happens tm.assert_series_equal(backwards, result) - result = s.sort_index(key=lambda x: x.get_level_values(1)) # nothing happens + def test_sort_index_multiindex_key_multi_level(self): + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list("ABC")) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + result = s.sort_index(level=["A", "C"], key=lambda x: -x) tm.assert_series_equal(s, result) + result = s.sort_index(level=["A", "C"], key=lambda x: x) # nothing happens + tm.assert_series_equal(backwards, result) + def test_sort_index_key(self): series = Series(np.arange(6, dtype="int64"), index=list("aaBBca")) From 5048944e1fa16e7de28d5a8fabe48500aaf522eb Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Wed, 8 Apr 2020 01:32:11 -0400 Subject: [PATCH 31/51] added some more tests --- pandas/core/frame.py | 2 +- .../tests/frame/methods/test_sort_values.py | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff 
--git a/pandas/core/frame.py b/pandas/core/frame.py index d563f6ee63ded..5a5796936105d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4801,7 +4801,7 @@ def sort_values( # type: ignore[override] # NOQA # issue 27237 # need to rewrap column in Series to apply key function if key is not None: - k = Series(k) + k = Series(k, name=by) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index bfd53684fd428..f9a3666d4a1d0 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -600,6 +600,36 @@ def test_sort_values_by_key(self): expected = df.iloc[[0, 4, 1, 3, 2, 5]] tm.assert_frame_equal(result, expected) + def test_sort_values_by_key_by_name(self): + df = DataFrame( + { + "a": np.array([0, 3, np.nan, 3, 2, np.nan]), + "b": np.array([0, 2, np.nan, 5, 2, np.nan]), + } + ) + + def key(col): + if col.name == "a": + return -col + else: + return col + + result = df.sort_values(by="a", key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a"], key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by="b", key=key) + expected = df.iloc[[0, 1, 4, 3, 2, 5]] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(by=["a", "b"], key=key) + expected = df.iloc[[1, 3, 4, 0, 2, 5]] + tm.assert_frame_equal(result, expected) + def test_sort_values_key_nan(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) From bc44d0dc5d1e94729f31c3241248b849d3faaf47 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 17:39:47 -0400 Subject: [PATCH 32/51] major documentation additions, removed key for Categorical --- doc/source/user_guide/basics.rst | 33 ++++++++++++++++++++++++++++++- doc/source/whatsnew/v1.1.0.rst | 4 
+++- pandas/core/arrays/categorical.py | 21 +++----------------- pandas/core/frame.py | 13 ++++++++++++ pandas/core/generic.py | 14 +++++++++---- pandas/core/series.py | 2 ++ pandas/core/sorting.py | 1 - 7 files changed, 63 insertions(+), 25 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 57028a3faa047..9f4616ae717bc 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1781,6 +1781,23 @@ used to sort a pandas object by its index levels. # Series unsorted_df['three'].sort_index() +.. versionadded:: 1.1.0 +.. _basics.sort_index_key: + +Sorting by index also supports a ``key`` parameter that takes a callable +function to apply to the index being sorted. for `MultiIndex` objects, +the key is applied per-level to the levels specified by `level`. + +.. ipython:: python + + s1 = pd.DataFrame({ + "a": ['B', 'a', 'C'], + "b": [1, 2, 3], + "c": [2, 3, 4] + }).set_index(list("ab")) + s1.sort_index(level="a") + s1.sort_index(level="a", key=lambda idx: idx.str.lower()) + .. _basics.sort_values: By values @@ -1813,6 +1830,9 @@ argument: s.sort_values() s.sort_values(na_position='first') +.. versionadded:: 1.1.0 +.. _basics.sort_value_key: + Sorting also supports a ``key`` parameter that takes a callable function to apply to the values being sorted. @@ -1823,7 +1843,18 @@ to apply to the values being sorted. s1.sort_values(key=lambda x: x.str.lower()) `key` will be given the :class:`Series` of values and should return a ``Series`` -or array of the same shape with the transformed values. +or array of the same shape with the transformed values. For `DataFrame` objects, +the key is applied per column, so the key should still expect a Series and return +a Series, e.g. + +.. 
ipython:: python + + df = pd.DataFrame({"a" : ['B', 'a', 'C'], "b" : [1, 2, 3]) + df.sort_values(by='a') + df.sort_values(by='a', key=lambda col : col.str.lower()) + +The name or type of each column can be used to apply different functions to +different columns. .. _basics.sort_indexes_and_values: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f447fa12bd1c4..c36eb8633a15d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -44,7 +44,9 @@ Sorting with keys We've added a ``key`` argument to the DataFrame and Series sorting methods, including :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied -to the each column of a DataFrame before sorting is performed (:issue:`27237`). +to the each column of a DataFrame before sorting is performed (:issue:`27237`). See +:ref:`sort_values with keys ` and :ref:`sort_index with keys +` for more information. .. 
ipython:: python diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0d4a097486ea8..7274c3d80a93c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,6 +1,6 @@ import operator from shutil import get_terminal_size -from typing import Callable, Dict, Hashable, List, Optional, Type, Union, cast +from typing import Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np @@ -1533,11 +1533,7 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs): return super().argsort(ascending=ascending, kind=kind, **kwargs) def sort_values( - self, - inplace=False, - ascending=True, - na_position="last", - key: Optional[Callable] = None, + self, inplace: bool = False, ascending: bool = True, na_position: str = "last", ): """ Sort the Categorical by category value returning a new @@ -1560,15 +1556,6 @@ def sort_values( na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end - key : callable, optional - Apply the key function to the values before sorting. - This is similar to the `key` argument in the builtin - :meth:`sorted` function, with the notable difference that - this `key` function should be *vectorized*. It should expect - a ``Categorical`` and return an object with the same shape - as the input. - - .. 
versionadded:: 1.1.0 Returns ------- @@ -1625,9 +1612,7 @@ def sort_values( if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {repr(na_position)}") - sorted_idx = nargsort( - self, ascending=ascending, na_position=na_position, key=key - ) + sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: self._codes = self._codes[sorted_idx] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 886f4181134cc..ed748b57d753a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5093,6 +5093,19 @@ def sort_index( 100 1 29 2 1 4 + + A key function can be specified which is applied to the index before + sorting. For a ``MultiIndex`` this is applied to each level separately. + + Apply a key function before sorting + + >>> df = pd.DataFrame({"a" : [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) + >>> df.sort_index(key=lambda x : x.str.lower()) + a + A 1 + b 2 + C 3 + d 4 """ # TODO: this can be combined with Series.sort_index impl as # almost identical diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d0e46f1d89e9c..f24449caf1ceb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5603,7 +5603,7 @@ def astype( else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) return self._constructor(new_data).__finalize__(self, method="astype") # GH 19920: retain column metadata after concat @@ -8454,7 +8454,10 @@ def _align_frame( left.index = join_index right.index = join_index - return (left.__finalize__(self), right.__finalize__(other)) + return ( + left.__finalize__(self), + right.__finalize__(other), + ) def _align_series( self, @@ -8538,7 +8541,10 @@ def _align_series( left.index = join_index right.index = join_index - return (left.__finalize__(self), right.__finalize__(other)) + return ( + left.__finalize__(self), + right.__finalize__(other), + ) def 
_where( self, @@ -8664,7 +8670,7 @@ def _where( self._check_inplace_setting(other) new_data = self._mgr.putmask( - mask=cond, new=other, align=align, axis=block_axis + mask=cond, new=other, align=align, axis=block_axis, ) result = self._constructor(new_data) return self._update_inplace(result) diff --git a/pandas/core/series.py b/pandas/core/series.py index f5428181fdf65..e308946c05bea 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3227,6 +3227,8 @@ def sort_index( bar two 7 dtype: int64 + Apply a key function before sorting + >>> s = pd.Series([1, 2, 3, 4], index=['A', 'b', 'C', 'd']) >>> s.sort_index(key=lambda x : x.str.lower()) A 1 diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 3959dab7b94f6..73ed37b1bc81d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -266,7 +266,6 @@ def nargsort( ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, - # raw: bool = True, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. From 07d903cfc900d3a398cfb70cb7c8c0231d7fbf7a Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 17:49:39 -0400 Subject: [PATCH 33/51] doc linting issue --- doc/source/user_guide/basics.rst | 6 ++++-- pandas/core/frame.py | 6 ++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 9f4616ae717bc..2d9ca35930be0 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1781,9 +1781,10 @@ used to sort a pandas object by its index levels. # Series unsorted_df['three'].sort_index() -.. versionadded:: 1.1.0 .. _basics.sort_index_key: +.. versionadded:: 1.1.0 + Sorting by index also supports a ``key`` parameter that takes a callable function to apply to the index being sorted. for `MultiIndex` objects, the key is applied per-level to the levels specified by `level`. 
@@ -1830,9 +1831,10 @@ argument: s.sort_values() s.sort_values(na_position='first') -.. versionadded:: 1.1.0 .. _basics.sort_value_key: +.. versionadded:: 1.1.0 + Sorting also supports a ``key`` parameter that takes a callable function to apply to the values being sorted. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ed748b57d753a..670e39f48533a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5097,10 +5097,8 @@ def sort_index( A key function can be specified which is applied to the index before sorting. For a ``MultiIndex`` this is applied to each level separately. - Apply a key function before sorting - - >>> df = pd.DataFrame({"a" : [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) - >>> df.sort_index(key=lambda x : x.str.lower()) + >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) + >>> df.sort_index(key=lambda x: x.str.lower()) a A 1 b 2 From ecdbf4cba6a045b8df9353cb21e043cddb0a8baf Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 18:19:12 -0400 Subject: [PATCH 34/51] another linting fix --- doc/source/user_guide/basics.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2d9ca35930be0..bd448926706e4 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1792,9 +1792,9 @@ the key is applied per-level to the levels specified by `level`. .. 
ipython:: python s1 = pd.DataFrame({ - "a": ['B', 'a', 'C'], - "b": [1, 2, 3], - "c": [2, 3, 4] + "a": ['B', 'a', 'C'], + "b": [1, 2, 3], + "c": [2, 3, 4] }).set_index(list("ab")) s1.sort_index(level="a") s1.sort_index(level="a", key=lambda idx: idx.str.lower()) From c376a740b0f28c5e40c132572f9e1ece979be43e Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 19:55:01 -0400 Subject: [PATCH 35/51] fixed linting actually --- doc/source/user_guide/basics.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index bd448926706e4..18b0084a3a01a 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1792,9 +1792,9 @@ the key is applied per-level to the levels specified by `level`. .. ipython:: python s1 = pd.DataFrame({ - "a": ['B', 'a', 'C'], - "b": [1, 2, 3], - "c": [2, 3, 4] + "a": ['B', 'a', 'C'], + "b": [1, 2, 3], + "c": [2, 3, 4] }).set_index(list("ab")) s1.sort_index(level="a") s1.sort_index(level="a", key=lambda idx: idx.str.lower()) From f5e58089877859ee002ebd0d926e3acf252e913d Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 20:27:37 -0400 Subject: [PATCH 36/51] moved apply_key to sorting.py --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 7 +-- pandas/core/indexes/multi.py | 46 ------------------ pandas/core/series.py | 5 +- pandas/core/sorting.py | 87 ++++++++++++++++++++++++++++------ 5 files changed, 75 insertions(+), 72 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c36eb8633a15d..0a86672d741ac 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -68,7 +68,7 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], 'b': [1, 2, 3, 4, 5, 6]}) - df.sort_values(by=['a', 'b'], key=lambda col: col.str.lower() + df.sort_values(by=['a', 
'b'], key=lambda col: col.str.lower() \ if col.name == 'a' else -col) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 670e39f48533a..4fe67eb5c4af6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5112,12 +5112,7 @@ def sort_index( axis = self._get_axis_number(axis) labels = self._get_axis(axis) - - # apply key to each level separately and create a new index - if isinstance(labels, ABCMultiIndex): - labels = labels.apply_key(key, level=level) - else: - labels = ensure_key_mapped(labels, key) + labels = ensure_key_mapped(labels, key, levels=level) # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 39e1f1c1eb467..42e0d228dab09 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2190,52 +2190,6 @@ def cats(level_codes): for level_codes in self.codes ] - def apply_key(self, key, level=None): - """ - Returns a new MultiIndex in which key has been applied - to all levels specified in level (or all levels if level - is None). Used for key sorting for MultiIndex. - - Parameters - ---------- - key : Callable - Function that takes an Index and returns an Index of - the same shape. This key is applied to each level - separately. The name of the level can be used to - distinguish different levels for application. - level : list-like, int or str, default None - Level or list of levels to apply the key function to. - If None, key function is applied to all levels. Other - levels are left unchanged. - - Returns - ------- - labels : MultiIndex - Resulting MultiIndex with modified levels. 
- """ - from pandas.core.sorting import ensure_key_mapped - - if level is not None: - if isinstance(level, (str, int)): - sort_levels = [level] - else: - sort_levels = level - - sort_levels = [self._get_level_number(lev) for lev in sort_levels] - else: - sort_levels = range(self.nlevels) - - mapped = [ - ensure_key_mapped(self._get_level_values(level), key) - if level in sort_levels - else self._get_level_values(level) - for level in range(self.nlevels) - ] - - labels = MultiIndex.from_arrays(mapped) - - return labels - def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ Sort MultiIndex at the requested level. The result will respect the diff --git a/pandas/core/series.py b/pandas/core/series.py index e308946c05bea..c77301ae03321 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3243,10 +3243,7 @@ def sort_index( inplace = validate_bool_kwarg(inplace, "inplace") # Validate the axis parameter self._get_axis_number(axis) - if isinstance(self.index, ABCMultiIndex): - index = self.index.apply_key(key, level=level) - else: - index = ensure_key_mapped(self.index, key) + index = ensure_key_mapped(self.index, key, levels=level) if level is not None: new_index, indexer = index.sortlevel( diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 73ed37b1bc81d..7f7517589b7cc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -12,6 +12,8 @@ is_categorical_dtype, is_extension_array_dtype, ) + +from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algorithms @@ -318,7 +320,57 @@ def nargsort( return indexer -def ensure_key_mapped(values, key: Optional[Callable]): +def apply_key(index, key, level=None): + """ + Returns a new MultiIndex in which key has been applied + to all levels specified in level (or all levels if level + is None). Used for key sorting for MultiIndex. 
+ + Parameters + ---------- + index : MultiIndex + Index to which to apply the key function on the + specified levels. + key : Callable + Function that takes an Index and returns an Index of + the same shape. This key is applied to each level + separately. The name of the level can be used to + distinguish different levels for application. + level : list-like, int or str, default None + Level or list of levels to apply the key function to. + If None, key function is applied to all levels. Other + levels are left unchanged. + + Returns + ------- + labels : MultiIndex + Resulting MultiIndex with modified levels. + """ + from pandas.core.indexes.api import MultiIndex + + if level is not None: + if isinstance(level, (str, int)): + sort_levels = [level] + else: + sort_levels = level + + sort_levels = [index._get_level_number(lev) for lev in sort_levels] + else: + sort_levels = range(index.nlevels) + + mapped = [ + ensure_key_mapped(index._get_level_values(level), key) + if level in sort_levels + else index._get_level_values(level) + for level in range(index.nlevels) + ] + + labels = MultiIndex.from_arrays(mapped) + + return labels + + +def ensure_key_mapped(values, key: Optional[Callable], levels=None): """ Applies a callable key function to the values function and checks that the resulting value has the same shape. Can be called on Index @@ -328,24 +380,29 @@ def ensure_key_mapped(values, key: Optional[Callable]): ---------- values : Series, DataFrame, Index subclass, or ndarray key : Optional[Callable], key to be called on the values array + levels : Optional[List], if values is a MultiIndex, list of levels to + apply the key to. """ if not key: return values.copy() - _class = type(values) - result = key(values.copy()) - if len(result) != len(values): - raise ValueError( - "User-provided `key` function much not change the shape of the array." 
- ) - - if not isinstance(result, _class): # recover from type error - try: - result = _class(result) - except TypeError: - raise TypeError("User-provided `key` function returned an invalid type.") - - return result + if isinstance(values, ABCMultiIndex): + return apply_key(values, key, level=levels) + else: + _class = type(values) + result = key(values.copy()) + if len(result) != len(values): + raise ValueError( + "User-provided `key` function much not change the shape of the array." + ) + + if not isinstance(result, _class): # recover from type error + try: + result = _class(result) + except TypeError: + raise TypeError("User-provided `key` function returned an invalid type.") + + return result class _KeyMapper: From 1058839b7446d18cad1c890a3c2aad1bb976a6a8 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 21:38:23 -0400 Subject: [PATCH 37/51] fixed tests --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/sorting.py | 40 ++++++++++--------- .../tests/indexing/multiindex/test_sorted.py | 4 +- 3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0a86672d741ac..c36eb8633a15d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -68,7 +68,7 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], 'b': [1, 2, 3, 4, 5, 6]}) - df.sort_values(by=['a', 'b'], key=lambda col: col.str.lower() \ + df.sort_values(by=['a', 'b'], key=lambda col: col.str.lower() if col.name == 'a' else -col) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 7f7517589b7cc..5a60cdb36e24d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -321,7 +321,7 @@ def nargsort( def apply_key(index, key, level=None): - """ + """ Returns a new MultiIndex in which key has been applied to all levels specified in level (or all levels if level is None). 
Used for key sorting for MultiIndex. @@ -346,28 +346,28 @@ def apply_key(index, key, level=None): labels : MultiIndex Resulting MultiIndex with modified levels. """ - from pandas.core.indexes.api import MultiIndex - - if level is not None: - if isinstance(level, (str, int)): - sort_levels = [level] - else: - sort_levels = level + from pandas.core.indexes.api import MultiIndex - sort_levels = [index._get_level_number(lev) for lev in sort_levels] + if level is not None: + if isinstance(level, (str, int)): + sort_levels = [level] else: - sort_levels = range(index.nlevels) + sort_levels = level + + sort_levels = [index._get_level_number(lev) for lev in sort_levels] + else: + sort_levels = range(index.nlevels) - mapped = [ - ensure_key_mapped(index._get_level_values(level), key) - if level in sort_levels - else index._get_level_values(level) - for level in range(index.nlevels) - ] + mapped = [ + ensure_key_mapped(index._get_level_values(level), key) + if level in sort_levels + else index._get_level_values(level) + for level in range(index.nlevels) + ] - labels = MultiIndex.from_arrays(mapped) + labels = MultiIndex.from_arrays(mapped) - return labels + return labels def ensure_key_mapped(values, key: Optional[Callable], levels=None): @@ -400,7 +400,9 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): try: result = _class(result) except TypeError: - raise TypeError("User-provided `key` function returned an invalid type.") + raise TypeError( + "User-provided `key` function returned an invalid type." 
+ ) return result diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index b8d6e199a4a0b..fdeb3ce95b0bb 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -65,8 +65,8 @@ def test_sort_values_key(self, multiindex_dataframe_random_data): ] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) - index = index.sort_values( - key=lambda x: x.map(lambda row: (row[0][2], row[1][2])) + index = index.sort_values( # sort by third letter + key=lambda x: x.map(lambda entry: entry[2]) ) result = DataFrame(range(8), index=index) From c87a527e02c60c5298a6a009f20b9c6784d95260 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 21:41:10 -0400 Subject: [PATCH 38/51] satisfied mypy --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5a60cdb36e24d..553b26bb8c611 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -356,7 +356,7 @@ def apply_key(index, key, level=None): sort_levels = [index._get_level_number(lev) for lev in sort_levels] else: - sort_levels = range(index.nlevels) + sort_levels = list(range(index.nlevels)) # satisfies mypy mapped = [ ensure_key_mapped(index._get_level_values(level), key) From e6026d62bbc4be7e99919cb537d93a6db81df903 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 22:03:05 -0400 Subject: [PATCH 39/51] fixed isort issues --- pandas/core/sorting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 553b26bb8c611..f2b6c48cad5aa 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -12,7 +12,6 @@ is_categorical_dtype, is_extension_array_dtype, ) - from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.dtypes.missing import isna From ab0b8874cb0f1d4cc0f43800061f0a499bbe96b2 Mon Sep 17 00:00:00 2001 From: 
Jacob Austin Date: Fri, 10 Apr 2020 22:36:00 -0400 Subject: [PATCH 40/51] fixed a doc issue --- doc/source/user_guide/basics.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 18b0084a3a01a..c54883312d507 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1851,7 +1851,7 @@ a Series, e.g. .. ipython:: python - df = pd.DataFrame({"a" : ['B', 'a', 'C'], "b" : [1, 2, 3]) + df = pd.DataFrame({"a" : ['B', 'a', 'C'], "b" : [1, 2, 3]}) df.sort_values(by='a') df.sort_values(by='a', key=lambda col : col.str.lower()) From 364cc5e6c44addf6b76bd74e48e55976807aa4f5 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Fri, 10 Apr 2020 23:01:29 -0400 Subject: [PATCH 41/51] wow linting is hard --- doc/source/user_guide/basics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index c54883312d507..aa7efeb7cdba6 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1851,9 +1851,9 @@ a Series, e.g. .. ipython:: python - df = pd.DataFrame({"a" : ['B', 'a', 'C'], "b" : [1, 2, 3]}) + df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]}) df.sort_values(by='a') - df.sort_values(by='a', key=lambda col : col.str.lower()) + df.sort_values(by='a', key=lambda col: col.str.lower()) The name or type of each column can be used to apply different functions to different columns. 
From 8db09d079db52d22e765ca10270973e3b52ba113 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 00:00:38 -0400 Subject: [PATCH 42/51] updated whatsnew --- doc/source/whatsnew/v1.1.0.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c36eb8633a15d..371d6b790fee2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -58,7 +58,7 @@ Note how this is sorted with capital letters first. If we apply the `ser.str.low .. ipython:: python - s.sort_values(key=lambda x: x.str.lower()) + s.sort_values(key=lambda x : x.str.lower) When applied to a `DataFrame`, they key is applied per-column to all columns or a subset if @@ -68,13 +68,11 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], 'b': [1, 2, 3, 4, 5, 6]}) - df.sort_values(by=['a', 'b'], key=lambda col: col.str.lower() - if col.name == 'a' else -col) + df.sort_values(by=['a'], key=lambda col: col.str.lower()) -For :meth:`DataFrame.sort_index` with `MultiIndex`, the key function is applied per level. For -more details, see examples and documentation in :meth:`DataFrame.sort_values`, :meth:`Series.sort_values`, -and :meth:`~DataFrame.sort_index`. +For more details, see examples and documentation in :meth:`DataFrame.sort_values`, +:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`. .. 
_whatsnew_110.timestamp_fold_support: From 1d0319c67aa71dd3ca0b624a752575f038ead64a Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 00:10:43 -0400 Subject: [PATCH 43/51] cleaned up sorting.py --- pandas/core/sorting.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index f2b6c48cad5aa..b2129a05cdfad 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -319,7 +319,7 @@ def nargsort( return indexer -def apply_key(index, key, level=None): +def ensure_key_mapped_multiindex(index, key, level=None): """ Returns a new MultiIndex in which key has been applied to all levels specified in level (or all levels if level @@ -386,22 +386,20 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return values.copy() if isinstance(values, ABCMultiIndex): - return apply_key(values, key, level=levels) - else: - _class = type(values) - result = key(values.copy()) - if len(result) != len(values): - raise ValueError( - "User-provided `key` function much not change the shape of the array." - ) - - if not isinstance(result, _class): # recover from type error - try: - result = _class(result) - except TypeError: - raise TypeError( - "User-provided `key` function returned an invalid type." - ) + return ensure_key_mapped_multiindex(values, key, level=levels) + + type_of_values = type(values) + result = key(values.copy()) + if len(result) != len(values): + raise ValueError( + "User-provided `key` function much not change the shape of the array." 
+ ) + + if not isinstance(result, type_of_values): # recover from type error + try: + result = type_of_values(result) + except TypeError: + raise TypeError("User-provided `key` function returned an invalid type.") return result From 1f60689c7865c3e428e9552b71f30b6057d2282e Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 00:16:30 -0400 Subject: [PATCH 44/51] fixed indentation --- pandas/core/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index b2129a05cdfad..04e990b36821d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -401,7 +401,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): except TypeError: raise TypeError("User-provided `key` function returned an invalid type.") - return result + return result class _KeyMapper: From 2957e60e9b39a0abfec40b14e90d005099302bdf Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 01:36:21 -0400 Subject: [PATCH 45/51] removed trailing whitespace --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0dbeabc347d20..83c326d698807 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -71,7 +71,7 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or df.sort_values(by=['a'], key=lambda col: col.str.lower()) -For more details, see examples and documentation in :meth:`DataFrame.sort_values`, +For more details, see examples and documentation in :meth:`DataFrame.sort_values`, :meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`. .. 
_whatsnew_110.timestamp_fold_support: From 7c6c2f095f4f08e9af59267623c164f91313f00a Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 02:05:57 -0400 Subject: [PATCH 46/51] linting --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 83c326d698807..825a917877829 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -58,7 +58,7 @@ Note how this is sorted with capital letters first. If we apply the `ser.str.low .. ipython:: python - s.sort_values(key=lambda x : x.str.lower) + s.sort_values(key=lambda x: x.str.lower) When applied to a `DataFrame`, they key is applied per-column to all columns or a subset if From ad745c4769661ded010dea179e45792ccbb0f0db Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 11:36:25 -0400 Subject: [PATCH 47/51] fixed small bug with datetimelike, updated docs --- doc/source/user_guide/basics.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 9 +++--- pandas/core/generic.py | 2 +- pandas/core/indexes/datetimelike.py | 7 ++-- pandas/core/sorting.py | 50 ++++++++++++++--------------- 5 files changed, 36 insertions(+), 34 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index aa7efeb7cdba6..db7a598c582a1 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1786,7 +1786,7 @@ used to sort a pandas object by its index levels. .. versionadded:: 1.1.0 Sorting by index also supports a ``key`` parameter that takes a callable -function to apply to the index being sorted. for `MultiIndex` objects, +function to apply to the index being sorted. For `MultiIndex` objects, the key is applied per-level to the levels specified by `level`. .. 
ipython:: python diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 825a917877829..b2dbe507515c6 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -44,8 +44,8 @@ Sorting with keys We've added a ``key`` argument to the DataFrame and Series sorting methods, including :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied -to the each column of a DataFrame before sorting is performed (:issue:`27237`). See -:ref:`sort_values with keys ` and :ref:`sort_index with keys +column-by-column to each column used for sorting, before sorting is performed (:issue:`27237`). +See :ref:`sort_values with keys ` and :ref:`sort_index with keys ` for more information. .. ipython:: python @@ -54,11 +54,12 @@ to the each column of a DataFrame before sorting is performed (:issue:`27237`). s.sort_values() -Note how this is sorted with capital letters first. If we apply the `ser.str.lower()` method, we get +Note how this is sorted with capital letters first. If we apply the :meth:`Series.str.lower` +method, we get .. 
ipython:: python - s.sort_values(key=lambda x: x.str.lower) + s.sort_values(key=lambda x: x.str.lower()) When applied to a `DataFrame`, they key is applied per-column to all columns or a subset if diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b1f6e0530ff15..e19408c8c2a00 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4217,7 +4217,7 @@ def sort_values( Sorting with a key function - >>> df.sort_values(by='col4', key=lambda col : col.str.lower()) + >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) col1 col2 col3 col4 0 A 2 0 a 1 A 1 1 B diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 881d505da7295..4b2e62828ed98 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -185,9 +185,10 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): # NB: using asi8 instead of _data matters in numpy 1.18 # because the treatment of NaT has been changed to put NaT last # instead of first. 
- sorted_values = np.sort(idx.asi8) + _as = np.argsort(idx.asi8) + sorted_values = self.asi8[_as] - freq = idx.freq + freq = self.freq if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 @@ -197,7 +198,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): if not ascending: sorted_values = sorted_values[::-1] - arr = type(idx._data)._simple_new( + arr = type(self._data)._simple_new( sorted_values, dtype=self.dtype, freq=freq ) return type(self)._simple_new(arr, name=self.name) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 04e990b36821d..18f10465d2b58 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -319,32 +319,32 @@ def nargsort( return indexer -def ensure_key_mapped_multiindex(index, key, level=None): +def ensure_key_mapped_multiindex(index, key: Callable, level=None): + """ + Returns a new MultiIndex in which key has been applied + to all levels specified in level (or all levels if level + is None). Used for key sorting for MultiIndex. + + Parameters + ---------- + index : MultiIndex + Index to which to apply the key function on the + specified levels. + key : Callable + Function that takes an Index and returns an Index of + the same shape. This key is applied to each level + separately. The name of the level can be used to + distinguish different levels for application. + level : list-like, int or str, default None + Level or list of levels to apply the key function to. + If None, key function is applied to all levels. Other + levels are left unchanged. + + Returns + ------- + labels : MultiIndex + Resulting MultiIndex with modified levels. """ - Returns a new MultiIndex in which key has been applied - to all levels specified in level (or all levels if level - is None). Used for key sorting for MultiIndex. - - Parameters - ---------- - index : MultiIndex - Index to which to apply the key function on the - specified levels. 
- key : Callable - Function that takes an Index and returns an Index of - the same shape. This key is applied to each level - separately. The name of the level can be used to - distinguish different levels for application. - level : list-like, int or str, default None - Level or list of levels to apply the key function to. - If None, key function is applied to all levels. Other - levels are left unchanged. - - Returns - ------- - labels : MultiIndex - Resulting MultiIndex with modified levels. - """ from pandas.core.indexes.api import MultiIndex if level is not None: From 3ad33581c707a20c1cab5db0bffae51084ecdf0a Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Mon, 13 Apr 2020 12:02:12 -0400 Subject: [PATCH 48/51] fixed trailing whitespace --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b2dbe507515c6..3e862a581bd67 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -44,7 +44,7 @@ Sorting with keys We've added a ``key`` argument to the DataFrame and Series sorting methods, including :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied -column-by-column to each column used for sorting, before sorting is performed (:issue:`27237`). +column-by-column to each column used for sorting, before sorting is performed (:issue:`27237`). See :ref:`sort_values with keys ` and :ref:`sort_index with keys ` for more information. @@ -54,7 +54,7 @@ See :ref:`sort_values with keys ` and :ref:`sort_index wi s.sort_values() -Note how this is sorted with capital letters first. If we apply the :meth:`Series.str.lower` +Note how this is sorted with capital letters first. If we apply the :meth:`Series.str.lower` method, we get .. 
ipython:: python From a5d5c6de53284b466b6e8465bfa98567b2582cfe Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sun, 26 Apr 2020 21:28:58 -0400 Subject: [PATCH 49/51] reverted and updated documentation --- doc/source/user_guide/basics.rst | 13 ++++ doc/source/whatsnew/v1.1.0.rst | 8 +++ pandas/core/indexes/datetimelike.py | 30 ++------- pandas/core/sorting.py | 24 +++++--- .../tests/frame/methods/test_sort_values.py | 24 +++++++- .../tests/series/methods/test_sort_index.py | 61 ++++++++++++++++++- .../tests/series/methods/test_sort_values.py | 45 -------------- 7 files changed, 127 insertions(+), 78 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index db7a598c582a1..17fe65b1efba9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1796,9 +1796,16 @@ the key is applied per-level to the levels specified by `level`. "b": [1, 2, 3], "c": [2, 3, 4] }).set_index(list("ab")) + s1 + +.. ipython:: python + s1.sort_index(level="a") s1.sort_index(level="a", key=lambda idx: idx.str.lower()) +For information on key sorting by value, see :ref:`value sorting +`. + .. _basics.sort_values: By values @@ -1841,6 +1848,9 @@ to apply to the values being sorted. .. ipython:: python s1 = pd.Series(['B', 'a', 'C']) + +.. ipython:: python + s1.sort_values() s1.sort_values(key=lambda x: x.str.lower()) @@ -1852,6 +1862,9 @@ a Series, e.g. .. ipython:: python df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]}) + +.. ipython:: python + df.sort_values(by='a') df.sort_values(by='a', key=lambda col: col.str.lower()) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9bef308ee4349..4e45c05825c16 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -51,6 +51,10 @@ See :ref:`sort_values with keys ` and :ref:`sort_index wi .. ipython:: python s = pd.Series(['C', 'a', 'B']) + s + +.. 
ipython:: python + s.sort_values() @@ -69,6 +73,10 @@ When applied to a `DataFrame`, they key is applied per-column to all columns or df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'], 'b': [1, 2, 3, 4, 5, 6]}) + df + +.. ipython:: python + df.sort_values(by=['a'], key=lambda col: col.str.lower()) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 4b2e62828ed98..b7cd8c2459d29 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -175,33 +175,15 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): """ idx = ensure_key_mapped(self, key) + _as = idx.argsort() + if not ascending: + _as = _as[::-1] + sorted_index = self.take(_as) + if return_indexer: - _as = idx.argsort() - if not ascending: - _as = _as[::-1] - sorted_index = self.take(_as) return sorted_index, _as else: - # NB: using asi8 instead of _data matters in numpy 1.18 - # because the treatment of NaT has been changed to put NaT last - # instead of first. - _as = np.argsort(idx.asi8) - sorted_values = self.asi8[_as] - - freq = self.freq - if freq is not None and not is_period_dtype(self): - if freq.n > 0 and not ascending: - freq = freq * -1 - elif freq.n < 0 and ascending: - freq = freq * -1 - - if not ascending: - sorted_values = sorted_values[::-1] - - arr = type(self._data)._simple_new( - sorted_values, dtype=self.dtype, freq=freq - ) - return type(self)._simple_new(arr, name=self.name) + sorted_index @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 18f10465d2b58..a254ca9b59c67 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -382,28 +382,38 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): levels : Optional[List], if values is a MultiIndex, list of levels to apply the key to. 
""" + from pandas.core.indexes.api import Index + if not key: return values.copy() if isinstance(values, ABCMultiIndex): return ensure_key_mapped_multiindex(values, key, level=levels) - type_of_values = type(values) result = key(values.copy()) if len(result) != len(values): raise ValueError( - "User-provided `key` function much not change the shape of the array." + "User-provided `key` function must not change the shape of the array." ) - if not isinstance(result, type_of_values): # recover from type error - try: - result = type_of_values(result) - except TypeError: - raise TypeError("User-provided `key` function returned an invalid type.") + try: + if isinstance(values, Index): # allow a new Index class + result = Index(result) + else: + type_of_values = type(values) + result = type_of_values(result) # try to recover otherwise + except TypeError: + raise TypeError( + "User-provided `key` function returned an invalid type {} \ + which could not be converted to {}.".format( + type(result), type(values) + ) + ) return result + class _KeyMapper: """ Map compressed group id -> key tuple. 
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index f9a3666d4a1d0..1275da01eace9 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -630,7 +630,7 @@ def key(col): expected = df.iloc[[1, 3, 4, 0, 2, 5]] tm.assert_frame_equal(result, expected) - def test_sort_values_key_nan(self): + def test_sort_values_key_string(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) result = df.sort_values(1) @@ -656,3 +656,25 @@ def test_changes_length_raises(self): df = pd.DataFrame({"A": [1, 2, 3]}) with pytest.raises(ValueError, match="change the shape"): df.sort_values("A", key=lambda x: x[:1]) + + def test_sort_values_key_axes(self): + df = DataFrame({0: ["Hello", "goodbye"], 1: [0, 1]}) + + result = df.sort_values(0, key=lambda col: col.str.lower()) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(1, key=lambda col: -col) + expected = df[::-1] + tm.assert_frame_equal(result, expected) + + def test_sort_values_key_dict_axis(self): + df = DataFrame({0: ["Hello", 0], 1: ["goodbye", 1]}) + + result = df.sort_values(0, key=lambda col: col.str.lower(), axis=1) + expected = df.loc[:, ::-1] + tm.assert_frame_equal(result, expected) + + result = df.sort_values(1, key=lambda col: -col, axis=1) + expected = df.loc[:, ::-1] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index e11218f239717..29aff591ec9f2 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import IntervalIndex, MultiIndex, Series +from pandas import DatetimeIndex, IntervalIndex, MultiIndex, Series import pandas._testing as tm @@ -223,7 +223,66 @@ def test_sort_index_key_int(self): result = 
series.sort_index(key=lambda x: 2 * x) tm.assert_series_equal(result, series) + def test_sort_index_kind_key(self, sort_by_key): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(kind="mergesort", key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="quicksort", key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="heapsort", key=sort_by_key) + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_kind_neg_key(self): + # GH #14444 & #13589: Add support for sort algo choosing + series = Series(index=[3, 2, 1, 4, 3], dtype=object) + expected_series = Series(index=[4, 3, 3, 2, 1], dtype=object) + + index_sorted_series = series.sort_index(kind="mergesort", key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="quicksort", key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + index_sorted_series = series.sort_index(kind="heapsort", key=lambda x: -x) + tm.assert_series_equal(expected_series, index_sorted_series) + + def test_sort_index_na_position_key(self, sort_by_key): + series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) + expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) + + index_sorted_series = series.sort_index(na_position="first", key=sort_by_key) + tm.assert_series_equal(expected_series_first, index_sorted_series) + + expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) + + index_sorted_series = series.sort_index(na_position="last", key=sort_by_key) + tm.assert_series_equal(expected_series_last, index_sorted_series) + def 
test_changes_length_raises(self): s = Series([1, 2, 3]) with pytest.raises(ValueError, match="change the shape"): s.sort_index(key=lambda x: x[:1]) + + def test_sort_values_key_type(self): + s = Series([1, 2, 3], DatetimeIndex(["2008-10-24", "2008-11-23", "2007-12-22"])) + + result = s.sort_index(key=lambda x: x.month) + expected = s.iloc[[0, 1, 2]] + tm.assert_series_equal(result, expected) + + result = s.sort_index(key=lambda x: x.day) + expected = s.iloc[[2, 1, 0]] + tm.assert_series_equal(result, expected) + + result = s.sort_index(key=lambda x: x.year) + expected = s.iloc[[2, 0, 1]] + tm.assert_series_equal(result, expected) + + result = s.sort_index(key=lambda x: x.month_name()) + expected = s.iloc[[2, 1, 0]] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index c8f20a1c1eefc..b32c59b4daa0d 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -209,48 +209,3 @@ def test_sort_values_key_nan(self): result = series.sort_values(0, key=lambda x: -x, ascending=False) expected = series.iloc[[0, 4, 3, 1, 2, 5]] tm.assert_series_equal(result, expected) - - def test_sort_index_kind_key(self, sort_by_key): - # GH #14444 & #13589: Add support for sort algo choosing - series = Series(index=[3, 2, 1, 4, 3], dtype=object) - expected_series = Series(index=[1, 2, 3, 3, 4], dtype=object) - - index_sorted_series = series.sort_index(kind="mergesort", key=sort_by_key) - tm.assert_series_equal(expected_series, index_sorted_series) - - index_sorted_series = series.sort_index(kind="quicksort", key=sort_by_key) - tm.assert_series_equal(expected_series, index_sorted_series) - - index_sorted_series = series.sort_index(kind="heapsort", key=sort_by_key) - tm.assert_series_equal(expected_series, index_sorted_series) - - def test_sort_index_kind_neg_key(self): - # GH #14444 & #13589: Add support for sort algo choosing - 
series = Series(index=[3, 2, 1, 4, 3], dtype=object) - expected_series = Series(index=[4, 3, 3, 2, 1], dtype=object) - - index_sorted_series = series.sort_index(kind="mergesort", key=lambda x: -x) - tm.assert_series_equal(expected_series, index_sorted_series) - - index_sorted_series = series.sort_index(kind="quicksort", key=lambda x: -x) - tm.assert_series_equal(expected_series, index_sorted_series) - - index_sorted_series = series.sort_index(kind="heapsort", key=lambda x: -x) - tm.assert_series_equal(expected_series, index_sorted_series) - - def test_sort_index_na_position_key(self, sort_by_key): - series = Series(index=[3, 2, 1, 4, 3, np.nan], dtype=object) - expected_series_first = Series(index=[np.nan, 1, 2, 3, 3, 4], dtype=object) - - index_sorted_series = series.sort_index(na_position="first", key=sort_by_key) - tm.assert_series_equal(expected_series_first, index_sorted_series) - - expected_series_last = Series(index=[1, 2, 3, 3, 4, np.nan], dtype=object) - - index_sorted_series = series.sort_index(na_position="last", key=sort_by_key) - tm.assert_series_equal(expected_series_last, index_sorted_series) - - def test_changes_length_raises(self): - s = Series([1, 2, 3]) - with pytest.raises(ValueError, match="change the shape"): - s.sort_values(key=lambda x: x[:1]) From 4250e31bc24df132af453e6490f7e47273ba6ae5 Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sun, 26 Apr 2020 21:49:19 -0400 Subject: [PATCH 50/51] fixed linting issue and added comments --- doc/source/user_guide/basics.rst | 2 +- pandas/core/sorting.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 17fe65b1efba9..0621cd20b8df9 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1803,7 +1803,7 @@ the key is applied per-level to the levels specified by `level`. 
s1.sort_index(level="a") s1.sort_index(level="a", key=lambda idx: idx.str.lower()) -For information on key sorting by value, see :ref:`value sorting +For information on key sorting by value, see :ref:`value sorting `. .. _basics.sort_values: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index a254ca9b59c67..69d55978724af 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -383,7 +383,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): apply the key to. """ from pandas.core.indexes.api import Index - + if not key: return values.copy() @@ -397,11 +397,13 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): ) try: - if isinstance(values, Index): # allow a new Index class + if isinstance( + values, Index + ): # convert to a new Index subclass, not necessarily the same result = Index(result) else: type_of_values = type(values) - result = type_of_values(result) # try to recover otherwise + result = type_of_values(result) # try to revert to original type otherwise except TypeError: raise TypeError( "User-provided `key` function returned an invalid type {} \ @@ -413,7 +415,6 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result - class _KeyMapper: """ Map compressed group id -> key tuple. 
From 4d5ba530d4cffea873ba501cbda3a1d50ea172af Mon Sep 17 00:00:00 2001 From: Jacob Austin Date: Sun, 26 Apr 2020 22:11:26 -0400 Subject: [PATCH 51/51] fixed small issue in tests --- pandas/core/indexes/datetimelike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index fa94e86c5af5f..8295ca13c33b1 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -198,7 +198,7 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): if return_indexer: return sorted_index, _as else: - sorted_index + return sorted_index @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs):