pandas-dev · jreback · Apr 27, 2020 · Jul 4, 2019 · Jan 28, 2020 · Jan 28, 2020
diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst
@@ -1781,6 +1781,24 @@ used to sort a pandas object by its index levels.
    # Series
    unsorted_df['three'].sort_index()
 
+.. _basics.sort_index_key:
+
+.. versionadded:: 1.1.0
+
+Sorting by index also supports a ``key`` parameter that takes a callable
+function to apply to the index being sorted. for `MultiIndex` objects,
+the key is applied per-level to the levels specified by `level`.
+
+.. ipython:: python
+
+   s1 = pd.DataFrame({
+       "a": ['B', 'a', 'C'],
+       "b": [1, 2, 3],
+       "c": [2, 3, 4]
+   }).set_index(list("ab"))
+   s1.sort_index(level="a")
+   s1.sort_index(level="a", key=lambda idx: idx.str.lower())
+
 .. _basics.sort_values:
 
 By values
@@ -1813,6 +1831,33 @@ argument:
    s.sort_values()
    s.sort_values(na_position='first')
 
+.. _basics.sort_value_key:
+
+.. versionadded:: 1.1.0
+
+Sorting also supports a ``key`` parameter that takes a callable function
+to apply to the values being sorted.
+
+.. ipython:: python
+
+   s1 = pd.Series(['B', 'a', 'C'])
+   s1.sort_values()
+   s1.sort_values(key=lambda x: x.str.lower())
+
+`key` will be given the :class:`Series` of values and should return a ``Series``
+or array of the same shape with the transformed values. For `DataFrame` objects,
+the key is applied per column, so the key should still expect a Series and return
+a Series, e.g.
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]})
+   df.sort_values(by='a')
+   df.sort_values(by='a', key=lambda col: col.str.lower())
+
+The name or type of each column can be used to apply different functions to
+different columns.
+
 .. _basics.sort_indexes_and_values:
 
 By indexes and values

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -36,6 +36,44 @@ For example:
    ser["2014"]
    ser.loc["May 2015"]
 
+.. _whatsnew_110.key_sorting:
+
+Sorting with keys
+^^^^^^^^^^^^^^^^^
+
+We've added a ``key`` argument to the DataFrame and Series sorting methods, including
+:meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`,
+and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied
+to the each column of a DataFrame before sorting is performed (:issue:`27237`). See
-to the each column of a DataFrame before sorting is performed (:issue:`27237`). See
+to each column of a DataFrame before sorting is performed (:issue:`27237`). See
-to the each column of a DataFrame before sorting is performed (:issue:`27237`). See
+to each column of a DataFrame before sorting is performed (:issue:`27237`). See
+:ref:`sort_values with keys <basics.sort_value_key>` and :ref:`sort_index with keys
+<basics.sort_index_key>` for more information.
+
+.. ipython:: python
+
+   s = pd.Series(['C', 'a', 'B'])
+   s.sort_values()
+
+
+Note how this is sorted with capital letters first. If we apply the `ser.str.lower()` method, we get
+
+.. ipython:: python
+
+   s.sort_values(key=lambda x: x.str.lower)
+
+
+When applied to a `DataFrame`, they key is applied per-column to all columns or a subset if
+`by` is specified, e.g.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': ['C', 'C', 'a', 'a', 'B', 'B'],
+                      'b': [1, 2, 3, 4, 5, 6]})
+   df.sort_values(by=['a'], key=lambda col: col.str.lower())
+
+
+For more details, see examples and documentation in :meth:`DataFrame.sort_values`,
+:meth:`Series.sort_values`, and :meth:`~DataFrame.sort_index`.
+
 .. _whatsnew_110.timestamp_fold_support:
 
 Fold argument support in Timestamp constructor

diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -75,7 +75,13 @@
 
 # to maintain type information across generic functions and parametrization
 T = TypeVar("T")
+
 # used in decorators to preserve the signature of the function it decorates
 # see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
 FuncType = Callable[..., Any]
 F = TypeVar("F", bound=FuncType)
+
+# types of vectorized key functions for DataFrame::sort_values and
+# DataFrame::sort_index, among others
+ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]]
+IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]]
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1221,3 +1221,12 @@ def tick_classes(request):
     Fixture for Tick based datetime offsets available for a time series.
     """
     return request.param
+
+
+@pytest.fixture(params=[None, lambda x: x])
+def sort_by_key(request):
+    """
+    Simple fixture for testing keys in sorting methods.
+    Tests None (no key) and the identity key.
+    """
+    return request.param
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1532,7 +1532,9 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs):
         """
         return super().argsort(ascending=ascending, kind=kind, **kwargs)
 
-    def sort_values(self, inplace=False, ascending=True, na_position="last"):
+    def sort_values(
+        self, inplace: bool = False, ascending: bool = True, na_position: str = "last",
+    ):
         """
         Sort the Categorical by category value returning a new
         Categorical by default.

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -47,9 +47,11 @@
     Axis,
     Dtype,
     FilePathOrBuffer,
+    IndexKeyFunc,
     Label,
     Level,
     Renamer,
+    ValueKeyFunc,
 )
 from pandas.compat import PY37
 from pandas.compat._optional import import_optional_dependency
@@ -139,6 +141,7 @@
 )
 from pandas.core.ops.missing import dispatch_fill_zeros
 from pandas.core.series import Series
+from pandas.core.sorting import ensure_key_mapped
 
 from pandas.io.common import get_filepath_or_buffer
 from pandas.io.formats import console, format as fmt
@@ -4935,10 +4938,10 @@ def f(vals):
 
     # ----------------------------------------------------------------------
     # Sorting
-
+    # TODO: Just move the sort_values doc here.
     @Substitution(**_shared_doc_kwargs)
     @Appender(NDFrame.sort_values.__doc__)
-    def sort_values(
+    def sort_values(  # type: ignore[override] # NOQA # issue 27237
         self,
         by,
         axis=0,
@@ -4947,6 +4950,7 @@ def sort_values(
         kind="quicksort",
         na_position="last",
         ignore_index=False,
+        key: ValueKeyFunc = None,
     ):
         inplace = validate_bool_kwarg(inplace, "inplace")
         axis = self._get_axis_number(axis)
@@ -4961,19 +4965,30 @@ def sort_values(
             from pandas.core.sorting import lexsort_indexer
 
             keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
-            indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
+
+            # need to rewrap columns in Series to apply key function
+            if key is not None:
+                keys = [Series(k, name=name) for (k, name) in zip(keys, by)]
+
+            indexer = lexsort_indexer(
+                keys, orders=ascending, na_position=na_position, key=key
+            )
             indexer = ensure_platform_int(indexer)
         else:
             from pandas.core.sorting import nargsort
 
             by = by[0]
             k = self._get_label_or_level_values(by, axis=axis)
 
+            # need to rewrap column in Series to apply key function
+            if key is not None:
+                k = Series(k, name=by)
+
             if isinstance(ascending, (tuple, list)):
                 ascending = ascending[0]
 
             indexer = nargsort(
-                k, kind=kind, ascending=ascending, na_position=na_position
+                k, kind=kind, ascending=ascending, na_position=na_position, key=key
             )
 
         new_data = self._mgr.take(
@@ -4999,6 +5014,7 @@ def sort_index(
         na_position: str = "last",
         sort_remaining: bool = True,
         ignore_index: bool = False,
+        key: IndexKeyFunc = None,
     ):
         """
         Sort object by labels (along an axis).
@@ -5034,6 +5050,16 @@ def sort_index(
 
             .. versionadded:: 1.0.0
 
+        key : callable, optional
+            If not None, apply the key function to the index values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect an
+            ``Index`` and return an ``Index`` of the same shape. For MultiIndex
+            inputs, the key is applied *per level*.
+
+            .. versionadded:: 1.1.0
+
         Returns
         -------
         DataFrame
@@ -5067,6 +5093,17 @@ def sort_index(
         100  1
         29   2
         1    4
+
+        A key function can be specified which is applied to the index before
+        sorting. For a ``MultiIndex`` this is applied to each level separately.
+
+        >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
+        >>> df.sort_index(key=lambda x: x.str.lower())
+           a
+        A  1
+        b  2
+        C  3
+        d  4
         """
         # TODO: this can be combined with Series.sort_index impl as
         # almost identical
@@ -5075,12 +5112,12 @@ def sort_index(
 
         axis = self._get_axis_number(axis)
         labels = self._get_axis(axis)
+        labels = ensure_key_mapped(labels, key, levels=level)
 
         # make sure that the axis is lexsorted to start
         # if not we need to reconstruct to get the correct indexer
         labels = labels._sort_levels_monotonic()
         if level is not None:
-
             new_axis, indexer = labels.sortlevel(
                 level, ascending=ascending, sort_remaining=sort_remaining
             )