From 6e05de506d027551875c6b1d3a0a98bc16821d71 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 15 Oct 2016 12:01:00 -0400 Subject: [PATCH 01/13] Support sorting frames by a combo of columns and index levels GH 14353 --- doc/source/basics.rst | 41 ++++++- pandas/core/frame.py | 48 +++++--- pandas/core/generic.py | 3 +- pandas/tests/frame/test_sorting.py | 191 ++++++++++++++++++++++++++++- 4 files changed, 264 insertions(+), 19 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index fe20a7eb2b786..d9f0ea1ffc0db 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1726,8 +1726,9 @@ Sorting The sorting API is substantially changed in 0.17.0, see :ref:`here ` for these changes. In particular, all sorting methods now return a new object by default, and **DO NOT** operate in-place (except by passing ``inplace=True``). -There are two obvious kinds of sorting that you may be interested in: sorting -by label and sorting by actual values. +There are three obvious kinds of sorting that you may be interested in: sorting +by labels (indexes), sorting by values (columns), and sorting by a +combination of both. By Index ~~~~~~~~ @@ -1737,8 +1738,13 @@ labels (indexes) are the ``Series.sort_index()`` and the ``DataFrame.sort_index( .. ipython:: python + df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']), + 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), + 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], columns=['three', 'two', 'one']) + unsorted_df # DataFrame unsorted_df.sort_index() @@ -1751,7 +1757,8 @@ labels (indexes) are the ``Series.sort_index()`` and the ``DataFrame.sort_index( By Values ~~~~~~~~~ -The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` are the entry points for **value** sorting (that is the values in a column or row). 
+The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` methods are +the entry points for **value** sorting (that is the values in a column or row). :meth:`DataFrame.sort_values` can accept an optional ``by`` argument for ``axis=0`` which will use an arbitrary vector or a column name of the DataFrame to determine the sort order: @@ -1776,6 +1783,34 @@ argument: s.sort_values() s.sort_values(na_position='first') +By Indexes and Values +~~~~~~~~~~~~~~~~~~~~~ +.. versionadded:: 0.21 +Strings passed as the ``by`` argument to :meth:`DataFrame.sort_values` may +refer to either columns or index levels. + +.. ipython:: python + + # Build MultiIndex + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), + ('b', 2), ('b', 1), ('b', 1)]) + idx.names = ['first', 'second'] + + # Build DataFrame + df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, + index=idx) + df_multi + + # Sort by 'second' (index) and 'A' (column) + df_multi.sort_values(by=['second', 'A']) + +.. note:: + + .. versionadded:: 0.21 + + If a string matches both a column name and an index level name then a + warning is issued and the column takes precedence. This will result in an + ambiguity error in a future version. .. 
_basics.searchsorted: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b5b3df64d24c0..e8f34d531eecd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3436,6 +3436,36 @@ def f(vals): # ---------------------------------------------------------------------- # Sorting + def _get_column_or_level_values(self, key, axis=1, + op_description='retrieve'): + if (is_integer(key) or + (axis == 1 and key in self) or + (axis == 0 and key in self.index)): + + if axis == 1 and key in self.index.names: + warnings.warn( + ("'%s' is both a column name and an index level.\n" + "Defaulting to column but " + "this will raise an ambiguity error in a " + "future version") % key, + FutureWarning, stacklevel=2) + + k = self.xs(key, axis=axis)._values + if k.ndim == 2: + + # try to be helpful + if isinstance(self.columns, MultiIndex): + raise ValueError('Cannot %s column "%s" in a multi-index. ' + 'All levels must be provided explicitly' + % (op_description, str(key))) + + raise ValueError('Cannot %s duplicate column "%s"' % + (op_description, str(key))) + elif key in self.index.names: + k = self.index.get_level_values(key).values + else: + raise KeyError(key) + return k @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs) def sort_values(self, by, axis=0, ascending=True, inplace=False, @@ -3459,10 +3489,8 @@ def trans(v): keys = [] for x in by: - k = self.xs(x, axis=other_axis).values - if k.ndim == 2: - raise ValueError('Cannot sort by duplicate column %s' % - str(x)) + k = self._get_column_or_level_values(x, axis=other_axis, + op_description="sort by") keys.append(trans(k)) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) @@ -3471,17 +3499,9 @@ def trans(v): from pandas.core.sorting import nargsort by = by[0] - k = self.xs(by, axis=other_axis).values - if k.ndim == 2: - - # try to be helpful - if isinstance(self.columns, MultiIndex): - raise ValueError('Cannot sort by column %s in a ' - 'multi-index you need to explicitly ' - 
'provide all the levels' % str(by)) + k = self._get_column_or_level_values(by, axis=other_axis, + op_description="sort by") - raise ValueError('Cannot sort by duplicate column %s' % - str(by)) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8366c804e3e7..f29ffd6d10004 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -67,7 +67,8 @@ args_transpose='axes to permute (int or label for object)', optional_by=""" by : str or list of str - Name or list of names which refer to the axis items.""") + Name or list of names which refer to the axis items or index + levels.""") def _single_replace(self, to_replace, method, inplace, limit): diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 891c94b59074a..492884e60f145 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -9,7 +9,7 @@ import pandas as pd from pandas.compat import lrange from pandas import (DataFrame, Series, MultiIndex, Timestamp, - date_range, NaT, IntervalIndex) + Index, date_range, NaT, IntervalIndex) from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -85,6 +85,13 @@ def test_sort_values(self): expected = frame.reindex(columns=['C', 'B', 'A']) assert_frame_equal(sorted_df, expected) + # by row (axis=1) with string index + frame = DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, + index=['row1', 'row2']) + sorted_df = frame.sort_values(by='row2', axis=1) + expected = frame.reindex(columns=['B', 'A', 'C']) + assert_frame_equal(sorted_df, expected) + msg = r'Length of ascending \(5\) != length of by \(2\)' with tm.assert_raises_regex(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) @@ -552,3 +559,185 @@ def test_sort_index_intervalindex(self): closed='right') result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) + + def 
test_sort_index_and_column(self): + # Build MultiIndex + idx = MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), + ('b', 2), ('b', 1), ('b', 1)]) + idx.names = ['outer', 'inner'] + + # Build DataFrames + df_multi = DataFrame({'A': np.arange(6, 0, -1), + 'B': ['one', 'one', 'two', + 'two', 'one', 'one']}, + index=idx) + df_single = df_multi.reset_index('outer') + df_none = df_multi.reset_index() + + # Sort by single index + # - On single index frame + expected = df_none.sort_values('inner').set_index('inner') + result = df_single.sort_values('inner') + assert_frame_equal(result, expected) + # - Descending + expected = df_none.sort_values('inner', + ascending=False).set_index('inner') + result = df_single.sort_values('inner', ascending=False) + assert_frame_equal(result, expected) + + # - On multi index frame + expected = df_none.sort_values('inner', + ascending=False + ).set_index(['outer', 'inner']) + + result = df_multi.sort_values('inner', ascending=False) + assert_frame_equal(result, expected) + # - Descending + expected = df_none.sort_values('inner', + ascending=False + ).set_index(['outer', 'inner']) + result = df_multi.sort_values('inner', ascending=False) + assert_frame_equal(result, expected) + + # Sort by multiple indexes + # - Ascending + expected = df_none.sort_values(['inner', 'outer'] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer']) + assert_frame_equal(result, expected) + + # - Descending + expected = df_none.sort_values(['inner', 'outer'], + ascending=False + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer'], + ascending=False) + assert_frame_equal(result, expected) + + # - Mixed + expected = df_none.sort_values(['inner', 'outer'], + ascending=[False, True] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer'], + ascending=[False, True]) + assert_frame_equal(result, expected) + + # Sort by single index and single column + # - Ascending + expected 
= df_none.sort_values(['outer', 'B'] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['outer', 'B']) + assert_frame_equal(result, expected) + + # - Descending + expected = df_none.sort_values(['outer', 'B'], + ascending=False + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['outer', 'B'], ascending=False) + assert_frame_equal(result, expected) + + # - Mixed + expected = df_none.sort_values(['outer', 'B'], + ascending=[False, True] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['outer', 'B'], + ascending=[False, True]) + assert_frame_equal(result, expected) + + # Sort by single column and single index + # - Ascending + expected = df_none.sort_values(['B', 'outer'] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['B', 'outer']) + assert_frame_equal(result, expected) + + # - Descending + expected = df_none.sort_values(['B', 'outer'], + ascending=False + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['B', 'outer'], ascending=False) + assert_frame_equal(result, expected) + + # - Mixed + expected = df_none.sort_values(['B', 'outer'], + ascending=[False, True] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['B', 'outer'], + ascending=[False, True]) + assert_frame_equal(result, expected) + + # Sort by multiple indexes and a single column + # - Ascending + expected = df_none.sort_values(['inner', 'outer', 'A'] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer', 'A']) + assert_frame_equal(result, expected) + + # - Descending + expected = df_none.sort_values(['inner', 'outer', 'A'], + ascending=False + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer', 'A'], + ascending=False) + assert_frame_equal(result, expected) + + # - Mixed + expected = df_none.sort_values(['inner', 'outer', 'A'], + ascending=[True, True, False] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer', 
'A'], + ascending=[True, True, False]) + assert_frame_equal(result, expected) + + # Sort by multiple indexes and multiple columns + # - Ascending + expected = df_none.sort_values(['inner', 'outer', 'B', 'A'] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer', 'B', 'A']) + assert_frame_equal(result, expected) + + # - Descending + expected = df_none.sort_values(['inner', 'outer', 'B', 'A'], + ascending=False + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer', 'B', 'A'], + ascending=False) + assert_frame_equal(result, expected) + + # - Mixed + expected = df_none.sort_values(['inner', 'outer', 'B', 'A'], + ascending=[False, True, True, False] + ).set_index(['outer', 'inner']) + result = df_multi.sort_values(['inner', 'outer', 'B', 'A'], + ascending=[False, True, True, False]) + assert_frame_equal(result, expected) + + def test_sort_values_column_index_level_precedence(self): + # GH 14355, when a string passed as the `by` parameter + # matches a column and an index level the column takes + # precedence + + # Construct DataFrame with index and column named 'idx' + idx = Index(np.arange(1, 7), name='idx') + df = DataFrame({'A': np.arange(11, 17), + 'idx': np.arange(6, 0, -1)}, + index=idx) + + # Sorting by 'idx' should sort by the idx column and raise a + # FutureWarning + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df.sort_values(by='idx') + + # This should be equivalent to sorting by the 'idx' index level in + # descending order + expected = df.sort_index(level='idx', ascending=False) + assert_frame_equal(result, expected) + + # Perform same test with MultiIndex + df_multi = df.set_index('A', append=True) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_multi.sort_values(by='idx') + + expected = df_multi.sort_index(level='idx', ascending=False) + assert_frame_equal(result, expected) From 
10b4e24fba953c1ab6fd65f5fe34d72739271cef Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Tue, 29 Aug 2017 08:31:59 -0400 Subject: [PATCH 02/13] Documentation cleanup for review --- doc/source/basics.rst | 7 ++----- pandas/core/generic.py | 3 +-- pandas/tests/frame/test_sorting.py | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index d9f0ea1ffc0db..edd283adb9ddb 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1726,9 +1726,8 @@ Sorting The sorting API is substantially changed in 0.17.0, see :ref:`here ` for these changes. In particular, all sorting methods now return a new object by default, and **DO NOT** operate in-place (except by passing ``inplace=True``). -There are three obvious kinds of sorting that you may be interested in: sorting -by labels (indexes), sorting by values (columns), and sorting by a -combination of both. +Pandas supports three kinds of sorting: sorting by labels (indexes), sorting +by values (columns), and sorting by a combination of both. By Index ~~~~~~~~ @@ -1806,8 +1805,6 @@ refer to either columns or index levels. .. note:: - .. versionadded:: 0.21 - If a string matches both a column name and an index level name then a warning is issued and the column takes precedence. This will result in an ambiguity error in a future version. 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f29ffd6d10004..82cfcc08c7a66 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -67,8 +67,7 @@ args_transpose='axes to permute (int or label for object)', optional_by=""" by : str or list of str - Name or list of names which refer to the axis items or index - levels.""") + Name or list of names matching axis items or index levels.""") def _single_replace(self, to_replace, method, inplace, limit): diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 492884e60f145..4c6af769611a6 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -713,7 +713,7 @@ def test_sort_index_and_column(self): assert_frame_equal(result, expected) def test_sort_values_column_index_level_precedence(self): - # GH 14355, when a string passed as the `by` parameter + # GH 14353, when a string passed as the `by` parameter # matches a column and an index level the column takes # precedence From 57122697a9ab70ed281b9a252f63db446bd1a722 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Tue, 5 Sep 2017 09:28:32 -0400 Subject: [PATCH 03/13] Added whatsnew entry --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index fcadd26156b1d..a48c237f21609 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -128,6 +128,7 @@ Other Enhancements - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`) - `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`). - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`). 
+- :func:`DataFrame.sort_values` now accepts index level names as well as column names as the ``by`` parameter (:issue:`14353`) - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`) From 42d5ec3d8bb22df7ad6ad7d310b77bd5ac51b89d Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 2 Dec 2017 10:10:36 -0500 Subject: [PATCH 04/13] Moved whatsnew to 0.22.0 --- doc/source/whatsnew/v0.22.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 1a08a1353a605..9b2480888a42c 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -75,6 +75,7 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) +- :func:`DataFrame.sort_values` now accepts index level names as well as column names as the ``by`` parameter (:issue:`14353`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) .. _whatsnew_0220.api_breaking: From 7c7edfe9889f314fe5eee41b6f44e3832ae034c0 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Sat, 2 Dec 2017 10:30:55 -0500 Subject: [PATCH 05/13] Convert to new _get_label_or_level_values utility method Update some documentation vocabulary to label/level distinction --- doc/source/basics.rst | 10 ++-------- pandas/core/frame.py | 36 ++---------------------------------- pandas/core/generic.py | 2 +- 3 files changed, 5 insertions(+), 43 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 4fb2b0a6ae7a0..3c89c2d1ddcb1 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1738,11 +1738,6 @@ description. Sorting ------- -.. warning:: - - The sorting API is substantially changed in 0.17.0, see :ref:`here ` for these changes. - In particular, all sorting methods now return a new object by default, and **DO NOT** operate in-place (except by passing ``inplace=True``). - Pandas supports three kinds of sorting: sorting by labels (indexes), sorting by values (columns), and sorting by a combination of both. @@ -1773,8 +1768,7 @@ labels (indexes) are the ``Series.sort_index()`` and the ``DataFrame.sort_index( By Values ~~~~~~~~~ -The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` methods are -the entry points for **value** sorting (that is the values in a column or row). +The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` are the entry points for **value** sorting (that is the values in a column or row). :meth:`DataFrame.sort_values` can accept an optional ``by`` argument for ``axis=0`` which will use an arbitrary vector or a column name of the DataFrame to determine the sort order: @@ -1801,7 +1795,7 @@ argument: By Indexes and Values ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.21 +.. versionadded:: 0.22 Strings passed as the ``by`` argument to :meth:`DataFrame.sort_values` may refer to either columns or index levels. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f1efb8eef3582..21c87fccbb4ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3606,36 +3606,6 @@ def f(vals): # ---------------------------------------------------------------------- # Sorting - def _get_column_or_level_values(self, key, axis=1, - op_description='retrieve'): - if (is_integer(key) or - (axis == 1 and key in self) or - (axis == 0 and key in self.index)): - - if axis == 1 and key in self.index.names: - warnings.warn( - ("'%s' is both a column name and an index level.\n" - "Defaulting to column but " - "this will raise an ambiguity error in a " - "future version") % key, - FutureWarning, stacklevel=2) - - k = self.xs(key, axis=axis)._values - if k.ndim == 2: - - # try to be helpful - if isinstance(self.columns, MultiIndex): - raise ValueError('Cannot %s column "%s" in a multi-index. ' - 'All levels must be provided explicitly' - % (op_description, str(key))) - - raise ValueError('Cannot %s duplicate column "%s"' % - (op_description, str(key))) - elif key in self.index.names: - k = self.index.get_level_values(key).values - else: - raise KeyError(key) - return k @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs) def sort_values(self, by, axis=0, ascending=True, inplace=False, @@ -3654,8 +3624,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, keys = [] for x in by: - k = self._get_column_or_level_values(x, axis=other_axis, - op_description="sort by") + k = self._get_label_or_level_values(x, axis=other_axis) keys.append(k) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) @@ -3664,8 +3633,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, from pandas.core.sorting import nargsort by = by[0] - k = self._get_column_or_level_values(by, axis=other_axis, - op_description="sort by") + k = self._get_label_or_level_values(by, axis=other_axis) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff 
--git a/pandas/core/generic.py b/pandas/core/generic.py index a6ca3792ea20d..e8930abe0c226 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -69,7 +69,7 @@ args_transpose='axes to permute (int or label for object)', optional_by=""" by : str or list of str - Name or list of names matching axis items or index levels.""") + Name or list of names matching axis labels or levels.""") def _single_replace(self, to_replace, method, inplace, limit): From a6dfd0aa01d560b24fe692daeb9df85738fc7bb4 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 2 Dec 2017 10:37:53 -0500 Subject: [PATCH 06/13] Fix up sorting test suite --- pandas/core/frame.py | 4 ++-- pandas/tests/frame/test_sorting.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 21c87fccbb4ea..2e60066fc30a7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3624,7 +3624,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, keys = [] for x in by: - k = self._get_label_or_level_values(x, axis=other_axis) + k = self._get_label_or_level_values(x, axis=axis) keys.append(k) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) @@ -3633,7 +3633,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, from pandas.core.sorting import nargsort by = by[0] - k = self._get_label_or_level_values(by, axis=other_axis) + k = self._get_label_or_level_values(by, axis=axis) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index c79c3c7b3f881..f1091b7ad7ade 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -462,26 +462,26 @@ def test_sort_index_duplicates(self): df = DataFrame([lrange(5, 9), lrange(4)], columns=['a', 'a', 'b', 'b']) - with tm.assert_raises_regex(ValueError, 'duplicate'): + with 
tm.assert_raises_regex(ValueError, 'not unique'): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): df.sort_index(by='a') - with tm.assert_raises_regex(ValueError, 'duplicate'): + with tm.assert_raises_regex(ValueError, 'not unique'): df.sort_values(by='a') - with tm.assert_raises_regex(ValueError, 'duplicate'): + with tm.assert_raises_regex(ValueError, 'not unique'): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): df.sort_index(by=['a']) - with tm.assert_raises_regex(ValueError, 'duplicate'): + with tm.assert_raises_regex(ValueError, 'not unique'): df.sort_values(by=['a']) - with tm.assert_raises_regex(ValueError, 'duplicate'): + with tm.assert_raises_regex(ValueError, 'not unique'): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): # multi-column 'by' is separate codepath df.sort_index(by=['a', 'b']) - with tm.assert_raises_regex(ValueError, 'duplicate'): + with tm.assert_raises_regex(ValueError, 'not unique'): # multi-column 'by' is separate codepath df.sort_values(by=['a', 'b']) @@ -489,11 +489,11 @@ def test_sort_index_duplicates(self): # GH4370 df = DataFrame(np.random.randn(4, 2), columns=MultiIndex.from_tuples([('a', 0), ('a', 1)])) - with tm.assert_raises_regex(ValueError, 'levels'): + with tm.assert_raises_regex(ValueError, 'not unique'): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): df.sort_index(by='a') - with tm.assert_raises_regex(ValueError, 'levels'): + with tm.assert_raises_regex(ValueError, 'not unique'): df.sort_values(by='a') # convert tuples to a list of tuples From acb13a46c39f8d8056a88a0e0a19a148f9e2fbee Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 2 Dec 2017 12:14:35 -0500 Subject: [PATCH 07/13] Refactor new test cases. 
- Move to test_sort_values_level_as_str.py - Use fixtures and parametrize - Added axis=1 test cases --- pandas/core/frame.py | 1 - .../frame/test_sort_values_level_as_str.py | 122 +++++++++++ pandas/tests/frame/test_sorting.py | 191 +----------------- 3 files changed, 123 insertions(+), 191 deletions(-) create mode 100644 pandas/tests/frame/test_sort_values_level_as_str.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2e60066fc30a7..8d853ff53d84b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3612,7 +3612,6 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): inplace = validate_bool_kwarg(inplace, 'inplace') axis = self._get_axis_number(axis) - other_axis = 0 if axis == 1 else 1 if not isinstance(by, list): by = [by] diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py new file mode 100644 index 0000000000000..cb5f9a2668987 --- /dev/null +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from pandas import DataFrame, MultiIndex, Index +from pandas.errors import PerformanceWarning +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + + +@pytest.fixture +def df_none(): + return DataFrame({ + 'outer': ['a', 'a', 'a', 'b', 'b', 'b'], + 'inner': [1, 2, 2, 2, 1, 1], + 'A': np.arange(6, 0, -1), + ('B', 5): ['one', 'one', 'two', 'two', 'one', 'one']}) + + +@pytest.fixture(params=[ + ['outer'], + ['outer', 'inner'] +]) +def df_idx(request, df_none): + levels = request.param + return df_none.set_index(levels) + + +@pytest.fixture(params=[ + 'inner', # index level + ['outer'], # list of index level + 'A', # column + [('B', 5)], # list of column + ['inner', 'outer'], # two index levels + [('B', 5), 'outer'], # index level and column + ['A', ('B', 5)], # Two columns + ['inner', 'outer'] # two index levels and column +]) +def 
sort_names(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def ascending(request): + return request.param + + +def test_sort_index_level_and_column_label( + df_none, df_idx, sort_names, ascending): + + # Get index levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on columns and the setting index + expected = df_none.sort_values(by=sort_names, + ascending=ascending, + axis=0).set_index(levels) + + # Compute result sorting on mix on columns and index levels + result = df_idx.sort_values(by=sort_names, + ascending=ascending, + axis=0) + + assert_frame_equal(result, expected) + + +def test_sort_column_level_and_index_label( + df_none, df_idx, sort_names, ascending): + + # Get levels from df_idx + levels = df_idx.index.names + + # Compute expected by sorting on axis=0, setting index levels, and then + # transposing. For some cases this will result in a frame with + # multiple column levels + expected = df_none.sort_values(by=sort_names, + ascending=ascending, + axis=0).set_index(levels).T + + # Compute result by transposing and sorting on axis=1. 
+ result = df_idx.T.sort_values(by=sort_names, + ascending=ascending, + axis=1) + + if len(levels) > 1: + # Accessing multi-level columns that are not lexsorted raises a + # performance warning + with tm.assert_produces_warning(PerformanceWarning, + check_stacklevel=False): + assert_frame_equal(result, expected) + else: + assert_frame_equal(result, expected) + + +def test_sort_values_column_index_level_precedence(): + # GH 14353, when a string passed as the `by` parameter + # matches a column and an index level the column takes + # precedence + + # Construct DataFrame with index and column named 'idx' + idx = Index(np.arange(1, 7), name='idx') + df = DataFrame({'A': np.arange(11, 17), + 'idx': np.arange(6, 0, -1)}, + index=idx) + + # Sorting by 'idx' should sort by the idx column and raise a + # FutureWarning + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df.sort_values(by='idx') + + # This should be equivalent to sorting by the 'idx' index level in + # descending order + expected = df.sort_index(level='idx', ascending=False) + assert_frame_equal(result, expected) + + # Perform same test with MultiIndex + df_multi = df.set_index('A', append=True) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = df_multi.sort_values(by='idx') + + expected = df_multi.sort_index(level='idx', ascending=False) + assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index f1091b7ad7ade..477ea70f48706 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -10,7 +10,7 @@ from pandas.compat import lrange from pandas.api.types import CategoricalDtype from pandas import (DataFrame, Series, MultiIndex, Timestamp, - Index, date_range, NaT, IntervalIndex) + date_range, NaT, IntervalIndex) from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -86,13 +86,6 @@ def test_sort_values(self): expected = 
frame.reindex(columns=['C', 'B', 'A']) assert_frame_equal(sorted_df, expected) - # by row (axis=1) with string index - frame = DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, - index=['row1', 'row2']) - sorted_df = frame.sort_values(by='row2', axis=1) - expected = frame.reindex(columns=['B', 'A', 'C']) - assert_frame_equal(sorted_df, expected) - msg = r'Length of ascending \(5\) != length of by \(2\)' with tm.assert_raises_regex(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) @@ -587,185 +580,3 @@ def test_sort_index_intervalindex(self): closed='right') result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) - - def test_sort_index_and_column(self): - # Build MultiIndex - idx = MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), - ('b', 2), ('b', 1), ('b', 1)]) - idx.names = ['outer', 'inner'] - - # Build DataFrames - df_multi = DataFrame({'A': np.arange(6, 0, -1), - 'B': ['one', 'one', 'two', - 'two', 'one', 'one']}, - index=idx) - df_single = df_multi.reset_index('outer') - df_none = df_multi.reset_index() - - # Sort by single index - # - On single index frame - expected = df_none.sort_values('inner').set_index('inner') - result = df_single.sort_values('inner') - assert_frame_equal(result, expected) - # - Descending - expected = df_none.sort_values('inner', - ascending=False).set_index('inner') - result = df_single.sort_values('inner', ascending=False) - assert_frame_equal(result, expected) - - # - On multi index frame - expected = df_none.sort_values('inner', - ascending=False - ).set_index(['outer', 'inner']) - - result = df_multi.sort_values('inner', ascending=False) - assert_frame_equal(result, expected) - # - Descending - expected = df_none.sort_values('inner', - ascending=False - ).set_index(['outer', 'inner']) - result = df_multi.sort_values('inner', ascending=False) - assert_frame_equal(result, expected) - - # Sort by multiple indexes - # - Ascending - expected = 
df_none.sort_values(['inner', 'outer'] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer']) - assert_frame_equal(result, expected) - - # - Descending - expected = df_none.sort_values(['inner', 'outer'], - ascending=False - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer'], - ascending=False) - assert_frame_equal(result, expected) - - # - Mixed - expected = df_none.sort_values(['inner', 'outer'], - ascending=[False, True] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer'], - ascending=[False, True]) - assert_frame_equal(result, expected) - - # Sort by single index and single column - # - Ascending - expected = df_none.sort_values(['outer', 'B'] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['outer', 'B']) - assert_frame_equal(result, expected) - - # - Descending - expected = df_none.sort_values(['outer', 'B'], - ascending=False - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['outer', 'B'], ascending=False) - assert_frame_equal(result, expected) - - # - Mixed - expected = df_none.sort_values(['outer', 'B'], - ascending=[False, True] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['outer', 'B'], - ascending=[False, True]) - assert_frame_equal(result, expected) - - # Sort by single column and single index - # - Ascending - expected = df_none.sort_values(['B', 'outer'] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['B', 'outer']) - assert_frame_equal(result, expected) - - # - Descending - expected = df_none.sort_values(['B', 'outer'], - ascending=False - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['B', 'outer'], ascending=False) - assert_frame_equal(result, expected) - - # - Mixed - expected = df_none.sort_values(['B', 'outer'], - ascending=[False, True] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['B', 'outer'], - ascending=[False, True]) - 
assert_frame_equal(result, expected) - - # Sort by multiple indexes and a single column - # - Ascending - expected = df_none.sort_values(['inner', 'outer', 'A'] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer', 'A']) - assert_frame_equal(result, expected) - - # - Descending - expected = df_none.sort_values(['inner', 'outer', 'A'], - ascending=False - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer', 'A'], - ascending=False) - assert_frame_equal(result, expected) - - # - Mixed - expected = df_none.sort_values(['inner', 'outer', 'A'], - ascending=[True, True, False] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer', 'A'], - ascending=[True, True, False]) - assert_frame_equal(result, expected) - - # Sort by multiple indexes and multiple columns - # - Ascending - expected = df_none.sort_values(['inner', 'outer', 'B', 'A'] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer', 'B', 'A']) - assert_frame_equal(result, expected) - - # - Descending - expected = df_none.sort_values(['inner', 'outer', 'B', 'A'], - ascending=False - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer', 'B', 'A'], - ascending=False) - assert_frame_equal(result, expected) - - # - Mixed - expected = df_none.sort_values(['inner', 'outer', 'B', 'A'], - ascending=[False, True, True, False] - ).set_index(['outer', 'inner']) - result = df_multi.sort_values(['inner', 'outer', 'B', 'A'], - ascending=[False, True, True, False]) - assert_frame_equal(result, expected) - - def test_sort_values_column_index_level_precedence(self): - # GH 14353, when a string passed as the `by` parameter - # matches a column and an index level the column takes - # precedence - - # Construct DataFrame with index and column named 'idx' - idx = Index(np.arange(1, 7), name='idx') - df = DataFrame({'A': np.arange(11, 17), - 'idx': np.arange(6, 0, -1)}, - index=idx) - - # 
Sorting by 'idx' should sort by the idx column and raise a - # FutureWarning - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.sort_values(by='idx') - - # This should be equivalent to sorting by the 'idx' index level in - # descending order - expected = df.sort_index(level='idx', ascending=False) - assert_frame_equal(result, expected) - - # Perform same test with MultiIndex - df_multi = df.set_index('A', append=True) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df_multi.sort_values(by='idx') - - expected = df_multi.sort_index(level='idx', ascending=False) - assert_frame_equal(result, expected) From 14baf33cd02f7020a789e3ca2901d294b5ded083 Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sat, 2 Dec 2017 13:24:50 -0500 Subject: [PATCH 08/13] Documentation cleanup. --- doc/source/basics.rst | 34 ++++++++++++------- doc/source/whatsnew/v0.22.0.txt | 26 +++++++++++++- pandas/core/frame.py | 10 +++++- pandas/core/generic.py | 4 +-- .../frame/test_sort_values_level_as_str.py | 2 +- 5 files changed, 58 insertions(+), 18 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 3c89c2d1ddcb1..5a94ffc94633b 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1738,14 +1738,16 @@ description. Sorting ------- -Pandas supports three kinds of sorting: sorting by labels (indexes), sorting -by values (columns), and sorting by a combination of both. +Pandas supports three kinds of sorting: sorting by index levels, +sorting by column values, and sorting by a combination of both. + +.. _basics.sort_index: By Index ~~~~~~~~ -The primary method for sorting axis -labels (indexes) are the ``Series.sort_index()`` and the ``DataFrame.sort_index()`` methods. +The :meth:`Series.sort_index` and :meth:`DataFrame.sort_index` methods are +used to sort a pandas object by its index levels. .. 
ipython:: python @@ -1765,20 +1767,22 @@ labels (indexes) are the ``Series.sort_index()`` and the ``DataFrame.sort_index( # Series unsorted_df['three'].sort_index() +.. _basics.sort_values: + By Values ~~~~~~~~~ -The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` are the entry points for **value** sorting (that is the values in a column or row). -:meth:`DataFrame.sort_values` can accept an optional ``by`` argument for ``axis=0`` -which will use an arbitrary vector or a column name of the DataFrame to -determine the sort order: +The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` methods are +used to sort a pandas object by its values. The optional ``by`` parameter to +:meth:`DataFrame.sort_values` may used to specify one or more columns to +use to determine the sorted order. .. ipython:: python df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) df1.sort_values(by='two') -The ``by`` argument can take a list of column names, e.g.: +The ``by`` parameter can take a list of column names, e.g.: .. ipython:: python @@ -1793,22 +1797,26 @@ argument: s.sort_values() s.sort_values(na_position='first') +.. _basics.sort_indexes_and_values: + By Indexes and Values ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.22 -Strings passed as the ``by`` argument to :meth:`DataFrame.sort_values` may + +.. versionadded:: 0.22.0 + +Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may refer to either columns or index levels. .. 
ipython:: python # Build MultiIndex idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), - ('b', 2), ('b', 1), ('b', 1)]) + ('b', 2), ('b', 1), ('b', 1)]) idx.names = ['first', 'second'] # Build DataFrame df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, - index=idx) + index=idx) df_multi # Sort by 'second' (index) and 'A' (column) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 9b2480888a42c..bcd3695b3aaf4 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -65,6 +65,31 @@ levels ` documentation section. .. _whatsnew_0220.enhancements.other: +Sorting by a combination of columns and index levels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Strings passed to :meth:`DataFrame.sort_values` as the ``by`` parameter may +now refer to either column names or index level names. This enables sorting +``DataFrame`` instances by a combination of index levels and columns without +resetting indexes. See the :ref:`Sorting by Indexes and Values +<basics.sort_indexes_and_values>` documentation section. +(:issue:`14353`) + +..
ipython:: python + + # Build MultiIndex + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), + ('b', 2), ('b', 1), ('b', 1)]) + idx.names = ['first', 'second'] + + # Build DataFrame + df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, + index=idx) + df_multi + + # Sort by 'second' (index) and 'A' (column) + df_multi.sort_values(by=['second', 'A']) + Other Enhancements ^^^^^^^^^^^^^^^^^^ @@ -75,7 +100,6 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) -- :func:`DataFrame.sort_values` now accepts index level names as well as column names as the `by` parameter (:issue`14353`) - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`) .. _whatsnew_0220.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d853ff53d84b..e8be18226e43c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -113,7 +113,15 @@ axes_single_arg="{0 or 'index', 1 or 'columns'}", optional_by=""" by : str or list of str - Name or list of names which refer to the axis items.""", + Name or list of names matching axis levels or off-axis labels. 
+ + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels + + Support for specify index/column levels was added in + version 0.22.0""", versionadded_to_excel='', optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e8930abe0c226..f75a767ef1f7a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -69,7 +69,7 @@ args_transpose='axes to permute (int or label for object)', optional_by=""" by : str or list of str - Name or list of names matching axis labels or levels.""") + Name or list of names matching axis levels or off-axis labels.""") def _single_replace(self, to_replace, method, inplace, limit): @@ -2932,7 +2932,7 @@ def add_suffix(self, suffix): Parameters ----------%(optional_by)s axis : %(axes_single_arg)s, default 0 - Axis to direct sorting + Axis to be sorted ascending : bool or list of bool, default True Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index cb5f9a2668987..85e260a9cbbfd 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, MultiIndex, Index +from pandas import DataFrame, Index from pandas.errors import PerformanceWarning from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal From 4a05ffa16c17f7fd83dc19be52d0e3feb734b57a Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Thu, 7 Dec 2017 09:25:43 -0500 Subject: [PATCH 09/13] Documentation updates per review --- doc/source/basics.rst | 17 ++++++++++------- pandas/core/frame.py | 6 +++--- pandas/core/generic.py | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 5a94ffc94633b..e1513803ae047 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1738,7 +1738,7 @@ description. Sorting ------- -Pandas supports three kinds of sorting: sorting by index levels, +Pandas supports three kinds of sorting: sorting by index labels, sorting by column values, and sorting by a combination of both. .. _basics.sort_index: @@ -1772,10 +1772,10 @@ used to sort a pandas object by its index levels. By Values ~~~~~~~~~ -The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` methods are -used to sort a pandas object by its values. The optional ``by`` parameter to -:meth:`DataFrame.sort_values` may used to specify one or more columns to -use to determine the sorted order. +The :meth:`Series.sort_values` method is used to sort a `Series` by its values. The +:meth:`DataFrame.sort_values` method is used to sort a `DataFrame` by its column or row values. +The optional ``by`` parameter to :meth:`DataFrame.sort_values` may be used to specify one or more columns +to use to determine the sorted order. .. ipython:: python @@ -1805,7 +1805,7 @@ By Indexes and Values .. versionadded:: 0.22.0 Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may -refer to either columns or index levels. +refer to either columns or index level names. .. ipython:: python @@ -1819,7 +1819,10 @@ refer to either columns or index levels. index=idx) df_multi - # Sort by 'second' (index) and 'A' (column) +Sort by 'second' (index) and 'A' (column) + +.. ipython:: python + df_multi.sort_values(by=['second', 'A']) ..
note:: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e8be18226e43c..49c30d6683b0d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -113,15 +113,15 @@ axes_single_arg="{0 or 'index', 1 or 'columns'}", optional_by=""" by : str or list of str - Name or list of names matching axis levels or off-axis labels. + Name or list of names to sort by. - if `axis` is 0 or `'index'` then `by` may contain index levels and/or column labels - if `axis` is 1 or `'columns'` then `by` may contain column levels and/or index labels - Support for specify index/column levels was added in - version 0.22.0""", + .. versionmodified:: 0.22.0 + Allow specifying index or column level names.""", versionadded_to_excel='', optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f75a767ef1f7a..1c1e7b349e6af 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -69,7 +69,7 @@ args_transpose='axes to permute (int or label for object)', optional_by=""" by : str or list of str - Name or list of names matching axis levels or off-axis labels.""") + Name or list of names to sort by""") def _single_replace(self, to_replace, method, inplace, limit): From ceacad435983334670d83c99550a5fe9942dfc65 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Thu, 7 Dec 2017 10:19:51 -0500 Subject: [PATCH 10/13] Improve warning stacklevel handling for all uses of lable/level utils --- pandas/core/frame.py | 7 +++++-- pandas/core/generic.py | 17 ++++++++++++----- pandas/core/groupby.py | 4 +++- pandas/core/reshape/merge.py | 16 +++++++++++----- .../frame/test_sort_values_level_as_str.py | 4 ++-- .../tests/generic/test_label_or_level_utils.py | 9 ++++----- pandas/tests/groupby/test_index_as_string.py | 2 +- .../tests/reshape/test_merge_index_as_string.py | 4 ++-- 8 files changed, 40 insertions(+), 23 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49c30d6683b0d..4eb1b9ef748ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3620,6 +3620,7 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): inplace = validate_bool_kwarg(inplace, 'inplace') axis = self._get_axis_number(axis) + stacklevel = 2 # Number of stack levels from df.sort_values if not isinstance(by, list): by = [by] @@ -3631,7 +3632,8 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, keys = [] for x in by: - k = self._get_label_or_level_values(x, axis=axis) + k = self._get_label_or_level_values(x, axis=axis, + stacklevel=stacklevel) keys.append(k) indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) @@ -3640,7 +3642,8 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, from pandas.core.sorting import nargsort by = by[0] - k = self._get_label_or_level_values(by, axis=axis) + k = self._get_label_or_level_values(by, axis=axis, + stacklevel=stacklevel) if isinstance(ascending, (tuple, list)): ascending = ascending[0] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1c1e7b349e6af..e61c2899ce648 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1141,7 +1141,7 @@ def _is_label_or_level_reference(self, key, axis=0): return (self._is_level_reference(key, axis=axis) or 
self._is_label_reference(key, axis=axis)) - def _check_label_or_level_ambiguity(self, key, axis=0): + def _check_label_or_level_ambiguity(self, key, axis=0, stacklevel=1): """ Check whether `key` matches both a level of the input `axis` and a label of the other axis and raise a ``FutureWarning`` if this is the @@ -1154,9 +1154,10 @@ def _check_label_or_level_ambiguity(self, key, axis=0): ---------- key: str or object label or level name - axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) + stacklevel: int, default 1 + Stack level used when a FutureWarning is raised (see below). Returns ------- @@ -1201,12 +1202,12 @@ def _check_label_or_level_ambiguity(self, key, axis=0): label_article=label_article, label_type=label_type) - warnings.warn(msg, FutureWarning, stacklevel=2) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel + 1) return True else: return False - def _get_label_or_level_values(self, key, axis=0): + def _get_label_or_level_values(self, key, axis=0, stacklevel=1): """ Return a 1-D array of values associated with `key`, a label or level from the given `axis`. @@ -1225,6 +1226,8 @@ def _get_label_or_level_values(self, key, axis=0): Label or level name. axis: int, default 0 Axis that levels are associated with (0 for index, 1 for columns) + stacklevel: int, default 1 + Stack level used when a FutureWarning is raised (see below). Returns ------- @@ -1236,6 +1239,9 @@ def _get_label_or_level_values(self, key, axis=0): if `key` matches neither a label nor a level ValueError if `key` matches multiple labels + FutureWarning + if `key` is ambiguous. 
This will become an ambiguity error in a + future version """ axis = self._get_axis_number(axis) @@ -1247,7 +1253,8 @@ def _get_label_or_level_values(self, key, axis=0): .format(type=type(self))) if self._is_label_reference(key, axis=axis): - self._check_label_or_level_ambiguity(key, axis=axis) + self._check_label_or_level_ambiguity(key, axis=axis, + stacklevel=stacklevel + 1) values = self.xs(key, axis=other_axes[0])._values elif self._is_level_reference(key, axis=axis): values = self.axes[axis].get_level_values(key)._values diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a5d8cc254cd93..de2e13aac462e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2914,7 +2914,9 @@ def is_in_obj(gpr): elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: - obj._check_label_or_level_ambiguity(gpr) + stacklevel = 5 # Number of stack levels from df.groupby + obj._check_label_or_level_ambiguity( + gpr, stacklevel=stacklevel) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) elif obj._is_level_reference(gpr): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bad7088a126cf..dc3fecb700e00 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -814,6 +814,7 @@ def _get_merge_keys(self): right_drop = [] left_drop = [] left, right = self.left, self.right + stacklevel = 5 # Number of stack levels from df.merge is_lkey = lambda x: isinstance( x, (np.ndarray, Series)) and len(x) == len(left) @@ -841,7 +842,8 @@ def _get_merge_keys(self): else: if rk is not None: right_keys.append( - right._get_label_or_level_values(rk)) + right._get_label_or_level_values( + rk, stacklevel=stacklevel)) join_names.append(rk) else: # work-around for merge_asof(right_index=True) @@ -851,7 +853,8 @@ def _get_merge_keys(self): if not is_rkey(rk): if rk is not None: right_keys.append( - right._get_label_or_level_values(rk)) + right._get_label_or_level_values( + rk, stacklevel=stacklevel)) 
else: # work-around for merge_asof(right_index=True) right_keys.append(right.index) @@ -864,7 +867,8 @@ def _get_merge_keys(self): else: right_keys.append(rk) if lk is not None: - left_keys.append(left._get_label_or_level_values(lk)) + left_keys.append(left._get_label_or_level_values( + lk, stacklevel=stacklevel)) join_names.append(lk) else: # work-around for merge_asof(left_index=True) @@ -876,7 +880,8 @@ def _get_merge_keys(self): left_keys.append(k) join_names.append(None) else: - left_keys.append(left._get_label_or_level_values(k)) + left_keys.append(left._get_label_or_level_values( + k, stacklevel=stacklevel)) join_names.append(k) if isinstance(self.right.index, MultiIndex): right_keys = [lev._values.take(lab) @@ -890,7 +895,8 @@ def _get_merge_keys(self): right_keys.append(k) join_names.append(None) else: - right_keys.append(right._get_label_or_level_values(k)) + right_keys.append(right._get_label_or_level_values( + k, stacklevel=stacklevel)) join_names.append(k) if isinstance(self.left.index, MultiIndex): left_keys = [lev._values.take(lab) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index 85e260a9cbbfd..c23a3eaeeb614 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -104,7 +104,7 @@ def test_sort_values_column_index_level_precedence(): # Sorting by 'idx' should sort by the idx column and raise a # FutureWarning - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = df.sort_values(by='idx') # This should be equivalent to sorting by the 'idx' index level in @@ -115,7 +115,7 @@ def test_sort_values_column_index_level_precedence(): # Perform same test with MultiIndex df_multi = df.set_index('A', append=True) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result 
= df_multi.sort_values(by='idx') expected = df_multi.sort_index(level='idx', ascending=False) diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py index 456cb48020500..1ad1b06aaefa2 100644 --- a/pandas/tests/generic/test_label_or_level_utils.py +++ b/pandas/tests/generic/test_label_or_level_utils.py @@ -175,8 +175,7 @@ def test_check_label_or_level_ambiguity_df(df_ambig, axis): # df_ambig has both an on-axis level and off-axis label named L1 # Therefore L1 is ambiguous with tm.assert_produces_warning(FutureWarning, - clear=True, - check_stacklevel=False) as w: + clear=True) as w: assert df_ambig._check_label_or_level_ambiguity('L1', axis=axis) warning_msg = w[0].message.args[0] @@ -245,7 +244,8 @@ def assert_label_values(frame, labels, axis): else: expected = frame.loc[label]._values - result = frame._get_label_or_level_values(label, axis=axis) + result = frame._get_label_or_level_values(label, axis=axis, + stacklevel=2) assert array_equivalent(expected, result) @@ -288,8 +288,7 @@ def test_get_label_or_level_values_df_ambig(df_ambig, axis): # df has both an on-axis level and off-axis label named L1 # Therefore L1 is ambiguous but will default to label - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): assert_label_values(df_ambig, ['L1'], axis=axis) # df has an on-axis level named L2 and it is not ambiguous diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index cee78eab3a636..9fe677664049e 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -99,7 +99,7 @@ def test_grouper_column_index_level_precedence(frame, frame['inner'] = [1, 1, 1, 1, 1, 1] # Performing a groupby with strings should produce warning - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with 
tm.assert_produces_warning(FutureWarning): result = frame.groupby(key_strs).mean() # Grouping with key Grouper should produce the same result and no warning diff --git a/pandas/tests/reshape/test_merge_index_as_string.py b/pandas/tests/reshape/test_merge_index_as_string.py index 4c638f8e441fa..09109e2692a24 100644 --- a/pandas/tests/reshape/test_merge_index_as_string.py +++ b/pandas/tests/reshape/test_merge_index_as_string.py @@ -200,14 +200,14 @@ def test_merge_index_column_precedence(df1, df2): # Merge left_df and right_df on 'outer' and 'inner' # 'outer' for left_df should refer to the 'outer' column, not the # 'outer' index level and a FutureWarning should be raised - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = left_df.merge(right_df, on=['outer', 'inner']) # Check results assert_frame_equal(result, expected) # Perform the same using the left_on and right_on parameters - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = left_df.merge(right_df, left_on=['outer', 'inner'], right_on=['outer', 'inner']) From 85f03636c1064404b72dad500151400e7ff41ddc Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Thu, 7 Dec 2017 12:05:48 -0500 Subject: [PATCH 11/13] Reintroduce specialized error message for the multi-index case --- pandas/core/generic.py | 15 +++++++++++++-- pandas/tests/frame/test_sorting.py | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e61c2899ce648..6a15d272a418c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1263,11 +1263,22 @@ def _get_label_or_level_values(self, key, axis=0, stacklevel=1): # Check for duplicates if values.ndim > 1: + + if other_axes and isinstance( + self._get_axis(other_axes[0]), MultiIndex): + multi_message = ('\n' + 'For a multi-index, the label must be a ' + 'tuple with elements corresponding to ' + 'each level.') + else: + multi_message = '' + label_axis_name = 'column' if axis == 0 else 'index' raise ValueError(("The {label_axis_name} label '{key}' " - "is not unique") + "is not unique.{multi_message}") .format(key=key, - label_axis_name=label_axis_name)) + label_axis_name=label_axis_name, + multi_message=multi_message)) return values diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index 477ea70f48706..5bd239f8a3034 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -482,11 +482,11 @@ def test_sort_index_duplicates(self): # GH4370 df = DataFrame(np.random.randn(4, 2), columns=MultiIndex.from_tuples([('a', 0), ('a', 1)])) - with tm.assert_raises_regex(ValueError, 'not unique'): + with tm.assert_raises_regex(ValueError, 'level'): # use .sort_values #9816 with tm.assert_produces_warning(FutureWarning): df.sort_index(by='a') - with tm.assert_raises_regex(ValueError, 'not unique'): + with tm.assert_raises_regex(ValueError, 'level'): df.sort_values(by='a') # convert tuples to a list of tuples From bbbda0f79c8f840e543011f7a098c8eeaf889dc2 Mon Sep 17 00:00:00 2001 From: "Jon M. 
Mease" Date: Tue, 26 Dec 2017 10:27:25 -0500 Subject: [PATCH 12/13] Update release version references to 0.23.0 --- doc/source/basics.rst | 2 +- doc/source/whatsnew/v0.23.0.txt | 26 ++++++++++++++++++++++++++ pandas/core/frame.py | 2 +- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index e1513803ae047..d032d25118a25 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1802,7 +1802,7 @@ argument: By Indexes and Values ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.22.0 +.. versionadded:: 0.23.0 Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may refer to either columns or index level names. diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3f300deddebeb..4a594cd76db7d 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -62,6 +62,32 @@ levels ` documentation section. left.merge(right, on=['key1', 'key2']) +.. _whatsnew_0230.enhancements.sort_by_columns_and_levels: + +Sorting by a combination of columns and index levels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Strings passed to :meth:`DataFrame.sort_values` as the ``by`` parameter may +now refer to either column names or index level names. This enables sorting +``DataFrame`` instances by a combination of index levels and columns without +resetting indexes. See the :ref:`Sorting by Indexes and Values +<basics.sort_indexes_and_values>` documentation section. +(:issue:`14353`) + +.. ipython:: python + + # Build MultiIndex + idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), + ('b', 2), ('b', 1), ('b', 1)]) + idx.names = ['first', 'second'] + + # Build DataFrame + df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, + index=idx) + df_multi + + # Sort by 'second' (index) and 'A' (column) + df_multi.sort_values(by=['second', 'A']) ..
_whatsnew_0230.enhancements.ran_inf: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4065b6b718eb6..107df7f114ea1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -120,7 +120,7 @@ - if `axis` is 1 or `'columns'` then `by` may contain column levels and/or index labels - .. versionmodified:: 0.22.0 + .. versionchanged:: 0.23.0 Allow specifying index or column level names.""", versionadded_to_excel='', optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", From 3ba4ef6e69608005afd7c37546a49a5f8443dcca Mon Sep 17 00:00:00 2001 From: "Jon M. Mease" Date: Sun, 31 Dec 2017 18:37:33 -0500 Subject: [PATCH 13/13] Added github issue numbers to new tests --- pandas/tests/frame/test_sort_values_level_as_str.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index c23a3eaeeb614..3b4eadfce81cd 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -47,6 +47,8 @@ def ascending(request): def test_sort_index_level_and_column_label( df_none, df_idx, sort_names, ascending): + # GH 14353 + # Get index levels from df_idx levels = df_idx.index.names @@ -66,6 +68,8 @@ def test_sort_column_level_and_index_label( df_none, df_idx, sort_names, ascending): + # GH 14353 + # Get levels from df_idx levels = df_idx.index.names