From 47c610de9675b1422117298da1469c5fc46a9efc Mon Sep 17 00:00:00 2001 From: Rouz Azari Date: Thu, 8 Mar 2018 16:54:53 -0800 Subject: [PATCH] BUG: Dense ranking with percent now uses 100% basis (#15639) --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/algos_rank_helper.pxi.in | 10 ++- pandas/tests/frame/test_rank.py | 43 +++++++++++-- pandas/tests/series/test_rank.py | 93 +++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 233816600ec0f6..bb513605b1c946 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -907,6 +907,7 @@ Offsets Numeric ^^^^^^^ +- Bug in :meth:`DataFrame.rank` and :meth:`Series.rank` when ``method='dense'`` and ``pct=True`` in which percentile ranks were not being used with the number of distinct observations (:issue:`15630`) - Bug in :class:`Series` constructor with an int or float list where specifying ``dtype=str``, ``dtype='str'`` or ``dtype='U'`` failed to convert the data elements to strings (:issue:`16605`) - Bug in :class:`Index` multiplication and division methods where operating with a ``Series`` would return an ``Index`` object instead of a ``Series`` object (:issue:`19042`) - Bug in the :class:`DataFrame` constructor in which data containing very large positive or very large negative numbers was causing ``OverflowError`` (:issue:`18584`) diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 2f40bd4349a2e5..9348d7525c3072 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -213,7 +213,10 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True, sum_ranks = dups = 0 {{endif}} if pct: - return ranks / count + if tiebreak == TIEBREAK_DENSE: + return ranks / total_tie_count + else: + return ranks / count else: return ranks @@ -385,7 +388,10 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average', ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if pct: - ranks[i, :] /= count + if tiebreak == TIEBREAK_DENSE: + ranks[i, :] /= total_tie_count + else: + ranks[i, :] /= count if axis == 0: return ranks.T else: diff --git a/pandas/tests/frame/test_rank.py b/pandas/tests/frame/test_rank.py index 02fe0edf955777..b8ba408b547158 100644 --- a/pandas/tests/frame/test_rank.py +++ b/pandas/tests/frame/test_rank.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- import pytest -from datetime import timedelta, datetime -from distutils.version import LooseVersion -from numpy import nan import numpy as np +import pandas.util.testing as tm -from pandas import Series, DataFrame +from distutils.version import LooseVersion +from datetime import timedelta, datetime +from numpy import nan -from pandas.compat import product from pandas.util.testing import assert_frame_equal -import pandas.util.testing as tm from pandas.tests.frame.common import TestData +from pandas import Series, DataFrame +from pandas.compat import product class TestRank(TestData): @@ -266,3 +266,34 @@ def _check2d(df, expected, method='average', axis=0): continue frame = df if dtype is None else df.astype(dtype) _check2d(frame, results[method], method=method, axis=axis) + + +@pytest.mark.parametrize( + "method,exp", [("dense", + [[1., 1., 1.], + [1., 0.5, 2. / 3], + [1., 0.5, 1. / 3]]), + ("min", + [[1. / 3, 1., 1.], + [1. / 3, 1. / 3, 2. / 3], + [1. / 3, 1. / 3, 1. / 3]]), + ("max", + [[1., 1., 1.], + [1., 2. / 3, 2. / 3], + [1., 2. / 3, 1. / 3]]), + ("average", + [[2. / 3, 1., 1.], + [2. / 3, 0.5, 2. / 3], + [2. / 3, 0.5, 1. / 3]]), + ("first", + [[1. / 3, 1., 1.], + [2. / 3, 1. / 3, 2. / 3], + [3. / 3, 2. / 3, 1. / 3]])]) +def test_rank_pct_true(method, exp): + # see gh-15630. + + df = DataFrame([[2012, 66, 3], [2012, 65, 2], [2012, 65, 1]]) + result = df.rank(method=method, pct=True) + + expected = DataFrame(exp) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_rank.py b/pandas/tests/series/test_rank.py index 6220ce8ff76692..d15325ca8ef0e7 100644 --- a/pandas/tests/series/test_rank.py +++ b/pandas/tests/series/test_rank.py @@ -376,3 +376,96 @@ def test_rank_modify_inplace(self): s.rank() result = s assert_series_equal(result, expected) + + +# GH15630, pct should be on 100% basis when method='dense' + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1., 1.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 2, 2. / 2, 2. / 2]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 3, 1. / 3, 3. / 3, 3. / 3, 2. / 3]), + ([1, 1, 3, 3, 5, 5], [1. / 3, 1. / 3, 2. / 3, 2. / 3, 3. / 3, 3. / 3]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_dense_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='dense', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1. / 2, 1. / 2]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 2. / 3, 2. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 5, 1. / 5, 4. / 5, 4. / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], [1. / 6, 1. / 6, 3. / 6, 3. / 6, 5. / 6, 5. / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_min_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='min', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1., 1.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 3. / 3, 3. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [2. / 5, 2. / 5, 5. / 5, 5. / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], [2. / 6, 2. / 6, 4. / 6, 4. / 6, 6. / 6, 6. / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_max_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='max', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['O', 'f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1.5 / 2, 1.5 / 2]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 2.5 / 3, 2.5 / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], + [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_average_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='average', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('dtype', ['f8', 'i8']) +@pytest.mark.parametrize('ser, exp', [ + ([1], [1.]), + ([1, 2], [1. / 2, 2. / 2]), + ([2, 2], [1. / 2, 2. / 2.]), + ([1, 2, 3], [1. / 3, 2. / 3, 3. / 3]), + ([1, 2, 2], [1. / 3, 2. / 3, 3. / 3]), + ([4, 2, 1], [3. / 3, 2. / 3, 1. / 3],), + ([1, 1, 5, 5, 3], [1. / 5, 2. / 5, 4. / 5, 5. / 5, 3. / 5]), + ([1, 1, 3, 3, 5, 5], [1. / 6, 2. / 6, 3. / 6, 4. / 6, 5. / 6, 6. / 6]), + ([-5, -4, -3, -2, -1], [1. / 5, 2. / 5, 3. / 5, 4. / 5, 5. / 5])]) +def test_rank_first_pct(dtype, ser, exp): + s = Series(ser).astype(dtype) + result = s.rank(method='first', pct=True) + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected)