diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index ee9419c79e265..1b67335e40619 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -32,6 +32,7 @@ Fixed Regressions - Fixed regression in creating a period-dtype array from a read-only NumPy array of period objects. (:issue:`25403`) - Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) - Fixed pip installing from source into an environment without NumPy (:issue:`25193`) +- Fixed regression in :func:`factorize` when passing a custom ``na_sentinel`` value with ``sort=True`` (:issue:`25409`). - Fixed regression in :meth:`DataFrame.to_csv` writing duplicate line endings with gzip compress (:issue:`25311`) .. _whatsnew_0242.enhancements: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4a71951e2435e..5ed2e3efe26a1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -619,13 +619,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - try: - order = uniques.argsort() - order2 = order.argsort() - labels = take_1d(order2, labels, fill_value=na_sentinel) - uniques = uniques.take(order) - except TypeError: - # Mixed types, where uniques.argsort fails. + if na_sentinel == -1: + # GH-25409 take_1d only works for na_sentinels of -1 + try: + order = uniques.argsort() + order2 = order.argsort() + labels = take_1d(order2, labels, fill_value=na_sentinel) + uniques = uniques.take(order) + except TypeError: + # Mixed types, where uniques.argsort fails. + uniques, labels = safe_sort(uniques, labels, + na_sentinel=na_sentinel, + assume_unique=True) + else: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3f75c508d22f9..083307371b699 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -326,6 +326,21 @@ def test_parametrized_factorize_na_value(self, data, na_value): tm.assert_numpy_array_equal(l, expected_labels) tm.assert_numpy_array_equal(u, expected_uniques) + @pytest.mark.parametrize('sort', [True, False]) + @pytest.mark.parametrize('na_sentinel', [-1, -10, 100]) + def test_factorize_na_sentinel(self, sort, na_sentinel): + data = np.array(['b', 'a', None, 'b'], dtype=object) + labels, uniques = algos.factorize(data, sort=sort, + na_sentinel=na_sentinel) + if sort: + expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) + expected_uniques = np.array(['a', 'b'], dtype=object) + else: + expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) + expected_uniques = np.array(['b', 'a'], dtype=object) + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(uniques, expected_uniques) + class TestUnique(object):